
Accurate cache accounting

master
Neil Booth committed 8 years ago
commit d8e9eb7796
  1. lib/coins.py (1 changed line)
  2. lib/util.py (43 changed lines)
  3. server/db.py (62 changed lines)
  4. server/server.py (11 changed lines)

lib/coins.py (1 changed line)

@@ -137,7 +137,6 @@ class Coin(object):
     @classmethod
     def read_block(cls, block):
-        assert isinstance(block, memoryview)
         d = Deserializer(block[cls.HEADER_LEN:])
         return d.read_block()

lib/util.py (43 changed lines)

@@ -1,8 +1,9 @@
 # See the file "LICENSE" for information about the copyright
 # and warranty status of this software.
-
+import array
 import sys
+from collections import Container, Mapping

 # Method decorator. To be used for calculations that will always
@@ -25,6 +26,46 @@ class cachedproperty(object):
                 .format(self.f.__name__, obj))


+def deep_getsizeof(obj):
+    """Find the memory footprint of a Python object.
+
+    Based on code from code.tutsplus.com: http://goo.gl/fZ0DXK
+
+    This is a recursive function that drills down a Python object graph,
+    like a dictionary holding nested dictionaries with lists of lists,
+    tuples and sets.
+
+    The sys.getsizeof function only does a shallow size.  It counts each
+    object inside a container as a pointer only, regardless of how big
+    it really is.
+
+    :param obj: the object
+    :return: the deep size of the object, in bytes
+    """
+    ids = set()
+
+    def size(o):
+        if id(o) in ids:
+            return 0
+
+        r = sys.getsizeof(o)
+        ids.add(id(o))
+
+        if isinstance(o, (str, bytes, bytearray, array.array)):
+            return r
+
+        if isinstance(o, Mapping):
+            return r + sum(size(k) + size(v) for k, v in o.items())
+
+        if isinstance(o, Container):
+            return r + sum(size(x) for x in o)
+
+        return r
+
+    return size(obj)
+
+
 def chunks(items, size):
     for i in range(0, len(items), size):
         yield items[i: i + size]
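As a quick illustration of the difference between the two measures (not part of the commit; it assumes the helper is importable from lib/util.py as laid out in this repo):

    import sys
    from lib.util import deep_getsizeof

    nested = {'a': [1, 2, 3], 'b': {'c': (4, 5)}}

    # Shallow size: counts only the outer dict's own storage.
    print(sys.getsizeof(nested))

    # Deep size: adds the keys, the list, the inner dict, the tuple
    # and every int, each object counted at most once by id().
    print(deep_getsizeof(nested))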

server/db.py (62 changed lines)

@@ -45,10 +45,10 @@ class UTXOCache(object):
       Key: TX_HASH + TX_IDX (32 + 2 = 34 bytes)
       Value: HASH168 + TX_NUM + VALUE (21 + 4 + 8 = 33 bytes)

-    That's 67 bytes of raw data.  Assume 100 bytes per UTXO accounting
-    for Python datastructure overhead, then perhaps 20 million UTXOs
-    can fit in 2GB of RAM.  There are approximately 42 million UTXOs
-    on bitcoin mainnet at height 433,000.
+    That's 67 bytes of raw data.  Python dictionary overhead means
+    each entry actually uses about 187 bytes of memory.  So almost
+    11.5 million UTXOs can fit in 2GB of RAM.  There are approximately
+    42 million UTXOs on bitcoin mainnet at height 433,000.

     Semantics:
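The 187-byte figure is an empirical per-entry average. A minimal sketch of how it could be reproduced with the deep_getsizeof helper added above in lib/util.py (the key and value shapes mirror the docstring; the entry count is arbitrary):

    import os
    from lib.util import deep_getsizeof

    # A dict shaped like the UTXO cache: 34-byte keys, 33-byte values.
    cache = {os.urandom(34): os.urandom(33) for _ in range(100000)}

    # Average per-entry footprint including dict overhead; on CPython
    # of this era it lands near the ~187 bytes cited in the docstring.
    print(deep_getsizeof(cache) / len(cache))

At 187 bytes per entry, 2GB holds 2 * 1024**3 // 187 entries, roughly 11.5 million UTXOs, matching the revised text.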
@@ -80,6 +80,7 @@ class UTXOCache(object):
     tx_num is stored to resolve them.  The collision rate is around
     0.02% for the hash168 table, and almost zero for the UTXO table
+    (there are around 100 collisions in the whole bitcoin blockchain).

     '''

     def __init__(self, parent, db, coin):
@@ -290,6 +291,7 @@ class DB(object):
         self.coin = env.coin
         self.flush_MB = env.flush_MB
+        self.next_cache_check = 0
         self.logger.info('flushing after cache reaches {:,d} MB'
                          .format(self.flush_MB))
@@ -298,7 +300,7 @@ class DB(object):
         # Unflushed items. Headers and tx_hashes have one entry per block
         self.headers = []
         self.tx_hashes = []
-        self.history = defaultdict(list)
+        self.history = defaultdict(partial(array.array, 'I'))
         self.history_size = 0

         db_name = '{}-{}'.format(self.coin.NAME, self.coin.NET)
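Switching the history buckets from plain lists to array.array('I') packs each tx number into 4 bytes in a C buffer instead of a pointer to a boxed Python int, and it lets the flush below serialize a bucket with tobytes() directly. A rough comparison of the two representations (illustrative only, again using the new helper):

    import array
    from lib.util import deep_getsizeof

    nums = list(range(1000, 11000))   # 10,000 distinct boxed ints
    packed = array.array('I', nums)   # the same values, 4 bytes each

    print(deep_getsizeof(nums))       # roughly 36 bytes per entry
    print(deep_getsizeof(packed))     # about 4 bytes per entry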
@@ -432,13 +434,12 @@ class DB(object):
         flush_id = struct.pack('>H', self.flush_count)

         for hash168, hist in self.history.items():
             key = b'H' + hash168 + flush_id
-            batch.put(key, array.array('I', hist).tobytes())
+            batch.put(key, hist.tobytes())

-        self.logger.info('flushed {:,d} history entries ({:,d} MB)...'
-                         .format(self.history_size,
-                                 self.history_size * 4 // 1048576))
+        self.logger.info('flushed {:,d} history entries in {:,d} addrs...'
+                         .format(self.history_size, len(self.history)))

-        self.history = defaultdict(list)
+        self.history = defaultdict(partial(array.array, 'I'))
         self.history_size = 0

     def open_file(self, filename, truncate=False, create=False):
@@ -488,20 +489,24 @@ class DB(object):

     def cache_MB(self):
         '''Returns the approximate size of the cache, in MB.'''
-        utxo_MB = ((len(self.utxo_cache.cache) + len(self.utxo_cache.db_cache))
-                   * 100 // 1048576)
-        hist_MB = (len(self.history) * 48 + self.history_size * 20) // 1048576
-        if self.height % 200 == 0:
-            self.logger.info('cache size at height {:,d}: '
-                             'UTXOs: {:,d} MB history: {:,d} MB'
-                             .format(self.height, utxo_MB, hist_MB))
-            self.logger.info('cache entries: UTXOs: {:,d}/{:,d} '
-                             'history: {:,d}/{:,d}'
-                             .format(len(self.utxo_cache.cache),
-                                     len(self.utxo_cache.db_cache),
-                                     len(self.history),
-                                     self.history_size))
-        return utxo_MB + hist_MB
+        # Good average estimates
+        utxo_cache_size = len(self.utxo_cache.cache) * 187
+        db_cache_size = len(self.utxo_cache.db_cache) * 105
+        hist_cache_size = len(self.history) * 180 + self.history_size * 4
+        utxo_MB = (db_cache_size + utxo_cache_size) // 1048576
+        hist_MB = hist_cache_size // 1048576
+        cache_MB = utxo_MB + hist_MB
+        self.logger.info('cache entries: UTXO: {:,d} DB: {:,d} '
+                         'hist count: {:,d} hist size: {:,d}'
+                         .format(len(self.utxo_cache.cache),
+                                 len(self.utxo_cache.db_cache),
+                                 len(self.history),
+                                 self.history_size))
+        self.logger.info('cache size at height {:,d}: {:,d}MB '
+                         '(UTXOs {:,d}MB hist {:,d}MB)'
+                         .format(self.height, cache_MB, utxo_MB, hist_MB))
+        return cache_MB

     def process_block(self, block):
         self.headers.append(block[:self.coin.HEADER_LEN])
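The new constants are stated per-entry averages: 187 bytes per in-memory UTXO cache entry, 105 per db_cache entry, 180 per address bucket plus 4 bytes per packed history item. To see what they imply, a worked example with invented cache sizes (numbers are purely illustrative):

    # Hypothetical cache occupancy, not taken from the commit:
    n_utxo, n_db, n_addrs, n_hist = 10000000, 2000000, 1000000, 30000000

    utxo_MB = (n_utxo * 187 + n_db * 105) // 1048576    # 1983 MB
    hist_MB = (n_addrs * 180 + n_hist * 4) // 1048576   # 286 MB
    print(utxo_MB + hist_MB)                            # 2269 MB

A flush triggers once this total exceeds flush_MB.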
@@ -519,9 +524,12 @@ class DB(object):
         for tx_hash, tx in zip(tx_hashes, txs):
             self.process_tx(tx_hash, tx)

-        # Flush if we're getting full
-        if self.cache_MB() > self.flush_MB:
-            self.flush()
+        # Check whether we're getting full and it's time to flush
+        now = time.time()
+        if now > self.next_cache_check:
+            self.next_cache_check = now + 60
+            if self.cache_MB() > self.flush_MB:
+                self.flush()

     def process_tx(self, tx_hash, tx):
         cache = self.utxo_cache
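The wall-clock guard bounds how often the size estimate and its two log lines run: at most once every 60 seconds instead of once per block. The same pattern in isolation (a sketch; the class name is invented, not from the commit):

    import time

    class RateLimitedCheck:
        '''Allow an expensive check at most once per `interval` seconds.'''

        def __init__(self, interval=60):
            self.interval = interval
            self.next_check = 0     # 0 means the first call is always due

        def due(self):
            now = time.time()
            if now > self.next_check:
                self.next_check = now + self.interval
                return True
            return False

    # Usage mirroring process_block above:
    #     if check.due() and db.cache_MB() > db.flush_MB:
    #         db.flush()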

server/server.py (11 changed lines)

@@ -74,18 +74,7 @@ class BlockCache(object):
         self.logger.info('catching up, block cache limit {:d}MB...'
                          .format(self.cache_limit))
-        last_log = 0
-        prior_height = self.db.height
         while await self.maybe_prefill():
-            now = time.time()
-            count = self.fetched_height - prior_height
-            if now > last_log + 15 and count:
-                last_log = now
-                prior_height = self.fetched_height
-                self.logger.info('prefilled {:,d} blocks to height {:,d} '
-                                 'daemon height: {:,d}'
-                                 .format(count, self.fetched_height,
-                                         self.daemon_height))
             await asyncio.sleep(1)

         if not self.stop:
