From f4651c0ca45aa88e9225660a25d1f7e124676c3b Mon Sep 17 00:00:00 2001
From: Neil Booth
Date: Thu, 17 Nov 2016 07:59:40 +0900
Subject: [PATCH] Store tx_num in 'h' table key so keys are unique

---
 server/block_processor.py | 132 +++++++++++++++-----------------------
 server/db.py              |  36 ++++++-----
 2 files changed, 72 insertions(+), 96 deletions(-)

diff --git a/server/block_processor.py b/server/block_processor.py
index 1b7d8d3..3761609 100644
--- a/server/block_processor.py
+++ b/server/block_processor.py
@@ -353,9 +353,8 @@ class BlockProcessor(server.db.DB):
 
         # UTXO cache
         self.utxo_cache = {}
-        self.db_cache = {}
         self.utxo_cache_spends = 0
-        self.db_deletes = 0
+        self.db_deletes = []
 
         # Log state
         self.logger.info('{}/{} height: {:,d} tx count: {:,d} '
@@ -540,7 +539,7 @@ class BlockProcessor(server.db.DB):
         assert self.height == self.fs_height == self.db_height
         assert not self.history
         assert not self.utxo_cache
-        assert not self.db_cache
+        assert not self.db_deletes
 
     def flush(self, flush_utxos=False, flush_history=None):
         '''Flush out cached state.
@@ -708,15 +707,16 @@ class BlockProcessor(server.db.DB):
         # more, so we scale our already bloated object sizes.
         one_MB = int(1048576 / 1.3)
         utxo_cache_size = len(self.utxo_cache) * 187
-        db_cache_size = len(self.db_cache) * 105
+        db_deletes_size = len(self.db_deletes) * 61
         hist_cache_size = len(self.history) * 180 + self.history_size * 4
         tx_hash_size = (self.tx_count - self.fs_tx_count) * 74
-        utxo_MB = (db_cache_size + utxo_cache_size) // one_MB
+        utxo_MB = (db_deletes_size + utxo_cache_size) // one_MB
         hist_MB = (hist_cache_size + tx_hash_size) // one_MB
 
         self.logger.info('UTXOs: {:,d} deletes: {:,d} '
                          'UTXOs {:,d}MB hist {:,d}MB'
-                         .format(len(self.utxo_cache), self.db_deletes,
+                         .format(len(self.utxo_cache),
+                                 len(self.db_deletes) // 2,
                                  utxo_MB, hist_MB))
         self.logger.info('our height: {:,d} daemon height: {:,d}'
                          .format(self.height, self.daemon.cached_height()))
@@ -915,17 +915,18 @@ class BlockProcessor(server.db.DB):
 
         To this end we maintain two "tables", one for each point above:
 
-          1. Key: b'u' + address_hash168 + tx_num + tx_idx
+          1. Key: b'u' + address_hash168 + tx_idx + tx_num
             Value: the UTXO value as a 64-bit unsigned integer
 
-          2. Key: b'h' + compressed_tx_hash + tx_idx
-            Value: [address_hash168 + tx_num]
+          2. Key: b'h' + compressed_tx_hash + tx_idx + tx_num
+            Value: hash168
 
         The compressed tx hash is just the first few bytes of the hash
         of the tx in which the UTXO was created.  As this is not unique there
-        will are potential collisions when saving and looking up UTXOs;
-        hence why the second table has a list as its value.  The collision
-        can be resolved with the tx_num.  The collision rate is low (<0.1%).
+        will be potential collisions so tx_num is also in the key.  When
+        looking up a UTXO the prefix space of the compressed hash needs to
+        be searched and resolved if necessary with the tx_num.  The
+        collision rate is low (<0.1%).
         '''
 
     def spend_utxo(self, tx_hash, tx_idx):
@@ -942,55 +943,36 @@ class BlockProcessor(server.db.DB):
             self.utxo_cache_spends += 1
             return cache_value
 
-        # Spend it from the DB.  Read the UTXO through the cache
-        # because compressed keys can collide.
-        # The 4 is the COMPRESSED_TX_HASH_LEN
-        db_key = b'h' + tx_hash[:4] + idx_packed
-        db_value = self.db_cache_get(db_key)
-        if db_value:
-            # FIXME: this matches what we did previously but until we store
-            # all UTXOs isn't safe
-            if len(db_value) == 25:
-                udb_key = b'u' + db_value + idx_packed
-                utxo_value_packed = self.db.get(udb_key)
-                if utxo_value_packed:
-                    # Remove the UTXO from both tables
-                    self.db_deletes += 1
-                    self.db_cache[db_key] = None
-                    self.db_cache[udb_key] = None
-                    return db_value + utxo_value_packed
-                # Fall through to below loop for error
-
-        assert len(db_value) % 25 == 0
-
-        # Find which entry, if any, the TX_HASH matches.
-        for n in range(0, len(db_value), 25):
-            tx_num, = unpack('<I', db_value[n + 21:n + 25])
[...]
+            if len(candidates) > 1:
+                tx_num, = unpack('<I', [...]
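
The lookup scheme the new docstring describes can be illustrated with a
small standalone sketch.  The code below is not part of this patch: plain
dicts stand in for the two database tables, and the names u_table, h_table,
put_utxo and the get_tx_hash callback are invented for illustration.  The
byte layout is inferred from what the diff shows: a 21-byte hash168, a
4-byte little-endian tx_num (the unpack('<I', ...) in the removed
25-byte-entry loop), a 64-bit value per the docstring, and an assumed
2-byte tx_idx.  It demonstrates why appending tx_num makes every 'h' key
unique, and why a spend searches the prefix space of the compressed hash
and needs the tx_num check only when more than one candidate survives.

    # Sketch only: dict-backed stand-ins for the 'u' and 'h' tables.
    from struct import pack, unpack

    COMPRESSED_TX_HASH_LEN = 4   # per the removed comment in the diff

    u_table = {}  # b'u' + hash168 + tx_idx + tx_num -> packed 64-bit value
    h_table = {}  # b'h' + tx_hash[:4] + tx_idx + tx_num -> hash168

    def put_utxo(tx_hash, tx_idx, tx_num, hash168, value):
        # 16-bit tx_idx and 64-bit value packing are assumptions; 32-bit
        # little-endian tx_num matches the unpack('<I', ...) in the diff.
        suffix = pack('<H', tx_idx) + pack('<I', tx_num)
        h_table[b'h' + tx_hash[:COMPRESSED_TX_HASH_LEN] + suffix] = hash168
        u_table[b'u' + hash168 + suffix] = pack('<Q', value)

    def spend_utxo(tx_hash, tx_idx, get_tx_hash):
        # Search the prefix space of the compressed hash, as the new
        # docstring puts it.  A real DB would use a prefix iterator
        # rather than a full scan of a dict.
        prefix = b'h' + tx_hash[:COMPRESSED_TX_HASH_LEN] + pack('<H', tx_idx)
        candidates = {k: v for k, v in h_table.items()
                      if k.startswith(prefix)}
        for hdb_key, hash168 in candidates.items():
            tx_num_packed = hdb_key[-4:]
            if len(candidates) > 1:
                # Collision: resolve it with the tx_num in the key
                tx_num, = unpack('<I', tx_num_packed)
                if get_tx_hash(tx_num) != tx_hash:
                    continue
            # Remove the UTXO from both tables and return
            # hash168 + tx_num + value (21 + 4 + 8 = 33 bytes)
            udb_key = b'u' + hash168 + hdb_key[-6:]
            value_packed = u_table.pop(udb_key)
            del h_table[hdb_key]
            return hash168 + tx_num_packed + value_packed
        raise KeyError('UTXO not found')

    # Two txs whose hashes collide in their first 4 bytes:
    tx_a = bytes.fromhex('deadbeef') + b'\x11' * 28
    tx_b = bytes.fromhex('deadbeef') + b'\x22' * 28
    put_utxo(tx_a, 0, 100, b'\xaa' * 21, 5000)  # old scheme: same 'h' key
    put_utxo(tx_b, 0, 200, b'\xbb' * 21, 7000)  # new scheme: unique keys
    result = spend_utxo(tx_a, 0, {100: tx_a, 200: tx_b}.get)
    assert result == b'\xaa' * 21 + pack('<I', 100) + pack('<Q', 5000)

This also shows why the logging hunk reports len(self.db_deletes) // 2:
with deletions queued as a list of keys instead of counted, each spent
UTXO appears to contribute two pending keys, one per table.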