# Copyright (c) 2016, Neil Booth
#
# All rights reserved.
#
# See the file "LICENCE" for information about the copyright
# and warranty status of this software.

'''UTXO and file cache.

During initial sync these cache data and only flush occasionally.
Once synced flushes are performed after processing each block.
'''

import array
import itertools
import os
import struct
from bisect import bisect_right

from lib.util import chunks, LoggedClass
from lib.hash import double_sha256, hash_to_str


# History can hold approx. 65536 * HIST_ENTRIES_PER_KEY entries
HIST_ENTRIES_PER_KEY = 1024
HIST_VALUE_BYTES = HIST_ENTRIES_PER_KEY * 4
ADDR_TX_HASH_LEN = 4
UTXO_TX_HASH_LEN = 4
# Sentinel values: an all-0xff hash168, and a full "missing" cache entry
# (hash168 + 12 zero bytes covering tx_num and value).
NO_HASH_168 = bytes([255]) * 21
NO_CACHE_ENTRY = NO_HASH_168 + bytes(12)


class UTXOCache(LoggedClass):
    '''An in-memory UTXO cache, representing all changes to UTXO state
    since the last DB flush.

    We want to store millions, perhaps 10s of millions of these in
    memory for optimal performance during initial sync, because then
    it is possible to spend UTXOs without ever going to the database
    (other than as an entry in the address history, and there is only
    one such entry per TX not per UTXO).  So store them in a Python
    dictionary with binary keys and values.

      Key:    TX_HASH + TX_IDX           (32 + 2 = 34 bytes)
      Value:  HASH168 + TX_NUM + VALUE   (21 + 4 + 8 = 33 bytes)

    That's 67 bytes of raw data.  Python dictionary overhead means
    each entry actually uses about 187 bytes of memory.  So almost
    11.5 million UTXOs can fit in 2GB of RAM.  There are approximately
    42 million UTXOs on bitcoin mainnet at height 433,000.

    Semantics:

      add:   Add it to the cache dictionary.
      spend: Remove it if in the cache dictionary.  Otherwise it's
             been flushed to the DB.  Each UTXO is responsible for two
             entries in the DB stored using compressed keys.  Mark
             both for deletion in the next flush of the in-memory UTXO
             cache.

    A UTXO is stored in the DB in 2 "tables":

      1.  The output value and tx number.  Must be keyed with a
          hash168 prefix so the unspent outputs and balance of an
          arbitrary address can be looked up with a simple key
          traversal.

          Key: b'u' + hash168 + compressed_tx_hash + tx_idx
          Value: a (tx_num, value) pair

      2.  Given a prevout, we need to be able to look up the UTXO key
          to remove it.  As is keyed by hash168 and that is not part
          of the prevout, we need a hash168 lookup.

          Key: b'h' + compressed tx_hash + tx_idx
          Value: (hash168, tx_num) pair

    The compressed TX hash is just the first few bytes of the hash of
    the TX the UTXO is in (and needn't be the same number of bytes in
    each table).  As this is not unique there will be collisions;
    tx_num is stored to resolve them.  The collision rate is around
    0.02% for the hash168 table, and almost zero for the UTXO table
    (there are around 100 collisions in the whole bitcoin blockchain).
    '''

    def __init__(self, get_tx_hash, db, coin):
        super().__init__()
        self.get_tx_hash = get_tx_hash
        self.coin = coin
        self.cache = {}
        # Bound-method shortcut: hot-path insertion into the cache dict.
        self.put = self.cache.__setitem__
        self.db = db
        self.db_cache = {}
        # Statistics
        self.cache_spends = 0
        self.db_deletes = 0

    def lookup(self, prev_hash, prev_idx):
        '''Given a prevout, return a pair (hash168, value).

        If the UTXO is not found, returns (None, None).'''
        # Fast track is it being in the cache
        idx_packed = struct.pack('<H', prev_idx)
        # TODO(review): the remainder of this method was destroyed when
        # this copy of the file was HTML-stripped — everything between
        # "struct.pack('<" and a much later ">" is missing.  Restore the
        # body from version control; do not re-type it from memory.


# NOTE(review): a large span of the original file is missing here due to
# the same HTML-stripping corruption.  It contained the rest of class
# UTXOCache (spend and DB-flush logic) and the class statement,
# docstring and __init__ of the file-system cache class whose surviving
# methods follow.  The class header below is a reconstruction (name and
# base inferred from the use of self.logger and the module's
# conventions) — verify against VCS.
class FSCache(LoggedClass):

    # NOTE(review): the original had a method here whose 'def' line was
    # lost in the corruption; its surviving tail unwound one block of
    # in-memory state:
    #     assert self.height >= 0
    #     # Just update in-memory.  It doesn't matter if disk files are
    #     # too long, they will be overwritten when advancing.
    #     self.height -= 1
    #     self.tx_counts.pop()
    # Restore its signature and any preceding statements from VCS.

    def flush(self, new_height, new_tx_count):
        '''Flush the things stored on the filesystem.

        The arguments are passed for sanity check assertions only.'''
        self.logger.info('flushing to file system')

        blocks_done = len(self.headers)
        prior_tx_count = (self.tx_counts[self.height]
                          if self.height >= 0 else 0)
        cur_tx_count = self.tx_counts[-1] if self.tx_counts else 0
        txs_done = cur_tx_count - prior_tx_count

        assert self.height + blocks_done == new_height
        assert len(self.tx_hashes) == blocks_done
        assert len(self.tx_counts) == new_height + 1
        assert cur_tx_count == new_tx_count, \
            'cur: {:,d} new: {:,d}'.format(cur_tx_count, new_tx_count)

        # First the headers
        headers = b''.join(self.headers)
        header_len = self.coin.HEADER_LEN
        self.headers_file.seek((self.height + 1) * header_len)
        self.headers_file.write(headers)
        self.headers_file.flush()

        # Then the tx counts
        self.txcount_file.seek((self.height + 1) * self.tx_counts.itemsize)
        self.txcount_file.write(self.tx_counts[self.height + 1:])
        self.txcount_file.flush()

        # Finally the hashes, split across the fixed-size hash files.
        hashes = memoryview(b''.join(itertools.chain(*self.tx_hashes)))
        assert len(hashes) % 32 == 0
        assert len(hashes) // 32 == txs_done
        cursor = 0
        file_pos = prior_tx_count * 32
        while cursor < len(hashes):
            file_num, offset = divmod(file_pos, self.tx_hash_file_size)
            size = min(len(hashes) - cursor,
                       self.tx_hash_file_size - offset)
            filename = 'hashes{:04d}'.format(file_num)
            with self.open_file(filename, create=True) as f:
                f.seek(offset)
                f.write(hashes[cursor:cursor + size])
            cursor += size
            file_pos += size

        os.sync()

        self.tx_hashes = []
        self.headers = []
        self.height += blocks_done

    def read_headers(self, start, count):
        '''Return count raw headers starting at height start.

        Flushed headers come from the headers file on disk; any
        unflushed remainder comes from the in-memory list.'''
        result = b''

        # Read some from disk
        disk_count = min(count, self.height + 1 - start)
        if disk_count > 0:
            header_len = self.coin.HEADER_LEN
            assert start >= 0
            self.headers_file.seek(start * header_len)
            result = self.headers_file.read(disk_count * header_len)
            count -= disk_count
            start += disk_count

        # The rest from memory
        start -= self.height + 1
        assert count >= 0 and start + count <= len(self.headers)
        result += b''.join(self.headers[start: start + count])

        return result

    def get_tx_hash(self, tx_num):
        '''Returns the tx_hash and height of a tx number.'''
        height = bisect_right(self.tx_counts, tx_num)

        # Is this on disk or unflushed?
        if height > self.height:
            tx_hashes = self.tx_hashes[height - (self.height + 1)]
            tx_hash = tx_hashes[tx_num - self.tx_counts[height - 1]]
        else:
            file_pos = tx_num * 32
            file_num, offset = divmod(file_pos, self.tx_hash_file_size)
            filename = 'hashes{:04d}'.format(file_num)
            with self.open_file(filename) as f:
                f.seek(offset)
                tx_hash = f.read(32)

        return tx_hash, height

    def block_hashes(self, height, count):
        '''Return the double-SHA256 block hash for each of count
        consecutive headers starting at the given height.'''
        headers = self.read_headers(height, count)
        hlen = self.coin.HEADER_LEN
        return [double_sha256(header) for header in chunks(headers, hlen)]