# See the file "LICENSE" for information about the copyright # and warranty status of this software. import array import ast import itertools import os import struct import time from binascii import hexlify, unhexlify from bisect import bisect_right from collections import defaultdict, namedtuple from functools import partial import logging import plyvel from lib.coins import Bitcoin from lib.script import ScriptPubKey # History can hold approx. 65536 * HIST_ENTRIES_PER_KEY entries HIST_ENTRIES_PER_KEY = 1024 HIST_VALUE_BYTES = HIST_ENTRIES_PER_KEY * 4 ADDR_TX_HASH_LEN = 4 UTXO_TX_HASH_LEN = 4 UTXO = namedtuple("UTXO", "tx_num tx_pos tx_hash height value") def formatted_time(t): t = int(t) return '{:d}d {:02d}h {:02d}m {:02d}s'.format( t // 86400, (t % 86400) // 3600, (t % 3600) // 60, t % 60) class UTXOCache(object): '''An in-memory UTXO cache, representing all changes to UTXO state since the last DB flush. We want to store millions, perhaps 10s of millions of these in memory for optimal performance during initial sync, because then it is possible to spend UTXOs without ever going to the database (other than as an entry in the address history, and there is only one such entry per TX not per UTXO). So store them in a Python dictionary with binary keys and values. Key: TX_HASH + TX_IDX (32 + 2 = 34 bytes) Value: HASH168 + TX_NUM + VALUE (21 + 4 + 8 = 33 bytes) That's 67 bytes of raw data. Python dictionary overhead means each entry actually uses about 187 bytes of memory. So almost 11.5 million UTXOs can fit in 2GB of RAM. There are approximately 42 million UTXOs on bitcoin mainnet at height 433,000. Semantics: add: Add it to the cache dictionary. spend: Remove it if in the cache dictionary. Otherwise it's been flushed to the DB. Each UTXO is responsible for two entries in the DB stored using compressed keys. Mark both for deletion in the next flush of the in-memory UTXO cache. A UTXO is stored in the DB in 2 "tables": 1. The output value and tx number. Must be keyed with a hash168 prefix so the unspent outputs and balance of an arbitrary address can be looked up with a simple key traversal. Key: b'u' + hash168 + compressed_tx_hash + tx_idx Value: a (tx_num, value) pair 2. Given a prevout, we need to be able to look up the UTXO key to remove it. As is keyed by hash168 and that is not part of the prevout, we need a hash168 lookup. Key: b'h' + compressed tx_hash + tx_idx Value: (hash168, tx_num) pair The compressed TX hash is just the first few bytes of the hash of the TX the UTXO is in (and needn't be the same number of bytes in each table). As this is not unique there will be collisions; tx_num is stored to resolve them. The collision rate is around 0.02% for the hash168 table, and almost zero for the UTXO table (there are around 100 collisions in the whole bitcoin blockchain). ''' def __init__(self, parent, db, coin): self.logger = logging.getLogger('UTXO') self.logger.setLevel(logging.INFO) self.parent = parent self.coin = coin self.cache = {} self.db = db self.db_cache = {} # Statistics self.adds = 0 self.cache_hits = 0 self.db_deletes = 0 def add_many(self, tx_hash, tx_num, txouts): '''Add a sequence of UTXOs to the cache, return the set of hash168s seen. Pass the hash of the TX it appears in, its TX number, and the TX outputs. 
    def __init__(self, parent, db, coin):
        self.logger = logging.getLogger('UTXO')
        self.logger.setLevel(logging.INFO)
        self.parent = parent
        self.coin = coin
        self.cache = {}
        self.db = db
        self.db_cache = {}
        # Statistics
        self.adds = 0
        self.cache_hits = 0
        self.db_deletes = 0

    def add_many(self, tx_hash, tx_num, txouts):
        '''Add a sequence of UTXOs to the cache, return the set of hash168s
        seen.

        Pass the hash of the TX it appears in, its TX number, and the
        TX outputs.
        '''
        parse_script = ScriptPubKey.from_script
        pack = struct.pack
        tx_numb = pack('<I', tx_num)

            # Collect history keys written by flushes newer than the most
            # recent UTXO flush, then delete them below.
            flush_id, = struct.unpack('>H', key[-2:])
            if flush_id > self.utxo_flush_count:
                keys.append(key)

        self.logger.info('deleting {:,d} history entries'.format(len(keys)))
        with db.write_batch(transaction=True) as batch:
            for key in keys:
                db.delete(key)
            self.utxo_flush_count = self.flush_count
            self.flush_state(batch)
        self.logger.info('deletion complete')

    def flush_to_fs(self):
        '''Flush the things stored on the filesystem.'''
        # First the headers
        headers = b''.join(self.headers)
        header_len = self.coin.HEADER_LEN
        self.headers_file.seek((self.fs_height + 1) * header_len)
        self.headers_file.write(headers)
        self.headers_file.flush()
        self.headers = []

        # Then the tx counts
        self.txcount_file.seek((self.fs_height + 1)
                               * self.tx_counts.itemsize)
        self.txcount_file.write(self.tx_counts[self.fs_height + 1:
                                               self.height + 1])
        self.txcount_file.flush()

        # Finally the hashes
        hashes = memoryview(b''.join(itertools.chain(*self.tx_hashes)))
        assert len(hashes) % 32 == 0
        assert self.tx_hash_file_size % 32 == 0
        cursor = 0
        file_pos = self.fs_tx_count * 32
        while cursor < len(hashes):
            file_num, offset = divmod(file_pos, self.tx_hash_file_size)
            size = min(len(hashes) - cursor, self.tx_hash_file_size - offset)
            filename = 'hashes{:04d}'.format(file_num)
            with self.open_file(filename, create=True) as f:
                f.seek(offset)
                f.write(hashes[cursor:cursor + size])
            cursor += size
            file_pos += size
        self.tx_hashes = []

        self.fs_height = self.height
        self.fs_tx_count = self.tx_count
        os.sync()

    def flush_state(self, batch):
        '''Flush chain state to the batch.'''
        now = time.time()
        self.wall_time += now - self.last_flush
        self.last_flush = now
        state = {
            'genesis': self.coin.GENESIS_HASH,
            'height': self.db_height,
            'tx_count': self.db_tx_count,
            'tip': self.tip,
            'flush_count': self.flush_count,
            'utxo_flush_count': self.utxo_flush_count,
            'wall_time': self.wall_time,
        }
        batch.put(b'state', repr(state).encode('ascii'))
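    # A minimal sketch, not part of this module, of how the b'state'
    # record written by flush_state() could be read back at startup.
    # It assumes only that the value is the ascii repr() of a plain
    # dict, so ast.literal_eval (presumably why 'ast' is imported
    # above) can rebuild it:
    #
    #   raw = db.get(b'state')
    #   state = ast.literal_eval(raw.decode('ascii'))
    #   assert state['genesis'] == coin.GENESIS_HASH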
    def flush_utxos(self, batch):
        self.logger.info('flushing UTXOs: {:,d} txs and {:,d} blocks'
                         .format(self.tx_count - self.db_tx_count,
                                 self.height - self.db_height))
        self.utxo_cache.flush(batch)
        self.utxo_flush_count = self.flush_count
        self.db_tx_count = self.tx_count
        self.db_height = self.height

    def flush(self, daemon_height, flush_utxos=False):
        '''Flush out cached state.

        History is always flushed.  UTXOs are flushed if flush_utxos.
        '''
        flush_start = time.time()
        last_flush = self.last_flush

        # Write out the files to the FS before flushing to the DB.  If
        # the DB transaction fails, the files being too long doesn't
        # matter.  But if writing the files fails we do not want to
        # have updated the DB.
        self.logger.info('commencing history flush')
        self.flush_to_fs()

        with self.db.write_batch(transaction=True) as batch:
            # History first - fast and frees memory.  Flush state last
            # as it reads the wall time.
            self.flush_history(batch)
            if flush_utxos:
                self.flush_utxos(batch)
            self.flush_state(batch)
            self.logger.info('committing transaction...')

        # Update and put the wall time again - otherwise we drop the
        # time it took leveldb to commit the batch
        self.flush_state(self.db)

        flush_time = int(self.last_flush - flush_start)
        self.logger.info('flush #{:,d} to height {:,d} took {:,d}s'
                         .format(self.flush_count, self.height, flush_time))

        # Log handy stats
        tx_diff = self.tx_count - self.fs_tx_count
        txs_per_sec = int(self.tx_count / self.wall_time)
        this_txs_per_sec = 1 + int(tx_diff / (self.last_flush - last_flush))
        if self.height > self.coin.TX_COUNT_HEIGHT:
            tx_est = (daemon_height - self.height) * self.coin.TX_PER_BLOCK
        else:
            tx_est = ((daemon_height - self.coin.TX_COUNT_HEIGHT)
                      * self.coin.TX_PER_BLOCK
                      + (self.coin.TX_COUNT - self.tx_count))

        self.logger.info('txs: {:,d} tx/sec since genesis: {:,d}, '
                         'since last flush: {:,d}'
                         .format(self.tx_count, txs_per_sec,
                                 this_txs_per_sec))
        self.logger.info('sync time: {} ETA: {}'
                         .format(formatted_time(self.wall_time),
                                 formatted_time(tx_est / this_txs_per_sec)))

    def flush_history(self, batch):
        # Drop any None entry
        self.history.pop(None, None)

        self.flush_count += 1
        flush_id = struct.pack('>H', self.flush_count)
        for hash168, hist in self.history.items():
            key = b'H' + hash168 + flush_id
            batch.put(key, hist.tobytes())

        self.logger.info('{:,d} history entries in {:,d} addrs'
                         .format(self.history_size, len(self.history)))

        self.history = defaultdict(partial(array.array, 'I'))
        self.history_size = 0

    def open_file(self, filename, create=False):
        '''Open the file name.  Return its handle.'''
        try:
            return open(filename, 'rb+')
        except FileNotFoundError:
            if create:
                return open(filename, 'wb+')
            raise

    def read_headers(self, height, count):
        header_len = self.coin.HEADER_LEN
        self.headers_file.seek(height * header_len)
        return self.headers_file.read(count * header_len)

    def cache_sizes(self, daemon_height):
        '''Returns the approximate size of the cache, in MB.'''
        # Good average estimates based on traversal of subobjects and
        # requesting size from Python (see deep_getsizeof).  For
        # whatever reason Python O/S mem usage is typically +30% or
        # more, so we scale our already bloated object sizes.
        one_MB = int(1048576 / 1.3)
        utxo_cache_size = len(self.utxo_cache.cache) * 187
        db_cache_size = len(self.utxo_cache.db_cache) * 105
        hist_cache_size = len(self.history) * 180 + self.history_size * 4
        utxo_MB = (db_cache_size + utxo_cache_size) // one_MB
        hist_MB = hist_cache_size // one_MB

        self.logger.info('cache stats at height {:,d} daemon height: {:,d}'
                         .format(self.height, daemon_height))
        self.logger.info(' entries: UTXO: {:,d} DB: {:,d} '
                         'hist addrs: {:,d} hist size: {:,d}'
                         .format(len(self.utxo_cache.cache),
                                 len(self.utxo_cache.db_cache),
                                 len(self.history),
                                 self.history_size))
        self.logger.info(' size: {:,d}MB (UTXOs {:,d}MB hist {:,d}MB)'
                         .format(utxo_MB + hist_MB, utxo_MB, hist_MB))
        return utxo_MB, hist_MB
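    # Worked example of the estimate above (not from the original source;
    # the figures follow directly from the constants in cache_sizes):
    # 1,000,000 cached UTXOs are costed at 1,000,000 * 187 bytes, and
    # one_MB = int(1048576 / 1.3) = 806,596 bytes, so they are reported
    # as roughly 187,000,000 // 806,596 = 231 "MB" of UTXO cache.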
    def process_block(self, block, daemon_height):
        self.headers.append(block[:self.coin.HEADER_LEN])
        tx_hashes, txs = self.coin.read_block(block)
        self.height += 1

        assert len(self.tx_counts) == self.height

        # These both need to be updated before calling process_tx().
        # It uses them for tx hash lookup
        self.tx_hashes.append(tx_hashes)
        self.tx_counts.append(self.tx_count + len(txs))

        for tx_hash, tx in zip(tx_hashes, txs):
            self.process_tx(tx_hash, tx)

        # Check whether the cache is getting full and it's time to flush.
        now = time.time()
        if now > self.next_cache_check:
            self.next_cache_check = now + 60
            utxo_MB, hist_MB = self.cache_sizes(daemon_height)
            if utxo_MB >= self.utxo_MB or hist_MB >= self.hist_MB:
                self.flush(daemon_height, utxo_MB >= self.utxo_MB)

    def process_tx(self, tx_hash, tx):
        cache = self.utxo_cache
        tx_num = self.tx_count

        # Add the outputs as new UTXOs; spend the inputs
        hash168s = cache.add_many(tx_hash, tx_num, tx.outputs)
        if not tx.is_coinbase:
            for txin in tx.inputs:
                hash168s.add(cache.spend(txin.prevout))

        for hash168 in hash168s:
            self.history[hash168].append(tx_num)
        self.history_size += len(hash168s)

        self.tx_count += 1

    def get_tx_hash(self, tx_num):
        '''Returns the tx_hash and height of a tx number.'''
        height = bisect_right(self.tx_counts, tx_num)

        # Is this on disk or unflushed?
        if height > self.fs_height:
            tx_hashes = self.tx_hashes[height - (self.fs_height + 1)]
            tx_hash = tx_hashes[tx_num - self.tx_counts[height - 1]]
        else:
            file_pos = tx_num * 32
            file_num, offset = divmod(file_pos, self.tx_hash_file_size)
            filename = 'hashes{:04d}'.format(file_num)
            with self.open_file(filename) as f:
                f.seek(offset)
                tx_hash = f.read(32)

        return tx_hash, height

    @staticmethod
    def resolve_limit(limit):
        if limit is None:
            return -1
        assert isinstance(limit, int) and limit >= 0
        return limit

    def get_history(self, hash168, limit=1000):
        '''Generator that yields an unpruned, sorted sequence of (tx_hash,
        height) pairs of transactions that touched the address,
        earliest in the blockchain first.  Includes both spending and
        receiving transactions.  By default yields at most 1000
        entries.  Set limit to None to get them all.
        '''
        limit = self.resolve_limit(limit)
        prefix = b'H' + hash168
        for key, hist in self.db.iterator(prefix=prefix):
            a = array.array('I')
            a.frombytes(hist)
            for tx_num in a:
                if limit == 0:
                    return
                yield self.get_tx_hash(tx_num)
                limit -= 1

    def get_balance(self, hash168):
        '''Returns the confirmed balance of an address.'''
        return sum(utxo.value for utxo in self.get_utxos(hash168, limit=None))

    def get_utxos(self, hash168, limit=1000):
        '''Generator that yields all UTXOs for an address, in no particular
        order.  By default yields at most 1000 entries.  Set limit to
        None to get them all.
        '''
        limit = self.resolve_limit(limit)
        unpack = struct.unpack
        prefix = b'u' + hash168
        utxos = []
        for k, v in self.db.iterator(prefix=prefix):
            (tx_pos, ) = unpack('