# See the file "LICENSE" for information about the copyright
# and warranty status of this software.

import array
import ast
import itertools
import os
import struct
import time
from binascii import hexlify, unhexlify
from bisect import bisect_right
from collections import defaultdict, namedtuple
from functools import partial
import logging

import plyvel

from lib.coins import Bitcoin
from lib.script import ScriptPubKey


# History can hold approx. 65536 * HIST_ENTRIES_PER_KEY entries
HIST_ENTRIES_PER_KEY = 1024
HIST_VALUE_BYTES = HIST_ENTRIES_PER_KEY * 4
ADDR_TX_HASH_LEN = 4
UTXO_TX_HASH_LEN = 4

UTXO = namedtuple("UTXO", "tx_num tx_pos tx_hash height value")


def formatted_time(t):
    '''Return a number of seconds as a string in days, hours, minutes
    and seconds.'''
    t = int(t)
    return '{:d}d {:02d}h {:02d}m {:02d}s'.format(
        t // 86400, (t % 86400) // 3600, (t % 3600) // 60, t % 60)


class UTXOCache(object):
    '''An in-memory UTXO cache, representing all changes to UTXO state
    since the last DB flush.

    We want to store millions, perhaps 10s of millions of these in
    memory for optimal performance during initial sync, because then
    it is possible to spend UTXOs without ever going to the database
    (other than as an entry in the address history, and there is only
    one such entry per TX not per UTXO).  So store them in a Python
    dictionary with binary keys and values.

      Key:    TX_HASH + TX_IDX            (32 + 2 = 34 bytes)
      Value:  HASH168 + TX_NUM + VALUE    (21 + 4 + 8 = 33 bytes)

    That's 67 bytes of raw data.  Python dictionary overhead means
    each entry actually uses about 187 bytes of memory.  So almost
    11.5 million UTXOs can fit in 2GB of RAM.  There are approximately
    42 million UTXOs on bitcoin mainnet at height 433,000.

    Semantics:

      add:   Add it to the cache dictionary.
      spend: Remove it if in the cache dictionary.  Otherwise it's
             been flushed to the DB.  Each UTXO is responsible for two
             entries in the DB stored using compressed keys.  Mark
             both for deletion in the next flush of the in-memory
             UTXO cache.

    A UTXO is stored in the DB in 2 "tables":

      1. The output value and tx number.  Must be keyed with a hash168
         prefix so the unspent outputs and balance of an arbitrary
         address can be looked up with a simple key traversal.

         Key:    b'u' + hash168 + compressed_tx_hash + tx_idx
         Value:  a (tx_num, value) pair

      2. Given a prevout, we need to be able to look up the UTXO key
         to remove it.  As it is keyed by hash168, and that is not
         part of the prevout, we need a hash168 lookup.

         Key:    b'h' + compressed_tx_hash + tx_idx
         Value:  a (hash168, tx_num) pair

    The compressed TX hash is just the first few bytes of the hash of
    the TX the UTXO is in (and needn't be the same number of bytes in
    each table).  As this is not unique there will be collisions;
    tx_num is stored to resolve them.  The collision rate is around
    0.02% for the hash168 table, and almost zero for the UTXO table
    (there are around 100 collisions in the whole bitcoin blockchain).
    '''

    def __init__(self, parent, db, coin):
        self.logger = logging.getLogger('UTXO')
        self.logger.setLevel(logging.INFO)
        self.parent = parent
        self.coin = coin
        self.cache = {}
        self.db = db
        self.db_cache = {}
        # Statistics
        self.adds = 0
        self.cache_hits = 0
        self.db_deletes = 0
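    # A minimal sketch, not part of the original file: packing one
    # entry of the in-memory cache with the layout the class docstring
    # gives.  The key is TX_HASH + TX_IDX (32 + 2 bytes) and the value
    # is HASH168 + TX_NUM + VALUE (21 + 4 + 8 bytes).  The helper name
    # and the little-endian '<H' / '<IQ' formats are assumptions,
    # consistent with those field widths and with the 4-byte tx_num
    # packed in add_many() below.
    @staticmethod
    def cache_entry(tx_hash, tx_idx, hash168, tx_num, value):
        '''Return a binary (key, value) pair in the cache's layout.'''
        key = tx_hash + struct.pack('<H', tx_idx)           # 34 bytes
        data = hash168 + struct.pack('<IQ', tx_num, value)  # 33 bytes
        assert len(key) == 34 and len(data) == 33
        return key, data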
    def add_many(self, tx_hash, tx_num, txouts):
        '''Add a sequence of UTXOs to the cache, return the set of
        hash168s seen.

        Pass the hash of the TX it appears in, its TX number, and the
        TX outputs.
        '''
        parse_script = ScriptPubKey.from_script
        pack = struct.pack
        tx_numb = pack('<I', tx_num)
        # (The remainder of this method is missing from this copy of
        # the file.)


# (A large span of the original file is missing here: the rest of
# UTXOCache, including the spend() method called from process_tx()
# below, and the start of the database class, including its __init__.
# The class name DB is an assumption; its surviving methods follow.)

class DB(object):

    def flush_all(self, daemon_height):
        # (The start of this method is missing from this copy; it
        # computes txs_per_sec and this_txs_per_sec, used below.)
        if self.height > self.coin.TX_COUNT_HEIGHT:
            tx_est = (daemon_height - self.height) * self.coin.TX_PER_BLOCK
        else:
            tx_est = ((daemon_height - self.coin.TX_COUNT_HEIGHT)
                      * self.coin.TX_PER_BLOCK + self.coin.TX_COUNT)

        self.logger.info('txs: {:,d} tx/sec since genesis: {:,d}, '
                         'since last flush: {:,d}'
                         .format(self.tx_count, txs_per_sec,
                                 this_txs_per_sec))
        self.logger.info('sync time: {} ETA: {}'
                         .format(formatted_time(self.wall_time),
                                 formatted_time(tx_est / this_txs_per_sec)))

    def flush_to_fs(self):
        '''Flush the things stored on the filesystem.'''
        self.write_headers()
        self.write_tx_counts()
        self.write_tx_hashes()
        os.sync()

    def flush_history(self, batch):
        # Drop any None entry
        self.history.pop(None, None)

        self.flush_count += 1
        flush_id = struct.pack('>H', self.flush_count)
        for hash168, hist in self.history.items():
            key = b'H' + hash168 + flush_id
            batch.put(key, hist.tobytes())

        self.logger.info('flushed {:,d} history entries in {:,d} addrs...'
                         .format(self.history_size, len(self.history)))

        self.history = defaultdict(partial(array.array, 'I'))
        self.history_size = 0

    def open_file(self, filename, create=False):
        '''Open the file name.  Return its handle.'''
        try:
            return open(filename, 'rb+')
        except FileNotFoundError:
            if create:
                return open(filename, 'wb+')
            raise

    def read_headers(self, height, count):
        header_len = self.coin.HEADER_LEN
        self.headers_file.seek(height * header_len)
        return self.headers_file.read(count * header_len)

    def write_headers(self):
        headers = b''.join(self.headers)
        header_len = self.coin.HEADER_LEN
        assert len(headers) % header_len == 0
        self.headers_file.seek((self.db_height + 1) * header_len)
        self.headers_file.write(headers)
        self.headers_file.flush()
        self.headers = []

    def write_tx_counts(self):
        self.txcount_file.seek((self.db_height + 1) * self.tx_counts.itemsize)
        self.txcount_file.write(self.tx_counts[self.db_height + 1:
                                               self.height + 1])
        self.txcount_file.flush()

    def write_tx_hashes(self):
        hash_blob = b''.join(itertools.chain(*self.tx_hashes))
        assert len(hash_blob) % 32 == 0
        assert self.tx_hash_file_size % 32 == 0
        hashes = memoryview(hash_blob)
        cursor = 0
        file_pos = self.db_tx_count * 32
        while cursor < len(hashes):
            file_num, offset = divmod(file_pos, self.tx_hash_file_size)
            size = min(len(hashes) - cursor, self.tx_hash_file_size - offset)
            filename = 'hashes{:04d}'.format(file_num)
            with self.open_file(filename, create=True) as f:
                f.seek(offset)
                f.write(hashes[cursor:cursor + size])
            cursor += size
            file_pos += size
        self.tx_hashes = []

    def cache_size(self, daemon_height):
        '''Returns the approximate size of the cache, in MB.'''
        # Good average estimates
        utxo_cache_size = len(self.utxo_cache.cache) * 187
        db_cache_size = len(self.utxo_cache.db_cache) * 105
        hist_cache_size = len(self.history) * 180 + self.history_size * 4
        utxo_MB = (db_cache_size + utxo_cache_size) // 1048576
        hist_MB = hist_cache_size // 1048576
        cache_MB = utxo_MB + hist_MB

        self.logger.info('cache stats at height {:,d} daemon height: {:,d}'
                         .format(self.height, daemon_height))
        self.logger.info('  entries: UTXO: {:,d} DB: {:,d} '
                         'hist count: {:,d} hist size: {:,d}'
                         .format(len(self.utxo_cache.cache),
                                 len(self.utxo_cache.db_cache),
                                 len(self.history),
                                 self.history_size))
        self.logger.info('  size: {:,d}MB (UTXOs {:,d}MB hist {:,d}MB)'
                         .format(cache_MB, utxo_MB, hist_MB))
        return cache_MB
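    # A minimal sketch, not in the original file: where a transaction
    # hash lives on disk.  It repeats the arithmetic used by
    # write_tx_hashes() above and get_tx_hash() below: hashes are
    # packed back-to-back, 32 bytes each, across files of
    # tx_hash_file_size bytes named 'hashes0000', 'hashes0001', and so
    # on.  The helper name is hypothetical.
    def tx_hash_location(self, tx_num):
        '''Return the (filename, offset) at which tx_num's hash lives.'''
        file_pos = tx_num * 32
        file_num, offset = divmod(file_pos, self.tx_hash_file_size)
        return 'hashes{:04d}'.format(file_num), offset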
    def process_block(self, block, daemon_height):
        self.headers.append(block[:self.coin.HEADER_LEN])
        tx_hashes, txs = self.coin.read_block(block)
        self.height += 1
        assert len(self.tx_counts) == self.height

        # These both need to be updated before calling process_tx().
        # It uses them for tx hash lookup
        self.tx_hashes.append(tx_hashes)
        self.tx_counts.append(self.tx_count + len(txs))

        for tx_hash, tx in zip(tx_hashes, txs):
            self.process_tx(tx_hash, tx)

        # Check if the cache is getting full and it's time to flush
        now = time.time()
        if now > self.next_cache_check:
            self.next_cache_check = now + 60
            if self.cache_size(daemon_height) > self.flush_MB:
                self.flush_all(daemon_height)

    def process_tx(self, tx_hash, tx):
        cache = self.utxo_cache
        tx_num = self.tx_count

        # Add the outputs as new UTXOs; spend the inputs
        hash168s = cache.add_many(tx_hash, tx_num, tx.outputs)
        if not tx.is_coinbase:
            for txin in tx.inputs:
                hash168s.add(cache.spend(txin.prevout))

        for hash168 in hash168s:
            self.history[hash168].append(tx_num)
        self.history_size += len(hash168s)

        self.tx_count += 1

    def get_tx_hash(self, tx_num):
        '''Returns the tx_hash and height of a tx number.'''
        height = bisect_right(self.tx_counts, tx_num)

        # Is this on disk or unflushed?
        if height > self.db_height:
            tx_hashes = self.tx_hashes[height - (self.db_height + 1)]
            tx_hash = tx_hashes[tx_num - self.tx_counts[height - 1]]
        else:
            file_pos = tx_num * 32
            file_num, offset = divmod(file_pos, self.tx_hash_file_size)
            filename = 'hashes{:04d}'.format(file_num)
            with self.open_file(filename) as f:
                f.seek(offset)
                tx_hash = f.read(32)

        return tx_hash, height

    @staticmethod
    def resolve_limit(limit):
        if limit is None:
            return -1
        assert isinstance(limit, int) and limit >= 0
        return limit

    def get_history(self, hash168, limit=1000):
        '''Generator that yields sorted, unpruned (tx_hash, height)
        tuples of transactions that touched the address, earliest in
        the blockchain first.  Includes both spending and receiving
        transactions.  By default yields at most 1000 entries.  Set
        limit to None to get them all.
        '''
        limit = self.resolve_limit(limit)
        prefix = b'H' + hash168
        for key, hist in self.db.iterator(prefix=prefix):
            a = array.array('I')
            a.frombytes(hist)
            for tx_num in a:
                if limit == 0:
                    return
                yield self.get_tx_hash(tx_num)
                limit -= 1

    def get_balance(self, hash168):
        '''Returns the confirmed balance of an address.'''
        return sum(utxo.value for utxo in self.get_utxos(hash168, limit=None))

    def get_utxos(self, hash168, limit=1000):
        '''Generator that yields all UTXOs for an address, in no
        particular order.  By default yields at most 1000 entries.
        Set limit to None to get them all.
        '''
        limit = self.resolve_limit(limit)
        unpack = struct.unpack
        prefix = b'u' + hash168
        utxos = []
        for k, v in self.db.iterator(prefix=prefix):
            (tx_pos, ) = unpack('<H', k[-2:])
            # (the remainder of this method is missing from this copy
            # of the file)
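# A minimal usage sketch, not part of the original file.  `db` is
# assumed to be an instance of the database class above, and `hash168`
# the 21-byte address hash used as the key prefix; the function name is
# hypothetical.
def print_address_summary(db, hash168):
    '''Print an address's confirmed balance and its earliest ten txs.'''
    print('balance: {:,d}'.format(db.get_balance(hash168)))
    # get_history() yields (tx_hash, height) pairs, earliest first
    for tx_hash, height in db.get_history(hash168, limit=10):
        print(hexlify(tx_hash).decode(), height)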