package com.xsn.explorer.gcs import com.google.common.hash.Hashing import com.xsn.explorer.models.persisted.Block import scala.collection.SortedSet /** * A Golomb-coded set, matches all items in the set with probability 1, and matches other items with probability 1/M. * * The encoding is also parameterized by P, the bit length of the remainder code. * * see https://github.com/bitcoin/bips/blob/master/bip-0158.mediawikis */ class GolombEncoding(p: Int, m: Int, key: SipHashKey) { require(p > 1 && p < 31) private val hasher = Hashing.sipHash24(key.k0, key.k1) /** * Encodes the given word set. */ def encode(words: Set[String]): Option[GolombCodedSet] = { if (words.isEmpty) { Option.empty } else { val gcs = encodeNonEmptySet(words) Option(gcs) } } private def encodeNonEmptySet(words: Set[String]): GolombCodedSet = { val sortedHashes = hashes(words) val diffList = differences(sortedHashes) val encodedBits = diffList.flatMap(golombEncode) val encodedBytes = encodedBits .grouped(8) .map { bits => UnsignedByte.parse(bits.padTo(8, Bit.Zero)) } .toList GolombCodedSet.apply( p = p, m = m, n = words.size, data = encodedBytes) } /** * Recovers the hashes from the encoded bytes. * * This method doesn't handle corrupted inputs, which shouldn't be a problem because * the method is used only to verify that the filter is correct. * * @param encoded the encoded bytes, we expect them to be correct * @param n the number of words encoded in the bytes * @return the recovered sorted set of hashes */ private[gcs] def decode(encoded: List[UnsignedByte], n: Int): SortedSet[BigInt] = { val encodedBits = encoded.flatMap(_.bits) val (_, _, result) = List.fill(n)(0) .foldLeft((encodedBits, BigInt(0), List.empty[BigInt])) { case ((bits, acc, hashes), _) => val (remaining, delta) = golombDecode(bits) val hash = acc + delta (remaining, hash, hash :: hashes) } result.to[SortedSet] } /** * Maps the word set to a sorted set of hashes. */ private[gcs] def hashes(words: Set[String]): SortedSet[BigInt] = { val modulus = BigInt(m) * words.size val f = fastReduction(_: BigInt, modulus) words .map(hash) .map(f) .to[SortedSet] } private def golombEncode(x: BigInt): List[Bit] = { val q = (x >> p).toInt val r = (x & ((1 << p)-1)).toInt val qBits = List.fill[Bit](q)(Bit.One) :+ Bit.Zero val rBits = toBits(r, p) qBits ++ rBits } private def golombDecode(bits: List[Bit]): (List[Bit], BigInt) = { val q = bits.takeWhile(_ == Bit.One).size val rBits = bits.drop(q + 1).take(p) val r = toBigInt(rBits) val x = (q * (1L << p)) + r val pending = bits.drop(q + 1 + p) (pending, x) } private def differences(sortedHashes: SortedSet[BigInt]): List[BigInt] = { (BigInt(0) :: sortedHashes.toList) .sliding(2) .map { case a :: b :: Nil => b - a } .toList } private def hash(string: String): BigInt = { val x = hasher.hashBytes(string.getBytes) BigInt(java.lang.Long.toUnsignedString(x.asLong())) } private def toBigInt(bits: List[Bit]): BigInt = { bits.foldLeft(BigInt(0)) { case (acc, cur) => (acc * 2) + cur.toInt } } private def toBits(x: Long, size: Int): List[Bit] = { val bits = x .toBinaryString .flatMap(Bit.from) .toList List.fill(size - bits.size)(Bit.Zero) ++ bits } /** * NOTE: This is a copy from https://github.com/btcsuite/btcutil/blob/master/gcs/gcs.go * that is used for compatibility reasons, here we don't care about such optimizations * because a filter is built once per block and never queried. * * Original docs: * fastReduction calculates a mapping that's more ore less equivalent to: x mod N. * * However, instead of using a mod operation, which using a non-power of two * will lead to slowness on many processors due to unnecessary division, we * instead use a "multiply-and-shift" trick which eliminates all divisions, * described in: * https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ * * * v * N >> log_2(N) * * In our case, using 64-bit integers, log_2 is 64. As most processors don't * support 128-bit arithmetic natively, we'll be super portable and unfold the * operation into several operations with 64-bit arithmetic. As inputs, we the * number to reduce, and our modulus N divided into its high 32-bits and lower * 32-bits. */ private def fastReduction(v: BigInt, modulus: BigInt): BigInt = { val nHi = modulus >> 32 val nLo = modulus & 0xFFFFFFFFL // First, we'll spit the item we need to reduce into its higher and lower bits. val vhi = v >> 32 val vlo = v & 0xFFFFFFFFL // Then, we distribute multiplication over each part. val vnphi = vhi * nHi val vnpmid = vhi * nLo val npvmid = nHi * vlo val vnplo = vlo * nLo // We calculate the carry bit. val carry = ((vnpmid & 0xFFFFFFFFL) + (npvmid & 0xFFFFFFFFL) + (vnplo >> 32)) >> 32 // Last, we add the high bits, the middle bits, and the carry. val result = vnphi + (vnpmid >> 32) + (npvmid >> 32) + carry result } } object GolombEncoding { val DefaultP = 19 val DefaultM = 784931 def default(key: SipHashKey): GolombEncoding = { new GolombEncoding(p = DefaultP, m = DefaultM, key = key) } def encode(block: Block.HasTransactions): Option[GolombCodedSet] = { val key = SipHashKey.fromBtcutil(block.hash) val encoder = default(key) val addresses = block.collectAddresses encoder.encode(addresses.map(_.string)) } }