You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

191 lines
5.7 KiB

package com.xsn.explorer.gcs
import com.google.common.hash.Hashing
import com.xsn.explorer.models.persisted.Block
import scala.collection.SortedSet
/**
 * A Golomb-coded set, matches all items in the set with probability 1, and matches other items with probability 1/M.
 *
 * The encoding is also parameterized by P, the bit length of the remainder code.
 *
 * see https://github.com/bitcoin/bips/blob/master/bip-0158.mediawiki
 *
 * @param p the bit length of the remainder code, it must satisfy 1 < p < 31
 * @param m multiplied by the number of words to get the exclusive upper bound of the reduced hashes
 * @param key the SipHash key used to hash every word
 */
class GolombEncoding(p: Int, m: Int, key: SipHashKey) {

  require(p > 1 && p < 31, s"p must be between 2 and 30 (inclusive), got $p")

  private val hasher = Hashing.sipHash24(key.k0, key.k1)

  /**
   * Encodes the given word set.
   *
   * @param words the words to encode
   * @return the encoded set, or None when there are no words to encode
   */
  def encode(words: Set[String]): Option[GolombCodedSet] = {
    if (words.isEmpty) None
    else Some(encodeNonEmptySet(words))
  }

  /**
   * Encodes a set that is known to be non-empty.
   *
   * The hashes are taken from a sorted list instead of a set, otherwise two words
   * colliding on the same reduced hash would be merged into one value while n still
   * counts both of them. A collision is encoded as a zero delta, which keeps the
   * number of encoded values equal to n (the same is done by btcutil and BIP-158).
   */
  private def encodeNonEmptySet(words: Set[String]): GolombCodedSet = {
    val deltas = differences(sortedHashList(words))
    val encodedBits = deltas.flatMap(golombEncode)

    // the last group is padded with zeros to complete a whole byte
    val encodedBytes = encodedBits
      .grouped(8)
      .map { bits => UnsignedByte.parse(bits.padTo(8, Bit.Zero)) }
      .toList

    GolombCodedSet.apply(
      p = p,
      m = m,
      n = words.size,
      data = encodedBytes)
  }

  /**
   * Recovers the hashes from the encoded bytes.
   *
   * This method doesn't handle corrupted inputs, which shouldn't be a problem because
   * the method is used only to verify that the filter is correct.
   *
   * @param encoded the encoded bytes, we expect them to be correct
   * @param n the number of words encoded in the bytes
   * @return the recovered sorted set of hashes
   */
  private[gcs] def decode(encoded: List[UnsignedByte], n: Int): SortedSet[BigInt] = {
    val encodedBits = encoded.flatMap(_.bits)

    // each step consumes one encoded delta and adds it to the previously recovered hash
    val (_, _, result) = (1 to n)
      .foldLeft((encodedBits, BigInt(0), List.empty[BigInt])) { case ((bits, previous, recovered), _) =>
        val (remaining, delta) = golombDecode(bits)
        val current = previous + delta
        (remaining, current, current :: recovered)
      }

    result.to[SortedSet]
  }

  /**
   * Maps the word set to a sorted set of hashes.
   *
   * Being a set, words colliding on the same reduced hash are represented once.
   */
  private[gcs] def hashes(words: Set[String]): SortedSet[BigInt] = {
    sortedHashList(words).to[SortedSet]
  }

  /**
   * Maps every word to its hash reduced to the range [0, m * words.size), in ascending order.
   *
   * Unlike [[hashes]], colliding values are kept, hence, the result always has words.size items.
   */
  private def sortedHashList(words: Set[String]): List[BigInt] = {
    val modulus = BigInt(m) * words.size
    words
      .toList
      .map { word => fastReduction(hash(word), modulus) }
      .sorted
  }

  /**
   * Encodes a single value: the quotient (x >> p) in unary, q ones terminated by a zero,
   * followed by the lowest p bits of x.
   */
  private def golombEncode(x: BigInt): List[Bit] = {
    val q = (x >> p).toInt
    val r = (x & ((1 << p) - 1)).toInt
    val rBits = toBits(r, p)

    // prepending avoids copying the unary prefix just to append the terminator
    List.fill[Bit](q)(Bit.One) ::: (Bit.Zero :: rBits)
  }

  /**
   * Decodes a single value from the beginning of the given bits.
   *
   * @return the bits that were not consumed, and the decoded value
   */
  private def golombDecode(bits: List[Bit]): (List[Bit], BigInt) = {
    // unary quotient: the leading ones
    val (ones, rest) = bits.span(_ == Bit.One)
    val q = ones.size

    // skip the zero terminating the quotient, then, read the p remainder bits
    val (rBits, pending) = rest.drop(1).splitAt(p)
    val x = (BigInt(q) << p) + toBigInt(rBits)

    (pending, x)
  }

  /**
   * Computes the delta between each hash and the one before it, the first hash is
   * compared against 0, an empty list produces no deltas.
   */
  private def differences(sortedHashes: List[BigInt]): List[BigInt] = {
    sortedHashes
      .zip(BigInt(0) :: sortedHashes)
      .map { case (current, previous) => current - previous }
  }

  /**
   * Hashes the string with SipHash-2-4, the 64-bit result is interpreted as unsigned.
   */
  private def hash(string: String): BigInt = {
    // the charset is explicit to avoid depending on the platform default
    val bytes = string.getBytes(java.nio.charset.StandardCharsets.UTF_8)
    val x = hasher.hashBytes(bytes)
    BigInt(java.lang.Long.toUnsignedString(x.asLong()))
  }

  /**
   * Interprets the bits as a number, the first bit being the most significant one.
   */
  private def toBigInt(bits: List[Bit]): BigInt = {
    bits.foldLeft(BigInt(0)) { case (acc, cur) =>
      (acc * 2) + cur.toInt
    }
  }

  /**
   * Converts the number to its binary digits, left-padded with zeros up to the given size,
   * no digit is dropped when the number requires more than size bits.
   */
  private def toBits(x: Long, size: Int): List[Bit] = {
    val bits = x
      .toBinaryString
      .flatMap(Bit.from)
      .toList

    List.fill(size - bits.size)(Bit.Zero) ++ bits
  }

  /**
   * NOTE: This is a copy from https://github.com/btcsuite/btcutil/blob/master/gcs/gcs.go
   * that is used for compatibility reasons, here we don't care about such optimizations
   * because a filter is built once per block and never queried.
   *
   * Original docs:
   * fastReduction calculates a mapping that's more or less equivalent to: x mod N.
   *
   * However, instead of using a mod operation, which using a non-power of two
   * will lead to slowness on many processors due to unnecessary division, we
   * instead use a "multiply-and-shift" trick which eliminates all divisions,
   * described in:
   * https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
   *
   * * v * N >> log_2(N)
   *
   * In our case, using 64-bit integers, log_2 is 64. As most processors don't
   * support 128-bit arithmetic natively, we'll be super portable and unfold the
   * operation into several operations with 64-bit arithmetic. As inputs, we take
   * the number to reduce, and our modulus N divided into its high 32-bits and
   * lower 32-bits.
   */
  private def fastReduction(v: BigInt, modulus: BigInt): BigInt = {
    val nHi = modulus >> 32
    val nLo = modulus & 0xFFFFFFFFL

    // First, we'll split the item we need to reduce into its higher and lower bits.
    val vhi = v >> 32
    val vlo = v & 0xFFFFFFFFL

    // Then, we distribute multiplication over each part.
    val vnphi = vhi * nHi
    val vnpmid = vhi * nLo
    val npvmid = nHi * vlo
    val vnplo = vlo * nLo

    // We calculate the carry bit.
    val carry = ((vnpmid & 0xFFFFFFFFL) + (npvmid & 0xFFFFFFFFL) + (vnplo >> 32)) >> 32

    // Last, we add the high bits, the middle bits, and the carry.
    val result = vnphi + (vnpmid >> 32) + (npvmid >> 32) + carry
    result
  }
}
object GolombEncoding {

  /** The bit length of the remainder code used by [[default]]. */
  val DefaultP: Int = 19

  /** The M parameter used by [[default]]. */
  val DefaultM: Int = 784931

  /**
   * Creates an encoder using [[DefaultP]] and [[DefaultM]] with the given key.
   */
  def default(key: SipHashKey): GolombEncoding =
    new GolombEncoding(p = DefaultP, m = DefaultM, key = key)

  /**
   * Encodes the addresses collected from the given block, the default encoder is
   * keyed with the result of SipHashKey.fromBtcutil applied to the block hash.
   *
   * @return the encoded set, as returned by the encoder for the collected addresses
   */
  def encode(block: Block.HasTransactions): Option[GolombCodedSet] = {
    val blockEncoder = default(SipHashKey.fromBtcutil(block.hash))
    val words = block.collectAddresses.map(_.string)
    blockEncoder.encode(words)
  }
}