From f6f328888035c91c9c361c542093d5d25c64728f Mon Sep 17 00:00:00 2001
From: Alexis Hernandez
Date: Sun, 17 Mar 2019 19:38:47 -0700
Subject: [PATCH] server: Add the GolombEncoding

---
 server/app/com/xsn/explorer/gcs/Bit.scala     |  34 ++++
 .../com/xsn/explorer/gcs/GolombCodedSet.scala |  28 +++
 .../com/xsn/explorer/gcs/GolombEncoding.scala | 179 ++++++++++++++++++
 .../com/xsn/explorer/gcs/UnsignedByte.scala   |  49 +++++
 .../xsn/explorer/gcs/GolombEncodingSpec.scala | 100 ++++++++++
 5 files changed, 390 insertions(+)
 create mode 100644 server/app/com/xsn/explorer/gcs/Bit.scala
 create mode 100644 server/app/com/xsn/explorer/gcs/GolombCodedSet.scala
 create mode 100644 server/app/com/xsn/explorer/gcs/GolombEncoding.scala
 create mode 100644 server/app/com/xsn/explorer/gcs/UnsignedByte.scala
 create mode 100644 server/test/com/xsn/explorer/gcs/GolombEncodingSpec.scala

diff --git a/server/app/com/xsn/explorer/gcs/Bit.scala b/server/app/com/xsn/explorer/gcs/Bit.scala
new file mode 100644
index 0000000..7434c5f
--- /dev/null
+++ b/server/app/com/xsn/explorer/gcs/Bit.scala
@@ -0,0 +1,34 @@
+package com.xsn.explorer.gcs
+
+/**
+ * A single binary digit, either [[Bit.Zero]] or [[Bit.One]].
+ */
+sealed trait Bit extends Product with Serializable {
+
+  /**
+   * The numeric value of this bit, 0 or 1.
+   */
+  def toInt: Int = this match {
+    case Bit.Zero => 0
+    case Bit.One => 1
+  }
+
+  override def toString: String = toInt.toString
+}
+
+object Bit {
+  final case object Zero extends Bit
+  final case object One extends Bit
+
+  /**
+   * Parses a bit from its character representation.
+   *
+   * @param char the character to parse
+   * @return the bit for '0' or '1', None for any other character
+   */
+  def from(char: Char): Option[Bit] = char match {
+    case '0' => Some(Bit.Zero)
+    case '1' => Some(Bit.One)
+    case _ => None
+  }
+}
diff --git a/server/app/com/xsn/explorer/gcs/GolombCodedSet.scala b/server/app/com/xsn/explorer/gcs/GolombCodedSet.scala
new file mode 100644
index 0000000..820a514
--- /dev/null
+++ b/server/app/com/xsn/explorer/gcs/GolombCodedSet.scala
@@ -0,0 +1,28 @@
+package com.xsn.explorer.gcs
+
+import com.xsn.explorer.models.values.HexString
+
+/**
+ * The result of encoding a list of items with the [[GolombEncoding]].
+ *
+ * @param p the bit length of the remainder code used by the encoding
+ * @param m the inverse of the false positive rate used by the encoding
+ * @param n the number of items that were encoded
+ * @param data the encoded bytes
+ */
+class GolombCodedSet(
+    val p: Int,
+    val m: Int,
+    val n: Int,
+    val data: List[UnsignedByte]) {
+
+  /**
+   * The encoded bytes as a lower-case hex string, two characters per byte.
+   */
+  def hex: HexString = {
+    val string = data.map { x => "%02x".format(x.toInt) }.mkString
+    HexString
+      .from(string)
+      .getOrElse(throw new RuntimeException("Unexpected error, unable to create hex value"))
+  }
+}
diff --git a/server/app/com/xsn/explorer/gcs/GolombEncoding.scala b/server/app/com/xsn/explorer/gcs/GolombEncoding.scala
new file mode 100644
index 0000000..b936ddc
--- /dev/null
+++ b/server/app/com/xsn/explorer/gcs/GolombEncoding.scala
@@ -0,0 +1,179 @@
+package com.xsn.explorer.gcs
+
+import java.nio.charset.StandardCharsets
+
+import com.google.common.hash.Hashing
+
+/**
+ * A Golomb-coded set, matches all items in the set with probability 1, and matches other items with probability 1/M.
+ *
+ * The encoding is also parameterized by P, the bit length of the remainder code.
+ *
+ * see https://github.com/bitcoin/bips/blob/master/bip-0158.mediawiki
+ *
+ * @param p the bit length of the remainder code, it must be between 2 and 30 (inclusive)
+ * @param m the inverse of the false positive rate
+ * @param key the key for the SipHash function used to hash the items
+ */
+class GolombEncoding(p: Int, m: Int, key: SipHashKey) {
+  require(p > 1 && p < 31, s"p must be between 2 and 30 (inclusive), got $p")
+
+  private val hasher = Hashing.sipHash24(key.k0, key.k1)
+
+  /**
+   * Encodes the given word list.
+   *
+   * The resulting bits are packed into bytes, the last byte is padded with zeros when necessary.
+   *
+   * @param words the words to encode, an empty list produces a set without data
+   * @return the encoded set
+   */
+  def encode(words: List[String]): GolombCodedSet = {
+    val hashList = hashes(words)
+    val diffList = differences(hashList)
+    val encodedBits = diffList.flatMap(golombEncode)
+    val encodedBytes = encodedBits
+      .grouped(8)
+      .map { bits => UnsignedByte.parse(bits.padTo(8, Bit.Zero)) }
+      .toList
+
+    new GolombCodedSet(
+      p = p,
+      m = m,
+      n = words.size,
+      data = encodedBytes)
+  }
+
+  /**
+   * Recovers the hashes from the encoded bytes.
+   *
+   * This method doesn't handle corrupted inputs, which shouldn't be a problem because
+   * the method is used only to verify that the filter is correct.
+   *
+   * @param encoded the encoded bytes, we expect them to be correct
+   * @param n the number of words encoded in the bytes
+   * @return the recovered list of hashes
+   */
+  private[gcs] def decode(encoded: List[UnsignedByte], n: Int): List[BigInt] = {
+    val encodedBits = encoded.flatMap(_.bits)
+    val initial = (encodedBits, BigInt(0), List.empty[BigInt])
+    val (_, _, result) = (0 until n).foldLeft(initial) { case ((bits, previous, recovered), _) =>
+      val (remaining, delta) = golombDecode(bits)
+      val current = previous + delta
+      // prepending is constant time, the order is fixed after the fold
+      (remaining, current, current :: recovered)
+    }
+
+    result.reverse
+  }
+
+  /**
+   * Maps the word list to a list of hashes.
+   *
+   * Each word is hashed and then reduced to the range [0, m * n), where n is the number of words.
+   *
+   * @param words the words to hash
+   * @return the reduced hashes, sorted in ascending order
+   */
+  private[gcs] def hashes(words: List[String]): List[BigInt] = {
+    val modulus = BigInt(m) * words.length
+    words
+      .map { word => fastReduction(hash(word), modulus) }
+      .sorted
+  }
+
+  /**
+   * Encodes the value using Golomb-Rice coding: the quotient (x / 2^p) is written in unary,
+   * as a run of ones terminated by a zero, followed by the remainder (x % 2^p) written with p bits.
+   */
+  private def golombEncode(x: BigInt): List[Bit] = {
+    val q = (x >> p).toInt
+    val r = (x & ((1 << p) - 1)).toInt
+
+    val qBits = List.fill[Bit](q)(Bit.One) :+ Bit.Zero
+    val rBits = toBits(r, p)
+
+    qBits ++ rBits
+  }
+
+  /**
+   * Decodes a single value from the beginning of the given bits, it is the inverse of golombEncode.
+   *
+   * @return the bits that are left after consuming the value, and the decoded value
+   */
+  private def golombDecode(bits: List[Bit]): (List[Bit], BigInt) = {
+    val (qBits, afterQuotient) = bits.span(_ == Bit.One)
+    // the bit following the quotient is the zero that terminates the unary code
+    val (rBits, pending) = afterQuotient.drop(1).splitAt(p)
+    val x = (BigInt(qBits.size) << p) + toBigInt(rBits)
+
+    (pending, x)
+  }
+
+  /**
+   * Computes the difference between each hash and the previous one, the first hash is compared against zero.
+   *
+   * @param sortedHashList the hashes, sorted in ascending order
+   * @return the differences, one per hash, an empty list when there are no hashes
+   */
+  private def differences(sortedHashList: List[BigInt]): List[BigInt] = {
+    // zipping the list with itself shifted by one handles the empty list, while
+    // matching on sliding(2) fails because it yields a single-element window.
+    (BigInt(0) :: sortedHashList)
+      .zip(sortedHashList)
+      .map { case (previous, current) => current - previous }
+  }
+
+  /**
+   * Hashes the UTF-8 bytes of the string, the result is interpreted as an unsigned 64-bit integer.
+   */
+  private def hash(string: String): BigInt = {
+    // the charset is explicit to produce the same hashes on every platform
+    val x = hasher.hashBytes(string.getBytes(StandardCharsets.UTF_8))
+    BigInt(java.lang.Long.toUnsignedString(x.asLong()))
+  }
+
+  /**
+   * Interprets the bits as an unsigned number, most significant bit first.
+   */
+  private def toBigInt(bits: List[Bit]): BigInt = {
+    bits.foldLeft(BigInt(0)) { case (acc, cur) =>
+      (acc * 2) + cur.toInt
+    }
+  }
+
+  /**
+   * Converts the number to its binary representation, left-padded with zeros up to the given size.
+   */
+  private def toBits(x: Long, size: Int): List[Bit] = {
+    val bits = x
+      .toBinaryString
+      .flatMap(Bit.from)
+      .toList
+
+    List.fill(size - bits.size)(Bit.Zero) ++ bits
+  }
+
+  /**
+   * Reduces a 64-bit unsigned value to the range [0, modulus) by computing (v * modulus) / 2^64,
+   * which avoids the modulo operator while keeping the distribution of the values.
+   */
+  private def fastReduction(v: BigInt, modulus: BigInt): BigInt = {
+    // BigInt is arbitrary-precision, hence, there is no need to split the operands into
+    // 32-bit halves to avoid overflowing, as the implementations based on 64-bit integers do.
+    (v * modulus) >> 64
+  }
+}
+
+object GolombEncoding {
+
+  val DefaultP: Int = 19
+  val DefaultM: Int = 784931
+
+  /**
+   * Creates an encoding with the default parameters.
+   */
+  def default(key: SipHashKey): GolombEncoding = {
+    new GolombEncoding(p = DefaultP, m = DefaultM, key = key)
+  }
+}
diff --git a/server/app/com/xsn/explorer/gcs/UnsignedByte.scala b/server/app/com/xsn/explorer/gcs/UnsignedByte.scala
new file mode 100644
index 0000000..c7b3280
--- /dev/null
+++ b/server/app/com/xsn/explorer/gcs/UnsignedByte.scala
@@ -0,0 +1,49 @@
+package com.xsn.explorer.gcs
+
+/**
+ * A byte that is interpreted as an unsigned value (0 to 255).
+ */
+class UnsignedByte(val byte: Byte) extends AnyVal {
+
+  override def toString: String = {
+    toInt.toString
+  }
+
+  /**
+   * The binary representation of the byte, left-padded with zeros to 8 characters.
+   */
+  def toFixedBinaryString: String = {
+    val string = toInt.toBinaryString
+    ("0" * (8 - string.length)) + string
+  }
+
+  def toInt: Int = byte.toInt & 0xFF
+
+  /**
+   * The 8 bits of the byte, most significant bit first.
+   */
+  def bits: List[Bit] = {
+    toFixedBinaryString
+      .flatMap(Bit.from)
+      .toList
+  }
+}
+
+object UnsignedByte {
+
+  /**
+   * Creates a byte from the given bits, most significant bit first.
+   *
+   * @param bits the bits to parse, at most 8
+   * @return the byte holding the number represented by the bits
+   */
+  def parse(bits: List[Bit]): UnsignedByte = {
+    require(bits.size <= 8, s"A byte holds at most 8 bits, got ${bits.size}")
+
+    val int = bits.foldLeft(0) { case (acc, cur) =>
+      (acc * 2) + cur.toInt
+    }
+
+    new UnsignedByte(int.toByte)
+  }
+}
diff --git a/server/test/com/xsn/explorer/gcs/GolombEncodingSpec.scala b/server/test/com/xsn/explorer/gcs/GolombEncodingSpec.scala
new file mode 100644
index 0000000..572920f
--- /dev/null
+++ b/server/test/com/xsn/explorer/gcs/GolombEncodingSpec.scala
@@ -0,0 +1,100 @@
+package com.xsn.explorer.gcs
+
+import org.scalatest.{MustMatchers, WordSpec}
+
+class GolombEncodingSpec extends WordSpec with MustMatchers {
+
+  val words = List(
+    "Alex",
+    "Bob",
+    "Charlie",
+    "Dick",
+    "Ed",
+    "Frank",
+    "George",
+    "Harry",
+    "Ilya",
+    "John",
+    "Kevin",
+    "Larry",
+    "Michael",
+    "Nate",
+    "Owen",
+    "Paul",
+    "Quentin"
+  )
+
+  "the encoding" should {
+    val keyBytes = List(
+      0x4c, 0xb1, 0xab, 0x12, 0x57, 0x62, 0x1e, 0x41,
+      0x3b, 0x8b, 0x0e, 0x26, 0x64, 0x8d, 0x4a, 0x15).map(_.toByte)
+
+    val key = SipHashKey.fromBtcutil(keyBytes)
+    val golomb = GolombEncoding.default(key)
+    val encoded = golomb.encode(words)
+
+    "decode the same hashes" in {
+      val hashes = golomb.hashes(words)
+      val decoded = golomb.decode(encoded.data, words.size)
+
+      decoded mustEqual hashes
+    }
+
+    "encode an empty list" in {
+      val result = golomb.encode(List.empty)
+
+      result.n must be(0)
+      result.data must be(empty)
+    }
+
+    "return the encoded hex from the btcutil gcs" in {
+      /**
+       * The hex was generated from this go code:
+{{{
+package main
+
+import (
+  "encoding/hex"
+  "fmt"
+  "github.com/btcsuite/btcutil/gcs/builder"
+)
+
+func main() {
+  contents := [][]byte{
+    []byte("Alex"),
+    []byte("Bob"),
+    []byte("Charlie"),
+    []byte("Dick"),
+    []byte("Ed"),
+    []byte("Frank"),
+    []byte("George"),
+    []byte("Harry"),
+    []byte("Ilya"),
+    []byte("John"),
+    []byte("Kevin"),
+    []byte("Larry"),
+    []byte("Michael"),
+    []byte("Nate"),
+    []byte("Owen"),
+    []byte("Paul"),
+    []byte("Quentin"),
+  }
+  testKey := [16]byte{0x4c, 0xb1, 0xab, 0x12, 0x57, 0x62, 0x1e, 0x41,
+    0x3b, 0x8b, 0x0e, 0x26, 0x64, 0x8d, 0x4a, 0x15}
+
+  b := builder.WithRandomKey().SetKey(testKey);
+  f, err := b.AddEntries(contents).Build();
+  if err != nil {
+    fmt.Println("Error", err)
+  }
+  rawBytes, _ := f.Bytes()
+  encoded := hex.EncodeToString(rawBytes);
+  fmt.Println("Filter: %X\n", len(encoded), encoded)
+}
+}}}
+       */
+      val expected = "056ff79e6c2994ba5d91402f327f807097c5c571f8d212511a8237f005331346102b41967f35ef488406c38a88"
+      encoded.hex.string must be(expected)
+    }
+  }
+}