
ccan: update.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
ppa-0.6.1
Rusty Russell 10 years ago
parent commit 81e73926f3
  1. ccan/README (2)
  2. ccan/ccan/asort/_info (2)
  3. ccan/ccan/asort/asort.h (12)
  4. ccan/ccan/crypto/sha256/benchmarks/Makefile (20)
  5. ccan/ccan/crypto/sha256/benchmarks/double-sha-bench.c (122)
  6. ccan/ccan/crypto/sha256/benchmarks/open_software_license.txt (32)
  7. ccan/ccan/crypto/sha256/benchmarks/sha256_avx1.asm (586)
  8. ccan/ccan/crypto/sha256/benchmarks/sha256_avx2_rorx2.asm (826)
  9. ccan/ccan/crypto/sha256/benchmarks/sha256_avx2_rorx8.asm (1507)
  10. ccan/ccan/crypto/sha256/benchmarks/sha256_sse4.asm (544)
  11. ccan/ccan/crypto/sha256/sha256.c (2)
  12. ccan/ccan/htable/LICENSE (1)
  13. ccan/ccan/htable/_info (116)
  14. ccan/ccan/htable/htable.c (296)
  15. ccan/ccan/htable/htable.h (191)
  16. ccan/ccan/htable/htable_type.h (108)
  17. ccan/ccan/htable/test/run-size.c (36)
  18. ccan/ccan/htable/test/run-type.c (175)
  19. ccan/ccan/htable/test/run-zero-hash-first-entry.c (61)
  20. ccan/ccan/htable/test/run.c (207)
  21. ccan/ccan/htable/tools/Makefile (40)
  22. ccan/ccan/htable/tools/hsearchspeed.c (95)
  23. ccan/ccan/htable/tools/speed.c (370)
  24. ccan/ccan/htable/tools/stringspeed.c (240)
  25. ccan/ccan/order/LICENSE (1)
  26. ccan/ccan/order/_info (33)
  27. ccan/ccan/order/order.c (70)
  28. ccan/ccan/order/order.h (73)
  29. ccan/ccan/order/test/api.c (138)
  30. ccan/ccan/order/test/compile_fail_1.c (24)
  31. ccan/ccan/order/test/compile_fail_2.c (25)
  32. ccan/ccan/order/test/compile_ok.c (19)
  33. ccan/ccan/order/test/fancy_cmp.h (47)
  34. ccan/ccan/ptrint/LICENSE (1)
  35. ccan/ccan/ptrint/_info (59)
  36. ccan/ccan/ptrint/ptrint.h (34)
  37. ccan/ccan/ptrint/test/run.c (29)
  38. ccan/ccan/tal/benchmark/Makefile (26)

ccan/README (2)

@@ -1,3 +1,3 @@
CCAN imported from http://ccodearchive.net.
CCAN version: init-1956-ged95d86
CCAN version: init-2039-g396f2fc

ccan/ccan/asort/_info (2)

@@ -58,7 +58,7 @@ int main(int argc, char *argv[])
return 1;
if (strcmp(argv[1], "depends") == 0) {
printf("ccan/typesafe_cb\n");
printf("ccan/order\n");
return 0;
}
if (strcmp(argv[1], "testdepends") == 0) {

ccan/ccan/asort/asort.h (12)

@@ -2,7 +2,7 @@
#ifndef CCAN_ASORT_H
#define CCAN_ASORT_H
#include "config.h"
#include <ccan/typesafe_cb/typesafe_cb.h>
#include <ccan/order/order.h>
#include <stdlib.h>
/**
@@ -20,19 +20,13 @@
*/
#define asort(base, num, cmp, ctx) \
_asort((base), (num), sizeof(*(base)), \
typesafe_cb_cast(int (*)(const void *, const void *, void *), \
int (*)(const __typeof__(*(base)) *, \
const __typeof__(*(base)) *, \
__typeof__(ctx)), \
(cmp)), \
(ctx))
total_order_cast((cmp), *(base), (ctx)), (ctx))
#if HAVE_QSORT_R_PRIVATE_LAST
#define _asort(b, n, s, cmp, ctx) qsort_r(b, n, s, cmp, ctx)
#else
void _asort(void *base, size_t nmemb, size_t size,
int(*compar)(const void *, const void *, void *),
void *ctx);
_total_order_cb compar, void *ctx);
#endif
#endif /* CCAN_ASORT_H */
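A minimal usage sketch of the macro after this change (hypothetical names; the callback takes typed element pointers plus a caller context, unused here):

#include <ccan/asort/asort.h>

static int cmp_int(const int *a, const int *b, void *unused)
{
	/* Return <0, 0 or >0, as for qsort, but with typed pointers. */
	return (*a > *b) - (*a < *b);
}

int main(void)
{
	int vals[] = { 42, 7, 19 };

	/* total_order_cast() checks cmp_int's argument types against
	 * vals and the NULL context at compile time. */
	asort(vals, sizeof(vals) / sizeof(vals[0]), cmp_int, NULL);
	return 0;
}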

ccan/ccan/crypto/sha256/benchmarks/Makefile (20)

@@ -0,0 +1,20 @@
CCANDIR := ../../../../
CFLAGS := -Wall -I$(CCANDIR) -O3 -flto -DCCAN_USE_ORIGINAL=1
LDFLAGS := -O3 -flto
INTEL_OBJS := sha256_avx1.o sha256_avx2_rorx2.o sha256_avx2_rorx8.o sha256_sse4.o
double-sha-bench: double-sha-bench.o ccan-time.o $(INTEL_OBJS) #ccan-crypto-sha256.o
$(INTEL_OBJS): %.o : %.asm
%.o : %.asm
yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o $@ $<
clean:
$(RM) -f *.o
ccan-crypto-sha256.o: $(CCANDIR)/ccan/crypto/sha256/sha256.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-time.o: $(CCANDIR)/ccan/time/time.c
$(CC) $(CFLAGS) -c -o $@ $<
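A build-and-run sketch (assuming yasm is installed and CCANDIR above points at a CCAN checkout; the optional argument is the iteration count, defaulting to 1000000):

	make double-sha-bench
	./double-sha-bench 1000000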

ccan/ccan/crypto/sha256/benchmarks/double-sha-bench.c (122)

@@ -0,0 +1,122 @@
/* Bitcoin does a lot of SHA of SHA. Benchmark that. */
#include <ccan/crypto/sha256/sha256.c>
#include <ccan/time/time.h>
#include <stdio.h>
void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
void sha256_rorx(void *input_data, uint32_t digest[8], uint64_t num_blks);
void sha256_rorx_x8ms(void *input_data, uint32_t digest[8], uint64_t num_blks);
void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);
int main(int argc, char *argv[])
{
struct timeabs start;
struct timerel diff;
size_t i, n;
union {
struct sha256 h;
uint32_t u32[16];
uint8_t u8[64];
} block;
n = atoi(argv[1] ? argv[1] : "1000000");
memset(&block, 0, sizeof(block));
sha256(&block.h, &n, sizeof(n));
start = time_now();
for (i = 0; i < n; i++) {
sha256(&block.h, &block.h, sizeof(block.h));
}
diff = time_divide(time_between(time_now(), start), n);
printf("Normal gave %02x%02x%02x%02x%02x%02x... in %llu nsec\n",
block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
(unsigned long long)time_to_nsec(diff));
/* Now, don't re-initialize every time; use Transform */
memset(&block, 0, sizeof(block));
sha256(&block.h, &n, sizeof(n));
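/* Hand-rolled SHA-256 padding for a 32-byte message in one 64-byte block:
 * a 0x80 terminator byte immediately after the hash, zero fill, then the
 * 64-bit big-endian bit length (256 = 0x0100, so byte 62 is 1). */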
block.u8[sizeof(block.h)] = 0x80;
// Size is 256 bits
block.u8[sizeof(block)-2] = 1;
start = time_now();
for (i = 0; i < n; i++) {
struct sha256_ctx ctx = SHA256_INIT;
size_t j;
Transform(ctx.s, block.u32);
for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
}
diff = time_divide(time_between(time_now(), start), n);
printf("Transform gave %02x%02x%02x%02x%02x%02x... in %llu nsec\n",
block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
(unsigned long long)time_to_nsec(diff));
/* Now, assembler variants */
sha256(&block.h, &n, sizeof(n));
start = time_now();
for (i = 0; i < n; i++) {
struct sha256_ctx ctx = SHA256_INIT;
size_t j;
sha256_rorx(block.u32, ctx.s, 1);
for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
}
diff = time_divide(time_between(time_now(), start), n);
printf("Asm rorx for %02x%02x%02x%02x%02x%02x... is %llu nsec\n",
block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
(unsigned long long)time_to_nsec(diff));
sha256(&block.h, &n, sizeof(n));
start = time_now();
for (i = 0; i < n; i++) {
struct sha256_ctx ctx = SHA256_INIT;
size_t j;
sha256_sse4(block.u32, ctx.s, 1);
for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
}
diff = time_divide(time_between(time_now(), start), n);
printf("Asm SSE4 for %02x%02x%02x%02x%02x%02x... is %llu nsec\n",
block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
(unsigned long long)time_to_nsec(diff));
sha256(&block.h, &n, sizeof(n));
start = time_now();
for (i = 0; i < n; i++) {
struct sha256_ctx ctx = SHA256_INIT;
size_t j;
sha256_rorx_x8ms(block.u32, ctx.s, 1);
for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
}
diff = time_divide(time_between(time_now(), start), n);
printf("Asm RORx-x8ms for %02x%02x%02x%02x%02x%02x... is %llu nsec\n",
block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
(unsigned long long)time_to_nsec(diff));
sha256(&block.h, &n, sizeof(n));
start = time_now();
for (i = 0; i < n; i++) {
struct sha256_ctx ctx = SHA256_INIT;
size_t j;
sha256_avx(block.u32, ctx.s, 1);
for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
}
diff = time_divide(time_between(time_now(), start), n);
printf("Asm AVX for %02x%02x%02x%02x%02x%02x... is %llu nsec\n",
block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
(unsigned long long)time_to_nsec(diff));
return 0;
}
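For reference, the double-hash pattern being timed can be written against the public sha256() API as a small helper (a sketch, independent of any assembler backend):

#include <ccan/crypto/sha256/sha256.h>

static void double_sha256(struct sha256 *out, const void *p, size_t len)
{
	struct sha256 inner;

	sha256(&inner, p, len);              /* first hash */
	sha256(out, &inner, sizeof(inner));  /* hash of the 32-byte hash */
}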

ccan/ccan/crypto/sha256/benchmarks/open_software_license.txt (32)

@@ -0,0 +1,32 @@
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
* Neither the name of the Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ccan/ccan/crypto/sha256/benchmarks/sha256_avx1.asm (586)

@@ -0,0 +1,586 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the
; distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define VMOVDQ vmovdqu ;; assume buffers not aligned
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
add %2, %1
mov %1, %2
%endm
%macro MY_ROR 2
shld %1,%1,(32-(%2))
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
VMOVDQ %1, %2
vpshufb %1, %1, %3
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7
%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9
%define XTMP5 xmm11
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13
%ifdef LINUX
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg
%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg
%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d
%endif
%define TBL rbp
%define a eax
%define b ebx
%define f r9d
%define g r10d
%define h r11d
%define y0 r13d
%define y1 r14d
%define y2 r15d
_INP_END_SIZE equ 8
_INP_SIZE equ 8
_XFER_SIZE equ 8
%ifdef LINUX
_XMM_SAVE_SIZE equ 0
%else
_XMM_SAVE_SIZE equ 8*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE equ 8
_INP_END equ 0
_INP equ _INP_END + _INP_END_SIZE
_XFER equ _INP + _INP_SIZE
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm
; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
%macro FOUR_ROUNDS_AND_SCHED 0
;; compute s0 four at a time and s1 two at a time
;; compute W[-16] + W[-7] 4 at a time
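;; For reference, the recurrence these four rounds advance is the standard
;; SHA-256 message schedule:
;;   W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
;; with s0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3)
;;      s1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)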
;vmovdqa XTMP0, X3
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
;vmovdqa XTMP1, X1
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
;; compute s0
vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpsrld XTMP2, XTMP1, 7
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpslld XTMP3, XTMP1, (32-7)
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
mov y0, e ; y0 = e
mov y1, a ; y1 = a
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
vpsrld XTMP2, XTMP1,18
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
vpslld XTMP1, XTMP1, (32-18)
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
vpxor XTMP3, XTMP3, XTMP1
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
;; compute low s1
vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
mov y0, e ; y0 = e
mov y1, a ; y1 = a
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
xor y2, g ; y2 = f^g
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
vpxor XTMP2, XTMP2, XTMP3
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
;; compute high s1
vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
vpxor XTMP2, XTMP2, XTMP3
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
rotate_Xs
%endm
;; input is [rsp + _XFER + %1 * 4]
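;; For reference, each DO_ROUND implements the standard SHA-256 round:
;;   T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t],  T2 = S0(a) + Maj(a,b,c)
;;   S1(e) = ror(e,6) ^ ror(e,11) ^ ror(e,25)
;;   S0(a) = ror(a,2) ^ ror(a,13) ^ ror(a,22)
;; d gains T1, h accumulates T1 + T2, and ROTATE_ARGS renames it to a.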
%macro DO_ROUND 1
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_avx
align 32
sha256_avx:
push rbx
%ifndef LINUX
push rsi
push rdi
%endif
push rbp
push r13
push r14
push r15
sub rsp,STACK_SIZE
%ifndef LINUX
vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif
shl NUM_BLKS, 6 ; convert to bytes
jz done_hash
add NUM_BLKS, INP ; pointer to end of data
mov [rsp + _INP_END], NUM_BLKS
;; load initial digest
mov a,[4*0 + CTX]
mov b,[4*1 + CTX]
mov c,[4*2 + CTX]
mov d,[4*3 + CTX]
mov e,[4*4 + CTX]
mov f,[4*5 + CTX]
mov g,[4*6 + CTX]
mov h,[4*7 + CTX]
vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
loop0:
lea TBL,[K256 wrt rip]
;; byte swap first 16 dwords
COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
mov [rsp + _INP], INP
;; schedule 48 input dwords, by doing 3 rounds of 16 each
mov SRND, 3
align 16
loop1:
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 1*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 2*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 3*16]
vmovdqa [rsp + _XFER], XFER
add TBL, 4*16
FOUR_ROUNDS_AND_SCHED
sub SRND, 1
jne loop1
mov SRND, 2
loop2:
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp + _XFER], XFER
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3
vpaddd XFER, X1, [TBL + 1*16]
vmovdqa [rsp + _XFER], XFER
add TBL, 2*16
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3
vmovdqa X0, X2
vmovdqa X1, X3
sub SRND, 1
jne loop2
addm [4*0 + CTX],a
addm [4*1 + CTX],b
addm [4*2 + CTX],c
addm [4*3 + CTX],d
addm [4*4 + CTX],e
addm [4*5 + CTX],f
addm [4*6 + CTX],g
addm [4*7 + CTX],h
mov INP, [rsp + _INP]
add INP, 64
cmp INP, [rsp + _INP_END]
jne loop0
done_hash:
%ifndef LINUX
vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif
add rsp, STACK_SIZE
pop r15
pop r14
pop r13
pop rbp
%ifndef LINUX
pop rdi
pop rsi
%endif
pop rbx
ret
section .data
align 64
K256:
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF

ccan/ccan/crypto/sha256/benchmarks/sha256_avx2_rorx2.asm (826)

@@ -0,0 +1,826 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the
; distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx2.obj -g cv8 sha256_avx2_rorx2.asm
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx2.o sha256_avx2_rorx2.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 2 blocks at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define VMOVDQ vmovdqu ;; assume buffers not aligned
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
add %2, %1
mov %1, %2
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define X0 ymm4
%define X1 ymm5
%define X2 ymm6
%define X3 ymm7
; XMM versions of above
%define XWORD0 xmm4
%define XWORD1 xmm5
%define XWORD2 xmm6
%define XWORD3 xmm7
%define XTMP0 ymm0
%define XTMP1 ymm1
%define XTMP2 ymm2
%define XTMP3 ymm3
%define XTMP4 ymm8
%define XFER ymm9
%define XTMP5 ymm11
%define SHUF_00BA ymm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 ymm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK ymm13
%define X_BYTE_FLIP_MASK xmm13 ; XMM version of BYTE_FLIP_MASK
%ifdef LINUX
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg
%define c ecx
%define d r8d
%define e edx ; clobbers NUM_BLKS
%define y3 edi ; clobbers INP
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg
%define c edi
%define d esi
%define e r8d ; clobbers NUM_BLKS
%define y3 ecx ; clobbers INP
%endif
%define TBL rbp
%define SRND CTX ; SRND is same register as CTX
%define a eax
%define b ebx
%define f r9d
%define g r10d
%define h r11d
%define old_h r11d
%define T1 r12d
%define y0 r13d
%define y1 r14d
%define y2 r15d
_XFER_SIZE equ 2*64*4 ; 2 blocks, 64 rounds, 4 bytes/round
%ifdef LINUX
_XMM_SAVE_SIZE equ 0
%else
_XMM_SAVE_SIZE equ 8*16
%endif
_INP_END_SIZE equ 8
_INP_SIZE equ 8
_CTX_SIZE equ 8
_RSP_SIZE equ 8
_XFER equ 0
_XMM_SAVE equ _XFER + _XFER_SIZE
_INP_END equ _XMM_SAVE + _XMM_SAVE_SIZE
_INP equ _INP_END + _INP_END_SIZE
_CTX equ _INP + _INP_SIZE
_RSP equ _CTX + _CTX_SIZE
STACK_SIZE equ _RSP + _RSP_SIZE
; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm
; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine old_h h
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
%macro FOUR_ROUNDS_AND_SCHED 1
%define %%XFER %1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov y3, a ; y3 = a ; MAJA
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
add h, dword[%%XFER+0*4] ; h = k + w + h ; --
or y3, c ; y3 = a|c ; MAJA
vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
mov y2, f ; y2 = f ; CH
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
xor y2, g ; y2 = f^g ; CH
vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
and y2, e ; y2 = (f^g)&e ; CH
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
add d, h ; d = k + w + h + d ; --
and y3, b ; y3 = (a|c)&b ; MAJA
vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
vpsrld XTMP2, XTMP1, 7
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
mov T1, a ; T1 = a ; MAJB
and T1, c ; T1 = a&c ; MAJB
add y2, y0 ; y2 = S1 + CH ; --
vpslld XTMP3, XTMP1, (32-7)
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
add h, y1 ; h = k + w + h + S0 ; --
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
vpor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7
vpsrld XTMP2, XTMP1,18
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
add h, y3 ; h = t1 + S0 + MAJ ; --
ROTATE_ARGS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov y3, a ; y3 = a ; MAJA
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
add h, dword[%%XFER+1*4] ; h = k + w + h ; --
or y3, c ; y3 = a|c ; MAJA
vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
mov y2, f ; y2 = f ; CH
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
xor y2, g ; y2 = f^g ; CH
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
and y2, e ; y2 = (f^g)&e ; CH
add d, h ; d = k + w + h + d ; --
vpslld XTMP1, XTMP1, (32-18)
and y3, b ; y3 = (a|c)&b ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
vpxor XTMP3, XTMP3, XTMP1
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
vpxor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
mov T1, a ; T1 = a ; MAJB
and T1, c ; T1 = a&c ; MAJB
add y2, y0 ; y2 = S1 + CH ; --
vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
add h, y1 ; h = k + w + h + S0 ; --
vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
add h, y3 ; h = t1 + S0 + MAJ ; --
vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
ROTATE_ARGS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov y3, a ; y3 = a ; MAJA
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
add h, [%%XFER+2*4] ; h = k + w + h ; --
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
or y3, c ; y3 = a|c ; MAJA
mov y2, f ; y2 = f ; CH
xor y2, g ; y2 = f^g ; CH
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
and y2, e ; y2 = (f^g)&e ; CH
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
vpxor XTMP2, XTMP2, XTMP3
add d, h ; d = k + w + h + d ; --
and y3, b ; y3 = (a|c)&b ; MAJA
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
mov T1, a ; T1 = a ; MAJB
and T1, c ; T1 = a&c ; MAJB
add y2, y0 ; y2 = S1 + CH ; --
vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
add h, y1 ; h = k + w + h + S0 ; --
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
add h, y3 ; h = t1 + S0 + MAJ ; --
ROTATE_ARGS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov y3, a ; y3 = a ; MAJA
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
add h, dword[%%XFER+3*4] ; h = k + w + h ; --
or y3, c ; y3 = a|c ; MAJA
vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
mov y2, f ; y2 = f ; CH
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
xor y2, g ; y2 = f^g ; CH
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
and y2, e ; y2 = (f^g)&e ; CH
add d, h ; d = k + w + h + d ; --
and y3, b ; y3 = (a|c)&b ; MAJA
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
vpxor XTMP2, XTMP2, XTMP3
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
add y2, y0 ; y2 = S1 + CH ; --
vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
mov T1, a ; T1 = a ; MAJB
and T1, c ; T1 = a&c ; MAJB
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
add h, y1 ; h = k + w + h + S0 ; --
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
add h, y3 ; h = t1 + S0 + MAJ ; --
ROTATE_ARGS
rotate_Xs
%endm
%macro DO_4ROUNDS 1
%define %%XFER %1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov y2, f ; y2 = f ; CH
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
xor y2, g ; y2 = f^g ; CH
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
and y2, e ; y2 = (f^g)&e ; CH
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
mov y3, a ; y3 = a ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
add h, dword[%%XFER + 4*0] ; h = k + w + h ; --
or y3, c ; y3 = a|c ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
mov T1, a ; T1 = a ; MAJB
and y3, b ; y3 = (a|c)&b ; MAJA
and T1, c ; T1 = a&c ; MAJB
add y2, y0 ; y2 = S1 + CH ; --
add d, h ; d = k + w + h + d ; --
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
add h, y1 ; h = k + w + h + S0 ; --
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
;add h, y3 ; h = t1 + S0 + MAJ ; --
ROTATE_ARGS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
mov y2, f ; y2 = f ; CH
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
xor y2, g ; y2 = f^g ; CH
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
and y2, e ; y2 = (f^g)&e ; CH
add old_h, y3 ; h = t1 + S0 + MAJ ; --
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
mov y3, a ; y3 = a ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
add h, dword[%%XFER + 4*1] ; h = k + w + h ; --
or y3, c ; y3 = a|c ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
mov T1, a ; T1 = a ; MAJB
and y3, b ; y3 = (a|c)&b ; MAJA
and T1, c ; T1 = a&c ; MAJB
add y2, y0 ; y2 = S1 + CH ; --
add d, h ; d = k + w + h + d ; --
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
add h, y1 ; h = k + w + h + S0 ; --
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
;add h, y3 ; h = t1 + S0 + MAJ ; --
ROTATE_ARGS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
mov y2, f ; y2 = f ; CH
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
xor y2, g ; y2 = f^g ; CH
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
and y2, e ; y2 = (f^g)&e ; CH
add old_h, y3 ; h = t1 + S0 + MAJ ; --
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
mov y3, a ; y3 = a ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
add h, dword[%%XFER + 4*2] ; h = k + w + h ; --
or y3, c ; y3 = a|c ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
mov T1, a ; T1 = a ; MAJB
and y3, b ; y3 = (a|c)&b ; MAJA
and T1, c ; T1 = a&c ; MAJB
add y2, y0 ; y2 = S1 + CH ; --
add d, h ; d = k + w + h + d ; --
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
add h, y1 ; h = k + w + h + S0 ; --
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
;add h, y3 ; h = t1 + S0 + MAJ ; --
ROTATE_ARGS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
mov y2, f ; y2 = f ; CH
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
xor y2, g ; y2 = f^g ; CH
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
and y2, e ; y2 = (f^g)&e ; CH
add old_h, y3 ; h = t1 + S0 + MAJ ; --
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
mov y3, a ; y3 = a ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
add h, dword[%%XFER + 4*3] ; h = k + w + h ; --
or y3, c ; y3 = a|c ; MAJA
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
mov T1, a ; T1 = a ; MAJB
and y3, b ; y3 = (a|c)&b ; MAJA
and T1, c ; T1 = a&c ; MAJB
add y2, y0 ; y2 = S1 + CH ; --
add d, h ; d = k + w + h + d ; --
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
add h, y1 ; h = k + w + h + S0 ; --
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
add h, y3 ; h = t1 + S0 + MAJ ; --
ROTATE_ARGS
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_rorx
align 32
sha256_rorx:
push rbx
%ifndef LINUX
push rsi
push rdi
%endif
push rbp
push r12
push r13
push r14
push r15
mov rax, rsp
sub rsp,STACK_SIZE
and rsp, -32
mov [rsp + _RSP], rax
%ifndef LINUX
vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif
shl NUM_BLKS, 6 ; convert to bytes
jz done_hash
lea NUM_BLKS, [NUM_BLKS + INP - 64] ; pointer to last block
mov [rsp + _INP_END], NUM_BLKS
cmp INP, NUM_BLKS
je only_one_block
;; load initial digest
mov a,[4*0 + CTX]
mov b,[4*1 + CTX]
mov c,[4*2 + CTX]
mov d,[4*3 + CTX]
mov e,[4*4 + CTX]
mov f,[4*5 + CTX]
mov g,[4*6 + CTX]
mov h,[4*7 + CTX]
vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
mov [rsp + _CTX], CTX
loop0:
lea TBL,[K256 wrt rip]
;; Load first 16 dwords from two blocks
VMOVDQ XTMP0, [INP + 0*32]
VMOVDQ XTMP1, [INP + 1*32]
VMOVDQ XTMP2, [INP + 2*32]
VMOVDQ XTMP3, [INP + 3*32]
;; byte swap data
vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK
vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK
vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK
vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK
;; transpose data into high/low halves
vperm2i128 X0, XTMP0, XTMP2, 0x20
vperm2i128 X1, XTMP0, XTMP2, 0x31
vperm2i128 X2, XTMP1, XTMP3, 0x20
vperm2i128 X3, XTMP1, XTMP3, 0x31
last_block_enter:
add INP, 64
mov [rsp + _INP], INP
;; schedule 48 input dwords, by doing 3 rounds of 12 each
xor SRND, SRND
align 16
loop1:
vpaddd XFER, X0, [TBL + SRND + 0*32]
vmovdqa [rsp + _XFER + SRND + 0*32], XFER
FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32
vpaddd XFER, X0, [TBL + SRND + 1*32]
vmovdqa [rsp + _XFER + SRND + 1*32], XFER
FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32
vpaddd XFER, X0, [TBL + SRND + 2*32]
vmovdqa [rsp + _XFER + SRND + 2*32], XFER
FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32
vpaddd XFER, X0, [TBL + SRND + 3*32]
vmovdqa [rsp + _XFER + SRND + 3*32], XFER
FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32
add SRND, 4*32
cmp SRND, 3 * 4*32
jb loop1
loop2:
;; Do last 16 rounds with no scheduling
vpaddd XFER, X0, [TBL + SRND + 0*32]
vmovdqa [rsp + _XFER + SRND + 0*32], XFER
DO_4ROUNDS rsp + _XFER + SRND + 0*32
vpaddd XFER, X1, [TBL + SRND + 1*32]
vmovdqa [rsp + _XFER + SRND + 1*32], XFER
DO_4ROUNDS rsp + _XFER + SRND + 1*32
add SRND, 2*32
vmovdqa X0, X2
vmovdqa X1, X3
cmp SRND, 4 * 4*32
jb loop2
mov CTX, [rsp + _CTX]
mov INP, [rsp + _INP]
addm [4*0 + CTX],a
addm [4*1 + CTX],b
addm [4*2 + CTX],c
addm [4*3 + CTX],d
addm [4*4 + CTX],e
addm [4*5 + CTX],f
addm [4*6 + CTX],g
addm [4*7 + CTX],h
cmp INP, [rsp + _INP_END]
ja done_hash
;;;; Do second block using previously scheduled results
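;;;; (The scheduled W+K values for the second block live in the upper 16
;;;; bytes of each 32-byte XFER slot saved during the first pass, hence
;;;; the "+ 16" offsets used by loop3 below.)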
xor SRND, SRND
align 16
loop3:
DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16
DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16
add SRND, 2*32
cmp SRND, 4 * 4*32
jb loop3
mov CTX, [rsp + _CTX]
mov INP, [rsp + _INP]
add INP, 64
addm [4*0 + CTX],a
addm [4*1 + CTX],b
addm [4*2 + CTX],c
addm [4*3 + CTX],d
addm [4*4 + CTX],e
addm [4*5 + CTX],f
addm [4*6 + CTX],g
addm [4*7 + CTX],h
cmp INP, [rsp + _INP_END]
jb loop0
ja done_hash
do_last_block:
;;;; do last block
lea TBL,[K256 wrt rip]
VMOVDQ XWORD0, [INP + 0*16]
VMOVDQ XWORD1, [INP + 1*16]
VMOVDQ XWORD2, [INP + 2*16]
VMOVDQ XWORD3, [INP + 3*16]
vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK
vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK
vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK
vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK
jmp last_block_enter
only_one_block:
;; load initial digest
mov a,[4*0 + CTX]
mov b,[4*1 + CTX]
mov c,[4*2 + CTX]
mov d,[4*3 + CTX]
mov e,[4*4 + CTX]
mov f,[4*5 + CTX]
mov g,[4*6 + CTX]
mov h,[4*7 + CTX]
vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
mov [rsp + _CTX], CTX
jmp do_last_block
done_hash:
%ifndef LINUX
vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif
mov rsp, [rsp + _RSP]
pop r15
pop r14
pop r13
pop r12
pop rbp
%ifndef LINUX
pop rdi
pop rsi
%endif
pop rbx
ret
section .data
align 64
K256:
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
PSHUFFLE_BYTE_FLIP_MASK:
ddq 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
; shuffle xBxA -> 00BA
_SHUF_00BA:
ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
; shuffle xDxC -> DC00
_SHUF_DC00:
ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

ccan/ccan/crypto/sha256/benchmarks/sha256_avx2_rorx8.asm (1507)

File diff suppressed because it is too large

ccan/ccan/crypto/sha256/benchmarks/sha256_sse4.asm (544)

@@ -0,0 +1,544 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the
; distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define MOVDQ movdqu ;; assume buffers not aligned
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
add %2, %1
mov %1, %2
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
MOVDQ %1, %2
pshufb %1, %3
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7
%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12
%ifdef LINUX
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg
%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg
%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d
%endif
%define TBL rbp
%define a eax
%define b ebx
%define f r9d
%define g r10d
%define h r11d
%define y0 r13d
%define y1 r14d
%define y2 r15d
_INP_END_SIZE equ 8
_INP_SIZE equ 8
_XFER_SIZE equ 8
%ifdef LINUX
_XMM_SAVE_SIZE equ 0
%else
_XMM_SAVE_SIZE equ 7*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE equ 8
_INP_END equ 0
_INP equ _INP_END + _INP_END_SIZE
_XFER equ _INP + _INP_SIZE
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm
; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
%macro FOUR_ROUNDS_AND_SCHED 0
;; compute s0 four at a time and s1 two at a time
;; compute W[-16] + W[-7] 4 at a time
movdqa XTMP0, X3
mov y0, e ; y0 = e
ror y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
ror y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
movdqa XTMP1, X1
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
;; compute s0
palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
pslld XTMP1, (32-7)
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
psrld XTMP2, 7
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
mov y0, e ; y0 = e
mov y1, a ; y1 = a
movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
ror y0, (25-11) ; y0 = e >> (25-11)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
ror y1, (22-13) ; y1 = a >> (22-13)
pslld XTMP3, (32-18)
xor y1, a ; y1 = a ^ (a >> (22-13)
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
psrld XTMP2, 18
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
pxor XTMP1, XTMP3
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
pxor XTMP1, XTMP4 ; XTMP1 = s0
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
;; compute low s1
pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
mov y0, e ; y0 = e
mov y1, a ; y1 = a
ror y0, (25-11) ; y0 = e >> (25-11)
movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
xor y0, e ; y0 = e ^ (e >> (25-11))
ror y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
xor y2, g ; y2 = f^g
psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
pxor XTMP2, XTMP3
add y2, y0 ; y2 = S1 + CH
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
;; compute high s1
pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
mov y0, e ; y0 = e
ror y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
ror y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
pxor XTMP2, XTMP3
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
pxor X0, XTMP2 ; X0 = s1 {xDxC}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
rotate_Xs
%endm
;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
mov y0, e ; y0 = e
ror y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
xor y0, e ; y0 = e ^ (e >> (25-11))
ror y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%endm
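For readers tracking the register comments: DO_ROUND is one SHA-256 compression round over the pre-added value K[i]+W[i] held at [rsp + _XFER]. The plain-C sketch below is not part of these sources; ror32(), the function name and the state-array layout are illustrative assumptions, but it uses the same CH and MAJ formulations as the comments above.

    /* Minimal sketch of one SHA-256 round, assuming a helper ror32(). */
    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* kw is K[i] + W[i], i.e. what the asm reads from [rsp + _XFER]. */
    static void sha256_round(uint32_t s[8], uint32_t kw)
    {
            uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
            uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
            uint32_t S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
            uint32_t ch = ((f ^ g) & e) ^ g;        /* same as (e&f)^(~e&g) */
            uint32_t S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
            uint32_t maj = ((a | c) & b) | (a & c); /* same as (a&b)^(a&c)^(b&c) */
            uint32_t t1 = h + S1 + ch + kw;
            uint32_t t2 = S0 + maj;

            /* ROTATE_ARGS in the asm renames registers instead of moving data. */
            s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
            s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }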
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_sse4
align 32
sha256_sse4:
push rbx
%ifndef LINUX
push rsi
push rdi
%endif
push rbp
push r13
push r14
push r15
sub rsp,STACK_SIZE
%ifndef LINUX
movdqa [rsp + _XMM_SAVE + 0*16],xmm6
movdqa [rsp + _XMM_SAVE + 1*16],xmm7
movdqa [rsp + _XMM_SAVE + 2*16],xmm8
movdqa [rsp + _XMM_SAVE + 3*16],xmm9
movdqa [rsp + _XMM_SAVE + 4*16],xmm10
movdqa [rsp + _XMM_SAVE + 5*16],xmm11
movdqa [rsp + _XMM_SAVE + 6*16],xmm12
%endif
shl NUM_BLKS, 6 ; convert to bytes
jz done_hash
add NUM_BLKS, INP ; pointer to end of data
mov [rsp + _INP_END], NUM_BLKS
;; load initial digest
mov a,[4*0 + CTX]
mov b,[4*1 + CTX]
mov c,[4*2 + CTX]
mov d,[4*3 + CTX]
mov e,[4*4 + CTX]
mov f,[4*5 + CTX]
mov g,[4*6 + CTX]
mov h,[4*7 + CTX]
movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
loop0:
lea TBL,[K256 wrt rip]
;; byte swap first 16 dwords
COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
mov [rsp + _INP], INP
;; schedule 48 input dwords, by doing 3 rounds of 16 each
mov SRND, 3
align 16
loop1:
movdqa XFER, [TBL + 0*16]
paddd XFER, X0
movdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
movdqa XFER, [TBL + 1*16]
paddd XFER, X0
movdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
movdqa XFER, [TBL + 2*16]
paddd XFER, X0
movdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
movdqa XFER, [TBL + 3*16]
paddd XFER, X0
movdqa [rsp + _XFER], XFER
add TBL, 4*16
FOUR_ROUNDS_AND_SCHED
sub SRND, 1
jne loop1
mov SRND, 2
loop2:
paddd X0, [TBL + 0*16]
movdqa [rsp + _XFER], X0
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3
paddd X1, [TBL + 1*16]
movdqa [rsp + _XFER], X1
add TBL, 2*16
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3
movdqa X0, X2
movdqa X1, X3
sub SRND, 1
jne loop2
addm [4*0 + CTX],a
addm [4*1 + CTX],b
addm [4*2 + CTX],c
addm [4*3 + CTX],d
addm [4*4 + CTX],e
addm [4*5 + CTX],f
addm [4*6 + CTX],g
addm [4*7 + CTX],h
mov INP, [rsp + _INP]
add INP, 64
cmp INP, [rsp + _INP_END]
jne loop0
done_hash:
%ifndef LINUX
movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
%endif
add rsp, STACK_SIZE
pop r15
pop r14
pop r13
pop rbp
%ifndef LINUX
pop rdi
pop rsi
%endif
pop rbx
ret
section .data
align 64
K256:
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
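The banner comment above the entry point is the whole external interface: the routine compresses whole 64-byte blocks into the digest in place. A hedged caller sketch follows; the wrapper name, the uint32_t/uint64_t spellings of UINT32/UINT64, and the fact that SHA-256 padding is left to the caller are assumptions, not something these benchmark sources ship.

    /* Hypothetical caller of the yasm-built routine. */
    #include <stdint.h>
    #include <string.h>

    void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);

    static void hash_blocks(uint32_t state[8], void *data, uint64_t nblocks)
    {
            /* Standard SHA-256 initial state, as loaded from CTX on entry. */
            static const uint32_t iv[8] = {
                    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
            };
            memcpy(state, iv, sizeof(iv));
            sha256_sse4(data, state, nblocks); /* data must be nblocks * 64 bytes */
    }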

2
ccan/ccan/crypto/sha256/sha256.c

@ -36,7 +36,7 @@ void sha256_init(struct sha256_ctx *ctx)
SHA256_Init(&ctx->c);
}
void sha256_update_bytes(struct sha256_ctx *ctx, const void *p, size_t size)
void sha256_update(struct sha256_ctx *ctx, const void *p, size_t size)
{
check_sha256(ctx);
SHA256_Update(&ctx->c, p, size);

1
ccan/ccan/htable/LICENSE

@ -0,0 +1 @@
../../licenses/LGPL-2.1

116
ccan/ccan/htable/_info

@ -0,0 +1,116 @@
#include "config.h"
#include <string.h>
#include <stdio.h>
/**
* htable - hash table routines
*
* A hash table is an efficient structure for looking up keys. This version
* grows with usage and allows efficient deletion.
*
* Example:
* #include <ccan/htable/htable.h>
* #include <ccan/hash/hash.h>
* #include <stdio.h>
* #include <err.h>
* #include <string.h>
*
* struct name_to_digit {
* const char *name;
* unsigned int val;
* };
*
* static struct name_to_digit map[] = {
* { "zero", 0},
* { "one", 1 },
* { "two", 2 },
* { "three", 3 },
* { "four", 4 },
* { "five", 5 },
* { "six", 6 },
* { "seven", 7 },
* { "eight", 8 },
* { "nine", 9 }
* };
*
* // Wrapper for rehash function pointer.
* static size_t rehash(const void *e, void *unused)
* {
* return hash_string(((struct name_to_digit *)e)->name);
* }
*
* // Comparison function.
* static bool streq(const void *e, void *string)
* {
* return strcmp(((struct name_to_digit *)e)->name, string) == 0;
* }
*
* // We let them add their own aliases, eg. --alias=v=5
* static void add_alias(struct htable *ht, const char *alias)
* {
* char *eq;
* struct name_to_digit *n;
*
* n = malloc(sizeof(*n));
* n->name = strdup(alias);
*
* eq = strchr(n->name, '=');
* if (!eq || ((n->val = atoi(eq+1)) == 0 && !strcmp(eq+1, "0")))
* errx(1, "Usage: --alias=<name>=<value>");
* *eq = '\0';
* htable_add(ht, hash_string(n->name), n);
* }
*
* int main(int argc, char *argv[])
* {
* struct htable ht;
* unsigned int i;
* unsigned long val;
*
* if (argc < 2)
* errx(1, "Usage: %s [--alias=<name>=<val>]... <str>...",
* argv[0]);
*
* // Create and populate hash table.
* htable_init(&ht, rehash, NULL);
* for (i = 0; i < sizeof(map)/sizeof(map[0]); i++)
* htable_add(&ht, hash_string(map[i].name), &map[i]);
*
* // Add any aliases to the hash table.
* for (i = 1; i < argc; i++) {
* if (!strncmp(argv[i], "--alias=", strlen("--alias=")))
* add_alias(&ht, argv[i] + strlen("--alias="));
* else
* break;
* }
*
* // Find the other args in the hash table.
* for (val = 0; i < argc; i++) {
* struct name_to_digit *n;
* n = htable_get(&ht, hash_string(argv[i]),
* streq, argv[i]);
* if (!n)
* errx(1, "Invalid digit name %s", argv[i]);
* // Append it to the value we are building up.
* val *= 10;
* val += n->val;
* }
* printf("%lu\n", val);
* return 0;
* }
*
* License: LGPL (v2.1 or any later version)
* Author: Rusty Russell <rusty@rustcorp.com.au>
*/
int main(int argc, char *argv[])
{
if (argc != 2)
return 1;
if (strcmp(argv[1], "depends") == 0) {
printf("ccan/compiler\n");
return 0;
}
return 1;
}

296
ccan/ccan/htable/htable.c

@ -0,0 +1,296 @@
/* Licensed under LGPLv2+ - see LICENSE file for details */
#include <ccan/htable/htable.h>
#include <ccan/compiler/compiler.h>
#include <stdlib.h>
#include <limits.h>
#include <stdbool.h>
#include <assert.h>
/* We use 0x1 as deleted marker. */
#define HTABLE_DELETED (0x1)
/* We clear out the bits which are always the same, and put metadata there. */
static inline uintptr_t get_extra_ptr_bits(const struct htable *ht,
uintptr_t e)
{
return e & ht->common_mask;
}
static inline void *get_raw_ptr(const struct htable *ht, uintptr_t e)
{
return (void *)((e & ~ht->common_mask) | ht->common_bits);
}
static inline uintptr_t make_hval(const struct htable *ht,
const void *p, uintptr_t bits)
{
return ((uintptr_t)p & ~ht->common_mask) | bits;
}
static inline bool entry_is_valid(uintptr_t e)
{
return e > HTABLE_DELETED;
}
static inline uintptr_t get_hash_ptr_bits(const struct htable *ht,
size_t hash)
{
/* Shuffling the extra bits (as specified in mask) down the
* end is quite expensive. But the lower bits are redundant, so
* we fold the value first. */
return (hash ^ (hash >> ht->bits))
& ht->common_mask & ~ht->perfect_bit;
}
void htable_init(struct htable *ht,
size_t (*rehash)(const void *elem, void *priv), void *priv)
{
struct htable empty = HTABLE_INITIALIZER(empty, NULL, NULL);
*ht = empty;
ht->rehash = rehash;
ht->priv = priv;
ht->table = &ht->perfect_bit;
}
bool htable_init_sized(struct htable *ht,
size_t (*rehash)(const void *, void *),
void *priv, size_t expect)
{
htable_init(ht, rehash, priv);
/* Don't go insane with sizing. */
for (ht->bits = 1; ((size_t)3 << ht->bits) / 4 < expect; ht->bits++) {
if (ht->bits == 30)
break;
}
ht->table = calloc(1 << ht->bits, sizeof(size_t));
if (!ht->table) {
ht->table = &ht->perfect_bit;
return false;
}
ht->max = ((size_t)3 << ht->bits) / 4;
ht->max_with_deleted = ((size_t)9 << ht->bits) / 10;
return true;
}
void htable_clear(struct htable *ht)
{
if (ht->table != &ht->perfect_bit)
free((void *)ht->table);
htable_init(ht, ht->rehash, ht->priv);
}
static size_t hash_bucket(const struct htable *ht, size_t h)
{
return h & ((1 << ht->bits)-1);
}
static void *htable_val(const struct htable *ht,
struct htable_iter *i, size_t hash, uintptr_t perfect)
{
uintptr_t h2 = get_hash_ptr_bits(ht, hash) | perfect;
while (ht->table[i->off]) {
if (ht->table[i->off] != HTABLE_DELETED) {
if (get_extra_ptr_bits(ht, ht->table[i->off]) == h2)
return get_raw_ptr(ht, ht->table[i->off]);
}
i->off = (i->off + 1) & ((1 << ht->bits)-1);
h2 &= ~perfect;
}
return NULL;
}
void *htable_firstval(const struct htable *ht,
struct htable_iter *i, size_t hash)
{
i->off = hash_bucket(ht, hash);
return htable_val(ht, i, hash, ht->perfect_bit);
}
void *htable_nextval(const struct htable *ht,
struct htable_iter *i, size_t hash)
{
i->off = (i->off + 1) & ((1 << ht->bits)-1);
return htable_val(ht, i, hash, 0);
}
void *htable_first(const struct htable *ht, struct htable_iter *i)
{
for (i->off = 0; i->off < (size_t)1 << ht->bits; i->off++) {
if (entry_is_valid(ht->table[i->off]))
return get_raw_ptr(ht, ht->table[i->off]);
}
return NULL;
}
void *htable_next(const struct htable *ht, struct htable_iter *i)
{
for (i->off++; i->off < (size_t)1 << ht->bits; i->off++) {
if (entry_is_valid(ht->table[i->off]))
return get_raw_ptr(ht, ht->table[i->off]);
}
return NULL;
}
/* This does not expand the hash table, that's up to caller. */
static void ht_add(struct htable *ht, const void *new, size_t h)
{
size_t i;
uintptr_t perfect = ht->perfect_bit;
i = hash_bucket(ht, h);
while (entry_is_valid(ht->table[i])) {
perfect = 0;
i = (i + 1) & ((1 << ht->bits)-1);
}
ht->table[i] = make_hval(ht, new, get_hash_ptr_bits(ht, h)|perfect);
}
static COLD bool double_table(struct htable *ht)
{
unsigned int i;
size_t oldnum = (size_t)1 << ht->bits;
uintptr_t *oldtable, e;
oldtable = ht->table;
ht->table = calloc(1 << (ht->bits+1), sizeof(size_t));
if (!ht->table) {
ht->table = oldtable;
return false;
}
ht->bits++;
ht->max = ((size_t)3 << ht->bits) / 4;
ht->max_with_deleted = ((size_t)9 << ht->bits) / 10;
/* If we lost our "perfect bit", get it back now. */
if (!ht->perfect_bit && ht->common_mask) {
for (i = 0; i < sizeof(ht->common_mask) * CHAR_BIT; i++) {
if (ht->common_mask & ((size_t)1 << i)) {
ht->perfect_bit = (size_t)1 << i;
break;
}
}
}
if (oldtable != &ht->perfect_bit) {
for (i = 0; i < oldnum; i++) {
if (entry_is_valid(e = oldtable[i])) {
void *p = get_raw_ptr(ht, e);
ht_add(ht, p, ht->rehash(p, ht->priv));
}
}
free(oldtable);
}
ht->deleted = 0;
return true;
}
static COLD void rehash_table(struct htable *ht)
{
size_t start, i;
uintptr_t e;
/* Beware wrap cases: we need to start from first empty bucket. */
for (start = 0; ht->table[start]; start++);
for (i = 0; i < (size_t)1 << ht->bits; i++) {
size_t h = (i + start) & ((1 << ht->bits)-1);
e = ht->table[h];
if (!e)
continue;
if (e == HTABLE_DELETED)
ht->table[h] = 0;
else if (!(e & ht->perfect_bit)) {
void *p = get_raw_ptr(ht, e);
ht->table[h] = 0;
ht_add(ht, p, ht->rehash(p, ht->priv));
}
}
ht->deleted = 0;
}
/* We stole some bits, now we need to put them back... */
static COLD void update_common(struct htable *ht, const void *p)
{
unsigned int i;
uintptr_t maskdiff, bitsdiff;
if (ht->elems == 0) {
/* Always reveal one bit of the pointer in the bucket,
* so it's not zero or HTABLE_DELETED (1), even if
* hash happens to be 0. Assumes (void *)1 is not a
* valid pointer. */
for (i = sizeof(uintptr_t)*CHAR_BIT - 1; i > 0; i--) {
if ((uintptr_t)p & ((uintptr_t)1 << i))
break;
}
ht->common_mask = ~((uintptr_t)1 << i);
ht->common_bits = ((uintptr_t)p & ht->common_mask);
ht->perfect_bit = 1;
return;
}
/* Find bits which are unequal to old common set. */
maskdiff = ht->common_bits ^ ((uintptr_t)p & ht->common_mask);
/* These are the bits which go there in existing entries. */
bitsdiff = ht->common_bits & maskdiff;
for (i = 0; i < (size_t)1 << ht->bits; i++) {
if (!entry_is_valid(ht->table[i]))
continue;
/* Clear the bits no longer in the mask, set them as
* expected. */
ht->table[i] &= ~maskdiff;
ht->table[i] |= bitsdiff;
}
/* Take away those bits from our mask, bits and perfect bit. */
ht->common_mask &= ~maskdiff;
ht->common_bits &= ~maskdiff;
ht->perfect_bit &= ~maskdiff;
}
bool htable_add(struct htable *ht, size_t hash, const void *p)
{
if (ht->elems+1 > ht->max && !double_table(ht))
return false;
if (ht->elems+1 + ht->deleted > ht->max_with_deleted)
rehash_table(ht);
assert(p);
if (((uintptr_t)p & ht->common_mask) != ht->common_bits)
update_common(ht, p);
ht_add(ht, p, hash);
ht->elems++;
return true;
}
bool htable_del(struct htable *ht, size_t h, const void *p)
{
struct htable_iter i;
void *c;
for (c = htable_firstval(ht,&i,h); c; c = htable_nextval(ht,&i,h)) {
if (c == p) {
htable_delval(ht, &i);
return true;
}
}
return false;
}
void htable_delval(struct htable *ht, struct htable_iter *i)
{
assert(i->off < (size_t)1 << ht->bits);
assert(entry_is_valid(ht->table[i->off]));
ht->elems--;
ht->table[i->off] = HTABLE_DELETED;
ht->deleted++;
}
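The central trick in htable.c is that bits shared by every stored pointer (common_mask/common_bits) carry no information, so each bucket overwrites them with a few hash bits plus the perfect bit, and get_raw_ptr() ORs the shared bits back in on lookup; update_common() must therefore rewrite every bucket when a new pointer breaks the previously shared bits. The standalone sketch below (illustrative only, not code from this module) shows how such shared bits can be found.

    /* Illustrative sketch: compute which pointer bits are common to a set. */
    #include <stdint.h>
    #include <stdio.h>

    static void show_common_bits(void *ptrs[], unsigned n)
    {
            uintptr_t common_mask = ~(uintptr_t)0;
            uintptr_t common_bits = (uintptr_t)ptrs[0];
            unsigned i;

            for (i = 1; i < n; i++) {
                    /* Clear mask bits wherever this pointer differs from the first. */
                    common_mask &= ~(common_bits ^ (uintptr_t)ptrs[i]);
            }
            common_bits &= common_mask;
            printf("mask %#zx, shared bits %#zx\n",
                   (size_t)common_mask, (size_t)common_bits);
    }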

191
ccan/ccan/htable/htable.h

@ -0,0 +1,191 @@
/* Licensed under LGPLv2+ - see LICENSE file for details */
#ifndef CCAN_HTABLE_H
#define CCAN_HTABLE_H
#include "config.h"
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
/**
* struct htable - private definition of a htable.
*
* It's exposed here so you can put it in your structures and so we can
* supply inline functions.
*/
struct htable {
size_t (*rehash)(const void *elem, void *priv);
void *priv;
unsigned int bits;
size_t elems, deleted, max, max_with_deleted;
/* These are the bits which are the same in all pointers. */
uintptr_t common_mask, common_bits;
uintptr_t perfect_bit;
uintptr_t *table;
};
/**
* HTABLE_INITIALIZER - static initialization for a hash table.
* @name: name of this htable.
* @rehash: hash function to use for rehashing.
* @priv: private argument to @rehash function.
*
* This is useful for setting up static and global hash tables.
*
* Example:
* // For simplicity's sake, say hash value is contents of elem.
* static size_t rehash(const void *elem, void *unused)
* {
* return *(size_t *)elem;
* }
* static struct htable ht = HTABLE_INITIALIZER(ht, rehash, NULL);
*/
#define HTABLE_INITIALIZER(name, rehash, priv) \
{ rehash, priv, 0, 0, 0, 0, 0, -1, 0, 0, &name.perfect_bit }
/**
* htable_init - initialize an empty hash table.
* @ht: the hash table to initialize
* @rehash: hash function to use for rehashing.
* @priv: private argument to @rehash function.
*/
void htable_init(struct htable *ht,
size_t (*rehash)(const void *elem, void *priv), void *priv);
/**
* htable_init_sized - initialize an empty hash table of given size.
* @ht: the hash table to initialize
* @rehash: hash function to use for rehashing.
* @priv: private argument to @rehash function.
* @size: the number of elements.
*
* If this returns false, @ht is still usable, but may need to do reallocation
* upon an add. If this returns true, it will not need to reallocate within
* @size htable_adds.
*/
bool htable_init_sized(struct htable *ht,
size_t (*rehash)(const void *elem, void *priv),
void *priv, size_t size);
/**
* htable_clear - empty a hash table.
* @ht: the hash table to clear
*
* This doesn't do anything to any pointers left in it.
*/
void htable_clear(struct htable *ht);
/**
* htable_rehash - use a hashtable's rehash function
* @elem: the argument to rehash()
*
*/
size_t htable_rehash(const void *elem);
/**
* htable_add - add a pointer into a hash table.
* @ht: the htable
* @hash: the hash value of the object
* @p: the non-NULL pointer
*
* Also note that this can only fail due to allocation failure. Otherwise, it
* returns true.
*/
bool htable_add(struct htable *ht, size_t hash, const void *p);
/**
* htable_del - remove a pointer from a hash table
* @ht: the htable
* @hash: the hash value of the object
* @p: the pointer
*
* Returns true if the pointer was found (and deleted).
*/
bool htable_del(struct htable *ht, size_t hash, const void *p);
/**
* struct htable_iter - iterator for htable_first or htable_firstval etc.
*
* This refers to a location inside the hashtable.
*/
struct htable_iter {
size_t off;
};
/**
* htable_firstval - find a candidate for a given hash value
* @htable: the hashtable
* @i: the struct htable_iter to initialize
* @hash: the hash value
*
* You'll need to check the value is what you want; returns NULL if none.
* See Also:
* htable_delval()
*/
void *htable_firstval(const struct htable *htable,
struct htable_iter *i, size_t hash);
/**
* htable_nextval - find another candidate for a given hash value
* @htable: the hashtable
* @i: the struct htable_iter to use
* @hash: the hash value
*
* You'll need to check the value is what you want; returns NULL if no more.
*/
void *htable_nextval(const struct htable *htable,
struct htable_iter *i, size_t hash);
/**
* htable_get - find an entry in the hash table
* @ht: the hashtable
* @h: the hash value of the entry
* @cmp: the comparison function
* @ptr: the pointer to hand to the comparison function.
*
* Convenient inline wrapper for htable_firstval/htable_nextval loop.
*/
static inline void *htable_get(const struct htable *ht,
size_t h,
bool (*cmp)(const void *candidate, void *ptr),
const void *ptr)
{
struct htable_iter i;
void *c;
for (c = htable_firstval(ht,&i,h); c; c = htable_nextval(ht,&i,h)) {
if (cmp(c, (void *)ptr))
return c;
}
return NULL;
}
/**
* htable_first - find an entry in the hash table
* @ht: the hashtable
* @i: the struct htable_iter to initialize
*
* Get an entry in the hashtable; NULL if empty.
*/
void *htable_first(const struct htable *htable, struct htable_iter *i);
/**
* htable_next - find another entry in the hash table
* @ht: the hashtable
* @i: the struct htable_iter to use
*
* Get another entry in the hashtable; NULL if all done.
* This is usually used after htable_first or prior non-NULL htable_next.
*/
void *htable_next(const struct htable *htable, struct htable_iter *i);
/**
* htable_delval - remove an iterated pointer from a hash table
* @ht: the htable
* @i: the htable_iter
*
* Usually used to delete a hash entry after it has been found with
* htable_firstval etc.
*/
void htable_delval(struct htable *ht, struct htable_iter *i);
#endif /* CCAN_HTABLE_H */
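A hedged sketch of the firstval/nextval/delval loop the comments above describe; struct item, its name field, the hash argument and the surrounding function are assumptions for illustration only.

    #include <string.h>
    #include <ccan/htable/htable.h>

    struct item {
            const char *name;
            int value;
    };

    /* h is the element's hash value (e.g. from ccan/hash's hash_string()). */
    static bool remove_by_name(struct htable *ht, const char *name, size_t h)
    {
            struct htable_iter it;
            struct item *item;

            /* Walk every candidate that shares this hash value. */
            for (item = htable_firstval(ht, &it, h);
                 item;
                 item = htable_nextval(ht, &it, h)) {
                    if (strcmp(item->name, name) == 0) {
                            /* delval uses the iterator, not the hash. */
                            htable_delval(ht, &it);
                            return true;
                    }
            }
            return false;
    }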

108
ccan/ccan/htable/htable_type.h

@ -0,0 +1,108 @@
/* Licensed under LGPLv2+ - see LICENSE file for details */
#ifndef CCAN_HTABLE_TYPE_H
#define CCAN_HTABLE_TYPE_H
#include <ccan/htable/htable.h>
#include "config.h"
/**
* HTABLE_DEFINE_TYPE - create a set of htable ops for a type
* @type: a type whose pointers will be values in the hash.
* @keyof: a function/macro to extract a key: <keytype> @keyof(const type *elem)
* @hashfn: a hash function for a @key: size_t @hashfn(const <keytype> *)
* @eqfn: an equality function for keys: bool @eqfn(const type *, const <keytype> *)
* @name: a prefix for all the functions to define (of form <name>_*)
*
* NULL values may not be placed into the hash table.
*
* This defines the hashtable type and an iterator type:
* struct <name>;
* struct <name>_iter;
*
* It also defines initialization and freeing functions:
* void <name>_init(struct <name> *);
* void <name>_init_sized(struct <name> *, size_t);
* void <name>_clear(struct <name> *);
*
* Add function only fails if we run out of memory:
* bool <name>_add(struct <name> *ht, const <type> *e);
*
* Delete and delete-by key return true if it was in the set:
* bool <name>_del(struct <name> *ht, const <type> *e);
* bool <name>_delkey(struct <name> *ht, const <keytype> *k);
*
* Find function returns the matching element, or NULL:
* type *<name>_get(const struct <name> *ht, const <keytype> *k);
*
* Iteration over hashtable is also supported:
* type *<name>_first(const struct <name> *ht, struct <name>_iter *i);
* type *<name>_next(const struct <name> *ht, struct <name>_iter *i);
*
* It's currently safe to iterate over a changing hashtable, but you might
* miss an element. Iteration isn't very efficient, either.
*
* You can use HTABLE_INITIALIZER like so:
* struct <name> ht = { HTABLE_INITIALIZER(ht.raw, <name>_hash, NULL) };
*/
#define HTABLE_DEFINE_TYPE(type, keyof, hashfn, eqfn, name) \
struct name { struct htable raw; }; \
struct name##_iter { struct htable_iter i; }; \
static inline size_t name##_hash(const void *elem, void *priv) \
{ \
return hashfn(keyof((const type *)elem)); \
} \
static inline void name##_init(struct name *ht) \
{ \
htable_init(&ht->raw, name##_hash, NULL); \
} \
static inline void name##_init_sized(struct name *ht, size_t s) \
{ \
htable_init_sized(&ht->raw, name##_hash, NULL, s); \
} \
static inline void name##_clear(struct name *ht) \
{ \
htable_clear(&ht->raw); \
} \
static inline bool name##_add(struct name *ht, const type *elem) \
{ \
return htable_add(&ht->raw, hashfn(keyof(elem)), elem); \
} \
static inline bool name##_del(struct name *ht, const type *elem) \
{ \
return htable_del(&ht->raw, hashfn(keyof(elem)), elem); \
} \
static inline type *name##_get(const struct name *ht, \
const HTABLE_KTYPE(keyof) k) \
{ \
/* Typecheck for eqfn */ \
(void)sizeof(eqfn((const type *)NULL, \
keyof((const type *)NULL))); \
return htable_get(&ht->raw, \
hashfn(k), \
(bool (*)(const void *, void *))(eqfn), \
k); \
} \
static inline bool name##_delkey(struct name *ht, \
const HTABLE_KTYPE(keyof) k) \
{ \
type *elem = name##_get(ht, k); \
if (elem) \
return name##_del(ht, elem); \
return false; \
} \
static inline type *name##_first(const struct name *ht, \
struct name##_iter *iter) \
{ \
return htable_first(&ht->raw, &iter->i); \
} \
static inline type *name##_next(const struct name *ht, \
struct name##_iter *iter) \
{ \
return htable_next(&ht->raw, &iter->i); \
}
#if HAVE_TYPEOF
#define HTABLE_KTYPE(keyof) typeof(keyof(NULL))
#else
#define HTABLE_KTYPE(keyof) void *
#endif
#endif /* CCAN_HTABLE_TYPE_H */
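A hedged sketch of the documented HTABLE_DEFINE_TYPE wiring; the cache_entry type and the use of ccan/hash's hash_string() are assumptions, not part of this module.

    #include <ccan/htable/htable_type.h>
    #include <ccan/hash/hash.h>
    #include <string.h>

    struct cache_entry {
            const char *path;
            void *data;
    };

    /* Key accessor, key hash and key equality, as the macro expects. */
    static const char *entry_path(const struct cache_entry *e)
    {
            return e->path;
    }
    static size_t hash_path(const char *path)
    {
            return hash_string(path);
    }
    static bool entry_eq(const struct cache_entry *e, const char *path)
    {
            return strcmp(e->path, path) == 0;
    }

    /* Defines struct cache, cache_init(), cache_add(), cache_get(),
     * cache_del(), cache_delkey(), cache_first(), cache_next(), ... */
    HTABLE_DEFINE_TYPE(struct cache_entry, entry_path, hash_path, entry_eq, cache);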

36
ccan/ccan/htable/test/run-size.c

@ -0,0 +1,36 @@
#include <ccan/htable/htable.h>
#include <ccan/htable/htable.c>
#include <ccan/tap/tap.h>
#include <stdbool.h>
#include <string.h>
#define NUM_VALS 512
/* We use the number divided by two as the hash (for lots of
collisions). */
static size_t hash(const void *elem, void *unused)
{
size_t h = *(uint64_t *)elem / 2;
return h;
}
int main(int argc, char *argv[])
{
struct htable ht;
uint64_t val[NUM_VALS];
unsigned int i;
plan_tests((NUM_VALS) * 2);
for (i = 0; i < NUM_VALS; i++)
val[i] = i;
htable_init(&ht, hash, NULL);
for (i = 0; i < NUM_VALS; i++) {
ok1(ht.max >= i);
ok1(ht.max <= i * 2);
htable_add(&ht, hash(&val[i], NULL), &val[i]);
}
htable_clear(&ht);
return exit_status();
}

175
ccan/ccan/htable/test/run-type.c

@ -0,0 +1,175 @@
#include <ccan/htable/htable_type.h>
#include <ccan/htable/htable.c>
#include <ccan/tap/tap.h>
#include <stdbool.h>
#include <string.h>
#define NUM_BITS 7
#define NUM_VALS (1 << NUM_BITS)
struct obj {
/* Makes sure we don't try to treat an obj as a key or vice versa */
unsigned char unused;
unsigned int key;
};
static const unsigned int *objkey(const struct obj *obj)
{
return &obj->key;
}
/* We use the number divided by two as the hash (for lots of
collisions), plus set all the higher bits so we can detect if they
don't get masked out. */
static size_t objhash(const unsigned int *key)
{
size_t h = *key / 2;
h |= -1UL << NUM_BITS;
return h;
}
static bool cmp(const struct obj *obj, const unsigned int *key)
{
return obj->key == *key;
}
HTABLE_DEFINE_TYPE(struct obj, objkey, objhash, cmp, htable_obj);
static void add_vals(struct htable_obj *ht,
struct obj val[], unsigned int num)
{
unsigned int i;
for (i = 0; i < num; i++) {
if (htable_obj_get(ht, &i)) {
fail("%u already in hash", i);
return;
}
htable_obj_add(ht, &val[i]);
if (htable_obj_get(ht, &i) != &val[i]) {
fail("%u not added to hash", i);
return;
}
}
pass("Added %u numbers to hash", i);
}
static void find_vals(const struct htable_obj *ht,
const struct obj val[], unsigned int num)
{
unsigned int i;
for (i = 0; i < num; i++) {
if (htable_obj_get(ht, &i) != &val[i]) {
fail("%u not found in hash", i);
return;
}
}
pass("Found %u numbers in hash", i);
}
static void del_vals(struct htable_obj *ht,
const struct obj val[], unsigned int num)
{
unsigned int i;
for (i = 0; i < num; i++) {
if (!htable_obj_delkey(ht, &val[i].key)) {
fail("%u not deleted from hash", i);
return;
}
}
pass("Deleted %u numbers in hash", i);
}
static void del_vals_bykey(struct htable_obj *ht,
const struct obj val[], unsigned int num)
{
unsigned int i;
for (i = 0; i < num; i++) {
if (!htable_obj_delkey(ht, &i)) {
fail("%u not deleted by key from hash", i);
return;
}
}
pass("Deleted %u numbers by key from hash", i);
}
static bool check_mask(struct htable *ht, const struct obj val[], unsigned num)
{
uint64_t i;
for (i = 0; i < num; i++) {
if (((uintptr_t)&val[i] & ht->common_mask) != ht->common_bits)
return false;
}
return true;
}
int main(int argc, char *argv[])
{
unsigned int i;
struct htable_obj ht;
struct obj val[NUM_VALS];
unsigned int dne;
void *p;
struct htable_obj_iter iter;
plan_tests(20);
for (i = 0; i < NUM_VALS; i++)
val[i].key = i;
dne = i;
htable_obj_init(&ht);
ok1(ht.raw.max == 0);
ok1(ht.raw.bits == 0);
/* We cannot find an entry which doesn't exist. */
ok1(!htable_obj_get(&ht, &dne));
/* Fill it, it should increase in size. */
add_vals(&ht, val, NUM_VALS);
ok1(ht.raw.bits == NUM_BITS + 1);
ok1(ht.raw.max < (1 << ht.raw.bits));
/* Mask should be set. */
ok1(ht.raw.common_mask != 0);
ok1(ht.raw.common_mask != -1);
ok1(check_mask(&ht.raw, val, NUM_VALS));
/* Find all. */
find_vals(&ht, val, NUM_VALS);
ok1(!htable_obj_get(&ht, &dne));
/* Walk once, should get them all. */
i = 0;
for (p = htable_obj_first(&ht,&iter); p; p = htable_obj_next(&ht, &iter))
i++;
ok1(i == NUM_VALS);
/* Delete all. */
del_vals(&ht, val, NUM_VALS);
ok1(!htable_obj_get(&ht, &val[0].key));
/* Worst case, a "pointer" which doesn't have any matching bits. */
htable_add(&ht.raw, 0, (void *)~(uintptr_t)&val[NUM_VALS-1]);
htable_obj_add(&ht, &val[NUM_VALS-1]);
ok1(ht.raw.common_mask == 0);
ok1(ht.raw.common_bits == 0);
/* Delete the bogus one before we trip over it. */
htable_del(&ht.raw, 0, (void *)~(uintptr_t)&val[NUM_VALS-1]);
/* Add the rest. */
add_vals(&ht, val, NUM_VALS-1);
/* Check we can find them all. */
find_vals(&ht, val, NUM_VALS);
ok1(!htable_obj_get(&ht, &dne));
/* Delete them all by key. */
del_vals_bykey(&ht, val, NUM_VALS);
htable_obj_clear(&ht);
return exit_status();
}

61
ccan/ccan/htable/test/run-zero-hash-first-entry.c

@ -0,0 +1,61 @@
#include <ccan/htable/htable.h>
#include <ccan/htable/htable.c>
#include <ccan/tap/tap.h>
#include <stdbool.h>
struct data {
size_t key;
};
/* Hash is simply key itself. */
static size_t hash(const void *e, void *unused)
{
struct data *d = (struct data *)e;
return d->key;
}
static bool eq(const void *e, void *k)
{
struct data *d = (struct data *)e;
size_t *key = (size_t *)k;
return (d->key == *key);
}
int main(void)
{
struct htable table;
struct data *d0, *d1;
plan_tests(6);
d1 = malloc(sizeof(struct data));
d1->key = 1;
d0 = malloc(sizeof(struct data));
d0->key = 0;
htable_init(&table, hash, NULL);
htable_add(&table, d0->key, d0);
htable_add(&table, d1->key, d1);
ok1(table.elems == 2);
ok1(htable_get(&table, 1, eq, &d1->key) == d1);
ok1(htable_get(&table, 0, eq, &d0->key) == d0);
htable_clear(&table);
/* Now add in reverse order, should still be OK. */
htable_add(&table, d1->key, d1);
htable_add(&table, d0->key, d0);
ok1(table.elems == 2);
ok1(htable_get(&table, 1, eq, &d1->key) == d1);
ok1(htable_get(&table, 0, eq, &d0->key) == d0);
htable_clear(&table);
free(d0);
free(d1);
return exit_status();
}

207
ccan/ccan/htable/test/run.c

@ -0,0 +1,207 @@
#include <ccan/htable/htable.h>
#include <ccan/htable/htable.c>
#include <ccan/tap/tap.h>
#include <stdbool.h>
#include <string.h>
#define NUM_BITS 7
#define NUM_VALS (1 << NUM_BITS)
/* We use the number divided by two as the hash (for lots of
collisions), plus set all the higher bits so we can detect if they
don't get masked out. */
static size_t hash(const void *elem, void *unused)
{
size_t h = *(uint64_t *)elem / 2;
h |= -1UL << NUM_BITS;
return h;
}
static bool objcmp(const void *htelem, void *cmpdata)
{
return *(uint64_t *)htelem == *(uint64_t *)cmpdata;
}
static void add_vals(struct htable *ht,
const uint64_t val[],
unsigned int off, unsigned int num)
{
uint64_t i;
for (i = off; i < off+num; i++) {
if (htable_get(ht, hash(&i, NULL), objcmp, &i)) {
fail("%llu already in hash", (long long)i);
return;
}
htable_add(ht, hash(&val[i], NULL), &val[i]);
if (htable_get(ht, hash(&i, NULL), objcmp, &i) != &val[i]) {
fail("%llu not added to hash", (long long)i);
return;
}
}
pass("Added %llu numbers to hash", (long long)i);
}
#if 0
static void refill_vals(struct htable *ht,
const uint64_t val[], unsigned int num)
{
uint64_t i;
for (i = 0; i < num; i++) {
if (htable_get(ht, hash(&i, NULL), objcmp, &i))
continue;
htable_add(ht, hash(&val[i], NULL), &val[i]);
}
}
#endif
static void find_vals(struct htable *ht,
const uint64_t val[], unsigned int num)
{
uint64_t i;
for (i = 0; i < num; i++) {
if (htable_get(ht, hash(&i, NULL), objcmp, &i) != &val[i]) {
fail("%llu not found in hash", (long long)i);
return;
}
}
pass("Found %llu numbers in hash", (long long)i);
}
static void del_vals(struct htable *ht,
const uint64_t val[], unsigned int num)
{
uint64_t i;
for (i = 0; i < num; i++) {
if (!htable_del(ht, hash(&val[i], NULL), &val[i])) {
fail("%llu not deleted from hash", (long long)i);
return;
}
}
pass("Deleted %llu numbers in hash", (long long)i);
}
static bool check_mask(struct htable *ht, uint64_t val[], unsigned num)
{
uint64_t i;
for (i = 0; i < num; i++) {
if (((uintptr_t)&val[i] & ht->common_mask) != ht->common_bits)
return false;
}
return true;
}
int main(int argc, char *argv[])
{
unsigned int i, weight;
uintptr_t perfect_bit;
struct htable ht;
uint64_t val[NUM_VALS];
uint64_t dne;
void *p;
struct htable_iter iter;
plan_tests(35);
for (i = 0; i < NUM_VALS; i++)
val[i] = i;
dne = i;
htable_init(&ht, hash, NULL);
ok1(ht.max == 0);
ok1(ht.bits == 0);
/* We cannot find an entry which doesn't exist. */
ok1(!htable_get(&ht, hash(&dne, NULL), objcmp, &dne));
/* This should increase it once. */
add_vals(&ht, val, 0, 1);
ok1(ht.bits == 1);
ok1(ht.max == 1);
weight = 0;
for (i = 0; i < sizeof(ht.common_mask) * CHAR_BIT; i++) {
if (ht.common_mask & ((uintptr_t)1 << i)) {
weight++;
}
}
/* Only one bit should be clear. */
ok1(weight == i-1);
/* Mask should be set. */
ok1(check_mask(&ht, val, 1));
/* This should increase it again. */
add_vals(&ht, val, 1, 1);
ok1(ht.bits == 2);
ok1(ht.max == 3);
/* Mask should be set. */
ok1(ht.common_mask != 0);
ok1(ht.common_mask != -1);
ok1(check_mask(&ht, val, 2));
/* Now do the rest. */
add_vals(&ht, val, 2, NUM_VALS - 2);
/* Find all. */
find_vals(&ht, val, NUM_VALS);
ok1(!htable_get(&ht, hash(&dne, NULL), objcmp, &dne));
/* Walk once, should get them all. */
i = 0;
for (p = htable_first(&ht,&iter); p; p = htable_next(&ht, &iter))
i++;
ok1(i == NUM_VALS);
/* Delete all. */
del_vals(&ht, val, NUM_VALS);
ok1(!htable_get(&ht, hash(&val[0], NULL), objcmp, &val[0]));
/* Worst case, a "pointer" which doesn't have any matching bits. */
htable_add(&ht, 0, (void *)~(uintptr_t)&val[NUM_VALS-1]);
htable_add(&ht, hash(&val[NUM_VALS-1], NULL), &val[NUM_VALS-1]);
ok1(ht.common_mask == 0);
ok1(ht.common_bits == 0);
/* Get rid of bogus pointer before we trip over it! */
htable_del(&ht, 0, (void *)~(uintptr_t)&val[NUM_VALS-1]);
/* Add the rest. */
add_vals(&ht, val, 0, NUM_VALS-1);
/* Check we can find them all. */
find_vals(&ht, val, NUM_VALS);
ok1(!htable_get(&ht, hash(&dne, NULL), objcmp, &dne));
/* Corner cases: wipe out the perfect bit using bogus pointer. */
htable_clear(&ht);
htable_add(&ht, 0, (void *)((uintptr_t)&val[NUM_VALS-1]));
ok1(ht.perfect_bit);
perfect_bit = ht.perfect_bit;
htable_add(&ht, 0, (void *)((uintptr_t)&val[NUM_VALS-1]
| perfect_bit));
ok1(ht.perfect_bit == 0);
htable_del(&ht, 0, (void *)((uintptr_t)&val[NUM_VALS-1] | perfect_bit));
/* Enlarging should restore it... */
add_vals(&ht, val, 0, NUM_VALS-1);
ok1(ht.perfect_bit != 0);
htable_clear(&ht);
ok1(htable_init_sized(&ht, hash, NULL, 1024));
ok1(ht.max >= 1024);
htable_clear(&ht);
ok1(htable_init_sized(&ht, hash, NULL, 1023));
ok1(ht.max >= 1023);
htable_clear(&ht);
ok1(htable_init_sized(&ht, hash, NULL, 1025));
ok1(ht.max >= 1025);
htable_clear(&ht);
return exit_status();
}

40
ccan/ccan/htable/tools/Makefile

@ -0,0 +1,40 @@
CCANDIR=../../..
CFLAGS=-Wall -Werror -O3 -I$(CCANDIR)
#CFLAGS=-Wall -Werror -g -I$(CCANDIR)
CCAN_OBJS:=ccan-tal.o ccan-tal-str.o ccan-tal-grab_file.o ccan-take.o ccan-time.o ccan-str.o ccan-noerr.o ccan-list.o
all: speed stringspeed hsearchspeed
speed: speed.o hash.o $(CCAN_OBJS)
speed.o: speed.c ../htable.h ../htable.c
hash.o: ../../hash/hash.c
$(CC) $(CFLAGS) -c -o $@ $<
stringspeed: stringspeed.o hash.o $(CCAN_OBJS)
stringspeed.o: stringspeed.c ../htable.h ../htable.c
hsearchspeed: hsearchspeed.o $(CCAN_OBJS)
clean:
rm -f stringspeed speed hsearchspeed *.o
ccan-tal.o: $(CCANDIR)/ccan/tal/tal.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-tal-str.o: $(CCANDIR)/ccan/tal/str/str.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-take.o: $(CCANDIR)/ccan/take/take.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-tal-grab_file.o: $(CCANDIR)/ccan/tal/grab_file/grab_file.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-time.o: $(CCANDIR)/ccan/time/time.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-list.o: $(CCANDIR)/ccan/list/list.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-str.o: $(CCANDIR)/ccan/str/str.c
$(CC) $(CFLAGS) -c -o $@ $<
ccan-noerr.o: $(CCANDIR)/ccan/noerr/noerr.c
$(CC) $(CFLAGS) -c -o $@ $<

95
ccan/ccan/htable/tools/hsearchspeed.c

@ -0,0 +1,95 @@
/* Simple speed tests for a hash of strings using hsearch */
#include <ccan/htable/htable_type.h>
#include <ccan/htable/htable.c>
#include <ccan/tal/str/str.h>
#include <ccan/tal/grab_file/grab_file.h>
#include <ccan/tal/tal.h>
#include <ccan/hash/hash.h>
#include <ccan/time/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#include <search.h>
/* Nanoseconds per operation */
static size_t normalize(const struct timeabs *start,
const struct timeabs *stop,
unsigned int num)
{
return time_to_nsec(time_divide(time_between(*stop, *start), num));
}
int main(int argc, char *argv[])
{
size_t i, j, num;
struct timeabs start, stop;
char **w;
ENTRY *words, *misswords;
w = tal_strsplit(NULL, grab_file(NULL,
argv[1] ? argv[1] : "/usr/share/dict/words"), "\n", STR_NO_EMPTY);
num = tal_count(w) - 1;
printf("%zu words\n", num);
hcreate(num+num/3);
words = tal_arr(w, ENTRY, num);
for (i = 0; i < num; i++) {
words[i].key = w[i];
words[i].data = words[i].key;
}
/* Append and prepend last char for miss testing. */
misswords = tal_arr(w, ENTRY, num);
for (i = 0; i < num; i++) {
char lastc;
if (strlen(w[i]))
lastc = w[i][strlen(w[i])-1];
else
lastc = 'z';
misswords[i].key = tal_fmt(misswords, "%c%s%c%c",
lastc, w[i], lastc, lastc);
}
printf("#01: Initial insert: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
hsearch(words[i], ENTER);
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#02: Initial lookup (match): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
if (hsearch(words[i], FIND)->data != words[i].data)
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#03: Initial lookup (miss): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++) {
if (hsearch(misswords[i], FIND))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
/* Lookups in order are very cache-friendly for judy; try random */
printf("#04: Initial lookup (random): ");
fflush(stdout);
start = time_now();
for (i = 0, j = 0; i < num; i++, j = (j + 10007) % num)
if (hsearch(words[i], FIND)->data != words[i].data)
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
return 0;
}

370
ccan/ccan/htable/tools/speed.c

@ -0,0 +1,370 @@
/* Simple speed tests for hashtables. */
#include <ccan/htable/htable_type.h>
#include <ccan/htable/htable.c>
#include <ccan/hash/hash.h>
#include <ccan/time/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
static size_t hashcount;
struct object {
/* The key. */
unsigned int key;
/* Some contents. Doubles as consistency check. */
struct object *self;
};
static const unsigned int *objkey(const struct object *obj)
{
return &obj->key;
}
static size_t hash_obj(const unsigned int *key)
{
hashcount++;
return hashl(key, 1, 0);
}
static bool cmp(const struct object *object, const unsigned int *key)
{
return object->key == *key;
}
HTABLE_DEFINE_TYPE(struct object, objkey, hash_obj, cmp, htable_obj);
static unsigned int popcount(unsigned long val)
{
#if HAVE_BUILTIN_POPCOUNTL
return __builtin_popcountl(val);
#else
/* Generic parallel bit count: fold adjacent 1-, 2-, 4-, ... bit groups. */
if (sizeof(long) == sizeof(uint64_t)) {
uint64_t v = val;
v = (v & 0x5555555555555555ULL)
+ ((v >> 1) & 0x5555555555555555ULL);
v = (v & 0x3333333333333333ULL)
+ ((v >> 2) & 0x3333333333333333ULL);
v = (v & 0x0F0F0F0F0F0F0F0FULL)
+ ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL);
v = (v & 0x00FF00FF00FF00FFULL)
+ ((v >> 8) & 0x00FF00FF00FF00FFULL);
v = (v & 0x0000FFFF0000FFFFULL)
+ ((v >> 16) & 0x0000FFFF0000FFFFULL);
v = (v & 0x00000000FFFFFFFFULL)
+ ((v >> 32) & 0x00000000FFFFFFFFULL);
return v;
}
val = (val & 0x55555555ULL) + ((val >> 1) & 0x55555555ULL);
val = (val & 0x33333333ULL) + ((val >> 2) & 0x33333333ULL);
val = (val & 0x0F0F0F0FULL) + ((val >> 4) & 0x0F0F0F0FULL);
val = (val & 0x00FF00FFULL) + ((val >> 8) & 0x00FF00FFULL);
val = (val & 0x0000FFFFULL) + ((val >> 16) & 0x0000FFFFULL);
return val;
#endif
}
static size_t perfect(const struct htable *ht)
{
size_t i, placed_perfect = 0;
for (i = 0; i < ((size_t)1 << ht->bits); i++) {
if (!entry_is_valid(ht->table[i]))
continue;
if (hash_bucket(ht, ht->rehash(get_raw_ptr(ht, ht->table[i]),
ht->priv)) == i) {
assert((ht->table[i] & ht->perfect_bit)
== ht->perfect_bit);
placed_perfect++;
}
}
return placed_perfect;
}
static size_t count_deleted(const struct htable *ht)
{
size_t i, delete_markers = 0;
for (i = 0; i < ((size_t)1 << ht->bits); i++) {
if (ht->table[i] == HTABLE_DELETED)
delete_markers++;
}
return delete_markers;
}
/* Nanoseconds per operation */
static size_t normalize(const struct timeabs *start,
const struct timeabs *stop,
unsigned int num)
{
return time_to_nsec(time_divide(time_between(*stop, *start), num));
}
static size_t worst_run(struct htable *ht, size_t *deleted)
{
size_t longest = 0, len = 0, this_del = 0, i;
*deleted = 0;
/* This doesn't take into account end-wrap, but gives an idea. */
for (i = 0; i < ((size_t)1 << ht->bits); i++) {
if (ht->table[i]) {
len++;
if (ht->table[i] == HTABLE_DELETED)
this_del++;
} else {
if (len > longest) {
longest = len;
*deleted = this_del;
}
len = 0;
this_del = 0;
}
}
return longest;
}
int main(int argc, char *argv[])
{
struct object *objs;
unsigned int i, j;
size_t num, deleted;
struct timeabs start, stop;
struct htable_obj ht;
bool make_dumb = false;
if (argv[1] && strcmp(argv[1], "--dumb") == 0) {
argv++;
make_dumb = true;
}
num = argv[1] ? atoi(argv[1]) : 1000000;
objs = calloc(num, sizeof(objs[0]));
for (i = 0; i < num; i++) {
objs[i].key = i;
objs[i].self = &objs[i];
}
htable_obj_init(&ht);
printf("Initial insert: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
htable_obj_add(&ht, objs[i].self);
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Details: hash size %u, mask bits %u, perfect %.0f%%\n",
1U << ht.raw.bits, popcount(ht.raw.common_mask),
perfect(&ht.raw) * 100.0 / ht.raw.elems);
if (make_dumb) {
/* Screw with mask, to hobble us. */
update_common(&ht.raw, (void *)~ht.raw.common_bits);
printf("Details: DUMB MODE: mask bits %u\n",
popcount(ht.raw.common_mask));
}
printf("Initial lookup (match): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
if (htable_obj_get(&ht, &i)->self != objs[i].self)
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Initial lookup (miss): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++) {
unsigned int n = i + num;
if (htable_obj_get(&ht, &n))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
/* Lookups in order are very cache-friendly for judy; try random */
printf("Initial lookup (random): ");
fflush(stdout);
start = time_now();
for (i = 0, j = 0; i < num; i++, j = (j + 10007) % num)
if (htable_obj_get(&ht, &j)->self != &objs[j])
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
hashcount = 0;
printf("Initial delete all: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
if (!htable_obj_del(&ht, objs[i].self))
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Details: rehashes %zu\n", hashcount);
printf("Initial re-inserting: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
htable_obj_add(&ht, objs[i].self);
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
hashcount = 0;
printf("Deleting first half: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i+=2)
if (!htable_obj_del(&ht, objs[i].self))
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Details: rehashes %zu, delete markers %zu\n",
hashcount, count_deleted(&ht.raw));
printf("Adding (a different) half: ");
fflush(stdout);
for (i = 0; i < num; i+=2)
objs[i].key = num+i;
start = time_now();
for (i = 0; i < num; i+=2)
htable_obj_add(&ht, objs[i].self);
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Details: delete markers %zu, perfect %.0f%%\n",
count_deleted(&ht.raw), perfect(&ht.raw) * 100.0 / ht.raw.elems);
printf("Lookup after half-change (match): ");
fflush(stdout);
start = time_now();
for (i = 1; i < num; i+=2)
if (htable_obj_get(&ht, &i)->self != objs[i].self)
abort();
for (i = 0; i < num; i+=2) {
unsigned int n = i + num;
if (htable_obj_get(&ht, &n)->self != objs[i].self)
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Lookup after half-change (miss): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++) {
unsigned int n = i + num * 2;
if (htable_obj_get(&ht, &n))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
/* Hashtables with delete markers can fill with markers over time,
* so do some changes to see how it operates in the long term. */
for (i = 0; i < 5; i++) {
if (i == 0) {
/* We don't measure this: jmap is different. */
printf("Details: initial churn\n");
} else {
printf("Churning %s time: ",
i == 1 ? "second"
: i == 2 ? "third"
: i == 3 ? "fourth"
: "fifth");
fflush(stdout);
}
start = time_now();
for (j = 0; j < num; j++) {
if (!htable_obj_del(&ht, &objs[j]))
abort();
objs[j].key = num*i+j;
if (!htable_obj_add(&ht, &objs[j]))
abort();
}
stop = time_now();
if (i != 0)
printf(" %zu ns\n", normalize(&start, &stop, num));
}
/* Spread out the keys more to try to make it harder. */
printf("Details: reinserting with spread\n");
for (i = 0; i < num; i++) {
if (!htable_obj_del(&ht, objs[i].self))
abort();
objs[i].key = num * 5 + i * 9;
if (!htable_obj_add(&ht, objs[i].self))
abort();
}
printf("Details: delete markers %zu, perfect %.0f%%\n",
count_deleted(&ht.raw), perfect(&ht.raw) * 100.0 / ht.raw.elems);
i = worst_run(&ht.raw, &deleted);
printf("Details: worst run %u (%zu deleted)\n", i, deleted);
printf("Lookup after churn & spread (match): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++) {
unsigned int n = num * 5 + i * 9;
if (htable_obj_get(&ht, &n)->self != objs[i].self)
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Lookup after churn & spread (miss): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++) {
unsigned int n = num * (5 + 9) + i * 9;
if (htable_obj_get(&ht, &n))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Lookup after churn & spread (random): ");
fflush(stdout);
start = time_now();
for (i = 0, j = 0; i < num; i++, j = (j + 10007) % num) {
unsigned int n = num * 5 + j * 9;
if (htable_obj_get(&ht, &n)->self != &objs[j])
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
hashcount = 0;
printf("Deleting half after churn & spread: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i+=2)
if (!htable_obj_del(&ht, objs[i].self))
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Adding (a different) half after churn & spread: ");
fflush(stdout);
for (i = 0; i < num; i+=2)
objs[i].key = num*6+i*9;
start = time_now();
for (i = 0; i < num; i+=2)
htable_obj_add(&ht, objs[i].self);
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Details: delete markers %zu, perfect %.0f%%\n",
count_deleted(&ht.raw), perfect(&ht.raw) * 100.0 / ht.raw.elems);
return 0;
}

240
ccan/ccan/htable/tools/stringspeed.c

@ -0,0 +1,240 @@
/* Simple speed tests for a hash of strings. */
#include <ccan/htable/htable_type.h>
#include <ccan/htable/htable.c>
#include <ccan/tal/str/str.h>
#include <ccan/tal/grab_file/grab_file.h>
#include <ccan/tal/tal.h>
#include <ccan/hash/hash.h>
#include <ccan/time/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
static size_t hashcount;
static const char *strkey(const char *str)
{
return str;
}
static size_t hash_str(const char *key)
{
hashcount++;
return hash(key, strlen(key), 0);
}
static bool cmp(const char *obj, const char *key)
{
return strcmp(obj, key) == 0;
}
HTABLE_DEFINE_TYPE(char, strkey, hash_str, cmp, htable_str);
/* Nanoseconds per operation */
static size_t normalize(const struct timeabs *start,
const struct timeabs *stop,
unsigned int num)
{
return time_to_nsec(time_divide(time_between(*stop, *start), num));
}
int main(int argc, char *argv[])
{
size_t i, j, num;
struct timeabs start, stop;
struct htable_str ht;
char **words, **misswords;
words = tal_strsplit(NULL, grab_file(NULL,
argv[1] ? argv[1] : "/usr/share/dict/words"), "\n",
STR_NO_EMPTY);
htable_str_init(&ht);
num = tal_count(words) - 1;
/* Note that on my system, num is just > 98304, where we double! */
printf("%zu words\n", num);
/* Append and prepend last char for miss testing. */
misswords = tal_arr(words, char *, num);
for (i = 0; i < num; i++) {
char lastc;
if (strlen(words[i]))
lastc = words[i][strlen(words[i])-1];
else
lastc = 'z';
misswords[i] = tal_fmt(misswords, "%c%s%c%c",
lastc, words[i], lastc, lastc);
}
printf("#01: Initial insert: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
htable_str_add(&ht, words[i]);
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("Bytes allocated: %zu\n",
sizeof(ht.raw.table[0]) << ht.raw.bits);
printf("#02: Initial lookup (match): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
if (htable_str_get(&ht, words[i]) != words[i])
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#03: Initial lookup (miss): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++) {
if (htable_str_get(&ht, misswords[i]))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
/* Lookups in order are very cache-friendly for judy; try random */
printf("#04: Initial lookup (random): ");
fflush(stdout);
start = time_now();
for (i = 0, j = 0; i < num; i++, j = (j + 10007) % num)
if (htable_str_get(&ht, words[j]) != words[j])
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
hashcount = 0;
printf("#05: Initial delete all: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
if (!htable_str_del(&ht, words[i]))
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#06: Initial re-inserting: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
htable_str_add(&ht, words[i]);
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
hashcount = 0;
printf("#07: Deleting first half: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i+=2)
if (!htable_str_del(&ht, words[i]))
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#08: Adding (a different) half: ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i+=2)
htable_str_add(&ht, misswords[i]);
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#09: Lookup after half-change (match): ");
fflush(stdout);
start = time_now();
for (i = 1; i < num; i+=2)
if (htable_str_get(&ht, words[i]) != words[i])
abort();
for (i = 0; i < num; i+=2) {
if (htable_str_get(&ht, misswords[i]) != misswords[i])
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#10: Lookup after half-change (miss): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i+=2)
if (htable_str_get(&ht, words[i]))
abort();
for (i = 1; i < num; i+=2) {
if (htable_str_get(&ht, misswords[i]))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
/* Hashtables with delete markers can fill with markers over time,
* so do some changes to see how it operates in the long term. */
printf("#11: Churn 1: ");
start = time_now();
for (j = 0; j < num; j+=2) {
if (!htable_str_del(&ht, misswords[j]))
abort();
if (!htable_str_add(&ht, words[j]))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#12: Churn 2: ");
start = time_now();
for (j = 1; j < num; j+=2) {
if (!htable_str_del(&ht, words[j]))
abort();
if (!htable_str_add(&ht, misswords[j]))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#13: Churn 3: ");
start = time_now();
for (j = 1; j < num; j+=2) {
if (!htable_str_del(&ht, misswords[j]))
abort();
if (!htable_str_add(&ht, words[j]))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
/* Now it's back to normal... */
printf("#14: Post-Churn lookup (match): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++)
if (htable_str_get(&ht, words[i]) != words[i])
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
printf("#15: Post-Churn lookup (miss): ");
fflush(stdout);
start = time_now();
for (i = 0; i < num; i++) {
if (htable_str_get(&ht, misswords[i]))
abort();
}
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
/* Lookups in order are very cache-friendly for judy; try random */
printf("#16: Post-Churn lookup (random): ");
fflush(stdout);
start = time_now();
for (i = 0, j = 0; i < num; i++, j = (j + 10007) % num)
if (htable_str_get(&ht, words[j]) != words[j])
abort();
stop = time_now();
printf(" %zu ns\n", normalize(&start, &stop, num));
return 0;
}

1
ccan/ccan/order/LICENSE

@ -0,0 +1 @@
../../licenses/CC0

33
ccan/ccan/order/_info

@ -0,0 +1,33 @@
#include "config.h"
#include <stdio.h>
#include <string.h>
/**
* order - Simple, common value comparison functions
*
* This implements a number of commonly useful comparison functions in
* a form which can be used with qsort() and bsearch() in the standard
* library, or asort() and asearch() in ccan amongst other places.
*
* License: CC0
* Author: David Gibson <david@gibson.dropbear.id.au>
*/
int main(int argc, char *argv[])
{
/* Expect exactly one argument */
if (argc != 2)
return 1;
if (strcmp(argv[1], "depends") == 0) {
printf("ccan/typesafe_cb\n");
printf("ccan/ptrint\n");
return 0;
}
if (strcmp(argv[1], "testdepends") == 0) {
printf("ccan/array_size\n");
printf("ccan/asort\n");
return 0;
}
return 1;
}

70
ccan/ccan/order/order.c

@ -0,0 +1,70 @@
/* CC0 license (public domain) - see LICENSE file for details */
#include <ccan/order/order.h>
#define SCALAR_ORDER(_oname, _type) \
int _order_##_oname(const void *a, \
const void *b, \
void *ctx) \
{ \
ptrdiff_t offset = ptr2int(ctx); \
const _type *aa = (const _type *)((char *)a + offset); \
const _type *bb = (const _type *)((char *)b + offset); \
\
if (*aa < *bb) { \
return -1; \
} else if (*aa > *bb) { \
return 1; \
} else { \
assert(*aa == *bb); \
return 0; \
} \
} \
int order_##_oname(const _type *a, \
const _type *b, \
void *ctx) \
{ \
return _order_##_oname(a, b, int2ptr(0)); \
} \
int _order_##_oname##_reverse(const void *a, \
const void *b, \
void *ctx) \
{ \
return -_order_##_oname(a, b, ctx); \
} \
int order_##_oname##_reverse(const _type *a, \
const _type *b, \
void *ctx) \
{ \
return _order_##_oname##_reverse(a, b, int2ptr(0)); \
} \
int order_##_oname##_noctx(const void *a, \
const void *b) \
{ \
return _order_##_oname(a, b, int2ptr(0)); \
} \
int order_##_oname##_reverse_noctx(const void *a, \
const void *b) \
{ \
return _order_##_oname##_reverse(a, b, int2ptr(0)); \
}
SCALAR_ORDER(s8, int8_t)
SCALAR_ORDER(s16, int16_t)
SCALAR_ORDER(s32, int32_t)
SCALAR_ORDER(s64, int64_t)
SCALAR_ORDER(u8, uint8_t)
SCALAR_ORDER(u16, uint16_t)
SCALAR_ORDER(u32, uint32_t)
SCALAR_ORDER(u64, uint64_t)
SCALAR_ORDER(int, int)
SCALAR_ORDER(uint, unsigned int)
SCALAR_ORDER(long, long)
SCALAR_ORDER(ulong, unsigned long)
SCALAR_ORDER(size, size_t)
SCALAR_ORDER(ptrdiff, ptrdiff_t)
SCALAR_ORDER(float, float)
SCALAR_ORDER(double, double)
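Each SCALAR_ORDER instantiation above emits both context-taking comparators and _noctx variants; the latter slot straight into qsort() and bsearch(). A minimal sketch (not part of order.c):

    #include <stdio.h>
    #include <stdlib.h>
    #include <ccan/order/order.h>

    int main(void)
    {
            int vals[] = { 42, -7, 0, 13 };

            /* Ascending sort using the generated no-context comparator. */
            qsort(vals, 4, sizeof(vals[0]), order_int_noctx);
            printf("%d %d %d %d\n", vals[0], vals[1], vals[2], vals[3]);
            return 0;
    }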

73
ccan/ccan/order/order.h

@ -0,0 +1,73 @@
/* CC0 license (public domain) - see LICENSE file for details */
#ifndef CCAN_ORDER_H
#define CCAN_ORDER_H
#include <stdint.h>
#include <assert.h>
#include <ccan/typesafe_cb/typesafe_cb.h>
#include <ccan/ptrint/ptrint.h>
typedef int (*_total_order_cb)(const void *, const void *, void *);
typedef int (*total_order_noctx_cb)(const void *, const void *);
#define total_order_cb(_name, _item, _ctx) \
int (*_name)(const __typeof__(_item) *, \
const __typeof__(_item) *, \
__typeof__(_ctx))
#define total_order_cast(cmp, item, ctx) \
typesafe_cb_cast(_total_order_cb, total_order_cb(, item, ctx), \
(cmp))
struct _total_order {
_total_order_cb cb;
void *ctx;
};
#define total_order(_name, _item, _ctx) \
struct { \
total_order_cb(cb, _item, _ctx); \
_ctx ctx; \
} _name
#define _DECL_ONAME(_oname, _itype) \
extern int _order_##_oname(const void *, const void *, void *); \
extern int order_##_oname(const _itype *, const _itype *, void *); \
extern int order_##_oname##_noctx(const void *, const void *);
#define _DECL_ONAME_BIDIR(_oname, _itype) \
_DECL_ONAME(_oname, _itype) \
_DECL_ONAME(_oname##_reverse, _itype)
_DECL_ONAME_BIDIR(s8, int8_t)
_DECL_ONAME_BIDIR(s16, int16_t)
_DECL_ONAME_BIDIR(s32, int32_t)
_DECL_ONAME_BIDIR(s64, int64_t)
_DECL_ONAME_BIDIR(u8, uint8_t)
_DECL_ONAME_BIDIR(u16, uint16_t)
_DECL_ONAME_BIDIR(u32, uint32_t)
_DECL_ONAME_BIDIR(u64, uint64_t)
_DECL_ONAME_BIDIR(int, int)
_DECL_ONAME_BIDIR(uint, unsigned int)
_DECL_ONAME_BIDIR(long, long)
_DECL_ONAME_BIDIR(ulong, unsigned long)
_DECL_ONAME_BIDIR(size, size_t)
_DECL_ONAME_BIDIR(ptrdiff, ptrdiff_t)
_DECL_ONAME_BIDIR(float, float)
_DECL_ONAME_BIDIR(double, double)
#undef _DECL_ONAME
#undef _DECL_ONAME_BIDIR
#define total_order_by_field(_name, _oname, _itype, _field) \
total_order(_name, _itype, ptrint_t *) = { \
(total_order_cb(, _itype, \
ptrint_t *))(_order_##_oname), \
int2ptr(offsetof(_itype, _field)), \
}
#endif /* CCAN_ORDER_H */
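total_order_by_field builds a {cb, ctx} pair whose context is the field offset encoded as a ptrint_t pointer, so a struct array can be sorted by one member without writing a comparator. A hedged sketch (struct employee and the wrapper function are assumptions), mirroring what test/api.c does with order.cb and order.ctx:

    #include <ccan/order/order.h>
    #include <ccan/asort/asort.h>

    struct employee {
            const char *name;
            int64_t salary;
    };

    static void sort_by_salary(struct employee *staff, size_t n)
    {
            /* Compare the int64_t 'salary' member of each element. */
            total_order_by_field(by_salary, s64, struct employee, salary);

            asort(staff, n, by_salary.cb, by_salary.ctx);
    }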

138
ccan/ccan/order/test/api.c

@ -0,0 +1,138 @@
#include "config.h"
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <float.h>
#include <math.h>
#include <ccan/array_size/array_size.h>
#include <ccan/order/order.h>
#include <ccan/tap/tap.h>
#include <ccan/asort/asort.h>
#define QSORT_SCALAR(t, oname, ...) \
{ \
t arr0[] = { __VA_ARGS__ }; \
const int num = ARRAY_SIZE(arr0); \
t arr1[num], arr2[num]; \
int i; \
\
/* Initialize arr1 in reverse order */ \
for (i = 0; i < num; i++) \
arr1[i] = arr0[num-i-1]; \
\
memcpy(arr2, arr1, sizeof(arr1)); \
qsort(arr2, num, sizeof(t), order_##oname##_noctx); \
ok(memcmp(arr2, arr0, sizeof(arr0)) == 0, \
"qsort order_%s_noctx", #oname); \
\
qsort(arr2, num, sizeof(t), order_##oname##_reverse_noctx); \
ok(memcmp(arr2, arr1, sizeof(arr1)) == 0, \
"qsort order_%s_reverse_noctx", #oname); \
}
#define ASORT_SCALAR(t, oname, ...) \
{ \
t arr0[] = { __VA_ARGS__ }; \
const int num = ARRAY_SIZE(arr0); \
t arr1[num], arr2[num]; \
int i; \
\
/* Initialize arr1 in reverse order */ \
for (i = 0; i < num; i++) \
arr1[i] = arr0[num-i-1]; \
\
memcpy(arr2, arr1, sizeof(arr1)); \
asort(arr2, num, order_##oname, NULL); \
ok(memcmp(arr2, arr0, sizeof(arr0)) == 0, \
"asort order_%s", #oname); \
\
asort(arr2, num, order_##oname##_reverse, NULL); \
ok(memcmp(arr2, arr1, sizeof(arr1)) == 0, \
"asort order_%s_reverse", #oname); \
}
#define ASORT_STRUCT_BY_SCALAR(t, oname, ...) \
{ \
t arrbase[] = { __VA_ARGS__ }; \
struct tstruct { \
char dummy0[5]; \
t val; \
long dummy1; \
}; \
const int num = ARRAY_SIZE(arrbase); \
struct tstruct arr0[num], arr1[num], arr2[num]; \
int i; \
total_order_by_field(order, oname, struct tstruct, val); \
total_order_by_field(rorder, oname##_reverse, \
struct tstruct, val); \
\
/* Set up dummy structures */ \
memset(arr0, 0, sizeof(arr0)); \
for (i = 0; i < num; i++) { \
arr0[i].dummy1 = i; \
strcpy(arr0[i].dummy0, "abc"); \
arr0[i].val = arrbase[i]; \
} \
\
/* Initialize arr1 in reverse order */ \
for (i = 0; i < num; i++) \
arr1[i] = arr0[num-i-1]; \
\
memcpy(arr2, arr1, sizeof(arr1)); \
asort(arr2, num, order.cb, order.ctx); \
ok(memcmp(arr2, arr0, sizeof(arr0)) == 0, \
"asort by field %s", #oname); \
\
asort(arr2, num, rorder.cb, rorder.ctx); \
ok(memcmp(arr2, arr1, sizeof(arr1)) == 0, \
"asort by field %s_reverse", #oname); \
}
#define TEST_SCALAR(t, oname, ...) \
{ \
QSORT_SCALAR(t, oname, __VA_ARGS__); \
ASORT_SCALAR(t, oname, __VA_ARGS__); \
ASORT_STRUCT_BY_SCALAR(t, oname, __VA_ARGS__); \
}
int main(void)
{
/* This is how many tests you plan to run */
plan_tests(84);
TEST_SCALAR(int8_t, s8, -128, -4, 0, 1, 2, 88, 126, 127);
TEST_SCALAR(int16_t, s16, -32768, -4, 0, 1, 2, 88, 126, 32767);
TEST_SCALAR(int32_t, s32, -2000000000, -4, 0, 1, 2, 88, 126,
2000000000);
TEST_SCALAR(int64_t, s64, -999999999999999999LL, -2000000000, -4, 0,
1, 2, 88, 126, 2000000000, 999999999999999999LL);
TEST_SCALAR(uint8_t, u8, 0, 1, 2, 88, 126, 127, -10, -1);
TEST_SCALAR(uint16_t, u16, 0, 1, 2, 88, 126, 32767, -10, -1);
TEST_SCALAR(uint32_t, u32, 0, 1, 2, 88, 126, 2000000000, -10, -1);
TEST_SCALAR(uint64_t, u64, 0, 1, 2, 88, 126, 2000000000,
999999999999999999LL, -10, -1);
TEST_SCALAR(int, int, INT_MIN, -10, -1, 0, 1, 10, INT_MAX);
TEST_SCALAR(unsigned, uint, 0, 1, 10, INT_MAX, (unsigned)INT_MAX+1,
-10, -1);
TEST_SCALAR(long, long, LONG_MIN, INT_MIN, -10, -1, 0, 1, 10, INT_MAX,
LONG_MAX);
TEST_SCALAR(unsigned long, ulong, 0, 1, 10, INT_MAX,
(unsigned long)INT_MAX+1, LONG_MAX,
(unsigned long)LONG_MAX+1, -10, -1);
TEST_SCALAR(float, float, -INFINITY, -FLT_MAX, -1.0, 0.0, FLT_MIN,
0.1, M_E, M_PI, 5.79, FLT_MAX, INFINITY);
TEST_SCALAR(double, double, -INFINITY, -DBL_MAX, -FLT_MAX, -1.0, 0.0,
DBL_MIN, FLT_MIN, 0.1, M_E, M_PI, 5.79, FLT_MAX, DBL_MAX,
INFINITY);
/* This exits depending on whether all tests passed */
return exit_status();
}

24
ccan/ccan/order/test/compile_fail_1.c

@ -0,0 +1,24 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ccan/order/order.h>
#include "fancy_cmp.h"
#ifdef FAIL
typedef int item_t;
#else
typedef struct item item_t;
#endif
int main(int argc, char *argv[])
{
total_order_cb(cb0, struct item, struct cmp_info *) = fancy_cmp;
_total_order_cb cb1 = total_order_cast(fancy_cmp,
item_t, struct cmp_info *);
printf("%p %p\n", cb0, cb1);
exit(0);
}

25
ccan/ccan/order/test/compile_fail_2.c

@ -0,0 +1,25 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ccan/order/order.h>
#include "fancy_cmp.h"
#ifdef FAIL
typedef int ctx_t;
#else
typedef struct cmp_info ctx_t;
#endif
int main(int argc, char *argv[])
{
total_order_cb(cb0, struct item, struct cmp_info *) = fancy_cmp;
_total_order_cb cb1 = total_order_cast(fancy_cmp, struct item,
ctx_t *);
printf("%p %p\n", cb0, cb1);
exit(0);
}

19
ccan/ccan/order/test/compile_ok.c

@ -0,0 +1,19 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ccan/order/order.h>
#include "fancy_cmp.h"
int main(int argc, char *argv[])
{
total_order_cb(cb0, struct item, struct cmp_info *) = fancy_cmp;
_total_order_cb cb1 = total_order_cast(fancy_cmp,
struct item, struct cmp_info *);
total_order_noctx_cb cb_noctx = fancy_cmp_noctx;
printf("%p %p %p\n", cb0, cb1, cb_noctx);
exit(0);
}

47
ccan/ccan/order/test/fancy_cmp.h

@ -0,0 +1,47 @@
#ifndef _FANCY_CMP_H
#define _FANCY_CMP_H
struct cmp_info {
unsigned xcode;
int offset;
};
struct item {
unsigned value;
char *str;
};
static inline int fancy_cmp(const struct item *a, const struct item *b,
struct cmp_info *ctx)
{
unsigned vala = a->value ^ ctx->xcode;
unsigned valb = b->value ^ ctx->xcode;
const char *stra, *strb;
if (vala < valb)
return -1;
else if (valb < vala)
return 1;
stra = a->str + ctx->offset;
strb = b->str + ctx->offset;
return strcmp(stra, strb);
}
static inline int fancy_cmp_noctx(const void *av, const void *bv)
{
const struct item *a = (const struct item *)av;
const struct item *b = (const struct item *)bv;
struct cmp_info ctx_default = {
.xcode = 0x1234,
.offset = 3,
};
total_order(default_order, struct item, struct cmp_info *) = {
fancy_cmp, &ctx_default,
};
return default_order.cb(a, b, default_order.ctx);
}
#endif /* _FANCY_CMP_H */

1
ccan/ccan/ptrint/LICENSE

@ -0,0 +1 @@
../../licenses/CC0

59
ccan/ccan/ptrint/_info

@ -0,0 +1,59 @@
#include "config.h"
#include <stdio.h>
#include <string.h>
/**
* ptrint - Encoding integers in pointer values
*
* Library (standard or ccan) functions which take user supplied
* callbacks usually have the callback supplied with a void * context
* pointer. For simple cases, it's sometimes sufficient to pass a
* simple integer cast into a void *, rather than having to allocate a
* context structure. This module provides some helper macros to do
* this relatively safely and portably.
*
* The key characteristics of these functions are:
* ptr2int(int2ptr(val)) == val
* and
* !int2ptr(val) == !val
* (i.e. the transformation preserves truth value).
*
* Example:
* #include <ccan/ptrint/ptrint.h>
*
* static void callback(void *opaque)
* {
* int val = ptr2int(opaque);
* printf("Value is %d\n", val);
* }
*
* void (*cb)(void *opaque) = callback;
*
* int main(int argc, char *argv[])
* {
* int val = 17;
*
* (*cb)(int2ptr(val));
* exit(0);
* }
*
* License: CC0 (Public domain)
* Author: David Gibson <david@gibson.dropbear.id.au>
*/
int main(int argc, char *argv[])
{
/* Expect exactly one argument */
if (argc != 2)
return 1;
if (strcmp(argv[1], "depends") == 0) {
printf("ccan/build_assert\n");
return 0;
}
if (strcmp(argv[1], "testdepends") == 0) {
printf("ccan/array_size\n");
return 0;
}
return 1;
}

34
ccan/ccan/ptrint/ptrint.h

@ -0,0 +1,34 @@
/* CC0 (Public domain) - see LICENSE file for details */
#ifndef CCAN_PTRINT_H
#define CCAN_PTRINT_H
#include "config.h"
#include <stddef.h>
#include <ccan/build_assert/build_assert.h>
/*
* This is a deliberately incomplete type, because it should never be
* dereferenced - instead it marks pointer values which are actually
* encoding integers
*/
typedef struct ptrint ptrint_t;
static inline ptrdiff_t ptr2int(const ptrint_t *p)
{
/*
* ptrdiff_t is the right size by definition, but to avoid
* surprises we want a warning if the user can't fit at least
* a regular int in there
*/
BUILD_ASSERT(sizeof(int) <= sizeof(ptrdiff_t));
return (const char *)p - (const char *)NULL;
}
static inline ptrint_t *int2ptr(ptrdiff_t i)
{
return (ptrint_t *)((char *)NULL + i);
}
#endif /* CCAN_PTRINT_H */
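
An illustrative sketch (not part of the import; struct rec and cmp_rec_by_offset are made-up names) of the pattern order.c itself relies on: smuggling a field offset through a void * context with int2ptr()/ptr2int().

#include <stddef.h>
#include <ccan/ptrint/ptrint.h>

struct rec {
	char pad[4];
	int key;
};

static int cmp_rec_by_offset(const void *a, const void *b, void *ctx)
{
	ptrdiff_t off = ptr2int(ctx);	/* recover the encoded offset */
	int ka = *(const int *)((const char *)a + off);
	int kb = *(const int *)((const char *)b + off);

	return (ka > kb) - (ka < kb);
}

/* Callers pass int2ptr(offsetof(struct rec, key)) as ctx. */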

29
ccan/ccan/ptrint/test/run.c

@ -0,0 +1,29 @@
#include <limits.h>
#include <ccan/array_size/array_size.h>
#include <ccan/ptrint/ptrint.h>
#include <ccan/tap/tap.h>
static ptrdiff_t testvals[] = {
-INT_MAX, -1, 0, 1, 2, 17, INT_MAX,
};
int main(void)
{
int i;
/* This is how many tests you plan to run */
plan_tests(2 * ARRAY_SIZE(testvals));
for (i = 0; i < ARRAY_SIZE(testvals); i++) {
ptrdiff_t val = testvals[i];
void *ptr = int2ptr(val);
ok1(ptr2int(ptr) == val);
ok1(!val == !ptr);
}
/* This exits depending on whether all tests passed */
return exit_status();
}

26
ccan/ccan/tal/benchmark/Makefile

@ -0,0 +1,26 @@
CFLAGS=-O3 -Wall -flto -I../../..
#CFLAGS=-O3 -Wall -I../../..
#CFLAGS=-g -Wall -I../../..
LDFLAGS=-O3 -flto
LDLIBS=-lrt
all: speed samba-allocs
speed: speed.o tal.o talloc.o time.o list.o take.o str.o
samba-allocs: samba-allocs.o tal.o talloc.o time.o list.o take.o
tal.o: ../tal.c
$(CC) $(CFLAGS) -c -o $@ $<
str.o: ../str/str.c
$(CC) $(CFLAGS) -c -o $@ $<
talloc.o: ../../talloc/talloc.c
$(CC) $(CFLAGS) -c -o $@ $<
time.o: ../../time/time.c
$(CC) $(CFLAGS) -c -o $@ $<
list.o: ../../list/list.c
$(CC) $(CFLAGS) -c -o $@ $<
take.o: ../../take/take.c
$(CC) $(CFLAGS) -c -o $@ $<
clean:
rm -f speed samba-allocs *.o