|
@ -36,6 +36,22 @@ __constant uint2 const Keccak_f1600_RC[24] = { |
|
|
(uint2)(0x80008008, 0x80000000), |
|
|
(uint2)(0x80008008, 0x80000000), |
|
|
}; |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
static uint2 ROL2(const uint2 v, const int n) |
|
|
|
|
|
{ |
|
|
|
|
|
uint2 result; |
|
|
|
|
|
if (n <= 32) |
|
|
|
|
|
{ |
|
|
|
|
|
result.y = ((v.y << (n)) | (v.x >> (32 - n))); |
|
|
|
|
|
result.x = ((v.x << (n)) | (v.y >> (32 - n))); |
|
|
|
|
|
} |
|
|
|
|
|
else |
|
|
|
|
|
{ |
|
|
|
|
|
result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); |
|
|
|
|
|
result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); |
|
|
|
|
|
} |
|
|
|
|
|
return result; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
static void keccak_f1600_round(uint2* a, uint r, uint out_size) |
|
|
static void keccak_f1600_round(uint2* a, uint r, uint out_size) |
|
|
{ |
|
|
{ |
|
|
#if !__ENDIAN_LITTLE__ |
|
|
#if !__ENDIAN_LITTLE__ |
|
@ -43,104 +59,113 @@ static void keccak_f1600_round(uint2* a, uint r, uint out_size) |
|
|
a[i] = a[i].yx; |
|
|
a[i] = a[i].yx; |
|
|
#endif |
|
|
#endif |
|
|
|
|
|
|
|
|
uint2 b[25]; |
|
|
uint2 t[5]; |
|
|
uint2 t; |
|
|
uint2 u, v; |
|
|
|
|
|
|
|
|
// Theta |
|
|
// Theta |
|
|
b[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]; |
|
|
t[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]; |
|
|
b[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]; |
|
|
t[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]; |
|
|
b[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]; |
|
|
t[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]; |
|
|
b[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]; |
|
|
t[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]; |
|
|
b[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]; |
|
|
t[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]; |
|
|
t = b[4] ^ (uint2)(b[1].x << 1 | b[1].y >> 31, b[1].y << 1 | b[1].x >> 31); |
|
|
u = t[4] ^ ROL2(t[1], 1); |
|
|
a[0] ^= t; |
|
|
a[0] ^= u; |
|
|
a[5] ^= t; |
|
|
a[5] ^= u; |
|
|
a[10] ^= t; |
|
|
a[10] ^= u; |
|
|
a[15] ^= t; |
|
|
a[15] ^= u; |
|
|
a[20] ^= t; |
|
|
a[20] ^= u; |
|
|
t = b[0] ^ (uint2)(b[2].x << 1 | b[2].y >> 31, b[2].y << 1 | b[2].x >> 31); |
|
|
u = t[0] ^ ROL2(t[2], 1); |
|
|
a[1] ^= t; |
|
|
a[1] ^= u; |
|
|
a[6] ^= t; |
|
|
a[6] ^= u; |
|
|
a[11] ^= t; |
|
|
a[11] ^= u; |
|
|
a[16] ^= t; |
|
|
a[16] ^= u; |
|
|
a[21] ^= t; |
|
|
a[21] ^= u; |
|
|
t = b[1] ^ (uint2)(b[3].x << 1 | b[3].y >> 31, b[3].y << 1 | b[3].x >> 31); |
|
|
u = t[1] ^ ROL2(t[3], 1); |
|
|
a[2] ^= t; |
|
|
a[2] ^= u; |
|
|
a[7] ^= t; |
|
|
a[7] ^= u; |
|
|
a[12] ^= t; |
|
|
a[12] ^= u; |
|
|
a[17] ^= t; |
|
|
a[17] ^= u; |
|
|
a[22] ^= t; |
|
|
a[22] ^= u; |
|
|
t = b[2] ^ (uint2)(b[4].x << 1 | b[4].y >> 31, b[4].y << 1 | b[4].x >> 31); |
|
|
u = t[2] ^ ROL2(t[4], 1); |
|
|
a[3] ^= t; |
|
|
a[3] ^= u; |
|
|
a[8] ^= t; |
|
|
a[8] ^= u; |
|
|
a[13] ^= t; |
|
|
a[13] ^= u; |
|
|
a[18] ^= t; |
|
|
a[18] ^= u; |
|
|
a[23] ^= t; |
|
|
a[23] ^= u; |
|
|
t = b[3] ^ (uint2)(b[0].x << 1 | b[0].y >> 31, b[0].y << 1 | b[0].x >> 31); |
|
|
u = t[3] ^ ROL2(t[0], 1); |
|
|
a[4] ^= t; |
|
|
a[4] ^= u; |
|
|
a[9] ^= t; |
|
|
a[9] ^= u; |
|
|
a[14] ^= t; |
|
|
a[14] ^= u; |
|
|
a[19] ^= t; |
|
|
a[19] ^= u; |
|
|
a[24] ^= t; |
|
|
a[24] ^= u; |
|
|
|
|
|
|
|
|
// Rho Pi |
|
|
// Rho Pi |
|
|
b[0] = a[0]; |
|
|
u = a[1]; |
|
|
b[10] = (uint2)(a[1].x << 1 | a[1].y >> 31, a[1].y << 1 | a[1].x >> 31); |
|
|
a[1] = ROL2(a[6], 44); |
|
|
b[7] = (uint2)(a[10].x << 3 | a[10].y >> 29, a[10].y << 3 | a[10].x >> 29); |
|
|
a[6] = ROL2(a[9], 20); |
|
|
b[11] = (uint2)(a[7].x << 6 | a[7].y >> 26, a[7].y << 6 | a[7].x >> 26); |
|
|
a[9] = ROL2(a[22], 61); |
|
|
b[17] = (uint2)(a[11].x << 10 | a[11].y >> 22, a[11].y << 10 | a[11].x >> 22); |
|
|
a[22] = ROL2(a[14], 39); |
|
|
b[18] = (uint2)(a[17].x << 15 | a[17].y >> 17, a[17].y << 15 | a[17].x >> 17); |
|
|
a[14] = ROL2(a[20], 18); |
|
|
b[3] = (uint2)(a[18].x << 21 | a[18].y >> 11, a[18].y << 21 | a[18].x >> 11); |
|
|
a[20] = ROL2(a[2], 62); |
|
|
b[5] = (uint2)(a[3].x << 28 | a[3].y >> 4, a[3].y << 28 | a[3].x >> 4); |
|
|
a[2] = ROL2(a[12], 43); |
|
|
b[16] = (uint2)(a[5].y << 4 | a[5].x >> 28, a[5].x << 4 | a[5].y >> 28); |
|
|
a[12] = ROL2(a[13], 25); |
|
|
b[8] = (uint2)(a[16].y << 13 | a[16].x >> 19, a[16].x << 13 | a[16].y >> 19); |
|
|
a[13] = ROL2(a[19], 8); |
|
|
b[21] = (uint2)(a[8].y << 23 | a[8].x >> 9, a[8].x << 23 | a[8].y >> 9); |
|
|
a[19] = ROL2(a[23], 56); |
|
|
b[24] = (uint2)(a[21].x << 2 | a[21].y >> 30, a[21].y << 2 | a[21].x >> 30); |
|
|
a[23] = ROL2(a[15], 41); |
|
|
b[4] = (uint2)(a[24].x << 14 | a[24].y >> 18, a[24].y << 14 | a[24].x >> 18); |
|
|
a[15] = ROL2(a[4], 27); |
|
|
b[15] = (uint2)(a[4].x << 27 | a[4].y >> 5, a[4].y << 27 | a[4].x >> 5); |
|
|
a[4] = ROL2(a[24], 14); |
|
|
b[23] = (uint2)(a[15].y << 9 | a[15].x >> 23, a[15].x << 9 | a[15].y >> 23); |
|
|
a[24] = ROL2(a[21], 2); |
|
|
b[19] = (uint2)(a[23].y << 24 | a[23].x >> 8, a[23].x << 24 | a[23].y >> 8); |
|
|
a[21] = ROL2(a[8], 55); |
|
|
b[13] = (uint2)(a[19].x << 8 | a[19].y >> 24, a[19].y << 8 | a[19].x >> 24); |
|
|
a[8] = ROL2(a[16], 45); |
|
|
b[12] = (uint2)(a[13].x << 25 | a[13].y >> 7, a[13].y << 25 | a[13].x >> 7); |
|
|
a[16] = ROL2(a[5], 36); |
|
|
b[2] = (uint2)(a[12].y << 11 | a[12].x >> 21, a[12].x << 11 | a[12].y >> 21); |
|
|
a[5] = ROL2(a[3], 28); |
|
|
b[20] = (uint2)(a[2].y << 30 | a[2].x >> 2, a[2].x << 30 | a[2].y >> 2); |
|
|
a[3] = ROL2(a[18], 21); |
|
|
b[14] = (uint2)(a[20].x << 18 | a[20].y >> 14, a[20].y << 18 | a[20].x >> 14); |
|
|
a[18] = ROL2(a[17], 15); |
|
|
b[22] = (uint2)(a[14].y << 7 | a[14].x >> 25, a[14].x << 7 | a[14].y >> 25); |
|
|
a[17] = ROL2(a[11], 10); |
|
|
b[9] = (uint2)(a[22].y << 29 | a[22].x >> 3, a[22].x << 29 | a[22].y >> 3); |
|
|
a[11] = ROL2(a[7], 6); |
|
|
b[6] = (uint2)(a[9].x << 20 | a[9].y >> 12, a[9].y << 20 | a[9].x >> 12); |
|
|
a[7] = ROL2(a[10], 3); |
|
|
b[1] = (uint2)(a[6].y << 12 | a[6].x >> 20, a[6].x << 12 | a[6].y >> 20); |
|
|
a[10] = ROL2(u, 1); |
|
|
|
|
|
|
|
|
// Chi |
|
|
// Chi |
|
|
a[0] = bitselect(b[0] ^ b[2], b[0], b[1]); |
|
|
u = a[0]; v = a[1]; |
|
|
|
|
|
a[0] = bitselect(a[0] ^ a[2], a[0], a[1]); |
|
|
|
|
|
|
|
|
if (out_size > 4) |
|
|
if (out_size > 4) |
|
|
{ |
|
|
{ |
|
|
a[1] = bitselect(b[1] ^ b[3], b[1], b[2]); |
|
|
a[1] = bitselect(a[1] ^ a[3], a[1], a[2]); |
|
|
a[2] = bitselect(b[2] ^ b[4], b[2], b[3]); |
|
|
a[2] = bitselect(a[2] ^ a[4], a[2], a[3]); |
|
|
a[3] = bitselect(b[3] ^ b[0], b[3], b[4]); |
|
|
a[3] = bitselect(a[3] ^ u, a[3], a[4]); |
|
|
a[4] = bitselect(b[4] ^ b[1], b[4], b[0]); |
|
|
a[4] = bitselect(a[4] ^ v, a[4], u); |
|
|
a[5] = bitselect(b[5] ^ b[7], b[5], b[6]); |
|
|
|
|
|
a[6] = bitselect(b[6] ^ b[8], b[6], b[7]); |
|
|
u = a[5]; v = a[6]; |
|
|
a[7] = bitselect(b[7] ^ b[9], b[7], b[8]); |
|
|
a[5] = bitselect(a[5] ^ a[7], a[5], a[6]); |
|
|
a[8] = bitselect(b[8] ^ b[5], b[8], b[9]); |
|
|
a[6] = bitselect(a[6] ^ a[8], a[6], a[7]); |
|
|
|
|
|
a[7] = bitselect(a[7] ^ a[9], a[7], a[8]); |
|
|
|
|
|
a[8] = bitselect(a[8] ^ u, a[8], a[9]); |
|
|
if (out_size > 8) |
|
|
if (out_size > 8) |
|
|
{ |
|
|
{ |
|
|
a[9] = bitselect(b[9] ^ b[6], b[9], b[5]); |
|
|
a[9] = bitselect(a[9] ^ v, a[9], u); |
|
|
a[10] = bitselect(b[10] ^ b[12], b[10], b[11]); |
|
|
|
|
|
a[11] = bitselect(b[11] ^ b[13], b[11], b[12]); |
|
|
u = a[10]; v = a[11]; |
|
|
a[12] = bitselect(b[12] ^ b[14], b[12], b[13]); |
|
|
a[10] = bitselect(a[10] ^ a[12], a[10], a[11]); |
|
|
a[13] = bitselect(b[13] ^ b[10], b[13], b[14]); |
|
|
a[11] = bitselect(a[11] ^ a[13], a[11], a[12]); |
|
|
a[14] = bitselect(b[14] ^ b[11], b[14], b[10]); |
|
|
a[12] = bitselect(a[12] ^ a[14], a[12], a[13]); |
|
|
a[15] = bitselect(b[15] ^ b[17], b[15], b[16]); |
|
|
a[13] = bitselect(a[13] ^ u, a[13], a[14]); |
|
|
a[16] = bitselect(b[16] ^ b[18], b[16], b[17]); |
|
|
a[14] = bitselect(a[14] ^ v, a[14], u); |
|
|
a[17] = bitselect(b[17] ^ b[19], b[17], b[18]); |
|
|
|
|
|
a[18] = bitselect(b[18] ^ b[15], b[18], b[19]); |
|
|
u = a[15]; v = a[16]; |
|
|
a[19] = bitselect(b[19] ^ b[16], b[19], b[15]); |
|
|
a[15] = bitselect(a[15] ^ a[17], a[15], a[16]); |
|
|
a[20] = bitselect(b[20] ^ b[22], b[20], b[21]); |
|
|
a[16] = bitselect(a[16] ^ a[18], a[16], a[17]); |
|
|
a[21] = bitselect(b[21] ^ b[23], b[21], b[22]); |
|
|
a[17] = bitselect(a[17] ^ a[19], a[17], a[18]); |
|
|
a[22] = bitselect(b[22] ^ b[24], b[22], b[23]); |
|
|
a[18] = bitselect(a[18] ^ u, a[18], a[19]); |
|
|
a[23] = bitselect(b[23] ^ b[20], b[23], b[24]); |
|
|
a[19] = bitselect(a[19] ^ v, a[19], u); |
|
|
a[24] = bitselect(b[24] ^ b[21], b[24], b[20]); |
|
|
|
|
|
|
|
|
u = a[20]; v = a[21]; |
|
|
|
|
|
a[20] = bitselect(a[20] ^ a[22], a[20], a[21]); |
|
|
|
|
|
a[21] = bitselect(a[21] ^ a[23], a[21], a[22]); |
|
|
|
|
|
a[22] = bitselect(a[22] ^ a[24], a[22], a[23]); |
|
|
|
|
|
a[23] = bitselect(a[23] ^ u, a[23], a[24]); |
|
|
|
|
|
a[24] = bitselect(a[24] ^ v, a[24], u); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
@ -257,12 +282,11 @@ static ulong compute_hash( |
|
|
uint const thread_id = gid & 7; |
|
|
uint const thread_id = gid & 7; |
|
|
uint const hash_id = (gid & (GROUP_SIZE-1)) >> 3; |
|
|
uint const hash_id = (gid & (GROUP_SIZE-1)) >> 3; |
|
|
|
|
|
|
|
|
uint i = 0; |
|
|
for (int i = 0; i < THREADS_PER_HASH; i++) |
|
|
do |
|
|
|
|
|
{ |
|
|
{ |
|
|
// share init with other threads |
|
|
// share init with other threads |
|
|
if (i == thread_id) |
|
|
if (i == thread_id) |
|
|
copy(share[hash_id].uint4s, state, 4); |
|
|
copy(share[hash_id].ulongs, state, 8); |
|
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
@ -277,12 +301,10 @@ static ulong compute_hash( |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
uint init0 = *share0; |
|
|
uint init0 = *share0; |
|
|
|
|
|
|
|
|
uint a = 0; |
|
|
for (uint a = 0; a < ACCESSES; a += 4) |
|
|
do |
|
|
|
|
|
{ |
|
|
{ |
|
|
bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); |
|
|
bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); |
|
|
|
|
|
|
|
|
#pragma unroll |
|
|
|
|
|
for (uint i = 0; i != 4; ++i) |
|
|
for (uint i = 0; i != 4; ++i) |
|
|
{ |
|
|
{ |
|
|
if (update_share) |
|
|
if (update_share) |
|
@ -293,7 +315,7 @@ static ulong compute_hash( |
|
|
|
|
|
|
|
|
mix = fnv4(mix, g_dag[*share0].uint4s[thread_id]); |
|
|
mix = fnv4(mix, g_dag[*share0].uint4s[thread_id]); |
|
|
} |
|
|
} |
|
|
} while ((a += 4) != (ACCESSES & isolate)); |
|
|
} |
|
|
|
|
|
|
|
|
share[hash_id].uints[thread_id] = fnv_reduce(mix); |
|
|
share[hash_id].uints[thread_id] = fnv_reduce(mix); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
@ -303,7 +325,6 @@ static ulong compute_hash( |
|
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
} |
|
|
} |
|
|
while (++i != (THREADS_PER_HASH & isolate)); |
|
|
|
|
|
|
|
|
|
|
|
// keccak_256(keccak_512(header..nonce) .. mix); |
|
|
// keccak_256(keccak_512(header..nonce) .. mix); |
|
|
keccak_f1600_no_absorb(state, 12, 4, isolate); |
|
|
keccak_f1600_no_absorb(state, 12, 4, isolate); |
|
|