Browse Source

..

cl-refactor
Genoil 9 years ago
parent
commit
1442229665
  1. 211
      libethash-cl/ethash_cl_miner_kernel.cl

211
libethash-cl/ethash_cl_miner_kernel.cl

@ -36,6 +36,22 @@ __constant uint2 const Keccak_f1600_RC[24] = {
(uint2)(0x80008008, 0x80000000), (uint2)(0x80008008, 0x80000000),
}; };
/*
 * Rotate a 64-bit value, held as two 32-bit lanes, left by n bits.
 *
 * v.x is treated as the low 32 bits and v.y as the high 32 bits: bits
 * shifted out of the top of x enter the bottom of y, and bits shifted out
 * of the top of y wrap around into the bottom of x.
 *
 * v  the value to rotate.
 * n  the rotation count. It is reduced modulo 64, so 0 and 64 return v
 *    unchanged and 32 returns v with its lanes swapped.
 *
 * Returns the rotated value.
 */
static uint2 ROL2(const uint2 v, const int n)
{
	/* Only the low six bits of the count matter for a 64-bit rotation. */
	const int r = n & 63;
	/* Remaining sub-word shift once whole-word rotation is accounted for. */
	const int s = r & 31;

	/* Rotating by 32 or more starts with a swap of the two lanes. */
	uint2 w = v;
	if (r >= 32)
		w = v.yx;

	/*
	 * A sub-word shift of zero must be handled separately: the general
	 * formula below would shift a 32-bit lane by 32, which does not
	 * produce zero and would OR both lanes together.
	 */
	if (s == 0)
		return w;

	uint2 result;
	result.x = (w.x << s) | (w.y >> (32 - s));
	result.y = (w.y << s) | (w.x >> (32 - s));
	return result;
}
static void keccak_f1600_round(uint2* a, uint r, uint out_size) static void keccak_f1600_round(uint2* a, uint r, uint out_size)
{ {
#if !__ENDIAN_LITTLE__ #if !__ENDIAN_LITTLE__
@ -43,104 +59,113 @@ static void keccak_f1600_round(uint2* a, uint r, uint out_size)
a[i] = a[i].yx; a[i] = a[i].yx;
#endif #endif
uint2 b[25]; uint2 t[5];
uint2 t; uint2 u, v;
// Theta // Theta
b[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]; t[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20];
b[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]; t[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21];
b[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]; t[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22];
b[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]; t[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23];
b[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]; t[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24];
t = b[4] ^ (uint2)(b[1].x << 1 | b[1].y >> 31, b[1].y << 1 | b[1].x >> 31); u = t[4] ^ ROL2(t[1], 1);
a[0] ^= t; a[0] ^= u;
a[5] ^= t; a[5] ^= u;
a[10] ^= t; a[10] ^= u;
a[15] ^= t; a[15] ^= u;
a[20] ^= t; a[20] ^= u;
t = b[0] ^ (uint2)(b[2].x << 1 | b[2].y >> 31, b[2].y << 1 | b[2].x >> 31); u = t[0] ^ ROL2(t[2], 1);
a[1] ^= t; a[1] ^= u;
a[6] ^= t; a[6] ^= u;
a[11] ^= t; a[11] ^= u;
a[16] ^= t; a[16] ^= u;
a[21] ^= t; a[21] ^= u;
t = b[1] ^ (uint2)(b[3].x << 1 | b[3].y >> 31, b[3].y << 1 | b[3].x >> 31); u = t[1] ^ ROL2(t[3], 1);
a[2] ^= t; a[2] ^= u;
a[7] ^= t; a[7] ^= u;
a[12] ^= t; a[12] ^= u;
a[17] ^= t; a[17] ^= u;
a[22] ^= t; a[22] ^= u;
t = b[2] ^ (uint2)(b[4].x << 1 | b[4].y >> 31, b[4].y << 1 | b[4].x >> 31); u = t[2] ^ ROL2(t[4], 1);
a[3] ^= t; a[3] ^= u;
a[8] ^= t; a[8] ^= u;
a[13] ^= t; a[13] ^= u;
a[18] ^= t; a[18] ^= u;
a[23] ^= t; a[23] ^= u;
t = b[3] ^ (uint2)(b[0].x << 1 | b[0].y >> 31, b[0].y << 1 | b[0].x >> 31); u = t[3] ^ ROL2(t[0], 1);
a[4] ^= t; a[4] ^= u;
a[9] ^= t; a[9] ^= u;
a[14] ^= t; a[14] ^= u;
a[19] ^= t; a[19] ^= u;
a[24] ^= t; a[24] ^= u;
// Rho Pi // Rho Pi
b[0] = a[0]; u = a[1];
b[10] = (uint2)(a[1].x << 1 | a[1].y >> 31, a[1].y << 1 | a[1].x >> 31); a[1] = ROL2(a[6], 44);
b[7] = (uint2)(a[10].x << 3 | a[10].y >> 29, a[10].y << 3 | a[10].x >> 29); a[6] = ROL2(a[9], 20);
b[11] = (uint2)(a[7].x << 6 | a[7].y >> 26, a[7].y << 6 | a[7].x >> 26); a[9] = ROL2(a[22], 61);
b[17] = (uint2)(a[11].x << 10 | a[11].y >> 22, a[11].y << 10 | a[11].x >> 22); a[22] = ROL2(a[14], 39);
b[18] = (uint2)(a[17].x << 15 | a[17].y >> 17, a[17].y << 15 | a[17].x >> 17); a[14] = ROL2(a[20], 18);
b[3] = (uint2)(a[18].x << 21 | a[18].y >> 11, a[18].y << 21 | a[18].x >> 11); a[20] = ROL2(a[2], 62);
b[5] = (uint2)(a[3].x << 28 | a[3].y >> 4, a[3].y << 28 | a[3].x >> 4); a[2] = ROL2(a[12], 43);
b[16] = (uint2)(a[5].y << 4 | a[5].x >> 28, a[5].x << 4 | a[5].y >> 28); a[12] = ROL2(a[13], 25);
b[8] = (uint2)(a[16].y << 13 | a[16].x >> 19, a[16].x << 13 | a[16].y >> 19); a[13] = ROL2(a[19], 8);
b[21] = (uint2)(a[8].y << 23 | a[8].x >> 9, a[8].x << 23 | a[8].y >> 9); a[19] = ROL2(a[23], 56);
b[24] = (uint2)(a[21].x << 2 | a[21].y >> 30, a[21].y << 2 | a[21].x >> 30); a[23] = ROL2(a[15], 41);
b[4] = (uint2)(a[24].x << 14 | a[24].y >> 18, a[24].y << 14 | a[24].x >> 18); a[15] = ROL2(a[4], 27);
b[15] = (uint2)(a[4].x << 27 | a[4].y >> 5, a[4].y << 27 | a[4].x >> 5); a[4] = ROL2(a[24], 14);
b[23] = (uint2)(a[15].y << 9 | a[15].x >> 23, a[15].x << 9 | a[15].y >> 23); a[24] = ROL2(a[21], 2);
b[19] = (uint2)(a[23].y << 24 | a[23].x >> 8, a[23].x << 24 | a[23].y >> 8); a[21] = ROL2(a[8], 55);
b[13] = (uint2)(a[19].x << 8 | a[19].y >> 24, a[19].y << 8 | a[19].x >> 24); a[8] = ROL2(a[16], 45);
b[12] = (uint2)(a[13].x << 25 | a[13].y >> 7, a[13].y << 25 | a[13].x >> 7); a[16] = ROL2(a[5], 36);
b[2] = (uint2)(a[12].y << 11 | a[12].x >> 21, a[12].x << 11 | a[12].y >> 21); a[5] = ROL2(a[3], 28);
b[20] = (uint2)(a[2].y << 30 | a[2].x >> 2, a[2].x << 30 | a[2].y >> 2); a[3] = ROL2(a[18], 21);
b[14] = (uint2)(a[20].x << 18 | a[20].y >> 14, a[20].y << 18 | a[20].x >> 14); a[18] = ROL2(a[17], 15);
b[22] = (uint2)(a[14].y << 7 | a[14].x >> 25, a[14].x << 7 | a[14].y >> 25); a[17] = ROL2(a[11], 10);
b[9] = (uint2)(a[22].y << 29 | a[22].x >> 3, a[22].x << 29 | a[22].y >> 3); a[11] = ROL2(a[7], 6);
b[6] = (uint2)(a[9].x << 20 | a[9].y >> 12, a[9].y << 20 | a[9].x >> 12); a[7] = ROL2(a[10], 3);
b[1] = (uint2)(a[6].y << 12 | a[6].x >> 20, a[6].x << 12 | a[6].y >> 20); a[10] = ROL2(u, 1);
// Chi // Chi
a[0] = bitselect(b[0] ^ b[2], b[0], b[1]); u = a[0]; v = a[1];
a[0] = bitselect(a[0] ^ a[2], a[0], a[1]);
if (out_size > 4) if (out_size > 4)
{ {
a[1] = bitselect(b[1] ^ b[3], b[1], b[2]); a[1] = bitselect(a[1] ^ a[3], a[1], a[2]);
a[2] = bitselect(b[2] ^ b[4], b[2], b[3]); a[2] = bitselect(a[2] ^ a[4], a[2], a[3]);
a[3] = bitselect(b[3] ^ b[0], b[3], b[4]); a[3] = bitselect(a[3] ^ u, a[3], a[4]);
a[4] = bitselect(b[4] ^ b[1], b[4], b[0]); a[4] = bitselect(a[4] ^ v, a[4], u);
a[5] = bitselect(b[5] ^ b[7], b[5], b[6]);
a[6] = bitselect(b[6] ^ b[8], b[6], b[7]); u = a[5]; v = a[6];
a[7] = bitselect(b[7] ^ b[9], b[7], b[8]); a[5] = bitselect(a[5] ^ a[7], a[5], a[6]);
a[8] = bitselect(b[8] ^ b[5], b[8], b[9]); a[6] = bitselect(a[6] ^ a[8], a[6], a[7]);
a[7] = bitselect(a[7] ^ a[9], a[7], a[8]);
a[8] = bitselect(a[8] ^ u, a[8], a[9]);
if (out_size > 8) if (out_size > 8)
{ {
a[9] = bitselect(b[9] ^ b[6], b[9], b[5]); a[9] = bitselect(a[9] ^ v, a[9], u);
a[10] = bitselect(b[10] ^ b[12], b[10], b[11]);
a[11] = bitselect(b[11] ^ b[13], b[11], b[12]); u = a[10]; v = a[11];
a[12] = bitselect(b[12] ^ b[14], b[12], b[13]); a[10] = bitselect(a[10] ^ a[12], a[10], a[11]);
a[13] = bitselect(b[13] ^ b[10], b[13], b[14]); a[11] = bitselect(a[11] ^ a[13], a[11], a[12]);
a[14] = bitselect(b[14] ^ b[11], b[14], b[10]); a[12] = bitselect(a[12] ^ a[14], a[12], a[13]);
a[15] = bitselect(b[15] ^ b[17], b[15], b[16]); a[13] = bitselect(a[13] ^ u, a[13], a[14]);
a[16] = bitselect(b[16] ^ b[18], b[16], b[17]); a[14] = bitselect(a[14] ^ v, a[14], u);
a[17] = bitselect(b[17] ^ b[19], b[17], b[18]);
a[18] = bitselect(b[18] ^ b[15], b[18], b[19]); u = a[15]; v = a[16];
a[19] = bitselect(b[19] ^ b[16], b[19], b[15]); a[15] = bitselect(a[15] ^ a[17], a[15], a[16]);
a[20] = bitselect(b[20] ^ b[22], b[20], b[21]); a[16] = bitselect(a[16] ^ a[18], a[16], a[17]);
a[21] = bitselect(b[21] ^ b[23], b[21], b[22]); a[17] = bitselect(a[17] ^ a[19], a[17], a[18]);
a[22] = bitselect(b[22] ^ b[24], b[22], b[23]); a[18] = bitselect(a[18] ^ u, a[18], a[19]);
a[23] = bitselect(b[23] ^ b[20], b[23], b[24]); a[19] = bitselect(a[19] ^ v, a[19], u);
a[24] = bitselect(b[24] ^ b[21], b[24], b[20]);
u = a[20]; v = a[21];
a[20] = bitselect(a[20] ^ a[22], a[20], a[21]);
a[21] = bitselect(a[21] ^ a[23], a[21], a[22]);
a[22] = bitselect(a[22] ^ a[24], a[22], a[23]);
a[23] = bitselect(a[23] ^ u, a[23], a[24]);
a[24] = bitselect(a[24] ^ v, a[24], u);
} }
} }
@ -257,12 +282,11 @@ static ulong compute_hash(
uint const thread_id = gid & 7; uint const thread_id = gid & 7;
uint const hash_id = (gid & (GROUP_SIZE-1)) >> 3; uint const hash_id = (gid & (GROUP_SIZE-1)) >> 3;
uint i = 0; for (int i = 0; i < THREADS_PER_HASH; i++)
do
{ {
// share init with other threads // share init with other threads
if (i == thread_id) if (i == thread_id)
copy(share[hash_id].uint4s, state, 4); copy(share[hash_id].ulongs, state, 8);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -277,12 +301,10 @@ static ulong compute_hash(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
uint init0 = *share0; uint init0 = *share0;
uint a = 0; for (uint a = 0; a < ACCESSES; a += 4)
do
{ {
bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));
#pragma unroll
for (uint i = 0; i != 4; ++i) for (uint i = 0; i != 4; ++i)
{ {
if (update_share) if (update_share)
@ -293,7 +315,7 @@ static ulong compute_hash(
mix = fnv4(mix, g_dag[*share0].uint4s[thread_id]); mix = fnv4(mix, g_dag[*share0].uint4s[thread_id]);
} }
} while ((a += 4) != (ACCESSES & isolate)); }
share[hash_id].uints[thread_id] = fnv_reduce(mix); share[hash_id].uints[thread_id] = fnv_reduce(mix);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -303,7 +325,6 @@ static ulong compute_hash(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
while (++i != (THREADS_PER_HASH & isolate));
// keccak_256(keccak_512(header..nonce) .. mix); // keccak_256(keccak_512(header..nonce) .. mix);
keccak_f1600_no_absorb(state, 12, 4, isolate); keccak_f1600_no_absorb(state, 12, 4, isolate);

Loading…
Cancel
Save