|
@ -2,12 +2,13 @@ |
|
|
#include "ethash_cuda_miner_kernel.h" |
|
|
#include "ethash_cuda_miner_kernel.h" |
|
|
#include "cuda_helper.h" |
|
|
#include "cuda_helper.h" |
|
|
|
|
|
|
|
|
__device__ uint64_t compute_hash( |
|
|
template <uint32_t _PARALLEL_HASH> |
|
|
|
|
|
__device__ __forceinline__ uint64_t compute_hash( |
|
|
uint64_t nonce |
|
|
uint64_t nonce |
|
|
) |
|
|
) |
|
|
{ |
|
|
{ |
|
|
// sha3_512(header .. nonce) |
|
|
// sha3_512(header .. nonce) |
|
|
uint2 state[25]; |
|
|
uint2 state[12]; |
|
|
|
|
|
|
|
|
state[4] = vectorize(nonce); |
|
|
state[4] = vectorize(nonce); |
|
|
|
|
|
|
|
@ -17,49 +18,58 @@ __device__ uint64_t compute_hash( |
|
|
const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); |
|
|
const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); |
|
|
const int mix_idx = thread_id & 3; |
|
|
const int mix_idx = thread_id & 3; |
|
|
|
|
|
|
|
|
uint4 mix; |
|
|
for (int i = 0; i < THREADS_PER_HASH; i += _PARALLEL_HASH) |
|
|
uint2 shuffle[8]; |
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < THREADS_PER_HASH; i++) |
|
|
|
|
|
{ |
|
|
{ |
|
|
|
|
|
uint4 mix[_PARALLEL_HASH]; |
|
|
|
|
|
uint32_t offset[_PARALLEL_HASH]; |
|
|
|
|
|
uint32_t init0[_PARALLEL_HASH]; |
|
|
|
|
|
|
|
|
// share init among threads |
|
|
// share init among threads |
|
|
for (int j = 0; j < 8; j++) { |
|
|
for (int p = 0; p < _PARALLEL_HASH; p++) |
|
|
shuffle[j].x = __shfl(state[j].x, i, THREADS_PER_HASH); |
|
|
{ |
|
|
shuffle[j].y = __shfl(state[j].y, i, THREADS_PER_HASH); |
|
|
uint2 shuffle[8]; |
|
|
|
|
|
for (int j = 0; j < 8; j++) |
|
|
|
|
|
{ |
|
|
|
|
|
shuffle[j].x = __shfl(state[j].x, i+p, THREADS_PER_HASH); |
|
|
|
|
|
shuffle[j].y = __shfl(state[j].y, i+p, THREADS_PER_HASH); |
|
|
} |
|
|
} |
|
|
|
|
|
switch (mix_idx) |
|
|
// ugly but avoids local reads/writes |
|
|
{ |
|
|
if (mix_idx < 2) { |
|
|
case 0: mix[p] = vectorize2(shuffle[0], shuffle[1]); break; |
|
|
if (mix_idx == 0) |
|
|
case 1: mix[p] = vectorize2(shuffle[2], shuffle[3]); break; |
|
|
mix = vectorize2(shuffle[0], shuffle[1]); |
|
|
case 2: mix[p] = vectorize2(shuffle[4], shuffle[5]); break; |
|
|
else |
|
|
case 3: mix[p] = vectorize2(shuffle[6], shuffle[7]); break; |
|
|
mix = vectorize2(shuffle[2], shuffle[3]); |
|
|
|
|
|
} |
|
|
} |
|
|
else { |
|
|
init0[p] = __shfl(shuffle[0].x, 0, THREADS_PER_HASH); |
|
|
if (mix_idx == 2) |
|
|
|
|
|
mix = vectorize2(shuffle[4], shuffle[5]); |
|
|
|
|
|
else |
|
|
|
|
|
mix = vectorize2(shuffle[6], shuffle[7]); |
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
uint32_t init0 = __shfl(shuffle[0].x, 0, THREADS_PER_HASH); |
|
|
|
|
|
|
|
|
|
|
|
for (uint32_t a = 0; a < ACCESSES; a += 4) |
|
|
for (uint32_t a = 0; a < ACCESSES; a += 4) |
|
|
{ |
|
|
{ |
|
|
int t = bfe(a, 2u, 3u); |
|
|
int t = bfe(a, 2u, 3u); |
|
|
|
|
|
|
|
|
for (uint32_t b = 0; b < 4; b++) |
|
|
for (uint32_t b = 0; b < 4; b++) |
|
|
{ |
|
|
{ |
|
|
if (thread_id == t) |
|
|
for (int p = 0; p < _PARALLEL_HASH; p++) |
|
|
{ |
|
|
{ |
|
|
shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size; |
|
|
offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size; |
|
|
|
|
|
offset[p] = __shfl(offset[p], t, THREADS_PER_HASH); |
|
|
} |
|
|
} |
|
|
shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH); |
|
|
#pragma unroll |
|
|
mix = fnv4(mix, d_dag[shuffle[0].x].uint4s[thread_id]); |
|
|
for (int p = 0; p < _PARALLEL_HASH; p++) |
|
|
|
|
|
{ |
|
|
|
|
|
//if(blockIdx.x == 0 && threadIdx.x==0 && offset[p] > (d_dag_size>>1)) //larger than half |
|
|
|
|
|
// printf("d_dag_size = %d offset[p] = %d\n", d_dag_size, offset[p]); |
|
|
|
|
|
mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
uint32_t thread_mix = fnv_reduce(mix); |
|
|
for (int p = 0; p < _PARALLEL_HASH; p++) |
|
|
|
|
|
{ |
|
|
|
|
|
uint2 shuffle[4]; |
|
|
|
|
|
uint32_t thread_mix = fnv_reduce(mix[p]); |
|
|
|
|
|
|
|
|
// update mix across threads |
|
|
// update mix across threads |
|
|
|
|
|
|
|
@ -72,7 +82,7 @@ __device__ uint64_t compute_hash( |
|
|
shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH); |
|
|
shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH); |
|
|
shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH); |
|
|
shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH); |
|
|
|
|
|
|
|
|
if (i == thread_id) { |
|
|
if ((i+p) == thread_id) { |
|
|
//move mix into state: |
|
|
//move mix into state: |
|
|
state[8] = shuffle[0]; |
|
|
state[8] = shuffle[0]; |
|
|
state[9] = shuffle[1]; |
|
|
state[9] = shuffle[1]; |
|
@ -80,6 +90,7 @@ __device__ uint64_t compute_hash( |
|
|
state[11] = shuffle[3]; |
|
|
state[11] = shuffle[3]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
// keccak_256(keccak_512(header..nonce) .. mix); |
|
|
// keccak_256(keccak_512(header..nonce) .. mix); |
|
|
return keccak_f1600_final(state); |
|
|
return keccak_f1600_final(state); |
|
|