9 changed files with 112 additions and 192 deletions
@ -1,22 +0,0 @@ |
|||
// Element-wise copy of `count` items from `src` to `dst` using operator[].
//
//  - Every macro argument is parenthesized, so an expression such as
//    `flag ? 2 : 4` can be passed as `count` without changing the loop test.
//  - The body is wrapped in do { ... } while (0) so that `copy(...);` is a
//    single statement and can be used as the unbraced branch of an if/else.
//  - The loop index uses a reserved-looking name so it cannot capture a
//    variable called `i` that appears in the caller's arguments.
//
// `count` is re-evaluated on every iteration (as in the previous version), so
// it should be free of side effects. Call sites must end with a semicolon.
#define copy(dst, src, count)                                                  \
    do {                                                                       \
        for (uint32_t copy_idx_ = 0; copy_idx_ < (count); copy_idx_++) {       \
            (dst)[copy_idx_] = (src)[copy_idx_];                               \
        }                                                                      \
    } while (0)
|||
|
|||
// Loop bound for the mixing loop that consumes these constants.
// presumably the number of DAG reads performed per hash — confirm against the caller.
#define ACCESSES 64

// Evaluates to 8.
// presumably 128 bytes per DAG entry divided by 16 bytes per uint4 — TODO confirm.
#define THREADS_PER_HASH (128 / 16)

// Multiplier used by the fnv() mixing step below.
#define FNV_PRIME 0x01000193

// One mixing step: multiply x by FNV_PRIME, then XOR the product with y.
// The multiplication binds tighter than the XOR; the extra parentheses only
// make that explicit.
#define fnv(x, y) (((x) * FNV_PRIME) ^ (y))
|||
|
|||
// Applies the FNV mixing step independently to each of the four 32-bit
// components: result.k = (a.k * FNV_PRIME) ^ b.k for k in {x, y, z, w}.
__device__ uint4 fnv4(uint4 a, uint4 b)
{
    return make_uint4((a.x * FNV_PRIME) ^ b.x,
                      (a.y * FNV_PRIME) ^ b.y,
                      (a.z * FNV_PRIME) ^ b.z,
                      (a.w * FNV_PRIME) ^ b.w);
}
|||
|
|||
// Folds the four components of v into a single 32-bit word by chaining the
// fnv() step left to right: fnv(fnv(fnv(x, y), z), w).
__device__ uint32_t fnv_reduce(uint4 v)
{
    uint32_t folded = fnv(v.x, v.y);
    folded = fnv(folded, v.z);
    folded = fnv(folded, v.w);
    return folded;
}
@ -1,119 +0,0 @@ |
|||
#include "ethash_cuda_miner_kernel_globals.h" |
|||
#include "ethash_cuda_miner_kernel.h" |
|||
#include "keccak.cuh" |
|||
#include "dagger.cuh" |
|||
|
|||
// Scratch slot used by compute_hash(), which declares an extern __shared__
// array of these and indexes it by hash group.
// Both members overlay the same storage: writing `init` overwrites `mix`
// and vice versa.
union compute_hash_share
{
    hash64_t init;  // value produced by init_hash(), published to the group
    hash32_t mix;   // per-thread mix words; mix.uint32s[0] is also handed to
                    // inner_loop() as its shared index cell
};
|||
|
|||
// Builds the initial 64-byte hash for one nonce: sha3_512(header .. nonce).
//
// nonce   - value placed in the state word that follows the header.
// returns - the first eight 64-bit words of the state after
//           keccak_f1600_init() has run.
__device__ hash64_t init_hash(uint64_t nonce)
{
    uint64_t state[25];

    // Words 0..3 come from the header, word 4 is the nonce.
    for (uint32_t w = 0; w < 4; w++)
    {
        state[w] = d_header.uint64s[w];
    }
    state[4] = nonce;

    // NOTE(review): state[5..24] are not written here; presumably
    // keccak_f1600_init() supplies padding and zeroes itself — confirm.
    keccak_f1600_init((uint2 *)state);

    hash64_t init;
    for (uint32_t w = 0; w < 8; w++)
    {
        init.uint64s[w] = state[w];
    }
    return init;
}
|||
|
|||
// Runs the DAG mixing loop for one thread of a hash group.
//
// mix       - this thread's 16-byte starting mix value.
// thread_id - index of the thread inside its group
//             (the caller passes threadIdx.x & (THREADS_PER_HASH - 1)).
// share     - single 32-bit cell that every thread of the group reads and one
//             thread at a time writes; it carries the current DAG index.
// returns   - the final mix folded to 32 bits with fnv_reduce().
//
// NOTE(review): the only ordering primitive between a write to *share and the
// other threads' reads is __threadfence_block(), which is a fence and not a
// barrier. This appears to rely on the group's threads executing in lockstep;
// confirm that holds on every targeted architecture.
__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share)
{
    // Thread 0 publishes its first mix word; every thread then reads it back
    // as the common seed.
    if (thread_id == 0)
    {
        share[0] = mix.x;
    }
    uint32_t const seed = share[0];

    // `base` advances in steps of four up to ACCESSES; each step performs four
    // DAG reads, one per 32-bit word of the writer thread's mix.
    for (uint32_t base = 0; base != ACCESSES; base += 4)
    {
        // The writer role rotates through the group every four accesses.
        bool const is_writer = ((base >> 2) & (THREADS_PER_HASH - 1)) == thread_id;

        for (uint32_t word = 0; word < 4; word++)
        {
            if (is_writer)
            {
                uint32_t lanes[4] = { mix.x, mix.y, mix.z, mix.w };
                share[0] = fnv(seed ^ (base + word), lanes[word]) % d_dag_size;
            }
            __threadfence_block();

            // Each thread fetches its own uint4 slice of the selected DAG entry.
#if __CUDA_ARCH__ >= 350
            uint4 const dag_slice = __ldg(d_dag[share[0]].uint4s + thread_id);
#else
            uint4 const dag_slice = d_dag[share[0]].uint4s[thread_id];
#endif
            mix = fnv4(mix, dag_slice);
        }
    }

    return fnv_reduce(mix);
}
|||
|
|||
// Produces the final 32-byte hash: keccak_256(keccak_512(header..nonce) .. mix).
//
// init    - the 64-byte value returned by init_hash().
// mix     - the 32-byte mix digest.
// returns - the first four 64-bit state words after keccak_f1600_final().
__device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
{
    uint64_t state[25];

    // State layout: words 0..7 = init, words 8..11 = mix.
    for (uint32_t w = 0; w < 8; w++)
    {
        state[w] = init->uint64s[w];
    }
    for (uint32_t w = 0; w < 4; w++)
    {
        state[8 + w] = mix->uint64s[w];
    }

    // NOTE(review): state[12..24] are not written here; presumably
    // keccak_f1600_final() supplies padding and zeroes itself — confirm.
    keccak_f1600_final((uint2 *)state);

    // Copy out the leading 32 bytes.
    hash32_t hash;
    for (uint32_t w = 0; w < 4; w++)
    {
        hash.uint64s[w] = state[w];
    }
    return hash;
}
|||
|
|||
// Computes the full hash for one nonce.
//
// Each thread first derives its own init hash. The threads of a group of
// THREADS_PER_HASH then take turns: in round r the thread with thread_id == r
// publishes its init hash to the group's shared slot, the whole group runs
// inner_loop() on it, and that same thread collects the resulting mix.
//
// Uses the dynamically sized `extern __shared__` array `share`, indexed by
// hash group.
// NOTE(review): the launch presumably provides at least one
// compute_hash_share per group of THREADS_PER_HASH threads — confirm at the
// launch site.
// NOTE(review): no barrier separates the shared-memory writes from the reads
// by other threads of the group; this appears to rely on the group executing
// in lockstep — confirm for the targeted architectures.
__device__ hash32_t compute_hash(
    uint64_t nonce
    )
{
    extern __shared__ compute_hash_share share[];

    // One init hash per work item.
    hash64_t init = init_hash(nonce);

    // Position inside the cooperating group, and which group this is.
    uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
    uint32_t const hash_id = threadIdx.x / THREADS_PER_HASH;

    compute_hash_share* const slot = &share[hash_id];

    hash32_t mix;

    for (uint32_t round = 0; round < THREADS_PER_HASH; round++)
    {
        bool const my_turn = (round == thread_id);

        // The thread whose turn it is shares its init hash with the group.
        if (my_turn)
        {
            slot->init = init;
        }

        // init holds four uint4 values; thread t starts from element t & 3.
        uint4 thread_init = slot->init.uint4s[thread_id & 3];

        // The first mix word doubles as inner_loop()'s shared index cell.
        uint32_t thread_mix = inner_loop(thread_init, thread_id, slot->mix.uint32s);

        slot->mix.uint32s[thread_id] = thread_mix;

        // The same thread gathers the group's eight mix words.
        if (my_turn)
        {
            mix = slot->mix;
        }
    }

    return final_hash(&init, &mix);
}
Loading…
Reference in new issue