#include "ethash_cuda_miner_kernel_globals.h" #include "ethash_cuda_miner_kernel.h" #include "keccak.cuh" #include "dagger.cuh" typedef union { hash64_t init; hash32_t mix; } compute_hash_share; __device__ hash64_t init_hash(uint64_t nonce) { hash64_t init; // sha3_512(header .. nonce) uint64_t state[25]; state[0] = d_header.uint64s[0]; state[1] = d_header.uint64s[1]; state[2] = d_header.uint64s[2]; state[3] = d_header.uint64s[3]; state[4] = nonce; keccak_f1600_init((uint2 *)state); copy(init.uint64s, state, 8); return init; } __device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share) { // share init0 if (thread_id == 0) *share = mix.x; uint32_t init0 = *share; uint32_t a = 0; do { bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); //#pragma unroll 4 for (uint32_t i = 0; i < 4; i++) { if (update_share) { uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w }; *share = fnv(init0 ^ (a + i), m[i]) % d_dag_size; } __threadfence_block(); #if __CUDA_ARCH__ >= 350 mix = fnv4(mix, __ldg((&d_dag[*share])->uint4s + thread_id)); #else mix = fnv4(mix, (&d_dag[*share])->uint4s[thread_id]); #endif } } while ((a += 4) != ACCESSES); return fnv_reduce(mix); } __device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix) { uint64_t state[25]; hash32_t hash; // keccak_256(keccak_512(header..nonce) .. mix); copy(state, init->uint64s, 8); copy(state + 8, mix->uint64s, 4); keccak_f1600_final((uint2 *)state); // copy out copy(hash.uint64s, state, 4); return hash; } __device__ hash32_t compute_hash( uint64_t nonce ) { extern __shared__ compute_hash_share share[]; // Compute one init hash per work item. hash64_t init = init_hash(nonce); // Threads work together in this phase in groups of 8. uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1); uint32_t const hash_id = threadIdx.x >> 3; hash32_t mix; for (int i = 0; i < THREADS_PER_HASH; i++) { // share init with other threads if (i == thread_id) share[hash_id].init = init; uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3]; uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s); share[hash_id].mix.uint32s[thread_id] = thread_mix; if (i == thread_id) mix = share[hash_id].mix; } return final_hash(&init, &mix); }