|
|
|
#include "ethash_cuda_miner_kernel_globals.h"
|
|
|
|
#include "ethash_cuda_miner_kernel.h"
|
|
|
|
#include "keccak.cuh"
|
|
|
|
#include "dagger.cuh"
|
|
|
|
|
|
|
|
typedef union
|
|
|
|
{
|
|
|
|
hash64_t init;
|
|
|
|
hash32_t mix;
|
|
|
|
} compute_hash_share;
|
|
|
|
|
|
|
|
__device__ hash64_t init_hash(uint64_t nonce)
|
|
|
|
{
|
|
|
|
hash64_t init;
|
|
|
|
|
|
|
|
// sha3_512(header .. nonce)
|
|
|
|
uint64_t state[25];
|
|
|
|
|
|
|
|
state[0] = d_header.uint64s[0];
|
|
|
|
state[1] = d_header.uint64s[1];
|
|
|
|
state[2] = d_header.uint64s[2];
|
|
|
|
state[3] = d_header.uint64s[3];
|
|
|
|
state[4] = nonce;
|
|
|
|
|
|
|
|
keccak_f1600_init((uint2 *)state);
|
|
|
|
copy(init.uint64s, state, 8);
|
|
|
|
return init;
|
|
|
|
}
|
|
|
|
|
|
|
|
__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share)
|
|
|
|
{
|
|
|
|
// share init0
|
|
|
|
if (thread_id == 0)
|
|
|
|
*share = mix.x;
|
|
|
|
|
|
|
|
uint32_t init0 = *share;
|
|
|
|
|
|
|
|
uint32_t a = 0;
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
|
|
|
|
bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));
|
|
|
|
|
|
|
|
//#pragma unroll 4
|
|
|
|
for (uint32_t i = 0; i < 4; i++)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (update_share)
|
|
|
|
{
|
|
|
|
uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w };
|
|
|
|
*share = fnv(init0 ^ (a + i), m[i]) % d_dag_size;
|
|
|
|
}
|
|
|
|
__threadfence_block();
|
|
|
|
|
|
|
|
#if __CUDA_ARCH__ >= 350
|
|
|
|
mix = fnv4(mix, __ldg((&d_dag[*share])->uint4s + thread_id));
|
|
|
|
#else
|
|
|
|
mix = fnv4(mix, (&d_dag[*share])->uint4s[thread_id]);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
} while ((a += 4) != ACCESSES);
|
|
|
|
|
|
|
|
return fnv_reduce(mix);
|
|
|
|
}
|
|
|
|
|
|
|
|
__device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
|
|
|
|
{
|
|
|
|
uint64_t state[25];
|
|
|
|
|
|
|
|
hash32_t hash;
|
|
|
|
|
|
|
|
// keccak_256(keccak_512(header..nonce) .. mix);
|
|
|
|
copy(state, init->uint64s, 8);
|
|
|
|
copy(state + 8, mix->uint64s, 4);
|
|
|
|
|
|
|
|
keccak_f1600_final((uint2 *)state);
|
|
|
|
|
|
|
|
// copy out
|
|
|
|
copy(hash.uint64s, state, 4);
|
|
|
|
return hash;
|
|
|
|
}
|
|
|
|
|
|
|
|
__device__ hash32_t compute_hash(
|
|
|
|
uint64_t nonce
|
|
|
|
)
|
|
|
|
{
|
|
|
|
extern __shared__ compute_hash_share share[];
|
|
|
|
|
|
|
|
// Compute one init hash per work item.
|
|
|
|
hash64_t init = init_hash(nonce);
|
|
|
|
|
|
|
|
// Threads work together in this phase in groups of 8.
|
|
|
|
uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
|
|
|
|
uint32_t const hash_id = threadIdx.x >> 3;
|
|
|
|
|
|
|
|
hash32_t mix;
|
|
|
|
|
|
|
|
for (int i = 0; i < THREADS_PER_HASH; i++)
|
|
|
|
{
|
|
|
|
// share init with other threads
|
|
|
|
if (i == thread_id)
|
|
|
|
share[hash_id].init = init;
|
|
|
|
|
|
|
|
uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3];
|
|
|
|
|
|
|
|
uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s);
|
|
|
|
|
|
|
|
share[hash_id].mix.uint32s[thread_id] = thread_mix;
|
|
|
|
|
|
|
|
|
|
|
|
if (i == thread_id)
|
|
|
|
mix = share[hash_id].mix;
|
|
|
|
}
|
|
|
|
|
|
|
|
return final_hash(&init, &mix);
|
|
|
|
}
|