diff --git a/libethash-cuda/dagger_shared.cuh b/libethash-cuda/dagger_shared.cuh new file mode 100644 index 000000000..e21ff9c52 --- /dev/null +++ b/libethash-cuda/dagger_shared.cuh @@ -0,0 +1,78 @@ +#include "ethash_cuda_miner_kernel_globals.h" +#include "ethash_cuda_miner_kernel.h" +#include "keccak.cuh" +#include "fnv.cuh" + +#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; } + +typedef union { + uint4 uint4s[4]; + uint2 uint2s[8]; + uint32_t uints[16]; +} compute_hash_share; + + +__device__ uint64_t compute_hash( + uint64_t nonce + ) +{ + // sha3_512(header .. nonce) + uint2 state[25]; + state[4] = vectorize(nonce); + keccak_f1600_init(state); + + // Threads work together in this phase in groups of 8. + const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); + const int hash_id = threadIdx.x >> 3; + + extern __shared__ compute_hash_share share[]; + + uint4 mix; + + for (int i = 0; i < THREADS_PER_HASH; i++) + { + // share init with other threads + if (i == thread_id) + copy(share[hash_id].uint2s, state, 8); + + __syncthreads(); + + uint4 mix = share[hash_id].uint4s[thread_id & 3]; + __syncthreads(); + + uint32_t *share0 = share[hash_id].uints; + + // share init0 + if (thread_id == 0) + *share0 = mix.x; + __syncthreads(); + uint32_t init0 = *share0; + + for (uint32_t a = 0; a < ACCESSES; a += 4) + { + bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); + + for (uint32_t i = 0; i != 4; ++i) + { + if (update_share) + { + *share0 = fnv(init0 ^ (a + i), ((uint32_t *)&mix)[i]) % d_dag_size; + } + __syncthreads(); + + mix = fnv4(mix, d_dag[*share0].uint4s[thread_id]); + } + } + + share[hash_id].uints[thread_id] = fnv_reduce(mix); + __syncthreads(); + + if (i == thread_id) + copy(state + 8, share[hash_id].uint2s, 4); + + __syncthreads(); + } + + // keccak_256(keccak_512(header..nonce) .. mix); + return keccak_f1600_final(state); +} \ No newline at end of file diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh index 598e5c08e..58ee6d5f9 100644 --- a/libethash-cuda/dagger_shuffled.cuh +++ b/libethash-cuda/dagger_shuffled.cuh @@ -3,10 +3,11 @@ #include "keccak.cuh" #include "fnv.cuh" -#define ACCESSES 64 -#define THREADS_PER_HASH (128 / 16) +#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH) -__device__ uint64_t compute_hash_shuffle( +typedef bool compute_hash_share; + +__device__ uint64_t compute_hash( uint64_t nonce ) { diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu index 6e594b380..64ce6ff90 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.cu +++ b/libethash-cuda/ethash_cuda_miner_kernel.cu @@ -8,96 +8,13 @@ #include "ethash_cuda_miner_kernel_globals.h" #include "cuda_helper.h" -#include "keccak.cuh" -#include "fnv.cuh" +#define SHUFFLE_MIN_VER 300 -#define ACCESSES 64 -#define THREADS_PER_HASH (128 / 16) - -__device__ uint64_t compute_hash_shuffle( - uint64_t nonce - ) -{ - // sha3_512(header .. nonce) - uint2 state[25]; - - state[4] = vectorize(nonce); - - keccak_f1600_init(state); - - // Threads work together in this phase in groups of 8. - const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); - const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1); - const int mix_idx = thread_id & 3; - - uint4 mix; - uint2 shuffle[8]; - - for (int i = 0; i < THREADS_PER_HASH; i++) - { - // share init among threads - for (int j = 0; j < 8; j++) { - shuffle[j].x = __shfl(state[j].x, i, THREADS_PER_HASH); - shuffle[j].y = __shfl(state[j].y, i, THREADS_PER_HASH); - } - - // ugly but avoids local reads/writes - if (mix_idx < 2) { - if (mix_idx == 0) - mix = vectorize2(shuffle[0], shuffle[1]); - else - mix = vectorize2(shuffle[2], shuffle[3]); - } - else { - if (mix_idx == 2) - mix = vectorize2(shuffle[4], shuffle[5]); - else - mix = vectorize2(shuffle[6], shuffle[7]); - } - - uint32_t init0 = __shfl(shuffle[0].x, start_lane); - - for (uint32_t a = 0; a < ACCESSES; a += 4) - { - int t = ((a >> 2) & (THREADS_PER_HASH - 1)); - - for (uint32_t b = 0; b < 4; b++) - { - if (thread_id == t) - { - shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size; - } - shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH); - - mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]); - } - } - - uint32_t thread_mix = fnv_reduce(mix); - - // update mix accross threads - - shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH); - shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH); - shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH); - shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH); - shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH); - shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH); - shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH); - shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH); - - if (i == thread_id) { - //move mix into state: - state[8] = shuffle[0]; - state[9] = shuffle[1]; - state[10] = shuffle[2]; - state[11] = shuffle[3]; - } - } - - // keccak_256(keccak_512(header..nonce) .. mix); - return keccak_f1600_final(state); -} +#if __CUDA_ARCH__ < SHUFFLE_MIN_VER +#include "dagger_shared.cuh" +#else +#include "dagger_shuffled.cuh" +#endif __global__ void __launch_bounds__(896, 1) @@ -107,7 +24,7 @@ ethash_search( ) { uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; - uint64_t hash = compute_hash_shuffle(start_nonce + gid); + uint64_t hash = compute_hash(start_nonce + gid); if (cuda_swab64(hash) > d_target) return; uint32_t index = atomicInc(const_cast(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1; g_output[index] = gid; @@ -122,7 +39,7 @@ void run_ethash_search( uint64_t start_nonce ) { - ethash_search <<>>(g_output, start_nonce); + ethash_search << > >(g_output, start_nonce); CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/libethash-cuda/ethash_cuda_miner_kernel.h b/libethash-cuda/ethash_cuda_miner_kernel.h index 17493eb83..d261d24fa 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.h +++ b/libethash-cuda/ethash_cuda_miner_kernel.h @@ -6,6 +6,8 @@ #include #define SEARCH_RESULT_BUFFER_SIZE 64 +#define ACCESSES 64 +#define THREADS_PER_HASH (128 / 16) typedef struct {