Browse Source

working cuda kernel with shared mem

cl-refactor
Genoil 9 years ago
parent
commit
89cedad4a6
  1. 78
      libethash-cuda/dagger_shared.cuh
  2. 7
      libethash-cuda/dagger_shuffled.cuh
  3. 99
      libethash-cuda/ethash_cuda_miner_kernel.cu
  4. 2
      libethash-cuda/ethash_cuda_miner_kernel.h

78
libethash-cuda/dagger_shared.cuh

@@ -0,0 +78 @@
#include "ethash_cuda_miner_kernel_globals.h"
#include "ethash_cuda_miner_kernel.h"
#include "keccak.cuh"
#include "fnv.cuh"
// Element-wise copy of `count` items from src to dst.
// Wrapped in do { } while (0) so the macro behaves as a single statement
// after a braceless if/else; the index is named copy_i_ so it cannot
// collide with (or shadow) an `i` in scope at the expansion site, as
// happens when it is expanded inside the per-hash loops below.
#define copy(dst, src, count) do { for (int copy_i_ = 0; copy_i_ != (count); ++copy_i_) { (dst)[copy_i_] = (src)[copy_i_]; } } while (0)
// 64-byte per-hash shared-memory slot. The union lets the same storage be
// addressed as 4x uint4 (mix lanes), 8x uint2 (keccak-state words) or
// 16x uint32_t (index broadcast / fnv_reduce results) during the different
// exchange phases of compute_hash().
typedef union {
	uint4 uint4s[4];
	uint2 uint2s[8];
	uint32_t uints[16];
} compute_hash_share;
// Full Ethash hash of one nonce, computed cooperatively by a group of
// THREADS_PER_HASH (8) threads that exchange data through shared memory.
// Requires (blockDim.x / THREADS_PER_HASH) * sizeof(compute_hash_share)
// bytes of dynamic shared memory at launch, and blockDim.x to be a
// multiple of THREADS_PER_HASH.
__device__ uint64_t compute_hash(
	uint64_t nonce
	)
{
	// sha3_512(header .. nonce) — every thread keccaks its own nonce.
	uint2 state[25];
	state[4] = vectorize(nonce);
	keccak_f1600_init(state);

	// Threads work together in this phase in groups of 8.
	const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
	const int hash_id = threadIdx.x >> 3;  // one shared slot per 8-thread group

	extern __shared__ compute_hash_share share[];

	// Iteration i processes the hash whose keccak state lives in thread i
	// of the group; all 8 threads cooperate on it.
	for (int i = 0; i < THREADS_PER_HASH; i++)
	{
		// Publish thread i's 64-byte keccak state to the group.
		if (i == thread_id)
			copy(share[hash_id].uint2s, state, 8);

		__syncthreads();

		// Each thread takes one 16-byte lane of the 128-byte mix
		// (lanes repeat every 4 threads). Declared here — there is no
		// outer `mix`; each phase starts from the shared state.
		uint4 mix = share[hash_id].uint4s[thread_id & 3];
		__syncthreads();

		uint32_t *share0 = share[hash_id].uints;

		// First word of the mix, broadcast from thread 0 of the group.
		if (thread_id == 0)
			*share0 = mix.x;
		__syncthreads();
		uint32_t init0 = *share0;

		for (uint32_t a = 0; a < ACCESSES; a += 4)
		{
			// Round-robin: one thread of the group computes the DAG index
			// for the next 4 accesses.
			bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));

			for (uint32_t b = 0; b != 4; ++b)
			{
				if (update_share)
				{
					*share0 = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
				}
				__syncthreads();

				mix = fnv4(mix, d_dag[*share0].uint4s[thread_id]);
				// Barrier before the next index overwrites *share0;
				// without it a fast thread races threads still reading
				// the current index (only implicit pre-Volta warp
				// lockstep hides the race).
				__syncthreads();
			}
		}

		// Fold each thread's 16-byte lane to 4 bytes; the group's 8 words
		// form the 32-byte mix digest of hash i.
		share[hash_id].uints[thread_id] = fnv_reduce(mix);
		__syncthreads();

		// Owner thread pulls the digest back into its keccak state.
		if (i == thread_id)
			copy(state + 8, share[hash_id].uint2s, 4);

		__syncthreads();
	}

	// keccak_256(keccak_512(header..nonce) .. mix);
	return keccak_f1600_final(state);
}

7
libethash-cuda/dagger_shuffled.cuh

@@ -3,10 +3,11 @@
#include "keccak.cuh"
#include "fnv.cuh"
#define ACCESSES 64
#define THREADS_PER_HASH (128 / 16)
#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
__device__ uint64_t compute_hash_shuffle(
// Dummy 1-byte type: the shuffled path needs no shared memory, but the
// launch site sizes its dynamic smem from sizeof(compute_hash_share),
// so this keeps that allocation minimal. (Presumably intentional — TODO
// confirm the launch site is the only user of this name.)
typedef bool compute_hash_share;
__device__ uint64_t compute_hash(
uint64_t nonce
)
{

99
libethash-cuda/ethash_cuda_miner_kernel.cu

@@ -8,96 +8,13 @@
#include "ethash_cuda_miner_kernel_globals.h"
#include "cuda_helper.h"
#include "keccak.cuh"
#include "fnv.cuh"
#define SHUFFLE_MIN_VER 300
#define ACCESSES 64
#define THREADS_PER_HASH (128 / 16)
// Full Ethash hash of one nonce, computed cooperatively by a group of
// THREADS_PER_HASH (8) threads exchanging data via warp shuffles instead
// of shared memory (SM30+ path, see SHUFFLE_MIN_VER).
// NOTE(review): mask-less __shfl was removed for Volta+ toolchains; this
// path would need __shfl_sync there — confirm target architectures.
__device__ uint64_t compute_hash_shuffle(
	uint64_t nonce
	)
{
	// sha3_512(header .. nonce) — every thread keccaks its own nonce.
	uint2 state[25];
	state[4] = vectorize(nonce);
	keccak_f1600_init(state);

	// Threads work together in this phase in groups of 8.
	const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
	const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
	const int mix_idx = thread_id & 3;  // mix lanes repeat every 4 threads

	uint4 mix;
	uint2 shuffle[8];

	// Iteration i processes the hash whose keccak state lives in lane i
	// of the group.
	for (int i = 0; i < THREADS_PER_HASH; i++)
	{
		// share init among threads: broadcast lane i's 64-byte state.
		for (int j = 0; j < 8; j++) {
			shuffle[j].x = __shfl(state[j].x, i, THREADS_PER_HASH);
			shuffle[j].y = __shfl(state[j].y, i, THREADS_PER_HASH);
		}

		// ugly but avoids local reads/writes: each thread picks its
		// 16-byte lane of the 128-byte mix by branching on mix_idx.
		if (mix_idx < 2) {
			if (mix_idx == 0)
				mix = vectorize2(shuffle[0], shuffle[1]);
			else
				mix = vectorize2(shuffle[2], shuffle[3]);
		}
		else {
			if (mix_idx == 2)
				mix = vectorize2(shuffle[4], shuffle[5]);
			else
				mix = vectorize2(shuffle[6], shuffle[7]);
		}

		// First word of the mix, broadcast from the group's first lane.
		uint32_t init0 = __shfl(shuffle[0].x, start_lane);

		for (uint32_t a = 0; a < ACCESSES; a += 4)
		{
			// Round-robin: lane t computes the DAG index for the next
			// 4 accesses.
			int t = ((a >> 2) & (THREADS_PER_HASH - 1));

			for (uint32_t b = 0; b < 4; b++)
			{
				if (thread_id == t)
				{
					shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
				}
				// Broadcast the index, then every lane fetches its uint4.
				shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH);

				mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
			}
		}

		// Fold each lane's 16-byte mix to 4 bytes; the group's 8 words
		// form the 32-byte mix digest of hash i.
		uint32_t thread_mix = fnv_reduce(mix);

		// update mix across threads: gather the 8 digest words into
		// 4 uint2s on every lane.
		shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
		shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
		shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
		shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH);
		shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH);
		shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH);
		shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
		shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);

		if (i == thread_id) {
			//move mix into state: owner lane keeps its digest.
			state[8] = shuffle[0];
			state[9] = shuffle[1];
			state[10] = shuffle[2];
			state[11] = shuffle[3];
		}
	}

	// keccak_256(keccak_512(header..nonce) .. mix);
	return keccak_f1600_final(state);
}
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
#include "dagger_shared.cuh"
#else
#include "dagger_shuffled.cuh"
#endif
__global__ void
__launch_bounds__(896, 1)
@@ -107,7 +24,7 @@ ethash_search(
)
{
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
uint64_t hash = compute_hash_shuffle(start_nonce + gid);
uint64_t hash = compute_hash(start_nonce + gid);
if (cuda_swab64(hash) > d_target) return;
uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
g_output[index] = gid;
@@ -122,7 +39,7 @@ void run_ethash_search(
uint64_t start_nonce
)
{
ethash_search <<<blocks, threads, 0, stream >>>(g_output, start_nonce);
ethash_search << <blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce);
CUDA_SAFE_CALL(cudaGetLastError());
}

2
libethash-cuda/ethash_cuda_miner_kernel.h

@@ -6,6 +6,8 @@
#include <cuda_runtime.h>
#define SEARCH_RESULT_BUFFER_SIZE 64
#define ACCESSES 64
#define THREADS_PER_HASH (128 / 16)
typedef struct
{

Loading…
Cancel
Save