Browse Source

working cuda kernel with shared mem

cl-refactor
Genoil 9 years ago
parent
commit
89cedad4a6
  1. 78
      libethash-cuda/dagger_shared.cuh
  2. 7
      libethash-cuda/dagger_shuffled.cuh
  3. 99
      libethash-cuda/ethash_cuda_miner_kernel.cu
  4. 2
      libethash-cuda/ethash_cuda_miner_kernel.h

78
libethash-cuda/dagger_shared.cuh

@@ -0,0 +78 @@
#include "ethash_cuda_miner_kernel_globals.h"
#include "ethash_cuda_miner_kernel.h"
#include "keccak.cuh"
#include "fnv.cuh"
// Element-wise copy of `count` items from src to dst.
// Wrapped in do { } while (0) so the macro behaves as a single statement
// after a braceless if/else; the index is named copy_i_ so it cannot
// collide with (or shadow) an `i` in scope at the expansion site, as
// happens when it is expanded inside the per-hash loops below.
#define copy(dst, src, count) do { for (int copy_i_ = 0; copy_i_ != (count); ++copy_i_) { (dst)[copy_i_] = (src)[copy_i_]; } } while (0)
// 64-byte per-hash shared-memory slot. The union lets the same storage be
// addressed as 4x uint4 (mix lanes), 8x uint2 (keccak-state words) or
// 16x uint32_t (index broadcast / fnv_reduce results) during the different
// exchange phases of compute_hash().
typedef union {
	uint4 uint4s[4];
	uint2 uint2s[8];
	uint32_t uints[16];
} compute_hash_share;
// Full Ethash hash of one nonce, computed cooperatively by a group of
// THREADS_PER_HASH (8) threads that exchange data through shared memory.
// Requires (blockDim.x / THREADS_PER_HASH) * sizeof(compute_hash_share)
// bytes of dynamic shared memory at launch, and blockDim.x to be a
// multiple of THREADS_PER_HASH.
__device__ uint64_t compute_hash(
	uint64_t nonce
	)
{
	// sha3_512(header .. nonce) — every thread keccaks its own nonce.
	uint2 state[25];
	state[4] = vectorize(nonce);
	keccak_f1600_init(state);

	// Threads work together in this phase in groups of 8.
	const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
	const int hash_id = threadIdx.x >> 3;  // one shared slot per 8-thread group

	extern __shared__ compute_hash_share share[];

	// Iteration i processes the hash whose keccak state lives in thread i
	// of the group; all 8 threads cooperate on it.
	for (int i = 0; i < THREADS_PER_HASH; i++)
	{
		// Publish thread i's 64-byte keccak state to the group.
		if (i == thread_id)
			copy(share[hash_id].uint2s, state, 8);

		__syncthreads();

		// Each thread takes one 16-byte lane of the 128-byte mix
		// (lanes repeat every 4 threads). Declared here — there is no
		// outer `mix`; each phase starts from the shared state.
		uint4 mix = share[hash_id].uint4s[thread_id & 3];
		__syncthreads();

		uint32_t *share0 = share[hash_id].uints;

		// First word of the mix, broadcast from thread 0 of the group.
		if (thread_id == 0)
			*share0 = mix.x;
		__syncthreads();
		uint32_t init0 = *share0;

		for (uint32_t a = 0; a < ACCESSES; a += 4)
		{
			// Round-robin: one thread of the group computes the DAG index
			// for the next 4 accesses.
			bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));

			for (uint32_t b = 0; b != 4; ++b)
			{
				if (update_share)
				{
					*share0 = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
				}
				__syncthreads();

				mix = fnv4(mix, d_dag[*share0].uint4s[thread_id]);
				// Barrier before the next index overwrites *share0;
				// without it a fast thread races threads still reading
				// the current index (only implicit pre-Volta warp
				// lockstep hides the race).
				__syncthreads();
			}
		}

		// Fold each thread's 16-byte lane to 4 bytes; the group's 8 words
		// form the 32-byte mix digest of hash i.
		share[hash_id].uints[thread_id] = fnv_reduce(mix);
		__syncthreads();

		// Owner thread pulls the digest back into its keccak state.
		if (i == thread_id)
			copy(state + 8, share[hash_id].uint2s, 4);

		__syncthreads();
	}

	// keccak_256(keccak_512(header..nonce) .. mix);
	return keccak_f1600_final(state);
}

7
libethash-cuda/dagger_shuffled.cuh

@@ -3,10 +3,11 @@
#include "keccak.cuh"
#include "fnv.cuh"
#define ACCESSES 64
#define THREADS_PER_HASH (128 / 16)
#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
__device__ uint64_t compute_hash_shuffle(
// Dummy 1-byte type: the shuffled path needs no shared memory, but the
// launch site sizes its dynamic smem from sizeof(compute_hash_share),
// so this keeps that allocation minimal. (Presumably intentional — TODO
// confirm the launch site is the only user of this name.)
typedef bool compute_hash_share;
__device__ uint64_t compute_hash(
uint64_t nonce
)
{

99
libethash-cuda/ethash_cuda_miner_kernel.cu

@@ -8,96 +8,13 @@
#include "ethash_cuda_miner_kernel_globals.h"
#include "cuda_helper.h"
#include "keccak.cuh"
#include "fnv.cuh"
#define SHUFFLE_MIN_VER 300
#define ACCESSES 64
#define THREADS_PER_HASH (128 / 16)
// Full Ethash hash of one nonce, computed cooperatively by a group of
// THREADS_PER_HASH (8) threads exchanging data via warp shuffles instead
// of shared memory (SM30+ path, see SHUFFLE_MIN_VER).
// NOTE(review): mask-less __shfl was removed for Volta+ toolchains; this
// path would need __shfl_sync there — confirm target architectures.
__device__ uint64_t compute_hash_shuffle(
	uint64_t nonce
	)
{
	// sha3_512(header .. nonce) — every thread keccaks its own nonce.
	uint2 state[25];
	state[4] = vectorize(nonce);
	keccak_f1600_init(state);

	// Threads work together in this phase in groups of 8.
	const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
	const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
	const int mix_idx = thread_id & 3;  // mix lanes repeat every 4 threads

	uint4 mix;
	uint2 shuffle[8];

	// Iteration i processes the hash whose keccak state lives in lane i
	// of the group.
	for (int i = 0; i < THREADS_PER_HASH; i++)
	{
		// share init among threads: broadcast lane i's 64-byte state.
		for (int j = 0; j < 8; j++) {
			shuffle[j].x = __shfl(state[j].x, i, THREADS_PER_HASH);
			shuffle[j].y = __shfl(state[j].y, i, THREADS_PER_HASH);
		}

		// ugly but avoids local reads/writes: each thread picks its
		// 16-byte lane of the 128-byte mix by branching on mix_idx.
		if (mix_idx < 2) {
			if (mix_idx == 0)
				mix = vectorize2(shuffle[0], shuffle[1]);
			else
				mix = vectorize2(shuffle[2], shuffle[3]);
		}
		else {
			if (mix_idx == 2)
				mix = vectorize2(shuffle[4], shuffle[5]);
			else
				mix = vectorize2(shuffle[6], shuffle[7]);
		}

		// First word of the mix, broadcast from the group's first lane.
		uint32_t init0 = __shfl(shuffle[0].x, start_lane);

		for (uint32_t a = 0; a < ACCESSES; a += 4)
		{
			// Round-robin: lane t computes the DAG index for the next
			// 4 accesses.
			int t = ((a >> 2) & (THREADS_PER_HASH - 1));

			for (uint32_t b = 0; b < 4; b++)
			{
				if (thread_id == t)
				{
					shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
				}
				// Broadcast the index, then every lane fetches its uint4.
				shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH);

				mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
			}
		}

		// Fold each lane's 16-byte mix to 4 bytes; the group's 8 words
		// form the 32-byte mix digest of hash i.
		uint32_t thread_mix = fnv_reduce(mix);

		// update mix across threads: gather the 8 digest words into
		// 4 uint2s on every lane.
		shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
		shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
		shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
		shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH);
		shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH);
		shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH);
		shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
		shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);

		if (i == thread_id) {
			//move mix into state: owner lane keeps its digest.
			state[8] = shuffle[0];
			state[9] = shuffle[1];
			state[10] = shuffle[2];
			state[11] = shuffle[3];
		}
	}

	// keccak_256(keccak_512(header..nonce) .. mix);
	return keccak_f1600_final(state);
}
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
#include "dagger_shared.cuh"
#else
#include "dagger_shuffled.cuh"
#endif
__global__ void
__launch_bounds__(896, 1)
@@ -107,7 +24,7 @@ ethash_search(
)
{
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
uint64_t hash = compute_hash_shuffle(start_nonce + gid);
uint64_t hash = compute_hash(start_nonce + gid);
if (cuda_swab64(hash) > d_target) return;
uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
g_output[index] = gid;
@@ -122,7 +39,7 @@ void run_ethash_search(
uint64_t start_nonce
)
{
ethash_search <<<blocks, threads, 0, stream >>>(g_output, start_nonce);
ethash_search << <blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce);
CUDA_SAFE_CALL(cudaGetLastError());
}

2
libethash-cuda/ethash_cuda_miner_kernel.h

@@ -6,6 +6,8 @@
#include <cuda_runtime.h>
#define SEARCH_RESULT_BUFFER_SIZE 64
#define ACCESSES 64
#define THREADS_PER_HASH (128 / 16)
typedef struct
{

Loading…
Cancel
Save