From 63ec7023b898f15792ad7b6bb9badf49d633e78e Mon Sep 17 00:00:00 2001
From: "David Li (Engrg-Hardware 1)" <davili@shcompute-h.nvidia.com>
Date: Mon, 15 May 2017 15:24:25 +0800
Subject: [PATCH 1/6] The optimized code by Nvidia Architecturer. the
 performance is improved from 'min/mean/max: 22369621/22579336/22719146 H/s'
 to 'min/mean/max: 23767722/23907532/24117248 H/s' on a flashed GTX 1060 with
 2 GPCs 9 TPCs (the product chip should have 10 TPCs). Note that the code is
 tested on the code pulled from May-11. The current code from github cannot
 generate reasonable scores ('min/max/avg is 0/0/0 H/s')

---
 libethash-cuda/dagger_shuffled.cuh         | 104 +++++++++++----------
 libethash-cuda/ethash_cuda_miner_kernel.cu |   4 +-
 libethash-cuda/keccak.cuh                  |  19 +++-
 3 files changed, 74 insertions(+), 53 deletions(-)

diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh
index b3a443d23..9d5754585 100644
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@@ -2,12 +2,14 @@
 #include "ethash_cuda_miner_kernel.h"
 #include "cuda_helper.h"
 
+#define PARALLEL_HASH 4
+
 __device__ uint64_t compute_hash(
 	uint64_t nonce
 	)
 {
 	// sha3_512(header .. nonce)
-	uint2 state[25];
+	uint2 state[12];
 	
 	state[4] = vectorize(nonce);
 
@@ -17,32 +19,30 @@ __device__ uint64_t compute_hash(
 	const int thread_id  = threadIdx.x &  (THREADS_PER_HASH - 1);
 	const int mix_idx    = thread_id & 3;
 
-	uint4 mix;
-	uint2 shuffle[8];
-	
-	for (int i = 0; i < THREADS_PER_HASH; i++)
+	for (int i = 0; i < THREADS_PER_HASH; i += PARALLEL_HASH)
 	{
+		uint4 mix[PARALLEL_HASH];
+		uint32_t offset[PARALLEL_HASH];
+		uint32_t init0[PARALLEL_HASH];
+	
 		// share init among threads
-		for (int j = 0; j < 8; j++) {
-			shuffle[j].x = __shfl(state[j].x, i, THREADS_PER_HASH);
-			shuffle[j].y = __shfl(state[j].y, i, THREADS_PER_HASH);
-		}
-
-		// ugly but avoids local reads/writes
-		if (mix_idx < 2) {
-			if (mix_idx == 0)
-				mix = vectorize2(shuffle[0], shuffle[1]);
-			else
-				mix = vectorize2(shuffle[2], shuffle[3]);
-		}
-		else  {
-			if (mix_idx == 2)
-				mix = vectorize2(shuffle[4], shuffle[5]);
-			else
-				mix = vectorize2(shuffle[6], shuffle[7]);
+		for (int p = 0; p < PARALLEL_HASH; p++)
+		{
+			uint2 shuffle[8];
+			for (int j = 0; j < 8; j++) 
+			{
+				shuffle[j].x = __shfl(state[j].x, i+p, THREADS_PER_HASH);
+				shuffle[j].y = __shfl(state[j].y, i+p, THREADS_PER_HASH);
+			}
+			switch (mix_idx)
+			{
+				case 0: mix[p] = vectorize2(shuffle[0], shuffle[1]); break;
+				case 1: mix[p] = vectorize2(shuffle[2], shuffle[3]); break;
+				case 2: mix[p] = vectorize2(shuffle[4], shuffle[5]); break;
+				case 3: mix[p] = vectorize2(shuffle[6], shuffle[7]); break;
+			}
+			init0[p] = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);
 		}
-		
-		uint32_t init0 = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);
 
 		for (uint32_t a = 0; a < ACCESSES; a += 4)
 		{
@@ -50,37 +50,45 @@ __device__ uint64_t compute_hash(
 
 			for (uint32_t b = 0; b < 4; b++)
 			{
-				if (thread_id == t)
-				{	
-					shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
+				for (int p = 0; p < PARALLEL_HASH; p++)
+				{
+					offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size;
+					offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
+				}
+				#pragma unroll
+				for (int p = 0; p < PARALLEL_HASH; p++)
+				{
+					mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]);
 				}
-				shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH);
-				mix = fnv4(mix, d_dag[shuffle[0].x].uint4s[thread_id]);
 			}
 		}
 
-		uint32_t thread_mix = fnv_reduce(mix);
+		for (int p = 0; p < PARALLEL_HASH; p++)
+		{
+			uint2 shuffle[4];
+			uint32_t thread_mix = fnv_reduce(mix[p]);
+
+			// update mix accross threads
 
-		// update mix accross threads
-		
-		shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
-		shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
-		shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
-		shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH);
-		shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH);
-		shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH);
-		shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
-		shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);
-		
-		if (i == thread_id) {
-			//move mix into state:
-			state[8] = shuffle[0];
-			state[9] = shuffle[1];
-			state[10] = shuffle[2];
-			state[11] = shuffle[3];
+			shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
+			shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
+			shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
+			shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH);
+			shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH);
+			shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH);
+			shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
+			shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);
+
+			if ((i+p) == thread_id) {
+				//move mix into state:
+				state[8] = shuffle[0];
+				state[9] = shuffle[1];
+				state[10] = shuffle[2];
+				state[11] = shuffle[3];
+			}
 		}
 	}
 	
 	// keccak_256(keccak_512(header..nonce) .. mix);
 	return keccak_f1600_final(state);
-}
\ No newline at end of file
+}
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index e16239699..b128258dd 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -26,7 +26,7 @@
 #endif
 
 __global__ void 
-__launch_bounds__(TPB, BPSM)
+//__launch_bounds__(TPB, BPSM)
 ethash_search(
 	volatile uint32_t* g_output,
 	uint64_t start_nonce
@@ -56,7 +56,7 @@ void run_ethash_search(
 #define NODE_WORDS (64/4)
 
 __global__ void
-__launch_bounds__(128, 7)
+//__launch_bounds__(128, 7)
 ethash_calculate_dag_item(uint32_t start)
 {
 	uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libethash-cuda/keccak.cuh b/libethash-cuda/keccak.cuh
index 86969e014..8690b1805 100644
--- a/libethash-cuda/keccak.cuh
+++ b/libethash-cuda/keccak.cuh
@@ -25,10 +25,13 @@ uint2 chi(const uint2 a, const uint2 b, const uint2 c) {
 	return a ^ (~b) & c;
 }
 
-__device__ __forceinline__ void keccak_f1600_init(uint2* s)
+__device__ __forceinline__ void keccak_f1600_init(uint2* state)
 {
+	uint2 s[25];
 	uint2 t[5], u, v;
 
+	s[4] = state[4];
+
 	devectorize2(d_header.uint4s[0], s[0], s[1]);
 	devectorize2(d_header.uint4s[1], s[2], s[3]);
 
@@ -328,12 +331,22 @@ __device__ __forceinline__ void keccak_f1600_init(uint2* s)
 
 	/* iota: a[0,0] ^= round constant */
 	s[0] ^= vectorize(keccak_round_constants[23]);
+
+	for(uint32_t i=0; i<12; i++)
+	{
+	    state[i] = s[i];
+	}
 }
 
-__device__ __forceinline__ uint64_t keccak_f1600_final(uint2* s)
+__device__ __forceinline__ uint64_t keccak_f1600_final(uint2* state)
 {
+	uint2 s[25];
 	uint2 t[5], u, v;
 
+	for (uint32_t i = 0; i<12; i++)
+	{
+		s[i] = state[i];
+	}
 	for (uint32_t i = 12; i < 25; i++)
 	{
 		s[i] = make_uint2(0, 0);
@@ -774,4 +787,4 @@ __device__ __forceinline__ void SHA3_512(uint2* s) {
 
 	/* iota: a[0,0] ^= round constant */
 	s[0] ^= vectorize(keccak_round_constants[23]);
-}
\ No newline at end of file
+}

From 2f945b22162beaf6c1af1d46d1061f82dc8a5928 Mon Sep 17 00:00:00 2001
From: "David Li (Engrg-Hardware 1)" <davili@shcompute-h.nvidia.com>
Date: Tue, 16 May 2017 17:56:29 +0800
Subject: [PATCH 2/6] 1. delete launch bound in ethash_cuda_miner_kernel.cu 2.
 re-format the for loop in keccak.cuh

---
 libethash-cuda/ethash_cuda_miner_kernel.cu | 2 --
 libethash-cuda/keccak.cuh                  | 9 +++------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index b128258dd..5bd5ac268 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -26,7 +26,6 @@
 #endif
 
 __global__ void 
-//__launch_bounds__(TPB, BPSM)
 ethash_search(
 	volatile uint32_t* g_output,
 	uint64_t start_nonce
@@ -56,7 +55,6 @@ void run_ethash_search(
 #define NODE_WORDS (64/4)
 
 __global__ void
-//__launch_bounds__(128, 7)
 ethash_calculate_dag_item(uint32_t start)
 {
 	uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/libethash-cuda/keccak.cuh b/libethash-cuda/keccak.cuh
index 8690b1805..65a19be7c 100644
--- a/libethash-cuda/keccak.cuh
+++ b/libethash-cuda/keccak.cuh
@@ -332,10 +332,8 @@ __device__ __forceinline__ void keccak_f1600_init(uint2* state)
 	/* iota: a[0,0] ^= round constant */
 	s[0] ^= vectorize(keccak_round_constants[23]);
 
-	for(uint32_t i=0; i<12; i++)
-	{
+	for(int i=0; i<12; ++i)
 	    state[i] = s[i];
-	}
 }
 
 __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* state)
@@ -343,10 +341,9 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* state)
 	uint2 s[25];
 	uint2 t[5], u, v;
 
-	for (uint32_t i = 0; i<12; i++)
-	{
+	for (int i = 0; i<12; ++i)
 		s[i] = state[i];
-	}
+
 	for (uint32_t i = 12; i < 25; i++)
 	{
 		s[i] = make_uint2(0, 0);

From de2c1e3485b6c0361cbc270f31b26ac9bb2b2854 Mon Sep 17 00:00:00 2001
From: "David Li (Engrg-Hardware 1)" <davili@shcompute-h.nvidia.com>
Date: Tue, 16 May 2017 18:21:44 +0800
Subject: [PATCH 3/6] delete dead code resulting from launch_bound deleting

---
 libethash-cuda/ethash_cuda_miner_kernel.cu | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 5bd5ac268..ac607c3a2 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -16,13 +16,9 @@
 #if __CUDA_ARCH__ < SHUFFLE_MIN_VER
 #include "keccak_u64.cuh"
 #include "dagger_shared.cuh"
-#define TPB		128
-#define BPSM	4
 #else
 #include "keccak.cuh"
 #include "dagger_shuffled.cuh"
-#define TPB		896
-#define BPSM	1
 #endif
 
 __global__ void 

From c99b693d9b18b9e514e8013d672cea0081b94e8c Mon Sep 17 00:00:00 2001
From: davilizh <davili@nvidia.com>
Date: Wed, 17 May 2017 14:47:49 +0800
Subject: [PATCH 4/6] add space between equations in the for loop

---
 libethash-cuda/keccak.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libethash-cuda/keccak.cuh b/libethash-cuda/keccak.cuh
index 65a19be7c..ede8ecf54 100644
--- a/libethash-cuda/keccak.cuh
+++ b/libethash-cuda/keccak.cuh
@@ -332,7 +332,7 @@ __device__ __forceinline__ void keccak_f1600_init(uint2* state)
 	/* iota: a[0,0] ^= round constant */
 	s[0] ^= vectorize(keccak_round_constants[23]);
 
-	for(int i=0; i<12; ++i)
+	for(int i = 0; i < 12; ++i)
 	    state[i] = s[i];
 }
 
@@ -341,7 +341,7 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* state)
 	uint2 s[25];
 	uint2 t[5], u, v;
 
-	for (int i = 0; i<12; ++i)
+	for (int i = 0; i < 12; ++i)
 		s[i] = state[i];
 
 	for (uint32_t i = 12; i < 25; i++)

From 73fc65daf97840f61fdcd292ac42ccb54c7f1553 Mon Sep 17 00:00:00 2001
From: davilizh <davili@nvidia.com>
Date: Tue, 27 Jun 2017 14:49:40 +0800
Subject: [PATCH 5/6] add switch --cuda-parallel-hash to enable and disable the
 parallel-hash optimization

---
 ethminer/MinerAux.h                        | 20 ++++++++++++++++
 libethash-cuda/dagger_shared.cuh           |  5 ++--
 libethash-cuda/dagger_shuffled.cuh         | 25 +++++++++++--------
 libethash-cuda/ethash_cuda_miner.cpp       |  8 ++++++-
 libethash-cuda/ethash_cuda_miner.h         |  4 +++-
 libethash-cuda/ethash_cuda_miner_kernel.cu | 28 +++++++++++++++++++---
 libethash-cuda/ethash_cuda_miner_kernel.h  |  3 ++-
 libethcore/EthashCUDAMiner.cpp             |  7 +++++-
 libethcore/EthashCUDAMiner.h               |  1 +
 9 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h
index 271e9df91..292fd0a93 100644
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@@ -309,6 +309,22 @@ public:
 				}
 			}
 		}
+                else if (arg == "--cuda-parallel-hash")
+                {
+                        try {
+                                m_parallelHash = stol(argv[++i]);
+                                if(m_parallelHash == 0 || m_parallelHash>8)
+                                {
+                                        cerr << "Bad " << arg << " option: " << argv[i] << endl;
+                                        BOOST_THROW_EXCEPTION(BadArgument());
+                                }
+                        }
+                        catch (...)
+                        {
+                                cerr << "Bad " << arg << " option: " << argv[i] << endl;
+                                BOOST_THROW_EXCEPTION(BadArgument());
+                        }
+                }
 		else if (arg == "--cuda-schedule" && i + 1 < argc)
 		{
 			string mode = argv[++i];
@@ -511,6 +527,8 @@ public:
 				m_dagCreateDevice
 				))
 				exit(1);
+
+                        EthashCUDAMiner::setParallelHash(m_parallelHash);
 #else
 			cerr << "Selected CUDA mining without having compiled with -DETHASHCUDA=1 or -DBUNDLE=cudaminer" << endl;
 			exit(1);
@@ -589,6 +607,7 @@ public:
 			<< "        yield - Instruct CUDA to yield its thread when waiting for results from the device." << endl
 			<< "        sync  - Instruct CUDA to block the CPU thread on a synchronization primitive when waiting for the results from the device." << endl
 			<< "    --cuda-devices <0 1 ..n> Select which CUDA GPUs to mine on. Default is to use all" << endl
+			<< "    --cude-parallel-hash <1 2 ..8> Define how many hashes to calculate in a kernel, can be scaled to achive better performance. Default=4" << endl
 #endif
 			;
 	}
@@ -1035,6 +1054,7 @@ private:
 	unsigned m_dagCreateDevice = 0;
 	/// Benchmarking params
 	unsigned m_benchmarkWarmup = 15;
+	unsigned m_parallelHash    = 4;
 	unsigned m_benchmarkTrial = 3;
 	unsigned m_benchmarkTrials = 5;
 	unsigned m_benchmarkBlock = 0;
diff --git a/libethash-cuda/dagger_shared.cuh b/libethash-cuda/dagger_shared.cuh
index 52fdc6a4f..89bd0327c 100644
--- a/libethash-cuda/dagger_shared.cuh
+++ b/libethash-cuda/dagger_shared.cuh
@@ -8,6 +8,7 @@ typedef union {
 } compute_hash_share;
 
 
+template <uint32_t _PARALLEL_HASH>
 __device__ uint64_t compute_hash(
 	uint64_t nonce
 	)
@@ -65,7 +66,7 @@ __device__ uint64_t compute_hash(
 
 		__syncthreads();
 	}
-	
+
 	// keccak_256(keccak_512(header..nonce) .. mix);
 	return keccak_f1600_final(state);
-}
\ No newline at end of file
+}
diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh
index 9d5754585..e841c3d92 100644
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@@ -2,9 +2,10 @@
 #include "ethash_cuda_miner_kernel.h"
 #include "cuda_helper.h"
 
-#define PARALLEL_HASH 4
+//#define PARALLEL_HASH 4
 
-__device__ uint64_t compute_hash(
+template <uint32_t _PARALLEL_HASH>
+__device__ __forceinline__ uint64_t compute_hash(
 	uint64_t nonce
 	)
 {
@@ -19,14 +20,14 @@ __device__ uint64_t compute_hash(
 	const int thread_id  = threadIdx.x &  (THREADS_PER_HASH - 1);
 	const int mix_idx    = thread_id & 3;
 
-	for (int i = 0; i < THREADS_PER_HASH; i += PARALLEL_HASH)
+	for (int i = 0; i < THREADS_PER_HASH; i += _PARALLEL_HASH)
 	{
-		uint4 mix[PARALLEL_HASH];
-		uint32_t offset[PARALLEL_HASH];
-		uint32_t init0[PARALLEL_HASH];
+		uint4 mix[_PARALLEL_HASH];
+		uint32_t offset[_PARALLEL_HASH];
+		uint32_t init0[_PARALLEL_HASH];
 	
 		// share init among threads
-		for (int p = 0; p < PARALLEL_HASH; p++)
+		for (int p = 0; p < _PARALLEL_HASH; p++)
 		{
 			uint2 shuffle[8];
 			for (int j = 0; j < 8; j++) 
@@ -50,20 +51,24 @@ __device__ uint64_t compute_hash(
 
 			for (uint32_t b = 0; b < 4; b++)
 			{
-				for (int p = 0; p < PARALLEL_HASH; p++)
+				for (int p = 0; p < _PARALLEL_HASH; p++)
 				{
 					offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size;
 					offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
 				}
 				#pragma unroll
-				for (int p = 0; p < PARALLEL_HASH; p++)
+				for (int p = 0; p < _PARALLEL_HASH; p++)
 				{
+                                        //if(blockIdx.x == 0 && threadIdx.x==0 && offset[p] > (d_dag_size>>1)) //larger than half
+                                        //    printf("d_dag_size = %d offset[p] = %d\n", d_dag_size, offset[p]);
 					mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]);
 				}
+
+                                
 			}
 		}
 
-		for (int p = 0; p < PARALLEL_HASH; p++)
+		for (int p = 0; p < _PARALLEL_HASH; p++)
 		{
 			uint2 shuffle[4];
 			uint32_t thread_mix = fnv_reduce(mix[p]);
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 37c39e4eb..a7d8b4a03 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -180,7 +180,13 @@ bool ethash_cuda_miner::configureGPU(
 	}
 }
 
+void ethash_cuda_miner::setParallelHash(unsigned _parallelHash)
+{
+  m_parallelHash = _parallelHash;
+}
+
 unsigned ethash_cuda_miner::s_extraRequiredGPUMem;
+unsigned ethash_cuda_miner::m_parallelHash = 4;
 unsigned ethash_cuda_miner::s_blockSize = ethash_cuda_miner::c_defaultBlockSize;
 unsigned ethash_cuda_miner::s_gridSize = ethash_cuda_miner::c_defaultGridSize;
 unsigned ethash_cuda_miner::s_numStreams = ethash_cuda_miner::c_defaultNumStreams;
@@ -363,7 +369,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
 			for (unsigned int j = 0; j < found_count; j++)
 				nonces[j] = nonce_base + buffer[j + 1];
 		}
-		run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce);
+		run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce, m_parallelHash);
 		if (m_current_index >= s_numStreams)
 		{
 			exit = found_count && hook.found(nonces, found_count);
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 4307770a0..80aa08ce5 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -34,6 +34,7 @@ public:
 		unsigned _scheduleFlag,
 		uint64_t _currentBlock
 		);
+        static void setParallelHash(unsigned _parallelHash);
 
 	bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG);
 
@@ -72,4 +73,5 @@ private:
 	/// GPU memory required for other things, like window rendering e.t.c.
 	/// User can set it via the --cl-extragpu-mem argument.
 	static unsigned s_extraRequiredGPUMem;
-};
\ No newline at end of file
+	static unsigned m_parallelHash;
+};
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index ac607c3a2..70dcad9a4 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -21,6 +21,7 @@
 #include "dagger_shuffled.cuh"
 #endif
 
+template <uint32_t _PARALLEL_HASH>
 __global__ void 
 ethash_search(
 	volatile uint32_t* g_output,
@@ -28,7 +29,7 @@ ethash_search(
 	)
 {
 	uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;	
-	uint64_t hash = compute_hash(start_nonce + gid);
+        uint64_t hash = compute_hash<_PARALLEL_HASH>(start_nonce + gid);
 	if (cuda_swab64(hash) > d_target) return;
 	uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
 	g_output[index] = gid;
@@ -40,16 +41,37 @@ void run_ethash_search(
 	uint32_t sharedbytes,
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
-	uint64_t start_nonce
+	uint64_t start_nonce,
+        uint32_t parallelHash
 )
 {
-	ethash_search << <blocks, threads, sharedbytes, stream >> >(g_output, start_nonce);
+
+        //printf("parallelHash = %d\n", parallelHash);
+        if(parallelHash == 1)
+            ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 2)
+            ethash_search <2> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 3)
+            ethash_search <3> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 4)
+            ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 5)
+            ethash_search <5> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 6)
+            ethash_search <6> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 7)
+            ethash_search <7> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 8)
+            ethash_search <8> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else
+            ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); 
 	CUDA_SAFE_CALL(cudaGetLastError());
 }
 
 #define ETHASH_DATASET_PARENTS 256
 #define NODE_WORDS (64/4)
 
+
 __global__ void
 ethash_calculate_dag_item(uint32_t start)
 {
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.h b/libethash-cuda/ethash_cuda_miner_kernel.h
index ebb24dd75..93f83bb45 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel.h
@@ -52,7 +52,8 @@ void run_ethash_search(
 	uint32_t sharedbytes,
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
-	uint64_t start_nonce
+	uint64_t start_nonce,
+        uint32_t parallelHash
 	);
 
 void ethash_generate_dag(
diff --git a/libethcore/EthashCUDAMiner.cpp b/libethcore/EthashCUDAMiner.cpp
index 516628b18..1f2868ee6 100644
--- a/libethcore/EthashCUDAMiner.cpp
+++ b/libethcore/EthashCUDAMiner.cpp
@@ -261,4 +261,9 @@ bool EthashCUDAMiner::configureGPU(
 	return true;
 }
 
-#endif
\ No newline at end of file
+void EthashCUDAMiner::setParallelHash(unsigned _parallelHash)
+{
+  ethash_cuda_miner::setParallelHash(_parallelHash);
+}
+
+#endif
diff --git a/libethcore/EthashCUDAMiner.h b/libethcore/EthashCUDAMiner.h
index e2018af27..f6314c348 100644
--- a/libethcore/EthashCUDAMiner.h
+++ b/libethcore/EthashCUDAMiner.h
@@ -51,6 +51,7 @@ class EthashCUDAHook;
 		static std::string platformInfo();
 		static unsigned getNumDevices();
 		static void listDevices();
+                static void setParallelHash(unsigned _parallelHash);
 		static bool configureGPU(
 			unsigned _blockSize,
 			unsigned _gridSize,

From 41c8e69a8f2bceba1af833bd1357f2c752b95299 Mon Sep 17 00:00:00 2001
From: davilizh <davili@nvidia.com>
Date: Tue, 27 Jun 2017 16:50:22 +0800
Subject: [PATCH 6/6] Change the code according to Chafast's suggestions in
 pull request #18. 1) change if..else if..else if to switch. 2)change spaces
 to tab. 3) add additioanl arg check. 4) delete commented out code. 5)  throw
 any exception to avoid code duplication.

---
 ethminer/MinerAux.h                        |  7 ++---
 libethash-cuda/dagger_shuffled.cuh         |  2 --
 libethash-cuda/ethash_cuda_miner_kernel.cu | 34 +++++++++-------------
 libethash-cuda/ethash_cuda_miner_kernel.h  |  2 +-
 4 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h
index 292fd0a93..b0c6b03e8 100644
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@@ -309,14 +309,13 @@ public:
 				}
 			}
 		}
-                else if (arg == "--cuda-parallel-hash")
+                else if (arg == "--cuda-parallel-hash" && i + 1 < argc)
                 {
                         try {
                                 m_parallelHash = stol(argv[++i]);
-                                if(m_parallelHash == 0 || m_parallelHash>8)
+                                if (m_parallelHash == 0 || m_parallelHash > 8)
                                 {
-                                        cerr << "Bad " << arg << " option: " << argv[i] << endl;
-                                        BOOST_THROW_EXCEPTION(BadArgument());
+                                    throw BadArgument();
                                 }
                         }
                         catch (...)
diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh
index e841c3d92..4497ad699 100644
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@@ -2,8 +2,6 @@
 #include "ethash_cuda_miner_kernel.h"
 #include "cuda_helper.h"
 
-//#define PARALLEL_HASH 4
-
 template <uint32_t _PARALLEL_HASH>
 __device__ __forceinline__ uint64_t compute_hash(
 	uint64_t nonce
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 70dcad9a4..6092f267a 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -42,29 +42,21 @@ void run_ethash_search(
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
 	uint64_t start_nonce,
-        uint32_t parallelHash
+	uint32_t parallelHash
 )
 {
-
-        //printf("parallelHash = %d\n", parallelHash);
-        if(parallelHash == 1)
-            ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
-        else if(parallelHash == 2)
-            ethash_search <2> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
-        else if(parallelHash == 3)
-            ethash_search <3> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
-        else if(parallelHash == 4)
-            ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
-        else if(parallelHash == 5)
-            ethash_search <5> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
-        else if(parallelHash == 6)
-            ethash_search <6> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
-        else if(parallelHash == 7)
-            ethash_search <7> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
-        else if(parallelHash == 8)
-            ethash_search <8> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
-        else
-            ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); 
+	switch (parallelHash)
+	{
+		case 1: ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		case 2: ethash_search <2> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		case 3: ethash_search <3> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		case 4: ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		case 5: ethash_search <5> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		case 6: ethash_search <6> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		case 7: ethash_search <7> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		case 8: ethash_search <8> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		default: ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+	}
 	CUDA_SAFE_CALL(cudaGetLastError());
 }
 
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.h b/libethash-cuda/ethash_cuda_miner_kernel.h
index 93f83bb45..c219308ff 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel.h
@@ -53,7 +53,7 @@ void run_ethash_search(
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
 	uint64_t start_nonce,
-        uint32_t parallelHash
+	uint32_t parallelHash
 	);
 
 void ethash_generate_dag(