add switch --cuda-parallel-hash to enable and disable the parallel-hash optimization

8 years ago · 73fc65daf9
9 changed files with 82 additions and 19 deletions
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@ -309,6 +309,22 @@ public:
 				}
 			}
 		}
+                else if (arg == "--cuda-parallel-hash")
+                {
+                        try {
+                                m_parallelHash = stol(argv[++i]);
+                                if(m_parallelHash == 0 || m_parallelHash>8)
+                                {
+                                        cerr << "Bad " << arg << " option: " << argv[i] << endl;
+                                        BOOST_THROW_EXCEPTION(BadArgument());
+                                }
+                        }
+                        catch (...)
+                        {
+                                cerr << "Bad " << arg << " option: " << argv[i] << endl;
+                                BOOST_THROW_EXCEPTION(BadArgument());
+                        }
+                }
 		else if (arg == "--cuda-schedule" && i + 1 < argc)
 		{
 			string mode = argv[++i];
@ -511,6 +527,8 @@ public:
 				m_dagCreateDevice
 				))
 				exit(1);
+
+                        EthashCUDAMiner::setParallelHash(m_parallelHash);
 #else
 			cerr << "Selected CUDA mining without having compiled with -DETHASHCUDA=1 or -DBUNDLE=cudaminer" << endl;
 			exit(1);
@ -589,6 +607,7 @@ public:
 			<< "        yield - Instruct CUDA to yield its thread when waiting for results from the device." << endl
 			<< "        sync  - Instruct CUDA to block the CPU thread on a synchronization primitive when waiting for the results from the device." << endl
 			<< "    --cuda-devices <0 1 ..n> Select which CUDA GPUs to mine on. Default is to use all" << endl
+			<< "    --cude-parallel-hash <1 2 ..8> Define how many hashes to calculate in a kernel, can be scaled to achive better performance. Default=4" << endl
 #endif
 			;
 	}
@ -1035,6 +1054,7 @@ private:
 	unsigned m_dagCreateDevice = 0;
 	/// Benchmarking params
 	unsigned m_benchmarkWarmup = 15;
+	unsigned m_parallelHash    = 4;
 	unsigned m_benchmarkTrial = 3;
 	unsigned m_benchmarkTrials = 5;
 	unsigned m_benchmarkBlock = 0;
--- a/libethash-cuda/dagger_shared.cuh
+++ b/libethash-cuda/dagger_shared.cuh
@ -8,6 +8,7 @@ typedef union {
 } compute_hash_share;


+template <uint32_t _PARALLEL_HASH>
 __device__ uint64_t compute_hash(
 	uint64_t nonce
 	)
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@ -2,9 +2,10 @@
 #include "ethash_cuda_miner_kernel.h"
 #include "cuda_helper.h"

-#define PARALLEL_HASH 4
+//#define PARALLEL_HASH 4

-__device__ uint64_t compute_hash(
+template <uint32_t _PARALLEL_HASH>
+__device__ __forceinline__ uint64_t compute_hash(
 	uint64_t nonce
 	)
 {
@ -19,14 +20,14 @@ __device__ uint64_t compute_hash(
 	const int thread_id  = threadIdx.x &  (THREADS_PER_HASH - 1);
 	const int mix_idx    = thread_id & 3;

-	for (int i = 0; i < THREADS_PER_HASH; i += PARALLEL_HASH)
+	for (int i = 0; i < THREADS_PER_HASH; i += _PARALLEL_HASH)
 	{
-		uint4 mix[PARALLEL_HASH];
-		uint32_t offset[PARALLEL_HASH];
-		uint32_t init0[PARALLEL_HASH];
+		uint4 mix[_PARALLEL_HASH];
+		uint32_t offset[_PARALLEL_HASH];
+		uint32_t init0[_PARALLEL_HASH];
 	
 		// share init among threads
-		for (int p = 0; p < PARALLEL_HASH; p++)
+		for (int p = 0; p < _PARALLEL_HASH; p++)
 		{
 			uint2 shuffle[8];
 			for (int j = 0; j < 8; j++) 
@ -50,20 +51,24 @@ __device__ uint64_t compute_hash(

 			for (uint32_t b = 0; b < 4; b++)
 			{
-				for (int p = 0; p < PARALLEL_HASH; p++)
+				for (int p = 0; p < _PARALLEL_HASH; p++)
 				{
 					offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size;
 					offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
 				}
 				#pragma unroll
-				for (int p = 0; p < PARALLEL_HASH; p++)
+				for (int p = 0; p < _PARALLEL_HASH; p++)
 				{
+                                        //if(blockIdx.x == 0 && threadIdx.x==0 && offset[p] > (d_dag_size>>1)) //larger than half
+                                        //    printf("d_dag_size = %d offset[p] = %d\n", d_dag_size, offset[p]);
 					mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]);
 				}
+
+                                
 			}
 		}

-		for (int p = 0; p < PARALLEL_HASH; p++)
+		for (int p = 0; p < _PARALLEL_HASH; p++)
 		{
 			uint2 shuffle[4];
 			uint32_t thread_mix = fnv_reduce(mix[p]);
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@ -180,7 +180,13 @@ bool ethash_cuda_miner::configureGPU(
 	}
 }

+void ethash_cuda_miner::setParallelHash(unsigned _parallelHash)
+{
+  m_parallelHash = _parallelHash;
+}
+
 unsigned ethash_cuda_miner::s_extraRequiredGPUMem;
+unsigned ethash_cuda_miner::m_parallelHash = 4;
 unsigned ethash_cuda_miner::s_blockSize = ethash_cuda_miner::c_defaultBlockSize;
 unsigned ethash_cuda_miner::s_gridSize = ethash_cuda_miner::c_defaultGridSize;
 unsigned ethash_cuda_miner::s_numStreams = ethash_cuda_miner::c_defaultNumStreams;
@ -363,7 +369,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
 			for (unsigned int j = 0; j < found_count; j++)
 				nonces[j] = nonce_base + buffer[j + 1];
 		}
-		run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce);
+		run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce, m_parallelHash);
 		if (m_current_index >= s_numStreams)
 		{
 			exit = found_count && hook.found(nonces, found_count);
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@ -34,6 +34,7 @@ public:
 		unsigned _scheduleFlag,
 		uint64_t _currentBlock
 		);
+        static void setParallelHash(unsigned _parallelHash);

 	bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG);

@ -72,4 +73,5 @@ private:
 	/// GPU memory required for other things, like window rendering e.t.c.
 	/// User can set it via the --cl-extragpu-mem argument.
 	static unsigned s_extraRequiredGPUMem;
+	static unsigned m_parallelHash;
 };
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@ -21,6 +21,7 @@
 #include "dagger_shuffled.cuh"
 #endif

+template <uint32_t _PARALLEL_HASH>
 __global__ void 
 ethash_search(
 	volatile uint32_t* g_output,
@ -28,7 +29,7 @@ ethash_search(
 	)
 {
 	uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;	
-	uint64_t hash = compute_hash(start_nonce + gid);
+        uint64_t hash = compute_hash<_PARALLEL_HASH>(start_nonce + gid);
 	if (cuda_swab64(hash) > d_target) return;
 	uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
 	g_output[index] = gid;
@ -40,16 +41,37 @@ void run_ethash_search(
 	uint32_t sharedbytes,
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
-	uint64_t start_nonce
+	uint64_t start_nonce,
+        uint32_t parallelHash
 )
 {
-	ethash_search << <blocks, threads, sharedbytes, stream >> >(g_output, start_nonce);
+
+        //printf("parallelHash = %d\n", parallelHash);
+        if(parallelHash == 1)
+            ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 2)
+            ethash_search <2> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 3)
+            ethash_search <3> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 4)
+            ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 5)
+            ethash_search <5> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 6)
+            ethash_search <6> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 7)
+            ethash_search <7> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else if(parallelHash == 8)
+            ethash_search <8> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
+        else
+            ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); 
 	CUDA_SAFE_CALL(cudaGetLastError());
 }

 #define ETHASH_DATASET_PARENTS 256
 #define NODE_WORDS (64/4)

+
 __global__ void
 ethash_calculate_dag_item(uint32_t start)
 {
--- a/libethash-cuda/ethash_cuda_miner_kernel.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel.h
@ -52,7 +52,8 @@ void run_ethash_search(
 	uint32_t sharedbytes,
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
-	uint64_t start_nonce
+	uint64_t start_nonce,
+        uint32_t parallelHash
 	);

 void ethash_generate_dag(
--- a/libethcore/EthashCUDAMiner.cpp
+++ b/libethcore/EthashCUDAMiner.cpp
@ -261,4 +261,9 @@ bool EthashCUDAMiner::configureGPU(
 	return true;
 }

+void EthashCUDAMiner::setParallelHash(unsigned _parallelHash)
+{
+  ethash_cuda_miner::setParallelHash(_parallelHash);
+}
+
 #endif
--- a/libethcore/EthashCUDAMiner.h
+++ b/libethcore/EthashCUDAMiner.h
@ -51,6 +51,7 @@ class EthashCUDAHook;
 		static std::string platformInfo();
 		static unsigned getNumDevices();
 		static void listDevices();
+                static void setParallelHash(unsigned _parallelHash);
 		static bool configureGPU(
 			unsigned _blockSize,
 			unsigned _gridSize,