From 63ec7023b898f15792ad7b6bb9badf49d633e78e Mon Sep 17 00:00:00 2001 From: "David Li (Engrg-Hardware 1)" Date: Mon, 15 May 2017 15:24:25 +0800 Subject: [PATCH 1/6] The optimized code by Nvidia Architecturer. the performance is improved from 'min/mean/max: 22369621/22579336/22719146 H/s' to 'min/mean/max: 23767722/23907532/24117248 H/s' on a flashed GTX 1060 with 2 GPCs 9 TPCs (the product chip should have 10 TPCs). Note that the code is tested on the code pulled from May-11. The current code from github cannot generate reasonable scores ('min/max/avg is 0/0/0 H/s') --- libethash-cuda/dagger_shuffled.cuh | 104 +++++++++++---------- libethash-cuda/ethash_cuda_miner_kernel.cu | 4 +- libethash-cuda/keccak.cuh | 19 +++- 3 files changed, 74 insertions(+), 53 deletions(-) diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh index b3a443d23..9d5754585 100644 --- a/libethash-cuda/dagger_shuffled.cuh +++ b/libethash-cuda/dagger_shuffled.cuh @@ -2,12 +2,14 @@ #include "ethash_cuda_miner_kernel.h" #include "cuda_helper.h" +#define PARALLEL_HASH 4 + __device__ uint64_t compute_hash( uint64_t nonce ) { // sha3_512(header .. nonce) - uint2 state[25]; + uint2 state[12]; state[4] = vectorize(nonce); @@ -17,32 +19,30 @@ __device__ uint64_t compute_hash( const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); const int mix_idx = thread_id & 3; - uint4 mix; - uint2 shuffle[8]; - - for (int i = 0; i < THREADS_PER_HASH; i++) + for (int i = 0; i < THREADS_PER_HASH; i += PARALLEL_HASH) { + uint4 mix[PARALLEL_HASH]; + uint32_t offset[PARALLEL_HASH]; + uint32_t init0[PARALLEL_HASH]; + // share init among threads - for (int j = 0; j < 8; j++) { - shuffle[j].x = __shfl(state[j].x, i, THREADS_PER_HASH); - shuffle[j].y = __shfl(state[j].y, i, THREADS_PER_HASH); - } - - // ugly but avoids local reads/writes - if (mix_idx < 2) { - if (mix_idx == 0) - mix = vectorize2(shuffle[0], shuffle[1]); - else - mix = vectorize2(shuffle[2], shuffle[3]); - } - else { - if (mix_idx == 2) - mix = vectorize2(shuffle[4], shuffle[5]); - else - mix = vectorize2(shuffle[6], shuffle[7]); + for (int p = 0; p < PARALLEL_HASH; p++) + { + uint2 shuffle[8]; + for (int j = 0; j < 8; j++) + { + shuffle[j].x = __shfl(state[j].x, i+p, THREADS_PER_HASH); + shuffle[j].y = __shfl(state[j].y, i+p, THREADS_PER_HASH); + } + switch (mix_idx) + { + case 0: mix[p] = vectorize2(shuffle[0], shuffle[1]); break; + case 1: mix[p] = vectorize2(shuffle[2], shuffle[3]); break; + case 2: mix[p] = vectorize2(shuffle[4], shuffle[5]); break; + case 3: mix[p] = vectorize2(shuffle[6], shuffle[7]); break; + } + init0[p] = __shfl(shuffle[0].x, 0, THREADS_PER_HASH); } - - uint32_t init0 = __shfl(shuffle[0].x, 0, THREADS_PER_HASH); for (uint32_t a = 0; a < ACCESSES; a += 4) { @@ -50,37 +50,45 @@ __device__ uint64_t compute_hash( for (uint32_t b = 0; b < 4; b++) { - if (thread_id == t) - { - shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size; + for (int p = 0; p < PARALLEL_HASH; p++) + { + offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size; + offset[p] = __shfl(offset[p], t, THREADS_PER_HASH); + } + #pragma unroll + for (int p = 0; p < PARALLEL_HASH; p++) + { + mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]); } - shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH); - mix = fnv4(mix, d_dag[shuffle[0].x].uint4s[thread_id]); } } - uint32_t thread_mix = fnv_reduce(mix); + for (int p = 0; p < PARALLEL_HASH; p++) + { + uint2 shuffle[4]; + uint32_t thread_mix = fnv_reduce(mix[p]); + + // update mix accross threads - // update mix accross threads - - shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH); - shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH); - shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH); - shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH); - shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH); - shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH); - shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH); - shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH); - - if (i == thread_id) { - //move mix into state: - state[8] = shuffle[0]; - state[9] = shuffle[1]; - state[10] = shuffle[2]; - state[11] = shuffle[3]; + shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH); + shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH); + shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH); + shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH); + shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH); + shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH); + shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH); + shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH); + + if ((i+p) == thread_id) { + //move mix into state: + state[8] = shuffle[0]; + state[9] = shuffle[1]; + state[10] = shuffle[2]; + state[11] = shuffle[3]; + } } } // keccak_256(keccak_512(header..nonce) .. mix); return keccak_f1600_final(state); -} \ No newline at end of file +} diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu index e16239699..b128258dd 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.cu +++ b/libethash-cuda/ethash_cuda_miner_kernel.cu @@ -26,7 +26,7 @@ #endif __global__ void -__launch_bounds__(TPB, BPSM) +//__launch_bounds__(TPB, BPSM) ethash_search( volatile uint32_t* g_output, uint64_t start_nonce @@ -56,7 +56,7 @@ void run_ethash_search( #define NODE_WORDS (64/4) __global__ void -__launch_bounds__(128, 7) +//__launch_bounds__(128, 7) ethash_calculate_dag_item(uint32_t start) { uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x; diff --git a/libethash-cuda/keccak.cuh b/libethash-cuda/keccak.cuh index 86969e014..8690b1805 100644 --- a/libethash-cuda/keccak.cuh +++ b/libethash-cuda/keccak.cuh @@ -25,10 +25,13 @@ uint2 chi(const uint2 a, const uint2 b, const uint2 c) { return a ^ (~b) & c; } -__device__ __forceinline__ void keccak_f1600_init(uint2* s) +__device__ __forceinline__ void keccak_f1600_init(uint2* state) { + uint2 s[25]; uint2 t[5], u, v; + s[4] = state[4]; + devectorize2(d_header.uint4s[0], s[0], s[1]); devectorize2(d_header.uint4s[1], s[2], s[3]); @@ -328,12 +331,22 @@ __device__ __forceinline__ void keccak_f1600_init(uint2* s) /* iota: a[0,0] ^= round constant */ s[0] ^= vectorize(keccak_round_constants[23]); + + for(uint32_t i=0; i<12; i++) + { + state[i] = s[i]; + } } -__device__ __forceinline__ uint64_t keccak_f1600_final(uint2* s) +__device__ __forceinline__ uint64_t keccak_f1600_final(uint2* state) { + uint2 s[25]; uint2 t[5], u, v; + for (uint32_t i = 0; i<12; i++) + { + s[i] = state[i]; + } for (uint32_t i = 12; i < 25; i++) { s[i] = make_uint2(0, 0); @@ -774,4 +787,4 @@ __device__ __forceinline__ void SHA3_512(uint2* s) { /* iota: a[0,0] ^= round constant */ s[0] ^= vectorize(keccak_round_constants[23]); -} \ No newline at end of file +} From 2f945b22162beaf6c1af1d46d1061f82dc8a5928 Mon Sep 17 00:00:00 2001 From: "David Li (Engrg-Hardware 1)" Date: Tue, 16 May 2017 17:56:29 +0800 Subject: [PATCH 2/6] 1. delete launch bound in ethash_cuda_miner_kernel.cu 2. re-format the for loop in keccak.cuh --- libethash-cuda/ethash_cuda_miner_kernel.cu | 2 -- libethash-cuda/keccak.cuh | 9 +++------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu index b128258dd..5bd5ac268 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.cu +++ b/libethash-cuda/ethash_cuda_miner_kernel.cu @@ -26,7 +26,6 @@ #endif __global__ void -//__launch_bounds__(TPB, BPSM) ethash_search( volatile uint32_t* g_output, uint64_t start_nonce @@ -56,7 +55,6 @@ void run_ethash_search( #define NODE_WORDS (64/4) __global__ void -//__launch_bounds__(128, 7) ethash_calculate_dag_item(uint32_t start) { uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x; diff --git a/libethash-cuda/keccak.cuh b/libethash-cuda/keccak.cuh index 8690b1805..65a19be7c 100644 --- a/libethash-cuda/keccak.cuh +++ b/libethash-cuda/keccak.cuh @@ -332,10 +332,8 @@ __device__ __forceinline__ void keccak_f1600_init(uint2* state) /* iota: a[0,0] ^= round constant */ s[0] ^= vectorize(keccak_round_constants[23]); - for(uint32_t i=0; i<12; i++) - { + for(int i=0; i<12; ++i) state[i] = s[i]; - } } __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* state) @@ -343,10 +341,9 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* state) uint2 s[25]; uint2 t[5], u, v; - for (uint32_t i = 0; i<12; i++) - { + for (int i = 0; i<12; ++i) s[i] = state[i]; - } + for (uint32_t i = 12; i < 25; i++) { s[i] = make_uint2(0, 0); From de2c1e3485b6c0361cbc270f31b26ac9bb2b2854 Mon Sep 17 00:00:00 2001 From: "David Li (Engrg-Hardware 1)" Date: Tue, 16 May 2017 18:21:44 +0800 Subject: [PATCH 3/6] delete dead code resulting from launch_bound deleting --- libethash-cuda/ethash_cuda_miner_kernel.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu index 5bd5ac268..ac607c3a2 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.cu +++ b/libethash-cuda/ethash_cuda_miner_kernel.cu @@ -16,13 +16,9 @@ #if __CUDA_ARCH__ < SHUFFLE_MIN_VER #include "keccak_u64.cuh" #include "dagger_shared.cuh" -#define TPB 128 -#define BPSM 4 #else #include "keccak.cuh" #include "dagger_shuffled.cuh" -#define TPB 896 -#define BPSM 1 #endif __global__ void From c99b693d9b18b9e514e8013d672cea0081b94e8c Mon Sep 17 00:00:00 2001 From: davilizh Date: Wed, 17 May 2017 14:47:49 +0800 Subject: [PATCH 4/6] add space between equations in the for loop --- libethash-cuda/keccak.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libethash-cuda/keccak.cuh b/libethash-cuda/keccak.cuh index 65a19be7c..ede8ecf54 100644 --- a/libethash-cuda/keccak.cuh +++ b/libethash-cuda/keccak.cuh @@ -332,7 +332,7 @@ __device__ __forceinline__ void keccak_f1600_init(uint2* state) /* iota: a[0,0] ^= round constant */ s[0] ^= vectorize(keccak_round_constants[23]); - for(int i=0; i<12; ++i) + for(int i = 0; i < 12; ++i) state[i] = s[i]; } @@ -341,7 +341,7 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* state) uint2 s[25]; uint2 t[5], u, v; - for (int i = 0; i<12; ++i) + for (int i = 0; i < 12; ++i) s[i] = state[i]; for (uint32_t i = 12; i < 25; i++) From 73fc65daf97840f61fdcd292ac42ccb54c7f1553 Mon Sep 17 00:00:00 2001 From: davilizh Date: Tue, 27 Jun 2017 14:49:40 +0800 Subject: [PATCH 5/6] add switch --cuda-parallel-hash to enable and disable the parallel-hash optimization --- ethminer/MinerAux.h | 20 ++++++++++++++++ libethash-cuda/dagger_shared.cuh | 5 ++-- libethash-cuda/dagger_shuffled.cuh | 25 +++++++++++-------- libethash-cuda/ethash_cuda_miner.cpp | 8 ++++++- libethash-cuda/ethash_cuda_miner.h | 4 +++- libethash-cuda/ethash_cuda_miner_kernel.cu | 28 +++++++++++++++++++--- libethash-cuda/ethash_cuda_miner_kernel.h | 3 ++- libethcore/EthashCUDAMiner.cpp | 7 +++++- libethcore/EthashCUDAMiner.h | 1 + 9 files changed, 82 insertions(+), 19 deletions(-) diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h index 271e9df91..292fd0a93 100644 --- a/ethminer/MinerAux.h +++ b/ethminer/MinerAux.h @@ -309,6 +309,22 @@ public: } } } + else if (arg == "--cuda-parallel-hash") + { + try { + m_parallelHash = stol(argv[++i]); + if(m_parallelHash == 0 || m_parallelHash>8) + { + cerr << "Bad " << arg << " option: " << argv[i] << endl; + BOOST_THROW_EXCEPTION(BadArgument()); + } + } + catch (...) + { + cerr << "Bad " << arg << " option: " << argv[i] << endl; + BOOST_THROW_EXCEPTION(BadArgument()); + } + } else if (arg == "--cuda-schedule" && i + 1 < argc) { string mode = argv[++i]; @@ -511,6 +527,8 @@ public: m_dagCreateDevice )) exit(1); + + EthashCUDAMiner::setParallelHash(m_parallelHash); #else cerr << "Selected CUDA mining without having compiled with -DETHASHCUDA=1 or -DBUNDLE=cudaminer" << endl; exit(1); @@ -589,6 +607,7 @@ public: << " yield - Instruct CUDA to yield its thread when waiting for results from the device." << endl << " sync - Instruct CUDA to block the CPU thread on a synchronization primitive when waiting for the results from the device." << endl << " --cuda-devices <0 1 ..n> Select which CUDA GPUs to mine on. Default is to use all" << endl + << " --cude-parallel-hash <1 2 ..8> Define how many hashes to calculate in a kernel, can be scaled to achive better performance. Default=4" << endl #endif ; } @@ -1035,6 +1054,7 @@ private: unsigned m_dagCreateDevice = 0; /// Benchmarking params unsigned m_benchmarkWarmup = 15; + unsigned m_parallelHash = 4; unsigned m_benchmarkTrial = 3; unsigned m_benchmarkTrials = 5; unsigned m_benchmarkBlock = 0; diff --git a/libethash-cuda/dagger_shared.cuh b/libethash-cuda/dagger_shared.cuh index 52fdc6a4f..89bd0327c 100644 --- a/libethash-cuda/dagger_shared.cuh +++ b/libethash-cuda/dagger_shared.cuh @@ -8,6 +8,7 @@ typedef union { } compute_hash_share; +template __device__ uint64_t compute_hash( uint64_t nonce ) @@ -65,7 +66,7 @@ __device__ uint64_t compute_hash( __syncthreads(); } - + // keccak_256(keccak_512(header..nonce) .. mix); return keccak_f1600_final(state); -} \ No newline at end of file +} diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh index 9d5754585..e841c3d92 100644 --- a/libethash-cuda/dagger_shuffled.cuh +++ b/libethash-cuda/dagger_shuffled.cuh @@ -2,9 +2,10 @@ #include "ethash_cuda_miner_kernel.h" #include "cuda_helper.h" -#define PARALLEL_HASH 4 +//#define PARALLEL_HASH 4 -__device__ uint64_t compute_hash( +template +__device__ __forceinline__ uint64_t compute_hash( uint64_t nonce ) { @@ -19,14 +20,14 @@ __device__ uint64_t compute_hash( const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); const int mix_idx = thread_id & 3; - for (int i = 0; i < THREADS_PER_HASH; i += PARALLEL_HASH) + for (int i = 0; i < THREADS_PER_HASH; i += _PARALLEL_HASH) { - uint4 mix[PARALLEL_HASH]; - uint32_t offset[PARALLEL_HASH]; - uint32_t init0[PARALLEL_HASH]; + uint4 mix[_PARALLEL_HASH]; + uint32_t offset[_PARALLEL_HASH]; + uint32_t init0[_PARALLEL_HASH]; // share init among threads - for (int p = 0; p < PARALLEL_HASH; p++) + for (int p = 0; p < _PARALLEL_HASH; p++) { uint2 shuffle[8]; for (int j = 0; j < 8; j++) @@ -50,20 +51,24 @@ __device__ uint64_t compute_hash( for (uint32_t b = 0; b < 4; b++) { - for (int p = 0; p < PARALLEL_HASH; p++) + for (int p = 0; p < _PARALLEL_HASH; p++) { offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size; offset[p] = __shfl(offset[p], t, THREADS_PER_HASH); } #pragma unroll - for (int p = 0; p < PARALLEL_HASH; p++) + for (int p = 0; p < _PARALLEL_HASH; p++) { + //if(blockIdx.x == 0 && threadIdx.x==0 && offset[p] > (d_dag_size>>1)) //larger than half + // printf("d_dag_size = %d offset[p] = %d\n", d_dag_size, offset[p]); mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]); } + + } } - for (int p = 0; p < PARALLEL_HASH; p++) + for (int p = 0; p < _PARALLEL_HASH; p++) { uint2 shuffle[4]; uint32_t thread_mix = fnv_reduce(mix[p]); diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp index 37c39e4eb..a7d8b4a03 100644 --- a/libethash-cuda/ethash_cuda_miner.cpp +++ b/libethash-cuda/ethash_cuda_miner.cpp @@ -180,7 +180,13 @@ bool ethash_cuda_miner::configureGPU( } } +void ethash_cuda_miner::setParallelHash(unsigned _parallelHash) +{ + m_parallelHash = _parallelHash; +} + unsigned ethash_cuda_miner::s_extraRequiredGPUMem; +unsigned ethash_cuda_miner::m_parallelHash = 4; unsigned ethash_cuda_miner::s_blockSize = ethash_cuda_miner::c_defaultBlockSize; unsigned ethash_cuda_miner::s_gridSize = ethash_cuda_miner::c_defaultGridSize; unsigned ethash_cuda_miner::s_numStreams = ethash_cuda_miner::c_defaultNumStreams; @@ -363,7 +369,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho for (unsigned int j = 0; j < found_count; j++) nonces[j] = nonce_base + buffer[j + 1]; } - run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce); + run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce, m_parallelHash); if (m_current_index >= s_numStreams) { exit = found_count && hook.found(nonces, found_count); diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h index 4307770a0..80aa08ce5 100644 --- a/libethash-cuda/ethash_cuda_miner.h +++ b/libethash-cuda/ethash_cuda_miner.h @@ -34,6 +34,7 @@ public: unsigned _scheduleFlag, uint64_t _currentBlock ); + static void setParallelHash(unsigned _parallelHash); bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG); @@ -72,4 +73,5 @@ private: /// GPU memory required for other things, like window rendering e.t.c. /// User can set it via the --cl-extragpu-mem argument. static unsigned s_extraRequiredGPUMem; -}; \ No newline at end of file + static unsigned m_parallelHash; +}; diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu index ac607c3a2..70dcad9a4 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.cu +++ b/libethash-cuda/ethash_cuda_miner_kernel.cu @@ -21,6 +21,7 @@ #include "dagger_shuffled.cuh" #endif +template __global__ void ethash_search( volatile uint32_t* g_output, @@ -28,7 +29,7 @@ ethash_search( ) { uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; - uint64_t hash = compute_hash(start_nonce + gid); + uint64_t hash = compute_hash<_PARALLEL_HASH>(start_nonce + gid); if (cuda_swab64(hash) > d_target) return; uint32_t index = atomicInc(const_cast(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1; g_output[index] = gid; @@ -40,16 +41,37 @@ void run_ethash_search( uint32_t sharedbytes, cudaStream_t stream, volatile uint32_t* g_output, - uint64_t start_nonce + uint64_t start_nonce, + uint32_t parallelHash ) { - ethash_search << > >(g_output, start_nonce); + + //printf("parallelHash = %d\n", parallelHash); + if(parallelHash == 1) + ethash_search <1> <<>>(g_output, start_nonce); + else if(parallelHash == 2) + ethash_search <2> <<>>(g_output, start_nonce); + else if(parallelHash == 3) + ethash_search <3> <<>>(g_output, start_nonce); + else if(parallelHash == 4) + ethash_search <4> <<>>(g_output, start_nonce); + else if(parallelHash == 5) + ethash_search <5> <<>>(g_output, start_nonce); + else if(parallelHash == 6) + ethash_search <6> <<>>(g_output, start_nonce); + else if(parallelHash == 7) + ethash_search <7> <<>>(g_output, start_nonce); + else if(parallelHash == 8) + ethash_search <8> <<>>(g_output, start_nonce); + else + ethash_search <1> <<>>(g_output, start_nonce); CUDA_SAFE_CALL(cudaGetLastError()); } #define ETHASH_DATASET_PARENTS 256 #define NODE_WORDS (64/4) + __global__ void ethash_calculate_dag_item(uint32_t start) { diff --git a/libethash-cuda/ethash_cuda_miner_kernel.h b/libethash-cuda/ethash_cuda_miner_kernel.h index ebb24dd75..93f83bb45 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.h +++ b/libethash-cuda/ethash_cuda_miner_kernel.h @@ -52,7 +52,8 @@ void run_ethash_search( uint32_t sharedbytes, cudaStream_t stream, volatile uint32_t* g_output, - uint64_t start_nonce + uint64_t start_nonce, + uint32_t parallelHash ); void ethash_generate_dag( diff --git a/libethcore/EthashCUDAMiner.cpp b/libethcore/EthashCUDAMiner.cpp index 516628b18..1f2868ee6 100644 --- a/libethcore/EthashCUDAMiner.cpp +++ b/libethcore/EthashCUDAMiner.cpp @@ -261,4 +261,9 @@ bool EthashCUDAMiner::configureGPU( return true; } -#endif \ No newline at end of file +void EthashCUDAMiner::setParallelHash(unsigned _parallelHash) +{ + ethash_cuda_miner::setParallelHash(_parallelHash); +} + +#endif diff --git a/libethcore/EthashCUDAMiner.h b/libethcore/EthashCUDAMiner.h index e2018af27..f6314c348 100644 --- a/libethcore/EthashCUDAMiner.h +++ b/libethcore/EthashCUDAMiner.h @@ -51,6 +51,7 @@ class EthashCUDAHook; static std::string platformInfo(); static unsigned getNumDevices(); static void listDevices(); + static void setParallelHash(unsigned _parallelHash); static bool configureGPU( unsigned _blockSize, unsigned _gridSize, From 41c8e69a8f2bceba1af833bd1357f2c752b95299 Mon Sep 17 00:00:00 2001 From: davilizh Date: Tue, 27 Jun 2017 16:50:22 +0800 Subject: [PATCH 6/6] Change the code according to Chafast's suggestions in pull request #18. 1) change if..else if..else if to switch. 2)change spaces to tab. 3) add additioanl arg check. 4) delete commented out code. 5) throw any exception to avoid code duplication. --- ethminer/MinerAux.h | 7 ++--- libethash-cuda/dagger_shuffled.cuh | 2 -- libethash-cuda/ethash_cuda_miner_kernel.cu | 34 +++++++++------------- libethash-cuda/ethash_cuda_miner_kernel.h | 2 +- 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h index 292fd0a93..b0c6b03e8 100644 --- a/ethminer/MinerAux.h +++ b/ethminer/MinerAux.h @@ -309,14 +309,13 @@ public: } } } - else if (arg == "--cuda-parallel-hash") + else if (arg == "--cuda-parallel-hash" && i + 1 < argc) { try { m_parallelHash = stol(argv[++i]); - if(m_parallelHash == 0 || m_parallelHash>8) + if (m_parallelHash == 0 || m_parallelHash > 8) { - cerr << "Bad " << arg << " option: " << argv[i] << endl; - BOOST_THROW_EXCEPTION(BadArgument()); + throw BadArgument(); } } catch (...) diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh index e841c3d92..4497ad699 100644 --- a/libethash-cuda/dagger_shuffled.cuh +++ b/libethash-cuda/dagger_shuffled.cuh @@ -2,8 +2,6 @@ #include "ethash_cuda_miner_kernel.h" #include "cuda_helper.h" -//#define PARALLEL_HASH 4 - template __device__ __forceinline__ uint64_t compute_hash( uint64_t nonce diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu index 70dcad9a4..6092f267a 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.cu +++ b/libethash-cuda/ethash_cuda_miner_kernel.cu @@ -42,29 +42,21 @@ void run_ethash_search( cudaStream_t stream, volatile uint32_t* g_output, uint64_t start_nonce, - uint32_t parallelHash + uint32_t parallelHash ) { - - //printf("parallelHash = %d\n", parallelHash); - if(parallelHash == 1) - ethash_search <1> <<>>(g_output, start_nonce); - else if(parallelHash == 2) - ethash_search <2> <<>>(g_output, start_nonce); - else if(parallelHash == 3) - ethash_search <3> <<>>(g_output, start_nonce); - else if(parallelHash == 4) - ethash_search <4> <<>>(g_output, start_nonce); - else if(parallelHash == 5) - ethash_search <5> <<>>(g_output, start_nonce); - else if(parallelHash == 6) - ethash_search <6> <<>>(g_output, start_nonce); - else if(parallelHash == 7) - ethash_search <7> <<>>(g_output, start_nonce); - else if(parallelHash == 8) - ethash_search <8> <<>>(g_output, start_nonce); - else - ethash_search <1> <<>>(g_output, start_nonce); + switch (parallelHash) + { + case 1: ethash_search <1> <<>>(g_output, start_nonce); break; + case 2: ethash_search <2> <<>>(g_output, start_nonce); break; + case 3: ethash_search <3> <<>>(g_output, start_nonce); break; + case 4: ethash_search <4> <<>>(g_output, start_nonce); break; + case 5: ethash_search <5> <<>>(g_output, start_nonce); break; + case 6: ethash_search <6> <<>>(g_output, start_nonce); break; + case 7: ethash_search <7> <<>>(g_output, start_nonce); break; + case 8: ethash_search <8> <<>>(g_output, start_nonce); break; + default: ethash_search <4> <<>>(g_output, start_nonce); break; + } CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/libethash-cuda/ethash_cuda_miner_kernel.h b/libethash-cuda/ethash_cuda_miner_kernel.h index 93f83bb45..c219308ff 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.h +++ b/libethash-cuda/ethash_cuda_miner_kernel.h @@ -53,7 +53,7 @@ void run_ethash_search( cudaStream_t stream, volatile uint32_t* g_output, uint64_t start_nonce, - uint32_t parallelHash + uint32_t parallelHash ); void ethash_generate_dag(