Browse Source

add switch --cuda-parallel-hash to enable and disable the parallel-hash optimization

cl-refactor
davilizh 8 years ago
parent
commit
73fc65daf9
  1. 20
      ethminer/MinerAux.h
  2. 5
      libethash-cuda/dagger_shared.cuh
  3. 25
      libethash-cuda/dagger_shuffled.cuh
  4. 8
      libethash-cuda/ethash_cuda_miner.cpp
  5. 4
      libethash-cuda/ethash_cuda_miner.h
  6. 28
      libethash-cuda/ethash_cuda_miner_kernel.cu
  7. 3
      libethash-cuda/ethash_cuda_miner_kernel.h
  8. 7
      libethcore/EthashCUDAMiner.cpp
  9. 1
      libethcore/EthashCUDAMiner.h

20
ethminer/MinerAux.h

@ -309,6 +309,22 @@ public:
} }
} }
} }
else if (arg == "--cuda-parallel-hash")
{
try {
m_parallelHash = stol(argv[++i]);
if(m_parallelHash == 0 || m_parallelHash>8)
{
cerr << "Bad " << arg << " option: " << argv[i] << endl;
BOOST_THROW_EXCEPTION(BadArgument());
}
}
catch (...)
{
cerr << "Bad " << arg << " option: " << argv[i] << endl;
BOOST_THROW_EXCEPTION(BadArgument());
}
}
else if (arg == "--cuda-schedule" && i + 1 < argc) else if (arg == "--cuda-schedule" && i + 1 < argc)
{ {
string mode = argv[++i]; string mode = argv[++i];
@ -511,6 +527,8 @@ public:
m_dagCreateDevice m_dagCreateDevice
)) ))
exit(1); exit(1);
EthashCUDAMiner::setParallelHash(m_parallelHash);
#else #else
cerr << "Selected CUDA mining without having compiled with -DETHASHCUDA=1 or -DBUNDLE=cudaminer" << endl; cerr << "Selected CUDA mining without having compiled with -DETHASHCUDA=1 or -DBUNDLE=cudaminer" << endl;
exit(1); exit(1);
@ -589,6 +607,7 @@ public:
<< " yield - Instruct CUDA to yield its thread when waiting for results from the device." << endl << " yield - Instruct CUDA to yield its thread when waiting for results from the device." << endl
<< " sync - Instruct CUDA to block the CPU thread on a synchronization primitive when waiting for the results from the device." << endl << " sync - Instruct CUDA to block the CPU thread on a synchronization primitive when waiting for the results from the device." << endl
<< " --cuda-devices <0 1 ..n> Select which CUDA GPUs to mine on. Default is to use all" << endl << " --cuda-devices <0 1 ..n> Select which CUDA GPUs to mine on. Default is to use all" << endl
<< "    --cuda-parallel-hash <1 2 ..8> Define how many hashes to calculate in a kernel, can be scaled to achieve better performance. Default=4" << endl
#endif #endif
; ;
} }
@ -1035,6 +1054,7 @@ private:
unsigned m_dagCreateDevice = 0; unsigned m_dagCreateDevice = 0;
/// Benchmarking params /// Benchmarking params
unsigned m_benchmarkWarmup = 15; unsigned m_benchmarkWarmup = 15;
unsigned m_parallelHash = 4;
unsigned m_benchmarkTrial = 3; unsigned m_benchmarkTrial = 3;
unsigned m_benchmarkTrials = 5; unsigned m_benchmarkTrials = 5;
unsigned m_benchmarkBlock = 0; unsigned m_benchmarkBlock = 0;

5
libethash-cuda/dagger_shared.cuh

@ -8,6 +8,7 @@ typedef union {
} compute_hash_share; } compute_hash_share;
template <uint32_t _PARALLEL_HASH>
__device__ uint64_t compute_hash( __device__ uint64_t compute_hash(
uint64_t nonce uint64_t nonce
) )
@ -65,7 +66,7 @@ __device__ uint64_t compute_hash(
__syncthreads(); __syncthreads();
} }
// keccak_256(keccak_512(header..nonce) .. mix); // keccak_256(keccak_512(header..nonce) .. mix);
return keccak_f1600_final(state); return keccak_f1600_final(state);
} }

25
libethash-cuda/dagger_shuffled.cuh

@ -2,9 +2,10 @@
#include "ethash_cuda_miner_kernel.h" #include "ethash_cuda_miner_kernel.h"
#include "cuda_helper.h" #include "cuda_helper.h"
#define PARALLEL_HASH 4 //#define PARALLEL_HASH 4
__device__ uint64_t compute_hash( template <uint32_t _PARALLEL_HASH>
__device__ __forceinline__ uint64_t compute_hash(
uint64_t nonce uint64_t nonce
) )
{ {
@ -19,14 +20,14 @@ __device__ uint64_t compute_hash(
const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
const int mix_idx = thread_id & 3; const int mix_idx = thread_id & 3;
for (int i = 0; i < THREADS_PER_HASH; i += PARALLEL_HASH) for (int i = 0; i < THREADS_PER_HASH; i += _PARALLEL_HASH)
{ {
uint4 mix[PARALLEL_HASH]; uint4 mix[_PARALLEL_HASH];
uint32_t offset[PARALLEL_HASH]; uint32_t offset[_PARALLEL_HASH];
uint32_t init0[PARALLEL_HASH]; uint32_t init0[_PARALLEL_HASH];
// share init among threads // share init among threads
for (int p = 0; p < PARALLEL_HASH; p++) for (int p = 0; p < _PARALLEL_HASH; p++)
{ {
uint2 shuffle[8]; uint2 shuffle[8];
for (int j = 0; j < 8; j++) for (int j = 0; j < 8; j++)
@ -50,20 +51,24 @@ __device__ uint64_t compute_hash(
for (uint32_t b = 0; b < 4; b++) for (uint32_t b = 0; b < 4; b++)
{ {
for (int p = 0; p < PARALLEL_HASH; p++) for (int p = 0; p < _PARALLEL_HASH; p++)
{ {
offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size; offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size;
offset[p] = __shfl(offset[p], t, THREADS_PER_HASH); offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
} }
#pragma unroll #pragma unroll
for (int p = 0; p < PARALLEL_HASH; p++) for (int p = 0; p < _PARALLEL_HASH; p++)
{ {
//if(blockIdx.x == 0 && threadIdx.x==0 && offset[p] > (d_dag_size>>1)) //larger than half
// printf("d_dag_size = %d offset[p] = %d\n", d_dag_size, offset[p]);
mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]); mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]);
} }
} }
} }
for (int p = 0; p < PARALLEL_HASH; p++) for (int p = 0; p < _PARALLEL_HASH; p++)
{ {
uint2 shuffle[4]; uint2 shuffle[4];
uint32_t thread_mix = fnv_reduce(mix[p]); uint32_t thread_mix = fnv_reduce(mix[p]);

8
libethash-cuda/ethash_cuda_miner.cpp

@ -180,7 +180,13 @@ bool ethash_cuda_miner::configureGPU(
} }
} }
void ethash_cuda_miner::setParallelHash(unsigned _parallelHash)
{
m_parallelHash = _parallelHash;
}
unsigned ethash_cuda_miner::s_extraRequiredGPUMem; unsigned ethash_cuda_miner::s_extraRequiredGPUMem;
unsigned ethash_cuda_miner::m_parallelHash = 4;
unsigned ethash_cuda_miner::s_blockSize = ethash_cuda_miner::c_defaultBlockSize; unsigned ethash_cuda_miner::s_blockSize = ethash_cuda_miner::c_defaultBlockSize;
unsigned ethash_cuda_miner::s_gridSize = ethash_cuda_miner::c_defaultGridSize; unsigned ethash_cuda_miner::s_gridSize = ethash_cuda_miner::c_defaultGridSize;
unsigned ethash_cuda_miner::s_numStreams = ethash_cuda_miner::c_defaultNumStreams; unsigned ethash_cuda_miner::s_numStreams = ethash_cuda_miner::c_defaultNumStreams;
@ -363,7 +369,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
for (unsigned int j = 0; j < found_count; j++) for (unsigned int j = 0; j < found_count; j++)
nonces[j] = nonce_base + buffer[j + 1]; nonces[j] = nonce_base + buffer[j + 1];
} }
run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce); run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce, m_parallelHash);
if (m_current_index >= s_numStreams) if (m_current_index >= s_numStreams)
{ {
exit = found_count && hook.found(nonces, found_count); exit = found_count && hook.found(nonces, found_count);

4
libethash-cuda/ethash_cuda_miner.h

@ -34,6 +34,7 @@ public:
unsigned _scheduleFlag, unsigned _scheduleFlag,
uint64_t _currentBlock uint64_t _currentBlock
); );
static void setParallelHash(unsigned _parallelHash);
bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG); bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG);
@ -72,4 +73,5 @@ private:
/// GPU memory required for other things, like window rendering e.t.c. /// GPU memory required for other things, like window rendering e.t.c.
/// User can set it via the --cl-extragpu-mem argument. /// User can set it via the --cl-extragpu-mem argument.
static unsigned s_extraRequiredGPUMem; static unsigned s_extraRequiredGPUMem;
}; static unsigned m_parallelHash;
};

28
libethash-cuda/ethash_cuda_miner_kernel.cu

@ -21,6 +21,7 @@
#include "dagger_shuffled.cuh" #include "dagger_shuffled.cuh"
#endif #endif
template <uint32_t _PARALLEL_HASH>
__global__ void __global__ void
ethash_search( ethash_search(
volatile uint32_t* g_output, volatile uint32_t* g_output,
@ -28,7 +29,7 @@ ethash_search(
) )
{ {
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
uint64_t hash = compute_hash(start_nonce + gid); uint64_t hash = compute_hash<_PARALLEL_HASH>(start_nonce + gid);
if (cuda_swab64(hash) > d_target) return; if (cuda_swab64(hash) > d_target) return;
uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1; uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
g_output[index] = gid; g_output[index] = gid;
@ -40,16 +41,37 @@ void run_ethash_search(
uint32_t sharedbytes, uint32_t sharedbytes,
cudaStream_t stream, cudaStream_t stream,
volatile uint32_t* g_output, volatile uint32_t* g_output,
uint64_t start_nonce uint64_t start_nonce,
uint32_t parallelHash
) )
{ {
ethash_search << <blocks, threads, sharedbytes, stream >> >(g_output, start_nonce);
//printf("parallelHash = %d\n", parallelHash);
if(parallelHash == 1)
ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
else if(parallelHash == 2)
ethash_search <2> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
else if(parallelHash == 3)
ethash_search <3> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
else if(parallelHash == 4)
ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
else if(parallelHash == 5)
ethash_search <5> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
else if(parallelHash == 6)
ethash_search <6> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
else if(parallelHash == 7)
ethash_search <7> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
else if(parallelHash == 8)
ethash_search <8> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
else
ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce);
CUDA_SAFE_CALL(cudaGetLastError()); CUDA_SAFE_CALL(cudaGetLastError());
} }
#define ETHASH_DATASET_PARENTS 256 #define ETHASH_DATASET_PARENTS 256
#define NODE_WORDS (64/4) #define NODE_WORDS (64/4)
__global__ void __global__ void
ethash_calculate_dag_item(uint32_t start) ethash_calculate_dag_item(uint32_t start)
{ {

3
libethash-cuda/ethash_cuda_miner_kernel.h

@ -52,7 +52,8 @@ void run_ethash_search(
uint32_t sharedbytes, uint32_t sharedbytes,
cudaStream_t stream, cudaStream_t stream,
volatile uint32_t* g_output, volatile uint32_t* g_output,
uint64_t start_nonce uint64_t start_nonce,
uint32_t parallelHash
); );
void ethash_generate_dag( void ethash_generate_dag(

7
libethcore/EthashCUDAMiner.cpp

@ -261,4 +261,9 @@ bool EthashCUDAMiner::configureGPU(
return true; return true;
} }
#endif void EthashCUDAMiner::setParallelHash(unsigned _parallelHash)
{
ethash_cuda_miner::setParallelHash(_parallelHash);
}
#endif

1
libethcore/EthashCUDAMiner.h

@ -51,6 +51,7 @@ class EthashCUDAHook;
static std::string platformInfo(); static std::string platformInfo();
static unsigned getNumDevices(); static unsigned getNumDevices();
static void listDevices(); static void listDevices();
static void setParallelHash(unsigned _parallelHash);
static bool configureGPU( static bool configureGPU(
unsigned _blockSize, unsigned _blockSize,
unsigned _gridSize, unsigned _gridSize,

Loading…
Cancel
Save