Browse Source

Fixed m_search_buf races, removed unnecessary cudaMemcpyAsync calls, and added the missing cudaStreamSynchronize call.

cl-refactor
RoBiK 9 years ago
parent
commit
3aa41b10ea
  1. libethash-cuda/ethash_cuda_miner.cpp (14 changes)
  2. libethash-cuda/ethash_cuda_miner_kernel.cu (15 changes)

14
libethash-cuda/ethash_cuda_miner.cpp

@ -285,13 +285,12 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
}; };
std::queue<pending_batch> pending; std::queue<pending_batch> pending;
static uint32_t const c_zero = 0;
// update header constant buffer // update header constant buffer
cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice); cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
for (unsigned i = 0; i != s_numStreams; ++i) for (unsigned i = 0; i != s_numStreams; ++i)
{ {
cudaMemcpyAsync(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[i]); m_search_buf[i][0] = 0;
} }
cudaError err = cudaGetLastError(); cudaError err = cudaGetLastError();
if (cudaSuccess != err) if (cudaSuccess != err)
@ -314,13 +313,12 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
{ {
pending_batch const& batch = pending.front(); pending_batch const& batch = pending.front();
uint32_t results[1 + c_max_search_results]; if (s_highCPU)
cudaStreamSynchronize(m_streams[buf]);
if (!s_highCPU) else
waitStream(m_streams[buf]); // 28ms waitStream(m_streams[buf]); // 28ms
cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyDeviceToHost, m_streams[batch.buf]); uint32_t * results = m_search_buf[batch.buf];
unsigned num_found = std::min<unsigned>(results[0], c_max_search_results); unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
uint64_t nonces[c_max_search_results]; uint64_t nonces[c_max_search_results];
for (unsigned i = 0; i != num_found; ++i) for (unsigned i = 0; i != num_found; ++i)
@ -339,7 +337,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
start_nonce += s_gridSize * s_blockSize; start_nonce += s_gridSize * s_blockSize;
// reset search buffer if we're still going // reset search buffer if we're still going
if (num_found) if (num_found)
cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]); results[0] = 0;
cudaError err = cudaGetLastError(); cudaError err = cudaGetLastError();
if (cudaSuccess != err) if (cudaSuccess != err)

15
libethash-cuda/ethash_cuda_miner_kernel.cu

@ -25,21 +25,16 @@ ethash_search(
uint64_t target uint64_t target
) )
{ {
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid); uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
if (cuda_swab64(hash) < target)
#else #else
hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid); uint64_t hash = compute_hash(g_header, g_dag, start_nonce + gid).uint64s[0];
if (cuda_swab64(hash.uint64s[0]) < target)
#endif #endif
{ if (cuda_swab64(hash) > target) return;
atomicInc(g_output, d_max_outputs); uint32_t index = atomicInc(g_output, d_max_outputs) + 1;
g_output[g_output[0]] = gid; g_output[index] = gid;
} __threadfence_system();
} }
void run_ethash_search( void run_ethash_search(

Loading…
Cancel
Save