Browse Source

Fixed m_search_buf races, removed unnecessary cudaMemcpyAsync calls, and added the missing cudaStreamSynchronize call

cl-refactor
RoBiK 10 years ago
parent
commit
3aa41b10ea
  1. 14
      libethash-cuda/ethash_cuda_miner.cpp
  2. 15
      libethash-cuda/ethash_cuda_miner_kernel.cu

14
libethash-cuda/ethash_cuda_miner.cpp

@ -285,13 +285,12 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
};
std::queue<pending_batch> pending;
static uint32_t const c_zero = 0;
// update header constant buffer
cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
for (unsigned i = 0; i != s_numStreams; ++i)
{
cudaMemcpyAsync(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[i]);
m_search_buf[i][0] = 0;
}
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
@ -314,13 +313,12 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
{
pending_batch const& batch = pending.front();
uint32_t results[1 + c_max_search_results];
if (!s_highCPU)
if (s_highCPU)
cudaStreamSynchronize(m_streams[buf]);
else
waitStream(m_streams[buf]); // 28ms
cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyDeviceToHost, m_streams[batch.buf]);
uint32_t * results = m_search_buf[batch.buf];
unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
uint64_t nonces[c_max_search_results];
for (unsigned i = 0; i != num_found; ++i)
@ -339,7 +337,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
start_nonce += s_gridSize * s_blockSize;
// reset search buffer if we're still going
if (num_found)
cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]);
results[0] = 0;
cudaError err = cudaGetLastError();
if (cudaSuccess != err)

15
libethash-cuda/ethash_cuda_miner_kernel.cu

@ -25,21 +25,16 @@ ethash_search(
uint64_t target
)
{
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
if (cuda_swab64(hash) < target)
#else
hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid);
if (cuda_swab64(hash.uint64s[0]) < target)
uint64_t hash = compute_hash(g_header, g_dag, start_nonce + gid).uint64s[0];
#endif
{
atomicInc(g_output, d_max_outputs);
g_output[g_output[0]] = gid;
}
if (cuda_swab64(hash) > target) return;
uint32_t index = atomicInc(g_output, d_max_outputs) + 1;
g_output[index] = gid;
__threadfence_system();
}
void run_ethash_search(

Loading…
Cancel
Save