diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp index 40c68002d..5ca10bf70 100644 --- a/libethash-cuda/ethash_cuda_miner.cpp +++ b/libethash-cuda/ethash_cuda_miner.cpp @@ -285,13 +285,12 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho }; std::queue pending; - static uint32_t const c_zero = 0; // update header constant buffer cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice); for (unsigned i = 0; i != s_numStreams; ++i) { - cudaMemcpyAsync(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[i]); + m_search_buf[i][0] = 0; } cudaError err = cudaGetLastError(); if (cudaSuccess != err) @@ -314,13 +313,12 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho { pending_batch const& batch = pending.front(); - uint32_t results[1 + c_max_search_results]; - - if (!s_highCPU) + if (s_highCPU) + cudaStreamSynchronize(m_streams[buf]); + else waitStream(m_streams[buf]); // 28ms - cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyDeviceToHost, m_streams[batch.buf]); - + uint32_t * results = m_search_buf[batch.buf]; unsigned num_found = std::min(results[0], c_max_search_results); uint64_t nonces[c_max_search_results]; for (unsigned i = 0; i != num_found; ++i) @@ -339,7 +337,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho start_nonce += s_gridSize * s_blockSize; // reset search buffer if we're still going if (num_found) - cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]); + results[0] = 0; cudaError err = cudaGetLastError(); if (cudaSuccess != err) diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu index 265f43d8f..c589bc450 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.cu +++ b/libethash-cuda/ethash_cuda_miner_kernel.cu @@ -25,21 +25,16 @@ ethash_search( uint64_t target ) { - uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; - #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid); - if (cuda_swab64(hash) < target) #else - hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid); - if (cuda_swab64(hash.uint64s[0]) < target) + uint64_t hash = compute_hash(g_header, g_dag, start_nonce + gid).uint64s[0]; #endif - { - atomicInc(g_output, d_max_outputs); - g_output[g_output[0]] = gid; - } - + if (cuda_swab64(hash) > target) return; + uint32_t index = atomicInc(g_output, d_max_outputs) + 1; + g_output[index] = gid; + __threadfence_system(); } void run_ethash_search(