
improved error handling and usage of constant memory space

Branch: cl-refactor
Author: RoBiK, 10 years ago
Commit: d8c8582f94
7 changed files:

  libethash-cuda/dagger_shared.cuh                   |  19
  libethash-cuda/dagger_shuffled.cuh                 |  12
  libethash-cuda/ethash_cuda_miner.cpp               | 172
  libethash-cuda/ethash_cuda_miner.h                 |  11
  libethash-cuda/ethash_cuda_miner_kernel.cu         |  53
  libethash-cuda/ethash_cuda_miner_kernel.h          |  39
  libethash-cuda/ethash_cuda_miner_kernel_globals.h  |   4
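
Taken together, the commit does two things: it replaces scattered cudaError_t checks with a CUDA_SAFE_CALL macro that throws std::runtime_error, and it moves the header, target, DAG pointer and DAG size out of the kernel argument lists into __constant__ device memory that the host sets once. A minimal sketch of the two patterns combined; the check kernel and main() scaffolding are illustrative and not part of the commit:

    #include <cstdio>
    #include <stdint.h>
    #include <stdexcept>
    #include <cuda_runtime.h>

    // Throwing wrapper, as introduced in ethash_cuda_miner_kernel.h below.
    #define CUDA_SAFE_CALL(call)                               \
    do {                                                       \
        cudaError_t err = call;                                \
        if (cudaSuccess != err)                                \
            throw std::runtime_error(cudaGetErrorString(err)); \
    } while (0)

    // Constant-memory parameter, as in ethash_cuda_miner_kernel_globals.h.
    __constant__ uint64_t d_target;

    __global__ void check(uint64_t* out)
    {
        // Kernels read d_target directly; no pointer argument is passed.
        out[threadIdx.x] = d_target;
    }

    int main()
    {
        try
        {
            uint64_t target = 0x00000000ffffffffULL;
            uint64_t* out;
            CUDA_SAFE_CALL(cudaMalloc(&out, sizeof(uint64_t)));
            CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &target, sizeof(target)));
            check<<<1, 1>>>(out);
            CUDA_SAFE_CALL(cudaGetLastError());
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            CUDA_SAFE_CALL(cudaFree(out));
        }
        catch (std::runtime_error const& e)
        {
            std::fprintf(stderr, "CUDA error: %s\n", e.what());
            return 1;
        }
        return 0;
    }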

libethash-cuda/dagger_shared.cuh (19 changed lines)

@@ -9,14 +9,17 @@ typedef union
     hash32_t mix;
 } compute_hash_share;
 
-__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
+__device__ hash64_t init_hash(uint64_t nonce)
 {
     hash64_t init;
 
     // sha3_512(header .. nonce)
     uint64_t state[25];
-    copy(state, header->uint64s, 4);
+    state[0] = d_header.uint64s[0];
+    state[1] = d_header.uint64s[1];
+    state[2] = d_header.uint64s[2];
+    state[3] = d_header.uint64s[3];
     state[4] = nonce;
     state[5] = 0x0000000000000001;
     state[6] = 0;
@@ -32,7 +35,7 @@ __device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
     return init;
 }
 
-__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
+__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share)
 {
     // share init0
     if (thread_id == 0)
@@ -59,9 +62,9 @@ __device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
         __threadfence_block();
 
 #if __CUDA_ARCH__ >= 350
-        mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id]));
+        mix = fnv4(mix, __ldg((&d_dag[*share])->uint4s + thread_id));
 #else
-        mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+        mix = fnv4(mix, (&d_dag[*share])->uint4s[thread_id]);
 #endif
     }
@@ -99,15 +102,13 @@ __device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
 }
 
 __device__ hash32_t compute_hash(
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
     uint64_t nonce
     )
 {
     extern __shared__ compute_hash_share share[];
 
     // Compute one init hash per work item.
-    hash64_t init = init_hash(g_header, nonce);
+    hash64_t init = init_hash(nonce);
 
     // Threads work together in this phase in groups of 8.
     uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
@@ -123,7 +124,7 @@ __device__ hash32_t compute_hash(
     uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3];
 
-    uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag);
+    uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s);
 
     share[hash_id].mix.uint32s[thread_id] = thread_mix;
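
The four d_header.uint64s reads above and the d_header.uint2s reads in dagger_shuffled.cuh below touch the same 32 constant-memory bytes: hash32_t is a union over the header, so each kernel picks the lane width that fits its Keccak state layout. A sketch of that layout, with member names as they appear in the diff:

    #include <stdint.h>
    #include <cuda_runtime.h>

    // 32-byte header exposed through differently sized lanes.
    typedef union
    {
        uint32_t uint32s[32 / sizeof(uint32_t)];  // 8 x 32-bit words
        uint64_t uint64s[32 / sizeof(uint64_t)];  // 4 x 64-bit words
        uint2    uint2s[32 / sizeof(uint2)];      // 4 x 2x32-bit pairs
        uint4    uint4s[32 / sizeof(uint4)];      // 2 x 4x32-bit quads
    } hash32_t;

    __constant__ hash32_t d_header;

    // Both views read the same bytes; constant memory is cached and
    // broadcast, which suits a value every thread reads identically.
    __global__ void absorb(uint64_t* state)
    {
        state[0] = d_header.uint64s[0];          // dagger_shared view
        uint2 w = d_header.uint2s[1];            // dagger_shuffled view
        state[1] = ((uint64_t)w.y << 32) | w.x;
    }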

libethash-cuda/dagger_shuffled.cuh (12 changed lines)

@@ -4,18 +4,16 @@
 #include "dagger.cuh"
 
 __device__ uint64_t compute_hash_shuffle(
-    uint2 const* g_header,
-    hash128_t const* g_dag,
     uint64_t nonce
     )
 {
     // sha3_512(header .. nonce)
     uint2 state[25];
-    state[0] = g_header[0];
-    state[1] = g_header[1];
-    state[2] = g_header[2];
-    state[3] = g_header[3];
+    state[0] = d_header.uint2s[0];
+    state[1] = d_header.uint2s[1];
+    state[2] = d_header.uint2s[2];
+    state[3] = d_header.uint2s[3];
     state[4] = vectorize(nonce);
     state[5] = vectorize(0x0000000000000001ULL);
     for (uint32_t i = 6; i < 25; i++)
@@ -69,7 +67,7 @@ __device__ uint64_t compute_hash_shuffle(
         }
         shuffle[0].x = __shfl(shuffle[0].x, start_lane + t);
-        mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]);
+        mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
     }
 }
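
In the shuffled variant each 8-thread group keeps its state in registers, so the DAG index consumed by the d_dag lookup above is broadcast from one lane with __shfl instead of staged through shared memory. A minimal broadcast sketch using the pre-CUDA-9 __shfl that the diff targets; CUDA 9 and later would use __shfl_sync with a lane mask:

    #include <stdint.h>

    __global__ void broadcast8(uint32_t* out)
    {
        uint32_t lane = threadIdx.x & 31;    // lane within the warp
        uint32_t group_start = lane & ~7u;   // first lane of this 8-lane group
        uint32_t value = 100 + lane;         // per-lane register value
        // Every lane in the group reads group_start's copy of 'value';
        // lanes 0-7 see 100, lanes 8-15 see 108, and so on.
        out[threadIdx.x] = __shfl(value, group_start);
    }
    // launch: broadcast8<<<1, 32>>>(out); with out sized for 32 uint32_t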

libethash-cuda/ethash_cuda_miner.cpp (172 changed lines)

@@ -71,16 +71,6 @@ static std::atomic_flag s_logSpin = ATOMIC_FLAG_INIT;
 #define ETHCUDA_LOG(_contents) cout << "[CUDA]:" << _contents << endl
 #endif
 
-#define CUDA_SAFE_CALL(call) \
-do { \
-    cudaError_t err = call; \
-    if (cudaSuccess != err) { \
-        fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
-            __FUNCTION__, __LINE__, cudaGetErrorString(err)); \
-        exit(EXIT_FAILURE); \
-    } \
-} while (0)
-
 ethash_cuda_miner::search_hook::~search_hook() {}
 
 ethash_cuda_miner::ethash_cuda_miner()
@@ -114,7 +104,6 @@ std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
     sprintf(compute, "%d.%d", device_props.major, device_props.minor);
     return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + std::string(device_props.name) + "\", \"version\": \"Compute " + std::string(compute) + "\" }";
 }
-
 unsigned ethash_cuda_miner::getNumDevices()
@@ -133,6 +122,8 @@ bool ethash_cuda_miner::configureGPU(
     unsigned _scheduleFlag,
     uint64_t _currentBlock
     )
+{
+    try
     {
         s_blockSize = _blockSize;
         s_gridSize = _gridSize;
@@ -143,7 +134,8 @@ bool ethash_cuda_miner::configureGPU(
         // by default let's only consider the DAG of the first epoch
         uint64_t dagSize = ethash_get_datasize(_currentBlock);
         uint64_t requiredSize = dagSize + _extraGPUMemory;
-        for (unsigned int i = 0; i < getNumDevices(); i++)
+        unsigned devicesCount = getNumDevices();
+        for (unsigned int i = 0; i < devicesCount; i++)
         {
             if (_devices[i] != -1)
             {
@@ -169,6 +161,11 @@ bool ethash_cuda_miner::configureGPU(
         }
         return true;
     }
+    catch (runtime_error)
+    {
+        return false;
+    }
+}
 
 unsigned ethash_cuda_miner::s_extraRequiredGPUMem;
 unsigned ethash_cuda_miner::s_blockSize = ethash_cuda_miner::c_defaultBlockSize;
@@ -193,14 +190,12 @@ void ethash_cuda_miner::listDevices()
 void ethash_cuda_miner::finish()
 {
-    for (unsigned i = 0; i != s_numStreams; i++) {
-        cudaStreamDestroy(m_streams[i]);
-        m_streams[i] = 0;
-    }
-    cudaDeviceReset();
+    CUDA_SAFE_CALL(cudaDeviceReset());
 }
 
 bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
+{
+    try
     {
         int device_count = getNumDevices();
@@ -211,124 +206,93 @@ bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
         int device_num = std::min<int>((int)_deviceId, device_count - 1);
 
         cudaDeviceProp device_props;
-        if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
-        {
-            cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
-            return false;
-        }
+        CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));
 
         cout << "Using device: " << device_props.name << " (Compute " << device_props.major << "." << device_props.minor << ")" << endl;
 
-        cudaError_t r = cudaSetDevice(device_num);
-        if (r != cudaSuccess)
-        {
-            cout << cudaGetErrorString(r) << endl;
-            return false;
-        }
-        cudaDeviceReset();
-        cudaSetDeviceFlags(s_scheduleFlag);
-        cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+        CUDA_SAFE_CALL(cudaSetDevice(device_num));
+        CUDA_SAFE_CALL(cudaDeviceReset());
+        CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+        CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 
-        m_search_buf = new uint32_t *[s_numStreams];
+        m_search_buf = new volatile uint32_t *[s_numStreams];
         m_streams = new cudaStream_t[s_numStreams];
 
-        // patch source code
-        cudaError result;
         uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
-        unsigned max_outputs = c_max_search_results;
-        result = set_constants(&dagSize128, &max_outputs);
 
         // create buffer for dag
-        result = cudaMalloc(&m_dag_ptr, _dagSize);
+        hash128_t * dag;
+        CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), _dagSize));
 
-        // create buffer for header256
-        result = cudaMalloc(&m_header, 32);
-
-        // copy dag to GPU
-        result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice);
+        // copy dag to GPU
+        CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));
 
         // create mining buffers
         for (unsigned i = 0; i != s_numStreams; ++i)
        {
-            result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t));
-            result = cudaStreamCreate(&m_streams[i]);
+            CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+            CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
         }
-        if (result != cudaSuccess)
-        {
-            cout << cudaGetErrorString(result) << endl;
-            return false;
-        }
-        return true;
-    }
+        set_constants(dag, dagSize128);
+        memset(&m_current_header, 0, sizeof(hash32_t));
+        m_current_target = 0;
+        m_current_nonce = 0;
+        m_current_index = 0;
+        return true;
+    }
+    catch (runtime_error)
+    {
+        return false;
+    }
 }
 
 void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
 {
-    struct pending_batch
-    {
-        uint64_t start_nonce;
-        unsigned buf;
-    };
-    std::queue<pending_batch> pending;
-
-    // update header constant buffer
-    cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
-    for (unsigned i = 0; i != s_numStreams; ++i)
-    {
-        m_search_buf[i][0] = 0;
-    }
-    cudaError err = cudaGetLastError();
-    if (cudaSuccess != err)
-    {
-        throw std::runtime_error(cudaGetErrorString(err));
-    }
-
-    unsigned buf = 0;
-    std::random_device engine;
-    uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
-    for (;;)
-    {
-        run_ethash_search(s_gridSize, s_blockSize, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);
-
-        pending.push({ start_nonce, buf });
-        buf = (buf + 1) % s_numStreams;
-
-        // read results
-        if (pending.size() == s_numStreams)
-        {
-            pending_batch const& batch = pending.front();
-
-            cudaStreamSynchronize(m_streams[buf]);
-
-            uint32_t * results = m_search_buf[batch.buf];
-            unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
-            uint64_t nonces[c_max_search_results];
-            for (unsigned i = 0; i != num_found; ++i)
-            {
-                nonces[i] = batch.start_nonce + results[i + 1];
-                //cout << results[i + 1] << ", ";
-            }
-            //if (num_found > 0)
-            //    cout << endl;
-
-            bool exit = num_found && hook.found(nonces, num_found);
-            exit |= hook.searched(batch.start_nonce, s_gridSize * s_blockSize); // always report searched before exit
-            if (exit)
-                break;
-            start_nonce += s_gridSize * s_blockSize;
-
-            // reset search buffer if we're still going
-            if (num_found)
-                results[0] = 0;
-
-            cudaError err = cudaGetLastError();
-            if (cudaSuccess != err)
-            {
-                throw std::runtime_error(cudaGetErrorString(err));
-            }
-            pending.pop();
-        }
-    }
+    bool initialize = false;
+    bool exit = false;
+    if (memcmp(&m_current_header, header, sizeof(hash32_t)))
+    {
+        m_current_header = *reinterpret_cast<hash32_t const *>(header);
+        set_header(m_current_header);
+        initialize = true;
+    }
+    if (m_current_target != target)
+    {
+        m_current_target = target;
+        set_target(m_current_target);
+        initialize = true;
+    }
+    if (initialize)
+    {
+        random_device engine;
+        m_current_nonce = uniform_int_distribution<uint64_t>()(engine);
+        m_current_index = 0;
+        CUDA_SAFE_CALL(cudaDeviceSynchronize());
+        for (unsigned int i = 0; i < s_numStreams; i++)
+            m_search_buf[i][0] = 0;
+    }
+    uint64_t batch_size = s_gridSize * s_blockSize;
+    for (; !exit; m_current_index++, m_current_nonce += batch_size)
+    {
+        unsigned int stream_index = m_current_index % s_numStreams;
+        cudaStream_t stream = m_streams[stream_index];
+        volatile uint32_t* buffer = m_search_buf[stream_index];
+        uint32_t found_count = 0;
+        uint64_t nonces[SEARCH_RESULT_BUFFER_SIZE - 1];
+        uint64_t nonce_base = m_current_nonce - s_numStreams * batch_size;
+        if (m_current_index >= s_numStreams)
+        {
+            CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+            found_count = buffer[0];
+            if (found_count)
+                buffer[0] = 0;
+            for (unsigned int j = 0; j < found_count; j++)
+                nonces[j] = nonce_base + buffer[j + 1];
+        }
+        run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce);
+        if (m_current_index >= s_numStreams)
+        {
+            exit = found_count && hook.found(nonces, found_count);
+            exit |= hook.searched(nonce_base, batch_size);
+        }
+    }
 }
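
The rewritten search() keeps s_numStreams launches in flight and only synchronizes a stream when its turn comes around again, so the results of batch i are read back while batch i + s_numStreams is still running; that is also why nonce_base rewinds by s_numStreams * batch_size. A stripped-down sketch of the round-robin pipeline, with fake_search standing in for ethash_search and the sizes chosen arbitrarily:

    #include <cstdio>
    #include <stdint.h>
    #include <cuda_runtime.h>

    // Stand-in for ethash_search: records the batch's start nonce.
    __global__ void fake_search(volatile uint32_t* out, uint64_t start_nonce)
    {
        if (blockIdx.x == 0 && threadIdx.x == 0)
            out[0] = (uint32_t)start_nonce;
    }

    int main()
    {
        const unsigned NUM_STREAMS = 2;   // s_numStreams in the miner
        const uint64_t BATCH = 1024;      // gridSize * blockSize
        cudaStream_t streams[NUM_STREAMS];
        uint32_t* buffers[NUM_STREAMS];   // pinned, GPU-writable under UVA, as in the miner

        for (unsigned i = 0; i < NUM_STREAMS; i++)
        {
            cudaStreamCreate(&streams[i]);
            cudaMallocHost(&buffers[i], 64 * sizeof(uint32_t));
        }
        uint64_t nonce = 0;
        for (uint64_t index = 0; index < 8; index++, nonce += BATCH)
        {
            unsigned s = index % NUM_STREAMS;
            if (index >= NUM_STREAMS)
            {
                // The batch launched NUM_STREAMS iterations ago on this
                // stream is done after this call; read its results while
                // the other streams keep working.
                cudaStreamSynchronize(streams[s]);
                std::printf("batch at nonce %u finished\n", buffers[s][0]);
            }
            fake_search<<<1, 32, 0, streams[s]>>>(buffers[s], nonce);
        }
        cudaDeviceSynchronize();
        return 0;
    }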

libethash-cuda/ethash_cuda_miner.h (11 changed lines)

@@ -51,13 +51,12 @@ public:
     static unsigned const c_defaultNumStreams;
 
 private:
-    enum { c_max_search_results = 63, c_hash_batch_size = 1024 };
+    hash32_t m_current_header;
+    uint64_t m_current_target;
+    uint64_t m_current_nonce;
+    uint64_t m_current_index;
 
-    hash128_t * m_dag_ptr;
-    hash32_t * m_header;
-    void ** m_hash_buf;
-    uint32_t ** m_search_buf;
+    volatile uint32_t ** m_search_buf;
     cudaStream_t * m_streams;
 
     /// The local work size for the search

libethash-cuda/ethash_cuda_miner_kernel.cu (53 changed lines)

@@ -18,21 +18,18 @@
 __global__ void
 __launch_bounds__(128, 7)
 ethash_search(
-    uint32_t* g_output,
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
-    uint64_t start_nonce,
-    uint64_t target
+    volatile uint32_t* g_output,
+    uint64_t start_nonce
     )
 {
     uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
-    uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
+    uint64_t hash = compute_hash_shuffle(start_nonce + gid);
 #else
-    uint64_t hash = compute_hash(g_header, g_dag, start_nonce + gid).uint64s[0];
+    uint64_t hash = compute_hash(start_nonce + gid).uint64s[0];
 #endif
-    if (cuda_swab64(hash) > target) return;
-    uint32_t index = atomicInc(g_output, d_max_outputs) + 1;
+    if (cuda_swab64(hash) > d_target) return;
+    uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
     g_output[index] = gid;
     __threadfence_system();
 }
@@ -41,27 +38,37 @@
 void run_ethash_search(
     uint32_t blocks,
     uint32_t threads,
     cudaStream_t stream,
-    uint32_t* g_output,
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
-    uint64_t start_nonce,
-    uint64_t target
+    volatile uint32_t* g_output,
+    uint64_t start_nonce
     )
 {
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
-    ethash_search <<<blocks, threads, 0, stream>>>(g_output, g_header, g_dag, start_nonce, target);
+    ethash_search <<<blocks, threads, 0, stream>>>(g_output, start_nonce);
 #else
-    ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target);
+    ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, start_nonce);
 #endif
+    CUDA_SAFE_CALL(cudaGetLastError());
 }
 
-cudaError set_constants(
-    uint32_t * dag_size,
-    uint32_t * max_outputs
+void set_constants(
+    hash128_t* _dag,
+    uint32_t _dag_size
     )
 {
-    cudaError result;
-    result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t));
-    result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t));
-    return result;
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *)));
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
+}
+
+void set_header(
+    hash32_t _header
+    )
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
+}
+
+void set_target(
+    uint64_t _target
+    )
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
 }
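
ethash_search claims a result slot with atomicInc, whose second argument is a wrap bound rather than an increment: the old value is returned and the counter wraps to zero once it would pass SEARCH_RESULT_BUFFER_SIZE - 1, so the count word in slot 0 can never index outside the 64-entry buffer (excess results overwrite earlier ones instead of overflowing). A minimal sketch with a hypothetical claim_slots kernel:

    #include <cstdio>
    #include <stdint.h>
    #include <cuda_runtime.h>

    #define SEARCH_RESULT_BUFFER_SIZE 64

    __global__ void claim_slots(uint32_t* out)
    {
        // atomicInc: old = *out; *out = (old >= bound) ? 0 : old + 1; returns old.
        // Slot 0 holds the count, so results land in out[1..63].
        uint32_t index = atomicInc(out, SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
        out[index] = threadIdx.x;   // the miner stores the gid here
    }

    int main()
    {
        uint32_t* out;
        cudaMallocHost(&out, SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t));
        out[0] = 0;
        claim_slots<<<1, 8>>>(out);
        cudaDeviceSynchronize();
        std::printf("%u results, first gid %u\n", out[0], out[1]);   // 8 results
        return 0;
    }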

libethash-cuda/ethash_cuda_miner_kernel.h (39 changed lines)

@@ -1,9 +1,12 @@
 #ifndef _ETHASH_CUDA_MINER_KERNEL_H_
 #define _ETHASH_CUDA_MINER_KERNEL_H_
 
+#include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
 
+#define SEARCH_RESULT_BUFFER_SIZE 64
+
 typedef union
 {
     uint64_t uint64s[16 / sizeof(uint64_t)];
@@ -33,27 +36,37 @@ typedef union
 } hash128_t;
 
-cudaError set_constants(
-    uint32_t * dag_size,
-    uint32_t * max_outputs
+void set_constants(
+    hash128_t* _dag,
+    uint32_t _dag_size
 );
 
-void run_ethash_hash(
-    hash32_t* g_hashes,
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
-    uint64_t start_nonce
+void set_header(
+    hash32_t _header
+);
+
+void set_target(
+    uint64_t _target
 );
 
 void run_ethash_search(
     uint32_t search_batch_size,
     uint32_t workgroup_size,
     cudaStream_t stream,
-    uint32_t* g_output,
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
-    uint64_t start_nonce,
-    uint64_t target
+    volatile uint32_t* g_output,
+    uint64_t start_nonce
 );
 
+#define CUDA_SAFE_CALL(call)                                \
+do {                                                        \
+    cudaError_t err = call;                                 \
+    if (cudaSuccess != err) {                               \
+        const char * errorString = cudaGetErrorString(err); \
+        fprintf(stderr,                                     \
+            "CUDA error in func '%s' at line %i : %s.\n",   \
+            __FUNCTION__, __LINE__, errorString);           \
+        throw std::runtime_error(errorString);              \
+    }                                                       \
+} while (0)
+
 #endif
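
Since the relocated macro throws instead of calling exit(), a failing CUDA call now unwinds to the nearest handler, which is how init() and configureGPU() above turn driver errors into a false return; note that std::runtime_error needs <stdexcept> in scope wherever the macro is expanded. A usage sketch with a hypothetical allocate_dag helper:

    #include <stdexcept>     // std::runtime_error thrown by CUDA_SAFE_CALL
    #include "ethash_cuda_miner_kernel.h"

    bool allocate_dag(void** dag, size_t size)
    {
        try
        {
            CUDA_SAFE_CALL(cudaMalloc(dag, size));   // throws on failure
            return true;
        }
        catch (std::runtime_error const&)
        {
            return false;   // same pattern as init() / configureGPU()
        }
    }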

libethash-cuda/ethash_cuda_miner_kernel_globals.h (4 changed lines)

@@ -4,6 +4,8 @@
 //#include "cuda_helper.h"
 
 __constant__ uint32_t d_dag_size;
-__constant__ uint32_t d_max_outputs;
+__constant__ hash128_t* d_dag;
+__constant__ hash32_t d_header;
+__constant__ uint64_t d_target;
 
 #endif