improved error handling and usage of constant memory space

10 years ago · d8c8582f94
7 changed files with 195 additions and 211 deletions
--- a/libethash-cuda/dagger_shared.cuh
+++ b/libethash-cuda/dagger_shared.cuh
@@ -9,14 +9,17 @@ typedef union
 	hash32_t mix;
 } compute_hash_share;

-__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
+__device__ hash64_t init_hash(uint64_t nonce)
 {
 	hash64_t init;

 	// sha3_512(header .. nonce)
 	uint64_t state[25];

-	copy(state, header->uint64s, 4);
+	state[0] = d_header.uint64s[0];
+	state[1] = d_header.uint64s[1];
+	state[2] = d_header.uint64s[2];
+	state[3] = d_header.uint64s[3];
 	state[4] = nonce;
 	state[5] = 0x0000000000000001;
 	state[6] = 0;
@@ -32,7 +35,7 @@ __device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
 	return init;
 }

-__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
+__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share)
 {
 	// share init0
 	if (thread_id == 0)
@@ -59,9 +62,9 @@ __device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, h
 			__threadfence_block();

 #if __CUDA_ARCH__ >= 350
-			mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id]));
+			mix = fnv4(mix, __ldg((&d_dag[*share])->uint4s + thread_id));
 #else
-			mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+			mix = fnv4(mix, (&d_dag[*share])->uint4s[thread_id]);
 #endif

 		}
@@ -99,15 +102,13 @@ __device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
 }

 __device__ hash32_t compute_hash(
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
 	uint64_t nonce
 	)
 {
 	extern __shared__  compute_hash_share share[];

 	// Compute one init hash per work item.
-	hash64_t init = init_hash(g_header, nonce);
+	hash64_t init = init_hash(nonce);

 	// Threads work together in this phase in groups of 8.
 	uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
@@ -123,7 +124,7 @@ __device__ hash32_t compute_hash(

 		uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3];

-		uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag);
+		uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s);

 		share[hash_id].mix.uint32s[thread_id] = thread_mix;

--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@@ -4,18 +4,16 @@
 #include "dagger.cuh"

 __device__ uint64_t compute_hash_shuffle(
-	uint2 const* g_header,
-	hash128_t const* g_dag,
 	uint64_t nonce
 	)
 {
 	// sha3_512(header .. nonce)
 	uint2 state[25];

-	state[0] = g_header[0];
-	state[1] = g_header[1];
-	state[2] = g_header[2];
-	state[3] = g_header[3];
+	state[0] = d_header.uint2s[0];
+	state[1] = d_header.uint2s[1];
+	state[2] = d_header.uint2s[2];
+	state[3] = d_header.uint2s[3];
 	state[4] = vectorize(nonce);
 	state[5] = vectorize(0x0000000000000001ULL);
 	for (uint32_t i = 6; i < 25; i++)
@@ -69,7 +67,7 @@ __device__ uint64_t compute_hash_shuffle(
 				}
 				shuffle[0].x = __shfl(shuffle[0].x, start_lane + t);

-				mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]);
+				mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
 			}
 		}

--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -71,16 +71,6 @@ static std::atomic_flag s_logSpin = ATOMIC_FLAG_INIT;
 #define ETHCUDA_LOG(_contents) cout << "[CUDA]:" << _contents << endl
 #endif

-#define CUDA_SAFE_CALL(call)                                          \
-do {                                                                  \
-	cudaError_t err = call;                                           \
-	if (cudaSuccess != err) {                                         \
-		fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
-		         __FUNCTION__, __LINE__, cudaGetErrorString(err) );   \
-		exit(EXIT_FAILURE);                                           \
-		}                                                                 \
-} while (0)
-
 ethash_cuda_miner::search_hook::~search_hook() {}

 ethash_cuda_miner::ethash_cuda_miner()
@@ -91,7 +81,7 @@ std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
 {
 	int runtime_version;
 	int device_count;
-	
+
 	device_count = getNumDevices();

 	if (device_count == 0)
@@ -109,12 +99,11 @@ std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
 	int version_major = runtime_version / 1000;
 	int version_minor = (runtime_version - (version_major * 1000)) / 10;
 	sprintf(platform, "%d.%d", version_major, version_minor);
-	
+
 	char compute[5];
 	sprintf(compute, "%d.%d", device_props.major, device_props.minor);

 	return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + std::string(device_props.name) + "\", \"version\": \"Compute " + std::string(compute) + "\" }";
-
 }

 unsigned ethash_cuda_miner::getNumDevices()
@@ -134,40 +123,48 @@ bool ethash_cuda_miner::configureGPU(
 	uint64_t _currentBlock
 	)
 {
-	s_blockSize = _blockSize;
-	s_gridSize = _gridSize;
-	s_extraRequiredGPUMem = _extraGPUMemory;
-	s_numStreams = _numStreams;
-	s_scheduleFlag = _scheduleFlag;
-
-	// by default let's only consider the DAG of the first epoch
-	uint64_t dagSize = ethash_get_datasize(_currentBlock);
-	uint64_t requiredSize = dagSize + _extraGPUMemory;
-	for (unsigned int i = 0; i < getNumDevices(); i++)
+	try
 	{
-		if (_devices[i] != -1) 
+		s_blockSize = _blockSize;
+		s_gridSize = _gridSize;
+		s_extraRequiredGPUMem = _extraGPUMemory;
+		s_numStreams = _numStreams;
+		s_scheduleFlag = _scheduleFlag;
+
+		// by default let's only consider the DAG of the first epoch
+		uint64_t dagSize = ethash_get_datasize(_currentBlock);
+		uint64_t requiredSize = dagSize + _extraGPUMemory;
+		unsigned devicesCount = getNumDevices();
+		for (unsigned int i = 0; i < devicesCount; i++)
 		{
-			cudaDeviceProp props;
-			CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, _devices[i]));
-			if (props.totalGlobalMem >= requiredSize)
+			if (_devices[i] != -1)
 			{
-				ETHCUDA_LOG(
-					"Found suitable CUDA device [" << string(props.name)
-					<< "] with " << props.totalGlobalMem << " bytes of GPU memory"
-					);
-			}
-			else
-			{
-				ETHCUDA_LOG(
-					"CUDA device " << string(props.name)
-					<< " has insufficient GPU memory." << to_string(props.totalGlobalMem) <<
-					" bytes of memory found < " << to_string(requiredSize) << " bytes of memory required"
-					);
-				return false;
+				cudaDeviceProp props;
+				CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, _devices[i]));
+				if (props.totalGlobalMem >= requiredSize)
+				{
+					ETHCUDA_LOG(
+						"Found suitable CUDA device [" << string(props.name)
+						<< "] with " << props.totalGlobalMem << " bytes of GPU memory"
+						);
+				}
+				else
+				{
+					ETHCUDA_LOG(
+						"CUDA device " << string(props.name)
+						<< " has insufficient GPU memory." << to_string(props.totalGlobalMem) <<
+						" bytes of memory found < " << to_string(requiredSize) << " bytes of memory required"
+						);
+					return false;
+				}
 			}
 		}
+		return true;
+	}
+	catch (runtime_error)
+	{
+		return false;
 	}
-	return true;
 }

 unsigned ethash_cuda_miner::s_extraRequiredGPUMem;
@@ -193,143 +190,110 @@ void ethash_cuda_miner::listDevices()

 void ethash_cuda_miner::finish()
 {
-	for (unsigned i = 0; i != s_numStreams; i++) {
-		cudaStreamDestroy(m_streams[i]);
-		m_streams[i] = 0;
-	}
-	cudaDeviceReset();
+	CUDA_SAFE_CALL(cudaDeviceReset());
 }

 bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
 {
-	int device_count = getNumDevices();
-
-	if (device_count == 0)
-		return false;
-
-	// use selected device
-	int device_num = std::min<int>((int)_deviceId, device_count - 1);
-	
-	cudaDeviceProp device_props;
-	if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
+	try
 	{
-		cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
-		return false;
-	}
+		int device_count = getNumDevices();

-	cout << "Using device: " << device_props.name << " (Compute " << device_props.major << "." << device_props.minor << ")" << endl;
+		if (device_count == 0)
+			return false;

-	cudaError_t r = cudaSetDevice(device_num);
-	if (r != cudaSuccess)
-	{
-		cout << cudaGetErrorString(r) << endl;
-		return false;
-	}
-	cudaDeviceReset();
-	cudaSetDeviceFlags(s_scheduleFlag);
-	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+		// use selected device
+		int device_num = std::min<int>((int)_deviceId, device_count - 1);

-	m_search_buf = new uint32_t *[s_numStreams];
-	m_streams = new cudaStream_t[s_numStreams];
+		cudaDeviceProp device_props;
+		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));

-	// patch source code
-	cudaError result;
+		cout << "Using device: " << device_props.name << " (Compute " << device_props.major << "." << device_props.minor << ")" << endl;

-	uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
-	unsigned max_outputs = c_max_search_results;
+		CUDA_SAFE_CALL(cudaSetDevice(device_num));
+		CUDA_SAFE_CALL(cudaDeviceReset());
+		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));

-	result = set_constants(&dagSize128, &max_outputs);
+		m_search_buf = new volatile uint32_t *[s_numStreams];
+		m_streams = new cudaStream_t[s_numStreams];

-	// create buffer for dag
-	result = cudaMalloc(&m_dag_ptr, _dagSize);
+		uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);

-	// create buffer for header256
-	result = cudaMalloc(&m_header, 32);
+		// create buffer for dag
+		hash128_t * dag;
+		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), _dagSize));
+		// copy dag to GPU.
+		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));

-	// copy dag to CPU.
-    result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice);
-	
-	// create mining buffers
-	for (unsigned i = 0; i != s_numStreams; ++i)
-	{		
-		result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t));
-		result = cudaStreamCreate(&m_streams[i]);
+		// create mining buffers
+		for (unsigned i = 0; i != s_numStreams; ++i)
+		{
+			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
+		}
+		set_constants(dag, dagSize128);
+		memset(&m_current_header, 0, sizeof(hash32_t));
+		m_current_target = 0;
+		m_current_nonce = 0;
+		m_current_index = 0;
+		return true;
 	}
-	if (result != cudaSuccess)
+	catch (runtime_error)
 	{
-		cout << cudaGetErrorString(result) << endl;
 		return false;
 	}
-	return true;
 }

 void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
 {
-	struct pending_batch
+	bool initialize = false;
+	bool exit = false;
+	if (memcmp(&m_current_header, header, sizeof(hash32_t)))
 	{
-		uint64_t start_nonce;
-		unsigned buf;
-	};
-	std::queue<pending_batch> pending;
-
-
-	// update header constant buffer
-	cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
-	for (unsigned i = 0; i != s_numStreams; ++i)
+		m_current_header = *reinterpret_cast<hash32_t const *>(header);
+		set_header(m_current_header);
+		initialize = true;
+	}
+	if (m_current_target != target)
 	{
-		m_search_buf[i][0] = 0;
+		m_current_target = target;
+		set_target(m_current_target);
+		initialize = true;
 	}
-	cudaError err = cudaGetLastError();
-	if (cudaSuccess != err)
+	if (initialize)
 	{
-		throw std::runtime_error(cudaGetErrorString(err));
+		random_device engine;
+		m_current_nonce = uniform_int_distribution<uint64_t>()(engine);
+		m_current_index = 0;
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+		for (unsigned int i = 0; i < s_numStreams; i++)
+			m_search_buf[i][0] = 0;
 	}
-
-	unsigned buf = 0;
-	std::random_device engine;
-	uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
-	for (;;)
+	uint64_t batch_size = s_gridSize * s_blockSize;
+	for (; !exit; m_current_index++, m_current_nonce += batch_size)
 	{
-		run_ethash_search(s_gridSize, s_blockSize, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);
-		
-		pending.push({ start_nonce, buf });
-		buf = (buf + 1) % s_numStreams;
-
-		// read results
-		if (pending.size() == s_numStreams)
+		unsigned int stream_index = m_current_index % s_numStreams;
+		cudaStream_t stream = m_streams[stream_index];
+		volatile uint32_t* buffer = m_search_buf[stream_index];
+		uint32_t found_count = 0;
+		uint64_t nonces[SEARCH_RESULT_BUFFER_SIZE - 1];
+		uint64_t nonce_base = m_current_nonce - s_numStreams * batch_size;
+		if (m_current_index >= s_numStreams)
 		{
-			pending_batch const& batch = pending.front();
-
-			cudaStreamSynchronize(m_streams[buf]);
-
-			uint32_t * results = m_search_buf[batch.buf];
-			unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
-			uint64_t nonces[c_max_search_results];
-			for (unsigned i = 0; i != num_found; ++i)
-			{
-				nonces[i] = batch.start_nonce + results[i + 1];
-				//cout << results[i + 1] << ", ";
-			}
-			//if (num_found > 0)
-			//	cout << endl;
-			
-			bool exit = num_found && hook.found(nonces, num_found);
-			exit |= hook.searched(batch.start_nonce, s_gridSize * s_blockSize); // always report searched before exit
-			if (exit)
-				break;
-
-			start_nonce += s_gridSize * s_blockSize;
-			// reset search buffer if we're still going
-			if (num_found)
-				results[0] = 0;
-
-			cudaError err = cudaGetLastError();
-			if (cudaSuccess != err)
-			{
-				throw std::runtime_error(cudaGetErrorString(err));
-			}
-			pending.pop();
+			CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+			found_count = buffer[0];
+			if (found_count)
+				buffer[0] = 0;
+			for (unsigned int j = 0; j < found_count; j++)
+				nonces[j] = nonce_base + buffer[j + 1];
+		}
+		run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce);
+		if (m_current_index >= s_numStreams)
+		{
+			exit = found_count && hook.found(nonces, found_count);
+			exit |= hook.searched(nonce_base, batch_size);
 		}
-	}	
+	}
 }

--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -51,13 +51,12 @@ public:
 	static unsigned const c_defaultNumStreams;

 private:
-	enum { c_max_search_results = 63, c_hash_batch_size = 1024 };
+	hash32_t m_current_header;
+	uint64_t m_current_target;
+	uint64_t m_current_nonce;
+	uint64_t m_current_index;

-	hash128_t * m_dag_ptr;
-	hash32_t * m_header;
-
-	void ** m_hash_buf;
-	uint32_t ** m_search_buf;
+	volatile uint32_t ** m_search_buf;
 	cudaStream_t  * m_streams;

 	/// The local work size for the search
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -18,21 +18,18 @@
 __global__ void 
 __launch_bounds__(128, 7)
 ethash_search(
-	uint32_t* g_output,
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
-	uint64_t start_nonce,
-	uint64_t target
+	volatile uint32_t* g_output,
+	uint64_t start_nonce
 	)
 {
 	uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;	
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
-	uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
+	uint64_t hash = compute_hash_shuffle(start_nonce + gid);
 #else
-	uint64_t hash = compute_hash(g_header, g_dag, start_nonce + gid).uint64s[0];
+	uint64_t hash = compute_hash(start_nonce + gid).uint64s[0];
 #endif
-	if (cuda_swab64(hash) > target) return;
-	uint32_t index = atomicInc(g_output, d_max_outputs) + 1;
+	if (cuda_swab64(hash) > d_target) return;
+	uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
 	g_output[index] = gid;
 	__threadfence_system();
 }
@@ -41,27 +38,37 @@ void run_ethash_search(
 	uint32_t blocks,
 	uint32_t threads,
 	cudaStream_t stream,
-	uint32_t* g_output,
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
-	uint64_t start_nonce,
-	uint64_t target
+	volatile uint32_t* g_output,
+	uint64_t start_nonce
 )
 {
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
-	ethash_search <<<blocks, threads, 0, stream >>>(g_output, g_header, g_dag, start_nonce, target);
+	ethash_search <<<blocks, threads, 0, stream >>>(g_output, start_nonce);
 #else
-	ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target);
+	ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, start_nonce);
 #endif
+	CUDA_SAFE_CALL(cudaGetLastError());
 }

-cudaError set_constants(
-	uint32_t * dag_size,
-	uint32_t * max_outputs
+void set_constants(
+	hash128_t* _dag,
+	uint32_t _dag_size
 	)
 {
-	cudaError result;
-	result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t));
-	result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t));
-	return result;
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *)));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
+}
+
+void set_header(
+	hash32_t _header
+	)
+{
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
+}
+
+void set_target(
+	uint64_t _target
+	)
+{
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
 }
--- a/libethash-cuda/ethash_cuda_miner_kernel.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel.h
@@ -1,9 +1,12 @@
 #ifndef _ETHASH_CUDA_MINER_KERNEL_H_
 #define _ETHASH_CUDA_MINER_KERNEL_H_

+#include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>

+#define SEARCH_RESULT_BUFFER_SIZE 64
+
 typedef union
 {
 	uint64_t uint64s[16 / sizeof(uint64_t)];
@@ -33,27 +36,37 @@ typedef union
 } hash128_t;


-cudaError set_constants(
-	uint32_t * dag_size,
-	uint32_t * max_outputs
-);
+void set_constants(
+	hash128_t* _dag,
+	uint32_t _dag_size
+	);

-void run_ethash_hash(
-	hash32_t* g_hashes,
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
-	uint64_t start_nonce
-);
+void set_header(
+	hash32_t _header
+	);
+
+void set_target(
+	uint64_t _target
+	);

 void run_ethash_search(
 	uint32_t search_batch_size,
 	uint32_t workgroup_size,
 	cudaStream_t stream,
-	uint32_t* g_output,
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
-	uint64_t start_nonce,
-	uint64_t target
-);
+	volatile uint32_t* g_output,
+	uint64_t start_nonce
+	);
+
+#define CUDA_SAFE_CALL(call)								\
+do {														\
+	cudaError_t err = call;									\
+	if (cudaSuccess != err) {								\
+		const char * errorString = cudaGetErrorString(err);	\
+		fprintf(stderr,										\
+			"CUDA error in func '%s' at line %i : %s.\n",	\
+			__FUNCTION__, __LINE__, errorString);			\
+		throw std::runtime_error(errorString);				\
+	}														\
+} while (0)

 #endif
--- a/libethash-cuda/ethash_cuda_miner_kernel_globals.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel_globals.h
@@ -4,6 +4,8 @@
 //#include "cuda_helper.h"

 __constant__ uint32_t d_dag_size;
-__constant__ uint32_t d_max_outputs;
+__constant__ hash128_t* d_dag;
+__constant__ hash32_t d_header;
+__constant__ uint64_t d_target;

 #endif