/*
 * Genoil's CUDA mining kernel for Ethereum,
 * based on Tim Hughes' OpenCL kernel.
 * Thanks to sp_, trpuvot, djm34 and cbuchner for things I took from ccminer.
 */

#include "ethash_cuda_miner_kernel.h"
#include "ethash_cuda_miner_kernel_globals.h"
#include "cuda_helper.h"

#include "fnv.cuh"

#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
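// Architectures below SHUFFLE_MIN_VER fall back to the shared-memory keccak/dagger
// variants with a smaller launch configuration; newer architectures use the
// warp-shuffle variants. TPB and BPSM are the thread-block size and minimum
// blocks per multiprocessor passed to __launch_bounds__ below.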
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
#include "keccak_u64.cuh"
#include "dagger_shared.cuh"
#define TPB 128
#define BPSM 4
#else
#include "keccak.cuh"
#include "dagger_shuffled.cuh"
#define TPB 896
#define BPSM 1
#endif
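// Search kernel: one nonce per thread (start_nonce + global thread id).
// A candidate qualifies when the byte-swapped 64-bit hash is at or below
// d_target; g_output[0] serves as an atomic result counter and the winning
// thread ids are stored in the slots that follow it.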
__global__ void
__launch_bounds__(TPB, BPSM)
ethash_search(
    volatile uint32_t* g_output,
    uint64_t start_nonce
)
{
    uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
    uint64_t hash = compute_hash(start_nonce + gid);
    if (cuda_swab64(hash) > d_target) return;
    uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
    g_output[index] = gid;
}
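// Host-side launcher for ethash_search. Only the launch itself is checked via
// cudaGetLastError, so the search runs asynchronously on the given stream.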
void run_ethash_search(
    uint32_t blocks,
    uint32_t threads,
    uint32_t sharedbytes,
    cudaStream_t stream,
    volatile uint32_t* g_output,
    uint64_t start_nonce
)
{
    ethash_search<<<blocks, threads, sharedbytes, stream>>>(g_output, start_nonce);
    CUDA_SAFE_CALL(cudaGetLastError());
}
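// Ethash DAG generation parameters: each 64-byte DAG node is derived from
// ETHASH_DATASET_PARENTS pseudo-random parents in the light cache, and
// NODE_WORDS is the number of 32-bit words in one node (64 bytes / 4).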
#define ETHASH_DATASET_PARENTS 256
#define NODE_WORDS (64/4)
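// Builds one 64-byte DAG node per thread: seed the node from the light cache,
// SHA3-512 it, fold in ETHASH_DATASET_PARENTS parents with FNV, then SHA3-512
// again before storing it in the DAG. On shuffle-capable devices the four
// threads of a quarter-warp cooperate via __shfl to fetch parent data and to
// scatter the finished nodes.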
__global__ void
__launch_bounds__(128, 7)
ethash_calculate_dag_item(uint32_t start)
{
    uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
    // Valid node indices are [0, d_dag_size * 2): the DAG is written below as
    // hash64_t (64-byte) nodes, two per hash128_t entry.
    if (node_index >= d_dag_size * 2) return;

    hash200_t dag_node;
    copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
    dag_node.words[0] ^= node_index;
    SHA3_512(dag_node.uint2s);

    const int thread_id = threadIdx.x & 3;

    for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
        uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
        for (unsigned w = 0; w != 4; ++w) {
            dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
        }
#else
        for (uint32_t t = 0; t < 4; t++) {
            uint32_t shuffle_index = __shfl(parent_index, t, 4);
            uint4 p4 = d_light[shuffle_index].uint4s[thread_id];

            for (int w = 0; w < 4; w++) {
                uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
                if (t == thread_id) {
                    dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
                }
            }
        }
#endif
    }
    SHA3_512(dag_node.uint2s);
    hash64_t * dag_nodes = (hash64_t *)d_dag;

#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
    for (uint32_t i = 0; i < 4; i++) {
        dag_nodes[node_index].uint4s[i] = dag_node.uint4s[i];
    }
#else
    for (uint32_t t = 0; t < 4; t++) {
        uint32_t shuffle_index = __shfl(node_index, t, 4);
        uint4 s[4];
        for (uint32_t w = 0; w < 4; w++) {
            s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
        }
        dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
    }
#endif
}
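// Host-side DAG generation: covers the DAG in fullRuns launches of
// blocks * threads nodes each, synchronizing after every launch so per-device
// progress can be reported.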
void ethash_generate_dag(
    uint64_t dag_size,
    uint32_t blocks,
    uint32_t threads,
    cudaStream_t stream,
    int device
)
{
    uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t));

    uint32_t fullRuns = work / (blocks * threads);
    uint32_t const restWork = work % (blocks * threads);
    if (restWork > 0) fullRuns++;
    for (uint32_t i = 0; i < fullRuns; i++)
    {
        ethash_calculate_dag_item<<<blocks, threads, 0, stream>>>(i * blocks * threads);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        printf("CUDA#%d: %.0f%%\n", device, 100.0f * (float)(i + 1) / (float)fullRuns);
    }
    CUDA_SAFE_CALL(cudaGetLastError());
}
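// Stores the device pointers to the DAG and light cache, plus their element
// counts, in the kernel-visible symbols d_dag, d_dag_size, d_light and
// d_light_size.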
void set_constants(
    hash128_t* _dag,
    uint32_t _dag_size,
    hash64_t* _light,
    uint32_t _light_size
)
{
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t*)));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t*)));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
}
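// Uploads the 32-byte header hash that ethash_search mixes with each nonce.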
void set_header(
    hash32_t _header
)
{
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
}
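// Uploads the 64-bit target; ethash_search reports a result when the
// byte-swapped hash does not exceed this value (a smaller target means
// higher difficulty).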
void set_target(
    uint64_t _target
)
{
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
}
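/*
 * Illustrative host-side call sequence (a sketch only; the buffer names and
 * launch parameters below are assumptions, not part of this file):
 *
 *   set_constants(d_dag, dag_elements_128, d_light, light_elements_64);
 *   ethash_generate_dag(dag_bytes, blocks, threads, stream, device_id);
 *   set_header(current_header_hash);
 *   set_target(upper_64_bits_of_boundary);
 *   run_ethash_search(blocks, threads, shared_bytes, stream, d_search_buffer, start_nonce);
 */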