ethminer/libethash-cuda/ethash_cuda_miner_kernel.cu


								/*

								* Genoil's CUDA mining kernel for Ethereum

								* based on Tim Hughes' opencl kernel.

								* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer.

								*/


								#include "ethash_cuda_miner_kernel.h"

								#include "ethash_cuda_miner_kernel_globals.h"

								#include "cuda_helper.h"


								#include "fnv.cuh"


								#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }


								#if __CUDA_ARCH__ < SHUFFLE_MIN_VER

								#include "keccak_u64.cuh"

								#include "dagger_shared.cuh"

								#else

								#include "keccak.cuh"

								#include "dagger_shuffled.cuh"

								#endif


								template <uint32_t _PARALLEL_HASH>

								__global__ void

								ethash_search(

									volatile uint32_t* g_output,

									uint64_t start_nonce

									)

								{

									uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;

								        uint64_t hash = compute_hash<_PARALLEL_HASH>(start_nonce + gid);

									if (cuda_swab64(hash) > d_target) return;

									uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;

									g_output[index] = gid;

								}


								void run_ethash_search(

									uint32_t blocks,

									uint32_t threads,

									uint32_t sharedbytes,

									cudaStream_t stream,

									volatile uint32_t* g_output,

									uint64_t start_nonce,

									uint32_t parallelHash

								)

								{

									switch (parallelHash)

									{

										case 1: ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

										case 2: ethash_search <2> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

										case 3: ethash_search <3> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

										case 4: ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

										case 5: ethash_search <5> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

										case 6: ethash_search <6> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

										case 7: ethash_search <7> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

										case 8: ethash_search <8> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

										default: ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;

									}

									CUDA_SAFE_CALL(cudaGetLastError());

								}


								#define ETHASH_DATASET_PARENTS 256

								#define NODE_WORDS (64/4)


								__global__ void

								ethash_calculate_dag_item(uint32_t start)

								{

									uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;

									if (node_index > d_dag_size * 2) return;


									hash200_t dag_node;

									copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);

									dag_node.words[0] ^= node_index;

									SHA3_512(dag_node.uint2s);


									const int thread_id = threadIdx.x & 3;


									for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {

										uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;

								#if __CUDA_ARCH__ < SHUFFLE_MIN_VER

										for (unsigned w = 0; w != 4; ++w) {

											dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);

										}

								#else

										for (uint32_t t = 0; t < 4; t++) {

											uint32_t shuffle_index = __shfl(parent_index, t, 4);

											uint4 p4 = d_light[shuffle_index].uint4s[thread_id];


											for (int w = 0; w < 4; w++) {

												uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));

												if (t == thread_id) {

													dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);

												}

											}


										}

								#endif

									}

									SHA3_512(dag_node.uint2s);

									hash64_t * dag_nodes = (hash64_t *)d_dag;


								#if __CUDA_ARCH__ < SHUFFLE_MIN_VER

									for (uint32_t i = 0; i < 4; i++) {

										dag_nodes[node_index].uint4s[i] =  dag_node.uint4s[i];

									}

								#else

									for (uint32_t t = 0; t < 4; t++) {


										uint32_t shuffle_index = __shfl(node_index, t, 4);

										uint4 s[4];

										for (uint32_t w = 0; w < 4; w++) {

											s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));

										}

										dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];

									}

								#endif

								}


								void ethash_generate_dag(

									uint64_t dag_size,

									uint32_t blocks,

									uint32_t threads,

									cudaStream_t stream,

									int device

									)

								{

									uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t));


									uint32_t fullRuns = work / (blocks * threads);

									uint32_t const restWork = work % (blocks * threads);

									if (restWork > 0) fullRuns++;

									for (uint32_t i = 0; i < fullRuns; i++)

									{

										ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads);

										CUDA_SAFE_CALL(cudaDeviceSynchronize());

										printf("CUDA#%d: %.0f%%\n",device, 100.0f * (float)i / (float)fullRuns);

									}

									//printf("GPU#%d 100%%\n");

									CUDA_SAFE_CALL(cudaGetLastError());

								}


								void set_constants(

									hash128_t* _dag,

									uint32_t _dag_size,

									hash64_t * _light,

									uint32_t _light_size

									)

								{

									CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *)));

									CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));

									CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t *)));

									CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));

								}


								void set_header(

									hash32_t _header

									)

								{

									CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));

								}


								void set_target(

									uint64_t _target

									)

								{

									CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));

								}