/* * Genoil's CUDA mining kernel for Ethereum * based on Tim Hughes' opencl kernel. * thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer. */ #include "ethash_cuda_miner_kernel.h" #include "ethash_cuda_miner_kernel_globals.h" #include "cuda_helper.h" #include "fnv.cuh" #define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; } #if __CUDA_ARCH__ < SHUFFLE_MIN_VER #include "keccak_u64.cuh" #include "dagger_shared.cuh" #define TPB 128 #define BPSM 4 #else #include "keccak.cuh" #include "dagger_shuffled.cuh" #define TPB 896 #define BPSM 1 #endif __global__ void __launch_bounds__(TPB, BPSM) ethash_search( volatile uint32_t* g_output, uint64_t start_nonce ) { uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; uint64_t hash = compute_hash(start_nonce + gid); if (cuda_swab64(hash) > d_target) return; uint32_t index = atomicInc(const_cast(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1; g_output[index] = gid; } void run_ethash_search( uint32_t blocks, uint32_t threads, uint32_t sharedbytes, cudaStream_t stream, volatile uint32_t* g_output, uint64_t start_nonce ) { ethash_search << > >(g_output, start_nonce); CUDA_SAFE_CALL(cudaGetLastError()); } #define ETHASH_DATASET_PARENTS 256 #define NODE_WORDS (64/4) __global__ void __launch_bounds__(128, 7) ethash_calculate_dag_item(uint32_t start) { uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x; if (node_index > d_dag_size * 2) return; hash200_t dag_node; copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4); dag_node.words[0] ^= node_index; SHA3_512(dag_node.uint2s); const int thread_id = threadIdx.x & 3; for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) { uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size; #if __CUDA_ARCH__ < SHUFFLE_MIN_VER for (unsigned w = 0; w != 4; ++w) { dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]); } #else for (uint32_t t = 0; t < 4; t++) { uint32_t shuffle_index = __shfl(parent_index, t, 4); uint4 p4 = d_light[shuffle_index].uint4s[thread_id]; for (int w = 0; w < 4; w++) { uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4)); if (t == thread_id) { dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4); } } } #endif } SHA3_512(dag_node.uint2s); hash64_t * dag_nodes = (hash64_t *)d_dag; #if __CUDA_ARCH__ < SHUFFLE_MIN_VER for (uint32_t i = 0; i < 4; i++) { dag_nodes[node_index].uint4s[i] = dag_node.uint4s[i]; } #else for (uint32_t t = 0; t < 4; t++) { uint32_t shuffle_index = __shfl(node_index, t, 4); uint4 s[4]; for (uint32_t w = 0; w < 4; w++) { s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4)); } dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id]; } #endif } void ethash_generate_dag( uint64_t dag_size, uint32_t blocks, uint32_t threads, cudaStream_t stream, int device ) { uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t)); uint32_t fullRuns = work / (blocks * threads); uint32_t const restWork = work % (blocks * threads); if (restWork > 0) fullRuns++; for (uint32_t i = 0; i < fullRuns; i++) { ethash_calculate_dag_item <<>>(i * blocks * threads); CUDA_SAFE_CALL(cudaDeviceSynchronize()); printf("CUDA#%d: %.0f%%\n",device, 100.0f * (float)i / (float)fullRuns); } //printf("GPU#%d 100%%\n"); CUDA_SAFE_CALL(cudaGetLastError()); } void set_constants( hash128_t* _dag, uint32_t _dag_size, hash64_t * _light, uint32_t _light_size ) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *))); CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t *))); CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t))); } void set_header( hash32_t _header ) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t))); } void set_target( uint64_t _target ) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t))); }