diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c65a1a6e..8d2e9b17f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 2.8.12) set(PROJECT_VERSION "0.9.41") -set(GENOIL_VERSION "1.0.8") +set(GENOIL_VERSION "1.1") if (${CMAKE_VERSION} VERSION_GREATER 3.0) cmake_policy(SET CMP0042 OLD) # fix MACOSX_RPATH cmake_policy(SET CMP0048 NEW) # allow VERSION argument in project() diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h index 8c031f2e6..32b8fe921 100644 --- a/ethminer/MinerAux.h +++ b/ethminer/MinerAux.h @@ -362,6 +362,7 @@ public: } else if (arg == "--current-block" && i + 1 < argc) m_currentBlock = stol(argv[++i]); + /* else if ((arg == "-R" || arg == "--dag-dir") && i + 1 < argc) { strcpy(s_dagDir, argv[++i]); @@ -395,6 +396,7 @@ public: BOOST_THROW_EXCEPTION(BadArgument()); } } + */ else if ((arg == "-w" || arg == "--check-pow") && i + 4 < argc) { string m; @@ -494,6 +496,7 @@ public: void execute() { + /* EthashAux::setDAGDirName(s_dagDir); EthashAux::setDAGEraseMode(m_eraseMode); EthashAux::eraseDAGs(); @@ -501,6 +504,7 @@ public: { m_eraseMode = DAGEraseMode::None; } + */ if (m_shouldListDevices) { @@ -596,7 +600,7 @@ public: << " -FS, --failover-stratum Failover stratum server at host:port" << endl << " -O, --userpass Stratum login credentials" << endl << " -FO, --failover-userpass Failover stratum login credentials (optional, will use normal credentials when omitted)" << endl - << " --work-timeout reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 60. Don't set lower than max. avg. block time" << endl + << " --work-timeout reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 180. Don't set lower than max. avg. block time" << endl #endif #if ETH_JSONRPC || ETH_STRATUM || !ETH_TRUE << " --farm-recheck Leave n ms between checks for changed work (default: 500). When using stratum, use a high value (i.e. 2000) to get more stable hashrate output" << endl @@ -615,14 +619,14 @@ public: #if ETH_JSONRPC || !ETH_TRUE << " --phone-home When benchmarking, publish results (default: off)" << endl #endif - << "DAG file management:" << endl - << " -D,--create-dag Create the DAG in preparation for mining on given block and exit." << endl - << " -R , --dag-dir Store/Load DAG files in/from the specified directory. Useful for running multiple instances with different configurations." << endl - << " -E , --erase-dags Erase unneeded DAG files. Default is 'none'. Possible values are:" << endl - << " none - don't erase DAG files (default)" << endl - << " old - erase all DAG files older than current epoch" << endl - << " bench - like old, but keep epoch 0 for benchmarking" << endl - << " all - erase all DAG files. After deleting all files, setting changes to none." << endl +// << "DAG file management:" << endl +// << " -D,--create-dag Create the DAG in preparation for mining on given block and exit." << endl +// << " -R , --dag-dir Store/Load DAG files in/from the specified directory. Useful for running multiple instances with different configurations." << endl +// << " -E , --erase-dags Erase unneeded DAG files. Default is 'none'. Possible values are:" << endl +// << " none - don't erase DAG files (default)" << endl +// << " old - erase all DAG files older than current epoch" << endl +// << " bench - like old, but keep epoch 0 for benchmarking" << endl +// << " all - erase all DAG files. After deleting all files, setting changes to none." << endl << "Mining configuration:" << endl << " -C,--cpu When mining, use the CPU." << endl << " -G,--opencl When mining use the GPU via OpenCL." << endl @@ -691,7 +695,7 @@ private: cout << "Benchmarking on platform: " << platformInfo << endl; cout << "Preparing DAG for block #" << m_benchmarkBlock << endl; - genesis.prep(); + //genesis.prep(); genesis.setDifficulty(u256(1) << 63); f.setWork(genesis); @@ -774,7 +778,7 @@ private: cout << "Running mining simulation on platform: " << platformInfo << endl; cout << "Preparing DAG for block #" << m_benchmarkBlock << endl; - genesis.prep(); + //genesis.prep(); genesis.setDifficulty(u256(1) << difficulty); f.setWork(genesis); @@ -911,6 +915,7 @@ private: Json::Value v = prpc->eth_getWork(); h256 hh(v[0].asString()); h256 newSeedHash(v[1].asString()); + /* if (current.seedHash != newSeedHash) { minelog << "Grabbing DAG for" << newSeedHash; @@ -923,6 +928,7 @@ private: { EthashAux::computeFull(sha3(newSeedHash), true); } + */ if (hh != current.headerHash) { x_current.lock(); @@ -1104,7 +1110,7 @@ private: /// Benchmarking params bool m_phoneHome = false; - unsigned m_benchmarkWarmup = 3; + unsigned m_benchmarkWarmup = 15; unsigned m_benchmarkTrial = 3; unsigned m_benchmarkTrials = 5; unsigned m_benchmarkBlock = 0; @@ -1119,7 +1125,7 @@ private: unsigned m_farmRecheckPeriod = 500; unsigned m_defaultStratumFarmRecheckPeriod = 2000; bool m_farmRecheckSet = false; - int m_worktimeout = 90; + int m_worktimeout = 180; bool m_precompute = true; #if ETH_STRATUM || !ETH_TRUE diff --git a/libethash-cl/ethash_cl_miner.cpp b/libethash-cl/ethash_cl_miner.cpp index b5bf72d06..b08e19506 100644 --- a/libethash-cl/ethash_cl_miner.cpp +++ b/libethash-cl/ethash_cl_miner.cpp @@ -324,9 +324,11 @@ void ethash_cl_miner::finish() m_queue.finish(); } + bool ethash_cl_miner::init( - uint8_t const* _dag, - uint64_t _dagSize, + ethash_light_t _light, + uint8_t const* _lightData, + uint64_t _lightSize, unsigned _platformId, unsigned _deviceId ) @@ -399,12 +401,17 @@ bool ethash_cl_miner::init( if (m_globalWorkSize % s_workgroupSize != 0) m_globalWorkSize = ((m_globalWorkSize / s_workgroupSize) + 1) * s_workgroupSize; + uint64_t dagSize = ethash_get_datasize(_light->block_number); + uint32_t dagSize128 = (unsigned)(dagSize / ETHASH_MIX_BYTES); + uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node)); + // patch source code // note: ETHASH_CL_MINER_KERNEL is simply ethash_cl_miner_kernel.cl compiled // into a byte array by bin2h.cmake. There is no need to load the file by hand in runtime string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE); addDefinition(code, "GROUP_SIZE", s_workgroupSize); - addDefinition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES)); + addDefinition(code, "DAG_SIZE", dagSize128); + addDefinition(code, "LIGHT_SIZE", lightSize64); addDefinition(code, "ACCESSES", ETHASH_ACCESSES); addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults); addDefinition(code, "PLATFORM", platformId); @@ -430,16 +437,19 @@ bool ethash_cl_miner::init( // create buffer for dag try { - ETHCL_LOG("Creating one big buffer for the DAG"); - m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize); - ETHCL_LOG("Loading single big chunk kernels"); + ETHCL_LOG("Creating cache buffer"); + m_light = cl::Buffer(m_context, CL_MEM_READ_ONLY, _lightSize); + ETHCL_LOG("Creating DAG buffer"); + m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, dagSize); + ETHCL_LOG("Loading kernels"); m_searchKernel = cl::Kernel(program, "ethash_search"); - ETHCL_LOG("Mapping one big chunk."); - m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag); + m_dagKernel = cl::Kernel(program, "ethash_calculate_dag_item"); + ETHCL_LOG("Writing cache buffer"); + m_queue.enqueueWriteBuffer(m_light, CL_TRUE, 0, _lightSize, _lightData); } catch (cl::Error const& err) { - ETHCL_LOG("Allocating/mapping single buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing."); + ETHCL_LOG("Allocating/mapping DAG buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing."); return false; } // create buffer for header @@ -456,6 +466,28 @@ bool ethash_cl_miner::init( ETHCL_LOG("Creating mining buffer " << i); m_searchBuffer[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_maxSearchResults + 1) * sizeof(uint32_t)); } + + ETHCL_LOG("Generating DAG data"); + + uint32_t const work = (uint32_t)(dagSize / sizeof(node)); + //while (work < blocks * threads) blocks /= 2; + + uint32_t fullRuns = work / m_globalWorkSize; + uint32_t const restWork = work % m_globalWorkSize; + if (restWork > 0) fullRuns++; + + m_dagKernel.setArg(1, m_light); + m_dagKernel.setArg(2, m_dag); + m_dagKernel.setArg(3, ~0u); + + for (uint32_t i = 0; i < fullRuns; i++) + { + m_dagKernel.setArg(0, i * m_globalWorkSize); + m_queue.enqueueNDRangeKernel(m_dagKernel, cl::NullRange, m_globalWorkSize, s_workgroupSize); + m_queue.finish(); + printf("%.0f%%\n", 100.0f * (float)i / (float)fullRuns); + } + } catch (cl::Error const& err) { diff --git a/libethash-cl/ethash_cl_miner.h b/libethash-cl/ethash_cl_miner.h index d7691cb42..b6f96c993 100644 --- a/libethash-cl/ethash_cl_miner.h +++ b/libethash-cl/ethash_cl_miner.h @@ -51,12 +51,13 @@ public: uint64_t _currentBlock ); - bool init( - uint8_t const* _dag, - uint64_t _dagSize, - unsigned _platformId = 0, - unsigned _deviceId = 0 - ); + bool ethash_cl_miner::init( + ethash_light_t _light, + uint8_t const* _lightData, + uint64_t _lightSize, + unsigned _platformId, + unsigned _deviceId + ); void finish(); void search(uint8_t const* _header, uint64_t _target, search_hook& _hook); @@ -74,7 +75,9 @@ private: cl::Context m_context; cl::CommandQueue m_queue; cl::Kernel m_searchKernel; + cl::Kernel m_dagKernel; cl::Buffer m_dag; + cl::Buffer m_light; cl::Buffer m_header; cl::Buffer m_searchBuffer[c_bufferCount]; unsigned m_globalWorkSize; diff --git a/libethash-cl/ethash_cl_miner_kernel.cl b/libethash-cl/ethash_cl_miner_kernel.cl index 54952fc3f..390a021c9 100644 --- a/libethash-cl/ethash_cl_miner_kernel.cl +++ b/libethash-cl/ethash_cl_miner_kernel.cl @@ -1,11 +1,12 @@ #define OPENCL_PLATFORM_UNKNOWN 0 #define OPENCL_PLATFORM_NVIDIA 1 -#define OPENCL_PLATFORM_AMD 2 +#define OPENCL_PLATFORM_AMD 2 +#define ETHASH_DATASET_PARENTS 256 +#define NODE_WORDS (64/4) #define THREADS_PER_HASH (128 / 16) #define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH) - #define FNV_PRIME 0x01000193 __constant uint2 const Keccak_f1600_RC[24] = { @@ -176,8 +177,6 @@ static void keccak_f1600_round(uint2* a, uint r) static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate) { - - // Originally I unrolled the first and last rounds to interface // better with surrounding code, however I haven't done this // without causing the AMD compiler to blow up the VGPR usage. @@ -227,6 +226,18 @@ typedef struct ulong ulongs[32 / sizeof(ulong)]; } hash32_t; +typedef union { + uint words[64 / sizeof(uint)]; + uint2 uint2s[64 / sizeof(uint2)]; + uint4 uint4s[64 / sizeof(uint4)]; +} hash64_t; + +typedef union { + uint words[200 / sizeof(uint)]; + uint2 uint2s[200 / sizeof(uint2)]; + uint4 uint4s[200 / sizeof(uint4)]; +} hash200_t; + typedef struct { uint4 uint4s[128 / sizeof(uint4)]; @@ -334,3 +345,35 @@ __kernel void ethash_search( g_output[slot] = gid; } } + +static void SHA3_512(uint2* s, uint isolate) +{ + for (uint i = 8; i != 25; ++i) + { + s[i] = (uint2){ 0, 0 }; + } + s[8].x = 0x00000001; + s[8].y = 0x80000000; + keccak_f1600_no_absorb(s, 8, isolate); +} + +__kernel void ethash_calculate_dag_item(uint start, __global hash64_t const* g_light, __global hash64_t * g_dag, uint isolate) +{ + uint const node_index = start + get_global_id(0); + if (node_index > DAG_SIZE * 2) return; + + hash200_t dag_node; + copy(dag_node.uint4s, g_light[node_index % LIGHT_SIZE].uint4s, 4); + dag_node.words[0] ^= node_index; + SHA3_512(dag_node.uint2s, isolate); + + for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i) { + uint parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % LIGHT_SIZE; + + for (uint w = 0; w != 4; ++w) { + dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]); + } + } + SHA3_512(dag_node.uint2s, isolate); + copy(g_dag[node_index].uint4s, dag_node.uint4s, 4); +} diff --git a/libethash-cuda/dagger_shared.cuh b/libethash-cuda/dagger_shared.cuh index 8410891ed..8a08bc83d 100644 --- a/libethash-cuda/dagger_shared.cuh +++ b/libethash-cuda/dagger_shared.cuh @@ -1,9 +1,5 @@ #include "ethash_cuda_miner_kernel_globals.h" #include "ethash_cuda_miner_kernel.h" -#include "keccak_u64.cuh" -#include "fnv.cuh" - -#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; } typedef union { uint4 uint4s[4]; diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh index 58ee6d5f9..8a8a3890f 100644 --- a/libethash-cuda/dagger_shuffled.cuh +++ b/libethash-cuda/dagger_shuffled.cuh @@ -1,7 +1,5 @@ #include "ethash_cuda_miner_kernel_globals.h" #include "ethash_cuda_miner_kernel.h" -#include "keccak.cuh" -#include "fnv.cuh" #define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH) diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp index d5a1cca8a..2ad749dca 100644 --- a/libethash-cuda/ethash_cuda_miner.cpp +++ b/libethash-cuda/ethash_cuda_miner.cpp @@ -199,7 +199,7 @@ void ethash_cuda_miner::finish() CUDA_SAFE_CALL(cudaDeviceReset()); } -bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId) +bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId) { try { @@ -224,25 +224,38 @@ bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _d m_search_buf = new volatile uint32_t *[s_numStreams]; m_streams = new cudaStream_t[s_numStreams]; - uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES); + uint64_t dagSize = ethash_get_datasize(_light->block_number); + uint32_t dagSize128 = (unsigned)(dagSize / ETHASH_MIX_BYTES); + uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node)); + + // create buffer for cache + hash64_t * light; + CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast(&light), _lightSize)); + // copy dag to CPU. + CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast(light), _lightData, _lightSize, cudaMemcpyHostToDevice)); // create buffer for dag hash128_t * dag; - CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast(&dag), _dagSize)); + CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast(&dag), dagSize)); // copy dag to CPU. - CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast(dag), _dag, _dagSize, cudaMemcpyHostToDevice)); + //CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast(dag), _dag, _dagSize, cudaMemcpyHostToDevice)); + // create mining buffers for (unsigned i = 0; i != s_numStreams; ++i) { CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t))); CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i])); } - set_constants(dag, dagSize128); + set_constants(dag, dagSize128, light, lightSize64); memset(&m_current_header, 0, sizeof(hash32_t)); m_current_target = 0; m_current_nonce = 0; m_current_index = 0; + + cout << "Generating DAG..." << endl; + ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0]); + return true; } catch (runtime_error) diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h index 4b1845c97..de0d3bea5 100644 --- a/libethash-cuda/ethash_cuda_miner.h +++ b/libethash-cuda/ethash_cuda_miner.h @@ -34,11 +34,13 @@ public: unsigned _scheduleFlag, uint64_t _currentBlock ); - bool init( + /*bool init( uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId = 0 - ); + );*/ + bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId); + void finish(); void search(uint8_t const* header, uint64_t target, search_hook& hook); diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu index 3357049eb..85479a0a0 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.cu +++ b/libethash-cuda/ethash_cuda_miner_kernel.cu @@ -8,13 +8,19 @@ #include "ethash_cuda_miner_kernel_globals.h" #include "cuda_helper.h" +#include "fnv.cuh" + +#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; } + #define SHUFFLE_MIN_VER 300 #if __CUDA_ARCH__ < SHUFFLE_MIN_VER +#include "keccak_u64.cuh" #include "dagger_shared.cuh" #define TPB 128 #define BPSM 4 #else +#include "keccak.cuh" #include "dagger_shuffled.cuh" #define TPB 896 #define BPSM 1 @@ -43,17 +49,86 @@ void run_ethash_search( uint64_t start_nonce ) { - ethash_search << > >(g_output, start_nonce); + ethash_search <<> >(g_output, start_nonce); + CUDA_SAFE_CALL(cudaGetLastError()); +} + +#define ETHASH_DATASET_PARENTS 256 +#define NODE_WORDS (64/4) + +__global__ void +__launch_bounds__(128, 7) +ethash_calculate_dag_item(uint32_t start) +{ + uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x; + if (node_index > d_dag_size * 2) return; + + hash200_t dag_node; + copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4); + dag_node.words[0] ^= node_index; + SHA3_512(dag_node.uint2s); + + const int thread_id = threadIdx.x & 3; + + for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) { + uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size; + + /* fix this some time. or not. + for (uint32_t t = 0; t < 4; t++) { + uint32_t shuffle_index = __shfl(parent_index, t, 4); + uint4 p4 = d_light[shuffle_index].uint4s[thread_id]; + if (t == thread_id) { + for (uint32_t w = 0; w < 4; w++) { + uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4)); + dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4); + } + } + } + */ + + for (unsigned w = 0; w != 4; ++w) { + dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]); + } + } + SHA3_512(dag_node.uint2s); + hash64_t * dag_nodes = (hash64_t *)d_dag; + copy(dag_nodes[node_index].uint4s, dag_node.uint4s, 4); +} + +void ethash_generate_dag( + uint64_t dag_size, + uint32_t blocks, + uint32_t threads, + cudaStream_t stream + ) +{ + uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t)); + //while (work < blocks * threads) blocks /= 2; + + uint32_t fullRuns = work / (blocks * threads); + uint32_t const restWork = work % (blocks * threads); + if (restWork > 0) fullRuns++; + for (uint32_t i = 0; i < fullRuns; i++) + { + ethash_calculate_dag_item <<>>(i * blocks * threads); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + printf("%.0f%%\n",100.0f * (float)i / (float)fullRuns); + } + CUDA_SAFE_CALL(cudaGetLastError()); } void set_constants( hash128_t* _dag, - uint32_t _dag_size + uint32_t _dag_size, + hash64_t * _light, + uint32_t _light_size ) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *))); CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t *))); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t))); } void set_header( diff --git a/libethash-cuda/ethash_cuda_miner_kernel.h b/libethash-cuda/ethash_cuda_miner_kernel.h index d261d24fa..b9e2a05ed 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel.h +++ b/libethash-cuda/ethash_cuda_miner_kernel.h @@ -19,10 +19,23 @@ typedef struct uint4 uint4s[128 / sizeof(uint4)]; } hash128_t; +typedef union { + uint32_t words[64 / sizeof(uint32_t)]; + uint2 uint2s[64 / sizeof(uint2)]; + uint4 uint4s[64 / sizeof(uint4)]; +} hash64_t; + +typedef union { + uint32_t words[200 / sizeof(uint32_t)]; + uint2 uint2s[200 / sizeof(uint2)]; + uint4 uint4s[200 / sizeof(uint4)]; +} hash200_t; void set_constants( hash128_t* _dag, - uint32_t _dag_size + uint32_t _dag_size, + hash64_t * _light, + uint32_t _light_size ); void set_header( @@ -41,6 +54,13 @@ void run_ethash_search( uint64_t start_nonce ); +void ethash_generate_dag( + uint64_t dag_size, + uint32_t blocks, + uint32_t threads, + cudaStream_t stream + ); + #define CUDA_SAFE_CALL(call) \ do { \ diff --git a/libethash-cuda/ethash_cuda_miner_kernel_globals.h b/libethash-cuda/ethash_cuda_miner_kernel_globals.h index da4e60cb9..578df48da 100644 --- a/libethash-cuda/ethash_cuda_miner_kernel_globals.h +++ b/libethash-cuda/ethash_cuda_miner_kernel_globals.h @@ -5,6 +5,8 @@ __constant__ uint32_t d_dag_size; __constant__ hash128_t* d_dag; +__constant__ uint32_t d_light_size; +__constant__ hash64_t* d_light; __constant__ hash32_t d_header; __constant__ uint64_t d_target; diff --git a/libethash-cuda/fnv.cuh b/libethash-cuda/fnv.cuh index e12654e9a..07079c538 100644 --- a/libethash-cuda/fnv.cuh +++ b/libethash-cuda/fnv.cuh @@ -1,3 +1,4 @@ + #define FNV_PRIME 0x01000193 #define fnv(x,y) ((x) * FNV_PRIME ^(y)) @@ -15,4 +16,5 @@ __device__ uint4 fnv4(uint4 a, uint4 b) __device__ uint32_t fnv_reduce(uint4 v) { return fnv(fnv(fnv(v.x, v.y), v.z), v.w); -} \ No newline at end of file +} + diff --git a/libethash-cuda/keccak.cuh b/libethash-cuda/keccak.cuh index 836b39b67..86969e014 100644 --- a/libethash-cuda/keccak.cuh +++ b/libethash-cuda/keccak.cuh @@ -588,4 +588,190 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* s) /* iota: a[0,0] ^= round constant */ //s[0] ^= vectorize(keccak_round_constants[23]); return devectorize(s[0]) ^ keccak_round_constants[23]; +} + +__device__ __forceinline__ void SHA3_512(uint2* s) { + + uint2 t[5], u, v; + + for (uint32_t i = 8; i < 25; i++) + { + s[i] = make_uint2(0, 0); + } + s[8].x = 1; + s[8].y = 0x80000000; + + for (int i = 0; i < 23; i++) + { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = xor5(s[0], s[5], s[10], s[15], s[20]); + t[1] = xor5(s[1], s[6], s[11], s[16], s[21]); + t[2] = xor5(s[2], s[7], s[12], s[17], s[22]); + t[3] = xor5(s[3], s[8], s[13], s[18], s[23]); + t[4] = xor5(s[4], s[9], s[14], s[19], s[24]); + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + + u = ROL2(t[1], 1); + s[0] = xor3(s[0], t[4], u); + s[5] = xor3(s[5], t[4], u); + s[10] = xor3(s[10], t[4], u); + s[15] = xor3(s[15], t[4], u); + s[20] = xor3(s[20], t[4], u); + + u = ROL2(t[2], 1); + s[1] = xor3(s[1], t[0], u); + s[6] = xor3(s[6], t[0], u); + s[11] = xor3(s[11], t[0], u); + s[16] = xor3(s[16], t[0], u); + s[21] = xor3(s[21], t[0], u); + + u = ROL2(t[3], 1); + s[2] = xor3(s[2], t[1], u); + s[7] = xor3(s[7], t[1], u); + s[12] = xor3(s[12], t[1], u); + s[17] = xor3(s[17], t[1], u); + s[22] = xor3(s[22], t[1], u); + + u = ROL2(t[4], 1); + s[3] = xor3(s[3], t[2], u); + s[8] = xor3(s[8], t[2], u); + s[13] = xor3(s[13], t[2], u); + s[18] = xor3(s[18], t[2], u); + s[23] = xor3(s[23], t[2], u); + + + u = ROL2(t[0], 1); + s[4] = xor3(s[4], t[3], u); + s[9] = xor3(s[9], t[3], u); + s[14] = xor3(s[14], t[3], u); + s[19] = xor3(s[19], t[3], u); + s[24] = xor3(s[24], t[3], u); + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(u, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + u = s[0]; v = s[1]; + s[0] = chi(s[0], s[1], s[2]); + s[1] = chi(s[1], s[2], s[3]); + s[2] = chi(s[2], s[3], s[4]); + s[3] = chi(s[3], s[4], u); + s[4] = chi(s[4], u, v); + + u = s[5]; v = s[6]; + s[5] = chi(s[5], s[6], s[7]); + s[6] = chi(s[6], s[7], s[8]); + s[7] = chi(s[7], s[8], s[9]); + s[8] = chi(s[8], s[9], u); + s[9] = chi(s[9], u, v); + + u = s[10]; v = s[11]; + s[10] = chi(s[10], s[11], s[12]); + s[11] = chi(s[11], s[12], s[13]); + s[12] = chi(s[12], s[13], s[14]); + s[13] = chi(s[13], s[14], u); + s[14] = chi(s[14], u, v); + + u = s[15]; v = s[16]; + s[15] = chi(s[15], s[16], s[17]); + s[16] = chi(s[16], s[17], s[18]); + s[17] = chi(s[17], s[18], s[19]); + s[18] = chi(s[18], s[19], u); + s[19] = chi(s[19], u, v); + + u = s[20]; v = s[21]; + s[20] = chi(s[20], s[21], s[22]); + s[21] = chi(s[21], s[22], s[23]); + s[22] = chi(s[22], s[23], s[24]); + s[23] = chi(s[23], s[24], u); + s[24] = chi(s[24], u, v); + + /* iota: a[0,0] ^= round constant */ + s[0] ^= vectorize(keccak_round_constants[i]); + } + + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = xor5(s[0], s[5], s[10], s[15], s[20]); + t[1] = xor5(s[1], s[6], s[11], s[16], s[21]); + t[2] = xor5(s[2], s[7], s[12], s[17], s[22]); + t[3] = xor5(s[3], s[8], s[13], s[18], s[23]); + t[4] = xor5(s[4], s[9], s[14], s[19], s[24]); + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + + u = ROL2(t[1], 1); + s[0] = xor3(s[0], t[4], u); + s[10] = xor3(s[10], t[4], u); + + u = ROL2(t[2], 1); + s[6] = xor3(s[6], t[0], u); + s[16] = xor3(s[16], t[0], u); + + u = ROL2(t[3], 1); + s[12] = xor3(s[12], t[1], u); + s[22] = xor3(s[22], t[1], u); + + u = ROL2(t[4], 1); + s[3] = xor3(s[3], t[2], u); + s[18] = xor3(s[18], t[2], u); + + u = ROL2(t[0], 1); + s[9] = xor3(s[9], t[3], u); + s[24] = xor3(s[24], t[3], u); + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[2] = ROL2(s[12], 43); + s[4] = ROL2(s[24], 14); + s[8] = ROL2(s[16], 45); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[7] = ROL2(s[10], 3); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + + u = s[0]; v = s[1]; + s[0] = chi(s[0], s[1], s[2]); + s[1] = chi(s[1], s[2], s[3]); + s[2] = chi(s[2], s[3], s[4]); + s[3] = chi(s[3], s[4], u); + s[4] = chi(s[4], u, v); + s[5] = chi(s[5], s[6], s[7]); + s[6] = chi(s[6], s[7], s[8]); + s[7] = chi(s[7], s[8], s[9]); + + /* iota: a[0,0] ^= round constant */ + s[0] ^= vectorize(keccak_round_constants[23]); } \ No newline at end of file diff --git a/libethash-cuda/keccak_u64.cuh b/libethash-cuda/keccak_u64.cuh index 6eedba6c2..ac61dad3f 100644 --- a/libethash-cuda/keccak_u64.cuh +++ b/libethash-cuda/keccak_u64.cuh @@ -586,4 +586,192 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint64_t* s) /* iota: a[0,0] ^= round constant */ //s[0] ^= vectorize(keccak_round_constants[23]); return s[0] ^ keccak_round_constants[23]; +} + +__device__ __forceinline__ void SHA3_512(uint2* s2) { + + uint64_t * s = (uint64_t*)s2; //dirty + + uint64_t t[5], u, v; + + for (uint32_t i = 9; i < 25; i++) + { + s[i] = 0; + } + s[8] = 0x8000000000000001; + + for (int i = 0; i < 23; i++) + { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = xor5(s[0], s[5], s[10], s[15], s[20]); + t[1] = xor5(s[1], s[6], s[11], s[16], s[21]); + t[2] = xor5(s[2], s[7], s[12], s[17], s[22]); + t[3] = xor5(s[3], s[8], s[13], s[18], s[23]); + t[4] = xor5(s[4], s[9], s[14], s[19], s[24]); + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + + u = ROTL64(t[1], 1); + s[0] = xor3(s[0], t[4], u); + s[5] = xor3(s[5], t[4], u); + s[10] = xor3(s[10], t[4], u); + s[15] = xor3(s[15], t[4], u); + s[20] = xor3(s[20], t[4], u); + + u = ROTL64(t[2], 1); + s[1] = xor3(s[1], t[0], u); + s[6] = xor3(s[6], t[0], u); + s[11] = xor3(s[11], t[0], u); + s[16] = xor3(s[16], t[0], u); + s[21] = xor3(s[21], t[0], u); + + u = ROTL64(t[3], 1); + s[2] = xor3(s[2], t[1], u); + s[7] = xor3(s[7], t[1], u); + s[12] = xor3(s[12], t[1], u); + s[17] = xor3(s[17], t[1], u); + s[22] = xor3(s[22], t[1], u); + + u = ROTL64(t[4], 1); + s[3] = xor3(s[3], t[2], u); + s[8] = xor3(s[8], t[2], u); + s[13] = xor3(s[13], t[2], u); + s[18] = xor3(s[18], t[2], u); + s[23] = xor3(s[23], t[2], u); + + + u = ROTL64(t[0], 1); + s[4] = xor3(s[4], t[3], u); + s[9] = xor3(s[9], t[3], u); + s[14] = xor3(s[14], t[3], u); + s[19] = xor3(s[19], t[3], u); + s[24] = xor3(s[24], t[3], u); + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROTL64(s[6], 44); + s[6] = ROTL64(s[9], 20); + s[9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[2], 62); + s[2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[4], 27); + s[4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[8], 55); + s[8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[5], 36); + s[5] = ROTL64(s[3], 28); + s[3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[7], 6); + s[7] = ROTL64(s[10], 3); + s[10] = ROTL64(u, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + + u = s[0]; v = s[1]; + s[0] = chi(s[0], s[1], s[2]); + s[1] = chi(s[1], s[2], s[3]); + s[2] = chi(s[2], s[3], s[4]); + s[3] = chi(s[3], s[4], u); + s[4] = chi(s[4], u, v); + + u = s[5]; v = s[6]; + s[5] = chi(s[5], s[6], s[7]); + s[6] = chi(s[6], s[7], s[8]); + s[7] = chi(s[7], s[8], s[9]); + s[8] = chi(s[8], s[9], u); + s[9] = chi(s[9], u, v); + + u = s[10]; v = s[11]; + s[10] = chi(s[10], s[11], s[12]); + s[11] = chi(s[11], s[12], s[13]); + s[12] = chi(s[12], s[13], s[14]); + s[13] = chi(s[13], s[14], u); + s[14] = chi(s[14], u, v); + + u = s[15]; v = s[16]; + s[15] = chi(s[15], s[16], s[17]); + s[16] = chi(s[16], s[17], s[18]); + s[17] = chi(s[17], s[18], s[19]); + s[18] = chi(s[18], s[19], u); + s[19] = chi(s[19], u, v); + + u = s[20]; v = s[21]; + s[20] = chi(s[20], s[21], s[22]); + s[21] = chi(s[21], s[22], s[23]); + s[22] = chi(s[22], s[23], s[24]); + s[23] = chi(s[23], s[24], u); + s[24] = chi(s[24], u, v); + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } + + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = xor5(s[0], s[5], s[10], s[15], s[20]); + t[1] = xor5(s[1], s[6], s[11], s[16], s[21]); + t[2] = xor5(s[2], s[7], s[12], s[17], s[22]); + t[3] = xor5(s[3], s[8], s[13], s[18], s[23]); + t[4] = xor5(s[4], s[9], s[14], s[19], s[24]); + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + + u = ROTL64(t[1], 1); + s[0] = xor3(s[0], t[4], u); + s[10] = xor3(s[10], t[4], u); + + u = ROTL64(t[2], 1); + s[6] = xor3(s[6], t[0], u); + s[16] = xor3(s[16], t[0], u); + + u = ROTL64(t[3], 1); + s[12] = xor3(s[12], t[1], u); + s[22] = xor3(s[22], t[1], u); + + u = ROTL64(t[4], 1); + s[3] = xor3(s[3], t[2], u); + s[18] = xor3(s[18], t[2], u); + + u = ROTL64(t[0], 1); + s[9] = xor3(s[9], t[3], u); + s[24] = xor3(s[24], t[3], u); + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROTL64(s[6], 44); + s[6] = ROTL64(s[9], 20); + s[9] = ROTL64(s[22], 61); + s[2] = ROTL64(s[12], 43); + s[4] = ROTL64(s[24], 14); + s[8] = ROTL64(s[16], 45); + s[5] = ROTL64(s[3], 28); + s[3] = ROTL64(s[18], 21); + s[7] = ROTL64(s[10], 3); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + + u = s[0]; v = s[1]; + s[0] = chi(s[0], s[1], s[2]); + s[1] = chi(s[1], s[2], s[3]); + s[2] = chi(s[2], s[3], s[4]); + s[3] = chi(s[3], s[4], u); + s[4] = chi(s[4], u, v); + s[5] = chi(s[5], s[6], s[7]); + s[6] = chi(s[6], s[7], s[8]); + s[7] = chi(s[7], s[8], s[9]); + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[23]; } \ No newline at end of file diff --git a/libethcore/EthashCUDAMiner.cpp b/libethcore/EthashCUDAMiner.cpp index 77bf747bb..df9ed2503 100644 --- a/libethcore/EthashCUDAMiner.cpp +++ b/libethcore/EthashCUDAMiner.cpp @@ -112,16 +112,8 @@ EthashCUDAMiner::EthashCUDAMiner(ConstructionInfo const& _ci) : Worker("cudaminer" + toString(index())), m_hook( new EthashCUDAHook(this)) { -/* -#if defined(WIN32) - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - int num_cpus = sysinfo.dwNumberOfProcessors; - SetThreadAffinityMask(GetCurrentThread(), 1 << (index() % num_cpus)); - SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST); -#endif -*/ } + EthashCUDAMiner::~EthashCUDAMiner() { pause(); @@ -149,8 +141,7 @@ void EthashCUDAMiner::workLoop() // take local copy of work since it may end up being overwritten by kickOff/pause. try { WorkPackage w = work(); - //cnote << "seedhash" << "#" + m_minerSeed.hex().substr(0, 16); - cnote << "set work to" << "#" + w.headerHash.hex().substr(0, 8) + ", target " << "#" + w.boundary.hex().substr(0, 16); + cnote << "set work; seed: " << "#" + w.seedHash.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 16); if (!m_miner || m_minerSeed != w.seedHash) { cnote << "Initialising miner..."; @@ -161,7 +152,9 @@ void EthashCUDAMiner::workLoop() unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index(); + /* EthashAux::FullType dag; + while (true) { if ((dag = EthashAux::full(w.seedHash, true))) @@ -175,8 +168,14 @@ void EthashCUDAMiner::workLoop() cnote << "Awaiting DAG"; this_thread::sleep_for(chrono::milliseconds(500)); } - bytesConstRef dagData = dag->data(); - m_miner->init(dagData.data(), dagData.size(), device); + */ + + EthashAux::LightType light; + light = EthashAux::light(w.seedHash); + //bytesConstRef dagData = dag->data(); + bytesConstRef lightData = light->data(); + + m_miner->init(light->light, lightData.data(), lightData.size(), device); } uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192); diff --git a/libethcore/EthashGPUMiner.cpp b/libethcore/EthashGPUMiner.cpp index 6d577376c..a621896c5 100644 --- a/libethcore/EthashGPUMiner.cpp +++ b/libethcore/EthashGPUMiner.cpp @@ -150,6 +150,7 @@ void EthashGPUMiner::workLoop() unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index(); + /* EthashAux::FullType dag; while (true) { @@ -164,8 +165,13 @@ void EthashGPUMiner::workLoop() cnote << "Awaiting DAG"; this_thread::sleep_for(chrono::milliseconds(500)); } - bytesConstRef dagData = dag->data(); - m_miner->init(dagData.data(), dagData.size(), s_platformId, device); + */ + + EthashAux::LightType light; + light = EthashAux::light(w.seedHash); + bytesConstRef lightData = light->data(); + + m_miner->init(light->light, lightData.data(), lightData.size(), s_platformId, device); } uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192); diff --git a/libstratum/EthStratumClient.cpp b/libstratum/EthStratumClient.cpp index 9ad84f7db..e2360b678 100644 --- a/libstratum/EthStratumClient.cpp +++ b/libstratum/EthStratumClient.cpp @@ -305,7 +305,7 @@ void EthStratumClient::processReponse(Json::Value& responseObject) h256 headerHash = h256(sHeaderHash); EthashAux::FullType dag; - + /* if (seedHash != m_current.seedHash) { cnote << "Grabbing DAG for" << seedHash; @@ -318,6 +318,8 @@ void EthStratumClient::processReponse(Json::Value& responseObject) { EthashAux::computeFull(sha3(seedHash), true); } + */ + if (headerHash != m_current.headerHash) { x_current.lock();