Browse Source

on-GPU DAG generation

cl-refactor
Genoil 9 years ago
parent
commit
19459aa2b2
  1. 2
      CMakeLists.txt
  2. 32
      ethminer/MinerAux.h
  3. 50
      libethash-cl/ethash_cl_miner.cpp
  4. 15
      libethash-cl/ethash_cl_miner.h
  5. 51
      libethash-cl/ethash_cl_miner_kernel.cl
  6. 4
      libethash-cuda/dagger_shared.cuh
  7. 2
      libethash-cuda/dagger_shuffled.cuh
  8. 23
      libethash-cuda/ethash_cuda_miner.cpp
  9. 6
      libethash-cuda/ethash_cuda_miner.h
  10. 79
      libethash-cuda/ethash_cuda_miner_kernel.cu
  11. 22
      libethash-cuda/ethash_cuda_miner_kernel.h
  12. 2
      libethash-cuda/ethash_cuda_miner_kernel_globals.h
  13. 4
      libethash-cuda/fnv.cuh
  14. 186
      libethash-cuda/keccak.cuh
  15. 188
      libethash-cuda/keccak_u64.cuh
  16. 25
      libethcore/EthashCUDAMiner.cpp
  17. 10
      libethcore/EthashGPUMiner.cpp
  18. 4
      libstratum/EthStratumClient.cpp

2
CMakeLists.txt

@ -2,7 +2,7 @@
cmake_minimum_required(VERSION 2.8.12)
set(PROJECT_VERSION "0.9.41")
set(GENOIL_VERSION "1.0.8")
set(GENOIL_VERSION "1.1")
if (${CMAKE_VERSION} VERSION_GREATER 3.0)
cmake_policy(SET CMP0042 OLD) # fix MACOSX_RPATH
cmake_policy(SET CMP0048 NEW) # allow VERSION argument in project()

32
ethminer/MinerAux.h

@ -362,6 +362,7 @@ public:
}
else if (arg == "--current-block" && i + 1 < argc)
m_currentBlock = stol(argv[++i]);
/*
else if ((arg == "-R" || arg == "--dag-dir") && i + 1 < argc)
{
strcpy(s_dagDir, argv[++i]);
@ -395,6 +396,7 @@ public:
BOOST_THROW_EXCEPTION(BadArgument());
}
}
*/
else if ((arg == "-w" || arg == "--check-pow") && i + 4 < argc)
{
string m;
@ -494,6 +496,7 @@ public:
void execute()
{
/*
EthashAux::setDAGDirName(s_dagDir);
EthashAux::setDAGEraseMode(m_eraseMode);
EthashAux::eraseDAGs();
@ -501,6 +504,7 @@ public:
{
m_eraseMode = DAGEraseMode::None;
}
*/
if (m_shouldListDevices)
{
@ -596,7 +600,7 @@ public:
<< " -FS, --failover-stratum <host:port> Failover stratum server at host:port" << endl
<< " -O, --userpass <username.workername:password> Stratum login credentials" << endl
<< " -FO, --failover-userpass <username.workername:password> Failover stratum login credentials (optional, will use normal credentials when omitted)" << endl
<< " --work-timeout <n> reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 60. Don't set lower than max. avg. block time" << endl
<< " --work-timeout <n> reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 180. Don't set lower than max. avg. block time" << endl
#endif
#if ETH_JSONRPC || ETH_STRATUM || !ETH_TRUE
<< " --farm-recheck <n> Leave n ms between checks for changed work (default: 500). When using stratum, use a high value (i.e. 2000) to get more stable hashrate output" << endl
@ -615,14 +619,14 @@ public:
#if ETH_JSONRPC || !ETH_TRUE
<< " --phone-home <on/off> When benchmarking, publish results (default: off)" << endl
#endif
<< "DAG file management:" << endl
<< " -D,--create-dag <number> Create the DAG in preparation for mining on given block and exit." << endl
<< " -R <s>, --dag-dir <s> Store/Load DAG files in/from the specified directory. Useful for running multiple instances with different configurations." << endl
<< " -E <mode>, --erase-dags <mode> Erase unneeded DAG files. Default is 'none'. Possible values are:" << endl
<< " none - don't erase DAG files (default)" << endl
<< " old - erase all DAG files older than current epoch" << endl
<< " bench - like old, but keep epoch 0 for benchmarking" << endl
<< " all - erase all DAG files. After deleting all files, setting changes to none." << endl
// << "DAG file management:" << endl
// << " -D,--create-dag <number> Create the DAG in preparation for mining on given block and exit." << endl
// << " -R <s>, --dag-dir <s> Store/Load DAG files in/from the specified directory. Useful for running multiple instances with different configurations." << endl
// << " -E <mode>, --erase-dags <mode> Erase unneeded DAG files. Default is 'none'. Possible values are:" << endl
// << " none - don't erase DAG files (default)" << endl
// << " old - erase all DAG files older than current epoch" << endl
// << " bench - like old, but keep epoch 0 for benchmarking" << endl
// << " all - erase all DAG files. After deleting all files, setting changes to none." << endl
<< "Mining configuration:" << endl
<< " -C,--cpu When mining, use the CPU." << endl
<< " -G,--opencl When mining use the GPU via OpenCL." << endl
@ -691,7 +695,7 @@ private:
cout << "Benchmarking on platform: " << platformInfo << endl;
cout << "Preparing DAG for block #" << m_benchmarkBlock << endl;
genesis.prep();
//genesis.prep();
genesis.setDifficulty(u256(1) << 63);
f.setWork(genesis);
@ -774,7 +778,7 @@ private:
cout << "Running mining simulation on platform: " << platformInfo << endl;
cout << "Preparing DAG for block #" << m_benchmarkBlock << endl;
genesis.prep();
//genesis.prep();
genesis.setDifficulty(u256(1) << difficulty);
f.setWork(genesis);
@ -911,6 +915,7 @@ private:
Json::Value v = prpc->eth_getWork();
h256 hh(v[0].asString());
h256 newSeedHash(v[1].asString());
/*
if (current.seedHash != newSeedHash)
{
minelog << "Grabbing DAG for" << newSeedHash;
@ -923,6 +928,7 @@ private:
{
EthashAux::computeFull(sha3(newSeedHash), true);
}
*/
if (hh != current.headerHash)
{
x_current.lock();
@ -1104,7 +1110,7 @@ private:
/// Benchmarking params
bool m_phoneHome = false;
unsigned m_benchmarkWarmup = 3;
unsigned m_benchmarkWarmup = 15;
unsigned m_benchmarkTrial = 3;
unsigned m_benchmarkTrials = 5;
unsigned m_benchmarkBlock = 0;
@ -1119,7 +1125,7 @@ private:
unsigned m_farmRecheckPeriod = 500;
unsigned m_defaultStratumFarmRecheckPeriod = 2000;
bool m_farmRecheckSet = false;
int m_worktimeout = 90;
int m_worktimeout = 180;
bool m_precompute = true;
#if ETH_STRATUM || !ETH_TRUE

50
libethash-cl/ethash_cl_miner.cpp

@ -324,9 +324,11 @@ void ethash_cl_miner::finish()
m_queue.finish();
}
bool ethash_cl_miner::init(
uint8_t const* _dag,
uint64_t _dagSize,
ethash_light_t _light,
uint8_t const* _lightData,
uint64_t _lightSize,
unsigned _platformId,
unsigned _deviceId
)
@ -399,12 +401,17 @@ bool ethash_cl_miner::init(
if (m_globalWorkSize % s_workgroupSize != 0)
m_globalWorkSize = ((m_globalWorkSize / s_workgroupSize) + 1) * s_workgroupSize;
uint64_t dagSize = ethash_get_datasize(_light->block_number);
uint32_t dagSize128 = (unsigned)(dagSize / ETHASH_MIX_BYTES);
uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
// patch source code
// note: ETHASH_CL_MINER_KERNEL is simply ethash_cl_miner_kernel.cl compiled
// into a byte array by bin2h.cmake. There is no need to load the file by hand in runtime
string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
addDefinition(code, "GROUP_SIZE", s_workgroupSize);
addDefinition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
addDefinition(code, "DAG_SIZE", dagSize128);
addDefinition(code, "LIGHT_SIZE", lightSize64);
addDefinition(code, "ACCESSES", ETHASH_ACCESSES);
addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
addDefinition(code, "PLATFORM", platformId);
@ -430,16 +437,19 @@ bool ethash_cl_miner::init(
// create buffer for dag
try
{
ETHCL_LOG("Creating one big buffer for the DAG");
m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize);
ETHCL_LOG("Loading single big chunk kernels");
ETHCL_LOG("Creating cache buffer");
m_light = cl::Buffer(m_context, CL_MEM_READ_ONLY, _lightSize);
ETHCL_LOG("Creating DAG buffer");
m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, dagSize);
ETHCL_LOG("Loading kernels");
m_searchKernel = cl::Kernel(program, "ethash_search");
ETHCL_LOG("Mapping one big chunk.");
m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
m_dagKernel = cl::Kernel(program, "ethash_calculate_dag_item");
ETHCL_LOG("Writing cache buffer");
m_queue.enqueueWriteBuffer(m_light, CL_TRUE, 0, _lightSize, _lightData);
}
catch (cl::Error const& err)
{
ETHCL_LOG("Allocating/mapping single buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing.");
ETHCL_LOG("Allocating/mapping DAG buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing.");
return false;
}
// create buffer for header
@ -456,6 +466,28 @@ bool ethash_cl_miner::init(
ETHCL_LOG("Creating mining buffer " << i);
m_searchBuffer[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_maxSearchResults + 1) * sizeof(uint32_t));
}
ETHCL_LOG("Generating DAG data");
uint32_t const work = (uint32_t)(dagSize / sizeof(node));
//while (work < blocks * threads) blocks /= 2;
uint32_t fullRuns = work / m_globalWorkSize;
uint32_t const restWork = work % m_globalWorkSize;
if (restWork > 0) fullRuns++;
m_dagKernel.setArg(1, m_light);
m_dagKernel.setArg(2, m_dag);
m_dagKernel.setArg(3, ~0u);
for (uint32_t i = 0; i < fullRuns; i++)
{
m_dagKernel.setArg(0, i * m_globalWorkSize);
m_queue.enqueueNDRangeKernel(m_dagKernel, cl::NullRange, m_globalWorkSize, s_workgroupSize);
m_queue.finish();
printf("%.0f%%\n", 100.0f * (float)i / (float)fullRuns);
}
}
catch (cl::Error const& err)
{

15
libethash-cl/ethash_cl_miner.h

@ -51,12 +51,13 @@ public:
uint64_t _currentBlock
);
bool init(
uint8_t const* _dag,
uint64_t _dagSize,
unsigned _platformId = 0,
unsigned _deviceId = 0
);
bool ethash_cl_miner::init(
ethash_light_t _light,
uint8_t const* _lightData,
uint64_t _lightSize,
unsigned _platformId,
unsigned _deviceId
);
void finish();
void search(uint8_t const* _header, uint64_t _target, search_hook& _hook);
@ -74,7 +75,9 @@ private:
cl::Context m_context;
cl::CommandQueue m_queue;
cl::Kernel m_searchKernel;
cl::Kernel m_dagKernel;
cl::Buffer m_dag;
cl::Buffer m_light;
cl::Buffer m_header;
cl::Buffer m_searchBuffer[c_bufferCount];
unsigned m_globalWorkSize;

51
libethash-cl/ethash_cl_miner_kernel.cl

@ -1,11 +1,12 @@
#define OPENCL_PLATFORM_UNKNOWN 0
#define OPENCL_PLATFORM_NVIDIA 1
#define OPENCL_PLATFORM_AMD 2
#define OPENCL_PLATFORM_AMD 2
#define ETHASH_DATASET_PARENTS 256
#define NODE_WORDS (64/4)
#define THREADS_PER_HASH (128 / 16)
#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
#define FNV_PRIME 0x01000193
__constant uint2 const Keccak_f1600_RC[24] = {
@ -176,8 +177,6 @@ static void keccak_f1600_round(uint2* a, uint r)
static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate)
{
// Originally I unrolled the first and last rounds to interface
// better with surrounding code, however I haven't done this
// without causing the AMD compiler to blow up the VGPR usage.
@ -227,6 +226,18 @@ typedef struct
ulong ulongs[32 / sizeof(ulong)];
} hash32_t;
typedef union {
uint words[64 / sizeof(uint)];
uint2 uint2s[64 / sizeof(uint2)];
uint4 uint4s[64 / sizeof(uint4)];
} hash64_t;
typedef union {
uint words[200 / sizeof(uint)];
uint2 uint2s[200 / sizeof(uint2)];
uint4 uint4s[200 / sizeof(uint4)];
} hash200_t;
typedef struct
{
uint4 uint4s[128 / sizeof(uint4)];
@ -334,3 +345,35 @@ __kernel void ethash_search(
g_output[slot] = gid;
}
}
static void SHA3_512(uint2* s, uint isolate)
{
for (uint i = 8; i != 25; ++i)
{
s[i] = (uint2){ 0, 0 };
}
s[8].x = 0x00000001;
s[8].y = 0x80000000;
keccak_f1600_no_absorb(s, 8, isolate);
}
__kernel void ethash_calculate_dag_item(uint start, __global hash64_t const* g_light, __global hash64_t * g_dag, uint isolate)
{
uint const node_index = start + get_global_id(0);
if (node_index > DAG_SIZE * 2) return;
hash200_t dag_node;
copy(dag_node.uint4s, g_light[node_index % LIGHT_SIZE].uint4s, 4);
dag_node.words[0] ^= node_index;
SHA3_512(dag_node.uint2s, isolate);
for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
uint parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % LIGHT_SIZE;
for (uint w = 0; w != 4; ++w) {
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
}
}
SHA3_512(dag_node.uint2s, isolate);
copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
}

4
libethash-cuda/dagger_shared.cuh

@ -1,9 +1,5 @@
#include "ethash_cuda_miner_kernel_globals.h"
#include "ethash_cuda_miner_kernel.h"
#include "keccak_u64.cuh"
#include "fnv.cuh"
#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
typedef union {
uint4 uint4s[4];

2
libethash-cuda/dagger_shuffled.cuh

@ -1,7 +1,5 @@
#include "ethash_cuda_miner_kernel_globals.h"
#include "ethash_cuda_miner_kernel.h"
#include "keccak.cuh"
#include "fnv.cuh"
#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)

23
libethash-cuda/ethash_cuda_miner.cpp

@ -199,7 +199,7 @@ void ethash_cuda_miner::finish()
CUDA_SAFE_CALL(cudaDeviceReset());
}
bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId)
{
try
{
@ -224,25 +224,38 @@ bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _d
m_search_buf = new volatile uint32_t *[s_numStreams];
m_streams = new cudaStream_t[s_numStreams];
uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
uint64_t dagSize = ethash_get_datasize(_light->block_number);
uint32_t dagSize128 = (unsigned)(dagSize / ETHASH_MIX_BYTES);
uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
// create buffer for cache
hash64_t * light;
CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
// copy dag to CPU.
CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
// create buffer for dag
hash128_t * dag;
CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), _dagSize));
CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
// copy dag to CPU.
CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));
//CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));
// create mining buffers
for (unsigned i = 0; i != s_numStreams; ++i)
{
CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
}
set_constants(dag, dagSize128);
set_constants(dag, dagSize128, light, lightSize64);
memset(&m_current_header, 0, sizeof(hash32_t));
m_current_target = 0;
m_current_nonce = 0;
m_current_index = 0;
cout << "Generating DAG..." << endl;
ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0]);
return true;
}
catch (runtime_error)

6
libethash-cuda/ethash_cuda_miner.h

@ -34,11 +34,13 @@ public:
unsigned _scheduleFlag,
uint64_t _currentBlock
);
bool init(
/*bool init(
uint8_t const* _dag,
uint64_t _dagSize,
unsigned _deviceId = 0
);
);*/
bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId);
void finish();
void search(uint8_t const* header, uint64_t target, search_hook& hook);

79
libethash-cuda/ethash_cuda_miner_kernel.cu

@ -8,13 +8,19 @@
#include "ethash_cuda_miner_kernel_globals.h"
#include "cuda_helper.h"
#include "fnv.cuh"
#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
#define SHUFFLE_MIN_VER 300
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
#include "keccak_u64.cuh"
#include "dagger_shared.cuh"
#define TPB 128
#define BPSM 4
#else
#include "keccak.cuh"
#include "dagger_shuffled.cuh"
#define TPB 896
#define BPSM 1
@ -43,17 +49,86 @@ void run_ethash_search(
uint64_t start_nonce
)
{
ethash_search << <blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce);
ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce);
CUDA_SAFE_CALL(cudaGetLastError());
}
#define ETHASH_DATASET_PARENTS 256
#define NODE_WORDS (64/4)
__global__ void
__launch_bounds__(128, 7)
ethash_calculate_dag_item(uint32_t start)
{
uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
if (node_index > d_dag_size * 2) return;
hash200_t dag_node;
copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
dag_node.words[0] ^= node_index;
SHA3_512(dag_node.uint2s);
const int thread_id = threadIdx.x & 3;
for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
/* fix this some time. or not.
for (uint32_t t = 0; t < 4; t++) {
uint32_t shuffle_index = __shfl(parent_index, t, 4);
uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
if (t == thread_id) {
for (uint32_t w = 0; w < 4; w++) {
uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
}
}
}
*/
for (unsigned w = 0; w != 4; ++w) {
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
}
}
SHA3_512(dag_node.uint2s);
hash64_t * dag_nodes = (hash64_t *)d_dag;
copy(dag_nodes[node_index].uint4s, dag_node.uint4s, 4);
}
void ethash_generate_dag(
uint64_t dag_size,
uint32_t blocks,
uint32_t threads,
cudaStream_t stream
)
{
uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t));
//while (work < blocks * threads) blocks /= 2;
uint32_t fullRuns = work / (blocks * threads);
uint32_t const restWork = work % (blocks * threads);
if (restWork > 0) fullRuns++;
for (uint32_t i = 0; i < fullRuns; i++)
{
ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
printf("%.0f%%\n",100.0f * (float)i / (float)fullRuns);
}
CUDA_SAFE_CALL(cudaGetLastError());
}
void set_constants(
hash128_t* _dag,
uint32_t _dag_size
uint32_t _dag_size,
hash64_t * _light,
uint32_t _light_size
)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *)));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t *)));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
}
void set_header(

22
libethash-cuda/ethash_cuda_miner_kernel.h

@ -19,10 +19,23 @@ typedef struct
uint4 uint4s[128 / sizeof(uint4)];
} hash128_t;
typedef union {
uint32_t words[64 / sizeof(uint32_t)];
uint2 uint2s[64 / sizeof(uint2)];
uint4 uint4s[64 / sizeof(uint4)];
} hash64_t;
typedef union {
uint32_t words[200 / sizeof(uint32_t)];
uint2 uint2s[200 / sizeof(uint2)];
uint4 uint4s[200 / sizeof(uint4)];
} hash200_t;
void set_constants(
hash128_t* _dag,
uint32_t _dag_size
uint32_t _dag_size,
hash64_t * _light,
uint32_t _light_size
);
void set_header(
@ -41,6 +54,13 @@ void run_ethash_search(
uint64_t start_nonce
);
void ethash_generate_dag(
uint64_t dag_size,
uint32_t blocks,
uint32_t threads,
cudaStream_t stream
);
#define CUDA_SAFE_CALL(call) \
do { \

2
libethash-cuda/ethash_cuda_miner_kernel_globals.h

@ -5,6 +5,8 @@
__constant__ uint32_t d_dag_size;
__constant__ hash128_t* d_dag;
__constant__ uint32_t d_light_size;
__constant__ hash64_t* d_light;
__constant__ hash32_t d_header;
__constant__ uint64_t d_target;

4
libethash-cuda/fnv.cuh

@ -1,3 +1,4 @@
#define FNV_PRIME 0x01000193
#define fnv(x,y) ((x) * FNV_PRIME ^(y))
@ -15,4 +16,5 @@ __device__ uint4 fnv4(uint4 a, uint4 b)
__device__ uint32_t fnv_reduce(uint4 v)
{
return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
}
}

186
libethash-cuda/keccak.cuh

@ -588,4 +588,190 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* s)
/* iota: a[0,0] ^= round constant */
//s[0] ^= vectorize(keccak_round_constants[23]);
return devectorize(s[0]) ^ keccak_round_constants[23];
}
__device__ __forceinline__ void SHA3_512(uint2* s) {
uint2 t[5], u, v;
for (uint32_t i = 8; i < 25; i++)
{
s[i] = make_uint2(0, 0);
}
s[8].x = 1;
s[8].y = 0x80000000;
for (int i = 0; i < 23; i++)
{
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
u = ROL2(t[1], 1);
s[0] = xor3(s[0], t[4], u);
s[5] = xor3(s[5], t[4], u);
s[10] = xor3(s[10], t[4], u);
s[15] = xor3(s[15], t[4], u);
s[20] = xor3(s[20], t[4], u);
u = ROL2(t[2], 1);
s[1] = xor3(s[1], t[0], u);
s[6] = xor3(s[6], t[0], u);
s[11] = xor3(s[11], t[0], u);
s[16] = xor3(s[16], t[0], u);
s[21] = xor3(s[21], t[0], u);
u = ROL2(t[3], 1);
s[2] = xor3(s[2], t[1], u);
s[7] = xor3(s[7], t[1], u);
s[12] = xor3(s[12], t[1], u);
s[17] = xor3(s[17], t[1], u);
s[22] = xor3(s[22], t[1], u);
u = ROL2(t[4], 1);
s[3] = xor3(s[3], t[2], u);
s[8] = xor3(s[8], t[2], u);
s[13] = xor3(s[13], t[2], u);
s[18] = xor3(s[18], t[2], u);
s[23] = xor3(s[23], t[2], u);
u = ROL2(t[0], 1);
s[4] = xor3(s[4], t[3], u);
s[9] = xor3(s[9], t[3], u);
s[14] = xor3(s[14], t[3], u);
s[19] = xor3(s[19], t[3], u);
s[24] = xor3(s[24], t[3], u);
/* rho pi: b[..] = rotl(a[..], ..) */
u = s[1];
s[1] = ROL2(s[6], 44);
s[6] = ROL2(s[9], 20);
s[9] = ROL2(s[22], 61);
s[22] = ROL2(s[14], 39);
s[14] = ROL2(s[20], 18);
s[20] = ROL2(s[2], 62);
s[2] = ROL2(s[12], 43);
s[12] = ROL2(s[13], 25);
s[13] = ROL2(s[19], 8);
s[19] = ROL2(s[23], 56);
s[23] = ROL2(s[15], 41);
s[15] = ROL2(s[4], 27);
s[4] = ROL2(s[24], 14);
s[24] = ROL2(s[21], 2);
s[21] = ROL2(s[8], 55);
s[8] = ROL2(s[16], 45);
s[16] = ROL2(s[5], 36);
s[5] = ROL2(s[3], 28);
s[3] = ROL2(s[18], 21);
s[18] = ROL2(s[17], 15);
s[17] = ROL2(s[11], 10);
s[11] = ROL2(s[7], 6);
s[7] = ROL2(s[10], 3);
s[10] = ROL2(u, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
u = s[0]; v = s[1];
s[0] = chi(s[0], s[1], s[2]);
s[1] = chi(s[1], s[2], s[3]);
s[2] = chi(s[2], s[3], s[4]);
s[3] = chi(s[3], s[4], u);
s[4] = chi(s[4], u, v);
u = s[5]; v = s[6];
s[5] = chi(s[5], s[6], s[7]);
s[6] = chi(s[6], s[7], s[8]);
s[7] = chi(s[7], s[8], s[9]);
s[8] = chi(s[8], s[9], u);
s[9] = chi(s[9], u, v);
u = s[10]; v = s[11];
s[10] = chi(s[10], s[11], s[12]);
s[11] = chi(s[11], s[12], s[13]);
s[12] = chi(s[12], s[13], s[14]);
s[13] = chi(s[13], s[14], u);
s[14] = chi(s[14], u, v);
u = s[15]; v = s[16];
s[15] = chi(s[15], s[16], s[17]);
s[16] = chi(s[16], s[17], s[18]);
s[17] = chi(s[17], s[18], s[19]);
s[18] = chi(s[18], s[19], u);
s[19] = chi(s[19], u, v);
u = s[20]; v = s[21];
s[20] = chi(s[20], s[21], s[22]);
s[21] = chi(s[21], s[22], s[23]);
s[22] = chi(s[22], s[23], s[24]);
s[23] = chi(s[23], s[24], u);
s[24] = chi(s[24], u, v);
/* iota: a[0,0] ^= round constant */
s[0] ^= vectorize(keccak_round_constants[i]);
}
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
u = ROL2(t[1], 1);
s[0] = xor3(s[0], t[4], u);
s[10] = xor3(s[10], t[4], u);
u = ROL2(t[2], 1);
s[6] = xor3(s[6], t[0], u);
s[16] = xor3(s[16], t[0], u);
u = ROL2(t[3], 1);
s[12] = xor3(s[12], t[1], u);
s[22] = xor3(s[22], t[1], u);
u = ROL2(t[4], 1);
s[3] = xor3(s[3], t[2], u);
s[18] = xor3(s[18], t[2], u);
u = ROL2(t[0], 1);
s[9] = xor3(s[9], t[3], u);
s[24] = xor3(s[24], t[3], u);
/* rho pi: b[..] = rotl(a[..], ..) */
u = s[1];
s[1] = ROL2(s[6], 44);
s[6] = ROL2(s[9], 20);
s[9] = ROL2(s[22], 61);
s[2] = ROL2(s[12], 43);
s[4] = ROL2(s[24], 14);
s[8] = ROL2(s[16], 45);
s[5] = ROL2(s[3], 28);
s[3] = ROL2(s[18], 21);
s[7] = ROL2(s[10], 3);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
u = s[0]; v = s[1];
s[0] = chi(s[0], s[1], s[2]);
s[1] = chi(s[1], s[2], s[3]);
s[2] = chi(s[2], s[3], s[4]);
s[3] = chi(s[3], s[4], u);
s[4] = chi(s[4], u, v);
s[5] = chi(s[5], s[6], s[7]);
s[6] = chi(s[6], s[7], s[8]);
s[7] = chi(s[7], s[8], s[9]);
/* iota: a[0,0] ^= round constant */
s[0] ^= vectorize(keccak_round_constants[23]);
}

188
libethash-cuda/keccak_u64.cuh

@ -586,4 +586,192 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint64_t* s)
/* iota: a[0,0] ^= round constant */
//s[0] ^= vectorize(keccak_round_constants[23]);
return s[0] ^ keccak_round_constants[23];
}
__device__ __forceinline__ void SHA3_512(uint2* s2) {
uint64_t * s = (uint64_t*)s2; //dirty
uint64_t t[5], u, v;
for (uint32_t i = 9; i < 25; i++)
{
s[i] = 0;
}
s[8] = 0x8000000000000001;
for (int i = 0; i < 23; i++)
{
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
u = ROTL64(t[1], 1);
s[0] = xor3(s[0], t[4], u);
s[5] = xor3(s[5], t[4], u);
s[10] = xor3(s[10], t[4], u);
s[15] = xor3(s[15], t[4], u);
s[20] = xor3(s[20], t[4], u);
u = ROTL64(t[2], 1);
s[1] = xor3(s[1], t[0], u);
s[6] = xor3(s[6], t[0], u);
s[11] = xor3(s[11], t[0], u);
s[16] = xor3(s[16], t[0], u);
s[21] = xor3(s[21], t[0], u);
u = ROTL64(t[3], 1);
s[2] = xor3(s[2], t[1], u);
s[7] = xor3(s[7], t[1], u);
s[12] = xor3(s[12], t[1], u);
s[17] = xor3(s[17], t[1], u);
s[22] = xor3(s[22], t[1], u);
u = ROTL64(t[4], 1);
s[3] = xor3(s[3], t[2], u);
s[8] = xor3(s[8], t[2], u);
s[13] = xor3(s[13], t[2], u);
s[18] = xor3(s[18], t[2], u);
s[23] = xor3(s[23], t[2], u);
u = ROTL64(t[0], 1);
s[4] = xor3(s[4], t[3], u);
s[9] = xor3(s[9], t[3], u);
s[14] = xor3(s[14], t[3], u);
s[19] = xor3(s[19], t[3], u);
s[24] = xor3(s[24], t[3], u);
/* rho pi: b[..] = rotl(a[..], ..) */
u = s[1];
s[1] = ROTL64(s[6], 44);
s[6] = ROTL64(s[9], 20);
s[9] = ROTL64(s[22], 61);
s[22] = ROTL64(s[14], 39);
s[14] = ROTL64(s[20], 18);
s[20] = ROTL64(s[2], 62);
s[2] = ROTL64(s[12], 43);
s[12] = ROTL64(s[13], 25);
s[13] = ROTL64(s[19], 8);
s[19] = ROTL64(s[23], 56);
s[23] = ROTL64(s[15], 41);
s[15] = ROTL64(s[4], 27);
s[4] = ROTL64(s[24], 14);
s[24] = ROTL64(s[21], 2);
s[21] = ROTL64(s[8], 55);
s[8] = ROTL64(s[16], 45);
s[16] = ROTL64(s[5], 36);
s[5] = ROTL64(s[3], 28);
s[3] = ROTL64(s[18], 21);
s[18] = ROTL64(s[17], 15);
s[17] = ROTL64(s[11], 10);
s[11] = ROTL64(s[7], 6);
s[7] = ROTL64(s[10], 3);
s[10] = ROTL64(u, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
u = s[0]; v = s[1];
s[0] = chi(s[0], s[1], s[2]);
s[1] = chi(s[1], s[2], s[3]);
s[2] = chi(s[2], s[3], s[4]);
s[3] = chi(s[3], s[4], u);
s[4] = chi(s[4], u, v);
u = s[5]; v = s[6];
s[5] = chi(s[5], s[6], s[7]);
s[6] = chi(s[6], s[7], s[8]);
s[7] = chi(s[7], s[8], s[9]);
s[8] = chi(s[8], s[9], u);
s[9] = chi(s[9], u, v);
u = s[10]; v = s[11];
s[10] = chi(s[10], s[11], s[12]);
s[11] = chi(s[11], s[12], s[13]);
s[12] = chi(s[12], s[13], s[14]);
s[13] = chi(s[13], s[14], u);
s[14] = chi(s[14], u, v);
u = s[15]; v = s[16];
s[15] = chi(s[15], s[16], s[17]);
s[16] = chi(s[16], s[17], s[18]);
s[17] = chi(s[17], s[18], s[19]);
s[18] = chi(s[18], s[19], u);
s[19] = chi(s[19], u, v);
u = s[20]; v = s[21];
s[20] = chi(s[20], s[21], s[22]);
s[21] = chi(s[21], s[22], s[23]);
s[22] = chi(s[22], s[23], s[24]);
s[23] = chi(s[23], s[24], u);
s[24] = chi(s[24], u, v);
/* iota: a[0,0] ^= round constant */
s[0] ^= keccak_round_constants[i];
}
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
u = ROTL64(t[1], 1);
s[0] = xor3(s[0], t[4], u);
s[10] = xor3(s[10], t[4], u);
u = ROTL64(t[2], 1);
s[6] = xor3(s[6], t[0], u);
s[16] = xor3(s[16], t[0], u);
u = ROTL64(t[3], 1);
s[12] = xor3(s[12], t[1], u);
s[22] = xor3(s[22], t[1], u);
u = ROTL64(t[4], 1);
s[3] = xor3(s[3], t[2], u);
s[18] = xor3(s[18], t[2], u);
u = ROTL64(t[0], 1);
s[9] = xor3(s[9], t[3], u);
s[24] = xor3(s[24], t[3], u);
/* rho pi: b[..] = rotl(a[..], ..) */
u = s[1];
s[1] = ROTL64(s[6], 44);
s[6] = ROTL64(s[9], 20);
s[9] = ROTL64(s[22], 61);
s[2] = ROTL64(s[12], 43);
s[4] = ROTL64(s[24], 14);
s[8] = ROTL64(s[16], 45);
s[5] = ROTL64(s[3], 28);
s[3] = ROTL64(s[18], 21);
s[7] = ROTL64(s[10], 3);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
u = s[0]; v = s[1];
s[0] = chi(s[0], s[1], s[2]);
s[1] = chi(s[1], s[2], s[3]);
s[2] = chi(s[2], s[3], s[4]);
s[3] = chi(s[3], s[4], u);
s[4] = chi(s[4], u, v);
s[5] = chi(s[5], s[6], s[7]);
s[6] = chi(s[6], s[7], s[8]);
s[7] = chi(s[7], s[8], s[9]);
/* iota: a[0,0] ^= round constant */
s[0] ^= keccak_round_constants[23];
}

25
libethcore/EthashCUDAMiner.cpp

@ -112,16 +112,8 @@ EthashCUDAMiner::EthashCUDAMiner(ConstructionInfo const& _ci) :
Worker("cudaminer" + toString(index())),
m_hook( new EthashCUDAHook(this))
{
/*
#if defined(WIN32)
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
int num_cpus = sysinfo.dwNumberOfProcessors;
SetThreadAffinityMask(GetCurrentThread(), 1 << (index() % num_cpus));
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
#endif
*/
}
EthashCUDAMiner::~EthashCUDAMiner()
{
pause();
@ -149,8 +141,7 @@ void EthashCUDAMiner::workLoop()
// take local copy of work since it may end up being overwritten by kickOff/pause.
try {
WorkPackage w = work();
//cnote << "seedhash" << "#" + m_minerSeed.hex().substr(0, 16);
cnote << "set work to" << "#" + w.headerHash.hex().substr(0, 8) + ", target " << "#" + w.boundary.hex().substr(0, 16);
cnote << "set work; seed: " << "#" + w.seedHash.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 16);
if (!m_miner || m_minerSeed != w.seedHash)
{
cnote << "Initialising miner...";
@ -161,7 +152,9 @@ void EthashCUDAMiner::workLoop()
unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index();
/*
EthashAux::FullType dag;
while (true)
{
if ((dag = EthashAux::full(w.seedHash, true)))
@ -175,8 +168,14 @@ void EthashCUDAMiner::workLoop()
cnote << "Awaiting DAG";
this_thread::sleep_for(chrono::milliseconds(500));
}
bytesConstRef dagData = dag->data();
m_miner->init(dagData.data(), dagData.size(), device);
*/
EthashAux::LightType light;
light = EthashAux::light(w.seedHash);
//bytesConstRef dagData = dag->data();
bytesConstRef lightData = light->data();
m_miner->init(light->light, lightData.data(), lightData.size(), device);
}
uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);

10
libethcore/EthashGPUMiner.cpp

@ -150,6 +150,7 @@ void EthashGPUMiner::workLoop()
unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index();
/*
EthashAux::FullType dag;
while (true)
{
@ -164,8 +165,13 @@ void EthashGPUMiner::workLoop()
cnote << "Awaiting DAG";
this_thread::sleep_for(chrono::milliseconds(500));
}
bytesConstRef dagData = dag->data();
m_miner->init(dagData.data(), dagData.size(), s_platformId, device);
*/
EthashAux::LightType light;
light = EthashAux::light(w.seedHash);
bytesConstRef lightData = light->data();
m_miner->init(light->light, lightData.data(), lightData.size(), s_platformId, device);
}
uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);

4
libstratum/EthStratumClient.cpp

@ -305,7 +305,7 @@ void EthStratumClient::processReponse(Json::Value& responseObject)
h256 headerHash = h256(sHeaderHash);
EthashAux::FullType dag;
/*
if (seedHash != m_current.seedHash)
{
cnote << "Grabbing DAG for" << seedHash;
@ -318,6 +318,8 @@ void EthStratumClient::processReponse(Json::Value& responseObject)
{
EthashAux::computeFull(sha3(seedHash), true);
}
*/
if (headerHash != m_current.headerHash)
{
x_current.lock();

Loading…
Cancel
Save