
improved error handling and usage of constant memory space

Branch: cl-refactor
Author: RoBiK, 10 years ago
Commit: d8c8582f94
7 changed files:

  libethash-cuda/dagger_shared.cuh                   |  19
  libethash-cuda/dagger_shuffled.cuh                 |  12
  libethash-cuda/ethash_cuda_miner.cpp               | 172
  libethash-cuda/ethash_cuda_miner.h                 |  11
  libethash-cuda/ethash_cuda_miner_kernel.cu         |  53
  libethash-cuda/ethash_cuda_miner_kernel.h          |  39
  libethash-cuda/ethash_cuda_miner_kernel_globals.h  |   4
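
Taken together, the commit does two things: it replaces scattered cudaError_t checks with a CUDA_SAFE_CALL macro that throws std::runtime_error, and it moves the header, target, DAG pointer and DAG size out of the kernel argument lists into __constant__ device memory that the host sets once. A minimal sketch of the two patterns combined; the check kernel and main() scaffolding are illustrative and not part of the commit:

    #include <cstdio>
    #include <stdint.h>
    #include <stdexcept>
    #include <cuda_runtime.h>

    // Throwing wrapper, as introduced in ethash_cuda_miner_kernel.h below.
    #define CUDA_SAFE_CALL(call)                               \
    do {                                                       \
        cudaError_t err = call;                                \
        if (cudaSuccess != err)                                \
            throw std::runtime_error(cudaGetErrorString(err)); \
    } while (0)

    // Constant-memory parameter, as in ethash_cuda_miner_kernel_globals.h.
    __constant__ uint64_t d_target;

    __global__ void check(uint64_t* out)
    {
        // Kernels read d_target directly; no pointer argument is passed.
        out[threadIdx.x] = d_target;
    }

    int main()
    {
        try
        {
            uint64_t target = 0x00000000ffffffffULL;
            uint64_t* out;
            CUDA_SAFE_CALL(cudaMalloc(&out, sizeof(uint64_t)));
            CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &target, sizeof(target)));
            check<<<1, 1>>>(out);
            CUDA_SAFE_CALL(cudaGetLastError());
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            CUDA_SAFE_CALL(cudaFree(out));
        }
        catch (std::runtime_error const& e)
        {
            std::fprintf(stderr, "CUDA error: %s\n", e.what());
            return 1;
        }
        return 0;
    }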

libethash-cuda/dagger_shared.cuh (19 changed lines)

@@ -9,14 +9,17 @@ typedef union
     hash32_t mix;
 } compute_hash_share;
 
-__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
+__device__ hash64_t init_hash(uint64_t nonce)
 {
     hash64_t init;
 
     // sha3_512(header .. nonce)
     uint64_t state[25];
-    copy(state, header->uint64s, 4);
+    state[0] = d_header.uint64s[0];
+    state[1] = d_header.uint64s[1];
+    state[2] = d_header.uint64s[2];
+    state[3] = d_header.uint64s[3];
     state[4] = nonce;
     state[5] = 0x0000000000000001;
     state[6] = 0;
@@ -32,7 +35,7 @@ __device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
     return init;
 }
 
-__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
+__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share)
 {
     // share init0
     if (thread_id == 0)
@@ -59,9 +62,9 @@ __device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
         __threadfence_block();
 
 #if __CUDA_ARCH__ >= 350
-        mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id]));
+        mix = fnv4(mix, __ldg((&d_dag[*share])->uint4s + thread_id));
 #else
-        mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+        mix = fnv4(mix, (&d_dag[*share])->uint4s[thread_id]);
 #endif
     }
@@ -99,15 +102,13 @@ __device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
 }
 
 __device__ hash32_t compute_hash(
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
     uint64_t nonce
     )
 {
     extern __shared__ compute_hash_share share[];
 
     // Compute one init hash per work item.
-    hash64_t init = init_hash(g_header, nonce);
+    hash64_t init = init_hash(nonce);
 
     // Threads work together in this phase in groups of 8.
     uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
@@ -123,7 +124,7 @@ __device__ hash32_t compute_hash(
     uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3];
 
-    uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag);
+    uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s);
 
     share[hash_id].mix.uint32s[thread_id] = thread_mix;
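
The four d_header.uint64s reads above and the d_header.uint2s reads in dagger_shuffled.cuh below touch the same 32 constant-memory bytes: hash32_t is a union over the header, so each kernel picks the lane width that fits its Keccak state layout. A sketch of that layout, with member names as they appear in the diff:

    #include <stdint.h>
    #include <cuda_runtime.h>

    // 32-byte header exposed through differently sized lanes.
    typedef union
    {
        uint32_t uint32s[32 / sizeof(uint32_t)];  // 8 x 32-bit words
        uint64_t uint64s[32 / sizeof(uint64_t)];  // 4 x 64-bit words
        uint2    uint2s[32 / sizeof(uint2)];      // 4 x 2x32-bit pairs
        uint4    uint4s[32 / sizeof(uint4)];      // 2 x 4x32-bit quads
    } hash32_t;

    __constant__ hash32_t d_header;

    // Both views read the same bytes; constant memory is cached and
    // broadcast, which suits a value every thread reads identically.
    __global__ void absorb(uint64_t* state)
    {
        state[0] = d_header.uint64s[0];          // dagger_shared view
        uint2 w = d_header.uint2s[1];            // dagger_shuffled view
        state[1] = ((uint64_t)w.y << 32) | w.x;
    }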

libethash-cuda/dagger_shuffled.cuh (12 changed lines)

@@ -4,18 +4,16 @@
 #include "dagger.cuh"
 
 __device__ uint64_t compute_hash_shuffle(
-    uint2 const* g_header,
-    hash128_t const* g_dag,
     uint64_t nonce
     )
 {
     // sha3_512(header .. nonce)
     uint2 state[25];
-    state[0] = g_header[0];
-    state[1] = g_header[1];
-    state[2] = g_header[2];
-    state[3] = g_header[3];
+    state[0] = d_header.uint2s[0];
+    state[1] = d_header.uint2s[1];
+    state[2] = d_header.uint2s[2];
+    state[3] = d_header.uint2s[3];
     state[4] = vectorize(nonce);
     state[5] = vectorize(0x0000000000000001ULL);
     for (uint32_t i = 6; i < 25; i++)
@@ -69,7 +67,7 @@ __device__ uint64_t compute_hash_shuffle(
         }
         shuffle[0].x = __shfl(shuffle[0].x, start_lane + t);
-        mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]);
+        mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
     }
 }
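
In the shuffled variant each 8-thread group keeps its state in registers, so the DAG index consumed by the d_dag lookup above is broadcast from one lane with __shfl instead of staged through shared memory. A minimal broadcast sketch using the pre-CUDA-9 __shfl that the diff targets; CUDA 9 and later would use __shfl_sync with a lane mask:

    #include <stdint.h>

    __global__ void broadcast8(uint32_t* out)
    {
        uint32_t lane = threadIdx.x & 31;    // lane within the warp
        uint32_t group_start = lane & ~7u;   // first lane of this 8-lane group
        uint32_t value = 100 + lane;         // per-lane register value
        // Every lane in the group reads group_start's copy of 'value';
        // lanes 0-7 see 100, lanes 8-15 see 108, and so on.
        out[threadIdx.x] = __shfl(value, group_start);
    }
    // launch: broadcast8<<<1, 32>>>(out); with out sized for 32 uint32_t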

libethash-cuda/ethash_cuda_miner.cpp (172 changed lines)

@@ -71,16 +71,6 @@ static std::atomic_flag s_logSpin = ATOMIC_FLAG_INIT;
 #define ETHCUDA_LOG(_contents) cout << "[CUDA]:" << _contents << endl
 #endif
 
-#define CUDA_SAFE_CALL(call) \
-do { \
-    cudaError_t err = call; \
-    if (cudaSuccess != err) { \
-        fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
-            __FUNCTION__, __LINE__, cudaGetErrorString(err)); \
-        exit(EXIT_FAILURE); \
-    } \
-} while (0)
-
 ethash_cuda_miner::search_hook::~search_hook() {}
 
 ethash_cuda_miner::ethash_cuda_miner()
@@ -114,7 +104,6 @@ std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
     sprintf(compute, "%d.%d", device_props.major, device_props.minor);
     return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + std::string(device_props.name) + "\", \"version\": \"Compute " + std::string(compute) + "\" }";
 }
-
 unsigned ethash_cuda_miner::getNumDevices()
@@ -133,6 +122,8 @@ bool ethash_cuda_miner::configureGPU(
     unsigned _scheduleFlag,
     uint64_t _currentBlock
     )
+{
+    try
     {
         s_blockSize = _blockSize;
         s_gridSize = _gridSize;
@@ -143,7 +134,8 @@ bool ethash_cuda_miner::configureGPU(
         // by default let's only consider the DAG of the first epoch
         uint64_t dagSize = ethash_get_datasize(_currentBlock);
         uint64_t requiredSize = dagSize + _extraGPUMemory;
-        for (unsigned int i = 0; i < getNumDevices(); i++)
+        unsigned devicesCount = getNumDevices();
+        for (unsigned int i = 0; i < devicesCount; i++)
         {
             if (_devices[i] != -1)
             {
@@ -169,6 +161,11 @@ bool ethash_cuda_miner::configureGPU(
         }
         return true;
     }
+    catch (runtime_error)
+    {
+        return false;
+    }
+}
 
 unsigned ethash_cuda_miner::s_extraRequiredGPUMem;
 unsigned ethash_cuda_miner::s_blockSize = ethash_cuda_miner::c_defaultBlockSize;
@@ -193,14 +190,12 @@ void ethash_cuda_miner::listDevices()
 void ethash_cuda_miner::finish()
 {
-    for (unsigned i = 0; i != s_numStreams; i++) {
-        cudaStreamDestroy(m_streams[i]);
-        m_streams[i] = 0;
-    }
-    cudaDeviceReset();
+    CUDA_SAFE_CALL(cudaDeviceReset());
 }
 
 bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
+{
+    try
     {
         int device_count = getNumDevices();
@@ -211,124 +206,93 @@ bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
         int device_num = std::min<int>((int)_deviceId, device_count - 1);
 
         cudaDeviceProp device_props;
-        if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
-        {
-            cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
-            return false;
-        }
+        CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));
 
         cout << "Using device: " << device_props.name << " (Compute " << device_props.major << "." << device_props.minor << ")" << endl;
 
-        cudaError_t r = cudaSetDevice(device_num);
-        if (r != cudaSuccess)
-        {
-            cout << cudaGetErrorString(r) << endl;
-            return false;
-        }
-        cudaDeviceReset();
-        cudaSetDeviceFlags(s_scheduleFlag);
-        cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+        CUDA_SAFE_CALL(cudaSetDevice(device_num));
+        CUDA_SAFE_CALL(cudaDeviceReset());
+        CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+        CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 
-        m_search_buf = new uint32_t *[s_numStreams];
+        m_search_buf = new volatile uint32_t *[s_numStreams];
         m_streams = new cudaStream_t[s_numStreams];
 
-        // patch source code
-        cudaError result;
         uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
-        unsigned max_outputs = c_max_search_results;
-        result = set_constants(&dagSize128, &max_outputs);
 
         // create buffer for dag
-        result = cudaMalloc(&m_dag_ptr, _dagSize);
+        hash128_t * dag;
+        CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), _dagSize));
 
-        // create buffer for header256
-        result = cudaMalloc(&m_header, 32);
-
-        // copy dag to GPU
-        result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice);
+        // copy dag to GPU
+        CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));
 
         // create mining buffers
         for (unsigned i = 0; i != s_numStreams; ++i)
        {
-            result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t));
-            result = cudaStreamCreate(&m_streams[i]);
+            CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+            CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
         }
-        if (result != cudaSuccess)
-        {
-            cout << cudaGetErrorString(result) << endl;
-            return false;
-        }
-        return true;
-    }
+        set_constants(dag, dagSize128);
+        memset(&m_current_header, 0, sizeof(hash32_t));
+        m_current_target = 0;
+        m_current_nonce = 0;
+        m_current_index = 0;
+        return true;
+    }
+    catch (runtime_error)
+    {
+        return false;
+    }
 }
 
 void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
 {
-    struct pending_batch
-    {
-        uint64_t start_nonce;
-        unsigned buf;
-    };
-    std::queue<pending_batch> pending;
-
-    // update header constant buffer
-    cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
-    for (unsigned i = 0; i != s_numStreams; ++i)
-    {
-        m_search_buf[i][0] = 0;
-    }
-    cudaError err = cudaGetLastError();
-    if (cudaSuccess != err)
-    {
-        throw std::runtime_error(cudaGetErrorString(err));
-    }
-
-    unsigned buf = 0;
-    std::random_device engine;
-    uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
-    for (;;)
-    {
-        run_ethash_search(s_gridSize, s_blockSize, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);
-
-        pending.push({ start_nonce, buf });
-        buf = (buf + 1) % s_numStreams;
-
-        // read results
-        if (pending.size() == s_numStreams)
-        {
-            pending_batch const& batch = pending.front();
-
-            cudaStreamSynchronize(m_streams[buf]);
-
-            uint32_t * results = m_search_buf[batch.buf];
-            unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
-            uint64_t nonces[c_max_search_results];
-            for (unsigned i = 0; i != num_found; ++i)
-            {
-                nonces[i] = batch.start_nonce + results[i + 1];
-                //cout << results[i + 1] << ", ";
-            }
-            //if (num_found > 0)
-            //    cout << endl;
-
-            bool exit = num_found && hook.found(nonces, num_found);
-            exit |= hook.searched(batch.start_nonce, s_gridSize * s_blockSize); // always report searched before exit
-            if (exit)
-                break;
-            start_nonce += s_gridSize * s_blockSize;
-
-            // reset search buffer if we're still going
-            if (num_found)
-                results[0] = 0;
-
-            cudaError err = cudaGetLastError();
-            if (cudaSuccess != err)
-            {
-                throw std::runtime_error(cudaGetErrorString(err));
-            }
-            pending.pop();
-        }
-    }
+    bool initialize = false;
+    bool exit = false;
+    if (memcmp(&m_current_header, header, sizeof(hash32_t)))
+    {
+        m_current_header = *reinterpret_cast<hash32_t const *>(header);
+        set_header(m_current_header);
+        initialize = true;
+    }
+    if (m_current_target != target)
+    {
+        m_current_target = target;
+        set_target(m_current_target);
+        initialize = true;
+    }
+    if (initialize)
+    {
+        random_device engine;
+        m_current_nonce = uniform_int_distribution<uint64_t>()(engine);
+        m_current_index = 0;
+        CUDA_SAFE_CALL(cudaDeviceSynchronize());
+        for (unsigned int i = 0; i < s_numStreams; i++)
+            m_search_buf[i][0] = 0;
+    }
+    uint64_t batch_size = s_gridSize * s_blockSize;
+    for (; !exit; m_current_index++, m_current_nonce += batch_size)
+    {
+        unsigned int stream_index = m_current_index % s_numStreams;
+        cudaStream_t stream = m_streams[stream_index];
+        volatile uint32_t* buffer = m_search_buf[stream_index];
+        uint32_t found_count = 0;
+        uint64_t nonces[SEARCH_RESULT_BUFFER_SIZE - 1];
+        uint64_t nonce_base = m_current_nonce - s_numStreams * batch_size;
+        if (m_current_index >= s_numStreams)
+        {
+            CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+            found_count = buffer[0];
+            if (found_count)
+                buffer[0] = 0;
+            for (unsigned int j = 0; j < found_count; j++)
+                nonces[j] = nonce_base + buffer[j + 1];
+        }
+        run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce);
+        if (m_current_index >= s_numStreams)
+        {
+            exit = found_count && hook.found(nonces, found_count);
+            exit |= hook.searched(nonce_base, batch_size);
+        }
+    }
 }
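
The rewritten search() keeps s_numStreams launches in flight and only synchronizes a stream when its turn comes around again, so the results of batch i are read back while batch i + s_numStreams is still running; that is also why nonce_base rewinds by s_numStreams * batch_size. A stripped-down sketch of the round-robin pipeline, with fake_search standing in for ethash_search and the sizes chosen arbitrarily:

    #include <cstdio>
    #include <stdint.h>
    #include <cuda_runtime.h>

    // Stand-in for ethash_search: records the batch's start nonce.
    __global__ void fake_search(volatile uint32_t* out, uint64_t start_nonce)
    {
        if (blockIdx.x == 0 && threadIdx.x == 0)
            out[0] = (uint32_t)start_nonce;
    }

    int main()
    {
        const unsigned NUM_STREAMS = 2;   // s_numStreams in the miner
        const uint64_t BATCH = 1024;      // gridSize * blockSize
        cudaStream_t streams[NUM_STREAMS];
        uint32_t* buffers[NUM_STREAMS];   // pinned, GPU-writable under UVA, as in the miner

        for (unsigned i = 0; i < NUM_STREAMS; i++)
        {
            cudaStreamCreate(&streams[i]);
            cudaMallocHost(&buffers[i], 64 * sizeof(uint32_t));
        }
        uint64_t nonce = 0;
        for (uint64_t index = 0; index < 8; index++, nonce += BATCH)
        {
            unsigned s = index % NUM_STREAMS;
            if (index >= NUM_STREAMS)
            {
                // The batch launched NUM_STREAMS iterations ago on this
                // stream is done after this call; read its results while
                // the other streams keep working.
                cudaStreamSynchronize(streams[s]);
                std::printf("batch at nonce %u finished\n", buffers[s][0]);
            }
            fake_search<<<1, 32, 0, streams[s]>>>(buffers[s], nonce);
        }
        cudaDeviceSynchronize();
        return 0;
    }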

libethash-cuda/ethash_cuda_miner.h (11 changed lines)

@@ -51,13 +51,12 @@ public:
     static unsigned const c_defaultNumStreams;
 
 private:
-    enum { c_max_search_results = 63, c_hash_batch_size = 1024 };
+    hash32_t m_current_header;
+    uint64_t m_current_target;
+    uint64_t m_current_nonce;
+    uint64_t m_current_index;
 
-    hash128_t * m_dag_ptr;
-    hash32_t * m_header;
-    void ** m_hash_buf;
-    uint32_t ** m_search_buf;
+    volatile uint32_t ** m_search_buf;
     cudaStream_t * m_streams;
 
     /// The local work size for the search

libethash-cuda/ethash_cuda_miner_kernel.cu (53 changed lines)

@@ -18,21 +18,18 @@
 __global__ void
 __launch_bounds__(128, 7)
 ethash_search(
-    uint32_t* g_output,
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
-    uint64_t start_nonce,
-    uint64_t target
+    volatile uint32_t* g_output,
+    uint64_t start_nonce
     )
 {
     uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
-    uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
+    uint64_t hash = compute_hash_shuffle(start_nonce + gid);
 #else
-    uint64_t hash = compute_hash(g_header, g_dag, start_nonce + gid).uint64s[0];
+    uint64_t hash = compute_hash(start_nonce + gid).uint64s[0];
 #endif
-    if (cuda_swab64(hash) > target) return;
-    uint32_t index = atomicInc(g_output, d_max_outputs) + 1;
+    if (cuda_swab64(hash) > d_target) return;
+    uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
     g_output[index] = gid;
     __threadfence_system();
 }
@@ -41,27 +38,37 @@
 void run_ethash_search(
     uint32_t blocks,
     uint32_t threads,
     cudaStream_t stream,
-    uint32_t* g_output,
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
-    uint64_t start_nonce,
-    uint64_t target
+    volatile uint32_t* g_output,
+    uint64_t start_nonce
     )
 {
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
-    ethash_search <<<blocks, threads, 0, stream>>>(g_output, g_header, g_dag, start_nonce, target);
+    ethash_search <<<blocks, threads, 0, stream>>>(g_output, start_nonce);
 #else
-    ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target);
+    ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, start_nonce);
 #endif
+    CUDA_SAFE_CALL(cudaGetLastError());
 }
 
-cudaError set_constants(
-    uint32_t * dag_size,
-    uint32_t * max_outputs
+void set_constants(
+    hash128_t* _dag,
+    uint32_t _dag_size
     )
 {
-    cudaError result;
-    result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t));
-    result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t));
-    return result;
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *)));
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
+}
+
+void set_header(
+    hash32_t _header
+    )
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
+}
+
+void set_target(
+    uint64_t _target
+    )
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
 }
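
ethash_search claims a result slot with atomicInc, whose second argument is a wrap bound rather than an increment: the old value is returned and the counter wraps to zero once it would pass SEARCH_RESULT_BUFFER_SIZE - 1, so the count word in slot 0 can never index outside the 64-entry buffer (excess results overwrite earlier ones instead of overflowing). A minimal sketch with a hypothetical claim_slots kernel:

    #include <cstdio>
    #include <stdint.h>
    #include <cuda_runtime.h>

    #define SEARCH_RESULT_BUFFER_SIZE 64

    __global__ void claim_slots(uint32_t* out)
    {
        // atomicInc: old = *out; *out = (old >= bound) ? 0 : old + 1; returns old.
        // Slot 0 holds the count, so results land in out[1..63].
        uint32_t index = atomicInc(out, SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
        out[index] = threadIdx.x;   // the miner stores the gid here
    }

    int main()
    {
        uint32_t* out;
        cudaMallocHost(&out, SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t));
        out[0] = 0;
        claim_slots<<<1, 8>>>(out);
        cudaDeviceSynchronize();
        std::printf("%u results, first gid %u\n", out[0], out[1]);   // 8 results
        return 0;
    }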

libethash-cuda/ethash_cuda_miner_kernel.h (39 changed lines)

@@ -1,9 +1,12 @@
 #ifndef _ETHASH_CUDA_MINER_KERNEL_H_
 #define _ETHASH_CUDA_MINER_KERNEL_H_
 
+#include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
 
+#define SEARCH_RESULT_BUFFER_SIZE 64
+
 typedef union
 {
     uint64_t uint64s[16 / sizeof(uint64_t)];
@@ -33,27 +36,37 @@ typedef union
 } hash128_t;
 
-cudaError set_constants(
-    uint32_t * dag_size,
-    uint32_t * max_outputs
+void set_constants(
+    hash128_t* _dag,
+    uint32_t _dag_size
 );
 
-void run_ethash_hash(
-    hash32_t* g_hashes,
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
-    uint64_t start_nonce
+void set_header(
+    hash32_t _header
+);
+
+void set_target(
+    uint64_t _target
 );
 
 void run_ethash_search(
     uint32_t search_batch_size,
     uint32_t workgroup_size,
     cudaStream_t stream,
-    uint32_t* g_output,
-    hash32_t const* g_header,
-    hash128_t const* g_dag,
-    uint64_t start_nonce,
-    uint64_t target
+    volatile uint32_t* g_output,
+    uint64_t start_nonce
 );
 
+#define CUDA_SAFE_CALL(call)                                \
+do {                                                        \
+    cudaError_t err = call;                                 \
+    if (cudaSuccess != err) {                               \
+        const char * errorString = cudaGetErrorString(err); \
+        fprintf(stderr,                                     \
+            "CUDA error in func '%s' at line %i : %s.\n",   \
+            __FUNCTION__, __LINE__, errorString);           \
+        throw std::runtime_error(errorString);              \
+    }                                                       \
+} while (0)
+
 #endif
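
Since the relocated macro throws instead of calling exit(), a failing CUDA call now unwinds to the nearest handler, which is how init() and configureGPU() above turn driver errors into a false return; note that std::runtime_error needs <stdexcept> in scope wherever the macro is expanded. A usage sketch with a hypothetical allocate_dag helper:

    #include <stdexcept>     // std::runtime_error thrown by CUDA_SAFE_CALL
    #include "ethash_cuda_miner_kernel.h"

    bool allocate_dag(void** dag, size_t size)
    {
        try
        {
            CUDA_SAFE_CALL(cudaMalloc(dag, size));   // throws on failure
            return true;
        }
        catch (std::runtime_error const&)
        {
            return false;   // same pattern as init() / configureGPU()
        }
    }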

libethash-cuda/ethash_cuda_miner_kernel_globals.h (4 changed lines)

@@ -4,6 +4,8 @@
 //#include "cuda_helper.h"
 
 __constant__ uint32_t d_dag_size;
-__constant__ uint32_t d_max_outputs;
+__constant__ hash128_t* d_dag;
+__constant__ hash32_t d_header;
+__constant__ uint64_t d_target;
 
 #endif