
Improved error handling and use of constant memory

Branch: cl-refactor
Author: RoBiK (9 years ago)
Commit: d8c8582f94
7 changed files:
1. libethash-cuda/dagger_shared.cuh (19 lines changed)
2. libethash-cuda/dagger_shuffled.cuh (12 lines changed)
3. libethash-cuda/ethash_cuda_miner.cpp (262 lines changed)
4. libethash-cuda/ethash_cuda_miner.h (11 lines changed)
5. libethash-cuda/ethash_cuda_miner_kernel.cu (53 lines changed)
6. libethash-cuda/ethash_cuda_miner_kernel.h (45 lines changed)
7. libethash-cuda/ethash_cuda_miner_kernel_globals.h (4 lines changed)

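Taken together, the changes below move the per-launch kernel arguments (header hash, DAG pointer, search target) into __constant__ device symbols that are set once from the host, and replace exit-on-error handling with exceptions that the host code can catch. For readers unfamiliar with the constant-memory pattern, here is a minimal self-contained sketch; the names (c_target, check) are hypothetical stand-ins, not the miner's code:

#include <cuda_runtime.h>
#include <cstdint>

__constant__ uint64_t c_target;  // hypothetical stand-in for d_target

__global__ void check(const uint64_t* hashes, uint32_t* hits, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	// every thread reads the same constant; the hardware broadcasts it
	if (i < n && hashes[i] <= c_target)
		atomicAdd(hits, 1u);
}

int main()
{
	uint64_t target = 1000;
	// host-side write into the __constant__ symbol, once per work package
	cudaMemcpyToSymbol(c_target, &target, sizeof(target));
	// ... allocate buffers, launch check<<<grid, block>>>(...), etc.
	return 0;
}
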
libethash-cuda/dagger_shared.cuh (19 lines changed)

@@ -9,14 +9,17 @@ typedef union
 	hash32_t mix;
 } compute_hash_share;
 
-__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
+__device__ hash64_t init_hash(uint64_t nonce)
 {
 	hash64_t init;
 
 	// sha3_512(header .. nonce)
 	uint64_t state[25];
 
-	copy(state, header->uint64s, 4);
+	state[0] = d_header.uint64s[0];
+	state[1] = d_header.uint64s[1];
+	state[2] = d_header.uint64s[2];
+	state[3] = d_header.uint64s[3];
 	state[4] = nonce;
 	state[5] = 0x0000000000000001;
 	state[6] = 0;
@@ -32,7 +35,7 @@ __device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
 	return init;
 }
 
-__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
+__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share)
 {
 	// share init0
 	if (thread_id == 0)
@@ -59,9 +62,9 @@ __device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, h
 		__threadfence_block();
 
 #if __CUDA_ARCH__ >= 350
-		mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id]));
+		mix = fnv4(mix, __ldg((&d_dag[*share])->uint4s + thread_id));
 #else
-		mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+		mix = fnv4(mix, (&d_dag[*share])->uint4s[thread_id]);
 #endif
 	}
@@ -99,15 +102,13 @@ __device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
 }
 
 __device__ hash32_t compute_hash(
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
 	uint64_t nonce
 )
 {
 	extern __shared__ compute_hash_share share[];
 
 	// Compute one init hash per work item.
-	hash64_t init = init_hash(g_header, nonce);
+	hash64_t init = init_hash(nonce);
 
 	// Threads work together in this phase in groups of 8.
 	uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
@@ -123,7 +124,7 @@ __device__ hash32_t compute_hash(
 	uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3];
 
-	uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag);
+	uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s);
 
 	share[hash_id].mix.uint32s[thread_id] = thread_mix;

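A side note on the __ldg branch above: on compute capability 3.5 and newer, __ldg() loads through the read-only data cache, which tends to help the miner's scattered DAG reads. A minimal illustration of the same guard (load_dag_entry is a hypothetical helper, not part of this commit):

__device__ uint4 load_dag_entry(uint4 const* p)
{
#if __CUDA_ARCH__ >= 350
	return __ldg(p);  // read-only cache path on compute 3.5+ devices
#else
	return *p;        // plain global load on older architectures
#endif
}
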
libethash-cuda/dagger_shuffled.cuh (12 lines changed)

@@ -4,18 +4,16 @@
 #include "dagger.cuh"
 
 __device__ uint64_t compute_hash_shuffle(
-	uint2 const* g_header,
-	hash128_t const* g_dag,
 	uint64_t nonce
 )
 {
 	// sha3_512(header .. nonce)
 	uint2 state[25];
 
-	state[0] = g_header[0];
-	state[1] = g_header[1];
-	state[2] = g_header[2];
-	state[3] = g_header[3];
+	state[0] = d_header.uint2s[0];
+	state[1] = d_header.uint2s[1];
+	state[2] = d_header.uint2s[2];
+	state[3] = d_header.uint2s[3];
 
 	state[4] = vectorize(nonce);
 	state[5] = vectorize(0x0000000000000001ULL);
 	for (uint32_t i = 6; i < 25; i++)
@@ -69,7 +67,7 @@ __device__ uint64_t compute_hash_shuffle(
 		}
 		shuffle[0].x = __shfl(shuffle[0].x, start_lane + t);
-		mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]);
+		mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
 	}
 }

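For context on the __shfl call above: warp shuffles exchange register values between lanes without touching shared memory, which is why the shuffled kernel variant needs no shared buffer. A hedged sketch of the broadcast idiom (broadcast_lane is a hypothetical helper; note that CUDA 9+ toolkits spell the intrinsic __shfl_sync with an explicit lane mask, while this code predates that):

__device__ uint32_t broadcast_lane(uint32_t value, int src_lane)
{
#if CUDART_VERSION >= 9000
	return __shfl_sync(0xFFFFFFFFu, value, src_lane);  // modern spelling
#else
	return __shfl(value, src_lane);                    // pre-CUDA 9, as used here
#endif
}
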
libethash-cuda/ethash_cuda_miner.cpp (262 lines changed)

@@ -71,16 +71,6 @@ static std::atomic_flag s_logSpin = ATOMIC_FLAG_INIT;
 #define ETHCUDA_LOG(_contents) cout << "[CUDA]:" << _contents << endl
 #endif
 
-#define CUDA_SAFE_CALL(call) \
-do { \
-	cudaError_t err = call; \
-	if (cudaSuccess != err) { \
-		fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
-			__FUNCTION__, __LINE__, cudaGetErrorString(err) ); \
-		exit(EXIT_FAILURE); \
-	} \
-} while (0)
-
 ethash_cuda_miner::search_hook::~search_hook() {}
 
 ethash_cuda_miner::ethash_cuda_miner()
@@ -91,7 +81,7 @@ std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
 {
 	int runtime_version;
 	int device_count;
 
 	device_count = getNumDevices();
 
 	if (device_count == 0)
@@ -109,12 +99,11 @@ std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
 	int version_major = runtime_version / 1000;
 	int version_minor = (runtime_version - (version_major * 1000)) / 10;
 	sprintf(platform, "%d.%d", version_major, version_minor);
 
 	char compute[5];
 	sprintf(compute, "%d.%d", device_props.major, device_props.minor);
 
 	return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + std::string(device_props.name) + "\", \"version\": \"Compute " + std::string(compute) + "\" }";
 }
unsigned ethash_cuda_miner::getNumDevices()
@@ -134,40 +123,48 @@ bool ethash_cuda_miner::configureGPU(
 	uint64_t _currentBlock
 	)
 {
-	s_blockSize = _blockSize;
-	s_gridSize = _gridSize;
-	s_extraRequiredGPUMem = _extraGPUMemory;
-	s_numStreams = _numStreams;
-	s_scheduleFlag = _scheduleFlag;
-
-	// by default let's only consider the DAG of the first epoch
-	uint64_t dagSize = ethash_get_datasize(_currentBlock);
-	uint64_t requiredSize = dagSize + _extraGPUMemory;
-	for (unsigned int i = 0; i < getNumDevices(); i++)
-	{
-		if (_devices[i] != -1)
-		{
-			cudaDeviceProp props;
-			CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, _devices[i]));
-			if (props.totalGlobalMem >= requiredSize)
-			{
-				ETHCUDA_LOG(
-					"Found suitable CUDA device [" << string(props.name)
-					<< "] with " << props.totalGlobalMem << " bytes of GPU memory"
-				);
-			}
-			else
-			{
-				ETHCUDA_LOG(
-					"CUDA device " << string(props.name)
-					<< " has insufficient GPU memory." << to_string(props.totalGlobalMem) <<
-					" bytes of memory found < " << to_string(requiredSize) << " bytes of memory required"
-				);
-				return false;
-			}
-		}
-	}
-	return true;
+	try
+	{
+		s_blockSize = _blockSize;
+		s_gridSize = _gridSize;
+		s_extraRequiredGPUMem = _extraGPUMemory;
+		s_numStreams = _numStreams;
+		s_scheduleFlag = _scheduleFlag;
+
+		// by default let's only consider the DAG of the first epoch
+		uint64_t dagSize = ethash_get_datasize(_currentBlock);
+		uint64_t requiredSize = dagSize + _extraGPUMemory;
+		unsigned devicesCount = getNumDevices();
+		for (unsigned int i = 0; i < devicesCount; i++)
+		{
+			if (_devices[i] != -1)
+			{
+				cudaDeviceProp props;
+				CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, _devices[i]));
+				if (props.totalGlobalMem >= requiredSize)
+				{
+					ETHCUDA_LOG(
+						"Found suitable CUDA device [" << string(props.name)
+						<< "] with " << props.totalGlobalMem << " bytes of GPU memory"
+					);
+				}
+				else
+				{
+					ETHCUDA_LOG(
+						"CUDA device " << string(props.name)
+						<< " has insufficient GPU memory." << to_string(props.totalGlobalMem) <<
+						" bytes of memory found < " << to_string(requiredSize) << " bytes of memory required"
+					);
+					return false;
+				}
+			}
+		}
+		return true;
+	}
+	catch (runtime_error)
+	{
+		return false;
+	}
 }
unsigned ethash_cuda_miner::s_extraRequiredGPUMem;
@@ -193,143 +190,110 @@ void ethash_cuda_miner::listDevices()
 void ethash_cuda_miner::finish()
 {
-	for (unsigned i = 0; i != s_numStreams; i++) {
-		cudaStreamDestroy(m_streams[i]);
-		m_streams[i] = 0;
-	}
-	cudaDeviceReset();
+	CUDA_SAFE_CALL(cudaDeviceReset());
 }
 
 bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
 {
-	int device_count = getNumDevices();
-	if (device_count == 0)
-		return false;
-
-	// use selected device
-	int device_num = std::min<int>((int)_deviceId, device_count - 1);
-
-	cudaDeviceProp device_props;
-	if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
-	{
-		cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
-		return false;
-	}
-
-	cout << "Using device: " << device_props.name << " (Compute " << device_props.major << "." << device_props.minor << ")" << endl;
-
-	cudaError_t r = cudaSetDevice(device_num);
-	if (r != cudaSuccess)
-	{
-		cout << cudaGetErrorString(r) << endl;
-		return false;
-	}
-	cudaDeviceReset();
-	cudaSetDeviceFlags(s_scheduleFlag);
-	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-
-	m_search_buf = new uint32_t *[s_numStreams];
-	m_streams = new cudaStream_t[s_numStreams];
-
-	// patch source code
-	cudaError result;
-
-	uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
-	unsigned max_outputs = c_max_search_results;
-	result = set_constants(&dagSize128, &max_outputs);
-
-	// create buffer for dag
-	result = cudaMalloc(&m_dag_ptr, _dagSize);
-
-	// create buffer for header256
-	result = cudaMalloc(&m_header, 32);
-
-	// copy dag to CPU.
-	result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice);
-
-	// create mining buffers
-	for (unsigned i = 0; i != s_numStreams; ++i)
-	{
-		result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t));
-		result = cudaStreamCreate(&m_streams[i]);
-	}
-
-	if (result != cudaSuccess)
-	{
-		cout << cudaGetErrorString(result) << endl;
-		return false;
-	}
-	return true;
+	try
+	{
+		int device_count = getNumDevices();
+		if (device_count == 0)
+			return false;
+
+		// use selected device
+		int device_num = std::min<int>((int)_deviceId, device_count - 1);
+
+		cudaDeviceProp device_props;
+		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));
+
+		cout << "Using device: " << device_props.name << " (Compute " << device_props.major << "." << device_props.minor << ")" << endl;
+
+		CUDA_SAFE_CALL(cudaSetDevice(device_num));
+		CUDA_SAFE_CALL(cudaDeviceReset());
+		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+
+		m_search_buf = new volatile uint32_t *[s_numStreams];
+		m_streams = new cudaStream_t[s_numStreams];
+
+		uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
+
+		// create buffer for dag
+		hash128_t * dag;
+		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), _dagSize));
+
+		// copy dag to CPU.
+		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));
+
+		// create mining buffers
+		for (unsigned i = 0; i != s_numStreams; ++i)
+		{
+			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
+		}
+
+		set_constants(dag, dagSize128);
+		memset(&m_current_header, 0, sizeof(hash32_t));
+		m_current_target = 0;
+		m_current_nonce = 0;
+		m_current_index = 0;
+		return true;
+	}
+	catch (runtime_error)
+	{
+		return false;
+	}
 }
 
 void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
 {
-	struct pending_batch
-	{
-		uint64_t start_nonce;
-		unsigned buf;
-	};
-	std::queue<pending_batch> pending;
-
-	// update header constant buffer
-	cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
-	for (unsigned i = 0; i != s_numStreams; ++i)
-	{
-		m_search_buf[i][0] = 0;
-	}
-	cudaError err = cudaGetLastError();
-	if (cudaSuccess != err)
-	{
-		throw std::runtime_error(cudaGetErrorString(err));
-	}
-
-	unsigned buf = 0;
-	std::random_device engine;
-	uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
-	for (;;)
-	{
-		run_ethash_search(s_gridSize, s_blockSize, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);
-
-		pending.push({ start_nonce, buf });
-		buf = (buf + 1) % s_numStreams;
-
-		// read results
-		if (pending.size() == s_numStreams)
-		{
-			pending_batch const& batch = pending.front();
-
-			cudaStreamSynchronize(m_streams[buf]);
-
-			uint32_t * results = m_search_buf[batch.buf];
-			unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
-
-			uint64_t nonces[c_max_search_results];
-			for (unsigned i = 0; i != num_found; ++i)
-			{
-				nonces[i] = batch.start_nonce + results[i + 1];
-				//cout << results[i + 1] << ", ";
-			}
-			//if (num_found > 0)
-			//	cout << endl;
-
-			bool exit = num_found && hook.found(nonces, num_found);
-			exit |= hook.searched(batch.start_nonce, s_gridSize * s_blockSize); // always report searched before exit
-			if (exit)
-				break;
-
-			start_nonce += s_gridSize * s_blockSize;
-
-			// reset search buffer if we're still going
-			if (num_found)
-				results[0] = 0;
-
-			cudaError err = cudaGetLastError();
-			if (cudaSuccess != err)
-			{
-				throw std::runtime_error(cudaGetErrorString(err));
-			}
-
-			pending.pop();
-		}
-	}
+	bool initialize = false;
+	bool exit = false;
+
+	if (memcmp(&m_current_header, header, sizeof(hash32_t)))
+	{
+		m_current_header = *reinterpret_cast<hash32_t const *>(header);
+		set_header(m_current_header);
+		initialize = true;
+	}
+	if (m_current_target != target)
+	{
+		m_current_target = target;
+		set_target(m_current_target);
+		initialize = true;
+	}
+	if (initialize)
+	{
+		random_device engine;
+		m_current_nonce = uniform_int_distribution<uint64_t>()(engine);
+		m_current_index = 0;
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+		for (unsigned int i = 0; i < s_numStreams; i++)
+			m_search_buf[i][0] = 0;
+	}
+
+	uint64_t batch_size = s_gridSize * s_blockSize;
+	for (; !exit; m_current_index++, m_current_nonce += batch_size)
+	{
+		unsigned int stream_index = m_current_index % s_numStreams;
+		cudaStream_t stream = m_streams[stream_index];
+		volatile uint32_t* buffer = m_search_buf[stream_index];
+		uint32_t found_count = 0;
+		uint64_t nonces[SEARCH_RESULT_BUFFER_SIZE - 1];
+		uint64_t nonce_base = m_current_nonce - s_numStreams * batch_size;
+		if (m_current_index >= s_numStreams)
+		{
+			CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+			found_count = buffer[0];
+			if (found_count)
+				buffer[0] = 0;
+			for (unsigned int j = 0; j < found_count; j++)
+				nonces[j] = nonce_base + buffer[j + 1];
+		}
+		run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce);
+		if (m_current_index >= s_numStreams)
+		{
+			exit = found_count && hook.found(nonces, found_count);
+			exit |= hook.searched(nonce_base, batch_size);
+		}
+	}
 }

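The rewritten search() replaces the explicit pending-batch queue with an index-driven pipeline: iteration i launches on stream i % s_numStreams, and once i >= s_numStreams it first drains the batch launched s_numStreams iterations earlier on that same stream, then relaunches immediately. A stripped-down sketch of the pattern (hypothetical names; launch_batch and read_results stand in for the real kernel launch and result handling):

#include <cuda_runtime.h>
#include <cstdint>

void pipelined_search(cudaStream_t* streams, unsigned num_streams,
	uint64_t batch_size, uint64_t max_batches)
{
	uint64_t nonce = 0;
	for (uint64_t index = 0; index < max_batches; index++, nonce += batch_size)
	{
		unsigned s = index % num_streams;
		if (index >= num_streams)
		{
			// the previous work on this stream is the batch launched
			// num_streams iterations ago; wait for it and collect results
			cudaStreamSynchronize(streams[s]);
			// read_results(s, nonce - num_streams * batch_size);  // hypothetical
		}
		// launch_batch(streams[s], nonce);  // hypothetical kernel launch
	}
}
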
libethash-cuda/ethash_cuda_miner.h (11 lines changed)

@@ -51,13 +51,12 @@ public:
 	static unsigned const c_defaultNumStreams;
 
 private:
-	enum { c_max_search_results = 63, c_hash_batch_size = 1024 };
+	hash32_t m_current_header;
+	uint64_t m_current_target;
+	uint64_t m_current_nonce;
+	uint64_t m_current_index;
 
-	hash128_t * m_dag_ptr;
-	hash32_t * m_header;
-	void ** m_hash_buf;
-	uint32_t ** m_search_buf;
+	volatile uint32_t ** m_search_buf;
 	cudaStream_t * m_streams;
 
 	/// The local work size for the search

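On the volatile uint32_t** member above: the result buffers are allocated with cudaMallocHost (pinned host memory) and are written by the GPU while the CPU reads them, so volatile keeps the compiler from caching those reads in registers. A small illustrative sketch of the allocation, assuming the same 64-slot layout as SEARCH_RESULT_BUFFER_SIZE:

#include <cuda_runtime.h>
#include <cstdint>

int main()
{
	uint32_t* raw = nullptr;
	// pinned host memory, directly writable by the device
	cudaMallocHost(reinterpret_cast<void**>(&raw), 64 * sizeof(uint32_t));
	volatile uint32_t* results = raw;  // volatile view for host-side polling
	results[0] = 0;  // slot 0 holds the found-count, slots 1..63 the candidates
	// ... launch kernel, synchronize, then read results[0] ...
	cudaFreeHost(raw);
	return 0;
}
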
libethash-cuda/ethash_cuda_miner_kernel.cu (53 lines changed)

@@ -18,21 +18,18 @@
 __global__ void
 __launch_bounds__(128, 7)
 ethash_search(
-	uint32_t* g_output,
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
-	uint64_t start_nonce,
-	uint64_t target
+	volatile uint32_t* g_output,
+	uint64_t start_nonce
 )
 {
 	uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
-	uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
+	uint64_t hash = compute_hash_shuffle(start_nonce + gid);
 #else
-	uint64_t hash = compute_hash(g_header, g_dag, start_nonce + gid).uint64s[0];
+	uint64_t hash = compute_hash(start_nonce + gid).uint64s[0];
 #endif
-	if (cuda_swab64(hash) > target) return;
-	uint32_t index = atomicInc(g_output, d_max_outputs) + 1;
+	if (cuda_swab64(hash) > d_target) return;
+	uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
 	g_output[index] = gid;
 	__threadfence_system();
 }
@@ -41,27 +38,37 @@ void run_ethash_search(
 	uint32_t blocks,
 	uint32_t threads,
 	cudaStream_t stream,
-	uint32_t* g_output,
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
-	uint64_t start_nonce,
-	uint64_t target
+	volatile uint32_t* g_output,
+	uint64_t start_nonce
 )
 {
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
-	ethash_search <<<blocks, threads, 0, stream >>>(g_output, g_header, g_dag, start_nonce, target);
+	ethash_search <<<blocks, threads, 0, stream >>>(g_output, start_nonce);
 #else
-	ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target);
+	ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, start_nonce);
 #endif
+	CUDA_SAFE_CALL(cudaGetLastError());
 }
 
-cudaError set_constants(
-	uint32_t * dag_size,
-	uint32_t * max_outputs
+void set_constants(
+	hash128_t* _dag,
+	uint32_t _dag_size
 )
 {
-	cudaError result;
-	result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t));
-	result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t));
-	return result;
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *)));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
 }
+
+void set_header(
+	hash32_t _header
+)
+{
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
+}
+
+void set_target(
+	uint64_t _target
+)
+{
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
+}

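Note what set_constants above actually copies: for d_dag only the 8-byte pointer value lands in constant memory; the DAG itself stays in global memory where cudaMalloc placed it. Constant memory is limited to 64 KB per device, so this pointer-in-constant idiom is the usual way to give every thread cheap access to a large global buffer. An illustrative sketch with hypothetical names:

#include <cuda_runtime.h>
#include <cstdint>

__constant__ uint32_t* c_data;  // 8-byte pointer held in constant memory

__global__ void touch(uint32_t* out, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n)
		out[i] = c_data[i];  // dereference the constant pointer into global memory
}

int main()
{
	uint32_t* d_buf = nullptr;
	cudaMalloc(reinterpret_cast<void**>(&d_buf), 1u << 20);  // 1 MiB in global memory
	// copy the pointer value, not the buffer, into the constant symbol
	cudaMemcpyToSymbol(c_data, &d_buf, sizeof(d_buf));
	return 0;
}
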
libethash-cuda/ethash_cuda_miner_kernel.h (45 lines changed)

@@ -1,9 +1,12 @@
 #ifndef _ETHASH_CUDA_MINER_KERNEL_H_
 #define _ETHASH_CUDA_MINER_KERNEL_H_
 
+#include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
 
+#define SEARCH_RESULT_BUFFER_SIZE 64
+
 typedef union
 {
 	uint64_t uint64s[16 / sizeof(uint64_t)];
@@ -33,27 +36,37 @@ typedef union
 } hash128_t;
 
-cudaError set_constants(
-	uint32_t * dag_size,
-	uint32_t * max_outputs
-);
+void set_constants(
+	hash128_t* _dag,
+	uint32_t _dag_size
+);
 
-void run_ethash_hash(
-	hash32_t* g_hashes,
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
-	uint64_t start_nonce
-);
+void set_header(
+	hash32_t _header
+);
+
+void set_target(
+	uint64_t _target
+);
 
 void run_ethash_search(
 	uint32_t search_batch_size,
 	uint32_t workgroup_size,
 	cudaStream_t stream,
-	uint32_t* g_output,
-	hash32_t const* g_header,
-	hash128_t const* g_dag,
-	uint64_t start_nonce,
-	uint64_t target
-);
+	volatile uint32_t* g_output,
+	uint64_t start_nonce
+);
+
+#define CUDA_SAFE_CALL(call)				\
+do {							\
+	cudaError_t err = call;				\
+	if (cudaSuccess != err) {			\
+		const char * errorString = cudaGetErrorString(err);	\
+		fprintf(stderr,					\
+			"CUDA error in func '%s' at line %i : %s.\n",	\
+			__FUNCTION__, __LINE__, errorString);		\
+		throw std::runtime_error(errorString);	\
+	}						\
+} while (0)
 
 #endif

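The macro above is the heart of the "improved error handling": where the old .cpp-local version called exit(EXIT_FAILURE), this one throws std::runtime_error, which configureGPU(), init() and search() catch so the miner can fail gracefully. Typical call-site usage, mirroring the pattern in ethash_cuda_miner.cpp (setup_device is a hypothetical example; the call site also needs <stdexcept> for std::runtime_error):

#include <cuda_runtime.h>
#include <iostream>
#include <stdexcept>

bool setup_device(int device_num)
{
	try
	{
		CUDA_SAFE_CALL(cudaSetDevice(device_num));
		CUDA_SAFE_CALL(cudaDeviceReset());
		return true;
	}
	catch (std::runtime_error const& e)
	{
		std::cerr << "CUDA setup failed: " << e.what() << std::endl;
		return false;  // caller can fall back to another device
	}
}
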
libethash-cuda/ethash_cuda_miner_kernel_globals.h (4 lines changed)

@@ -4,6 +4,8 @@
 //#include "cuda_helper.h"
 
 __constant__ uint32_t d_dag_size;
-__constant__ uint32_t d_max_outputs;
+__constant__ hash128_t* d_dag;
+__constant__ hash32_t d_header;
+__constant__ uint64_t d_target;
 
 #endif