Jan Willem Penterman
10 years ago
15 changed files with 1989 additions and 5 deletions
@ -0,0 +1,29 @@ |
|||
set(EXECUTABLE ethash-cu) |
|||
|
|||
FIND_PACKAGE(CUDA REQUIRED) |
|||
|
|||
file(GLOB SRC_LIST "*.cpp" "*.cu") |
|||
file(GLOB HEADERS "*.h" "*.cuh") |
|||
|
|||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo) |
|||
|
|||
LIST(APPEND CUDA_NVCC_FLAGS_RELEASE -O3) |
|||
LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G) |
|||
|
|||
if(COMPUTE) |
|||
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) |
|||
else(COMPUTE) |
|||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_32,code=sm_32;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52) |
|||
endif(COMPUTE) |
|||
|
|||
|
|||
|
|||
include_directories(${CMAKE_CURRENT_BINARY_DIR}) |
|||
include_directories(${CUDA_INCLUDE_DIRS}) |
|||
include_directories(..) |
|||
CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS}) |
|||
TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} ethash) |
|||
|
|||
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib ) |
|||
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} ) |
|||
|
File diff suppressed because it is too large
@ -0,0 +1,22 @@ |
|||
#define copy(dst, src, count) for (uint32_t i = 0; i < count; i++) { (dst)[i] = (src)[i]; } |
|||
|
|||
#define ACCESSES 64 |
|||
#define THREADS_PER_HASH (128 / 16) |
|||
#define FNV_PRIME 0x01000193 |
|||
|
|||
#define fnv(x,y) ((x) * FNV_PRIME ^(y)) |
|||
|
|||
__device__ uint4 fnv4(uint4 a, uint4 b) |
|||
{ |
|||
uint4 c; |
|||
c.x = a.x * FNV_PRIME ^ b.x; |
|||
c.y = a.y * FNV_PRIME ^ b.y; |
|||
c.z = a.z * FNV_PRIME ^ b.z; |
|||
c.w = a.w * FNV_PRIME ^ b.w; |
|||
return c; |
|||
} |
|||
|
|||
__device__ uint32_t fnv_reduce(uint4 v) |
|||
{ |
|||
return fnv(fnv(fnv(v.x, v.y), v.z), v.w); |
|||
} |
@ -0,0 +1,136 @@ |
|||
#include "ethash_cu_miner_kernel_globals.h" |
|||
#include "ethash_cu_miner_kernel.h" |
|||
#include "keccak.cuh" |
|||
#include "dagger.cuh" |
|||
|
|||
typedef union |
|||
{ |
|||
hash64_t init; |
|||
hash32_t mix; |
|||
} compute_hash_share; |
|||
|
|||
__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce) |
|||
{ |
|||
hash64_t init; |
|||
|
|||
// sha3_512(header .. nonce) |
|||
uint64_t state[25]; |
|||
|
|||
copy(state, header->uint64s, 4); |
|||
state[4] = nonce; |
|||
state[5] = 0x0000000000000001; |
|||
state[6] = 0; |
|||
state[7] = 0; |
|||
state[8] = 0x8000000000000000; |
|||
for (uint32_t i = 9; i < 25; i++) |
|||
{ |
|||
state[i] = 0; |
|||
} |
|||
|
|||
keccak_f1600_block((uint2 *)state, 8); |
|||
copy(init.uint64s, state, 8); |
|||
return init; |
|||
} |
|||
|
|||
__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag) |
|||
{ |
|||
// share init0 |
|||
if (thread_id == 0) |
|||
*share = mix.x; |
|||
|
|||
uint32_t init0 = *share; |
|||
|
|||
uint32_t a = 0; |
|||
|
|||
do |
|||
{ |
|||
|
|||
bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); |
|||
|
|||
//#pragma unroll 4 |
|||
for (uint32_t i = 0; i < 4; i++) |
|||
{ |
|||
|
|||
if (update_share) |
|||
{ |
|||
uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w }; |
|||
*share = fnv(init0 ^ (a + i), m[i]) % d_dag_size; |
|||
} |
|||
__threadfence_block(); |
|||
|
|||
#if __CUDA_ARCH__ >= 350 |
|||
mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id])); |
|||
#else |
|||
mix = fnv4(mix, g_dag[*share].uint4s[thread_id]); |
|||
#endif |
|||
|
|||
} |
|||
|
|||
} while ((a += 4) != ACCESSES); |
|||
|
|||
return fnv_reduce(mix); |
|||
} |
|||
|
|||
__device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix) |
|||
{ |
|||
uint64_t state[25]; |
|||
|
|||
hash32_t hash; |
|||
|
|||
// keccak_256(keccak_512(header..nonce) .. mix); |
|||
copy(state, init->uint64s, 8); |
|||
copy(state + 8, mix->uint64s, 4); |
|||
state[12] = 0x0000000000000001; |
|||
for (uint32_t i = 13; i < 16; i++) |
|||
{ |
|||
state[i] = 0; |
|||
} |
|||
state[16] = 0x8000000000000000; |
|||
for (uint32_t i = 17; i < 25; i++) |
|||
{ |
|||
state[i] = 0; |
|||
} |
|||
|
|||
keccak_f1600_block((uint2 *)state, 1); |
|||
|
|||
// copy out |
|||
copy(hash.uint64s, state, 4); |
|||
return hash; |
|||
} |
|||
|
|||
__device__ hash32_t compute_hash( |
|||
hash32_t const* g_header, |
|||
hash128_t const* g_dag, |
|||
uint64_t nonce |
|||
) |
|||
{ |
|||
extern __shared__ compute_hash_share share[]; |
|||
|
|||
// Compute one init hash per work item. |
|||
hash64_t init = init_hash(g_header, nonce); |
|||
|
|||
// Threads work together in this phase in groups of 8. |
|||
uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1); |
|||
uint32_t const hash_id = threadIdx.x >> 3; |
|||
|
|||
hash32_t mix; |
|||
|
|||
for (int i = 0; i < THREADS_PER_HASH; i++) |
|||
{ |
|||
// share init with other threads |
|||
if (i == thread_id) |
|||
share[hash_id].init = init; |
|||
|
|||
uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3]; |
|||
|
|||
uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag); |
|||
|
|||
share[hash_id].mix.uint32s[thread_id] = thread_mix; |
|||
|
|||
|
|||
if (i == thread_id) |
|||
mix = share[hash_id].mix; |
|||
} |
|||
|
|||
return final_hash(&init, &mix); |
|||
} |
@ -0,0 +1,107 @@ |
|||
#include "ethash_cu_miner_kernel_globals.h" |
|||
#include "ethash_cu_miner_kernel.h" |
|||
#include "keccak.cuh" |
|||
#include "dagger.cuh" |
|||
|
|||
__device__ uint64_t compute_hash_shuffle( |
|||
uint2 const* g_header, |
|||
hash128_t const* g_dag, |
|||
uint64_t nonce |
|||
) |
|||
{ |
|||
// sha3_512(header .. nonce) |
|||
uint2 state[25]; |
|||
|
|||
state[0] = g_header[0]; |
|||
state[1] = g_header[1]; |
|||
state[2] = g_header[2]; |
|||
state[3] = g_header[3]; |
|||
state[4] = vectorize(nonce); |
|||
state[5] = vectorize(0x0000000000000001ULL); |
|||
for (uint32_t i = 6; i < 25; i++) |
|||
{ |
|||
state[i] = make_uint2(0, 0); |
|||
} |
|||
state[8] = vectorize(0x8000000000000000ULL); |
|||
keccak_f1600_block(state,8); |
|||
|
|||
// Threads work together in this phase in groups of 8. |
|||
const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); |
|||
const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1); |
|||
const int mix_idx = thread_id & 3; |
|||
|
|||
uint4 mix; |
|||
uint2 shuffle[8]; |
|||
|
|||
for (int i = 0; i < THREADS_PER_HASH; i++) |
|||
{ |
|||
// share init among threads |
|||
for (int j = 0; j < 8; j++) { |
|||
shuffle[j].x = __shfl(state[j].x, start_lane + i); |
|||
shuffle[j].y = __shfl(state[j].y, start_lane + i); |
|||
} |
|||
|
|||
// ugly but avoids local reads/writes |
|||
if (mix_idx < 2) { |
|||
if (mix_idx == 0) |
|||
mix = vectorize2(shuffle[0], shuffle[1]); |
|||
else |
|||
mix = vectorize2(shuffle[2], shuffle[3]); |
|||
} |
|||
else { |
|||
if (mix_idx == 2) |
|||
mix = vectorize2(shuffle[4], shuffle[5]); |
|||
else |
|||
mix = vectorize2(shuffle[6], shuffle[7]); |
|||
} |
|||
|
|||
uint32_t init0 = __shfl(shuffle[0].x, start_lane); |
|||
|
|||
for (uint32_t a = 0; a < ACCESSES; a += 4) |
|||
{ |
|||
int t = ((a >> 2) & (THREADS_PER_HASH - 1)); |
|||
|
|||
for (uint32_t b = 0; b < 4; b++) |
|||
{ |
|||
if (thread_id == t) |
|||
{ |
|||
shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size; |
|||
} |
|||
shuffle[0].x = __shfl(shuffle[0].x, start_lane + t); |
|||
|
|||
mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]); |
|||
} |
|||
} |
|||
|
|||
uint32_t thread_mix = fnv_reduce(mix); |
|||
|
|||
// update mix accross threads |
|||
shuffle[0].x = __shfl(thread_mix, start_lane + 0); |
|||
shuffle[0].y = __shfl(thread_mix, start_lane + 1); |
|||
shuffle[1].x = __shfl(thread_mix, start_lane + 2); |
|||
shuffle[1].y = __shfl(thread_mix, start_lane + 3); |
|||
shuffle[2].x = __shfl(thread_mix, start_lane + 4); |
|||
shuffle[2].y = __shfl(thread_mix, start_lane + 5); |
|||
shuffle[3].x = __shfl(thread_mix, start_lane + 6); |
|||
shuffle[3].y = __shfl(thread_mix, start_lane + 7); |
|||
|
|||
if (i == thread_id) { |
|||
//move mix into state: |
|||
state[8] = shuffle[0]; |
|||
state[9] = shuffle[1]; |
|||
state[10] = shuffle[2]; |
|||
state[11] = shuffle[3]; |
|||
} |
|||
} |
|||
|
|||
// keccak_256(keccak_512(header..nonce) .. mix); |
|||
state[12] = vectorize(0x0000000000000001ULL); |
|||
for (uint32_t i = 13; i < 25; i++) |
|||
{ |
|||
state[i] = vectorize(0ULL); |
|||
} |
|||
state[16] = vectorize(0x8000000000000000); |
|||
keccak_f1600_block(state, 1); |
|||
|
|||
return devectorize(state[0]); |
|||
} |
@ -0,0 +1,281 @@ |
|||
/*
|
|||
This file is part of c-ethash. |
|||
|
|||
c-ethash is free software: you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License as published by |
|||
the Free Software Foundation, either version 3 of the License, or |
|||
(at your option) any later version. |
|||
|
|||
c-ethash is distributed in the hope that it will be useful, |
|||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
GNU General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with cpp-ethereum. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
/** @file ethash_cu_miner.cpp
|
|||
* @author Tim Hughes <tim@twistedfury.com> |
|||
* @date 2015 |
|||
*/ |
|||
|
|||
|
|||
#define _CRT_SECURE_NO_WARNINGS |
|||
|
|||
#include <cstdio> |
|||
#include <cstdlib> |
|||
#include <iostream> |
|||
#include <assert.h> |
|||
#include <queue> |
|||
#include <random> |
|||
#include <vector> |
|||
#include <chrono> |
|||
#include <thread> |
|||
#include <libethash/util.h> |
|||
#include <libethash/ethash.h> |
|||
#include "ethash_cu_miner.h" |
|||
#include "ethash_cu_miner_kernel_globals.h" |
|||
|
|||
|
|||
#define ETHASH_BYTES 32 |
|||
|
|||
// workaround lame platforms
|
|||
#if !CL_VERSION_1_2 |
|||
#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE |
|||
#define CL_MEM_HOST_READ_ONLY 0 |
|||
#endif |
|||
|
|||
#undef min |
|||
#undef max |
|||
|
|||
using namespace std; |
|||
|
|||
unsigned const ethash_cu_miner::c_defaultLocalWorkSize = 128; |
|||
unsigned const ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier = 2048; // * CL_DEFAULT_LOCAL_WORK_SIZE
|
|||
|
|||
ethash_cu_miner::search_hook::~search_hook() {} |
|||
|
|||
ethash_cu_miner::ethash_cu_miner() |
|||
{ |
|||
} |
|||
|
|||
std::string ethash_cu_miner::platform_info(unsigned _deviceId) |
|||
{ |
|||
int runtime_version; |
|||
int device_count; |
|||
|
|||
device_count = get_num_devices(); |
|||
|
|||
if (device_count == 0) |
|||
return std::string(); |
|||
|
|||
if (cudaRuntimeGetVersion(&runtime_version) == cudaErrorInvalidValue) |
|||
{ |
|||
cout << cudaGetErrorString(cudaErrorInvalidValue) << endl; |
|||
return std::string(); |
|||
} |
|||
|
|||
// use selected default device
|
|||
int device_num = std::min<int>((int)_deviceId, device_count - 1); |
|||
|
|||
cudaDeviceProp device_props; |
|||
if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice) |
|||
{ |
|||
cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl; |
|||
return std::string(); |
|||
} |
|||
|
|||
char platform[5]; |
|||
int version_major = runtime_version / 1000; |
|||
int version_minor = (runtime_version - (version_major * 1000)) / 10; |
|||
sprintf(platform, "%d.%d", version_major, version_minor); |
|||
|
|||
|
|||
char compute[5]; |
|||
sprintf(compute, "%d.%d", device_props.major, device_props.minor); |
|||
|
|||
return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + device_props.name + "\", \"version\": \"Compute " + std::string(compute) + "\" }"; |
|||
|
|||
} |
|||
|
|||
int ethash_cu_miner::get_num_devices() |
|||
{ |
|||
int device_count; |
|||
|
|||
if (cudaGetDeviceCount(&device_count) == cudaErrorNoDevice) |
|||
{ |
|||
cout << cudaGetErrorString(cudaErrorNoDevice) << endl; |
|||
return 0; |
|||
} |
|||
return device_count; |
|||
} |
|||
|
|||
void ethash_cu_miner::finish() |
|||
{ |
|||
for (unsigned i = 0; i != m_num_buffers; i++) { |
|||
cudaStreamDestroy(m_streams[i]); |
|||
m_streams[i] = 0; |
|||
} |
|||
cudaDeviceReset(); |
|||
} |
|||
|
|||
bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers, unsigned search_batch_size, unsigned workgroup_size, unsigned _deviceId, bool highcpu) |
|||
{ |
|||
|
|||
int device_count = get_num_devices(); |
|||
|
|||
if (device_count == 0) |
|||
return false; |
|||
|
|||
// use selected device
|
|||
int device_num = std::min<int>((int)_deviceId, device_count - 1); |
|||
|
|||
cudaDeviceProp device_props; |
|||
if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice) |
|||
{ |
|||
cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl; |
|||
return false; |
|||
} |
|||
|
|||
cout << "Using device: " << device_props.name << "(" << device_props.major << "." << device_props.minor << ")" << endl; |
|||
|
|||
cudaError_t r = cudaSetDevice(device_num); |
|||
if (r != cudaSuccess) |
|||
{ |
|||
cout << cudaGetErrorString(r) << endl; |
|||
return false; |
|||
} |
|||
cudaDeviceReset(); |
|||
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); |
|||
|
|||
m_num_buffers = num_buffers; |
|||
m_search_batch_size = search_batch_size; |
|||
|
|||
m_hash_buf = new void *[m_num_buffers]; |
|||
m_search_buf = new uint32_t *[m_num_buffers]; |
|||
m_streams = new cudaStream_t[m_num_buffers]; |
|||
|
|||
// use requested workgroup size, but we require multiple of 8
|
|||
m_workgroup_size = ((workgroup_size + 7) / 8) * 8; |
|||
|
|||
m_highcpu = highcpu; |
|||
|
|||
// patch source code
|
|||
cudaError result; |
|||
|
|||
uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES); |
|||
unsigned max_outputs = c_max_search_results; |
|||
|
|||
result = set_constants(&dagSize128, &max_outputs); |
|||
|
|||
// create buffer for dag
|
|||
result = cudaMalloc(&m_dag_ptr, _dagSize); |
|||
|
|||
// create buffer for header256
|
|||
result = cudaMalloc(&m_header, 32); |
|||
|
|||
// copy dag to CPU.
|
|||
result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice); |
|||
|
|||
// create mining buffers
|
|||
for (unsigned i = 0; i != m_num_buffers; ++i) |
|||
{ |
|||
result = cudaMallocHost(&m_hash_buf[i], 32 * c_hash_batch_size); |
|||
result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t)); |
|||
result = cudaStreamCreate(&m_streams[i]); |
|||
} |
|||
if (result != cudaSuccess) |
|||
{ |
|||
cout << cudaGetErrorString(result) << endl; |
|||
return false; |
|||
} |
|||
return true; |
|||
} |
|||
|
|||
/**
|
|||
* Prevent High CPU usage while waiting for an async task |
|||
*/ |
|||
static unsigned waitStream(cudaStream_t stream) |
|||
{ |
|||
unsigned wait_ms = 0; |
|||
while (cudaStreamQuery(stream) == cudaErrorNotReady) { |
|||
this_thread::sleep_for(chrono::milliseconds(10)); |
|||
wait_ms += 10; |
|||
} |
|||
return wait_ms; |
|||
} |
|||
|
|||
void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook& hook) |
|||
{ |
|||
struct pending_batch |
|||
{ |
|||
uint64_t start_nonce; |
|||
unsigned buf; |
|||
}; |
|||
std::queue<pending_batch> pending; |
|||
|
|||
static uint32_t const c_zero = 0; |
|||
|
|||
// update header constant buffer
|
|||
cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice); |
|||
for (unsigned i = 0; i != m_num_buffers; ++i) |
|||
{ |
|||
cudaMemcpy(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice); |
|||
} |
|||
cudaError err = cudaGetLastError(); |
|||
if (cudaSuccess != err) |
|||
{ |
|||
throw std::runtime_error(cudaGetErrorString(err)); |
|||
} |
|||
|
|||
unsigned buf = 0; |
|||
std::random_device engine; |
|||
uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine); |
|||
for (;; start_nonce += m_search_batch_size) |
|||
{ |
|||
run_ethash_search(m_search_batch_size / m_workgroup_size, m_workgroup_size, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target); |
|||
|
|||
pending.push({ start_nonce, buf }); |
|||
buf = (buf + 1) % m_num_buffers; |
|||
|
|||
// read results
|
|||
if (pending.size() == m_num_buffers) |
|||
{ |
|||
pending_batch const& batch = pending.front(); |
|||
|
|||
uint32_t results[1 + c_max_search_results]; |
|||
|
|||
if (!m_highcpu) |
|||
waitStream(m_streams[buf]); // 28ms
|
|||
cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyHostToHost, m_streams[batch.buf]); |
|||
|
|||
unsigned num_found = std::min<unsigned>(results[0], c_max_search_results); |
|||
uint64_t nonces[c_max_search_results]; |
|||
for (unsigned i = 0; i != num_found; ++i) |
|||
{ |
|||
nonces[i] = batch.start_nonce + results[i + 1]; |
|||
//cout << results[i + 1] << ", ";
|
|||
} |
|||
//if (num_found > 0)
|
|||
// cout << endl;
|
|||
|
|||
bool exit = num_found && hook.found(nonces, num_found); |
|||
exit |= hook.searched(batch.start_nonce, m_search_batch_size); // always report searched before exit
|
|||
if (exit) |
|||
break; |
|||
|
|||
start_nonce += m_search_batch_size; |
|||
// reset search buffer if we're still going
|
|||
if (num_found) |
|||
cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]); |
|||
|
|||
cudaError err = cudaGetLastError(); |
|||
if (cudaSuccess != err) |
|||
{ |
|||
throw std::runtime_error(cudaGetErrorString(err)); |
|||
} |
|||
pending.pop(); |
|||
} |
|||
} |
|||
} |
|||
|
@ -0,0 +1,56 @@ |
|||
#pragma once |
|||
|
|||
#include <cuda_runtime.h> |
|||
|
|||
#include <time.h> |
|||
#include <functional> |
|||
#include <libethash/ethash.h> |
|||
#include "ethash_cu_miner_kernel.h" |
|||
|
|||
class ethash_cu_miner |
|||
{ |
|||
public: |
|||
struct search_hook |
|||
{ |
|||
virtual ~search_hook(); // always a virtual destructor for a class with virtuals.
|
|||
|
|||
// reports progress, return true to abort
|
|||
virtual bool found(uint64_t const* nonces, uint32_t count) = 0; |
|||
virtual bool searched(uint64_t start_nonce, uint32_t count) = 0; |
|||
}; |
|||
|
|||
public: |
|||
ethash_cu_miner(); |
|||
|
|||
bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers = 2, unsigned search_batch_size = 262144, unsigned workgroup_size = 64, unsigned _deviceId = 0, bool highcpu = false); |
|||
static std::string platform_info(unsigned _deviceId = 0); |
|||
static int get_num_devices(); |
|||
|
|||
|
|||
void finish(); |
|||
void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count); |
|||
void search(uint8_t const* header, uint64_t target, search_hook& hook); |
|||
|
|||
/* -- default values -- */ |
|||
/// Default value of the local work size. Also known as workgroup size.
|
|||
static unsigned const c_defaultLocalWorkSize; |
|||
/// Default value of the global work size as a multiplier of the local work size
|
|||
static unsigned const c_defaultGlobalWorkSizeMultiplier; |
|||
|
|||
private: |
|||
enum { c_max_search_results = 63, c_hash_batch_size = 1024 }; |
|||
|
|||
bool m_highcpu; |
|||
unsigned m_num_buffers; |
|||
unsigned m_search_batch_size; |
|||
unsigned m_workgroup_size; |
|||
|
|||
hash128_t * m_dag_ptr; |
|||
hash32_t * m_header; |
|||
|
|||
void ** m_hash_buf; |
|||
uint32_t ** m_search_buf; |
|||
cudaStream_t * m_streams; |
|||
|
|||
|
|||
}; |
@ -0,0 +1,72 @@ |
|||
/* |
|||
* Genoil's CUDA mining kernel for Ethereum |
|||
* based on Tim Hughes' opencl kernel. |
|||
* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer. |
|||
*/ |
|||
|
|||
#include "ethash_cu_miner_kernel.h" |
|||
#include "ethash_cu_miner_kernel_globals.h" |
|||
#include "cuda_helper.h" |
|||
|
|||
#define SHUFFLE_MIN_VER 350 |
|||
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER |
|||
#include "dagger_shuffled.cuh" |
|||
#else |
|||
#include "dagger_shared.cuh" |
|||
#endif |
|||
|
|||
__global__ void |
|||
__launch_bounds__(128, 7) |
|||
ethash_search( |
|||
uint32_t* g_output, |
|||
hash32_t const* g_header, |
|||
hash128_t const* g_dag, |
|||
uint64_t start_nonce, |
|||
uint64_t target |
|||
) |
|||
{ |
|||
|
|||
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; |
|||
|
|||
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER |
|||
uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid); |
|||
if (cuda_swab64(hash) < target) |
|||
#else |
|||
hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid); |
|||
if (cuda_swab64(hash.uint64s[0]) < target) |
|||
#endif |
|||
{ |
|||
atomicInc(g_output, d_max_outputs); |
|||
g_output[g_output[0]] = gid; |
|||
} |
|||
|
|||
} |
|||
|
|||
void run_ethash_search( |
|||
uint32_t blocks, |
|||
uint32_t threads, |
|||
cudaStream_t stream, |
|||
uint32_t* g_output, |
|||
hash32_t const* g_header, |
|||
hash128_t const* g_dag, |
|||
uint64_t start_nonce, |
|||
uint64_t target |
|||
) |
|||
{ |
|||
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER |
|||
ethash_search <<<blocks, threads, 0, stream >>>(g_output, g_header, g_dag, start_nonce, target); |
|||
#else |
|||
ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target); |
|||
#endif |
|||
} |
|||
|
|||
cudaError set_constants( |
|||
uint32_t * dag_size, |
|||
uint32_t * max_outputs |
|||
) |
|||
{ |
|||
cudaError result; |
|||
result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t)); |
|||
result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t)); |
|||
return result; |
|||
} |
@ -0,0 +1,60 @@ |
|||
#ifndef _ETHASH_CU_MINER_KERNEL_H_ |
|||
#define _ETHASH_CU_MINER_KERNEL_H_ |
|||
|
|||
#include <stdint.h> |
|||
|
|||
|
|||
typedef union |
|||
{ |
|||
uint64_t uint64s[16 / sizeof(uint64_t)]; |
|||
uint32_t uint32s[16 / sizeof(uint32_t)]; |
|||
} hash16_t; |
|||
|
|||
typedef union |
|||
{ |
|||
uint32_t uint32s[32 / sizeof(uint32_t)]; |
|||
uint64_t uint64s[32 / sizeof(uint64_t)]; |
|||
uint2 uint2s[32 / sizeof(uint2)]; |
|||
} hash32_t; |
|||
|
|||
|
|||
typedef union |
|||
{ |
|||
uint32_t uint32s[64 / sizeof(uint32_t)]; |
|||
uint64_t uint64s[64 / sizeof(uint64_t)]; |
|||
uint4 uint4s[64 / sizeof(uint4)]; |
|||
} hash64_t; |
|||
|
|||
|
|||
typedef union |
|||
{ |
|||
uint32_t uint32s[128 / sizeof(uint32_t)]; |
|||
uint4 uint4s[128 / sizeof(uint4)]; |
|||
} hash128_t; |
|||
|
|||
//typedef uint32_t hash128_t;
|
|||
|
|||
cudaError set_constants( |
|||
uint32_t * dag_size, |
|||
uint32_t * max_outputs |
|||
); |
|||
|
|||
void run_ethash_hash( |
|||
hash32_t* g_hashes, |
|||
hash32_t const* g_header, |
|||
hash128_t const* g_dag, |
|||
uint64_t start_nonce |
|||
); |
|||
|
|||
void run_ethash_search( |
|||
uint32_t search_batch_size, |
|||
uint32_t workgroup_size, |
|||
cudaStream_t stream, |
|||
uint32_t* g_output, |
|||
hash32_t const* g_header, |
|||
hash128_t const* g_dag, |
|||
uint64_t start_nonce, |
|||
uint64_t target |
|||
); |
|||
|
|||
#endif |
@ -0,0 +1,9 @@ |
|||
#ifndef _ETHASH_CU_MINER_KERNEL_GLOBALS_H_ |
|||
#define _ETHASH_CU_MINER_KERNEL_GLOBALS_H_ |
|||
|
|||
//#include "cuda_helper.h"
|
|||
|
|||
__constant__ uint32_t d_dag_size; |
|||
__constant__ uint32_t d_max_outputs; |
|||
|
|||
#endif |
@ -0,0 +1,89 @@ |
|||
#include "cuda_helper.h" |
|||
|
|||
__device__ __constant__ uint64_t const keccak_round_constants[24] = { |
|||
0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, |
|||
0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL, |
|||
0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL, |
|||
0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL, |
|||
0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, |
|||
0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, |
|||
0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL, |
|||
0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL |
|||
}; |
|||
|
|||
#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) |
|||
|
|||
__device__ __forceinline__ void keccak_f1600_block(uint2* s, uint32_t out_size) |
|||
{ |
|||
uint2 t[5], u, v; |
|||
|
|||
#pragma unroll 3 |
|||
for (int i = 0; i < 24; i++) |
|||
{ |
|||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ |
|||
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; |
|||
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; |
|||
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; |
|||
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; |
|||
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; |
|||
|
|||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ |
|||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ |
|||
u = t[4] ^ ROL2(t[1], 1); |
|||
s[0] ^= u; s[5] ^= u; s[10] ^= u; s[15] ^= u; s[20] ^= u; |
|||
u = t[0] ^ ROL2(t[2], 1); |
|||
s[1] ^= u; s[6] ^= u; s[11] ^= u; s[16] ^= u; s[21] ^= u; |
|||
u = t[1] ^ ROL2(t[3], 1); |
|||
s[2] ^= u; s[7] ^= u; s[12] ^= u; s[17] ^= u; s[22] ^= u; |
|||
u = t[2] ^ ROL2(t[4], 1); |
|||
s[3] ^= u; s[8] ^= u; s[13] ^= u; s[18] ^= u; s[23] ^= u; |
|||
u = t[3] ^ ROL2(t[0], 1); |
|||
s[4] ^= u; s[9] ^= u; s[14] ^= u; s[19] ^= u; s[24] ^= u; |
|||
|
|||
/* rho pi: b[..] = rotl(a[..], ..) */ |
|||
u = s[1]; |
|||
|
|||
s[1] = ROL2(s[6], 44); |
|||
s[6] = ROL2(s[9], 20); |
|||
s[9] = ROL2(s[22], 61); |
|||
s[22] = ROL2(s[14], 39); |
|||
s[14] = ROL2(s[20], 18); |
|||
s[20] = ROL2(s[2], 62); |
|||
s[2] = ROL2(s[12], 43); |
|||
s[12] = ROL2(s[13], 25); |
|||
s[13] = ROL2(s[19], 8); |
|||
s[19] = ROL2(s[23], 56); |
|||
s[23] = ROL2(s[15], 41); |
|||
s[15] = ROL2(s[4], 27); |
|||
s[4] = ROL2(s[24], 14); |
|||
s[24] = ROL2(s[21], 2); |
|||
s[21] = ROL2(s[8], 55); |
|||
s[8] = ROL2(s[16], 45); |
|||
s[16] = ROL2(s[5], 36); |
|||
s[5] = ROL2(s[3], 28); |
|||
s[3] = ROL2(s[18], 21); |
|||
s[18] = ROL2(s[17], 15); |
|||
s[17] = ROL2(s[11], 10); |
|||
s[11] = ROL2(s[7], 6); |
|||
s[7] = ROL2(s[10], 3); |
|||
s[10] = ROL2(u, 1); |
|||
|
|||
// squeeze this in here |
|||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ |
|||
u = s[0]; v = s[1]; s[0] ^= (~v) & s[2]; |
|||
|
|||
/* iota: a[0,0] ^= round constant */ |
|||
s[0] ^= vectorize(keccak_round_constants[i]); |
|||
if (i == 23 && out_size == 1) return; |
|||
|
|||
// continue chi |
|||
s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & u; s[4] ^= (~u) & v; |
|||
u = s[5]; v = s[6]; s[5] ^= (~v) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; |
|||
|
|||
if (i == 23) return; |
|||
s[8] ^= (~s[9]) & u; s[9] ^= (~u) & v; |
|||
u = s[10]; v = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ u, s[13], s[14]); s[14] = bitselect(s[14] ^ v, s[14], u); |
|||
u = s[15]; v = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ u, s[18], s[19]); s[19] = bitselect(s[19] ^ v, s[19], u); |
|||
u = s[20]; v = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ u, s[23], s[24]); s[24] = bitselect(s[24] ^ v, s[24], u); |
|||
} |
|||
} |
Loading…
Reference in new issue