Jan Willem Penterman
10 years ago
15 changed files with 1989 additions and 5 deletions
@ -0,0 +1,29 @@ |
|||||
|
set(EXECUTABLE ethash-cu) |
||||
|
|
||||
|
FIND_PACKAGE(CUDA REQUIRED) |
||||
|
|
||||
|
file(GLOB SRC_LIST "*.cpp" "*.cu") |
||||
|
file(GLOB HEADERS "*.h" "*.cuh") |
||||
|
|
||||
|
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo) |
||||
|
|
||||
|
LIST(APPEND CUDA_NVCC_FLAGS_RELEASE -O3) |
||||
|
LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G) |
||||
|
|
||||
|
if(COMPUTE) |
||||
|
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) |
||||
|
else(COMPUTE) |
||||
|
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_32,code=sm_32;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52) |
||||
|
endif(COMPUTE) |
||||
|
|
||||
|
|
||||
|
|
||||
|
include_directories(${CMAKE_CURRENT_BINARY_DIR}) |
||||
|
include_directories(${CUDA_INCLUDE_DIRS}) |
||||
|
include_directories(..) |
||||
|
CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS}) |
||||
|
TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} ethash) |
||||
|
|
||||
|
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib ) |
||||
|
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} ) |
||||
|
|
File diff suppressed because it is too large
@ -0,0 +1,22 @@ |
|||||
|
#define copy(dst, src, count) for (uint32_t i = 0; i < count; i++) { (dst)[i] = (src)[i]; } |
||||
|
|
||||
|
#define ACCESSES 64 |
||||
|
#define THREADS_PER_HASH (128 / 16) |
||||
|
#define FNV_PRIME 0x01000193 |
||||
|
|
||||
|
#define fnv(x,y) ((x) * FNV_PRIME ^(y)) |
||||
|
|
||||
|
__device__ uint4 fnv4(uint4 a, uint4 b) |
||||
|
{ |
||||
|
uint4 c; |
||||
|
c.x = a.x * FNV_PRIME ^ b.x; |
||||
|
c.y = a.y * FNV_PRIME ^ b.y; |
||||
|
c.z = a.z * FNV_PRIME ^ b.z; |
||||
|
c.w = a.w * FNV_PRIME ^ b.w; |
||||
|
return c; |
||||
|
} |
||||
|
|
||||
|
__device__ uint32_t fnv_reduce(uint4 v) |
||||
|
{ |
||||
|
return fnv(fnv(fnv(v.x, v.y), v.z), v.w); |
||||
|
} |
@ -0,0 +1,136 @@ |
|||||
|
#include "ethash_cu_miner_kernel_globals.h" |
||||
|
#include "ethash_cu_miner_kernel.h" |
||||
|
#include "keccak.cuh" |
||||
|
#include "dagger.cuh" |
||||
|
|
||||
|
typedef union |
||||
|
{ |
||||
|
hash64_t init; |
||||
|
hash32_t mix; |
||||
|
} compute_hash_share; |
||||
|
|
||||
|
__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce) |
||||
|
{ |
||||
|
hash64_t init; |
||||
|
|
||||
|
// sha3_512(header .. nonce) |
||||
|
uint64_t state[25]; |
||||
|
|
||||
|
copy(state, header->uint64s, 4); |
||||
|
state[4] = nonce; |
||||
|
state[5] = 0x0000000000000001; |
||||
|
state[6] = 0; |
||||
|
state[7] = 0; |
||||
|
state[8] = 0x8000000000000000; |
||||
|
for (uint32_t i = 9; i < 25; i++) |
||||
|
{ |
||||
|
state[i] = 0; |
||||
|
} |
||||
|
|
||||
|
keccak_f1600_block((uint2 *)state, 8); |
||||
|
copy(init.uint64s, state, 8); |
||||
|
return init; |
||||
|
} |
||||
|
|
||||
|
__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag) |
||||
|
{ |
||||
|
// share init0 |
||||
|
if (thread_id == 0) |
||||
|
*share = mix.x; |
||||
|
|
||||
|
uint32_t init0 = *share; |
||||
|
|
||||
|
uint32_t a = 0; |
||||
|
|
||||
|
do |
||||
|
{ |
||||
|
|
||||
|
bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); |
||||
|
|
||||
|
//#pragma unroll 4 |
||||
|
for (uint32_t i = 0; i < 4; i++) |
||||
|
{ |
||||
|
|
||||
|
if (update_share) |
||||
|
{ |
||||
|
uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w }; |
||||
|
*share = fnv(init0 ^ (a + i), m[i]) % d_dag_size; |
||||
|
} |
||||
|
__threadfence_block(); |
||||
|
|
||||
|
#if __CUDA_ARCH__ >= 350 |
||||
|
mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id])); |
||||
|
#else |
||||
|
mix = fnv4(mix, g_dag[*share].uint4s[thread_id]); |
||||
|
#endif |
||||
|
|
||||
|
} |
||||
|
|
||||
|
} while ((a += 4) != ACCESSES); |
||||
|
|
||||
|
return fnv_reduce(mix); |
||||
|
} |
||||
|
|
||||
|
__device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix) |
||||
|
{ |
||||
|
uint64_t state[25]; |
||||
|
|
||||
|
hash32_t hash; |
||||
|
|
||||
|
// keccak_256(keccak_512(header..nonce) .. mix); |
||||
|
copy(state, init->uint64s, 8); |
||||
|
copy(state + 8, mix->uint64s, 4); |
||||
|
state[12] = 0x0000000000000001; |
||||
|
for (uint32_t i = 13; i < 16; i++) |
||||
|
{ |
||||
|
state[i] = 0; |
||||
|
} |
||||
|
state[16] = 0x8000000000000000; |
||||
|
for (uint32_t i = 17; i < 25; i++) |
||||
|
{ |
||||
|
state[i] = 0; |
||||
|
} |
||||
|
|
||||
|
keccak_f1600_block((uint2 *)state, 1); |
||||
|
|
||||
|
// copy out |
||||
|
copy(hash.uint64s, state, 4); |
||||
|
return hash; |
||||
|
} |
||||
|
|
||||
|
__device__ hash32_t compute_hash( |
||||
|
hash32_t const* g_header, |
||||
|
hash128_t const* g_dag, |
||||
|
uint64_t nonce |
||||
|
) |
||||
|
{ |
||||
|
extern __shared__ compute_hash_share share[]; |
||||
|
|
||||
|
// Compute one init hash per work item. |
||||
|
hash64_t init = init_hash(g_header, nonce); |
||||
|
|
||||
|
// Threads work together in this phase in groups of 8. |
||||
|
uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1); |
||||
|
uint32_t const hash_id = threadIdx.x >> 3; |
||||
|
|
||||
|
hash32_t mix; |
||||
|
|
||||
|
for (int i = 0; i < THREADS_PER_HASH; i++) |
||||
|
{ |
||||
|
// share init with other threads |
||||
|
if (i == thread_id) |
||||
|
share[hash_id].init = init; |
||||
|
|
||||
|
uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3]; |
||||
|
|
||||
|
uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag); |
||||
|
|
||||
|
share[hash_id].mix.uint32s[thread_id] = thread_mix; |
||||
|
|
||||
|
|
||||
|
if (i == thread_id) |
||||
|
mix = share[hash_id].mix; |
||||
|
} |
||||
|
|
||||
|
return final_hash(&init, &mix); |
||||
|
} |
@ -0,0 +1,107 @@ |
|||||
|
#include "ethash_cu_miner_kernel_globals.h" |
||||
|
#include "ethash_cu_miner_kernel.h" |
||||
|
#include "keccak.cuh" |
||||
|
#include "dagger.cuh" |
||||
|
|
||||
|
__device__ uint64_t compute_hash_shuffle( |
||||
|
uint2 const* g_header, |
||||
|
hash128_t const* g_dag, |
||||
|
uint64_t nonce |
||||
|
) |
||||
|
{ |
||||
|
// sha3_512(header .. nonce) |
||||
|
uint2 state[25]; |
||||
|
|
||||
|
state[0] = g_header[0]; |
||||
|
state[1] = g_header[1]; |
||||
|
state[2] = g_header[2]; |
||||
|
state[3] = g_header[3]; |
||||
|
state[4] = vectorize(nonce); |
||||
|
state[5] = vectorize(0x0000000000000001ULL); |
||||
|
for (uint32_t i = 6; i < 25; i++) |
||||
|
{ |
||||
|
state[i] = make_uint2(0, 0); |
||||
|
} |
||||
|
state[8] = vectorize(0x8000000000000000ULL); |
||||
|
keccak_f1600_block(state,8); |
||||
|
|
||||
|
// Threads work together in this phase in groups of 8. |
||||
|
const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); |
||||
|
const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1); |
||||
|
const int mix_idx = thread_id & 3; |
||||
|
|
||||
|
uint4 mix; |
||||
|
uint2 shuffle[8]; |
||||
|
|
||||
|
for (int i = 0; i < THREADS_PER_HASH; i++) |
||||
|
{ |
||||
|
// share init among threads |
||||
|
for (int j = 0; j < 8; j++) { |
||||
|
shuffle[j].x = __shfl(state[j].x, start_lane + i); |
||||
|
shuffle[j].y = __shfl(state[j].y, start_lane + i); |
||||
|
} |
||||
|
|
||||
|
// ugly but avoids local reads/writes |
||||
|
if (mix_idx < 2) { |
||||
|
if (mix_idx == 0) |
||||
|
mix = vectorize2(shuffle[0], shuffle[1]); |
||||
|
else |
||||
|
mix = vectorize2(shuffle[2], shuffle[3]); |
||||
|
} |
||||
|
else { |
||||
|
if (mix_idx == 2) |
||||
|
mix = vectorize2(shuffle[4], shuffle[5]); |
||||
|
else |
||||
|
mix = vectorize2(shuffle[6], shuffle[7]); |
||||
|
} |
||||
|
|
||||
|
uint32_t init0 = __shfl(shuffle[0].x, start_lane); |
||||
|
|
||||
|
for (uint32_t a = 0; a < ACCESSES; a += 4) |
||||
|
{ |
||||
|
int t = ((a >> 2) & (THREADS_PER_HASH - 1)); |
||||
|
|
||||
|
for (uint32_t b = 0; b < 4; b++) |
||||
|
{ |
||||
|
if (thread_id == t) |
||||
|
{ |
||||
|
shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size; |
||||
|
} |
||||
|
shuffle[0].x = __shfl(shuffle[0].x, start_lane + t); |
||||
|
|
||||
|
mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
uint32_t thread_mix = fnv_reduce(mix); |
||||
|
|
||||
|
// update mix accross threads |
||||
|
shuffle[0].x = __shfl(thread_mix, start_lane + 0); |
||||
|
shuffle[0].y = __shfl(thread_mix, start_lane + 1); |
||||
|
shuffle[1].x = __shfl(thread_mix, start_lane + 2); |
||||
|
shuffle[1].y = __shfl(thread_mix, start_lane + 3); |
||||
|
shuffle[2].x = __shfl(thread_mix, start_lane + 4); |
||||
|
shuffle[2].y = __shfl(thread_mix, start_lane + 5); |
||||
|
shuffle[3].x = __shfl(thread_mix, start_lane + 6); |
||||
|
shuffle[3].y = __shfl(thread_mix, start_lane + 7); |
||||
|
|
||||
|
if (i == thread_id) { |
||||
|
//move mix into state: |
||||
|
state[8] = shuffle[0]; |
||||
|
state[9] = shuffle[1]; |
||||
|
state[10] = shuffle[2]; |
||||
|
state[11] = shuffle[3]; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// keccak_256(keccak_512(header..nonce) .. mix); |
||||
|
state[12] = vectorize(0x0000000000000001ULL); |
||||
|
for (uint32_t i = 13; i < 25; i++) |
||||
|
{ |
||||
|
state[i] = vectorize(0ULL); |
||||
|
} |
||||
|
state[16] = vectorize(0x8000000000000000); |
||||
|
keccak_f1600_block(state, 1); |
||||
|
|
||||
|
return devectorize(state[0]); |
||||
|
} |
@ -0,0 +1,281 @@ |
|||||
|
/*
|
||||
|
This file is part of c-ethash. |
||||
|
|
||||
|
c-ethash is free software: you can redistribute it and/or modify |
||||
|
it under the terms of the GNU General Public License as published by |
||||
|
the Free Software Foundation, either version 3 of the License, or |
||||
|
(at your option) any later version. |
||||
|
|
||||
|
c-ethash is distributed in the hope that it will be useful, |
||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
|
GNU General Public License for more details. |
||||
|
|
||||
|
You should have received a copy of the GNU General Public License |
||||
|
along with cpp-ethereum. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
*/ |
||||
|
/** @file ethash_cu_miner.cpp
|
||||
|
* @author Tim Hughes <tim@twistedfury.com> |
||||
|
* @date 2015 |
||||
|
*/ |
||||
|
|
||||
|
|
||||
|
#define _CRT_SECURE_NO_WARNINGS |
||||
|
|
||||
|
#include <cstdio> |
||||
|
#include <cstdlib> |
||||
|
#include <iostream> |
||||
|
#include <assert.h> |
||||
|
#include <queue> |
||||
|
#include <random> |
||||
|
#include <vector> |
||||
|
#include <chrono> |
||||
|
#include <thread> |
||||
|
#include <libethash/util.h> |
||||
|
#include <libethash/ethash.h> |
||||
|
#include "ethash_cu_miner.h" |
||||
|
#include "ethash_cu_miner_kernel_globals.h" |
||||
|
|
||||
|
|
||||
|
#define ETHASH_BYTES 32 |
||||
|
|
||||
|
// workaround lame platforms
|
||||
|
#if !CL_VERSION_1_2 |
||||
|
#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE |
||||
|
#define CL_MEM_HOST_READ_ONLY 0 |
||||
|
#endif |
||||
|
|
||||
|
#undef min |
||||
|
#undef max |
||||
|
|
||||
|
using namespace std; |
||||
|
|
||||
|
unsigned const ethash_cu_miner::c_defaultLocalWorkSize = 128; |
||||
|
unsigned const ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier = 2048; // * CL_DEFAULT_LOCAL_WORK_SIZE
|
||||
|
|
||||
|
ethash_cu_miner::search_hook::~search_hook() {} |
||||
|
|
||||
|
ethash_cu_miner::ethash_cu_miner() |
||||
|
{ |
||||
|
} |
||||
|
|
||||
|
std::string ethash_cu_miner::platform_info(unsigned _deviceId) |
||||
|
{ |
||||
|
int runtime_version; |
||||
|
int device_count; |
||||
|
|
||||
|
device_count = get_num_devices(); |
||||
|
|
||||
|
if (device_count == 0) |
||||
|
return std::string(); |
||||
|
|
||||
|
if (cudaRuntimeGetVersion(&runtime_version) == cudaErrorInvalidValue) |
||||
|
{ |
||||
|
cout << cudaGetErrorString(cudaErrorInvalidValue) << endl; |
||||
|
return std::string(); |
||||
|
} |
||||
|
|
||||
|
// use selected default device
|
||||
|
int device_num = std::min<int>((int)_deviceId, device_count - 1); |
||||
|
|
||||
|
cudaDeviceProp device_props; |
||||
|
if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice) |
||||
|
{ |
||||
|
cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl; |
||||
|
return std::string(); |
||||
|
} |
||||
|
|
||||
|
char platform[5]; |
||||
|
int version_major = runtime_version / 1000; |
||||
|
int version_minor = (runtime_version - (version_major * 1000)) / 10; |
||||
|
sprintf(platform, "%d.%d", version_major, version_minor); |
||||
|
|
||||
|
|
||||
|
char compute[5]; |
||||
|
sprintf(compute, "%d.%d", device_props.major, device_props.minor); |
||||
|
|
||||
|
return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + device_props.name + "\", \"version\": \"Compute " + std::string(compute) + "\" }"; |
||||
|
|
||||
|
} |
||||
|
|
||||
|
int ethash_cu_miner::get_num_devices() |
||||
|
{ |
||||
|
int device_count; |
||||
|
|
||||
|
if (cudaGetDeviceCount(&device_count) == cudaErrorNoDevice) |
||||
|
{ |
||||
|
cout << cudaGetErrorString(cudaErrorNoDevice) << endl; |
||||
|
return 0; |
||||
|
} |
||||
|
return device_count; |
||||
|
} |
||||
|
|
||||
|
void ethash_cu_miner::finish() |
||||
|
{ |
||||
|
for (unsigned i = 0; i != m_num_buffers; i++) { |
||||
|
cudaStreamDestroy(m_streams[i]); |
||||
|
m_streams[i] = 0; |
||||
|
} |
||||
|
cudaDeviceReset(); |
||||
|
} |
||||
|
|
||||
|
bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers, unsigned search_batch_size, unsigned workgroup_size, unsigned _deviceId, bool highcpu) |
||||
|
{ |
||||
|
|
||||
|
int device_count = get_num_devices(); |
||||
|
|
||||
|
if (device_count == 0) |
||||
|
return false; |
||||
|
|
||||
|
// use selected device
|
||||
|
int device_num = std::min<int>((int)_deviceId, device_count - 1); |
||||
|
|
||||
|
cudaDeviceProp device_props; |
||||
|
if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice) |
||||
|
{ |
||||
|
cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl; |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
cout << "Using device: " << device_props.name << "(" << device_props.major << "." << device_props.minor << ")" << endl; |
||||
|
|
||||
|
cudaError_t r = cudaSetDevice(device_num); |
||||
|
if (r != cudaSuccess) |
||||
|
{ |
||||
|
cout << cudaGetErrorString(r) << endl; |
||||
|
return false; |
||||
|
} |
||||
|
cudaDeviceReset(); |
||||
|
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); |
||||
|
|
||||
|
m_num_buffers = num_buffers; |
||||
|
m_search_batch_size = search_batch_size; |
||||
|
|
||||
|
m_hash_buf = new void *[m_num_buffers]; |
||||
|
m_search_buf = new uint32_t *[m_num_buffers]; |
||||
|
m_streams = new cudaStream_t[m_num_buffers]; |
||||
|
|
||||
|
// use requested workgroup size, but we require multiple of 8
|
||||
|
m_workgroup_size = ((workgroup_size + 7) / 8) * 8; |
||||
|
|
||||
|
m_highcpu = highcpu; |
||||
|
|
||||
|
// patch source code
|
||||
|
cudaError result; |
||||
|
|
||||
|
uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES); |
||||
|
unsigned max_outputs = c_max_search_results; |
||||
|
|
||||
|
result = set_constants(&dagSize128, &max_outputs); |
||||
|
|
||||
|
// create buffer for dag
|
||||
|
result = cudaMalloc(&m_dag_ptr, _dagSize); |
||||
|
|
||||
|
// create buffer for header256
|
||||
|
result = cudaMalloc(&m_header, 32); |
||||
|
|
||||
|
// copy dag to CPU.
|
||||
|
result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice); |
||||
|
|
||||
|
// create mining buffers
|
||||
|
for (unsigned i = 0; i != m_num_buffers; ++i) |
||||
|
{ |
||||
|
result = cudaMallocHost(&m_hash_buf[i], 32 * c_hash_batch_size); |
||||
|
result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t)); |
||||
|
result = cudaStreamCreate(&m_streams[i]); |
||||
|
} |
||||
|
if (result != cudaSuccess) |
||||
|
{ |
||||
|
cout << cudaGetErrorString(result) << endl; |
||||
|
return false; |
||||
|
} |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
/**
|
||||
|
* Prevent High CPU usage while waiting for an async task |
||||
|
*/ |
||||
|
static unsigned waitStream(cudaStream_t stream) |
||||
|
{ |
||||
|
unsigned wait_ms = 0; |
||||
|
while (cudaStreamQuery(stream) == cudaErrorNotReady) { |
||||
|
this_thread::sleep_for(chrono::milliseconds(10)); |
||||
|
wait_ms += 10; |
||||
|
} |
||||
|
return wait_ms; |
||||
|
} |
||||
|
|
||||
|
void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook& hook) |
||||
|
{ |
||||
|
struct pending_batch |
||||
|
{ |
||||
|
uint64_t start_nonce; |
||||
|
unsigned buf; |
||||
|
}; |
||||
|
std::queue<pending_batch> pending; |
||||
|
|
||||
|
static uint32_t const c_zero = 0; |
||||
|
|
||||
|
// update header constant buffer
|
||||
|
cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice); |
||||
|
for (unsigned i = 0; i != m_num_buffers; ++i) |
||||
|
{ |
||||
|
cudaMemcpy(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice); |
||||
|
} |
||||
|
cudaError err = cudaGetLastError(); |
||||
|
if (cudaSuccess != err) |
||||
|
{ |
||||
|
throw std::runtime_error(cudaGetErrorString(err)); |
||||
|
} |
||||
|
|
||||
|
unsigned buf = 0; |
||||
|
std::random_device engine; |
||||
|
uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine); |
||||
|
for (;; start_nonce += m_search_batch_size) |
||||
|
{ |
||||
|
run_ethash_search(m_search_batch_size / m_workgroup_size, m_workgroup_size, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target); |
||||
|
|
||||
|
pending.push({ start_nonce, buf }); |
||||
|
buf = (buf + 1) % m_num_buffers; |
||||
|
|
||||
|
// read results
|
||||
|
if (pending.size() == m_num_buffers) |
||||
|
{ |
||||
|
pending_batch const& batch = pending.front(); |
||||
|
|
||||
|
uint32_t results[1 + c_max_search_results]; |
||||
|
|
||||
|
if (!m_highcpu) |
||||
|
waitStream(m_streams[buf]); // 28ms
|
||||
|
cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyHostToHost, m_streams[batch.buf]); |
||||
|
|
||||
|
unsigned num_found = std::min<unsigned>(results[0], c_max_search_results); |
||||
|
uint64_t nonces[c_max_search_results]; |
||||
|
for (unsigned i = 0; i != num_found; ++i) |
||||
|
{ |
||||
|
nonces[i] = batch.start_nonce + results[i + 1]; |
||||
|
//cout << results[i + 1] << ", ";
|
||||
|
} |
||||
|
//if (num_found > 0)
|
||||
|
// cout << endl;
|
||||
|
|
||||
|
bool exit = num_found && hook.found(nonces, num_found); |
||||
|
exit |= hook.searched(batch.start_nonce, m_search_batch_size); // always report searched before exit
|
||||
|
if (exit) |
||||
|
break; |
||||
|
|
||||
|
start_nonce += m_search_batch_size; |
||||
|
// reset search buffer if we're still going
|
||||
|
if (num_found) |
||||
|
cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]); |
||||
|
|
||||
|
cudaError err = cudaGetLastError(); |
||||
|
if (cudaSuccess != err) |
||||
|
{ |
||||
|
throw std::runtime_error(cudaGetErrorString(err)); |
||||
|
} |
||||
|
pending.pop(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
@ -0,0 +1,56 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <cuda_runtime.h> |
||||
|
|
||||
|
#include <time.h> |
||||
|
#include <functional> |
||||
|
#include <libethash/ethash.h> |
||||
|
#include "ethash_cu_miner_kernel.h" |
||||
|
|
||||
|
class ethash_cu_miner |
||||
|
{ |
||||
|
public: |
||||
|
struct search_hook |
||||
|
{ |
||||
|
virtual ~search_hook(); // always a virtual destructor for a class with virtuals.
|
||||
|
|
||||
|
// reports progress, return true to abort
|
||||
|
virtual bool found(uint64_t const* nonces, uint32_t count) = 0; |
||||
|
virtual bool searched(uint64_t start_nonce, uint32_t count) = 0; |
||||
|
}; |
||||
|
|
||||
|
public: |
||||
|
ethash_cu_miner(); |
||||
|
|
||||
|
bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers = 2, unsigned search_batch_size = 262144, unsigned workgroup_size = 64, unsigned _deviceId = 0, bool highcpu = false); |
||||
|
static std::string platform_info(unsigned _deviceId = 0); |
||||
|
static int get_num_devices(); |
||||
|
|
||||
|
|
||||
|
void finish(); |
||||
|
void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count); |
||||
|
void search(uint8_t const* header, uint64_t target, search_hook& hook); |
||||
|
|
||||
|
/* -- default values -- */ |
||||
|
/// Default value of the local work size. Also known as workgroup size.
|
||||
|
static unsigned const c_defaultLocalWorkSize; |
||||
|
/// Default value of the global work size as a multiplier of the local work size
|
||||
|
static unsigned const c_defaultGlobalWorkSizeMultiplier; |
||||
|
|
||||
|
private: |
||||
|
enum { c_max_search_results = 63, c_hash_batch_size = 1024 }; |
||||
|
|
||||
|
bool m_highcpu; |
||||
|
unsigned m_num_buffers; |
||||
|
unsigned m_search_batch_size; |
||||
|
unsigned m_workgroup_size; |
||||
|
|
||||
|
hash128_t * m_dag_ptr; |
||||
|
hash32_t * m_header; |
||||
|
|
||||
|
void ** m_hash_buf; |
||||
|
uint32_t ** m_search_buf; |
||||
|
cudaStream_t * m_streams; |
||||
|
|
||||
|
|
||||
|
}; |
@ -0,0 +1,72 @@ |
|||||
|
/* |
||||
|
* Genoil's CUDA mining kernel for Ethereum |
||||
|
* based on Tim Hughes' opencl kernel. |
||||
|
* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer. |
||||
|
*/ |
||||
|
|
||||
|
#include "ethash_cu_miner_kernel.h" |
||||
|
#include "ethash_cu_miner_kernel_globals.h" |
||||
|
#include "cuda_helper.h" |
||||
|
|
||||
|
#define SHUFFLE_MIN_VER 350 |
||||
|
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER |
||||
|
#include "dagger_shuffled.cuh" |
||||
|
#else |
||||
|
#include "dagger_shared.cuh" |
||||
|
#endif |
||||
|
|
||||
|
__global__ void |
||||
|
__launch_bounds__(128, 7) |
||||
|
ethash_search( |
||||
|
uint32_t* g_output, |
||||
|
hash32_t const* g_header, |
||||
|
hash128_t const* g_dag, |
||||
|
uint64_t start_nonce, |
||||
|
uint64_t target |
||||
|
) |
||||
|
{ |
||||
|
|
||||
|
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; |
||||
|
|
||||
|
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER |
||||
|
uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid); |
||||
|
if (cuda_swab64(hash) < target) |
||||
|
#else |
||||
|
hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid); |
||||
|
if (cuda_swab64(hash.uint64s[0]) < target) |
||||
|
#endif |
||||
|
{ |
||||
|
atomicInc(g_output, d_max_outputs); |
||||
|
g_output[g_output[0]] = gid; |
||||
|
} |
||||
|
|
||||
|
} |
||||
|
|
||||
|
void run_ethash_search( |
||||
|
uint32_t blocks, |
||||
|
uint32_t threads, |
||||
|
cudaStream_t stream, |
||||
|
uint32_t* g_output, |
||||
|
hash32_t const* g_header, |
||||
|
hash128_t const* g_dag, |
||||
|
uint64_t start_nonce, |
||||
|
uint64_t target |
||||
|
) |
||||
|
{ |
||||
|
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER |
||||
|
ethash_search <<<blocks, threads, 0, stream >>>(g_output, g_header, g_dag, start_nonce, target); |
||||
|
#else |
||||
|
ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target); |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
cudaError set_constants( |
||||
|
uint32_t * dag_size, |
||||
|
uint32_t * max_outputs |
||||
|
) |
||||
|
{ |
||||
|
cudaError result; |
||||
|
result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t)); |
||||
|
result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t)); |
||||
|
return result; |
||||
|
} |
@ -0,0 +1,60 @@ |
|||||
|
#ifndef _ETHASH_CU_MINER_KERNEL_H_ |
||||
|
#define _ETHASH_CU_MINER_KERNEL_H_ |
||||
|
|
||||
|
#include <stdint.h> |
||||
|
|
||||
|
|
||||
|
typedef union |
||||
|
{ |
||||
|
uint64_t uint64s[16 / sizeof(uint64_t)]; |
||||
|
uint32_t uint32s[16 / sizeof(uint32_t)]; |
||||
|
} hash16_t; |
||||
|
|
||||
|
typedef union |
||||
|
{ |
||||
|
uint32_t uint32s[32 / sizeof(uint32_t)]; |
||||
|
uint64_t uint64s[32 / sizeof(uint64_t)]; |
||||
|
uint2 uint2s[32 / sizeof(uint2)]; |
||||
|
} hash32_t; |
||||
|
|
||||
|
|
||||
|
typedef union |
||||
|
{ |
||||
|
uint32_t uint32s[64 / sizeof(uint32_t)]; |
||||
|
uint64_t uint64s[64 / sizeof(uint64_t)]; |
||||
|
uint4 uint4s[64 / sizeof(uint4)]; |
||||
|
} hash64_t; |
||||
|
|
||||
|
|
||||
|
typedef union |
||||
|
{ |
||||
|
uint32_t uint32s[128 / sizeof(uint32_t)]; |
||||
|
uint4 uint4s[128 / sizeof(uint4)]; |
||||
|
} hash128_t; |
||||
|
|
||||
|
//typedef uint32_t hash128_t;
|
||||
|
|
||||
|
cudaError set_constants( |
||||
|
uint32_t * dag_size, |
||||
|
uint32_t * max_outputs |
||||
|
); |
||||
|
|
||||
|
void run_ethash_hash( |
||||
|
hash32_t* g_hashes, |
||||
|
hash32_t const* g_header, |
||||
|
hash128_t const* g_dag, |
||||
|
uint64_t start_nonce |
||||
|
); |
||||
|
|
||||
|
void run_ethash_search( |
||||
|
uint32_t search_batch_size, |
||||
|
uint32_t workgroup_size, |
||||
|
cudaStream_t stream, |
||||
|
uint32_t* g_output, |
||||
|
hash32_t const* g_header, |
||||
|
hash128_t const* g_dag, |
||||
|
uint64_t start_nonce, |
||||
|
uint64_t target |
||||
|
); |
||||
|
|
||||
|
#endif |
@ -0,0 +1,9 @@ |
|||||
|
#ifndef _ETHASH_CU_MINER_KERNEL_GLOBALS_H_ |
||||
|
#define _ETHASH_CU_MINER_KERNEL_GLOBALS_H_ |
||||
|
|
||||
|
//#include "cuda_helper.h"
|
||||
|
|
||||
|
__constant__ uint32_t d_dag_size; |
||||
|
__constant__ uint32_t d_max_outputs; |
||||
|
|
||||
|
#endif |
@ -0,0 +1,89 @@ |
|||||
|
#include "cuda_helper.h" |
||||
|
|
||||
|
__device__ __constant__ uint64_t const keccak_round_constants[24] = { |
||||
|
0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, |
||||
|
0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL, |
||||
|
0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL, |
||||
|
0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL, |
||||
|
0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, |
||||
|
0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, |
||||
|
0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL, |
||||
|
0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL |
||||
|
}; |
||||
|
|
||||
|
#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) |
||||
|
|
||||
|
__device__ __forceinline__ void keccak_f1600_block(uint2* s, uint32_t out_size) |
||||
|
{ |
||||
|
uint2 t[5], u, v; |
||||
|
|
||||
|
#pragma unroll 3 |
||||
|
for (int i = 0; i < 24; i++) |
||||
|
{ |
||||
|
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ |
||||
|
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; |
||||
|
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; |
||||
|
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; |
||||
|
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; |
||||
|
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; |
||||
|
|
||||
|
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ |
||||
|
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ |
||||
|
u = t[4] ^ ROL2(t[1], 1); |
||||
|
s[0] ^= u; s[5] ^= u; s[10] ^= u; s[15] ^= u; s[20] ^= u; |
||||
|
u = t[0] ^ ROL2(t[2], 1); |
||||
|
s[1] ^= u; s[6] ^= u; s[11] ^= u; s[16] ^= u; s[21] ^= u; |
||||
|
u = t[1] ^ ROL2(t[3], 1); |
||||
|
s[2] ^= u; s[7] ^= u; s[12] ^= u; s[17] ^= u; s[22] ^= u; |
||||
|
u = t[2] ^ ROL2(t[4], 1); |
||||
|
s[3] ^= u; s[8] ^= u; s[13] ^= u; s[18] ^= u; s[23] ^= u; |
||||
|
u = t[3] ^ ROL2(t[0], 1); |
||||
|
s[4] ^= u; s[9] ^= u; s[14] ^= u; s[19] ^= u; s[24] ^= u; |
||||
|
|
||||
|
/* rho pi: b[..] = rotl(a[..], ..) */ |
||||
|
u = s[1]; |
||||
|
|
||||
|
s[1] = ROL2(s[6], 44); |
||||
|
s[6] = ROL2(s[9], 20); |
||||
|
s[9] = ROL2(s[22], 61); |
||||
|
s[22] = ROL2(s[14], 39); |
||||
|
s[14] = ROL2(s[20], 18); |
||||
|
s[20] = ROL2(s[2], 62); |
||||
|
s[2] = ROL2(s[12], 43); |
||||
|
s[12] = ROL2(s[13], 25); |
||||
|
s[13] = ROL2(s[19], 8); |
||||
|
s[19] = ROL2(s[23], 56); |
||||
|
s[23] = ROL2(s[15], 41); |
||||
|
s[15] = ROL2(s[4], 27); |
||||
|
s[4] = ROL2(s[24], 14); |
||||
|
s[24] = ROL2(s[21], 2); |
||||
|
s[21] = ROL2(s[8], 55); |
||||
|
s[8] = ROL2(s[16], 45); |
||||
|
s[16] = ROL2(s[5], 36); |
||||
|
s[5] = ROL2(s[3], 28); |
||||
|
s[3] = ROL2(s[18], 21); |
||||
|
s[18] = ROL2(s[17], 15); |
||||
|
s[17] = ROL2(s[11], 10); |
||||
|
s[11] = ROL2(s[7], 6); |
||||
|
s[7] = ROL2(s[10], 3); |
||||
|
s[10] = ROL2(u, 1); |
||||
|
|
||||
|
// squeeze this in here |
||||
|
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ |
||||
|
u = s[0]; v = s[1]; s[0] ^= (~v) & s[2]; |
||||
|
|
||||
|
/* iota: a[0,0] ^= round constant */ |
||||
|
s[0] ^= vectorize(keccak_round_constants[i]); |
||||
|
if (i == 23 && out_size == 1) return; |
||||
|
|
||||
|
// continue chi |
||||
|
s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & u; s[4] ^= (~u) & v; |
||||
|
u = s[5]; v = s[6]; s[5] ^= (~v) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; |
||||
|
|
||||
|
if (i == 23) return; |
||||
|
s[8] ^= (~s[9]) & u; s[9] ^= (~u) & v; |
||||
|
u = s[10]; v = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ u, s[13], s[14]); s[14] = bitselect(s[14] ^ v, s[14], u); |
||||
|
u = s[15]; v = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ u, s[18], s[19]); s[19] = bitselect(s[19] ^ v, s[19], u); |
||||
|
u = s[20]; v = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ u, s[23], s[24]); s[24] = bitselect(s[24] ^ v, s[24], u); |
||||
|
} |
||||
|
} |
Loading…
Reference in new issue