Browse Source

end of day commit..nothing works yet

cl-refactor
Jan Willem Penterman 10 years ago
parent
commit
30bfe53bad
  1. 24
      CMakeLists.txt
  2. 6
      cmake/EthDependencies.cmake
  3. 43
      ethminer/MinerAux.h
  4. 29
      libethash-cu/CMakeLists.txt
  5. 1057
      libethash-cu/cuda_helper.h
  6. 22
      libethash-cu/dagger.cuh
  7. 136
      libethash-cu/dagger_shared.cuh
  8. 107
      libethash-cu/dagger_shuffled.cuh
  9. 281
      libethash-cu/ethash_cu_miner.cpp
  10. 56
      libethash-cu/ethash_cu_miner.h
  11. 72
      libethash-cu/ethash_cu_miner_kernel.cu
  12. 60
      libethash-cu/ethash_cu_miner_kernel.h
  13. 9
      libethash-cu/ethash_cu_miner_kernel_globals.h
  14. 89
      libethash-cu/keccak.cuh
  15. 3
      libethcore/CMakeLists.txt

24
CMakeLists.txt

@ -49,6 +49,7 @@ set(D_TOOLS ON)
set(D_TESTS ON)
set(D_FATDB ON)
set(D_ETHASHCL ON)
set(D_ETHASHCU OFF)
set(D_EVMJIT ON)
set(D_JSCONSOLE ON)
set(D_JSONRPC ON)
@ -126,6 +127,20 @@ elseif (BUNDLE STREQUAL "miner")
set(D_JSONRPC ON)
set(D_JSCONSOLE OFF)
set(D_EVMJIT OFF)
elseif (BUNDLE STREQUAL "cudaminer")
set(D_SERPENT OFF)
set(D_USENPM OFF)
set(D_GUI OFF)
set(D_TOOLS OFF)
set(D_TESTS OFF)
set(D_ETHKEY OFF)
set(D_MINER ON)
set(D_ETHASHCL ON)
set(D_ETHASHCU ON)
set(D_FATDB OFF)
set(D_JSONRPC ON)
set(D_JSCONSOLE OFF)
set(D_EVMJIT OFF)
elseif (BUNDLE STREQUAL "release") # release builds
set(D_SERPENT ${DECENT_PLATFORM})
set(D_USENPM OFF)
@ -158,6 +173,10 @@ function(configureProject)
add_definitions(-DETH_ETHASHCL)
endif()
if (ETHASHCU)
add_definitions(-DETH_ETHASHCU)
endif()
if (EVMJIT)
add_definitions(-DETH_EVMJIT)
endif()
@ -291,6 +310,7 @@ eth_format_option(ROCKSDB)
eth_format_option(TOOLS)
eth_format_option(ETHKEY)
eth_format_option(ETHASHCL)
eth_format_option(ETHASHCU)
eth_format_option(JSCONSOLE)
eth_format_option(OLYMPIC)
eth_format_option(SERPENT)
@ -341,6 +361,7 @@ message("-- SERPENT Build Serpent language components ${SERPENT}
message("-- GUI Build GUI components ${GUI}")
message("-- TESTS Build tests ${TESTS}")
message("-- ETHASHCL Build OpenCL components ${ETHASHCL}")
message("-- ETHASHCU Build CUDA components ${ETHASHCU}")
message("-- JSCONSOLE Build with javascript console ${JSCONSOLE}")
message("-- EVMJIT Build LLVM-based JIT EVM ${EVMJIT}")
message("------------------------------------------------------------------------")
@ -429,6 +450,9 @@ if (GENERAL OR MINER)
if (ETHASHCL)
add_subdirectory(libethash-cl)
endif ()
if (ETHASHCU)
add_subdirectory(libethash-cu)
endif ()
endif ()
add_subdirectory(libethcore)

6
cmake/EthDependencies.cmake

@ -128,6 +128,12 @@ if (OpenCL_FOUND)
message(" - opencl lib : ${OpenCL_LIBRARIES}")
endif()
find_package (CUDA)
if (CUDA_FOUND)
message(" - CUDA header: ${CUDA_INCLUDE_DIRS}")
message(" - CUDA lib : ${CUDA_LIBRARIES}")
endif()
# find location of jsonrpcstub
find_program(ETH_JSON_RPC_STUB jsonrpcstub)
message(" - jsonrpcstub location : ${ETH_JSON_RPC_STUB}")

43
ethminer/MinerAux.h

@ -39,12 +39,16 @@
#include <libdevcore/SHA3.h>
#include <libdevcore/CommonJS.h>
#include <libethcore/EthashAux.h>
#include <libethcore/EthashCUDAMiner.h>
#include <libethcore/EthashGPUMiner.h>
#include <libethcore/EthashCPUMiner.h>
#include <libethcore/Farm.h>
#if ETH_ETHASHCL || !ETH_TRUE
#include <libethash-cl/ethash_cl_miner.h>
#endif
#if ETH_ETHASHCU || !ETH_TRUE
#include <libethash-cu/ethash_cu_miner.h>
#endif
#if ETH_JSONRPC || !ETH_TRUE
#include <libweb3jsonrpc/WebThreeStubServer.h>
#include <jsonrpccpp/server/connectors/httpserver.h>
@ -140,8 +144,8 @@ public:
cerr << "Bad " << arg << " option: " << argv[i] << endl;
BOOST_THROW_EXCEPTION(BadArgument());
}
#if ETH_ETHASHCL || !ETH_TRUE
else if (arg == "--cl-global-work" && i + 1 < argc)
#if ETH_ETHASHCL || ETH_ETHASHCU || !ETH_TRUE
else if (arg == "--gpu-global-work" && i + 1 < argc)
try {
m_globalWorkSizeMultiplier = stol(argv[++i]);
}
@ -150,7 +154,7 @@ public:
cerr << "Bad " << arg << " option: " << argv[i] << endl;
BOOST_THROW_EXCEPTION(BadArgument());
}
else if (arg == "--cl-local-work" && i + 1 < argc)
else if (arg == "--gpu-local-work" && i + 1 < argc)
try {
m_localWorkSize = stol(argv[++i]);
}
@ -219,6 +223,8 @@ public:
m_minerType = MinerType::CPU;
else if (arg == "-G" || arg == "--opencl")
m_minerType = MinerType::GPU;
else if (arg == "-U" || arg == "--cuda")
m_minerType = MinerType::CUDA;
else if (arg == "--current-block" && i + 1 < argc)
m_currentBlock = stol(argv[++i]);
else if (arg == "--no-precompute")
@ -289,6 +295,21 @@ public:
BOOST_THROW_EXCEPTION(BadArgument());
}
}
else if (arg == "--cuda-devices") {
while (m_cudaDeviceCount < 16 && i + 1 < argc)
{
try {
m_cudaDevices[m_cudaDeviceCount++] = stol(argv[++i]);
}
catch (...)
{
break;
}
}
}
else if (arg == "--cuda-high-cpu") {
m_cudaHighCPULoad = true;
}
else
return false;
return true;
@ -377,7 +398,8 @@ public:
enum class MinerType
{
CPU,
GPU
GPU,
CUDA
};
MinerType minerType() const { return m_minerType; }
@ -476,6 +498,9 @@ private:
sealers["cpu"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashCPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCPUMiner(ci); }};
#if ETH_ETHASHCL
sealers["opencl"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashGPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashGPUMiner(ci); }};
#endif
#if ETH_ETHASHCU
sealers["cuda"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{ &EthashCUDAMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCUDAMiner(ci); } };
#endif
(void)_m;
(void)_remote;
@ -491,7 +516,8 @@ private:
f.start("cpu");
else if (_m == MinerType::GPU)
f.start("opencl");
else if (_m == MinerType::CUDA)
f.start("cuda");
EthashProofOfWork::WorkPackage current;
EthashAux::FullType dag;
while (true)
@ -589,6 +615,13 @@ private:
unsigned m_globalWorkSizeMultiplier = ethash_cl_miner::c_defaultGlobalWorkSizeMultiplier;
unsigned m_localWorkSize = ethash_cl_miner::c_defaultLocalWorkSize;
unsigned m_msPerBatch = ethash_cl_miner::c_defaultMSPerBatch;
#endif
#if ETH_ETHASHCU || !ETH_TRUE
unsigned m_globalWorkSizeMultiplier = ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier;
unsigned m_localWorkSize = ethash_cu_miner::c_defaultLocalWorkSize;
unsigned m_cudaDeviceCount = 0;
unsigned m_cudaDevices[16];
bool m_cudaHighCPULoad = false;
#endif
uint64_t m_currentBlock = 0;
// default value is 350MB of GPU memory for other stuff (windows system rendering, e.t.c.)

29
libethash-cu/CMakeLists.txt

@ -0,0 +1,29 @@
set(EXECUTABLE ethash-cu)
FIND_PACKAGE(CUDA REQUIRED)
file(GLOB SRC_LIST "*.cpp" "*.cu")
file(GLOB HEADERS "*.h" "*.cuh")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
LIST(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
if(COMPUTE)
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
else(COMPUTE)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_32,code=sm_32;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52)
endif(COMPUTE)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(..)
CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} ethash)
install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )

1057
libethash-cu/cuda_helper.h

File diff suppressed because it is too large

22
libethash-cu/dagger.cuh

@ -0,0 +1,22 @@
#define copy(dst, src, count) for (uint32_t i = 0; i < count; i++) { (dst)[i] = (src)[i]; }
#define ACCESSES 64
#define THREADS_PER_HASH (128 / 16)
#define FNV_PRIME 0x01000193
#define fnv(x,y) ((x) * FNV_PRIME ^(y))
__device__ uint4 fnv4(uint4 a, uint4 b)
{
uint4 c;
c.x = a.x * FNV_PRIME ^ b.x;
c.y = a.y * FNV_PRIME ^ b.y;
c.z = a.z * FNV_PRIME ^ b.z;
c.w = a.w * FNV_PRIME ^ b.w;
return c;
}
__device__ uint32_t fnv_reduce(uint4 v)
{
return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
}

136
libethash-cu/dagger_shared.cuh

@ -0,0 +1,136 @@
#include "ethash_cu_miner_kernel_globals.h"
#include "ethash_cu_miner_kernel.h"
#include "keccak.cuh"
#include "dagger.cuh"
typedef union
{
hash64_t init;
hash32_t mix;
} compute_hash_share;
__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
{
hash64_t init;
// sha3_512(header .. nonce)
uint64_t state[25];
copy(state, header->uint64s, 4);
state[4] = nonce;
state[5] = 0x0000000000000001;
state[6] = 0;
state[7] = 0;
state[8] = 0x8000000000000000;
for (uint32_t i = 9; i < 25; i++)
{
state[i] = 0;
}
keccak_f1600_block((uint2 *)state, 8);
copy(init.uint64s, state, 8);
return init;
}
__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
{
// share init0
if (thread_id == 0)
*share = mix.x;
uint32_t init0 = *share;
uint32_t a = 0;
do
{
bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));
//#pragma unroll 4
for (uint32_t i = 0; i < 4; i++)
{
if (update_share)
{
uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w };
*share = fnv(init0 ^ (a + i), m[i]) % d_dag_size;
}
__threadfence_block();
#if __CUDA_ARCH__ >= 350
mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id]));
#else
mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
#endif
}
} while ((a += 4) != ACCESSES);
return fnv_reduce(mix);
}
__device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
{
uint64_t state[25];
hash32_t hash;
// keccak_256(keccak_512(header..nonce) .. mix);
copy(state, init->uint64s, 8);
copy(state + 8, mix->uint64s, 4);
state[12] = 0x0000000000000001;
for (uint32_t i = 13; i < 16; i++)
{
state[i] = 0;
}
state[16] = 0x8000000000000000;
for (uint32_t i = 17; i < 25; i++)
{
state[i] = 0;
}
keccak_f1600_block((uint2 *)state, 1);
// copy out
copy(hash.uint64s, state, 4);
return hash;
}
__device__ hash32_t compute_hash(
hash32_t const* g_header,
hash128_t const* g_dag,
uint64_t nonce
)
{
extern __shared__ compute_hash_share share[];
// Compute one init hash per work item.
hash64_t init = init_hash(g_header, nonce);
// Threads work together in this phase in groups of 8.
uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
uint32_t const hash_id = threadIdx.x >> 3;
hash32_t mix;
for (int i = 0; i < THREADS_PER_HASH; i++)
{
// share init with other threads
if (i == thread_id)
share[hash_id].init = init;
uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3];
uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag);
share[hash_id].mix.uint32s[thread_id] = thread_mix;
if (i == thread_id)
mix = share[hash_id].mix;
}
return final_hash(&init, &mix);
}

107
libethash-cu/dagger_shuffled.cuh

@ -0,0 +1,107 @@
#include "ethash_cu_miner_kernel_globals.h"
#include "ethash_cu_miner_kernel.h"
#include "keccak.cuh"
#include "dagger.cuh"
__device__ uint64_t compute_hash_shuffle(
uint2 const* g_header,
hash128_t const* g_dag,
uint64_t nonce
)
{
// sha3_512(header .. nonce)
uint2 state[25];
state[0] = g_header[0];
state[1] = g_header[1];
state[2] = g_header[2];
state[3] = g_header[3];
state[4] = vectorize(nonce);
state[5] = vectorize(0x0000000000000001ULL);
for (uint32_t i = 6; i < 25; i++)
{
state[i] = make_uint2(0, 0);
}
state[8] = vectorize(0x8000000000000000ULL);
keccak_f1600_block(state,8);
// Threads work together in this phase in groups of 8.
const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
const int mix_idx = thread_id & 3;
uint4 mix;
uint2 shuffle[8];
for (int i = 0; i < THREADS_PER_HASH; i++)
{
// share init among threads
for (int j = 0; j < 8; j++) {
shuffle[j].x = __shfl(state[j].x, start_lane + i);
shuffle[j].y = __shfl(state[j].y, start_lane + i);
}
// ugly but avoids local reads/writes
if (mix_idx < 2) {
if (mix_idx == 0)
mix = vectorize2(shuffle[0], shuffle[1]);
else
mix = vectorize2(shuffle[2], shuffle[3]);
}
else {
if (mix_idx == 2)
mix = vectorize2(shuffle[4], shuffle[5]);
else
mix = vectorize2(shuffle[6], shuffle[7]);
}
uint32_t init0 = __shfl(shuffle[0].x, start_lane);
for (uint32_t a = 0; a < ACCESSES; a += 4)
{
int t = ((a >> 2) & (THREADS_PER_HASH - 1));
for (uint32_t b = 0; b < 4; b++)
{
if (thread_id == t)
{
shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
}
shuffle[0].x = __shfl(shuffle[0].x, start_lane + t);
mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]);
}
}
uint32_t thread_mix = fnv_reduce(mix);
// update mix accross threads
shuffle[0].x = __shfl(thread_mix, start_lane + 0);
shuffle[0].y = __shfl(thread_mix, start_lane + 1);
shuffle[1].x = __shfl(thread_mix, start_lane + 2);
shuffle[1].y = __shfl(thread_mix, start_lane + 3);
shuffle[2].x = __shfl(thread_mix, start_lane + 4);
shuffle[2].y = __shfl(thread_mix, start_lane + 5);
shuffle[3].x = __shfl(thread_mix, start_lane + 6);
shuffle[3].y = __shfl(thread_mix, start_lane + 7);
if (i == thread_id) {
//move mix into state:
state[8] = shuffle[0];
state[9] = shuffle[1];
state[10] = shuffle[2];
state[11] = shuffle[3];
}
}
// keccak_256(keccak_512(header..nonce) .. mix);
state[12] = vectorize(0x0000000000000001ULL);
for (uint32_t i = 13; i < 25; i++)
{
state[i] = vectorize(0ULL);
}
state[16] = vectorize(0x8000000000000000);
keccak_f1600_block(state, 1);
return devectorize(state[0]);
}

281
libethash-cu/ethash_cu_miner.cpp

@ -0,0 +1,281 @@
/*
This file is part of c-ethash.
c-ethash is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
c-ethash is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with cpp-ethereum. If not, see <http://www.gnu.org/licenses/>.
*/
/** @file ethash_cu_miner.cpp
* @author Tim Hughes <tim@twistedfury.com>
* @date 2015
*/
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <assert.h>
#include <queue>
#include <random>
#include <vector>
#include <chrono>
#include <thread>
#include <libethash/util.h>
#include <libethash/ethash.h>
#include "ethash_cu_miner.h"
#include "ethash_cu_miner_kernel_globals.h"
#define ETHASH_BYTES 32
// workaround lame platforms
#if !CL_VERSION_1_2
#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE
#define CL_MEM_HOST_READ_ONLY 0
#endif
#undef min
#undef max
using namespace std;
unsigned const ethash_cu_miner::c_defaultLocalWorkSize = 128;
unsigned const ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier = 2048; // * CL_DEFAULT_LOCAL_WORK_SIZE
ethash_cu_miner::search_hook::~search_hook() {}
ethash_cu_miner::ethash_cu_miner()
{
}
std::string ethash_cu_miner::platform_info(unsigned _deviceId)
{
int runtime_version;
int device_count;
device_count = get_num_devices();
if (device_count == 0)
return std::string();
if (cudaRuntimeGetVersion(&runtime_version) == cudaErrorInvalidValue)
{
cout << cudaGetErrorString(cudaErrorInvalidValue) << endl;
return std::string();
}
// use selected default device
int device_num = std::min<int>((int)_deviceId, device_count - 1);
cudaDeviceProp device_props;
if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
{
cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
return std::string();
}
char platform[5];
int version_major = runtime_version / 1000;
int version_minor = (runtime_version - (version_major * 1000)) / 10;
sprintf(platform, "%d.%d", version_major, version_minor);
char compute[5];
sprintf(compute, "%d.%d", device_props.major, device_props.minor);
return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + device_props.name + "\", \"version\": \"Compute " + std::string(compute) + "\" }";
}
int ethash_cu_miner::get_num_devices()
{
int device_count;
if (cudaGetDeviceCount(&device_count) == cudaErrorNoDevice)
{
cout << cudaGetErrorString(cudaErrorNoDevice) << endl;
return 0;
}
return device_count;
}
void ethash_cu_miner::finish()
{
for (unsigned i = 0; i != m_num_buffers; i++) {
cudaStreamDestroy(m_streams[i]);
m_streams[i] = 0;
}
cudaDeviceReset();
}
bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers, unsigned search_batch_size, unsigned workgroup_size, unsigned _deviceId, bool highcpu)
{
int device_count = get_num_devices();
if (device_count == 0)
return false;
// use selected device
int device_num = std::min<int>((int)_deviceId, device_count - 1);
cudaDeviceProp device_props;
if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
{
cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
return false;
}
cout << "Using device: " << device_props.name << "(" << device_props.major << "." << device_props.minor << ")" << endl;
cudaError_t r = cudaSetDevice(device_num);
if (r != cudaSuccess)
{
cout << cudaGetErrorString(r) << endl;
return false;
}
cudaDeviceReset();
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
m_num_buffers = num_buffers;
m_search_batch_size = search_batch_size;
m_hash_buf = new void *[m_num_buffers];
m_search_buf = new uint32_t *[m_num_buffers];
m_streams = new cudaStream_t[m_num_buffers];
// use requested workgroup size, but we require multiple of 8
m_workgroup_size = ((workgroup_size + 7) / 8) * 8;
m_highcpu = highcpu;
// patch source code
cudaError result;
uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
unsigned max_outputs = c_max_search_results;
result = set_constants(&dagSize128, &max_outputs);
// create buffer for dag
result = cudaMalloc(&m_dag_ptr, _dagSize);
// create buffer for header256
result = cudaMalloc(&m_header, 32);
// copy dag to CPU.
result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice);
// create mining buffers
for (unsigned i = 0; i != m_num_buffers; ++i)
{
result = cudaMallocHost(&m_hash_buf[i], 32 * c_hash_batch_size);
result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t));
result = cudaStreamCreate(&m_streams[i]);
}
if (result != cudaSuccess)
{
cout << cudaGetErrorString(result) << endl;
return false;
}
return true;
}
/**
* Prevent High CPU usage while waiting for an async task
*/
static unsigned waitStream(cudaStream_t stream)
{
unsigned wait_ms = 0;
while (cudaStreamQuery(stream) == cudaErrorNotReady) {
this_thread::sleep_for(chrono::milliseconds(10));
wait_ms += 10;
}
return wait_ms;
}
void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
{
struct pending_batch
{
uint64_t start_nonce;
unsigned buf;
};
std::queue<pending_batch> pending;
static uint32_t const c_zero = 0;
// update header constant buffer
cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
for (unsigned i = 0; i != m_num_buffers; ++i)
{
cudaMemcpy(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice);
}
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
throw std::runtime_error(cudaGetErrorString(err));
}
unsigned buf = 0;
std::random_device engine;
uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
for (;; start_nonce += m_search_batch_size)
{
run_ethash_search(m_search_batch_size / m_workgroup_size, m_workgroup_size, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);
pending.push({ start_nonce, buf });
buf = (buf + 1) % m_num_buffers;
// read results
if (pending.size() == m_num_buffers)
{
pending_batch const& batch = pending.front();
uint32_t results[1 + c_max_search_results];
if (!m_highcpu)
waitStream(m_streams[buf]); // 28ms
cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyHostToHost, m_streams[batch.buf]);
unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
uint64_t nonces[c_max_search_results];
for (unsigned i = 0; i != num_found; ++i)
{
nonces[i] = batch.start_nonce + results[i + 1];
//cout << results[i + 1] << ", ";
}
//if (num_found > 0)
// cout << endl;
bool exit = num_found && hook.found(nonces, num_found);
exit |= hook.searched(batch.start_nonce, m_search_batch_size); // always report searched before exit
if (exit)
break;
start_nonce += m_search_batch_size;
// reset search buffer if we're still going
if (num_found)
cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]);
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
throw std::runtime_error(cudaGetErrorString(err));
}
pending.pop();
}
}
}

56
libethash-cu/ethash_cu_miner.h

@ -0,0 +1,56 @@
#pragma once
#include <cuda_runtime.h>
#include <time.h>
#include <functional>
#include <libethash/ethash.h>
#include "ethash_cu_miner_kernel.h"
class ethash_cu_miner
{
public:
struct search_hook
{
virtual ~search_hook(); // always a virtual destructor for a class with virtuals.
// reports progress, return true to abort
virtual bool found(uint64_t const* nonces, uint32_t count) = 0;
virtual bool searched(uint64_t start_nonce, uint32_t count) = 0;
};
public:
ethash_cu_miner();
bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers = 2, unsigned search_batch_size = 262144, unsigned workgroup_size = 64, unsigned _deviceId = 0, bool highcpu = false);
static std::string platform_info(unsigned _deviceId = 0);
static int get_num_devices();
void finish();
void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
void search(uint8_t const* header, uint64_t target, search_hook& hook);
/* -- default values -- */
/// Default value of the local work size. Also known as workgroup size.
static unsigned const c_defaultLocalWorkSize;
/// Default value of the global work size as a multiplier of the local work size
static unsigned const c_defaultGlobalWorkSizeMultiplier;
private:
enum { c_max_search_results = 63, c_hash_batch_size = 1024 };
bool m_highcpu;
unsigned m_num_buffers;
unsigned m_search_batch_size;
unsigned m_workgroup_size;
hash128_t * m_dag_ptr;
hash32_t * m_header;
void ** m_hash_buf;
uint32_t ** m_search_buf;
cudaStream_t * m_streams;
};

72
libethash-cu/ethash_cu_miner_kernel.cu

@ -0,0 +1,72 @@
/*
* Genoil's CUDA mining kernel for Ethereum
* based on Tim Hughes' opencl kernel.
* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer.
*/
#include "ethash_cu_miner_kernel.h"
#include "ethash_cu_miner_kernel_globals.h"
#include "cuda_helper.h"
#define SHUFFLE_MIN_VER 350
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
#include "dagger_shuffled.cuh"
#else
#include "dagger_shared.cuh"
#endif
__global__ void
__launch_bounds__(128, 7)
ethash_search(
uint32_t* g_output,
hash32_t const* g_header,
hash128_t const* g_dag,
uint64_t start_nonce,
uint64_t target
)
{
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
if (cuda_swab64(hash) < target)
#else
hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid);
if (cuda_swab64(hash.uint64s[0]) < target)
#endif
{
atomicInc(g_output, d_max_outputs);
g_output[g_output[0]] = gid;
}
}
void run_ethash_search(
uint32_t blocks,
uint32_t threads,
cudaStream_t stream,
uint32_t* g_output,
hash32_t const* g_header,
hash128_t const* g_dag,
uint64_t start_nonce,
uint64_t target
)
{
#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
ethash_search <<<blocks, threads, 0, stream >>>(g_output, g_header, g_dag, start_nonce, target);
#else
ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target);
#endif
}
cudaError set_constants(
uint32_t * dag_size,
uint32_t * max_outputs
)
{
cudaError result;
result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t));
result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t));
return result;
}

60
libethash-cu/ethash_cu_miner_kernel.h

@ -0,0 +1,60 @@
#ifndef _ETHASH_CU_MINER_KERNEL_H_
#define _ETHASH_CU_MINER_KERNEL_H_
#include <stdint.h>
typedef union
{
uint64_t uint64s[16 / sizeof(uint64_t)];
uint32_t uint32s[16 / sizeof(uint32_t)];
} hash16_t;
typedef union
{
uint32_t uint32s[32 / sizeof(uint32_t)];
uint64_t uint64s[32 / sizeof(uint64_t)];
uint2 uint2s[32 / sizeof(uint2)];
} hash32_t;
typedef union
{
uint32_t uint32s[64 / sizeof(uint32_t)];
uint64_t uint64s[64 / sizeof(uint64_t)];
uint4 uint4s[64 / sizeof(uint4)];
} hash64_t;
typedef union
{
uint32_t uint32s[128 / sizeof(uint32_t)];
uint4 uint4s[128 / sizeof(uint4)];
} hash128_t;
//typedef uint32_t hash128_t;
cudaError set_constants(
uint32_t * dag_size,
uint32_t * max_outputs
);
void run_ethash_hash(
hash32_t* g_hashes,
hash32_t const* g_header,
hash128_t const* g_dag,
uint64_t start_nonce
);
void run_ethash_search(
uint32_t search_batch_size,
uint32_t workgroup_size,
cudaStream_t stream,
uint32_t* g_output,
hash32_t const* g_header,
hash128_t const* g_dag,
uint64_t start_nonce,
uint64_t target
);
#endif

9
libethash-cu/ethash_cu_miner_kernel_globals.h

@ -0,0 +1,9 @@
#ifndef _ETHASH_CU_MINER_KERNEL_GLOBALS_H_
#define _ETHASH_CU_MINER_KERNEL_GLOBALS_H_
//#include "cuda_helper.h"
__constant__ uint32_t d_dag_size;
__constant__ uint32_t d_max_outputs;
#endif

89
libethash-cu/keccak.cuh

@ -0,0 +1,89 @@
#include "cuda_helper.h"
__device__ __constant__ uint64_t const keccak_round_constants[24] = {
0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};
#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
__device__ __forceinline__ void keccak_f1600_block(uint2* s, uint32_t out_size)
{
uint2 t[5], u, v;
#pragma unroll 3
for (int i = 0; i < 24; i++)
{
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
u = t[4] ^ ROL2(t[1], 1);
s[0] ^= u; s[5] ^= u; s[10] ^= u; s[15] ^= u; s[20] ^= u;
u = t[0] ^ ROL2(t[2], 1);
s[1] ^= u; s[6] ^= u; s[11] ^= u; s[16] ^= u; s[21] ^= u;
u = t[1] ^ ROL2(t[3], 1);
s[2] ^= u; s[7] ^= u; s[12] ^= u; s[17] ^= u; s[22] ^= u;
u = t[2] ^ ROL2(t[4], 1);
s[3] ^= u; s[8] ^= u; s[13] ^= u; s[18] ^= u; s[23] ^= u;
u = t[3] ^ ROL2(t[0], 1);
s[4] ^= u; s[9] ^= u; s[14] ^= u; s[19] ^= u; s[24] ^= u;
/* rho pi: b[..] = rotl(a[..], ..) */
u = s[1];
s[1] = ROL2(s[6], 44);
s[6] = ROL2(s[9], 20);
s[9] = ROL2(s[22], 61);
s[22] = ROL2(s[14], 39);
s[14] = ROL2(s[20], 18);
s[20] = ROL2(s[2], 62);
s[2] = ROL2(s[12], 43);
s[12] = ROL2(s[13], 25);
s[13] = ROL2(s[19], 8);
s[19] = ROL2(s[23], 56);
s[23] = ROL2(s[15], 41);
s[15] = ROL2(s[4], 27);
s[4] = ROL2(s[24], 14);
s[24] = ROL2(s[21], 2);
s[21] = ROL2(s[8], 55);
s[8] = ROL2(s[16], 45);
s[16] = ROL2(s[5], 36);
s[5] = ROL2(s[3], 28);
s[3] = ROL2(s[18], 21);
s[18] = ROL2(s[17], 15);
s[17] = ROL2(s[11], 10);
s[11] = ROL2(s[7], 6);
s[7] = ROL2(s[10], 3);
s[10] = ROL2(u, 1);
// squeeze this in here
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
u = s[0]; v = s[1]; s[0] ^= (~v) & s[2];
/* iota: a[0,0] ^= round constant */
s[0] ^= vectorize(keccak_round_constants[i]);
if (i == 23 && out_size == 1) return;
// continue chi
s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & u; s[4] ^= (~u) & v;
u = s[5]; v = s[6]; s[5] ^= (~v) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9];
if (i == 23) return;
s[8] ^= (~s[9]) & u; s[9] ^= (~u) & v;
u = s[10]; v = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ u, s[13], s[14]); s[14] = bitselect(s[14] ^ v, s[14], u);
u = s[15]; v = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ u, s[18], s[19]); s[19] = bitselect(s[19] ^ v, s[19], u);
u = s[20]; v = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ u, s[23], s[24]); s[24] = bitselect(s[24] ^ v, s[24], u);
}
}

3
libethcore/CMakeLists.txt

@ -21,6 +21,9 @@ target_link_libraries(${EXECUTABLE} evmcore)
if (ETHASHCL)
target_link_libraries(${EXECUTABLE} ethash-cl)
endif ()
if (ETHASHCU)
target_link_libraries(${EXECUTABLE} ethash-cu)
endif ()
target_link_libraries(${EXECUTABLE} devcrypto)
if (CPUID_FOUND)
target_link_libraries(${EXECUTABLE} ${CPUID_LIBRARIES})

Loading…
Cancel
Save