From 30bfe53bad12e0e7ed8d28fb00b738f5783d5e0b Mon Sep 17 00:00:00 2001 From: Jan Willem Penterman Date: Fri, 21 Aug 2015 17:03:18 +0200 Subject: [PATCH] end of day commit..nothing works yet --- CMakeLists.txt | 24 + cmake/EthDependencies.cmake | 6 + ethminer/MinerAux.h | 43 +- libethash-cu/CMakeLists.txt | 29 + libethash-cu/cuda_helper.h | 1057 +++++++++++++++++ libethash-cu/dagger.cuh | 22 + libethash-cu/dagger_shared.cuh | 136 +++ libethash-cu/dagger_shuffled.cuh | 107 ++ libethash-cu/ethash_cu_miner.cpp | 281 +++++ libethash-cu/ethash_cu_miner.h | 56 + libethash-cu/ethash_cu_miner_kernel.cu | 72 ++ libethash-cu/ethash_cu_miner_kernel.h | 60 + libethash-cu/ethash_cu_miner_kernel_globals.h | 9 + libethash-cu/keccak.cuh | 89 ++ libethcore/CMakeLists.txt | 3 + 15 files changed, 1989 insertions(+), 5 deletions(-) create mode 100644 libethash-cu/CMakeLists.txt create mode 100644 libethash-cu/cuda_helper.h create mode 100644 libethash-cu/dagger.cuh create mode 100644 libethash-cu/dagger_shared.cuh create mode 100644 libethash-cu/dagger_shuffled.cuh create mode 100644 libethash-cu/ethash_cu_miner.cpp create mode 100644 libethash-cu/ethash_cu_miner.h create mode 100644 libethash-cu/ethash_cu_miner_kernel.cu create mode 100644 libethash-cu/ethash_cu_miner_kernel.h create mode 100644 libethash-cu/ethash_cu_miner_kernel_globals.h create mode 100644 libethash-cu/keccak.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index 29e954031..eeec392c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,7 @@ set(D_TOOLS ON) set(D_TESTS ON) set(D_FATDB ON) set(D_ETHASHCL ON) +set(D_ETHASHCU OFF) set(D_EVMJIT ON) set(D_JSCONSOLE ON) set(D_JSONRPC ON) @@ -126,6 +127,20 @@ elseif (BUNDLE STREQUAL "miner") set(D_JSONRPC ON) set(D_JSCONSOLE OFF) set(D_EVMJIT OFF) +elseif (BUNDLE STREQUAL "cudaminer") + set(D_SERPENT OFF) + set(D_USENPM OFF) + set(D_GUI OFF) + set(D_TOOLS OFF) + set(D_TESTS OFF) + set(D_ETHKEY OFF) + set(D_MINER ON) + set(D_ETHASHCL ON) + set(D_ETHASHCU ON) + set(D_FATDB OFF) + set(D_JSONRPC ON) + set(D_JSCONSOLE OFF) + set(D_EVMJIT OFF) elseif (BUNDLE STREQUAL "release") # release builds set(D_SERPENT ${DECENT_PLATFORM}) set(D_USENPM OFF) @@ -158,6 +173,10 @@ function(configureProject) add_definitions(-DETH_ETHASHCL) endif() + if (ETHASHCU) + add_definitions(-DETH_ETHASHCU) + endif() + if (EVMJIT) add_definitions(-DETH_EVMJIT) endif() @@ -291,6 +310,7 @@ eth_format_option(ROCKSDB) eth_format_option(TOOLS) eth_format_option(ETHKEY) eth_format_option(ETHASHCL) +eth_format_option(ETHASHCU) eth_format_option(JSCONSOLE) eth_format_option(OLYMPIC) eth_format_option(SERPENT) @@ -341,6 +361,7 @@ message("-- SERPENT Build Serpent language components ${SERPENT} message("-- GUI Build GUI components ${GUI}") message("-- TESTS Build tests ${TESTS}") message("-- ETHASHCL Build OpenCL components ${ETHASHCL}") +message("-- ETHASHCU Build CUDA components ${ETHASHCU}") message("-- JSCONSOLE Build with javascript console ${JSCONSOLE}") message("-- EVMJIT Build LLVM-based JIT EVM ${EVMJIT}") message("------------------------------------------------------------------------") @@ -429,6 +450,9 @@ if (GENERAL OR MINER) if (ETHASHCL) add_subdirectory(libethash-cl) endif () + if (ETHASHCU) + add_subdirectory(libethash-cu) + endif () endif () add_subdirectory(libethcore) diff --git a/cmake/EthDependencies.cmake b/cmake/EthDependencies.cmake index 389664c99..f5fdae74f 100644 --- a/cmake/EthDependencies.cmake +++ b/cmake/EthDependencies.cmake @@ -128,6 +128,12 @@ if (OpenCL_FOUND) message(" - opencl lib : ${OpenCL_LIBRARIES}") endif() +find_package (CUDA) +if (CUDA_FOUND) + message(" - CUDA header: ${CUDA_INCLUDE_DIRS}") + message(" - CUDA lib : ${CUDA_LIBRARIES}") +endif() + # find location of jsonrpcstub find_program(ETH_JSON_RPC_STUB jsonrpcstub) message(" - jsonrpcstub location : ${ETH_JSON_RPC_STUB}") diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h index 84cd69c2c..7486d64f8 100644 --- a/ethminer/MinerAux.h +++ b/ethminer/MinerAux.h @@ -39,12 +39,16 @@ #include #include #include +#include #include #include #include #if ETH_ETHASHCL || !ETH_TRUE #include #endif +#if ETH_ETHASHCU || !ETH_TRUE +#include +#endif #if ETH_JSONRPC || !ETH_TRUE #include #include @@ -140,8 +144,8 @@ public: cerr << "Bad " << arg << " option: " << argv[i] << endl; BOOST_THROW_EXCEPTION(BadArgument()); } -#if ETH_ETHASHCL || !ETH_TRUE - else if (arg == "--cl-global-work" && i + 1 < argc) +#if ETH_ETHASHCL || ETH_ETHASHCU || !ETH_TRUE + else if (arg == "--gpu-global-work" && i + 1 < argc) try { m_globalWorkSizeMultiplier = stol(argv[++i]); } @@ -150,7 +154,7 @@ public: cerr << "Bad " << arg << " option: " << argv[i] << endl; BOOST_THROW_EXCEPTION(BadArgument()); } - else if (arg == "--cl-local-work" && i + 1 < argc) + else if (arg == "--gpu-local-work" && i + 1 < argc) try { m_localWorkSize = stol(argv[++i]); } @@ -219,6 +223,8 @@ public: m_minerType = MinerType::CPU; else if (arg == "-G" || arg == "--opencl") m_minerType = MinerType::GPU; + else if (arg == "-U" || arg == "--cuda") + m_minerType = MinerType::CUDA; else if (arg == "--current-block" && i + 1 < argc) m_currentBlock = stol(argv[++i]); else if (arg == "--no-precompute") @@ -289,6 +295,21 @@ public: BOOST_THROW_EXCEPTION(BadArgument()); } } + else if (arg == "--cuda-devices") { + while (m_cudaDeviceCount < 16 && i + 1 < argc) + { + try { + m_cudaDevices[m_cudaDeviceCount++] = stol(argv[++i]); + } + catch (...) + { + break; + } + } + } + else if (arg == "--cuda-high-cpu") { + m_cudaHighCPULoad = true; + } else return false; return true; @@ -377,7 +398,8 @@ public: enum class MinerType { CPU, - GPU + GPU, + CUDA }; MinerType minerType() const { return m_minerType; } @@ -476,6 +498,9 @@ private: sealers["cpu"] = GenericFarm::SealerDescriptor{&EthashCPUMiner::instances, [](GenericMiner::ConstructionInfo ci){ return new EthashCPUMiner(ci); }}; #if ETH_ETHASHCL sealers["opencl"] = GenericFarm::SealerDescriptor{&EthashGPUMiner::instances, [](GenericMiner::ConstructionInfo ci){ return new EthashGPUMiner(ci); }}; +#endif +#if ETH_ETHASHCU + sealers["cuda"] = GenericFarm::SealerDescriptor{ &EthashCUDAMiner::instances, [](GenericMiner::ConstructionInfo ci){ return new EthashCUDAMiner(ci); } }; #endif (void)_m; (void)_remote; @@ -491,7 +516,8 @@ private: f.start("cpu"); else if (_m == MinerType::GPU) f.start("opencl"); - + else if (_m == MinerType::CUDA) + f.start("cuda"); EthashProofOfWork::WorkPackage current; EthashAux::FullType dag; while (true) @@ -589,6 +615,13 @@ private: unsigned m_globalWorkSizeMultiplier = ethash_cl_miner::c_defaultGlobalWorkSizeMultiplier; unsigned m_localWorkSize = ethash_cl_miner::c_defaultLocalWorkSize; unsigned m_msPerBatch = ethash_cl_miner::c_defaultMSPerBatch; +#endif +#if ETH_ETHASHCU || !ETH_TRUE + unsigned m_globalWorkSizeMultiplier = ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier; + unsigned m_localWorkSize = ethash_cu_miner::c_defaultLocalWorkSize; + unsigned m_cudaDeviceCount = 0; + unsigned m_cudaDevices[16]; + bool m_cudaHighCPULoad = false; #endif uint64_t m_currentBlock = 0; // default value is 350MB of GPU memory for other stuff (windows system rendering, e.t.c.) diff --git a/libethash-cu/CMakeLists.txt b/libethash-cu/CMakeLists.txt new file mode 100644 index 000000000..f5a682b12 --- /dev/null +++ b/libethash-cu/CMakeLists.txt @@ -0,0 +1,29 @@ +set(EXECUTABLE ethash-cu) + +FIND_PACKAGE(CUDA REQUIRED) + +file(GLOB SRC_LIST "*.cpp" "*.cu") +file(GLOB HEADERS "*.h" "*.cuh") + +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo) + +LIST(APPEND CUDA_NVCC_FLAGS_RELEASE -O3) +LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G) + +if(COMPUTE) + LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) +else(COMPUTE) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_32,code=sm_32;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52) +endif(COMPUTE) + + + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(..) +CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS}) +TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} ethash) + +install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib ) +install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} ) + diff --git a/libethash-cu/cuda_helper.h b/libethash-cu/cuda_helper.h new file mode 100644 index 000000000..1b75d2ffd --- /dev/null +++ b/libethash-cu/cuda_helper.h @@ -0,0 +1,1057 @@ +#ifndef CUDA_HELPER_H +#define CUDA_HELPER_H + +#include +#include + +#ifdef __INTELLISENSE__ +/* reduce vstudio warnings (__byteperm, blockIdx...) */ +#include +#include +#define __launch_bounds__(max_tpb, min_blocks) +#define asm("a" : "=l"(result) : "l"(a)) + +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +void __threadfence_block(void); + +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +#endif + +#include + +#ifndef MAX_GPUS +#define MAX_GPUS 32 +#endif + +extern "C" int device_map[MAX_GPUS]; +extern "C" long device_sm[MAX_GPUS]; +extern cudaStream_t gpustream[MAX_GPUS]; + +// common functions +extern void cuda_check_cpu_init(int thr_id, uint32_t threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern void cuda_check_cpu_setTarget_mod(const void *ptarget, const void *ptarget2); +extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); +extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint32_t foundnonce); +extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + + +#ifndef SPH_C32 +#define SPH_C32(x) ((x ## U)) +// #define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) ((x ## ULL)) +// #define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#ifndef SPH_T32 +#define SPH_T32(x) (x) +// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif +#ifndef SPH_T64 +#define SPH_T64(x) (x) +// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#endif + +#define ROTL32c(x, n) ((x) << (n)) | ((x) >> (32 - (n))) + +#if __CUDA_ARCH__ < 320 +// Kepler (Compute 3.0) +#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#else +// Kepler (Compute 3.5, 5.0) +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return(__funnelshift_l((x), (x), (n))); +} +#endif +#if __CUDA_ARCH__ < 320 +// Kepler (Compute 3.0) +#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return(__funnelshift_r((x), (x), (n))); +} +#endif + + + + +__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) +{ + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(LO), "r"(HI)); + return result; +} + +__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t x, const uint32_t y) +{ + uint64_t result; + asm( + "{\n\t" + ".reg .u32 t,t2; \n\t" + "mov.b64 {t2,t},%1; \n\t" + "mov.b64 %0,{t2,%2}; \n\t" + "}" : "=l"(result) : "l"(x), "r"(y) + ); + return result; + +} +__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t x, const uint32_t y) +{ + uint64_t result; + asm( + "{\n\t" + ".reg .u32 t,t2; \n\t" + "mov.b64 {t2,t},%1; \n\t" + "mov.b64 %0,{%2,t}; \n\t" + "}" : "=l"(result) : "l"(x) , "r"(y) + ); + return result; +} + +// Endian Drehung für 32 Bit Typen +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else + /* host */ + #define cuda_swab32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#endif + + +static __device__ __forceinline__ uint32_t _HIWORD(const uint64_t x) +{ + uint32_t result; + asm( + "{\n\t" + ".reg .u32 xl; \n\t" + "mov.b64 {xl,%0},%1; \n\t" + "}" : "=r"(result) : "l"(x) + ); + return result; +} + +static __device__ __forceinline__ uint32_t _LOWORD(const uint64_t x) +{ + uint32_t result; + asm( + "{\n\t" + ".reg .u32 xh; \n\t" + "mov.b64 {%0,xh},%1; \n\t" + "}" : "=r"(result) : "l"(x) + ); + return result; +} + +// Input: 77665544 33221100 +// Output: 00112233 44556677 +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint64_t cuda_swab64(const uint64_t x) +{ + uint64_t result; + uint2 t; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(t.x), "=r"(t.y) : "l"(x)); + t.x=__byte_perm(t.x, 0, 0x0123); + t.y=__byte_perm(t.y, 0, 0x0123); + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(t.y), "r"(t.x)); + return result; +} +#else + /* host */ + #define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +#endif + +/*********************************************************************/ +// Macros to catch CUDA errors in CUDA runtime calls + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET(call) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return; \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET_X(call, ret) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return ret; \ + } \ +} while (0) + +/*********************************************************************/ +#ifdef _WIN64 +#define USE_XOR_ASM_OPTS 0 +#else +#define USE_XOR_ASM_OPTS 1 +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor1(const uint64_t a, const uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + return result; +} +#else +#define xor1(a,b) (a ^ b) +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3;\n\t" + "xor.b64 %0, %0, %1;\n\t" + /* output : input registers */ + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} +#else +#define xor3(a,b,c) (a ^ b ^ c) +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t d, const uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} +#else +#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h)) +#endif + +// device asm for x17 +__device__ __forceinline__ +uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 n;\n\t" + "xor.b64 %0, %2, %3;\n\t" + "and.b64 n, %0, %1;\n\t" + "xor.b64 %0, n, %3;" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t andor(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m ;\n\t" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t shr_t64(uint64_t x, uint32_t n) +{ + uint64_t result; + asm("shr.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); + return result; +} + +// device asm for ? +__device__ __forceinline__ +uint64_t shl_t64(uint64_t x, uint32_t n) +{ + uint64_t result; + asm("shl.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); + return result; +} + +#ifndef USE_ROT_ASM_OPT +#define USE_ROT_ASM_OPT 1 +#endif + +// 64-bit ROTATE RIGHT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) { + uint2 result; + if(offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shr.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shl.b64 %0, %1, roff;\n\t" + "add.u64 %0, %0, lhs;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#else +/* host */ +#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif + +// 64-bit ROTATE LEFT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shl.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shr.b64 %0, %1, roff;\n\t" + "add.u64 %0, lhs, %0;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3 +__device__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t res; + asm("{\n\t" + ".reg .u32 tl,th,vl,vh;\n\t" + ".reg .pred p;\n\t" + "mov.b64 {tl,th}, %1;\n\t" + "shf.l.wrap.b32 vl, tl, th, %2;\n\t" + "shf.l.wrap.b32 vh, th, tl, %2;\n\t" + "setp.lt.u32 p, %2, 32;\n\t" + "@!p mov.b64 %0, {vl,vh};\n\t" + "@p mov.b64 %0, {vh,vl};\n\t" + "}" + : "=l"(res) : "l"(x) , "r"(offset) + ); + return res; +} +#else +/* host */ +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +__device__ __forceinline__ +uint64_t SWAPDWORDS(uint64_t value) +{ +#if __CUDA_ARCH__ >= 320 + uint2 temp; + asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); + return value; +#else + return ROTL64(value, 32); +#endif +} + +/* lyra2 - int2 operators */ + +__device__ __forceinline__ +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) { + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(lo), "=r"(hi) : "l"(x)); +} + +__device__ __forceinline__ uint64_t devectorize(uint2 x) +{ + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(x.x), "r"(x.y)); + return result; +} + + +__device__ __forceinline__ uint2 vectorize(const uint64_t x) +{ + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(x)); + return result; +} +__device__ __forceinline__ void devectorize2(uint4 inn, uint2 &x, uint2 &y) +{ + x.x = inn.x; + x.y = inn.y; + y.x = inn.z; + y.y = inn.w; +} + + +__device__ __forceinline__ uint4 vectorize2(uint2 x, uint2 y) +{ + uint4 result; + result.x = x.x; + result.y = x.y; + result.z = y.x; + result.w = y.y; + + return result; +} + +__device__ __forceinline__ uint4 vectorize2(uint2 x) +{ + uint4 result; + result.x = x.x; + result.y = x.y; + result.z = x.x; + result.w = x.y; + return result; +} + + +__device__ __forceinline__ uint4 vectorize4(uint64_t x, uint64_t y) +{ + uint4 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(x)); + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.z), "=r"(result.w) : "l"(y)); + return result; +} +__device__ __forceinline__ void devectorize4(uint4 inn, uint64_t &x, uint64_t &y) +{ + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(x) : "r"(inn.x), "r"(inn.y)); + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(y) : "r"(inn.z), "r"(inn.w)); +} + + + +static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { + uint2 result; + result.x = v; + result.y = 0; + return result; +} +static __device__ __forceinline__ uint2 vectorizehigh(uint32_t v) { + uint2 result; + result.x = 0; + result.y = v; + return result; +} + +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint32_t b) { return make_uint2(a.x^ b, a.y); } +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } +static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } +static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} + + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} + + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + + + +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __device__ __forceinline__ uint4 operator& (uint4 a, uint4 b) { return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); } +static __device__ __forceinline__ uint4 operator| (uint4 a, uint4 b) { return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); } +static __device__ __forceinline__ uint4 operator~ (uint4 a) { return make_uint4(~a.x, ~a.y, ~a.z, ~a.w); } +static __device__ __forceinline__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint2 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.x, a.w ^ b.y); } + + +static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; } + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "mul.lo.u32 %0,%2,%4; \n\t" + "mul.hi.u32 %1,%2,%4; \n\t" + "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" + "madc.lo.u32 %1,%3,%5,%1; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +// uint2 method +#if __CUDA_ARCH__ >= 350 +__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; + if (offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__device__ __inline__ uint2 ROR2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y >> (n)) | (v.x << (32 - n))); + result.x = ((v.x >> (n)) | (v.y << (32 - n))); + } + else + { + result.y = ((v.x >> (n - 32)) | (v.y << (64 - n))); + result.x = ((v.y >> (n - 32)) | (v.x << (64 - n))); + } + return result; +} +#endif + + + +__device__ __inline__ uint32_t ROL8(const uint32_t x) +{ + return __byte_perm(x, x, 0x2103); +} +__device__ __inline__ uint32_t ROL16(const uint32_t x) +{ + return __byte_perm(x, x, 0x1032); +} +__device__ __inline__ uint32_t ROL24(const uint32_t x) +{ + return __byte_perm(x, x, 0x0321); +} + +__device__ __inline__ uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + + return result; +} + +__device__ __inline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + + return result; +} + +__device__ __inline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + + return result; +} + +__device__ __inline__ uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + + return result; +} + +__device__ __inline__ uint2 ROL16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +__device__ __inline__ uint2 ROL24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x0765); + + return result; +} + + + +#if __CUDA_ARCH__ >= 350 +__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { + uint2 result; + if (offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__inline__ __device__ uint2 ROL2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else + { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + + } + return result; +} +#endif + +__device__ __forceinline__ +uint64_t ROTR16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x)); + return x; +#else + return ROTR64(x, 16); +#endif +} +__device__ __forceinline__ +uint64_t ROTL16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z)); + return x; +#else + return ROTL64(x, 16); +#endif +} + +__device__ __forceinline__ +uint2 SWAPINT2(uint2 x) +{ + return(make_uint2(x.y, x.x)); +} +__device__ __forceinline__ bool cuda_hashisbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target) +{ + if (hash[7] > target[7]) + return false; + if (hash[7] < target[7]) + return true; + if (hash[6] > target[6]) + return false; + if (hash[6] < target[6]) + return true; + if (hash[5] > target[5]) + return false; + if (hash[5] < target[5]) + return true; + if (hash[4] > target[4]) + return false; + if (hash[4] < target[4]) + return true; + if (hash[3] > target[3]) + return false; + if (hash[3] < target[3]) + return true; + if (hash[2] > target[2]) + return false; + if (hash[2] < target[2]) + return true; + if (hash[1] > target[1]) + return false; + if (hash[1] < target[1]) + return true; + if (hash[0] > target[0]) + return false; + return true; +} + +__device__ __forceinline__ +uint2 SWAPDWORDS2(uint2 value) +{ + return make_uint2(value.y, value.x); +} +__device__ __forceinline__ +uint4 SWAPDWORDS2(uint4 value) +{ + return make_uint4(value.y, value.x, value.w ,value.z); +} + +static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset) +{ +#if __CUDA_ARCH__ > 300 + uint2 result; + if (offset<32) + { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + else { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; +#else + if (offset<=32) + { + a.y = (a.y << offset) | (a.x >> (32 - offset)); + a.x = (a.x << offset); + } + else + { + a.y = (a.x << (offset-32)); + a.x = 0; + } + return a; +#endif +} +static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset) +{ + #if __CUDA_ARCH__ > 300 + uint2 result; + if (offset<32) { + asm("{\n\t" + "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" + "shr.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + else { + asm("{\n\t" + "shf.l.clamp.b32 %0,%2,%3,%4; \n\t" + "shl.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; + #else + if (offset<=32) + { + a.x = (a.x >> offset) | (a.y << (32 - offset)); + a.y = (a.y >> offset); + } + else + { + a.x = (a.y >> (offset - 32)); + a.y = 0; + } + return a; + #endif +} + +static __device__ __forceinline__ uint64_t devectorizeswap(uint2 v) { return MAKE_ULONGLONG(cuda_swab32(v.y), cuda_swab32(v.x)); } +static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { + uint2 result; + LOHI(result.y, result.x, v); + result.x = cuda_swab32(result.x); + result.y = cuda_swab32(result.y); + return result; +} + + +__device__ __forceinline__ uint32_t devectorize16(ushort2 x) +{ + uint32_t result; + asm("mov.b32 %0,{%1,%2}; \n\t" + : "=r"(result) : "h"(x.x) , "h"(x.y)); + return result; +} + + +__device__ __forceinline__ ushort2 vectorize16(uint32_t x) +{ + ushort2 result; + asm("mov.b32 {%0,%1},%2; \n\t" + : "=h"(result.x), "=h"(result.y) : "r"(x)); + return result; +} + + + + +static __device__ __forceinline__ uint4 mul4(uint4 a) +{ + uint4 result; + asm("{\n\t" + "mul.lo.u32 %0,%4,%5; \n\t" + "mul.hi.u32 %1,%4,%5; \n\t" + "mul.lo.u32 %2,%6,%7; \n\t" + "mul.hi.u32 %3,%6,%7; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w)); + return result; +} +static __device__ __forceinline__ uint4 add4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "add.cc.u32 %0,%4,%8; \n\t" + "addc.u32 %1,%5,%9; \n\t" + "add.cc.u32 %2,%6,%10; \n\t" + "addc.u32 %3,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ uint4 madd4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "mad.lo.cc.u32 %0,%4,%5,%8; \n\t" + "madc.hi.u32 %1,%4,%5,%9; \n\t" + "mad.lo.cc.u32 %2,%6,%7,%10; \n\t" + "madc.hi.u32 %3,%6,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ ulonglong2 madd4long(ulonglong2 a, ulonglong2 b) + { + ulonglong2 result; + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %2;\n\t" + "mov.b64 {a2,a3}, %3;\n\t" + "mov.b64 {b0,b1}, %4;\n\t" + "mov.b64 {b2,b3}, %5;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "=l"(result.x), "=l"(result.y) : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)); + return result; + } +static __device__ __forceinline__ void madd4long2(ulonglong2 &a, ulonglong2 b) + { + + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %0;\n\t" + "mov.b64 {a2,a3}, %1;\n\t" + "mov.b64 {b0,b1}, %2;\n\t" + "mov.b64 {b2,b3}, %3;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "+l"(a.x), "+l"(a.y) : "l"(b.x), "l"(b.y)); + +} + +__device__ __forceinline__ +uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); + return result; +} + +__device__ __forceinline__ +uint32_t shr_t32(uint32_t x, uint32_t n) { + uint32_t result; + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +__device__ __forceinline__ +uint32_t shl_t32(uint32_t x, uint32_t n) { + uint32_t result; + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +// device asm 32 for pluck +__device__ __forceinline__ +uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); + return result; +} + +__device__ __forceinline__ +uint32_t bfe(uint32_t x, uint32_t bit, uint32_t numBits) { + uint32_t ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(bit), "r"(numBits)); + return ret; + +} + +__device__ __forceinline__ +uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits) { + uint32_t ret; + asm("bfi.b32 %0, %1, %2, %3,%4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits)); + return ret; +} + + + +#endif // #ifndef CUDA_HELPER_H + + diff --git a/libethash-cu/dagger.cuh b/libethash-cu/dagger.cuh new file mode 100644 index 000000000..8b341171b --- /dev/null +++ b/libethash-cu/dagger.cuh @@ -0,0 +1,22 @@ +#define copy(dst, src, count) for (uint32_t i = 0; i < count; i++) { (dst)[i] = (src)[i]; } + +#define ACCESSES 64 +#define THREADS_PER_HASH (128 / 16) +#define FNV_PRIME 0x01000193 + +#define fnv(x,y) ((x) * FNV_PRIME ^(y)) + +__device__ uint4 fnv4(uint4 a, uint4 b) +{ + uint4 c; + c.x = a.x * FNV_PRIME ^ b.x; + c.y = a.y * FNV_PRIME ^ b.y; + c.z = a.z * FNV_PRIME ^ b.z; + c.w = a.w * FNV_PRIME ^ b.w; + return c; +} + +__device__ uint32_t fnv_reduce(uint4 v) +{ + return fnv(fnv(fnv(v.x, v.y), v.z), v.w); +} \ No newline at end of file diff --git a/libethash-cu/dagger_shared.cuh b/libethash-cu/dagger_shared.cuh new file mode 100644 index 000000000..8de4c591f --- /dev/null +++ b/libethash-cu/dagger_shared.cuh @@ -0,0 +1,136 @@ +#include "ethash_cu_miner_kernel_globals.h" +#include "ethash_cu_miner_kernel.h" +#include "keccak.cuh" +#include "dagger.cuh" + +typedef union +{ + hash64_t init; + hash32_t mix; +} compute_hash_share; + +__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce) +{ + hash64_t init; + + // sha3_512(header .. nonce) + uint64_t state[25]; + + copy(state, header->uint64s, 4); + state[4] = nonce; + state[5] = 0x0000000000000001; + state[6] = 0; + state[7] = 0; + state[8] = 0x8000000000000000; + for (uint32_t i = 9; i < 25; i++) + { + state[i] = 0; + } + + keccak_f1600_block((uint2 *)state, 8); + copy(init.uint64s, state, 8); + return init; +} + +__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag) +{ + // share init0 + if (thread_id == 0) + *share = mix.x; + + uint32_t init0 = *share; + + uint32_t a = 0; + + do + { + + bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); + + //#pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + + if (update_share) + { + uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w }; + *share = fnv(init0 ^ (a + i), m[i]) % d_dag_size; + } + __threadfence_block(); + +#if __CUDA_ARCH__ >= 350 + mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id])); +#else + mix = fnv4(mix, g_dag[*share].uint4s[thread_id]); +#endif + + } + + } while ((a += 4) != ACCESSES); + + return fnv_reduce(mix); +} + +__device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix) +{ + uint64_t state[25]; + + hash32_t hash; + + // keccak_256(keccak_512(header..nonce) .. mix); + copy(state, init->uint64s, 8); + copy(state + 8, mix->uint64s, 4); + state[12] = 0x0000000000000001; + for (uint32_t i = 13; i < 16; i++) + { + state[i] = 0; + } + state[16] = 0x8000000000000000; + for (uint32_t i = 17; i < 25; i++) + { + state[i] = 0; + } + + keccak_f1600_block((uint2 *)state, 1); + + // copy out + copy(hash.uint64s, state, 4); + return hash; +} + +__device__ hash32_t compute_hash( + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t nonce + ) +{ + extern __shared__ compute_hash_share share[]; + + // Compute one init hash per work item. + hash64_t init = init_hash(g_header, nonce); + + // Threads work together in this phase in groups of 8. + uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1); + uint32_t const hash_id = threadIdx.x >> 3; + + hash32_t mix; + + for (int i = 0; i < THREADS_PER_HASH; i++) + { + // share init with other threads + if (i == thread_id) + share[hash_id].init = init; + + uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3]; + + uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag); + + share[hash_id].mix.uint32s[thread_id] = thread_mix; + + + if (i == thread_id) + mix = share[hash_id].mix; + } + + return final_hash(&init, &mix); +} \ No newline at end of file diff --git a/libethash-cu/dagger_shuffled.cuh b/libethash-cu/dagger_shuffled.cuh new file mode 100644 index 000000000..fdac9d9e1 --- /dev/null +++ b/libethash-cu/dagger_shuffled.cuh @@ -0,0 +1,107 @@ +#include "ethash_cu_miner_kernel_globals.h" +#include "ethash_cu_miner_kernel.h" +#include "keccak.cuh" +#include "dagger.cuh" + +__device__ uint64_t compute_hash_shuffle( + uint2 const* g_header, + hash128_t const* g_dag, + uint64_t nonce + ) +{ + // sha3_512(header .. nonce) + uint2 state[25]; + + state[0] = g_header[0]; + state[1] = g_header[1]; + state[2] = g_header[2]; + state[3] = g_header[3]; + state[4] = vectorize(nonce); + state[5] = vectorize(0x0000000000000001ULL); + for (uint32_t i = 6; i < 25; i++) + { + state[i] = make_uint2(0, 0); + } + state[8] = vectorize(0x8000000000000000ULL); + keccak_f1600_block(state,8); + + // Threads work together in this phase in groups of 8. + const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); + const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1); + const int mix_idx = thread_id & 3; + + uint4 mix; + uint2 shuffle[8]; + + for (int i = 0; i < THREADS_PER_HASH; i++) + { + // share init among threads + for (int j = 0; j < 8; j++) { + shuffle[j].x = __shfl(state[j].x, start_lane + i); + shuffle[j].y = __shfl(state[j].y, start_lane + i); + } + + // ugly but avoids local reads/writes + if (mix_idx < 2) { + if (mix_idx == 0) + mix = vectorize2(shuffle[0], shuffle[1]); + else + mix = vectorize2(shuffle[2], shuffle[3]); + } + else { + if (mix_idx == 2) + mix = vectorize2(shuffle[4], shuffle[5]); + else + mix = vectorize2(shuffle[6], shuffle[7]); + } + + uint32_t init0 = __shfl(shuffle[0].x, start_lane); + + for (uint32_t a = 0; a < ACCESSES; a += 4) + { + int t = ((a >> 2) & (THREADS_PER_HASH - 1)); + + for (uint32_t b = 0; b < 4; b++) + { + if (thread_id == t) + { + shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size; + } + shuffle[0].x = __shfl(shuffle[0].x, start_lane + t); + + mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]); + } + } + + uint32_t thread_mix = fnv_reduce(mix); + + // update mix accross threads + shuffle[0].x = __shfl(thread_mix, start_lane + 0); + shuffle[0].y = __shfl(thread_mix, start_lane + 1); + shuffle[1].x = __shfl(thread_mix, start_lane + 2); + shuffle[1].y = __shfl(thread_mix, start_lane + 3); + shuffle[2].x = __shfl(thread_mix, start_lane + 4); + shuffle[2].y = __shfl(thread_mix, start_lane + 5); + shuffle[3].x = __shfl(thread_mix, start_lane + 6); + shuffle[3].y = __shfl(thread_mix, start_lane + 7); + + if (i == thread_id) { + //move mix into state: + state[8] = shuffle[0]; + state[9] = shuffle[1]; + state[10] = shuffle[2]; + state[11] = shuffle[3]; + } + } + + // keccak_256(keccak_512(header..nonce) .. mix); + state[12] = vectorize(0x0000000000000001ULL); + for (uint32_t i = 13; i < 25; i++) + { + state[i] = vectorize(0ULL); + } + state[16] = vectorize(0x8000000000000000); + keccak_f1600_block(state, 1); + + return devectorize(state[0]); +} \ No newline at end of file diff --git a/libethash-cu/ethash_cu_miner.cpp b/libethash-cu/ethash_cu_miner.cpp new file mode 100644 index 000000000..4d2f3d2be --- /dev/null +++ b/libethash-cu/ethash_cu_miner.cpp @@ -0,0 +1,281 @@ +/* + This file is part of c-ethash. + + c-ethash is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + c-ethash is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with cpp-ethereum. If not, see . +*/ +/** @file ethash_cu_miner.cpp +* @author Tim Hughes +* @date 2015 +*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ethash_cu_miner.h" +#include "ethash_cu_miner_kernel_globals.h" + + +#define ETHASH_BYTES 32 + +// workaround lame platforms +#if !CL_VERSION_1_2 +#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE +#define CL_MEM_HOST_READ_ONLY 0 +#endif + +#undef min +#undef max + +using namespace std; + +unsigned const ethash_cu_miner::c_defaultLocalWorkSize = 128; +unsigned const ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier = 2048; // * CL_DEFAULT_LOCAL_WORK_SIZE + +ethash_cu_miner::search_hook::~search_hook() {} + +ethash_cu_miner::ethash_cu_miner() +{ +} + +std::string ethash_cu_miner::platform_info(unsigned _deviceId) +{ + int runtime_version; + int device_count; + + device_count = get_num_devices(); + + if (device_count == 0) + return std::string(); + + if (cudaRuntimeGetVersion(&runtime_version) == cudaErrorInvalidValue) + { + cout << cudaGetErrorString(cudaErrorInvalidValue) << endl; + return std::string(); + } + + // use selected default device + int device_num = std::min((int)_deviceId, device_count - 1); + + cudaDeviceProp device_props; + if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice) + { + cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl; + return std::string(); + } + + char platform[5]; + int version_major = runtime_version / 1000; + int version_minor = (runtime_version - (version_major * 1000)) / 10; + sprintf(platform, "%d.%d", version_major, version_minor); + + + char compute[5]; + sprintf(compute, "%d.%d", device_props.major, device_props.minor); + + return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + device_props.name + "\", \"version\": \"Compute " + std::string(compute) + "\" }"; + +} + +int ethash_cu_miner::get_num_devices() +{ + int device_count; + + if (cudaGetDeviceCount(&device_count) == cudaErrorNoDevice) + { + cout << cudaGetErrorString(cudaErrorNoDevice) << endl; + return 0; + } + return device_count; +} + +void ethash_cu_miner::finish() +{ + for (unsigned i = 0; i != m_num_buffers; i++) { + cudaStreamDestroy(m_streams[i]); + m_streams[i] = 0; + } + cudaDeviceReset(); +} + +bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers, unsigned search_batch_size, unsigned workgroup_size, unsigned _deviceId, bool highcpu) +{ + + int device_count = get_num_devices(); + + if (device_count == 0) + return false; + + // use selected device + int device_num = std::min((int)_deviceId, device_count - 1); + + cudaDeviceProp device_props; + if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice) + { + cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl; + return false; + } + + cout << "Using device: " << device_props.name << "(" << device_props.major << "." << device_props.minor << ")" << endl; + + cudaError_t r = cudaSetDevice(device_num); + if (r != cudaSuccess) + { + cout << cudaGetErrorString(r) << endl; + return false; + } + cudaDeviceReset(); + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + + m_num_buffers = num_buffers; + m_search_batch_size = search_batch_size; + + m_hash_buf = new void *[m_num_buffers]; + m_search_buf = new uint32_t *[m_num_buffers]; + m_streams = new cudaStream_t[m_num_buffers]; + + // use requested workgroup size, but we require multiple of 8 + m_workgroup_size = ((workgroup_size + 7) / 8) * 8; + + m_highcpu = highcpu; + + // patch source code + cudaError result; + + uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES); + unsigned max_outputs = c_max_search_results; + + result = set_constants(&dagSize128, &max_outputs); + + // create buffer for dag + result = cudaMalloc(&m_dag_ptr, _dagSize); + + // create buffer for header256 + result = cudaMalloc(&m_header, 32); + + // copy dag to CPU. + result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice); + + // create mining buffers + for (unsigned i = 0; i != m_num_buffers; ++i) + { + result = cudaMallocHost(&m_hash_buf[i], 32 * c_hash_batch_size); + result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t)); + result = cudaStreamCreate(&m_streams[i]); + } + if (result != cudaSuccess) + { + cout << cudaGetErrorString(result) << endl; + return false; + } + return true; +} + +/** + * Prevent High CPU usage while waiting for an async task + */ +static unsigned waitStream(cudaStream_t stream) +{ + unsigned wait_ms = 0; + while (cudaStreamQuery(stream) == cudaErrorNotReady) { + this_thread::sleep_for(chrono::milliseconds(10)); + wait_ms += 10; + } + return wait_ms; +} + +void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook& hook) +{ + struct pending_batch + { + uint64_t start_nonce; + unsigned buf; + }; + std::queue pending; + + static uint32_t const c_zero = 0; + + // update header constant buffer + cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice); + for (unsigned i = 0; i != m_num_buffers; ++i) + { + cudaMemcpy(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice); + } + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) + { + throw std::runtime_error(cudaGetErrorString(err)); + } + + unsigned buf = 0; + std::random_device engine; + uint64_t start_nonce = std::uniform_int_distribution()(engine); + for (;; start_nonce += m_search_batch_size) + { + run_ethash_search(m_search_batch_size / m_workgroup_size, m_workgroup_size, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target); + + pending.push({ start_nonce, buf }); + buf = (buf + 1) % m_num_buffers; + + // read results + if (pending.size() == m_num_buffers) + { + pending_batch const& batch = pending.front(); + + uint32_t results[1 + c_max_search_results]; + + if (!m_highcpu) + waitStream(m_streams[buf]); // 28ms + cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyHostToHost, m_streams[batch.buf]); + + unsigned num_found = std::min(results[0], c_max_search_results); + uint64_t nonces[c_max_search_results]; + for (unsigned i = 0; i != num_found; ++i) + { + nonces[i] = batch.start_nonce + results[i + 1]; + //cout << results[i + 1] << ", "; + } + //if (num_found > 0) + // cout << endl; + + bool exit = num_found && hook.found(nonces, num_found); + exit |= hook.searched(batch.start_nonce, m_search_batch_size); // always report searched before exit + if (exit) + break; + + start_nonce += m_search_batch_size; + // reset search buffer if we're still going + if (num_found) + cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]); + + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) + { + throw std::runtime_error(cudaGetErrorString(err)); + } + pending.pop(); + } + } +} + diff --git a/libethash-cu/ethash_cu_miner.h b/libethash-cu/ethash_cu_miner.h new file mode 100644 index 000000000..54e535850 --- /dev/null +++ b/libethash-cu/ethash_cu_miner.h @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include +#include +#include +#include "ethash_cu_miner_kernel.h" + +class ethash_cu_miner +{ +public: + struct search_hook + { + virtual ~search_hook(); // always a virtual destructor for a class with virtuals. + + // reports progress, return true to abort + virtual bool found(uint64_t const* nonces, uint32_t count) = 0; + virtual bool searched(uint64_t start_nonce, uint32_t count) = 0; + }; + +public: + ethash_cu_miner(); + + bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers = 2, unsigned search_batch_size = 262144, unsigned workgroup_size = 64, unsigned _deviceId = 0, bool highcpu = false); + static std::string platform_info(unsigned _deviceId = 0); + static int get_num_devices(); + + + void finish(); + void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count); + void search(uint8_t const* header, uint64_t target, search_hook& hook); + + /* -- default values -- */ + /// Default value of the local work size. Also known as workgroup size. + static unsigned const c_defaultLocalWorkSize; + /// Default value of the global work size as a multiplier of the local work size + static unsigned const c_defaultGlobalWorkSizeMultiplier; + +private: + enum { c_max_search_results = 63, c_hash_batch_size = 1024 }; + + bool m_highcpu; + unsigned m_num_buffers; + unsigned m_search_batch_size; + unsigned m_workgroup_size; + + hash128_t * m_dag_ptr; + hash32_t * m_header; + + void ** m_hash_buf; + uint32_t ** m_search_buf; + cudaStream_t * m_streams; + + +}; \ No newline at end of file diff --git a/libethash-cu/ethash_cu_miner_kernel.cu b/libethash-cu/ethash_cu_miner_kernel.cu new file mode 100644 index 000000000..dc1da4fb1 --- /dev/null +++ b/libethash-cu/ethash_cu_miner_kernel.cu @@ -0,0 +1,72 @@ +/* +* Genoil's CUDA mining kernel for Ethereum +* based on Tim Hughes' opencl kernel. +* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer. +*/ + +#include "ethash_cu_miner_kernel.h" +#include "ethash_cu_miner_kernel_globals.h" +#include "cuda_helper.h" + +#define SHUFFLE_MIN_VER 350 +#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER +#include "dagger_shuffled.cuh" +#else +#include "dagger_shared.cuh" +#endif + +__global__ void +__launch_bounds__(128, 7) +ethash_search( + uint32_t* g_output, + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t start_nonce, + uint64_t target + ) +{ + + uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; + +#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER + uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid); + if (cuda_swab64(hash) < target) +#else + hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid); + if (cuda_swab64(hash.uint64s[0]) < target) +#endif + { + atomicInc(g_output, d_max_outputs); + g_output[g_output[0]] = gid; + } + +} + +void run_ethash_search( + uint32_t blocks, + uint32_t threads, + cudaStream_t stream, + uint32_t* g_output, + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t start_nonce, + uint64_t target +) +{ +#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER + ethash_search <<>>(g_output, g_header, g_dag, start_nonce, target); +#else + ethash_search <<>>(g_output, g_header, g_dag, start_nonce, target); +#endif +} + +cudaError set_constants( + uint32_t * dag_size, + uint32_t * max_outputs + ) +{ + cudaError result; + result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t)); + result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t)); + return result; +} diff --git a/libethash-cu/ethash_cu_miner_kernel.h b/libethash-cu/ethash_cu_miner_kernel.h new file mode 100644 index 000000000..6f22ed447 --- /dev/null +++ b/libethash-cu/ethash_cu_miner_kernel.h @@ -0,0 +1,60 @@ +#ifndef _ETHASH_CU_MINER_KERNEL_H_ +#define _ETHASH_CU_MINER_KERNEL_H_ + +#include + + +typedef union +{ + uint64_t uint64s[16 / sizeof(uint64_t)]; + uint32_t uint32s[16 / sizeof(uint32_t)]; +} hash16_t; + +typedef union +{ + uint32_t uint32s[32 / sizeof(uint32_t)]; + uint64_t uint64s[32 / sizeof(uint64_t)]; + uint2 uint2s[32 / sizeof(uint2)]; +} hash32_t; + + +typedef union +{ + uint32_t uint32s[64 / sizeof(uint32_t)]; + uint64_t uint64s[64 / sizeof(uint64_t)]; + uint4 uint4s[64 / sizeof(uint4)]; +} hash64_t; + + +typedef union +{ + uint32_t uint32s[128 / sizeof(uint32_t)]; + uint4 uint4s[128 / sizeof(uint4)]; +} hash128_t; + +//typedef uint32_t hash128_t; + +cudaError set_constants( + uint32_t * dag_size, + uint32_t * max_outputs +); + +void run_ethash_hash( + hash32_t* g_hashes, + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t start_nonce +); + +void run_ethash_search( + uint32_t search_batch_size, + uint32_t workgroup_size, + cudaStream_t stream, + uint32_t* g_output, + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t start_nonce, + uint64_t target +); + +#endif diff --git a/libethash-cu/ethash_cu_miner_kernel_globals.h b/libethash-cu/ethash_cu_miner_kernel_globals.h new file mode 100644 index 000000000..20754d8a9 --- /dev/null +++ b/libethash-cu/ethash_cu_miner_kernel_globals.h @@ -0,0 +1,9 @@ +#ifndef _ETHASH_CU_MINER_KERNEL_GLOBALS_H_ +#define _ETHASH_CU_MINER_KERNEL_GLOBALS_H_ + +//#include "cuda_helper.h" + +__constant__ uint32_t d_dag_size; +__constant__ uint32_t d_max_outputs; + +#endif \ No newline at end of file diff --git a/libethash-cu/keccak.cuh b/libethash-cu/keccak.cuh new file mode 100644 index 000000000..99ff45222 --- /dev/null +++ b/libethash-cu/keccak.cuh @@ -0,0 +1,89 @@ +#include "cuda_helper.h" + +__device__ __constant__ uint64_t const keccak_round_constants[24] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, + 0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL, + 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL, + 0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, + 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL, + 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL +}; + +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) + +__device__ __forceinline__ void keccak_f1600_block(uint2* s, uint32_t out_size) +{ + uint2 t[5], u, v; + +#pragma unroll 3 + for (int i = 0; i < 24; i++) + { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + u = t[4] ^ ROL2(t[1], 1); + s[0] ^= u; s[5] ^= u; s[10] ^= u; s[15] ^= u; s[20] ^= u; + u = t[0] ^ ROL2(t[2], 1); + s[1] ^= u; s[6] ^= u; s[11] ^= u; s[16] ^= u; s[21] ^= u; + u = t[1] ^ ROL2(t[3], 1); + s[2] ^= u; s[7] ^= u; s[12] ^= u; s[17] ^= u; s[22] ^= u; + u = t[2] ^ ROL2(t[4], 1); + s[3] ^= u; s[8] ^= u; s[13] ^= u; s[18] ^= u; s[23] ^= u; + u = t[3] ^ ROL2(t[0], 1); + s[4] ^= u; s[9] ^= u; s[14] ^= u; s[19] ^= u; s[24] ^= u; + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(u, 1); + + // squeeze this in here + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + u = s[0]; v = s[1]; s[0] ^= (~v) & s[2]; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= vectorize(keccak_round_constants[i]); + if (i == 23 && out_size == 1) return; + + // continue chi + s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & u; s[4] ^= (~u) & v; + u = s[5]; v = s[6]; s[5] ^= (~v) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; + + if (i == 23) return; + s[8] ^= (~s[9]) & u; s[9] ^= (~u) & v; + u = s[10]; v = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ u, s[13], s[14]); s[14] = bitselect(s[14] ^ v, s[14], u); + u = s[15]; v = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ u, s[18], s[19]); s[19] = bitselect(s[19] ^ v, s[19], u); + u = s[20]; v = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ u, s[23], s[24]); s[24] = bitselect(s[24] ^ v, s[24], u); + } +} \ No newline at end of file diff --git a/libethcore/CMakeLists.txt b/libethcore/CMakeLists.txt index a5be81a26..3bd626449 100644 --- a/libethcore/CMakeLists.txt +++ b/libethcore/CMakeLists.txt @@ -21,6 +21,9 @@ target_link_libraries(${EXECUTABLE} evmcore) if (ETHASHCL) target_link_libraries(${EXECUTABLE} ethash-cl) endif () +if (ETHASHCU) + target_link_libraries(${EXECUTABLE} ethash-cu) +endif () target_link_libraries(${EXECUTABLE} devcrypto) if (CPUID_FOUND) target_link_libraries(${EXECUTABLE} ${CPUID_LIBRARIES})