end of day commit..nothing works yet

10 years ago · 30bfe53bad
15 changed files with 1989 additions and 5 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -49,6 +49,7 @@ set(D_TOOLS ON)
 set(D_TESTS ON)
 set(D_FATDB ON)
 set(D_ETHASHCL ON)
+set(D_ETHASHCU OFF)
 set(D_EVMJIT ON)
 set(D_JSCONSOLE ON)
 set(D_JSONRPC ON)
@ -126,6 +127,20 @@ elseif (BUNDLE STREQUAL "miner")
 	set(D_JSONRPC ON)
 	set(D_JSCONSOLE OFF)
 	set(D_EVMJIT OFF)
+elseif (BUNDLE STREQUAL "cudaminer")
+	set(D_SERPENT OFF)
+	set(D_USENPM OFF)
+	set(D_GUI OFF)
+	set(D_TOOLS OFF)
+	set(D_TESTS OFF)
+	set(D_ETHKEY OFF)
+	set(D_MINER ON)
+	set(D_ETHASHCL ON)
+	set(D_ETHASHCU ON) 
+	set(D_FATDB OFF)
+	set(D_JSONRPC ON)
+	set(D_JSCONSOLE OFF)
+	set(D_EVMJIT OFF)
 elseif (BUNDLE STREQUAL "release")      # release builds
 	set(D_SERPENT ${DECENT_PLATFORM})
 	set(D_USENPM OFF)
@ -158,6 +173,10 @@ function(configureProject)
 		add_definitions(-DETH_ETHASHCL)
 	endif()

+	if (ETHASHCU)
+		add_definitions(-DETH_ETHASHCU)
+	endif()
+	
 	if (EVMJIT)
 		add_definitions(-DETH_EVMJIT)
 	endif()
@ -291,6 +310,7 @@ eth_format_option(ROCKSDB)
 eth_format_option(TOOLS)
 eth_format_option(ETHKEY)
 eth_format_option(ETHASHCL)
+eth_format_option(ETHASHCU)
 eth_format_option(JSCONSOLE)
 eth_format_option(OLYMPIC)
 eth_format_option(SERPENT)
@ -341,6 +361,7 @@ message("-- SERPENT          Build Serpent language components        ${SERPENT}
 message("-- GUI              Build GUI components                     ${GUI}")
 message("-- TESTS            Build tests                              ${TESTS}")
 message("-- ETHASHCL         Build OpenCL components                  ${ETHASHCL}")
+message("-- ETHASHCU         Build CUDA components                    ${ETHASHCU}")
 message("-- JSCONSOLE        Build with javascript console            ${JSCONSOLE}")
 message("-- EVMJIT           Build LLVM-based JIT EVM                 ${EVMJIT}")
 message("------------------------------------------------------------------------")
@ -429,6 +450,9 @@ if (GENERAL OR MINER)
 	if (ETHASHCL)
 		add_subdirectory(libethash-cl)
 	endif ()
+	if (ETHASHCU)
+		add_subdirectory(libethash-cu)
+	endif ()
 endif ()

 add_subdirectory(libethcore)
--- a/cmake/EthDependencies.cmake
+++ b/cmake/EthDependencies.cmake
@ -128,6 +128,12 @@ if (OpenCL_FOUND)
 	message(" - opencl lib   : ${OpenCL_LIBRARIES}")
 endif()

+find_package (CUDA)
+if (CUDA_FOUND)
+	message(" - CUDA header: ${CUDA_INCLUDE_DIRS}")
+	message(" - CUDA lib   : ${CUDA_LIBRARIES}")
+endif()
+
 # find location of jsonrpcstub
 find_program(ETH_JSON_RPC_STUB jsonrpcstub)
 message(" - jsonrpcstub location    : ${ETH_JSON_RPC_STUB}")
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@ -39,12 +39,16 @@
 #include <libdevcore/SHA3.h>
 #include <libdevcore/CommonJS.h>
 #include <libethcore/EthashAux.h>
+#include <libethcore/EthashCUDAMiner.h>
 #include <libethcore/EthashGPUMiner.h>
 #include <libethcore/EthashCPUMiner.h>
 #include <libethcore/Farm.h>
 #if ETH_ETHASHCL || !ETH_TRUE
 #include <libethash-cl/ethash_cl_miner.h>
 #endif
+#if ETH_ETHASHCU || !ETH_TRUE
+#include <libethash-cu/ethash_cu_miner.h>
+#endif
 #if ETH_JSONRPC || !ETH_TRUE
 #include <libweb3jsonrpc/WebThreeStubServer.h>
 #include <jsonrpccpp/server/connectors/httpserver.h>
@ -140,8 +144,8 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
-#if ETH_ETHASHCL || !ETH_TRUE
-		else if (arg == "--cl-global-work" && i + 1 < argc)
+#if ETH_ETHASHCL || ETH_ETHASHCU || !ETH_TRUE
+		else if (arg == "--gpu-global-work" && i + 1 < argc)
 			try {
 				m_globalWorkSizeMultiplier = stol(argv[++i]);
 			}
@ -150,7 +154,7 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
-		else if (arg == "--cl-local-work" && i + 1 < argc)
+		else if (arg == "--gpu-local-work" && i + 1 < argc)
 			try {
 				m_localWorkSize = stol(argv[++i]);
 			}
@ -219,6 +223,8 @@ public:
 			m_minerType = MinerType::CPU;
 		else if (arg == "-G" || arg == "--opencl")
 			m_minerType = MinerType::GPU;
+		else if (arg == "-U" || arg == "--cuda")
+			m_minerType = MinerType::CUDA;
 		else if (arg == "--current-block" && i + 1 < argc)
 			m_currentBlock = stol(argv[++i]);
 		else if (arg == "--no-precompute")
@ -289,6 +295,21 @@ public:
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
 		}
+		else if (arg == "--cuda-devices") {
+			while (m_cudaDeviceCount < 16 && i + 1 < argc)
+			{
+				try {
+					m_cudaDevices[m_cudaDeviceCount++] = stol(argv[++i]);
+				}
+				catch (...)
+				{
+					break;
+				}
+			}
+		}
+		else if (arg == "--cuda-high-cpu") {
+			m_cudaHighCPULoad = true;
+		}
 		else
 			return false;
 		return true;
@ -377,7 +398,8 @@ public:
 	enum class MinerType
 	{
 		CPU,
-		GPU
+		GPU,
+		CUDA
 	};

 	MinerType minerType() const { return m_minerType; }
@ -476,6 +498,9 @@ private:
 		sealers["cpu"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashCPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCPUMiner(ci); }};
 #if ETH_ETHASHCL
 		sealers["opencl"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashGPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashGPUMiner(ci); }};
+#endif
+#if ETH_ETHASHCU
+		sealers["cuda"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{ &EthashCUDAMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCUDAMiner(ci); } };
 #endif
 		(void)_m;
 		(void)_remote;
@ -491,7 +516,8 @@ private:
 			f.start("cpu");
 		else if (_m == MinerType::GPU)
 			f.start("opencl");
-
+		else if (_m == MinerType::CUDA)
+			f.start("cuda");
 		EthashProofOfWork::WorkPackage current;
 		EthashAux::FullType dag;
 		while (true)
@ -589,6 +615,13 @@ private:
 	unsigned m_globalWorkSizeMultiplier = ethash_cl_miner::c_defaultGlobalWorkSizeMultiplier;
 	unsigned m_localWorkSize = ethash_cl_miner::c_defaultLocalWorkSize;
 	unsigned m_msPerBatch = ethash_cl_miner::c_defaultMSPerBatch;
+#endif
+#if ETH_ETHASHCU || !ETH_TRUE
+	unsigned m_globalWorkSizeMultiplier = ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier;
+	unsigned m_localWorkSize = ethash_cu_miner::c_defaultLocalWorkSize;
+	unsigned m_cudaDeviceCount = 0;
+	unsigned m_cudaDevices[16];
+	bool	 m_cudaHighCPULoad = false;
 #endif
 	uint64_t m_currentBlock = 0;
 	// default value is 350MB of GPU memory for other stuff (windows system rendering, e.t.c.)
--- a/libethash-cu/CMakeLists.txt
+++ b/libethash-cu/CMakeLists.txt
@ -0,0 +1,29 @@
+set(EXECUTABLE ethash-cu)
+
+FIND_PACKAGE(CUDA REQUIRED)
+
+file(GLOB SRC_LIST "*.cpp" "*.cu")
+file(GLOB HEADERS "*.h" "*.cuh")
+
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
+
+LIST(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
+LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
+
+if(COMPUTE)
+	LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
+else(COMPUTE)
+	set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_32,code=sm_32;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52)
+endif(COMPUTE)
+
+
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} ethash)
+
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
+
--- a/libethash-cu/cuda_helper.h
+++ b/libethash-cu/cuda_helper.h
--- a/libethash-cu/dagger.cuh
+++ b/libethash-cu/dagger.cuh
@ -0,0 +1,22 @@
+#define copy(dst, src, count) for (uint32_t i = 0; i < count; i++) { (dst)[i] = (src)[i]; }
+
+#define ACCESSES 64
+#define THREADS_PER_HASH (128 / 16)
+#define FNV_PRIME	0x01000193
+
+#define fnv(x,y) ((x) * FNV_PRIME ^(y))
+
+__device__ uint4 fnv4(uint4 a, uint4 b)
+{
+	uint4 c;
+	c.x = a.x * FNV_PRIME ^ b.x;
+	c.y = a.y * FNV_PRIME ^ b.y;
+	c.z = a.z * FNV_PRIME ^ b.z;
+	c.w = a.w * FNV_PRIME ^ b.w;
+	return c;
+}
+
+__device__ uint32_t fnv_reduce(uint4 v)
+{
+	return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
+}
--- a/libethash-cu/dagger_shared.cuh
+++ b/libethash-cu/dagger_shared.cuh
@ -0,0 +1,136 @@
+#include "ethash_cu_miner_kernel_globals.h"
+#include "ethash_cu_miner_kernel.h"
+#include "keccak.cuh"
+#include "dagger.cuh"
+
+typedef union
+{
+	hash64_t init;
+	hash32_t mix;
+} compute_hash_share;
+
+__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
+{
+	hash64_t init;
+
+	// sha3_512(header .. nonce)
+	uint64_t state[25];
+
+	copy(state, header->uint64s, 4);
+	state[4] = nonce;
+	state[5] = 0x0000000000000001;
+	state[6] = 0;
+	state[7] = 0;
+	state[8] = 0x8000000000000000;
+	for (uint32_t i = 9; i < 25; i++)
+	{
+		state[i] = 0;
+	}
+
+	keccak_f1600_block((uint2 *)state, 8);
+	copy(init.uint64s, state, 8);
+	return init;
+}
+
+__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
+{
+	// share init0
+	if (thread_id == 0)
+		*share = mix.x;
+
+	uint32_t init0 = *share;
+
+	uint32_t a = 0;
+
+	do
+	{
+
+		bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));
+
+		//#pragma unroll 4
+		for (uint32_t i = 0; i < 4; i++)
+		{
+
+			if (update_share)
+			{
+				uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w };
+				*share = fnv(init0 ^ (a + i), m[i]) % d_dag_size;
+			}
+			__threadfence_block();
+
+#if __CUDA_ARCH__ >= 350
+			mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id]));
+#else
+			mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+#endif
+
+		}
+
+	} while ((a += 4) != ACCESSES);
+
+	return fnv_reduce(mix);
+}
+
+__device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
+{
+	uint64_t state[25];
+
+	hash32_t hash;
+
+	// keccak_256(keccak_512(header..nonce) .. mix);
+	copy(state, init->uint64s, 8);
+	copy(state + 8, mix->uint64s, 4);
+	state[12] = 0x0000000000000001;
+	for (uint32_t i = 13; i < 16; i++)
+	{
+		state[i] = 0;
+	}
+	state[16] = 0x8000000000000000;
+	for (uint32_t i = 17; i < 25; i++)
+	{
+		state[i] = 0;
+	}
+
+	keccak_f1600_block((uint2 *)state, 1);
+
+	// copy out
+	copy(hash.uint64s, state, 4);
+	return hash;
+}
+
+__device__ hash32_t compute_hash(
+	hash32_t const* g_header,
+	hash128_t const* g_dag,
+	uint64_t nonce
+	)
+{
+	extern __shared__  compute_hash_share share[];
+
+	// Compute one init hash per work item.
+	hash64_t init = init_hash(g_header, nonce);
+
+	// Threads work together in this phase in groups of 8.
+	uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
+	uint32_t const hash_id = threadIdx.x >> 3;
+
+	hash32_t mix;
+
+	for (int i = 0; i < THREADS_PER_HASH; i++)
+	{
+		// share init with other threads
+		if (i == thread_id)
+			share[hash_id].init = init;
+
+		uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3];
+
+		uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag);
+
+		share[hash_id].mix.uint32s[thread_id] = thread_mix;
+
+
+		if (i == thread_id)
+			mix = share[hash_id].mix;
+	}
+
+	return final_hash(&init, &mix);
+}
--- a/libethash-cu/dagger_shuffled.cuh
+++ b/libethash-cu/dagger_shuffled.cuh
@ -0,0 +1,107 @@
+#include "ethash_cu_miner_kernel_globals.h"
+#include "ethash_cu_miner_kernel.h"
+#include "keccak.cuh"
+#include "dagger.cuh"
+
+__device__ uint64_t compute_hash_shuffle(
+	uint2 const* g_header,
+	hash128_t const* g_dag,
+	uint64_t nonce
+	)
+{
+	// sha3_512(header .. nonce)
+	uint2 state[25];
+
+	state[0] = g_header[0];
+	state[1] = g_header[1];
+	state[2] = g_header[2];
+	state[3] = g_header[3];
+	state[4] = vectorize(nonce);
+	state[5] = vectorize(0x0000000000000001ULL);
+	for (uint32_t i = 6; i < 25; i++)
+	{
+		state[i] = make_uint2(0, 0);
+	}
+	state[8] = vectorize(0x8000000000000000ULL);
+	keccak_f1600_block(state,8);
+
+	// Threads work together in this phase in groups of 8.
+	const int thread_id  = threadIdx.x &  (THREADS_PER_HASH - 1);
+	const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
+	const int mix_idx    = thread_id & 3;
+
+	uint4 mix;
+	uint2 shuffle[8];
+
+	for (int i = 0; i < THREADS_PER_HASH; i++)
+	{
+		// share init among threads
+		for (int j = 0; j < 8; j++) {
+			shuffle[j].x = __shfl(state[j].x, start_lane + i);
+			shuffle[j].y = __shfl(state[j].y, start_lane + i);
+		}
+
+		// ugly but avoids local reads/writes
+		if (mix_idx < 2) {
+			if (mix_idx == 0)
+				mix = vectorize2(shuffle[0], shuffle[1]);
+			else
+				mix = vectorize2(shuffle[2], shuffle[3]);
+		}
+		else  {
+			if (mix_idx == 2)
+				mix = vectorize2(shuffle[4], shuffle[5]);
+			else
+				mix = vectorize2(shuffle[6], shuffle[7]);
+		}
+		
+		uint32_t init0 = __shfl(shuffle[0].x, start_lane);
+
+		for (uint32_t a = 0; a < ACCESSES; a += 4)
+		{
+			int t = ((a >> 2) & (THREADS_PER_HASH - 1));
+
+			for (uint32_t b = 0; b < 4; b++)
+			{
+				if (thread_id == t)
+				{	
+					shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
+				}
+				shuffle[0].x = __shfl(shuffle[0].x, start_lane + t);
+
+				mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]);
+			}
+		}
+
+		uint32_t thread_mix = fnv_reduce(mix);
+
+		// update mix accross threads
+		shuffle[0].x = __shfl(thread_mix, start_lane + 0);
+		shuffle[0].y = __shfl(thread_mix, start_lane + 1);
+		shuffle[1].x = __shfl(thread_mix, start_lane + 2);
+		shuffle[1].y = __shfl(thread_mix, start_lane + 3);
+		shuffle[2].x = __shfl(thread_mix, start_lane + 4);
+		shuffle[2].y = __shfl(thread_mix, start_lane + 5);
+		shuffle[3].x = __shfl(thread_mix, start_lane + 6);
+		shuffle[3].y = __shfl(thread_mix, start_lane + 7);
+
+		if (i == thread_id) {
+			//move mix into state:
+			state[8] = shuffle[0];
+			state[9] = shuffle[1];
+			state[10] = shuffle[2];
+			state[11] = shuffle[3];
+		}
+	}
+
+	// keccak_256(keccak_512(header..nonce) .. mix);
+	state[12] = vectorize(0x0000000000000001ULL);
+	for (uint32_t i = 13; i < 25; i++)
+	{
+		state[i] = vectorize(0ULL);
+	}
+	state[16] = vectorize(0x8000000000000000);
+	keccak_f1600_block(state, 1);
+
+	return devectorize(state[0]);
+}
--- a/libethash-cu/ethash_cu_miner.cpp
+++ b/libethash-cu/ethash_cu_miner.cpp
@ -0,0 +1,281 @@
+/*
+  This file is part of c-ethash.
+
+  c-ethash is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  c-ethash is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with cpp-ethereum.  If not, see <http://www.gnu.org/licenses/>.
+*/
+/** @file ethash_cu_miner.cpp
+* @author Tim Hughes <tim@twistedfury.com>
+* @date 2015
+*/
+
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <assert.h>
+#include <queue>
+#include <random>
+#include <vector>
+#include <chrono>
+#include <thread>
+#include <libethash/util.h>
+#include <libethash/ethash.h>
+#include "ethash_cu_miner.h"
+#include "ethash_cu_miner_kernel_globals.h"
+
+
+#define ETHASH_BYTES 32
+
+// workaround lame platforms
+#if !CL_VERSION_1_2
+#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE
+#define CL_MEM_HOST_READ_ONLY 0
+#endif
+
+#undef min
+#undef max
+
+using namespace std;
+
+unsigned const ethash_cu_miner::c_defaultLocalWorkSize = 128;
+unsigned const ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier = 2048; // * CL_DEFAULT_LOCAL_WORK_SIZE
+
+ethash_cu_miner::search_hook::~search_hook() {}
+
+ethash_cu_miner::ethash_cu_miner()
+{
+}
+
+std::string ethash_cu_miner::platform_info(unsigned _deviceId)
+{
+	int runtime_version;
+	int device_count;
+	
+	device_count = get_num_devices();
+
+	if (device_count == 0)
+		return std::string();
+
+	if (cudaRuntimeGetVersion(&runtime_version) == cudaErrorInvalidValue)
+	{
+		cout << cudaGetErrorString(cudaErrorInvalidValue) << endl;
+		return std::string();
+	}
+
+	// use selected default device
+	int device_num = std::min<int>((int)_deviceId, device_count - 1);
+
+	cudaDeviceProp device_props;
+	if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
+	{
+		cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
+		return std::string();
+	}
+
+	char platform[5];
+	int version_major = runtime_version / 1000;
+	int version_minor = (runtime_version - (version_major * 1000)) / 10;
+	sprintf(platform, "%d.%d", version_major, version_minor);
+
+
+	char compute[5];
+	sprintf(compute, "%d.%d", device_props.major, device_props.minor);
+
+	return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + device_props.name + "\", \"version\": \"Compute " + std::string(compute) + "\" }";
+
+}
+
+int ethash_cu_miner::get_num_devices()
+{
+	int device_count;
+
+	if (cudaGetDeviceCount(&device_count) == cudaErrorNoDevice)
+	{
+		cout << cudaGetErrorString(cudaErrorNoDevice) << endl;
+		return 0;
+	}
+	return device_count;
+}
+
+void ethash_cu_miner::finish()
+{
+	for (unsigned i = 0; i != m_num_buffers; i++) {
+		cudaStreamDestroy(m_streams[i]);
+		m_streams[i] = 0;
+	}
+	cudaDeviceReset();
+}
+
+bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers, unsigned search_batch_size, unsigned workgroup_size, unsigned _deviceId, bool highcpu)
+{
+	
+	int device_count = get_num_devices();
+
+	if (device_count == 0)
+		return false;
+
+	// use selected device
+	int device_num = std::min<int>((int)_deviceId, device_count - 1);
+	
+	cudaDeviceProp device_props;
+	if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
+	{
+		cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
+		return false;
+	}
+
+	cout << "Using device: " << device_props.name << "(" << device_props.major << "." << device_props.minor << ")" << endl;
+
+	cudaError_t r = cudaSetDevice(device_num);
+	if (r != cudaSuccess)
+	{
+		cout << cudaGetErrorString(r) << endl;
+		return false;
+	}
+	cudaDeviceReset();
+	cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
+
+	m_num_buffers = num_buffers;
+	m_search_batch_size = search_batch_size;
+
+	m_hash_buf	 = new void *[m_num_buffers];
+	m_search_buf = new uint32_t *[m_num_buffers];
+	m_streams    = new cudaStream_t[m_num_buffers];
+
+	// use requested workgroup size, but we require multiple of 8
+	m_workgroup_size = ((workgroup_size + 7) / 8) * 8;
+
+	m_highcpu = highcpu;
+
+	// patch source code
+	cudaError result;
+
+	uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
+	unsigned max_outputs = c_max_search_results;
+
+	result = set_constants(&dagSize128, &max_outputs);
+
+	// create buffer for dag
+	result = cudaMalloc(&m_dag_ptr, _dagSize);
+
+	// create buffer for header256
+	result = cudaMalloc(&m_header, 32);
+
+	// copy dag to CPU.
+    result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice);
+	
+	// create mining buffers
+	for (unsigned i = 0; i != m_num_buffers; ++i)
+	{		
+		result = cudaMallocHost(&m_hash_buf[i], 32 * c_hash_batch_size);
+		result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t));
+		result = cudaStreamCreate(&m_streams[i]);
+	}
+	if (result != cudaSuccess)
+	{
+		cout << cudaGetErrorString(result) << endl;
+		return false;
+	}
+	return true;
+}
+
+/**
+ * Prevent High CPU usage while waiting for an async task
+ */
+static unsigned waitStream(cudaStream_t stream)
+{
+	unsigned wait_ms = 0;
+	while (cudaStreamQuery(stream) == cudaErrorNotReady) {
+		this_thread::sleep_for(chrono::milliseconds(10));
+		wait_ms += 10;
+	}
+	return wait_ms;
+}
+
+void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
+{
+	struct pending_batch
+	{
+		uint64_t start_nonce;
+		unsigned buf;
+	};
+	std::queue<pending_batch> pending;
+
+	static uint32_t const c_zero = 0;
+
+	// update header constant buffer
+	cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
+	for (unsigned i = 0; i != m_num_buffers; ++i)
+	{
+		cudaMemcpy(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice);
+	}
+	cudaError err = cudaGetLastError();
+	if (cudaSuccess != err)
+	{
+		throw std::runtime_error(cudaGetErrorString(err));
+	}
+
+	unsigned buf = 0;
+	std::random_device engine;
+	uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
+	for (;; start_nonce += m_search_batch_size)
+	{
+		run_ethash_search(m_search_batch_size / m_workgroup_size, m_workgroup_size, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);	
+		
+		pending.push({ start_nonce, buf });
+		buf = (buf + 1) % m_num_buffers;
+
+		// read results
+		if (pending.size() == m_num_buffers)
+		{
+			pending_batch const& batch = pending.front();
+
+			uint32_t results[1 + c_max_search_results];
+
+			if (!m_highcpu)
+				waitStream(m_streams[buf]); // 28ms
+			cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyHostToHost, m_streams[batch.buf]);
+
+			unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
+			uint64_t nonces[c_max_search_results];
+			for (unsigned i = 0; i != num_found; ++i)
+			{
+				nonces[i] = batch.start_nonce + results[i + 1];
+				//cout << results[i + 1] << ", ";
+			}
+			//if (num_found > 0)
+			//	cout << endl;
+			
+			bool exit = num_found && hook.found(nonces, num_found);
+			exit |= hook.searched(batch.start_nonce, m_search_batch_size); // always report searched before exit
+			if (exit)
+				break;
+
+			start_nonce += m_search_batch_size;
+			// reset search buffer if we're still going
+			if (num_found)
+				cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]);
+
+			cudaError err = cudaGetLastError();
+			if (cudaSuccess != err)
+			{
+				throw std::runtime_error(cudaGetErrorString(err));
+			}
+			pending.pop();
+		}
+	}	
+}
+
--- a/libethash-cu/ethash_cu_miner.h
+++ b/libethash-cu/ethash_cu_miner.h
@ -0,0 +1,56 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <time.h>
+#include <functional>
+#include <libethash/ethash.h>
+#include "ethash_cu_miner_kernel.h"
+
+class ethash_cu_miner
+{
+public:
+	struct search_hook
+	{
+		virtual ~search_hook(); // always a virtual destructor for a class with virtuals.
+
+		// reports progress, return true to abort
+		virtual bool found(uint64_t const* nonces, uint32_t count) = 0;
+		virtual bool searched(uint64_t start_nonce, uint32_t count) = 0;
+	};
+
+public:
+	ethash_cu_miner();
+
+	bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers = 2, unsigned search_batch_size = 262144, unsigned workgroup_size = 64, unsigned _deviceId = 0, bool highcpu = false);
+	static std::string platform_info(unsigned _deviceId = 0);
+	static int get_num_devices();
+
+
+	void finish();
+	void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
+	void search(uint8_t const* header, uint64_t target, search_hook& hook);
+
+	/* -- default values -- */
+	/// Default value of the local work size. Also known as workgroup size.
+	static unsigned const c_defaultLocalWorkSize;
+	/// Default value of the global work size as a multiplier of the local work size
+	static unsigned const c_defaultGlobalWorkSizeMultiplier;
+
+private:
+	enum { c_max_search_results = 63, c_hash_batch_size = 1024 };
+	
+	bool	 m_highcpu;
+	unsigned m_num_buffers;
+	unsigned m_search_batch_size;
+	unsigned m_workgroup_size;
+
+	hash128_t * m_dag_ptr;
+	hash32_t * m_header;
+
+	void ** m_hash_buf;
+	uint32_t ** m_search_buf;
+	cudaStream_t  * m_streams;
+
+	
+};
--- a/libethash-cu/ethash_cu_miner_kernel.cu
+++ b/libethash-cu/ethash_cu_miner_kernel.cu
@ -0,0 +1,72 @@
+/*
+* Genoil's CUDA mining kernel for Ethereum
+* based on Tim Hughes' opencl kernel.
+* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer.
+*/
+
+#include "ethash_cu_miner_kernel.h"
+#include "ethash_cu_miner_kernel_globals.h"
+#include "cuda_helper.h"
+
+#define SHUFFLE_MIN_VER 350
+#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
+#include "dagger_shuffled.cuh"
+#else
+#include "dagger_shared.cuh"
+#endif
+
+__global__ void 
+__launch_bounds__(128, 7)
+ethash_search(
+	uint32_t* g_output,
+	hash32_t const* g_header,
+	hash128_t const* g_dag,
+	uint64_t start_nonce,
+	uint64_t target
+	)
+{
+	
+	uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;	
+	
+#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
+	uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
+	if (cuda_swab64(hash) < target)
+#else
+	hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid);	
+	if (cuda_swab64(hash.uint64s[0]) < target)
+#endif
+	{
+		atomicInc(g_output, d_max_outputs);
+		g_output[g_output[0]] = gid;
+	}
+	
+}
+
+void run_ethash_search(
+	uint32_t blocks,
+	uint32_t threads,
+	cudaStream_t stream,
+	uint32_t* g_output,
+	hash32_t const* g_header,
+	hash128_t const* g_dag,
+	uint64_t start_nonce,
+	uint64_t target
+)
+{
+#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
+	ethash_search <<<blocks, threads, 0, stream >>>(g_output, g_header, g_dag, start_nonce, target);
+#else
+	ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target);
+#endif
+}
+
+cudaError set_constants(
+	uint32_t * dag_size,
+	uint32_t * max_outputs
+	)
+{
+	cudaError result;
+	result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t));
+	result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t));
+	return result;
+}
--- a/libethash-cu/ethash_cu_miner_kernel.h
+++ b/libethash-cu/ethash_cu_miner_kernel.h
@ -0,0 +1,60 @@
+#ifndef _ETHASH_CU_MINER_KERNEL_H_
+#define _ETHASH_CU_MINER_KERNEL_H_
+
+#include <stdint.h>
+
+
+typedef union
+{
+	uint64_t uint64s[16 / sizeof(uint64_t)];
+	uint32_t uint32s[16 / sizeof(uint32_t)];
+} hash16_t;
+
+typedef union
+{
+	uint32_t uint32s[32 / sizeof(uint32_t)];
+	uint64_t uint64s[32 / sizeof(uint64_t)];
+	uint2 uint2s[32 / sizeof(uint2)];
+} hash32_t;
+
+
+typedef union
+{
+	uint32_t uint32s[64 / sizeof(uint32_t)];
+	uint64_t uint64s[64 / sizeof(uint64_t)];
+	uint4	 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+
+typedef union
+{
+	uint32_t uint32s[128 / sizeof(uint32_t)];
+	uint4	 uint4s[128 / sizeof(uint4)];
+} hash128_t;
+
+//typedef uint32_t hash128_t;
+
+cudaError set_constants(
+	uint32_t * dag_size,
+	uint32_t * max_outputs
+);
+
+void run_ethash_hash(
+	hash32_t* g_hashes,
+	hash32_t const* g_header,
+	hash128_t const* g_dag,
+	uint64_t start_nonce
+);
+
+void run_ethash_search(
+	uint32_t search_batch_size,
+	uint32_t workgroup_size,
+	cudaStream_t stream,
+	uint32_t* g_output,
+	hash32_t const* g_header,
+	hash128_t const* g_dag,
+	uint64_t start_nonce,
+	uint64_t target
+);
+
+#endif
--- a/libethash-cu/ethash_cu_miner_kernel_globals.h
+++ b/libethash-cu/ethash_cu_miner_kernel_globals.h
@ -0,0 +1,9 @@
+#ifndef _ETHASH_CU_MINER_KERNEL_GLOBALS_H_
+#define _ETHASH_CU_MINER_KERNEL_GLOBALS_H_
+
+//#include "cuda_helper.h"
+
+__constant__ uint32_t d_dag_size;
+__constant__ uint32_t d_max_outputs;
+
+#endif
--- a/libethash-cu/keccak.cuh
+++ b/libethash-cu/keccak.cuh
@ -0,0 +1,89 @@
+#include "cuda_helper.h"
+
+__device__ __constant__ uint64_t const keccak_round_constants[24] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
+	0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
+	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
+	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
+	0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
+	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+	0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
+	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
+
+__device__ __forceinline__ void keccak_f1600_block(uint2* s, uint32_t out_size)
+{
+	uint2 t[5], u, v;
+
+#pragma unroll 3
+	for (int i = 0; i < 24; i++)
+	{
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+		u = t[4] ^ ROL2(t[1], 1);
+		s[0] ^= u; s[5] ^= u; s[10] ^= u; s[15] ^= u; s[20] ^= u;
+		u = t[0] ^ ROL2(t[2], 1);
+		s[1] ^= u; s[6] ^= u; s[11] ^= u; s[16] ^= u; s[21] ^= u;
+		u = t[1] ^ ROL2(t[3], 1);
+		s[2] ^= u; s[7] ^= u; s[12] ^= u; s[17] ^= u; s[22] ^= u;
+		u = t[2] ^ ROL2(t[4], 1);
+		s[3] ^= u; s[8] ^= u; s[13] ^= u; s[18] ^= u; s[23] ^= u;
+		u = t[3] ^ ROL2(t[0], 1);
+		s[4] ^= u; s[9] ^= u; s[14] ^= u; s[19] ^= u; s[24] ^= u;
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		u = s[1];
+
+		s[1] = ROL2(s[6], 44);
+		s[6] = ROL2(s[9], 20);
+		s[9] = ROL2(s[22], 61);
+		s[22] = ROL2(s[14], 39);
+		s[14] = ROL2(s[20], 18);
+		s[20] = ROL2(s[2], 62);
+		s[2] = ROL2(s[12], 43);
+		s[12] = ROL2(s[13], 25);
+		s[13] = ROL2(s[19], 8);
+		s[19] = ROL2(s[23], 56);
+		s[23] = ROL2(s[15], 41);
+		s[15] = ROL2(s[4], 27);
+		s[4] = ROL2(s[24], 14);
+		s[24] = ROL2(s[21], 2);
+		s[21] = ROL2(s[8], 55);
+		s[8] = ROL2(s[16], 45);
+		s[16] = ROL2(s[5], 36);
+		s[5] = ROL2(s[3], 28);
+		s[3] = ROL2(s[18], 21);
+		s[18] = ROL2(s[17], 15);
+		s[17] = ROL2(s[11], 10);
+		s[11] = ROL2(s[7], 6);
+		s[7] = ROL2(s[10], 3);
+		s[10] = ROL2(u, 1);
+
+		// squeeze this in here
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+		u = s[0]; v = s[1]; s[0] ^= (~v) & s[2];
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= vectorize(keccak_round_constants[i]);
+		if (i == 23 && out_size == 1) return;
+
+		// continue chi
+		s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & u; s[4] ^= (~u) & v;
+		u = s[5]; v = s[6]; s[5] ^= (~v) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9];
+
+		if (i == 23) return;
+		s[8] ^= (~s[9]) & u; s[9] ^= (~u) & v;
+		u = s[10]; v = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ u, s[13], s[14]); s[14] = bitselect(s[14] ^ v, s[14], u);
+		u = s[15]; v = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ u, s[18], s[19]); s[19] = bitselect(s[19] ^ v, s[19], u);
+		u = s[20]; v = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ u, s[23], s[24]); s[24] = bitselect(s[24] ^ v, s[24], u);
+	}
+}
--- a/libethcore/CMakeLists.txt
+++ b/libethcore/CMakeLists.txt
@ -21,6 +21,9 @@ target_link_libraries(${EXECUTABLE} evmcore)
 if (ETHASHCL)
 	target_link_libraries(${EXECUTABLE} ethash-cl)
 endif ()
+if (ETHASHCU)
+	target_link_libraries(${EXECUTABLE} ethash-cu)
+endif ()
 target_link_libraries(${EXECUTABLE} devcrypto)
 if (CPUID_FOUND)
 	target_link_libraries(${EXECUTABLE} ${CPUID_LIBRARIES})