end of day commit..nothing works yet

10 years ago · 30bfe53bad
15 changed files with 1989 additions and 5 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -49,6 +49,7 @@ set(D_TOOLS ON)
 set(D_TESTS ON)
 set(D_FATDB ON)
 set(D_ETHASHCL ON)
 set(D_ETHASHCU OFF)
 set(D_EVMJIT ON)
 set(D_JSCONSOLE ON)
 set(D_JSONRPC ON)
@ -126,6 +127,20 @@ elseif (BUNDLE STREQUAL "miner")
 	set(D_JSONRPC ON)
 	set(D_JSCONSOLE OFF)
 	set(D_EVMJIT OFF)
 elseif (BUNDLE STREQUAL "cudaminer")
 	set(D_SERPENT OFF)
 	set(D_USENPM OFF)
 	set(D_GUI OFF)
 	set(D_TOOLS OFF)
 	set(D_TESTS OFF)
 	set(D_ETHKEY OFF)
 	set(D_MINER ON)
 	set(D_ETHASHCL ON)
 	set(D_ETHASHCU ON) 
 	set(D_FATDB OFF)
 	set(D_JSONRPC ON)
 	set(D_JSCONSOLE OFF)
 	set(D_EVMJIT OFF)
 elseif (BUNDLE STREQUAL "release")      # release builds
 	set(D_SERPENT ${DECENT_PLATFORM})
 	set(D_USENPM OFF)
@ -158,6 +173,10 @@ function(configureProject)
 		add_definitions(-DETH_ETHASHCL)
 	endif()
 	if (ETHASHCU)
 		add_definitions(-DETH_ETHASHCU)
 	endif()
 	if (EVMJIT)
 		add_definitions(-DETH_EVMJIT)
 	endif()
@ -291,6 +310,7 @@ eth_format_option(ROCKSDB)
 eth_format_option(TOOLS)
 eth_format_option(ETHKEY)
 eth_format_option(ETHASHCL)
 eth_format_option(ETHASHCU)
 eth_format_option(JSCONSOLE)
 eth_format_option(OLYMPIC)
 eth_format_option(SERPENT)
@ -341,6 +361,7 @@ message("-- SERPENT          Build Serpent language components        ${SERPENT}
 message("-- GUI              Build GUI components                     ${GUI}")
 message("-- TESTS            Build tests                              ${TESTS}")
 message("-- ETHASHCL         Build OpenCL components                  ${ETHASHCL}")
 message("-- ETHASHCU         Build CUDA components                    ${ETHASHCU}")
 message("-- JSCONSOLE        Build with javascript console            ${JSCONSOLE}")
 message("-- EVMJIT           Build LLVM-based JIT EVM                 ${EVMJIT}")
 message("------------------------------------------------------------------------")
@ -429,6 +450,9 @@ if (GENERAL OR MINER)
 	if (ETHASHCL)
 		add_subdirectory(libethash-cl)
 	endif ()
 	if (ETHASHCU)
 		add_subdirectory(libethash-cu)
 	endif ()
 endif ()
 add_subdirectory(libethcore)
--- a/cmake/EthDependencies.cmake
+++ b/cmake/EthDependencies.cmake
@ -128,6 +128,12 @@ if (OpenCL_FOUND)
 	message(" - opencl lib   : ${OpenCL_LIBRARIES}")
 endif()
 find_package (CUDA)
 if (CUDA_FOUND)
 	message(" - CUDA header: ${CUDA_INCLUDE_DIRS}")
 	message(" - CUDA lib   : ${CUDA_LIBRARIES}")
 endif()
 # find location of jsonrpcstub
 find_program(ETH_JSON_RPC_STUB jsonrpcstub)
 message(" - jsonrpcstub location    : ${ETH_JSON_RPC_STUB}")
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@ -39,12 +39,16 @@
 #include <libdevcore/SHA3.h>
 #include <libdevcore/CommonJS.h>
 #include <libethcore/EthashAux.h>
 #include <libethcore/EthashCUDAMiner.h>
 #include <libethcore/EthashGPUMiner.h>
 #include <libethcore/EthashCPUMiner.h>
 #include <libethcore/Farm.h>
 #if ETH_ETHASHCL || !ETH_TRUE
 #include <libethash-cl/ethash_cl_miner.h>
 #endif
 #if ETH_ETHASHCU || !ETH_TRUE
 #include <libethash-cu/ethash_cu_miner.h>
 #endif
 #if ETH_JSONRPC || !ETH_TRUE
 #include <libweb3jsonrpc/WebThreeStubServer.h>
 #include <jsonrpccpp/server/connectors/httpserver.h>
@ -140,8 +144,8 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
-#if ETH_ETHASHCL || !ETH_TRUE
+#if ETH_ETHASHCL || ETH_ETHASHCU || !ETH_TRUE
-		else if (arg == "--cl-global-work" && i + 1 < argc)
+		else if (arg == "--gpu-global-work" && i + 1 < argc)
 			try {
 				m_globalWorkSizeMultiplier = stol(argv[++i]);
 			}
@ -150,7 +154,7 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
-		else if (arg == "--cl-local-work" && i + 1 < argc)
+		else if (arg == "--gpu-local-work" && i + 1 < argc)
 			try {
 				m_localWorkSize = stol(argv[++i]);
 			}
@ -219,6 +223,8 @@ public:
 			m_minerType = MinerType::CPU;
 		else if (arg == "-G" || arg == "--opencl")
 			m_minerType = MinerType::GPU;
 		else if (arg == "-U" || arg == "--cuda")
 			m_minerType = MinerType::CUDA;
 		else if (arg == "--current-block" && i + 1 < argc)
 			m_currentBlock = stol(argv[++i]);
 		else if (arg == "--no-precompute")
@ -289,6 +295,21 @@ public:
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
 		}
 		else if (arg == "--cuda-devices") {
 			// Collect the run of numeric arguments that follows the option (at most 16 device ids).
 			while (m_cudaDeviceCount < 16 && i + 1 < argc)
 			{
 				try {
 					// Convert before touching i or the counter: if the next argument is not a
 					// number it must stay unconsumed so the next option is still parsed.
 					long const device = stol(argv[i + 1]);
 					m_cudaDevices[m_cudaDeviceCount++] = device;
 					++i;
 				}
 				catch (...)
 				{
 					// Not a device id - the list ends here.
 					break;
 				}
 			}
 		}
 		else if (arg == "--cuda-high-cpu") {
 			m_cudaHighCPULoad = true;
 		}
 		else
 			return false;
 		return true;
@ -377,7 +398,8 @@ public:
 	enum class MinerType
 	{
 		CPU,
-		GPU
+		GPU,
 		CUDA
 	};
 	MinerType minerType() const { return m_minerType; }
@ -476,6 +498,9 @@ private:
 		sealers["cpu"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashCPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCPUMiner(ci); }};
 #if ETH_ETHASHCL
 		sealers["opencl"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashGPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashGPUMiner(ci); }};
 #endif
 #if ETH_ETHASHCU
 		sealers["cuda"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{ &EthashCUDAMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCUDAMiner(ci); } };
 #endif
 		(void)_m;
 		(void)_remote;
@ -491,7 +516,8 @@ private:
 			f.start("cpu");
 		else if (_m == MinerType::GPU)
 			f.start("opencl");
-
+		else if (_m == MinerType::CUDA)
 			f.start("cuda");
 		EthashProofOfWork::WorkPackage current;
 		EthashAux::FullType dag;
 		while (true)
@ -589,6 +615,13 @@ private:
 	unsigned m_globalWorkSizeMultiplier = ethash_cl_miner::c_defaultGlobalWorkSizeMultiplier;
 	unsigned m_localWorkSize = ethash_cl_miner::c_defaultLocalWorkSize;
 	unsigned m_msPerBatch = ethash_cl_miner::c_defaultMSPerBatch;
 #endif
 #if ETH_ETHASHCU || !ETH_TRUE
 #if !ETH_ETHASHCL
 	// Work-size settings shared by both GPU back-ends. The OpenCL section directly above
 	// declares the same two members, so only declare them here when OpenCL is disabled;
 	// otherwise a build with both back-ends enabled would redeclare them.
 	// NOTE(review): assumes the section above is guarded by ETH_ETHASHCL - its #if is outside this hunk, confirm.
 	unsigned m_globalWorkSizeMultiplier = ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier;
 	unsigned m_localWorkSize = ethash_cu_miner::c_defaultLocalWorkSize;
 #endif
 	// Number of valid entries in m_cudaDevices, filled by the --cuda-devices option.
 	unsigned m_cudaDeviceCount = 0;
 	// Device ids given on the command line (capacity matches the limit enforced while parsing).
 	unsigned m_cudaDevices[16];
 	// Set by --cuda-high-cpu.
 	bool	 m_cudaHighCPULoad = false;
 #endif
 	uint64_t m_currentBlock = 0;
 	// default value is 350MB of GPU memory for other stuff (windows system rendering, e.t.c.)
--- a/libethash-cu/CMakeLists.txt
+++ b/libethash-cu/CMakeLists.txt
@ -0,0 +1,29 @@
 # Static library with the CUDA implementation of the ethash miner.
 set(EXECUTABLE ethash-cu)
 find_package(CUDA REQUIRED)
 # Every C++/CUDA source and header in this directory is part of the library.
 file(GLOB SRC_LIST "*.cpp" "*.cu")
 file(GLOB HEADERS "*.h" "*.cuh")
 # nvcc flags used for every configuration.
 list(APPEND CUDA_NVCC_FLAGS
 	--std=c++11
 	--disable-warnings
 	--ptxas-options=-v
 	-use_fast_math
 	-lineinfo)
 # Per-configuration nvcc flags.
 list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
 list(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
 # Target architectures: the single one requested through COMPUTE, or the built-in default set.
 if(COMPUTE)
 	list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
 else()
 	list(APPEND CUDA_NVCC_FLAGS
 		-gencode arch=compute_20,code=sm_20
 		-gencode arch=compute_30,code=sm_30
 		-gencode arch=compute_32,code=sm_32
 		-gencode arch=compute_35,code=sm_35
 		-gencode arch=compute_50,code=sm_50
 		-gencode arch=compute_52,code=sm_52)
 endif()
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 include_directories(${CUDA_INCLUDE_DIRS})
 include_directories(..)
 # cuda_add_library uses the plain target_link_libraries signature, so no PRIVATE/PUBLIC keyword here.
 cuda_add_library(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
 target_link_libraries(${EXECUTABLE} ${CUDA_LIBRARIES} ethash)
 install(
 	TARGETS ${EXECUTABLE}
 	RUNTIME DESTINATION bin
 	ARCHIVE DESTINATION lib
 	LIBRARY DESTINATION lib)
 install(FILES ${HEADERS} DESTINATION include/${EXECUTABLE})
--- a/libethash-cu/cuda_helper.h
+++ b/libethash-cu/cuda_helper.h
--- a/libethash-cu/dagger.cuh
+++ b/libethash-cu/dagger.cuh
@ -0,0 +1,22 @@
 // Element-wise copy of `count` items from src to dst. Expands to a bare for-loop that
 // declares its own index `i`; `count` is not parenthesised, so pass a simple expression.
 #define copy(dst, src, count) for (uint32_t i = 0; i < count; i++) { (dst)[i] = (src)[i]; }
 // Upper bound of the DAG access counter `a` in the mixing loops.
 #define ACCESSES 64
 // Number of threads that cooperate on one hash (evaluates to 8).
 #define THREADS_PER_HASH (128 / 16)
 // Multiplier used by the fnv() mixing step (the 32-bit FNV prime).
 #define FNV_PRIME	0x01000193
 // One mixing step: multiply x by FNV_PRIME, then XOR with y.
 #define fnv(x,y) ((x) * FNV_PRIME ^(y))
 // Applies the fnv() mixing step independently to each of the four 32-bit lanes of a and b.
 __device__ uint4 fnv4(uint4 a, uint4 b)
 {
 	return make_uint4(
 		fnv(a.x, b.x),
 		fnv(a.y, b.y),
 		fnv(a.z, b.z),
 		fnv(a.w, b.w));
 }
 // Folds the four lanes of v into a single 32-bit word by chaining fnv() from x through w.
 __device__ uint32_t fnv_reduce(uint4 v)
 {
 	uint32_t folded = fnv(v.x, v.y);
 	folded = fnv(folded, v.z);
 	folded = fnv(folded, v.w);
 	return folded;
 }
--- a/libethash-cu/dagger_shared.cuh
+++ b/libethash-cu/dagger_shared.cuh
@ -0,0 +1,136 @@
 #include "ethash_cu_miner_kernel_globals.h"
 #include "ethash_cu_miner_kernel.h"
 #include "keccak.cuh"
 #include "dagger.cuh"
 // Per-hash scratch slot in shared memory. The same storage is viewed either as the
 // 64-byte init hash being handed to the group or as the 32-byte mix being collected from it.
 typedef union
 {
 	hash64_t init;
 	hash32_t mix;
 } compute_hash_share;
 // Produces the 64-byte seed for one nonce: sha3_512(header .. nonce).
 __device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce)
 {
 	uint64_t sponge[25];
 	// Words 0-3 carry the header, word 4 the nonce.
 	copy(sponge, header->uint64s, 4);
 	sponge[4] = nonce;
 	// Clear everything after the message, then drop in the two non-zero padding words.
 	for (uint32_t w = 5; w < 25; ++w)
 	{
 		sponge[w] = 0;
 	}
 	sponge[5] = 0x0000000000000001;
 	sponge[8] = 0x8000000000000000;
 	keccak_f1600_block((uint2 *)sponge, 8);
 	// The first eight words of the permuted state are the result.
 	hash64_t seed;
 	copy(seed.uint64s, sponge, 8);
 	return seed;
 }
 // Runs the DAG mixing rounds for one thread of a THREADS_PER_HASH group.
 //   mix       - this thread's 16-byte slice of the starting mix
 //   thread_id - index of the thread inside its group
 //   share     - one 32-bit word visible to the whole group; used to publish the DAG index
 //   g_dag     - the DAG, addressed as 128-byte entries
 // Returns the thread's slice reduced to a single 32-bit word.
 __device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag)
 {
 	// share init0
 	if (thread_id == 0)
 		*share = mix.x;
 	uint32_t init0 = *share;
 	uint32_t a = 0;
 	do
 	{
 		// For each block of four accesses exactly one thread of the group picks the DAG entries.
 		bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));
 		//#pragma unroll 4
 		for (uint32_t i = 0; i < 4; i++)
 		{
 			if (update_share)
 			{
 				uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w };
 				// Next DAG entry index, derived from the seed word, the access counter and one mix lane.
 				*share = fnv(init0 ^ (a + i), m[i]) % d_dag_size;
 			}
 			// NOTE(review): this is a memory fence, not a barrier - the code appears to rely on the
 			// group's threads running in lockstep so the write above is seen before the read below; confirm.
 			__threadfence_block();
 			// Every thread mixes in its own 16-byte column of the selected DAG entry.
 #if __CUDA_ARCH__ >= 350
 			mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id]));
 #else
 			mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
 #endif
 		}
 	} while ((a += 4) != ACCESSES);
 	return fnv_reduce(mix);
 }
 // Final digest: keccak_256(keccak_512(header..nonce) .. mix).
 __device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix)
 {
 	uint64_t sponge[25];
 	// Message: the 8 seed words followed by the 4 mix words.
 	copy(sponge, init->uint64s, 8);
 	copy(sponge + 8, mix->uint64s, 4);
 	// Zero the remainder, then place the two non-zero padding words.
 	for (uint32_t w = 12; w < 25; ++w)
 	{
 		sponge[w] = 0;
 	}
 	sponge[12] = 0x0000000000000001;
 	sponge[16] = 0x8000000000000000;
 	keccak_f1600_block((uint2 *)sponge, 1);
 	// Copy the leading four words out as the result.
 	hash32_t digest;
 	copy(digest.uint64s, sponge, 4);
 	return digest;
 }
 // Computes the ethash result for one nonce using a shared-memory slot per group of
 // THREADS_PER_HASH threads. Every thread of a group calls this with its own nonce; the
 // group then works through its members' hashes one at a time.
 //   g_header - 32-byte block header
 //   g_dag    - the DAG, addressed as 128-byte entries
 //   nonce    - nonce evaluated by the calling thread
 __device__ hash32_t compute_hash(
 	hash32_t const* g_header,
 	hash128_t const* g_dag,
 	uint64_t nonce
 	)
 {
 	// Dynamically sized shared array, indexed below by hash_id (one slot per thread group).
 	extern __shared__  compute_hash_share share[];
 	// Compute one init hash per work item.
 	hash64_t init = init_hash(g_header, nonce);
 	// Threads work together in this phase in groups of 8.
 	uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
 	uint32_t const hash_id = threadIdx.x >> 3;
 	hash32_t mix;
 	// Round i processes the hash that belongs to the group's i-th thread.
 	for (int i = 0; i < THREADS_PER_HASH; i++)
 	{
 		// share init with other threads
 		if (i == thread_id)
 			share[hash_id].init = init;
 		// Each thread takes one 16-byte quarter of the shared init as its starting mix.
 		uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3];
 		uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag);
 		// Collect the eight per-thread words into the slot's 32-byte mix view.
 		share[hash_id].mix.uint32s[thread_id] = thread_mix;
 		// The owner of this round keeps the assembled mix for its final hash.
 		if (i == thread_id)
 			mix = share[hash_id].mix;
 	}
 	return final_hash(&init, &mix);
 }
--- a/libethash-cu/dagger_shuffled.cuh
+++ b/libethash-cu/dagger_shuffled.cuh
@ -0,0 +1,107 @@
 #include "ethash_cu_miner_kernel_globals.h"
 #include "ethash_cu_miner_kernel.h"
 #include "keccak.cuh"
 #include "dagger.cuh"
 // Computes the ethash result for one nonce, exchanging data between the THREADS_PER_HASH
 // threads of a group with __shfl instead of shared memory.
 //   g_header - block header viewed as four uint2 words
 //   g_dag    - the DAG, addressed as 128-byte entries
 //   nonce    - nonce evaluated by the calling thread
 // Returns the first 64-bit word of the final keccak state.
 // NOTE(review): vectorize/vectorize2/devectorize are not defined in this file - presumably
 // they come from cuda_helper.h; confirm.
 __device__ uint64_t compute_hash_shuffle(
 	uint2 const* g_header,
 	hash128_t const* g_dag,
 	uint64_t nonce
 	)
 {
 	// sha3_512(header .. nonce)
 	uint2 state[25];
 	state[0] = g_header[0];
 	state[1] = g_header[1];
 	state[2] = g_header[2];
 	state[3] = g_header[3];
 	state[4] = vectorize(nonce);
 	state[5] = vectorize(0x0000000000000001ULL);
 	for (uint32_t i = 6; i < 25; i++)
 	{
 		state[i] = make_uint2(0, 0);
 	}
 	state[8] = vectorize(0x8000000000000000ULL);
 	keccak_f1600_block(state,8);
 	// Threads work together in this phase in groups of 8.
 	const int thread_id  = threadIdx.x &  (THREADS_PER_HASH - 1);
 	// First thread index of this thread's group.
 	const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
 	// Which 16-byte quarter of the 64-byte init this thread starts from.
 	const int mix_idx    = thread_id & 3;
 	uint4 mix;
 	uint2 shuffle[8];
 	// Round i processes the hash that belongs to the group's i-th thread.
 	for (int i = 0; i < THREADS_PER_HASH; i++)
 	{
 		// share init among threads
 		for (int j = 0; j < 8; j++) {
 			shuffle[j].x = __shfl(state[j].x, start_lane + i);
 			shuffle[j].y = __shfl(state[j].y, start_lane + i);
 		}
 		// ugly but avoids local reads/writes
 		if (mix_idx < 2) {
 			if (mix_idx == 0)
 				mix = vectorize2(shuffle[0], shuffle[1]);
 			else
 				mix = vectorize2(shuffle[2], shuffle[3]);
 		}
 		else  {
 			if (mix_idx == 2)
 				mix = vectorize2(shuffle[4], shuffle[5]);
 			else
 				mix = vectorize2(shuffle[6], shuffle[7]);
 		}
 		// First 32-bit word of the init being processed, taken from the group's first thread.
 		uint32_t init0 = __shfl(shuffle[0].x, start_lane);
 		for (uint32_t a = 0; a < ACCESSES; a += 4)
 		{
 			// Thread of the group that chooses the DAG entries for this block of four accesses.
 			int t = ((a >> 2) & (THREADS_PER_HASH - 1));
 			for (uint32_t b = 0; b < 4; b++)
 			{
 				if (thread_id == t)
 				{	
 					shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
 				}
 				// Hand the chosen index to the whole group, then mix in this thread's 16-byte column of that entry.
 				shuffle[0].x = __shfl(shuffle[0].x, start_lane + t);
 				mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]);
 			}
 		}
 		uint32_t thread_mix = fnv_reduce(mix);
 		// update mix across threads
 		shuffle[0].x = __shfl(thread_mix, start_lane + 0);
 		shuffle[0].y = __shfl(thread_mix, start_lane + 1);
 		shuffle[1].x = __shfl(thread_mix, start_lane + 2);
 		shuffle[1].y = __shfl(thread_mix, start_lane + 3);
 		shuffle[2].x = __shfl(thread_mix, start_lane + 4);
 		shuffle[2].y = __shfl(thread_mix, start_lane + 5);
 		shuffle[3].x = __shfl(thread_mix, start_lane + 6);
 		shuffle[3].y = __shfl(thread_mix, start_lane + 7);
 		// Only the owner of this round stores the assembled mix.
 		if (i == thread_id) {
 			//move mix into state:
 			state[8] = shuffle[0];
 			state[9] = shuffle[1];
 			state[10] = shuffle[2];
 			state[11] = shuffle[3];
 		}
 	}
 	// keccak_256(keccak_512(header..nonce) .. mix);
 	state[12] = vectorize(0x0000000000000001ULL);
 	for (uint32_t i = 13; i < 25; i++)
 	{
 		state[i] = vectorize(0ULL);
 	}
 	state[16] = vectorize(0x8000000000000000);
 	keccak_f1600_block(state, 1);
 	return devectorize(state[0]);
 }
--- a/libethash-cu/ethash_cu_miner.cpp
+++ b/libethash-cu/ethash_cu_miner.cpp
@ -0,0 +1,281 @@
 /*
  This file is part of c-ethash.
  c-ethash is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  c-ethash is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with cpp-ethereum.  If not, see <http://www.gnu.org/licenses/>.
 */
 /** @file ethash_cu_miner.cpp
 * @author Tim Hughes <tim@twistedfury.com>
 * @date 2015
 */
 #define _CRT_SECURE_NO_WARNINGS
 #include <cstdio>
 #include <cstdlib>
 #include <iostream>
 #include <assert.h>
 #include <queue>
 #include <random>
 #include <string>
 #include <vector>
 #include <chrono>
 #include <thread>
 #include <libethash/util.h>
 #include <libethash/ethash.h>
 #include "ethash_cu_miner.h"
 #include "ethash_cu_miner_kernel_globals.h"
 #define ETHASH_BYTES 32
 // workaround lame platforms
 #if !CL_VERSION_1_2
 #define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE
 #define CL_MEM_HOST_READ_ONLY 0
 #endif
 #undef min
 #undef max
 using namespace std;
 // Default local work size (workgroup size).
 unsigned const ethash_cu_miner::c_defaultLocalWorkSize = 128;
 // Default global work size, expressed as a multiple of the local work size.
 unsigned const ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier = 2048; // * local work size
 // Out-of-line definition of the hook's virtual destructor.
 ethash_cu_miner::search_hook::~search_hook() {}
 // Starts with every member in a defined "nothing allocated" state, so that finish() on an
 // instance whose init() never ran (or failed early) loops zero times instead of reading garbage.
 ethash_cu_miner::ethash_cu_miner()
 :	m_highcpu(false),
 	m_num_buffers(0),
 	m_search_batch_size(0),
 	m_workgroup_size(0),
 	m_dag_ptr(nullptr),
 	m_header(nullptr),
 	m_hash_buf(nullptr),
 	m_search_buf(nullptr),
 	m_streams(nullptr)
 {
 }
 std::string ethash_cu_miner::platform_info(unsigned _deviceId)
 {
 	int runtime_version;
 	int device_count;
 	device_count = get_num_devices();
 	if (device_count == 0)
 		return std::string();
 	if (cudaRuntimeGetVersion(&runtime_version) == cudaErrorInvalidValue)
 	{
 		cout << cudaGetErrorString(cudaErrorInvalidValue) << endl;
 		return std::string();
 	}
 	// use selected default device
 	int device_num = std::min<int>((int)_deviceId, device_count - 1);
 	cudaDeviceProp device_props;
 	if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
 	{
 		cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
 		return std::string();
 	}
 	char platform[5];
 	int version_major = runtime_version / 1000;
 	int version_minor = (runtime_version - (version_major * 1000)) / 10;
 	sprintf(platform, "%d.%d", version_major, version_minor);
 	char compute[5];
 	sprintf(compute, "%d.%d", device_props.major, device_props.minor);
 	return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + device_props.name + "\", \"version\": \"Compute " + std::string(compute) + "\" }";
 }
 int ethash_cu_miner::get_num_devices()
 {
 	int device_count;
 	if (cudaGetDeviceCount(&device_count) == cudaErrorNoDevice)
 	{
 		cout << cudaGetErrorString(cudaErrorNoDevice) << endl;
 		return 0;
 	}
 	return device_count;
 }
 // Destroys every stream created by init() and then resets the current device.
 void ethash_cu_miner::finish()
 {
 	unsigned idx = 0;
 	while (idx != m_num_buffers)
 	{
 		cudaStreamDestroy(m_streams[idx]);
 		m_streams[idx] = 0;
 		++idx;
 	}
 	cudaDeviceReset();
 }
 bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers, unsigned search_batch_size, unsigned workgroup_size, unsigned _deviceId, bool highcpu)
 {
 	int device_count = get_num_devices();
 	if (device_count == 0)
 		return false;
 	// use selected device
 	int device_num = std::min<int>((int)_deviceId, device_count - 1);
 	cudaDeviceProp device_props;
 	if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
 	{
 		cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
 		return false;
 	}
 	cout << "Using device: " << device_props.name << "(" << device_props.major << "." << device_props.minor << ")" << endl;
 	cudaError_t r = cudaSetDevice(device_num);
 	if (r != cudaSuccess)
 	{
 		cout << cudaGetErrorString(r) << endl;
 		return false;
 	}
 	cudaDeviceReset();
 	cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
 	m_num_buffers = num_buffers;
 	m_search_batch_size = search_batch_size;
 	m_hash_buf	 = new void *[m_num_buffers];
 	m_search_buf = new uint32_t *[m_num_buffers];
 	m_streams    = new cudaStream_t[m_num_buffers];
 	// use requested workgroup size, but we require multiple of 8
 	m_workgroup_size = ((workgroup_size + 7) / 8) * 8;
 	m_highcpu = highcpu;
 	// patch source code
 	cudaError result;
 	uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
 	unsigned max_outputs = c_max_search_results;
 	result = set_constants(&dagSize128, &max_outputs);
 	// create buffer for dag
 	result = cudaMalloc(&m_dag_ptr, _dagSize);
 	// create buffer for header256
 	result = cudaMalloc(&m_header, 32);
 	// copy dag to CPU.
    result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice);
 	// create mining buffers
 	for (unsigned i = 0; i != m_num_buffers; ++i)
 	{		
 		result = cudaMallocHost(&m_hash_buf[i], 32 * c_hash_batch_size);
 		result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t));
 		result = cudaStreamCreate(&m_streams[i]);
 	}
 	if (result != cudaSuccess)
 	{
 		cout << cudaGetErrorString(result) << endl;
 		return false;
 	}
 	return true;
 }
 /**
 * Polls a stream in 10 ms steps, sleeping in between, until it no longer reports
 * cudaErrorNotReady. Keeps CPU usage low while an async task is in flight.
 * Returns the total time slept, in milliseconds.
 */
 static unsigned waitStream(cudaStream_t stream)
 {
 	unsigned elapsed_ms = 0;
 	for (;;)
 	{
 		if (cudaStreamQuery(stream) != cudaErrorNotReady)
 			return elapsed_ms;
 		this_thread::sleep_for(chrono::milliseconds(10));
 		elapsed_ms += 10;
 	}
 }
 void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
 {
 	struct pending_batch
 	{
 		uint64_t start_nonce;
 		unsigned buf;
 	};
 	std::queue<pending_batch> pending;
 	static uint32_t const c_zero = 0;
 	// update header constant buffer
 	cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
 	for (unsigned i = 0; i != m_num_buffers; ++i)
 	{
 		cudaMemcpy(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice);
 	}
 	cudaError err = cudaGetLastError();
 	if (cudaSuccess != err)
 	{
 		throw std::runtime_error(cudaGetErrorString(err));
 	}
 	unsigned buf = 0;
 	std::random_device engine;
 	uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
 	for (;; start_nonce += m_search_batch_size)
 	{
 		run_ethash_search(m_search_batch_size / m_workgroup_size, m_workgroup_size, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);	
 		pending.push({ start_nonce, buf });
 		buf = (buf + 1) % m_num_buffers;
 		// read results
 		if (pending.size() == m_num_buffers)
 		{
 			pending_batch const& batch = pending.front();
 			uint32_t results[1 + c_max_search_results];
 			if (!m_highcpu)
 				waitStream(m_streams[buf]); // 28ms
 			cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyHostToHost, m_streams[batch.buf]);
 			unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
 			uint64_t nonces[c_max_search_results];
 			for (unsigned i = 0; i != num_found; ++i)
 			{
 				nonces[i] = batch.start_nonce + results[i + 1];
 				//cout << results[i + 1] << ", ";
 			}
 			//if (num_found > 0)
 			//	cout << endl;
 			bool exit = num_found && hook.found(nonces, num_found);
 			exit |= hook.searched(batch.start_nonce, m_search_batch_size); // always report searched before exit
 			if (exit)
 				break;
 			start_nonce += m_search_batch_size;
 			// reset search buffer if we're still going
 			if (num_found)
 				cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]);
 			cudaError err = cudaGetLastError();
 			if (cudaSuccess != err)
 			{
 				throw std::runtime_error(cudaGetErrorString(err));
 			}
 			pending.pop();
 		}
 	}	
 }
--- a/libethash-cu/ethash_cu_miner.h
+++ b/libethash-cu/ethash_cu_miner.h
@ -0,0 +1,56 @@
 #pragma once
 #include <cuda_runtime.h>
 #include <time.h>
 #include <functional>
 #include <libethash/ethash.h>
 #include "ethash_cu_miner_kernel.h"
 // Host-side driver for the CUDA ethash search: holds the device DAG and header buffers,
 // the per-stream result buffers and the streams used to launch search batches.
 class ethash_cu_miner
 {
 public:
 	// Callback interface through which search() reports results and progress.
 	struct search_hook
 	{
 		virtual ~search_hook(); // always a virtual destructor for a class with virtuals.
 		// reports progress, return true to abort
 		// found(): called with `count` nonces that passed the target check.
 		virtual bool found(uint64_t const* nonces, uint32_t count) = 0;
 		// searched(): called after a batch of `count` nonces starting at start_nonce was checked.
 		virtual bool searched(uint64_t start_nonce, uint32_t count) = 0;
 	};
 public:
 	ethash_cu_miner();
 	// Selects the device, uploads the DAG and allocates the search buffers; returns false on failure.
 	bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers = 2, unsigned search_batch_size = 262144, unsigned workgroup_size = 64, unsigned _deviceId = 0, bool highcpu = false);
 	// Describes the CUDA runtime and the given device as a JSON-style string (empty on failure).
 	static std::string platform_info(unsigned _deviceId = 0);
 	// Number of CUDA devices; 0 when none is reported.
 	static int get_num_devices();
 	// Destroys the streams and resets the device.
 	void finish();
 	// NOTE(review): no definition of hash() is visible alongside this declaration - confirm one exists before calling.
 	void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
 	// Runs the search loop until the hook requests an abort.
 	void search(uint8_t const* header, uint64_t target, search_hook& hook);
 	/* -- default values -- */
 	/// Default value of the local work size. Also known as workgroup size.
 	static unsigned const c_defaultLocalWorkSize;
 	/// Default value of the global work size as a multiplier of the local work size
 	static unsigned const c_defaultGlobalWorkSizeMultiplier;
 private:
 	// c_max_search_results: most nonces reported per batch; c_hash_batch_size: sizes the hash buffers.
 	enum { c_max_search_results = 63, c_hash_batch_size = 1024 };
 	// When true, search() skips the sleeping wait on the stream.
 	bool	 m_highcpu;
 	// Number of streams / buffers in the arrays below.
 	unsigned m_num_buffers;
 	// Nonces covered by one kernel launch.
 	unsigned m_search_batch_size;
 	// Threads per block used for the launch.
 	unsigned m_workgroup_size;
 	// Device copy of the DAG.
 	hash128_t * m_dag_ptr;
 	// Device copy of the 32-byte header.
 	hash32_t * m_header;
 	// Per-buffer pinned host allocations for hashes.
 	void ** m_hash_buf;
 	// Per-buffer pinned host allocations for search results (counter followed by nonce offsets).
 	uint32_t ** m_search_buf;
 	// One stream per buffer.
 	cudaStream_t  * m_streams;
 };
--- a/libethash-cu/ethash_cu_miner_kernel.cu
+++ b/libethash-cu/ethash_cu_miner_kernel.cu
@ -0,0 +1,72 @@
 /*
 * Genoil's CUDA mining kernel for Ethereum
 * based on Tim Hughes' opencl kernel.
 * thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer.
 */
 #include "ethash_cu_miner_kernel.h"
 #include "ethash_cu_miner_kernel_globals.h"
 #include "cuda_helper.h"
 #define SHUFFLE_MIN_VER 350
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
 #include "dagger_shuffled.cuh"
 #else
 #include "dagger_shared.cuh"
 #endif
 // Search kernel: every thread hashes the nonce start_nonce + gid and records gid in
 // g_output when the byte-swapped leading 64 bits of the hash are below target.
 //   g_output - g_output[0] is the hit counter, g_output[1..d_max_outputs] hold the recorded gids
 //   g_header - 32-byte block header
 //   g_dag    - the DAG, addressed as 128-byte entries
 __global__ void
 __launch_bounds__(128, 7)
 ethash_search(
 	uint32_t* g_output,
 	hash32_t const* g_header,
 	hash128_t const* g_dag,
 	uint64_t start_nonce,
 	uint64_t target
 	)
 {
 	uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
 	uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid);
 	if (cuda_swab64(hash) < target)
 #else
 	hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid);
 	if (cuda_swab64(hash.uint64s[0]) < target)
 #endif
 	{
 		// atomicInc returns the counter value from before the increment, which gives this
 		// thread a slot of its own. Re-reading g_output[0] afterwards would race with other
 		// threads and, once the counter has wrapped to 0, overwrite the counter itself.
 		// The clamp keeps the slot inside [1, d_max_outputs].
 		uint32_t const slot = min(atomicInc(g_output, d_max_outputs) + 1, d_max_outputs);
 		g_output[slot] = gid;
 	}
 }
 // Host-side launcher for ethash_search.
 //   blocks, threads - grid and block dimensions of the launch
 //   stream          - stream the kernel is queued on
 //   remaining arguments are forwarded to the kernel unchanged
 void run_ethash_search(
 	uint32_t blocks,
 	uint32_t threads,
 	cudaStream_t stream,
 	uint32_t* g_output,
 	hash32_t const* g_header,
 	hash128_t const* g_dag,
 	uint64_t start_nonce,
 	uint64_t target
 )
 {
 	// NOTE(review): __CUDA_ARCH__ is only defined while compiling device code, so the host
 	// code that actually performs the launch presumably always takes the #else branch and
 	// requests dynamic shared memory; the #if still keeps compute_hash_share out of passes
 	// where dagger_shared.cuh is not included. Confirm before simplifying.
 #if __CUDA_ARCH__ >= SHUFFLE_MIN_VER
 	ethash_search <<<blocks, threads, 0, stream >>>(g_output, g_header, g_dag, start_nonce, target);
 #else
 	// One compute_hash_share slot per group of THREADS_PER_HASH threads.
 	ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream>>>(g_output, g_header, g_dag, start_nonce, target);
 #endif
 }
 // Uploads the two kernel constants d_dag_size and d_max_outputs.
 // Returns the first error encountered, or cudaSuccess when both copies succeed.
 cudaError set_constants(
 	uint32_t * dag_size,
 	uint32_t * max_outputs
 	)
 {
 	cudaError result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t));
 	// Report a failed first copy instead of letting the second call's status overwrite it.
 	if (result != cudaSuccess)
 		return result;
 	return cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t));
 }
--- a/libethash-cu/ethash_cu_miner_kernel.h
+++ b/libethash-cu/ethash_cu_miner_kernel.h
@ -0,0 +1,60 @@
 #ifndef _ETHASH_CU_MINER_KERNEL_H_
 #define _ETHASH_CU_MINER_KERNEL_H_
 #include <stdint.h>
 // 16-byte value, viewable as 64-bit or 32-bit words.
 typedef union
 {
 	uint64_t uint64s[16 / sizeof(uint64_t)];
 	uint32_t uint32s[16 / sizeof(uint32_t)];
 } hash16_t;
 // 32-byte value (header, mix, final hash), viewable as 32-bit words, 64-bit words or uint2 pairs.
 typedef union
 {
 	uint32_t uint32s[32 / sizeof(uint32_t)];
 	uint64_t uint64s[32 / sizeof(uint64_t)];
 	uint2 uint2s[32 / sizeof(uint2)];
 } hash32_t;
 // 64-byte value (init hash), viewable as 32-bit words, 64-bit words or four uint4 quarters.
 typedef union
 {
 	uint32_t uint32s[64 / sizeof(uint32_t)];
 	uint64_t uint64s[64 / sizeof(uint64_t)];
 	uint4	 uint4s[64 / sizeof(uint4)];
 } hash64_t;
 // 128-byte DAG entry, viewable as 32-bit words or eight uint4 columns.
 typedef union
 {
 	uint32_t uint32s[128 / sizeof(uint32_t)];
 	uint4	 uint4s[128 / sizeof(uint4)];
 } hash128_t;
 //typedef uint32_t hash128_t;
 // Uploads the DAG size and the result limit to the kernel's constant memory.
 cudaError set_constants(
 	uint32_t * dag_size,
 	uint32_t * max_outputs
 );
 // NOTE(review): declared here, but no definition is visible alongside the search kernel -
 // confirm it exists before calling.
 void run_ethash_hash(
 	hash32_t* g_hashes,
 	hash32_t const* g_header,
 	hash128_t const* g_dag,
 	uint64_t start_nonce
 );
 // Launches the search kernel on `stream`.
 //   blocks   - number of thread blocks in the grid
 //   threads  - threads per block
 //   g_output - result buffer: counter word followed by the recorded offsets
 //   g_header - 32-byte block header
 //   g_dag    - the DAG, addressed as 128-byte entries
 //   start_nonce, target - first nonce of the batch and the value hashes are compared against
 // The first two parameters are named after what the definition uses them for (grid and
 // block dimensions); the caller passes batch_size / workgroup_size and workgroup_size.
 void run_ethash_search(
 	uint32_t blocks,
 	uint32_t threads,
 	cudaStream_t stream,
 	uint32_t* g_output,
 	hash32_t const* g_header,
 	hash128_t const* g_dag,
 	uint64_t start_nonce,
 	uint64_t target
 );
 #endif
--- a/libethash-cu/ethash_cu_miner_kernel_globals.h
+++ b/libethash-cu/ethash_cu_miner_kernel_globals.h
@ -0,0 +1,9 @@
 #ifndef _ETHASH_CU_MINER_KERNEL_GLOBALS_H_
 #define _ETHASH_CU_MINER_KERNEL_GLOBALS_H_
 //#include "cuda_helper.h"
 // Number of 128-byte DAG entries; DAG indices are reduced modulo this value.
 // NOTE(review): these are definitions in a header, so each translation unit that includes
 // it presumably gets its own copy - confirm only the kernel's copy is ever written.
 __constant__ uint32_t d_dag_size;
 // Limit passed to atomicInc for the result counter in the search kernel.
 __constant__ uint32_t d_max_outputs;
 #endif
--- a/libethash-cu/keccak.cuh
+++ b/libethash-cu/keccak.cuh
@ -0,0 +1,89 @@
 #include "cuda_helper.h"
 // Round constants XORed into state word 0 during the iota step, one per round (24 rounds).
 __device__ __constant__ uint64_t const keccak_round_constants[24] = {
 	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
 	0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
 	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
 	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
 	0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
 	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
 	0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
 	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
 };
 // Bitwise select: each result bit comes from b where the matching bit of c is set, otherwise from a.
 #define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
 // Applies the 24-round Keccak-f[1600] permutation in place to the 25-word state `s`
 // (each 64-bit word held as a uint2).
 //   out_size - number of leading output words the caller needs. In the last round the
 //              function returns early: with out_size == 1 only s[0] is final, otherwise
 //              s[0]..s[7] are final; the remaining words are left mid-round.
 // NOTE(review): ROL2 and vectorize, and the ^ / & / ~ operators on uint2, are not defined
 // in this file - presumably they come from cuda_helper.h; confirm.
 __device__ __forceinline__ void keccak_f1600_block(uint2* s, uint32_t out_size)
 {
 	uint2 t[5], u, v;
 #pragma unroll 3
 	for (int i = 0; i < 24; i++)
 	{
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
 		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
 		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
 		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
 		u = t[4] ^ ROL2(t[1], 1);
 		s[0] ^= u; s[5] ^= u; s[10] ^= u; s[15] ^= u; s[20] ^= u;
 		u = t[0] ^ ROL2(t[2], 1);
 		s[1] ^= u; s[6] ^= u; s[11] ^= u; s[16] ^= u; s[21] ^= u;
 		u = t[1] ^ ROL2(t[3], 1);
 		s[2] ^= u; s[7] ^= u; s[12] ^= u; s[17] ^= u; s[22] ^= u;
 		u = t[2] ^ ROL2(t[4], 1);
 		s[3] ^= u; s[8] ^= u; s[13] ^= u; s[18] ^= u; s[23] ^= u;
 		u = t[3] ^ ROL2(t[0], 1);
 		s[4] ^= u; s[9] ^= u; s[14] ^= u; s[19] ^= u; s[24] ^= u;
 		/* rho pi: b[..] = rotl(a[..], ..) */
 		// Done in place as one chain: s[1] is saved in u and written back last.
 		u = s[1];
 		s[1] = ROL2(s[6], 44);
 		s[6] = ROL2(s[9], 20);
 		s[9] = ROL2(s[22], 61);
 		s[22] = ROL2(s[14], 39);
 		s[14] = ROL2(s[20], 18);
 		s[20] = ROL2(s[2], 62);
 		s[2] = ROL2(s[12], 43);
 		s[12] = ROL2(s[13], 25);
 		s[13] = ROL2(s[19], 8);
 		s[19] = ROL2(s[23], 56);
 		s[23] = ROL2(s[15], 41);
 		s[15] = ROL2(s[4], 27);
 		s[4] = ROL2(s[24], 14);
 		s[24] = ROL2(s[21], 2);
 		s[21] = ROL2(s[8], 55);
 		s[8] = ROL2(s[16], 45);
 		s[16] = ROL2(s[5], 36);
 		s[5] = ROL2(s[3], 28);
 		s[3] = ROL2(s[18], 21);
 		s[18] = ROL2(s[17], 15);
 		s[17] = ROL2(s[11], 10);
 		s[11] = ROL2(s[7], 6);
 		s[7] = ROL2(s[10], 3);
 		s[10] = ROL2(u, 1);
 		// squeeze this in here
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		u = s[0]; v = s[1]; s[0] ^= (~v) & s[2];
 		/* iota: a[0,0] ^= round constant */
 		s[0] ^= vectorize(keccak_round_constants[i]);
 		// Last round, single-word output: s[0] is complete, skip the rest.
 		if (i == 23 && out_size == 1) return;
 		// continue chi
 		s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & u; s[4] ^= (~u) & v;
 		u = s[5]; v = s[6]; s[5] ^= (~v) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9];
 		// Last round: s[0]..s[7] are complete, which is all any caller reads.
 		if (i == 23) return;
 		s[8] ^= (~s[9]) & u; s[9] ^= (~u) & v;
 		// Remaining three rows of chi, expressed through bitselect.
 		u = s[10]; v = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ u, s[13], s[14]); s[14] = bitselect(s[14] ^ v, s[14], u);
 		u = s[15]; v = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ u, s[18], s[19]); s[19] = bitselect(s[19] ^ v, s[19], u);
 		u = s[20]; v = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ u, s[23], s[24]); s[24] = bitselect(s[24] ^ v, s[24], u);
 	}
 }
--- a/libethcore/CMakeLists.txt
+++ b/libethcore/CMakeLists.txt
@ -21,6 +21,9 @@ target_link_libraries(${EXECUTABLE} evmcore)
 if (ETHASHCL)
 	target_link_libraries(${EXECUTABLE} ethash-cl)
 endif ()
 if (ETHASHCU)
 	target_link_libraries(${EXECUTABLE} ethash-cu)
 endif ()
 target_link_libraries(${EXECUTABLE} devcrypto)
 if (CPUID_FOUND)
 	target_link_libraries(${EXECUTABLE} ${CPUID_LIBRARIES})