diff --git a/CMakeLists.txt b/CMakeLists.txt index 29e954031..eeec392c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,7 @@ set(D_TOOLS ON) set(D_TESTS ON) set(D_FATDB ON) set(D_ETHASHCL ON) +set(D_ETHASHCU OFF) set(D_EVMJIT ON) set(D_JSCONSOLE ON) set(D_JSONRPC ON) @@ -126,6 +127,20 @@ elseif (BUNDLE STREQUAL "miner") set(D_JSONRPC ON) set(D_JSCONSOLE OFF) set(D_EVMJIT OFF) +elseif (BUNDLE STREQUAL "cudaminer") + set(D_SERPENT OFF) + set(D_USENPM OFF) + set(D_GUI OFF) + set(D_TOOLS OFF) + set(D_TESTS OFF) + set(D_ETHKEY OFF) + set(D_MINER ON) + set(D_ETHASHCL ON) + set(D_ETHASHCU ON) + set(D_FATDB OFF) + set(D_JSONRPC ON) + set(D_JSCONSOLE OFF) + set(D_EVMJIT OFF) elseif (BUNDLE STREQUAL "release") # release builds set(D_SERPENT ${DECENT_PLATFORM}) set(D_USENPM OFF) @@ -158,6 +173,10 @@ function(configureProject) add_definitions(-DETH_ETHASHCL) endif() + if (ETHASHCU) + add_definitions(-DETH_ETHASHCU) + endif() + if (EVMJIT) add_definitions(-DETH_EVMJIT) endif() @@ -291,6 +310,7 @@ eth_format_option(ROCKSDB) eth_format_option(TOOLS) eth_format_option(ETHKEY) eth_format_option(ETHASHCL) +eth_format_option(ETHASHCU) eth_format_option(JSCONSOLE) eth_format_option(OLYMPIC) eth_format_option(SERPENT) @@ -341,6 +361,7 @@ message("-- SERPENT Build Serpent language components ${SERPENT} message("-- GUI Build GUI components ${GUI}") message("-- TESTS Build tests ${TESTS}") message("-- ETHASHCL Build OpenCL components ${ETHASHCL}") +message("-- ETHASHCU Build CUDA components ${ETHASHCU}") message("-- JSCONSOLE Build with javascript console ${JSCONSOLE}") message("-- EVMJIT Build LLVM-based JIT EVM ${EVMJIT}") message("------------------------------------------------------------------------") @@ -429,6 +450,9 @@ if (GENERAL OR MINER) if (ETHASHCL) add_subdirectory(libethash-cl) endif () + if (ETHASHCU) + add_subdirectory(libethash-cu) + endif () endif () add_subdirectory(libethcore) diff --git a/cmake/EthDependencies.cmake b/cmake/EthDependencies.cmake index 389664c99..f5fdae74f 100644 --- a/cmake/EthDependencies.cmake +++ b/cmake/EthDependencies.cmake @@ -128,6 +128,12 @@ if (OpenCL_FOUND) message(" - opencl lib : ${OpenCL_LIBRARIES}") endif() +find_package (CUDA) +if (CUDA_FOUND) + message(" - CUDA header: ${CUDA_INCLUDE_DIRS}") + message(" - CUDA lib : ${CUDA_LIBRARIES}") +endif() + # find location of jsonrpcstub find_program(ETH_JSON_RPC_STUB jsonrpcstub) message(" - jsonrpcstub location : ${ETH_JSON_RPC_STUB}") diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h index 84cd69c2c..7486d64f8 100644 --- a/ethminer/MinerAux.h +++ b/ethminer/MinerAux.h @@ -39,12 +39,16 @@ #include #include #include +#include #include #include #include #if ETH_ETHASHCL || !ETH_TRUE #include #endif +#if ETH_ETHASHCU || !ETH_TRUE +#include +#endif #if ETH_JSONRPC || !ETH_TRUE #include #include @@ -140,8 +144,8 @@ public: cerr << "Bad " << arg << " option: " << argv[i] << endl; BOOST_THROW_EXCEPTION(BadArgument()); } -#if ETH_ETHASHCL || !ETH_TRUE - else if (arg == "--cl-global-work" && i + 1 < argc) +#if ETH_ETHASHCL || ETH_ETHASHCU || !ETH_TRUE + else if (arg == "--gpu-global-work" && i + 1 < argc) try { m_globalWorkSizeMultiplier = stol(argv[++i]); } @@ -150,7 +154,7 @@ public: cerr << "Bad " << arg << " option: " << argv[i] << endl; BOOST_THROW_EXCEPTION(BadArgument()); } - else if (arg == "--cl-local-work" && i + 1 < argc) + else if (arg == "--gpu-local-work" && i + 1 < argc) try { m_localWorkSize = stol(argv[++i]); } @@ -219,6 +223,8 @@ public: m_minerType = MinerType::CPU; else if (arg == "-G" || 
arg == "--opencl") m_minerType = MinerType::GPU; + else if (arg == "-U" || arg == "--cuda") + m_minerType = MinerType::CUDA; else if (arg == "--current-block" && i + 1 < argc) m_currentBlock = stol(argv[++i]); else if (arg == "--no-precompute") @@ -289,6 +295,21 @@ public: BOOST_THROW_EXCEPTION(BadArgument()); } } + else if (arg == "--cuda-devices") { + while (m_cudaDeviceCount < 16 && i + 1 < argc) + { + try { + m_cudaDevices[m_cudaDeviceCount++] = stol(argv[++i]); + } + catch (...) + { + break; + } + } + } + else if (arg == "--cuda-high-cpu") { + m_cudaHighCPULoad = true; + } else return false; return true; @@ -377,7 +398,8 @@ public: enum class MinerType { CPU, - GPU + GPU, + CUDA }; MinerType minerType() const { return m_minerType; } @@ -476,6 +498,9 @@ private: sealers["cpu"] = GenericFarm::SealerDescriptor{&EthashCPUMiner::instances, [](GenericMiner::ConstructionInfo ci){ return new EthashCPUMiner(ci); }}; #if ETH_ETHASHCL sealers["opencl"] = GenericFarm::SealerDescriptor{&EthashGPUMiner::instances, [](GenericMiner::ConstructionInfo ci){ return new EthashGPUMiner(ci); }}; +#endif +#if ETH_ETHASHCU + sealers["cuda"] = GenericFarm::SealerDescriptor{ &EthashCUDAMiner::instances, [](GenericMiner::ConstructionInfo ci){ return new EthashCUDAMiner(ci); } }; #endif (void)_m; (void)_remote; @@ -491,7 +516,8 @@ private: f.start("cpu"); else if (_m == MinerType::GPU) f.start("opencl"); - + else if (_m == MinerType::CUDA) + f.start("cuda"); EthashProofOfWork::WorkPackage current; EthashAux::FullType dag; while (true) @@ -589,6 +615,13 @@ private: unsigned m_globalWorkSizeMultiplier = ethash_cl_miner::c_defaultGlobalWorkSizeMultiplier; unsigned m_localWorkSize = ethash_cl_miner::c_defaultLocalWorkSize; unsigned m_msPerBatch = ethash_cl_miner::c_defaultMSPerBatch; +#endif +#if ETH_ETHASHCU || !ETH_TRUE + unsigned m_globalWorkSizeMultiplier = ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier; + unsigned m_localWorkSize = ethash_cu_miner::c_defaultLocalWorkSize; + unsigned m_cudaDeviceCount = 0; + unsigned m_cudaDevices[16]; + bool m_cudaHighCPULoad = false; #endif uint64_t m_currentBlock = 0; // default value is 350MB of GPU memory for other stuff (windows system rendering, e.t.c.) diff --git a/libethash-cu/CMakeLists.txt b/libethash-cu/CMakeLists.txt new file mode 100644 index 000000000..f5a682b12 --- /dev/null +++ b/libethash-cu/CMakeLists.txt @@ -0,0 +1,29 @@ +set(EXECUTABLE ethash-cu) + +FIND_PACKAGE(CUDA REQUIRED) + +file(GLOB SRC_LIST "*.cpp" "*.cu") +file(GLOB HEADERS "*.h" "*.cuh") + +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo) + +LIST(APPEND CUDA_NVCC_FLAGS_RELEASE -O3) +LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G) + +if(COMPUTE) + LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) +else(COMPUTE) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_32,code=sm_32;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52) +endif(COMPUTE) + + + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(..) 
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS}) +TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} ethash) + +install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib ) +install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} ) + diff --git a/libethash-cu/cuda_helper.h b/libethash-cu/cuda_helper.h new file mode 100644 index 000000000..1b75d2ffd --- /dev/null +++ b/libethash-cu/cuda_helper.h @@ -0,0 +1,1057 @@ +#ifndef CUDA_HELPER_H +#define CUDA_HELPER_H + +#include +#include + +#ifdef __INTELLISENSE__ +/* reduce vstudio warnings (__byteperm, blockIdx...) */ +#include +#include +#define __launch_bounds__(max_tpb, min_blocks) +#define asm("a" : "=l"(result) : "l"(a)) + +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +void __threadfence_block(void); + +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +#endif + +#include + +#ifndef MAX_GPUS +#define MAX_GPUS 32 +#endif + +extern "C" int device_map[MAX_GPUS]; +extern "C" long device_sm[MAX_GPUS]; +extern cudaStream_t gpustream[MAX_GPUS]; + +// common functions +extern void cuda_check_cpu_init(int thr_id, uint32_t threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern void cuda_check_cpu_setTarget_mod(const void *ptarget, const void *ptarget2); +extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); +extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint32_t foundnonce); +extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + + +#ifndef SPH_C32 +#define SPH_C32(x) ((x ## U)) +// #define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) ((x ## ULL)) +// #define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#ifndef SPH_T32 +#define SPH_T32(x) (x) +// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif +#ifndef SPH_T64 +#define SPH_T64(x) (x) +// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#endif + +#define ROTL32c(x, n) ((x) << (n)) | ((x) >> (32 - (n))) + +#if __CUDA_ARCH__ < 320 +// Kepler (Compute 3.0) +#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#else +// Kepler (Compute 3.5, 5.0) +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return(__funnelshift_l((x), (x), (n))); +} +#endif +#if __CUDA_ARCH__ < 320 +// Kepler (Compute 3.0) +#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return(__funnelshift_r((x), (x), (n))); +} +#endif + + + + +__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) +{ + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(LO), "r"(HI)); + return result; +} + +__device__ __forceinline__ uint64_t 
REPLACE_HIWORD(const uint64_t x, const uint32_t y) +{ + uint64_t result; + asm( + "{\n\t" + ".reg .u32 t,t2; \n\t" + "mov.b64 {t2,t},%1; \n\t" + "mov.b64 %0,{t2,%2}; \n\t" + "}" : "=l"(result) : "l"(x), "r"(y) + ); + return result; + +} +__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t x, const uint32_t y) +{ + uint64_t result; + asm( + "{\n\t" + ".reg .u32 t,t2; \n\t" + "mov.b64 {t2,t},%1; \n\t" + "mov.b64 %0,{%2,t}; \n\t" + "}" : "=l"(result) : "l"(x) , "r"(y) + ); + return result; +} + +// Endian Drehung für 32 Bit Typen +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else + /* host */ + #define cuda_swab32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#endif + + +static __device__ __forceinline__ uint32_t _HIWORD(const uint64_t x) +{ + uint32_t result; + asm( + "{\n\t" + ".reg .u32 xl; \n\t" + "mov.b64 {xl,%0},%1; \n\t" + "}" : "=r"(result) : "l"(x) + ); + return result; +} + +static __device__ __forceinline__ uint32_t _LOWORD(const uint64_t x) +{ + uint32_t result; + asm( + "{\n\t" + ".reg .u32 xh; \n\t" + "mov.b64 {%0,xh},%1; \n\t" + "}" : "=r"(result) : "l"(x) + ); + return result; +} + +// Input: 77665544 33221100 +// Output: 00112233 44556677 +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint64_t cuda_swab64(const uint64_t x) +{ + uint64_t result; + uint2 t; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(t.x), "=r"(t.y) : "l"(x)); + t.x=__byte_perm(t.x, 0, 0x0123); + t.y=__byte_perm(t.y, 0, 0x0123); + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(t.y), "r"(t.x)); + return result; +} +#else + /* host */ + #define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +#endif + +/*********************************************************************/ +// Macros to catch CUDA errors in CUDA runtime calls + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET(call) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return; \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET_X(call, ret) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return ret; \ + } \ +} while (0) + +/*********************************************************************/ +#ifdef _WIN64 +#define USE_XOR_ASM_OPTS 0 +#else +#define USE_XOR_ASM_OPTS 1 +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor1(const uint64_t a, const uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + return result; +} +#else +#define xor1(a,b) (a ^ b) +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor3(const uint64_t a, 
const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3;\n\t" + "xor.b64 %0, %0, %1;\n\t" + /* output : input registers */ + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} +#else +#define xor3(a,b,c) (a ^ b ^ c) +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t d, const uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} +#else +#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h)) +#endif + +// device asm for x17 +__device__ __forceinline__ +uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 n;\n\t" + "xor.b64 %0, %2, %3;\n\t" + "and.b64 n, %0, %1;\n\t" + "xor.b64 %0, n, %3;" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t andor(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m ;\n\t" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t shr_t64(uint64_t x, uint32_t n) +{ + uint64_t result; + asm("shr.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); + return result; +} + +// device asm for ? 
+__device__ __forceinline__ +uint64_t shl_t64(uint64_t x, uint32_t n) +{ + uint64_t result; + asm("shl.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); + return result; +} + +#ifndef USE_ROT_ASM_OPT +#define USE_ROT_ASM_OPT 1 +#endif + +// 64-bit ROTATE RIGHT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) { + uint2 result; + if(offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shr.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shl.b64 %0, %1, roff;\n\t" + "add.u64 %0, %0, lhs;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#else +/* host */ +#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif + +// 64-bit ROTATE LEFT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shl.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shr.b64 %0, %1, roff;\n\t" + "add.u64 %0, lhs, %0;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3 +__device__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t res; + asm("{\n\t" + ".reg .u32 tl,th,vl,vh;\n\t" + ".reg .pred p;\n\t" + "mov.b64 {tl,th}, %1;\n\t" + "shf.l.wrap.b32 vl, tl, th, %2;\n\t" + "shf.l.wrap.b32 vh, th, tl, %2;\n\t" + "setp.lt.u32 p, %2, 32;\n\t" + "@!p mov.b64 %0, {vl,vh};\n\t" + "@p mov.b64 %0, 
{vh,vl};\n\t" + "}" + : "=l"(res) : "l"(x) , "r"(offset) + ); + return res; +} +#else +/* host */ +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +__device__ __forceinline__ +uint64_t SWAPDWORDS(uint64_t value) +{ +#if __CUDA_ARCH__ >= 320 + uint2 temp; + asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); + return value; +#else + return ROTL64(value, 32); +#endif +} + +/* lyra2 - int2 operators */ + +__device__ __forceinline__ +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) { + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(lo), "=r"(hi) : "l"(x)); +} + +__device__ __forceinline__ uint64_t devectorize(uint2 x) +{ + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(x.x), "r"(x.y)); + return result; +} + + +__device__ __forceinline__ uint2 vectorize(const uint64_t x) +{ + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(x)); + return result; +} +__device__ __forceinline__ void devectorize2(uint4 inn, uint2 &x, uint2 &y) +{ + x.x = inn.x; + x.y = inn.y; + y.x = inn.z; + y.y = inn.w; +} + + +__device__ __forceinline__ uint4 vectorize2(uint2 x, uint2 y) +{ + uint4 result; + result.x = x.x; + result.y = x.y; + result.z = y.x; + result.w = y.y; + + return result; +} + +__device__ __forceinline__ uint4 vectorize2(uint2 x) +{ + uint4 result; + result.x = x.x; + result.y = x.y; + result.z = x.x; + result.w = x.y; + return result; +} + + +__device__ __forceinline__ uint4 vectorize4(uint64_t x, uint64_t y) +{ + uint4 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(x)); + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.z), "=r"(result.w) : "l"(y)); + return result; +} +__device__ __forceinline__ void devectorize4(uint4 inn, uint64_t &x, uint64_t &y) +{ + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(x) : "r"(inn.x), "r"(inn.y)); + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(y) : "r"(inn.z), "r"(inn.w)); +} + + + +static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { + uint2 result; + result.x = v; + result.y = 0; + return result; +} +static __device__ __forceinline__ uint2 vectorizehigh(uint32_t v) { + uint2 result; + result.x = 0; + result.y = v; + return result; +} + +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint32_t b) { return make_uint2(a.x^ b, a.y); } +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } +static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } +static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} + + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint32_t b) 
+{ + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} + + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + + + +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __device__ __forceinline__ uint4 operator& (uint4 a, uint4 b) { return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); } +static __device__ __forceinline__ uint4 operator| (uint4 a, uint4 b) { return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); } +static __device__ __forceinline__ uint4 operator~ (uint4 a) { return make_uint4(~a.x, ~a.y, ~a.z, ~a.w); } +static __device__ __forceinline__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint2 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.x, a.w ^ b.y); } + + +static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; } + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "mul.lo.u32 %0,%2,%4; \n\t" + "mul.hi.u32 %1,%2,%4; \n\t" + "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" + "madc.lo.u32 %1,%3,%5,%1; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +// uint2 method +#if __CUDA_ARCH__ >= 350 +__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; + if (offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__device__ __inline__ uint2 ROR2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y >> (n)) | (v.x << (32 - n))); + result.x = ((v.x >> (n)) | (v.y << (32 - n))); + } + else + { + result.y = ((v.x >> (n - 32)) | (v.y << (64 - n))); + result.x = ((v.y >> (n - 32)) | (v.x << (64 - n))); + } + return result; +} +#endif + + + +__device__ __inline__ uint32_t ROL8(const uint32_t x) +{ + return __byte_perm(x, x, 0x2103); +} +__device__ __inline__ uint32_t ROL16(const uint32_t x) +{ + return __byte_perm(x, x, 0x1032); +} +__device__ __inline__ uint32_t ROL24(const uint32_t x) +{ + return __byte_perm(x, x, 0x0321); +} + +__device__ __inline__ uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + + return result; +} + +__device__ __inline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + + return result; +} + +__device__ __inline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + + return result; +} + +__device__ 
__inline__ uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + + return result; +} + +__device__ __inline__ uint2 ROL16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +__device__ __inline__ uint2 ROL24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x0765); + + return result; +} + + + +#if __CUDA_ARCH__ >= 350 +__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { + uint2 result; + if (offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__inline__ __device__ uint2 ROL2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else + { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + + } + return result; +} +#endif + +__device__ __forceinline__ +uint64_t ROTR16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x)); + return x; +#else + return ROTR64(x, 16); +#endif +} +__device__ __forceinline__ +uint64_t ROTL16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z)); + return x; +#else + return ROTL64(x, 16); +#endif +} + +__device__ __forceinline__ +uint2 SWAPINT2(uint2 x) +{ + return(make_uint2(x.y, x.x)); +} +__device__ __forceinline__ bool cuda_hashisbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target) +{ + if (hash[7] > target[7]) + return false; + if (hash[7] < target[7]) + return true; + if (hash[6] > target[6]) + return false; + if (hash[6] < target[6]) + return true; + if (hash[5] > target[5]) + return false; + if (hash[5] < target[5]) + return true; + if (hash[4] > target[4]) + return false; + if (hash[4] < target[4]) + return true; + if (hash[3] > target[3]) + return false; + if (hash[3] < target[3]) + return true; + if (hash[2] > target[2]) + return false; + if (hash[2] < target[2]) + return true; + if (hash[1] > target[1]) + return false; + if (hash[1] < target[1]) + return true; + if (hash[0] > target[0]) + return false; + return true; +} + +__device__ __forceinline__ +uint2 SWAPDWORDS2(uint2 value) +{ + return make_uint2(value.y, value.x); +} +__device__ __forceinline__ +uint4 SWAPDWORDS2(uint4 value) +{ + return make_uint4(value.y, value.x, value.w ,value.z); +} + +static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset) +{ +#if __CUDA_ARCH__ > 300 + uint2 result; + if (offset<32) + { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : 
"r"(a.x), "r"(a.y), "r"(offset)); + } + else { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; +#else + if (offset<=32) + { + a.y = (a.y << offset) | (a.x >> (32 - offset)); + a.x = (a.x << offset); + } + else + { + a.y = (a.x << (offset-32)); + a.x = 0; + } + return a; +#endif +} +static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset) +{ + #if __CUDA_ARCH__ > 300 + uint2 result; + if (offset<32) { + asm("{\n\t" + "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" + "shr.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + else { + asm("{\n\t" + "shf.l.clamp.b32 %0,%2,%3,%4; \n\t" + "shl.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; + #else + if (offset<=32) + { + a.x = (a.x >> offset) | (a.y << (32 - offset)); + a.y = (a.y >> offset); + } + else + { + a.x = (a.y >> (offset - 32)); + a.y = 0; + } + return a; + #endif +} + +static __device__ __forceinline__ uint64_t devectorizeswap(uint2 v) { return MAKE_ULONGLONG(cuda_swab32(v.y), cuda_swab32(v.x)); } +static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { + uint2 result; + LOHI(result.y, result.x, v); + result.x = cuda_swab32(result.x); + result.y = cuda_swab32(result.y); + return result; +} + + +__device__ __forceinline__ uint32_t devectorize16(ushort2 x) +{ + uint32_t result; + asm("mov.b32 %0,{%1,%2}; \n\t" + : "=r"(result) : "h"(x.x) , "h"(x.y)); + return result; +} + + +__device__ __forceinline__ ushort2 vectorize16(uint32_t x) +{ + ushort2 result; + asm("mov.b32 {%0,%1},%2; \n\t" + : "=h"(result.x), "=h"(result.y) : "r"(x)); + return result; +} + + + + +static __device__ __forceinline__ uint4 mul4(uint4 a) +{ + uint4 result; + asm("{\n\t" + "mul.lo.u32 %0,%4,%5; \n\t" + "mul.hi.u32 %1,%4,%5; \n\t" + "mul.lo.u32 %2,%6,%7; \n\t" + "mul.hi.u32 %3,%6,%7; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w)); + return result; +} +static __device__ __forceinline__ uint4 add4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "add.cc.u32 %0,%4,%8; \n\t" + "addc.u32 %1,%5,%9; \n\t" + "add.cc.u32 %2,%6,%10; \n\t" + "addc.u32 %3,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ uint4 madd4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "mad.lo.cc.u32 %0,%4,%5,%8; \n\t" + "madc.hi.u32 %1,%4,%5,%9; \n\t" + "mad.lo.cc.u32 %2,%6,%7,%10; \n\t" + "madc.hi.u32 %3,%6,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ ulonglong2 madd4long(ulonglong2 a, ulonglong2 b) + { + ulonglong2 result; + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %2;\n\t" + "mov.b64 {a2,a3}, %3;\n\t" + "mov.b64 {b0,b1}, %4;\n\t" + "mov.b64 {b2,b3}, %5;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "=l"(result.x), "=l"(result.y) : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)); + return 
result; + } +static __device__ __forceinline__ void madd4long2(ulonglong2 &a, ulonglong2 b) + { + + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %0;\n\t" + "mov.b64 {a2,a3}, %1;\n\t" + "mov.b64 {b0,b1}, %2;\n\t" + "mov.b64 {b2,b3}, %3;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "+l"(a.x), "+l"(a.y) : "l"(b.x), "l"(b.y)); + +} + +__device__ __forceinline__ +uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); + return result; +} + +__device__ __forceinline__ +uint32_t shr_t32(uint32_t x, uint32_t n) { + uint32_t result; + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +__device__ __forceinline__ +uint32_t shl_t32(uint32_t x, uint32_t n) { + uint32_t result; + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +// device asm 32 for pluck +__device__ __forceinline__ +uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); + return result; +} + +__device__ __forceinline__ +uint32_t bfe(uint32_t x, uint32_t bit, uint32_t numBits) { + uint32_t ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(bit), "r"(numBits)); + return ret; + +} + +__device__ __forceinline__ +uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits) { + uint32_t ret; + asm("bfi.b32 %0, %1, %2, %3,%4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits)); + return ret; +} + + + +#endif // #ifndef CUDA_HELPER_H + + diff --git a/libethash-cu/dagger.cuh b/libethash-cu/dagger.cuh new file mode 100644 index 000000000..8b341171b --- /dev/null +++ b/libethash-cu/dagger.cuh @@ -0,0 +1,22 @@ +#define copy(dst, src, count) for (uint32_t i = 0; i < count; i++) { (dst)[i] = (src)[i]; } + +#define ACCESSES 64 +#define THREADS_PER_HASH (128 / 16) +#define FNV_PRIME 0x01000193 + +#define fnv(x,y) ((x) * FNV_PRIME ^(y)) + +__device__ uint4 fnv4(uint4 a, uint4 b) +{ + uint4 c; + c.x = a.x * FNV_PRIME ^ b.x; + c.y = a.y * FNV_PRIME ^ b.y; + c.z = a.z * FNV_PRIME ^ b.z; + c.w = a.w * FNV_PRIME ^ b.w; + return c; +} + +__device__ uint32_t fnv_reduce(uint4 v) +{ + return fnv(fnv(fnv(v.x, v.y), v.z), v.w); +} \ No newline at end of file diff --git a/libethash-cu/dagger_shared.cuh b/libethash-cu/dagger_shared.cuh new file mode 100644 index 000000000..8de4c591f --- /dev/null +++ b/libethash-cu/dagger_shared.cuh @@ -0,0 +1,136 @@ +#include "ethash_cu_miner_kernel_globals.h" +#include "ethash_cu_miner_kernel.h" +#include "keccak.cuh" +#include "dagger.cuh" + +typedef union +{ + hash64_t init; + hash32_t mix; +} compute_hash_share; + +__device__ hash64_t init_hash(hash32_t const* header, uint64_t nonce) +{ + hash64_t init; + + // sha3_512(header .. 
nonce) + uint64_t state[25]; + + copy(state, header->uint64s, 4); + state[4] = nonce; + state[5] = 0x0000000000000001; + state[6] = 0; + state[7] = 0; + state[8] = 0x8000000000000000; + for (uint32_t i = 9; i < 25; i++) + { + state[i] = 0; + } + + keccak_f1600_block((uint2 *)state, 8); + copy(init.uint64s, state, 8); + return init; +} + +__device__ uint32_t inner_loop(uint4 mix, uint32_t thread_id, uint32_t* share, hash128_t const* g_dag) +{ + // share init0 + if (thread_id == 0) + *share = mix.x; + + uint32_t init0 = *share; + + uint32_t a = 0; + + do + { + + bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); + + //#pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + + if (update_share) + { + uint32_t m[4] = { mix.x, mix.y, mix.z, mix.w }; + *share = fnv(init0 ^ (a + i), m[i]) % d_dag_size; + } + __threadfence_block(); + +#if __CUDA_ARCH__ >= 350 + mix = fnv4(mix, __ldg(&g_dag[*share].uint4s[thread_id])); +#else + mix = fnv4(mix, g_dag[*share].uint4s[thread_id]); +#endif + + } + + } while ((a += 4) != ACCESSES); + + return fnv_reduce(mix); +} + +__device__ hash32_t final_hash(hash64_t const* init, hash32_t const* mix) +{ + uint64_t state[25]; + + hash32_t hash; + + // keccak_256(keccak_512(header..nonce) .. mix); + copy(state, init->uint64s, 8); + copy(state + 8, mix->uint64s, 4); + state[12] = 0x0000000000000001; + for (uint32_t i = 13; i < 16; i++) + { + state[i] = 0; + } + state[16] = 0x8000000000000000; + for (uint32_t i = 17; i < 25; i++) + { + state[i] = 0; + } + + keccak_f1600_block((uint2 *)state, 1); + + // copy out + copy(hash.uint64s, state, 4); + return hash; +} + +__device__ hash32_t compute_hash( + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t nonce + ) +{ + extern __shared__ compute_hash_share share[]; + + // Compute one init hash per work item. + hash64_t init = init_hash(g_header, nonce); + + // Threads work together in this phase in groups of 8. + uint32_t const thread_id = threadIdx.x & (THREADS_PER_HASH - 1); + uint32_t const hash_id = threadIdx.x >> 3; + + hash32_t mix; + + for (int i = 0; i < THREADS_PER_HASH; i++) + { + // share init with other threads + if (i == thread_id) + share[hash_id].init = init; + + uint4 thread_init = share[hash_id].init.uint4s[thread_id & 3]; + + uint32_t thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uint32s, g_dag); + + share[hash_id].mix.uint32s[thread_id] = thread_mix; + + + if (i == thread_id) + mix = share[hash_id].mix; + } + + return final_hash(&init, &mix); +} \ No newline at end of file diff --git a/libethash-cu/dagger_shuffled.cuh b/libethash-cu/dagger_shuffled.cuh new file mode 100644 index 000000000..fdac9d9e1 --- /dev/null +++ b/libethash-cu/dagger_shuffled.cuh @@ -0,0 +1,107 @@ +#include "ethash_cu_miner_kernel_globals.h" +#include "ethash_cu_miner_kernel.h" +#include "keccak.cuh" +#include "dagger.cuh" + +__device__ uint64_t compute_hash_shuffle( + uint2 const* g_header, + hash128_t const* g_dag, + uint64_t nonce + ) +{ + // sha3_512(header .. nonce) + uint2 state[25]; + + state[0] = g_header[0]; + state[1] = g_header[1]; + state[2] = g_header[2]; + state[3] = g_header[3]; + state[4] = vectorize(nonce); + state[5] = vectorize(0x0000000000000001ULL); + for (uint32_t i = 6; i < 25; i++) + { + state[i] = make_uint2(0, 0); + } + state[8] = vectorize(0x8000000000000000ULL); + keccak_f1600_block(state,8); + + // Threads work together in this phase in groups of 8. 
+ const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1); + const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1); + const int mix_idx = thread_id & 3; + + uint4 mix; + uint2 shuffle[8]; + + for (int i = 0; i < THREADS_PER_HASH; i++) + { + // share init among threads + for (int j = 0; j < 8; j++) { + shuffle[j].x = __shfl(state[j].x, start_lane + i); + shuffle[j].y = __shfl(state[j].y, start_lane + i); + } + + // ugly but avoids local reads/writes + if (mix_idx < 2) { + if (mix_idx == 0) + mix = vectorize2(shuffle[0], shuffle[1]); + else + mix = vectorize2(shuffle[2], shuffle[3]); + } + else { + if (mix_idx == 2) + mix = vectorize2(shuffle[4], shuffle[5]); + else + mix = vectorize2(shuffle[6], shuffle[7]); + } + + uint32_t init0 = __shfl(shuffle[0].x, start_lane); + + for (uint32_t a = 0; a < ACCESSES; a += 4) + { + int t = ((a >> 2) & (THREADS_PER_HASH - 1)); + + for (uint32_t b = 0; b < 4; b++) + { + if (thread_id == t) + { + shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size; + } + shuffle[0].x = __shfl(shuffle[0].x, start_lane + t); + + mix = fnv4(mix, g_dag[shuffle[0].x].uint4s[thread_id]); + } + } + + uint32_t thread_mix = fnv_reduce(mix); + + // update mix accross threads + shuffle[0].x = __shfl(thread_mix, start_lane + 0); + shuffle[0].y = __shfl(thread_mix, start_lane + 1); + shuffle[1].x = __shfl(thread_mix, start_lane + 2); + shuffle[1].y = __shfl(thread_mix, start_lane + 3); + shuffle[2].x = __shfl(thread_mix, start_lane + 4); + shuffle[2].y = __shfl(thread_mix, start_lane + 5); + shuffle[3].x = __shfl(thread_mix, start_lane + 6); + shuffle[3].y = __shfl(thread_mix, start_lane + 7); + + if (i == thread_id) { + //move mix into state: + state[8] = shuffle[0]; + state[9] = shuffle[1]; + state[10] = shuffle[2]; + state[11] = shuffle[3]; + } + } + + // keccak_256(keccak_512(header..nonce) .. mix); + state[12] = vectorize(0x0000000000000001ULL); + for (uint32_t i = 13; i < 25; i++) + { + state[i] = vectorize(0ULL); + } + state[16] = vectorize(0x8000000000000000); + keccak_f1600_block(state, 1); + + return devectorize(state[0]); +} \ No newline at end of file diff --git a/libethash-cu/ethash_cu_miner.cpp b/libethash-cu/ethash_cu_miner.cpp new file mode 100644 index 000000000..4d2f3d2be --- /dev/null +++ b/libethash-cu/ethash_cu_miner.cpp @@ -0,0 +1,281 @@ +/* + This file is part of c-ethash. + + c-ethash is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + c-ethash is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with cpp-ethereum. If not, see . 
+*/ +/** @file ethash_cu_miner.cpp +* @author Tim Hughes +* @date 2015 +*/ + + +#define _CRT_SECURE_NO_WARNINGS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ethash_cu_miner.h" +#include "ethash_cu_miner_kernel_globals.h" + + +#define ETHASH_BYTES 32 + +// workaround lame platforms +#if !CL_VERSION_1_2 +#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE +#define CL_MEM_HOST_READ_ONLY 0 +#endif + +#undef min +#undef max + +using namespace std; + +unsigned const ethash_cu_miner::c_defaultLocalWorkSize = 128; +unsigned const ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier = 2048; // * CL_DEFAULT_LOCAL_WORK_SIZE + +ethash_cu_miner::search_hook::~search_hook() {} + +ethash_cu_miner::ethash_cu_miner() +{ +} + +std::string ethash_cu_miner::platform_info(unsigned _deviceId) +{ + int runtime_version; + int device_count; + + device_count = get_num_devices(); + + if (device_count == 0) + return std::string(); + + if (cudaRuntimeGetVersion(&runtime_version) == cudaErrorInvalidValue) + { + cout << cudaGetErrorString(cudaErrorInvalidValue) << endl; + return std::string(); + } + + // use selected default device + int device_num = std::min((int)_deviceId, device_count - 1); + + cudaDeviceProp device_props; + if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice) + { + cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl; + return std::string(); + } + + char platform[5]; + int version_major = runtime_version / 1000; + int version_minor = (runtime_version - (version_major * 1000)) / 10; + sprintf(platform, "%d.%d", version_major, version_minor); + + + char compute[5]; + sprintf(compute, "%d.%d", device_props.major, device_props.minor); + + return "{ \"platform\": \"CUDA " + std::string(platform) + "\", \"device\": \"" + device_props.name + "\", \"version\": \"Compute " + std::string(compute) + "\" }"; + +} + +int ethash_cu_miner::get_num_devices() +{ + int device_count; + + if (cudaGetDeviceCount(&device_count) == cudaErrorNoDevice) + { + cout << cudaGetErrorString(cudaErrorNoDevice) << endl; + return 0; + } + return device_count; +} + +void ethash_cu_miner::finish() +{ + for (unsigned i = 0; i != m_num_buffers; i++) { + cudaStreamDestroy(m_streams[i]); + m_streams[i] = 0; + } + cudaDeviceReset(); +} + +bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers, unsigned search_batch_size, unsigned workgroup_size, unsigned _deviceId, bool highcpu) +{ + + int device_count = get_num_devices(); + + if (device_count == 0) + return false; + + // use selected device + int device_num = std::min((int)_deviceId, device_count - 1); + + cudaDeviceProp device_props; + if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice) + { + cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl; + return false; + } + + cout << "Using device: " << device_props.name << "(" << device_props.major << "." 
<< device_props.minor << ")" << endl; + + cudaError_t r = cudaSetDevice(device_num); + if (r != cudaSuccess) + { + cout << cudaGetErrorString(r) << endl; + return false; + } + cudaDeviceReset(); + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + + m_num_buffers = num_buffers; + m_search_batch_size = search_batch_size; + + m_hash_buf = new void *[m_num_buffers]; + m_search_buf = new uint32_t *[m_num_buffers]; + m_streams = new cudaStream_t[m_num_buffers]; + + // use requested workgroup size, but we require multiple of 8 + m_workgroup_size = ((workgroup_size + 7) / 8) * 8; + + m_highcpu = highcpu; + + // patch source code + cudaError result; + + uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES); + unsigned max_outputs = c_max_search_results; + + result = set_constants(&dagSize128, &max_outputs); + + // create buffer for dag + result = cudaMalloc(&m_dag_ptr, _dagSize); + + // create buffer for header256 + result = cudaMalloc(&m_header, 32); + + // copy dag to CPU. + result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice); + + // create mining buffers + for (unsigned i = 0; i != m_num_buffers; ++i) + { + result = cudaMallocHost(&m_hash_buf[i], 32 * c_hash_batch_size); + result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t)); + result = cudaStreamCreate(&m_streams[i]); + } + if (result != cudaSuccess) + { + cout << cudaGetErrorString(result) << endl; + return false; + } + return true; +} + +/** + * Prevent High CPU usage while waiting for an async task + */ +static unsigned waitStream(cudaStream_t stream) +{ + unsigned wait_ms = 0; + while (cudaStreamQuery(stream) == cudaErrorNotReady) { + this_thread::sleep_for(chrono::milliseconds(10)); + wait_ms += 10; + } + return wait_ms; +} + +void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook& hook) +{ + struct pending_batch + { + uint64_t start_nonce; + unsigned buf; + }; + std::queue pending; + + static uint32_t const c_zero = 0; + + // update header constant buffer + cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice); + for (unsigned i = 0; i != m_num_buffers; ++i) + { + cudaMemcpy(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice); + } + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) + { + throw std::runtime_error(cudaGetErrorString(err)); + } + + unsigned buf = 0; + std::random_device engine; + uint64_t start_nonce = std::uniform_int_distribution()(engine); + for (;; start_nonce += m_search_batch_size) + { + run_ethash_search(m_search_batch_size / m_workgroup_size, m_workgroup_size, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target); + + pending.push({ start_nonce, buf }); + buf = (buf + 1) % m_num_buffers; + + // read results + if (pending.size() == m_num_buffers) + { + pending_batch const& batch = pending.front(); + + uint32_t results[1 + c_max_search_results]; + + if (!m_highcpu) + waitStream(m_streams[buf]); // 28ms + cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyHostToHost, m_streams[batch.buf]); + + unsigned num_found = std::min(results[0], c_max_search_results); + uint64_t nonces[c_max_search_results]; + for (unsigned i = 0; i != num_found; ++i) + { + nonces[i] = batch.start_nonce + results[i + 1]; + //cout << results[i + 1] << ", "; + } + //if (num_found > 0) + // cout << endl; + + bool exit = num_found && hook.found(nonces, num_found); + exit |= hook.searched(batch.start_nonce, m_search_batch_size); // always report searched 
before exit + if (exit) + break; + + start_nonce += m_search_batch_size; + // reset search buffer if we're still going + if (num_found) + cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]); + + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) + { + throw std::runtime_error(cudaGetErrorString(err)); + } + pending.pop(); + } + } +} + diff --git a/libethash-cu/ethash_cu_miner.h b/libethash-cu/ethash_cu_miner.h new file mode 100644 index 000000000..54e535850 --- /dev/null +++ b/libethash-cu/ethash_cu_miner.h @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include +#include +#include +#include "ethash_cu_miner_kernel.h" + +class ethash_cu_miner +{ +public: + struct search_hook + { + virtual ~search_hook(); // always a virtual destructor for a class with virtuals. + + // reports progress, return true to abort + virtual bool found(uint64_t const* nonces, uint32_t count) = 0; + virtual bool searched(uint64_t start_nonce, uint32_t count) = 0; + }; + +public: + ethash_cu_miner(); + + bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers = 2, unsigned search_batch_size = 262144, unsigned workgroup_size = 64, unsigned _deviceId = 0, bool highcpu = false); + static std::string platform_info(unsigned _deviceId = 0); + static int get_num_devices(); + + + void finish(); + void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count); + void search(uint8_t const* header, uint64_t target, search_hook& hook); + + /* -- default values -- */ + /// Default value of the local work size. Also known as workgroup size. + static unsigned const c_defaultLocalWorkSize; + /// Default value of the global work size as a multiplier of the local work size + static unsigned const c_defaultGlobalWorkSizeMultiplier; + +private: + enum { c_max_search_results = 63, c_hash_batch_size = 1024 }; + + bool m_highcpu; + unsigned m_num_buffers; + unsigned m_search_batch_size; + unsigned m_workgroup_size; + + hash128_t * m_dag_ptr; + hash32_t * m_header; + + void ** m_hash_buf; + uint32_t ** m_search_buf; + cudaStream_t * m_streams; + + +}; \ No newline at end of file diff --git a/libethash-cu/ethash_cu_miner_kernel.cu b/libethash-cu/ethash_cu_miner_kernel.cu new file mode 100644 index 000000000..dc1da4fb1 --- /dev/null +++ b/libethash-cu/ethash_cu_miner_kernel.cu @@ -0,0 +1,72 @@ +/* +* Genoil's CUDA mining kernel for Ethereum +* based on Tim Hughes' opencl kernel. +* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer. 
+*/ + +#include "ethash_cu_miner_kernel.h" +#include "ethash_cu_miner_kernel_globals.h" +#include "cuda_helper.h" + +#define SHUFFLE_MIN_VER 350 +#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER +#include "dagger_shuffled.cuh" +#else +#include "dagger_shared.cuh" +#endif + +__global__ void +__launch_bounds__(128, 7) +ethash_search( + uint32_t* g_output, + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t start_nonce, + uint64_t target + ) +{ + + uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x; + +#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER + uint64_t hash = compute_hash_shuffle((uint2 *)g_header, g_dag, start_nonce + gid); + if (cuda_swab64(hash) < target) +#else + hash32_t hash = compute_hash(g_header, g_dag, start_nonce + gid); + if (cuda_swab64(hash.uint64s[0]) < target) +#endif + { + atomicInc(g_output, d_max_outputs); + g_output[g_output[0]] = gid; + } + +} + +void run_ethash_search( + uint32_t blocks, + uint32_t threads, + cudaStream_t stream, + uint32_t* g_output, + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t start_nonce, + uint64_t target +) +{ +#if __CUDA_ARCH__ >= SHUFFLE_MIN_VER + ethash_search <<>>(g_output, g_header, g_dag, start_nonce, target); +#else + ethash_search <<>>(g_output, g_header, g_dag, start_nonce, target); +#endif +} + +cudaError set_constants( + uint32_t * dag_size, + uint32_t * max_outputs + ) +{ + cudaError result; + result = cudaMemcpyToSymbol(d_dag_size, dag_size, sizeof(uint32_t)); + result = cudaMemcpyToSymbol(d_max_outputs, max_outputs, sizeof(uint32_t)); + return result; +} diff --git a/libethash-cu/ethash_cu_miner_kernel.h b/libethash-cu/ethash_cu_miner_kernel.h new file mode 100644 index 000000000..6f22ed447 --- /dev/null +++ b/libethash-cu/ethash_cu_miner_kernel.h @@ -0,0 +1,60 @@ +#ifndef _ETHASH_CU_MINER_KERNEL_H_ +#define _ETHASH_CU_MINER_KERNEL_H_ + +#include + + +typedef union +{ + uint64_t uint64s[16 / sizeof(uint64_t)]; + uint32_t uint32s[16 / sizeof(uint32_t)]; +} hash16_t; + +typedef union +{ + uint32_t uint32s[32 / sizeof(uint32_t)]; + uint64_t uint64s[32 / sizeof(uint64_t)]; + uint2 uint2s[32 / sizeof(uint2)]; +} hash32_t; + + +typedef union +{ + uint32_t uint32s[64 / sizeof(uint32_t)]; + uint64_t uint64s[64 / sizeof(uint64_t)]; + uint4 uint4s[64 / sizeof(uint4)]; +} hash64_t; + + +typedef union +{ + uint32_t uint32s[128 / sizeof(uint32_t)]; + uint4 uint4s[128 / sizeof(uint4)]; +} hash128_t; + +//typedef uint32_t hash128_t; + +cudaError set_constants( + uint32_t * dag_size, + uint32_t * max_outputs +); + +void run_ethash_hash( + hash32_t* g_hashes, + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t start_nonce +); + +void run_ethash_search( + uint32_t search_batch_size, + uint32_t workgroup_size, + cudaStream_t stream, + uint32_t* g_output, + hash32_t const* g_header, + hash128_t const* g_dag, + uint64_t start_nonce, + uint64_t target +); + +#endif diff --git a/libethash-cu/ethash_cu_miner_kernel_globals.h b/libethash-cu/ethash_cu_miner_kernel_globals.h new file mode 100644 index 000000000..20754d8a9 --- /dev/null +++ b/libethash-cu/ethash_cu_miner_kernel_globals.h @@ -0,0 +1,9 @@ +#ifndef _ETHASH_CU_MINER_KERNEL_GLOBALS_H_ +#define _ETHASH_CU_MINER_KERNEL_GLOBALS_H_ + +//#include "cuda_helper.h" + +__constant__ uint32_t d_dag_size; +__constant__ uint32_t d_max_outputs; + +#endif \ No newline at end of file diff --git a/libethash-cu/keccak.cuh b/libethash-cu/keccak.cuh new file mode 100644 index 000000000..99ff45222 --- /dev/null +++ b/libethash-cu/keccak.cuh @@ -0,0 +1,89 @@ +#include "cuda_helper.h" 
+ +__device__ __constant__ uint64_t const keccak_round_constants[24] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, + 0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL, + 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL, + 0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, + 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL, + 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL +}; + +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) + +__device__ __forceinline__ void keccak_f1600_block(uint2* s, uint32_t out_size) +{ + uint2 t[5], u, v; + +#pragma unroll 3 + for (int i = 0; i < 24; i++) + { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + u = t[4] ^ ROL2(t[1], 1); + s[0] ^= u; s[5] ^= u; s[10] ^= u; s[15] ^= u; s[20] ^= u; + u = t[0] ^ ROL2(t[2], 1); + s[1] ^= u; s[6] ^= u; s[11] ^= u; s[16] ^= u; s[21] ^= u; + u = t[1] ^ ROL2(t[3], 1); + s[2] ^= u; s[7] ^= u; s[12] ^= u; s[17] ^= u; s[22] ^= u; + u = t[2] ^ ROL2(t[4], 1); + s[3] ^= u; s[8] ^= u; s[13] ^= u; s[18] ^= u; s[23] ^= u; + u = t[3] ^ ROL2(t[0], 1); + s[4] ^= u; s[9] ^= u; s[14] ^= u; s[19] ^= u; s[24] ^= u; + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(u, 1); + + // squeeze this in here + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + u = s[0]; v = s[1]; s[0] ^= (~v) & s[2]; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= vectorize(keccak_round_constants[i]); + if (i == 23 && out_size == 1) return; + + // continue chi + s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & u; s[4] ^= (~u) & v; + u = s[5]; v = s[6]; s[5] ^= (~v) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; + + if (i == 23) return; + s[8] ^= (~s[9]) & u; s[9] ^= (~u) & v; + u = s[10]; v = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ u, s[13], s[14]); s[14] = bitselect(s[14] ^ v, s[14], u); + u = s[15]; v = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ u, s[18], s[19]); s[19] = bitselect(s[19] ^ v, s[19], u); + u = s[20]; v = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = 
bitselect(s[23] ^ u, s[23], s[24]); s[24] = bitselect(s[24] ^ v, s[24], u); + } +} \ No newline at end of file diff --git a/libethcore/CMakeLists.txt b/libethcore/CMakeLists.txt index a5be81a26..3bd626449 100644 --- a/libethcore/CMakeLists.txt +++ b/libethcore/CMakeLists.txt @@ -21,6 +21,9 @@ target_link_libraries(${EXECUTABLE} evmcore) if (ETHASHCL) target_link_libraries(${EXECUTABLE} ethash-cl) endif () +if (ETHASHCU) + target_link_libraries(${EXECUTABLE} ethash-cu) +endif () target_link_libraries(${EXECUTABLE} devcrypto) if (CPUID_FOUND) target_link_libraries(${EXECUTABLE} ${CPUID_LIBRARIES})
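Usage sketch, not part of the patch: the commands below show how the options introduced in this diff fit together — the new cudaminer bundle and the optional COMPUTE architecture switch on the CMake side, and the -U/--cuda miner selection plus the renamed --gpu-*-work and new --cuda-* flags on the ethminer side. The out-of-source build directory, the ethminer make target, and the example values (COMPUTE=52, devices 0 and 1) are illustrative assumptions, not something the diff itself guarantees.

$ cmake -DBUNDLE=cudaminer -DCOMPUTE=52 ..    # configure the CUDA miner bundle; omit COMPUTE to build for all SM targets listed in libethash-cu/CMakeLists.txt
$ make ethminer
$ ethminer -U --cuda-devices 0 1 --gpu-global-work 2048 --gpu-local-work 128    # CUDA mining on devices 0 and 1, using the defaults declared in ethash_cu_miner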