upstream merge

also likely fixes a bug in device selection many cmd line params changed
10 years ago · 56bb6941ac
7 changed files with 258 additions and 117 deletions
--- a/ethminer/CMakeLists.txt
+++ b/ethminer/CMakeLists.txt
@ -5,11 +5,16 @@ aux_source_directory(. SRC_LIST)

 include_directories(BEFORE ..)
 include_directories(${Boost_INCLUDE_DIRS})
+
 if (JSONRPC)
-include_directories(BEFORE ${JSONCPP_INCLUDE_DIRS})
-include_directories(${JSON_RPC_CPP_INCLUDE_DIRS})
+	include_directories(BEFORE ${JSONCPP_INCLUDE_DIRS})
+	include_directories(${JSON_RPC_CPP_INCLUDE_DIRS})
 endif()

+if (ETHASHCU)
+	include_directories(${CUDA_INCLUDE_DIRS})
+endif ()
+
 set(EXECUTABLE ethminer)

 file(GLOB HEADERS "*.h")
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@ -145,7 +145,7 @@ public:
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
 #if ETH_ETHASHCL || ETH_ETHASHCU || !ETH_TRUE
-		else if (arg == "--gpu-global-work" && i + 1 < argc)
+		else if ((arg == "--cl-global-work" || arg == "--cuda-grid-size")  && i + 1 < argc)
 			try {
 				m_globalWorkSizeMultiplier = stol(argv[++i]);
 			}
@ -154,7 +154,7 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
-		else if (arg == "--gpu-local-work" && i + 1 < argc)
+		else if ((arg == "--cl-local-work" || arg == "--cuda-block-size") && i + 1 < argc)
 			try {
 				m_localWorkSize = stol(argv[++i]);
 			}
@ -163,6 +163,12 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
+		else if (arg == "--list-devices")
+			m_shouldListDevices = true;
+		else if ((arg == "--cl-extragpu-mem" || arg == "--cuda-extragpu-mem") && i + 1 < argc)
+			m_extraGPUMemory = 1000000 * stol(argv[++i]);
+#endif
+#if ETH_ETHASHCL || !ETH_TRUE
 		else if (arg == "--cl-ms-per-batch" && i + 1 < argc)
 			try {
 				m_msPerBatch = stol(argv[++i]);
@ -172,13 +178,29 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
-#endif
-		else if (arg == "--list-devices")
-			m_shouldListDevices = true;
 		else if (arg == "--allow-opencl-cpu")
 			m_clAllowCPU = true;
-		else if (arg == "--cl-extragpu-mem" && i + 1 < argc)
-			m_extraGPUMemory = 1000000 * stol(argv[++i]);
+#endif
+#if ETH_ETHASHCU || !ETH_TRUE
+		else if (arg == "--cuda-devices") {
+			while (m_cudaDeviceCount < 16 && i + 1 < argc)
+			{
+				try {
+					m_cudaDevices[m_cudaDeviceCount++] = stol(argv[++i]);
+				}
+				catch (...)
+				{
+					break;
+				}
+			}
+		}
+		else if (arg == "--cuda-turbo") {
+			m_cudaHighCPULoad = true;
+		}
+		else if (arg == "--cuda-streams" && i + 1 < argc) {
+			m_numStreams = stol(argv[++i]);
+		}
+#endif
 		else if (arg == "--phone-home" && i + 1 < argc)
 		{
 			string m = argv[++i];
@ -295,23 +317,6 @@ public:
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
 		}
-#if ETH_ETHASHCU || !ETH_TRUE
-		else if (arg == "--cuda-devices") {
-			while (m_cudaDeviceCount < 16 && i + 1 < argc)
-			{
-				try {
-					m_cudaDevices[m_cudaDeviceCount++] = stol(argv[++i]);
-				}
-				catch (...)
-				{
-					break;
-				}
-			}
-		}
-		else if (arg == "--cuda-high-cpu") {
-			m_cudaHighCPULoad = true;
-		}
-#endif
 		else
 			return false;
 		return true;
@ -322,7 +327,12 @@ public:
 		if (m_shouldListDevices)
 		{
 #if ETH_ETHASHCL || !ETH_TRUE
-			EthashGPUMiner::listDevices();
+			if (m_minerType == MinerType::GPU)
+				EthashGPUMiner::listDevices();
+#endif
+#if ETH_ETHASHCU || !ETH_TRUE
+			if (m_minerType == MinerType::CUDA)
+				EthashCUDAMiner::listDevices();
 #endif
 			exit(0);
 		}
@ -347,6 +357,31 @@ public:
 #else
 			cerr << "Selected GPU mining without having compiled with -DETHASHCL=1" << endl;
 			exit(1);
+#endif
+		}
+		else if (m_minerType == MinerType::CUDA)
+		{
+#if ETH_ETHASHCU || !ETH_TRUE
+			if (!EthashCUDAMiner::configureGPU(
+				m_localWorkSize,
+				m_globalWorkSizeMultiplier,
+				m_numStreams,
+				m_openclDevice,
+				m_extraGPUMemory,
+				m_cudaHighCPULoad,
+				m_currentBlock
+				))
+				exit(1);
+
+			if (m_cudaDeviceCount != 0) {
+				EthashCUDAMiner::setDevices(m_cudaDevices, m_cudaDeviceCount);
+				m_miningThreads = m_cudaDeviceCount;
+			}
+			EthashCUDAMiner::setNumInstances(m_miningThreads);
+			
+#else
+			cerr << "Selected CUDA mining without having compiled with -DETHASHCU=1 or -DBUNDLE=cudaminer" << endl;
+			exit(1);
 #endif
 		}
 		if (mode == OperationMode::DAGInit)
@ -382,17 +417,26 @@ public:
 			<< "Mining configuration:" << endl
 			<< "    -C,--cpu  When mining, use the CPU." << endl
 			<< "    -G,--opencl  When mining use the GPU via OpenCL." << endl
+			<< "    -U,--cuda  When mining use the GPU via CUDA." << endl
 			<< "    --opencl-platform <n>  When mining using -G/--opencl use OpenCL platform n (default: 0)." << endl
 			<< "    --opencl-device <n>  When mining using -G/--opencl use OpenCL device n (default: 0)." << endl
 			<< "    -t, --mining-threads <n> Limit number of CPU/GPU miners to n (default: use everything available on selected platform)" << endl
 			<< "    --allow-opencl-cpu Allows CPU to be considered as an OpenCL device if the OpenCL platform supports it." << endl
-			<< "    --list-devices List the detected OpenCL devices and exit." << endl
+			<< "    --list-devices List the detected OpenCL/CUDA devices and exit." << endl
 			<< "    --current-block Let the miner know the current block number at configuration time. Will help determine DAG size and required GPU memory." << endl
 #if ETH_ETHASHCL || !ETH_TRUE
 			<< "    --cl-extragpu-mem Set the memory (in MB) you believe your GPU requires for stuff other than mining. Windows rendering e.t.c.." << endl
 			<< "    --cl-local-work Set the OpenCL local work size. Default is " << toString(ethash_cl_miner::c_defaultLocalWorkSize) << endl
 			<< "    --cl-global-work Set the OpenCL global work size as a multiple of the local work size. Default is " << toString(ethash_cl_miner::c_defaultGlobalWorkSizeMultiplier) << " * " << toString(ethash_cl_miner::c_defaultLocalWorkSize) << endl
 			<< "    --cl-ms-per-batch Set the OpenCL target milliseconds per batch (global workgroup size). Default is " << toString(ethash_cl_miner::c_defaultMSPerBatch) << ". If 0 is given then no autoadjustment of global work size will happen" << endl
+#endif
+#if ETH_ETHASHCU || !ETH_TRUE
+			<< "    --cuda-extragpu-mem Set the memory (in MB) you believe your GPU requires for stuff other than mining. Windows rendering e.t.c.." << endl
+			<< "    --cuda-block-size Set the CUDA block work size. Default is " << toString(ethash_cu_miner::c_defaultBlockSize) << endl
+			<< "    --cuda-grid-size Set the CUDA grid size. Default is " << toString(ethash_cu_miner::c_defaultGridSize) << endl
+			<< "    --cuda-streams Set the number of CUDA streams. Default is " << toString(ethash_cu_miner::c_defaultNumStreams) << endl
+			<< "    --cuda-turbo Get a bit of extra hashrate at the cost of high CPU load... Default is false" << endl
+			<< "    --cuda-devices <0 1 ..n> Select which GPU's to mine on. Default is to use all" << endl
 #endif
 			;
 	}
@ -427,11 +471,14 @@ private:
 		sealers["cpu"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashCPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCPUMiner(ci); }};
 #if ETH_ETHASHCL
 		sealers["opencl"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashGPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashGPUMiner(ci); }};
+#endif
+#if ETH_ETHASHCU
+		sealers["cuda"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{ &EthashGPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCUDAMiner(ci); } };
 #endif
 		f.setSealers(sealers);
 		f.onSolutionFound([&](EthashProofOfWork::Solution) { return false; });

-		string platformInfo = _m == MinerType::CPU ? "CPU" : "GPU";//EthashProofOfWork::CPUMiner::platformInfo() : _m == MinerType::GPU ? EthashProofOfWork::GPUMiner::platformInfo() : "";
+		string platformInfo = _m == MinerType::CPU ? "CPU" : (_m == MinerType::GPU ? "GPU" : "CUDA");
 		cout << "Benchmarking on platform: " << platformInfo << endl;

 		cout << "Preparing DAG..." << endl;
@ -443,6 +490,8 @@ private:
 			f.start("cpu");
 		else if (_m == MinerType::GPU)
 			f.start("opencl");
+		else if (_m == MinerType::CUDA)
+			f.start("cuda");

 		map<uint64_t, WorkingProgress> results;
 		uint64_t mean = 0;
@ -614,15 +663,18 @@ private:
 	bool m_shouldListDevices = false;
 	bool m_clAllowCPU = false;
 #if ETH_ETHASHCL || !ETH_TRUE
+#if !ETH_ETHASHCU || !ETH_TRUE
 	unsigned m_globalWorkSizeMultiplier = ethash_cl_miner::c_defaultGlobalWorkSizeMultiplier;
 	unsigned m_localWorkSize = ethash_cl_miner::c_defaultLocalWorkSize;
+#endif
 	unsigned m_msPerBatch = ethash_cl_miner::c_defaultMSPerBatch;
 #endif
 #if ETH_ETHASHCU || !ETH_TRUE
-	unsigned m_globalWorkSizeMultiplier = ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier;
-	unsigned m_localWorkSize = ethash_cu_miner::c_defaultLocalWorkSize;
+	unsigned m_globalWorkSizeMultiplier = ethash_cu_miner::c_defaultGridSize;
+	unsigned m_localWorkSize = ethash_cu_miner::c_defaultBlockSize;
 	unsigned m_cudaDeviceCount = 0;
 	unsigned m_cudaDevices[16];
+	unsigned m_numStreams = ethash_cu_miner::c_defaultNumStreams;
 	bool	 m_cudaHighCPULoad = false;
 #endif
 	uint64_t m_currentBlock = 0;
--- a/libethash-cu/ethash_cu_miner.cpp
+++ b/libethash-cu/ethash_cu_miner.cpp
@ -28,11 +28,14 @@
 #include <assert.h>
 #include <queue>
 #include <random>
+#include <atomic>
+#include <sstream>
 #include <vector>
 #include <chrono>
 #include <thread>
 #include <libethash/util.h>
 #include <libethash/ethash.h>
+#include <libethash/internal.h>
 #include <cuda_runtime.h>
 #include "ethash_cu_miner.h"
 #include "ethash_cu_miner_kernel_globals.h"
@ -51,8 +54,36 @@

 using namespace std;

-unsigned const ethash_cu_miner::c_defaultLocalWorkSize = 128;
-unsigned const ethash_cu_miner::c_defaultGlobalWorkSizeMultiplier = 2048; // * CL_DEFAULT_LOCAL_WORK_SIZE
+unsigned const ethash_cu_miner::c_defaultBlockSize = 128;
+unsigned const ethash_cu_miner::c_defaultGridSize = 2048; // * CL_DEFAULT_LOCAL_WORK_SIZE
+unsigned const ethash_cu_miner::c_defaultNumStreams = 2;
+
+#if defined(_WIN32)
+extern "C" __declspec(dllimport) void __stdcall OutputDebugStringA(const char* lpOutputString);
+static std::atomic_flag s_logSpin = ATOMIC_FLAG_INIT;
+#define ETHCU_LOG(_contents) \
+	do \
+			{ \
+		std::stringstream ss; \
+		ss << _contents; \
+						while (s_logSpin.test_and_set(std::memory_order_acquire)) {} \
+		OutputDebugStringA(ss.str().c_str()); \
+		cerr << ss.str() << endl << flush; \
+		s_logSpin.clear(std::memory_order_release); \
+			} while (false)
+#else
+#define ETHCL_LOG(_contents) cout << "[OPENCL]:" << _contents << endl
+#endif
+
+#define CUDA_SAFE_CALL(call)                                          \
+do {                                                                  \
+	cudaError_t err = call;                                           \
+	if (cudaSuccess != err) {                                         \
+		fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
+		         __FUNCTION__, __LINE__, cudaGetErrorString(err) );   \
+		exit(EXIT_FAILURE);                                           \
+		}                                                                 \
+} while (0)

 ethash_cu_miner::search_hook::~search_hook() {}

@ -70,28 +101,19 @@ std::string ethash_cu_miner::platform_info(unsigned _deviceId)
 	if (device_count == 0)
 		return std::string();

-	if (cudaRuntimeGetVersion(&runtime_version) == cudaErrorInvalidValue)
-	{
-		cout << cudaGetErrorString(cudaErrorInvalidValue) << endl;
-		return std::string();
-	}
+	CUDA_SAFE_CALL(cudaRuntimeGetVersion(&runtime_version));

 	// use selected default device
 	int device_num = std::min<int>((int)_deviceId, device_count - 1);
-
 	cudaDeviceProp device_props;
-	if (cudaGetDeviceProperties(&device_props, device_num) == cudaErrorInvalidDevice)
-	{
-		cout << cudaGetErrorString(cudaErrorInvalidDevice) << endl;
-		return std::string();
-	}
+
+	CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));

 	char platform[5];
 	int version_major = runtime_version / 1000;
 	int version_minor = (runtime_version - (version_major * 1000)) / 10;
 	sprintf(platform, "%d.%d", version_major, version_minor);
 	
-
 	char compute[5];
 	sprintf(compute, "%d.%d", device_props.major, device_props.minor);

@ -99,28 +121,85 @@ std::string ethash_cu_miner::platform_info(unsigned _deviceId)

 }

-int ethash_cu_miner::getNumDevices()
+unsigned ethash_cu_miner::getNumDevices()
 {
 	int device_count;
+	CUDA_SAFE_CALL(cudaGetDeviceCount(&device_count));
+	return device_count;
+}

-	if (cudaGetDeviceCount(&device_count) == cudaErrorNoDevice)
+bool ethash_cu_miner::configureGPU(
+	unsigned _blockSize,
+	unsigned _gridSize,
+	unsigned _numStreams,
+	unsigned _extraGPUMemory,
+	bool	 _highcpu,
+	uint64_t _currentBlock
+	)
+{
+	s_blockSize = _blockSize;
+	s_gridSize = _gridSize;
+	s_extraRequiredGPUMem = _extraGPUMemory;
+	s_numStreams = _numStreams;
+	s_highCPU = _highcpu;
+
+	// by default let's only consider the DAG of the first epoch
+	uint64_t dagSize = ethash_get_datasize(_currentBlock);
+	uint64_t requiredSize = dagSize + _extraGPUMemory;
+	for (unsigned int i = 0; i < getNumDevices(); i++)
 	{
-		cout << cudaGetErrorString(cudaErrorNoDevice) << endl;
-		return 0;
+		cudaDeviceProp props;
+		CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, i));
+		if (props.totalGlobalMem >= requiredSize)
+		{
+			ETHCU_LOG(
+				"Found suitable CUDA device [" << props.name
+				<< "] with " << props.totalGlobalMem << " bytes of GPU memory"
+				);
+			return true;
+		}
+
+		ETHCU_LOG(
+			"CUDA device " << props.name
+			<< " has insufficient GPU memory." << props.totalGlobalMem <<
+			" bytes of memory found < " << requiredSize << " bytes of memory required"
+			);
 	}
-	return device_count;
+	return false;
+}
+
+unsigned ethash_cu_miner::s_extraRequiredGPUMem;
+unsigned ethash_cu_miner::s_blockSize = ethash_cu_miner::c_defaultBlockSize;
+unsigned ethash_cu_miner::s_gridSize = ethash_cu_miner::c_defaultGridSize;
+unsigned ethash_cu_miner::s_numStreams = ethash_cu_miner::c_defaultNumStreams;
+bool ethash_cu_miner::s_highCPU = false;
+
+void ethash_cu_miner::listDevices()
+{
+	string outString = "\nListing CUDA devices.\nFORMAT: [deviceID] deviceName\n";
+	unsigned int i = 0;
+	for (unsigned int i = 0; i < getNumDevices(); i++)
+	{
+		cudaDeviceProp props;
+		CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, i));
+
+		outString += "[" + to_string(i) + "] " + props.name + "\n";
+		outString += "\tCompute version: " + to_string(props.major) + "." + to_string(props.minor) + "\n";
+		outString += "\tcudaDeviceProp::totalGlobalMem: " + to_string(props.totalGlobalMem) + "\n";
+	}
+	ETHCU_LOG(outString);
 }

 void ethash_cu_miner::finish()
 {
-	for (unsigned i = 0; i != m_num_buffers; i++) {
+	for (unsigned i = 0; i != s_numStreams; i++) {
 		cudaStreamDestroy(m_streams[i]);
 		m_streams[i] = 0;
 	}
 	cudaDeviceReset();
 }

-bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers, unsigned search_batch_size, unsigned workgroup_size, unsigned _deviceId, bool highcpu)
+bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
 {
 	
 	int device_count = getNumDevices();
@ -147,19 +226,12 @@ bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_
 		return false;
 	}
 	cudaDeviceReset();
-	cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
-
-	m_num_buffers = num_buffers;
-	m_search_batch_size = search_batch_size;
-
-	m_hash_buf	 = new void *[m_num_buffers];
-	m_search_buf = new uint32_t *[m_num_buffers];
-	m_streams    = new cudaStream_t[m_num_buffers];
-
-	// use requested workgroup size, but we require multiple of 8
-	m_workgroup_size = ((workgroup_size + 7) / 8) * 8;
+	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);

-	m_highcpu = highcpu;
+	m_hash_buf	 = new void *[s_numStreams];
+	m_search_buf = new uint32_t *[s_numStreams];
+	m_streams = new cudaStream_t[s_numStreams];

 	// patch source code
 	cudaError result;
@ -179,7 +251,7 @@ bool ethash_cu_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_
    result = cudaMemcpy(m_dag_ptr, _dag, _dagSize, cudaMemcpyHostToDevice);
 	
 	// create mining buffers
-	for (unsigned i = 0; i != m_num_buffers; ++i)
+	for (unsigned i = 0; i != s_numStreams; ++i)
 	{		
 		result = cudaMallocHost(&m_hash_buf[i], 32 * c_hash_batch_size);
 		result = cudaMallocHost(&m_search_buf[i], (c_max_search_results + 1) * sizeof(uint32_t));
@ -219,7 +291,7 @@ void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook

 	// update header constant buffer
 	cudaMemcpy(m_header, header, 32, cudaMemcpyHostToDevice);
-	for (unsigned i = 0; i != m_num_buffers; ++i)
+	for (unsigned i = 0; i != s_numStreams; ++i)
 	{
 		cudaMemcpy(m_search_buf[i], &c_zero, 4, cudaMemcpyHostToDevice);
 	}
@ -232,21 +304,21 @@ void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook
 	unsigned buf = 0;
 	std::random_device engine;
 	uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
-	for (;; start_nonce += m_search_batch_size)
+	for (;; start_nonce += s_gridSize)
 	{
-		run_ethash_search(m_search_batch_size / m_workgroup_size, m_workgroup_size, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);	
+		run_ethash_search(s_gridSize, s_blockSize, m_streams[buf], m_search_buf[buf], m_header, m_dag_ptr, start_nonce, target);
 		
 		pending.push({ start_nonce, buf });
-		buf = (buf + 1) % m_num_buffers;
+		buf = (buf + 1) % s_numStreams;

 		// read results
-		if (pending.size() == m_num_buffers)
+		if (pending.size() == s_numStreams)
 		{
 			pending_batch const& batch = pending.front();

 			uint32_t results[1 + c_max_search_results];

-			if (!m_highcpu)
+			if (!s_highCPU)
 				waitStream(m_streams[buf]); // 28ms
 			cudaMemcpyAsync(results, m_search_buf[batch.buf], (1 + c_max_search_results) * sizeof(uint32_t), cudaMemcpyHostToHost, m_streams[batch.buf]);

@ -261,11 +333,11 @@ void ethash_cu_miner::search(uint8_t const* header, uint64_t target, search_hook
 			//	cout << endl;
 			
 			bool exit = num_found && hook.found(nonces, num_found);
-			exit |= hook.searched(batch.start_nonce, m_search_batch_size); // always report searched before exit
+			exit |= hook.searched(batch.start_nonce, s_gridSize * s_blockSize); // always report searched before exit
 			if (exit)
 				break;

-			start_nonce += m_search_batch_size;
+			start_nonce += s_gridSize * s_blockSize;
 			// reset search buffer if we're still going
 			if (num_found)
 				cudaMemcpyAsync(m_search_buf[batch.buf], &c_zero, 4, cudaMemcpyHostToDevice, m_streams[batch.buf]);
--- a/libethash-cu/ethash_cu_miner.h
+++ b/libethash-cu/ethash_cu_miner.h
@ -22,38 +22,36 @@ public:
 public:
 	ethash_cu_miner();

-	bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned num_buffers = 2, unsigned search_batch_size = 262144, unsigned workgroup_size = 64, unsigned _deviceId = 0, bool highcpu = false);
 	static std::string platform_info(unsigned _deviceId = 0);
-	static int getNumDevices();
+	static unsigned getNumDevices();
 	static void listDevices();
 	static bool configureGPU(
-		unsigned _platformId,
-		unsigned _localWorkSize,
-		unsigned _globalWorkSize,
-		unsigned _msPerBatch,
-		bool _allowCPU,
+		unsigned _blockSize,
+		unsigned _gridSize,
+		unsigned _numStreams,
 		unsigned _extraGPUMemory,
+		bool	 _highcpu,
 		uint64_t _currentBlock
 		);
-
+	bool init(
+		uint8_t const* _dag,
+		uint64_t _dagSize,
+		unsigned _deviceId = 0
+		);
 	void finish();
-	void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
 	void search(uint8_t const* header, uint64_t target, search_hook& hook);

 	/* -- default values -- */
-	/// Default value of the local work size. Also known as workgroup size.
-	static unsigned const c_defaultLocalWorkSize;
-	/// Default value of the global work size as a multiplier of the local work size
-	static unsigned const c_defaultGlobalWorkSizeMultiplier;
+	/// Default value of the block size. Also known as workgroup size.
+	static unsigned const c_defaultBlockSize;
+	/// Default value of the grid size
+	static unsigned const c_defaultGridSize;
+	// default number of CUDA streams
+	static unsigned const c_defaultNumStreams;

 private:
 	enum { c_max_search_results = 63, c_hash_batch_size = 1024 };

-	bool	 m_highcpu;
-	unsigned m_num_buffers;
-	unsigned m_search_batch_size;
-	unsigned m_workgroup_size;
-
 	hash128_t * m_dag_ptr;
 	hash32_t * m_header;

@ -61,5 +59,16 @@ private:
 	uint32_t ** m_search_buf;
 	cudaStream_t  * m_streams;

+	/// The local work size for the search
+	static unsigned s_blockSize;
+	/// The initial global work size for the searches
+	static unsigned s_gridSize;
+	/// The number of CUDA streams
+	static unsigned s_numStreams;
+	/// Whether or not to let the CPU wait
+	static bool s_highCPU;

+	/// GPU memory required for other things, like window rendering e.t.c.
+	/// User can set it via the --cl-extragpu-mem argument.
+	static unsigned s_extraRequiredGPUMem;
 };
--- a/libethcore/CMakeLists.txt
+++ b/libethcore/CMakeLists.txt
@ -21,7 +21,7 @@ file(GLOB HEADERS "*.h")
 add_library(${EXECUTABLE} ${SRC_LIST} ${HEADERS})

 target_link_libraries(${EXECUTABLE} ethash)
-target_link_libraries(${EXECUTABLE} evmcore)
+
 if (ETHASHCL)
 	target_link_libraries(${EXECUTABLE} ethash-cl)
 endif ()
--- a/libethcore/EthashCUDAMiner.cpp
+++ b/libethcore/EthashCUDAMiner.cpp
@ -103,6 +103,8 @@ namespace dev
 unsigned EthashCUDAMiner::s_platformId = 0;
 unsigned EthashCUDAMiner::s_deviceId = 0;
 unsigned EthashCUDAMiner::s_numInstances = 0;
+int EthashCUDAMiner::s_devices[16] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+

 EthashCUDAMiner::EthashCUDAMiner(ConstructionInfo const& _ci) :
 GenericMiner<EthashProofOfWork>(_ci),
@ -147,7 +149,7 @@ void EthashCUDAMiner::workLoop()
 			delete m_miner;
 			m_miner = new ethash_cu_miner;

-			unsigned device = instances() > 1 ? index() : s_deviceId;
+			unsigned device = instances() > 1 ? (s_devices[index()] > -1 ? s_devices[index()] : index()) : s_deviceId;

 			EthashAux::FullType dag;
 			while (true)
@ -164,7 +166,7 @@ void EthashCUDAMiner::workLoop()
 				this_thread::sleep_for(chrono::milliseconds(500));
 			}
 			bytesConstRef dagData = dag->data();
-			m_miner->init(dagData.data(), dagData.size(), s_platformId, device);
+			m_miner->init(dagData.data(), dagData.size(), device);
 		}

 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
@ -200,36 +202,33 @@ void EthashCUDAMiner::listDevices()
 }

 bool EthashCUDAMiner::configureGPU(
-	unsigned _localWorkSize,
-	unsigned _globalWorkSizeMultiplier,
-	unsigned _msPerBatch,
-	unsigned _platformId,
+	unsigned _blockSize,
+	unsigned _gridSize,
+	unsigned _numStreams,
 	unsigned _deviceId,
-	bool _allowCPU,
 	unsigned _extraGPUMemory,
+	bool	 _highcpu,
 	uint64_t _currentBlock
 	)
 {
-	s_platformId = _platformId;
 	s_deviceId = _deviceId;

-	if (_localWorkSize != 32 && _localWorkSize != 64 && _localWorkSize != 128 && _localWorkSize != 256)
+	if (_blockSize != 32 && _blockSize != 64 && _blockSize != 128)
 	{
-		cout << "Given localWorkSize of " << toString(_localWorkSize) << "is invalid. Must be either 32,64,128 or 256" << endl;
+		cout << "Given localWorkSize of " << toString(_blockSize) << "is invalid. Must be either 32,64 or 128" << endl;
 		return false;
 	}

 	if (!ethash_cu_miner::configureGPU(
-		_platformId,
-		_localWorkSize,
-		_globalWorkSizeMultiplier * _localWorkSize,
-		_msPerBatch,
-		_allowCPU,
+		_blockSize,
+		_gridSize,
+		_numStreams,
 		_extraGPUMemory,
+		_highcpu,
 		_currentBlock)
 		)
 	{
-		cout << "No GPU device with sufficient memory was found. Can't GPU mine. Remove the -G argument" << endl;
+		cout << "No CUDA device with sufficient memory was found. Can't CUDA mine. Remove the -U argument" << endl;
 		return false;
 	}
 	return true;
--- a/libethcore/EthashCUDAMiner.h
+++ b/libethcore/EthashCUDAMiner.h
@ -46,17 +46,20 @@ namespace dev
 			static unsigned getNumDevices();
 			static void listDevices();
 			static bool configureGPU(
-				unsigned _localWorkSize,
-				unsigned _globalWorkSizeMultiplier,
-				unsigned _msPerBatch,
-				unsigned _platformId,
+				unsigned _blockSize,
+				unsigned _gridSize,
+				unsigned _numStreams,
 				unsigned _deviceId,
-				bool _allowCPU,
 				unsigned _extraGPUMemory,
+				bool	 _highcpu,
 				uint64_t _currentBlock
 				);
 			static void setNumInstances(unsigned _instances) { s_numInstances = std::min<unsigned>(_instances, getNumDevices()); }
-
+			static void setDevices(unsigned * _devices, unsigned _selectedDeviceCount) {
+				for (unsigned i = 0; i < _selectedDeviceCount; i++) {
+					s_devices[i] = _devices[i];
+				}
+			}
 		protected:
 			void kickOff() override;
 			void pause() override;
@ -74,6 +77,7 @@ namespace dev
 			static unsigned s_platformId;
 			static unsigned s_deviceId;
 			static unsigned s_numInstances;
+			static int		s_devices[16];
 		};

 	}