CL Argument for local work size

- The user can now also set the OpenCL local work size (workgroup size) with `--cl-local-work`. - In addition, the global work size is now specified on the command line only as a multiplier of the local work size (`--cl-global-work`).
10 years ago · db54ff3b4a
5 changed files with 62 additions and 27 deletions
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@ -129,9 +129,18 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
-		else if (arg == "--cl-global-work-size" && i + 1 < argc)
+		else if (arg == "--cl-global-work" && i + 1 < argc)
 			try {
-				m_globalWorkSize = stol(argv[++i]);
+				m_globalWorkSizeMultiplier = stol(argv[++i]);
+			}
+			catch (...)
+			{
+				cerr << "Bad " << arg << " option: " << argv[i] << endl;
+				BOOST_THROW_EXCEPTION(BadArgument());
+			}
+		else if (arg == "--cl-local-work" && i + 1 < argc)
+			try {
+				m_localWorkSize = stol(argv[++i]);
 			}
 			catch (...)
 			{
@ -285,7 +294,8 @@ public:
 		else if (m_minerType == MinerType::GPU)
 		{
 			if (!ProofOfWork::GPUMiner::configureGPU(
-					m_globalWorkSize,
+					m_localWorkSize,
+					m_globalWorkSizeMultiplier,
 					m_msPerBatch,
 					m_openclPlatform,
 					m_openclDevice,
@ -293,10 +303,7 @@ public:
 					m_extraGPUMemory,
 					m_currentBlock
 				))
-			{
-				cout << "No GPU device with sufficient memory was found. Can't GPU mine. Remove the -G argument" << endl;
 				exit(1);
-			}
 			ProofOfWork::GPUMiner::setNumInstances(m_miningThreads);
 		}
 		if (mode == OperationMode::DAGInit)
@ -339,7 +346,8 @@ public:
 			<< "    --list-devices List the detected OpenCL devices and exit." << endl
 			<< "    --current-block Let the miner know the current block number at configuration time. Will help determine DAG size and required GPU memory." << endl
 			<< "    --cl-extragpu-mem Set the memory (in MB) you believe your GPU requires for stuff other than mining. Windows rendering e.t.c.." << endl
-			<< "    --cl-global-work Set the OpenCL global work size. Default is " << toString(CL_DEFAULT_GLOBAL_WORK_SIZE) << endl
+			<< "    --cl-local-work Set the OpenCL local work size. Default is " << toString(CL_DEFAULT_LOCAL_WORK_SIZE) << endl
+			<< "    --cl-global-work Set the OpenCL global work size as a multiple of the local work size. Default is " << toString(CL_DEFAULT_GLOBAL_WORK_SIZE_MULTIPLIER) << " * " << toString(CL_DEFAULT_LOCAL_WORK_SIZE) << endl
 			<< "    --cl-ms-per-batch Set the OpenCL target milliseconds per batch (global workgroup size). Default is " << toString(CL_DEFAULT_MS_PER_BATCH) << ". If 0 is given then no autoadjustment of global work size will happen" << endl
 			;
 	}
@ -529,7 +537,8 @@ private:
 	unsigned m_miningThreads = UINT_MAX;
 	bool m_shouldListDevices = false;
 	bool m_clAllowCPU = false;
-	unsigned m_globalWorkSize = CL_DEFAULT_GLOBAL_WORK_SIZE;
+	unsigned m_globalWorkSizeMultiplier = CL_DEFAULT_GLOBAL_WORK_SIZE_MULTIPLIER;
+	unsigned m_localWorkSize = CL_DEFAULT_LOCAL_WORK_SIZE;
 	unsigned m_msPerBatch = CL_DEFAULT_MS_PER_BATCH;
 	boost::optional<uint64_t> m_currentBlock;
 	// default value is 350MB of GPU memory for other stuff (windows system rendering, e.t.c.)
--- a/libethash-cl/ethash_cl_miner.cpp
+++ b/libethash-cl/ethash_cl_miner.cpp
@ -140,6 +140,7 @@ unsigned ethash_cl_miner::getNumDevices(unsigned _platformId)

 bool ethash_cl_miner::configureGPU(
 	unsigned _platformId,
+	unsigned _localWorkSize,
 	unsigned _globalWorkSize,
 	unsigned _msPerBatch,
 	bool _allowCPU,
@ -147,6 +148,7 @@ bool ethash_cl_miner::configureGPU(
 	boost::optional<uint64_t> _currentBlock
 )
 {
+	s_workgroupSize = _localWorkSize;
 	s_initialGlobalWorkSize = _globalWorkSize;
 	s_msPerBatch = _msPerBatch;
 	s_allowCPU = _allowCPU;
@ -180,7 +182,8 @@ bool ethash_cl_miner::configureGPU(
 bool ethash_cl_miner::s_allowCPU = false;
 unsigned ethash_cl_miner::s_extraRequiredGPUMem;
 unsigned ethash_cl_miner::s_msPerBatch = CL_DEFAULT_MS_PER_BATCH;
-unsigned ethash_cl_miner::s_initialGlobalWorkSize = CL_DEFAULT_GLOBAL_WORK_SIZE;
+unsigned ethash_cl_miner::s_workgroupSize = CL_DEFAULT_LOCAL_WORK_SIZE;
+unsigned ethash_cl_miner::s_initialGlobalWorkSize = CL_DEFAULT_GLOBAL_WORK_SIZE_MULTIPLIER * CL_DEFAULT_LOCAL_WORK_SIZE;

 bool ethash_cl_miner::searchForAllDevices(function<bool(cl::Device const&)> _callback)
 {
@ -260,7 +263,6 @@ void ethash_cl_miner::finish()
 bool ethash_cl_miner::init(
 	uint8_t const* _dag,
 	uint64_t _dagSize,
-	unsigned _workgroupSize,
 	unsigned _platformId,
 	unsigned _deviceId
 )
@ -305,12 +307,10 @@ bool ethash_cl_miner::init(
 		m_context = cl::Context(vector<cl::Device>(&device, &device + 1));
 		m_queue = cl::CommandQueue(m_context, device);

-		// use requested workgroup size, but we require multiple of 8
-		m_workgroupSize = ((_workgroupSize + 7) / 8) * 8;
 		// make sure that global work size is evenly divisible by the local workgroup size
 		m_globalWorkSize = s_initialGlobalWorkSize;
-		if (m_globalWorkSize % m_workgroupSize != 0)
-			m_globalWorkSize = ((m_globalWorkSize / m_workgroupSize) + 1) * m_workgroupSize;
+		if (m_globalWorkSize % s_workgroupSize != 0)
+			m_globalWorkSize = ((m_globalWorkSize / s_workgroupSize) + 1) * s_workgroupSize;
 		// remember the device's address bits
 		m_deviceBits = device.getInfo<CL_DEVICE_ADDRESS_BITS>();

@ -318,7 +318,7 @@ bool ethash_cl_miner::init(
 		// note: ETHASH_CL_MINER_KERNEL is simply ethash_cl_miner_kernel.cl compiled
 		// into a byte array by bin2h.cmake. There is no need to load the file by hand in runtime
 		string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
-		addDefinition(code, "GROUP_SIZE", m_workgroupSize);
+		addDefinition(code, "GROUP_SIZE", s_workgroupSize);
 		addDefinition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
 		addDefinition(code, "ACCESSES", ETHASH_ACCESSES);
 		addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
@ -476,7 +476,7 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 				m_searchKernel.setArg(6, start_nonce);

 			// execute it!
-			m_queue.enqueueNDRangeKernel(m_searchKernel, cl::NullRange, m_globalWorkSize, m_workgroupSize);
+			m_queue.enqueueNDRangeKernel(m_searchKernel, cl::NullRange, m_globalWorkSize, s_workgroupSize);

 			pending.push({ start_nonce, buf });
 			buf = (buf + 1) % c_bufferCount;
@ -519,15 +519,15 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 					if (d > chrono::milliseconds(s_msPerBatch * 10 / 9))
 					{
 						// cerr << "Batch of " << m_globalWorkSize << " took " << chrono::duration_cast<chrono::milliseconds>(d).count() << " ms, >> " << _msPerBatch << " ms." << endl;
-						m_globalWorkSize = max<unsigned>(128, m_globalWorkSize + m_workgroupSize);
+						m_globalWorkSize = max<unsigned>(128, m_globalWorkSize + s_workgroupSize);
 						// cerr << "New global work size" << m_globalWorkSize << endl;
 					}
 					else if (d < chrono::milliseconds(s_msPerBatch * 9 / 10))
 					{
 						// cerr << "Batch of " << m_globalWorkSize << " took " << chrono::duration_cast<chrono::milliseconds>(d).count() << " ms, << " << _msPerBatch << " ms." << endl;
-						m_globalWorkSize = min<unsigned>(pow(2, m_deviceBits) - 1, m_globalWorkSize - m_workgroupSize);
+						m_globalWorkSize = min<unsigned>(pow(2, m_deviceBits) - 1, m_globalWorkSize - s_workgroupSize);
 						// Global work size should never be less than the workgroup size
-						m_globalWorkSize = max<unsigned>(m_workgroupSize,  m_globalWorkSize);
+						m_globalWorkSize = max<unsigned>(s_workgroupSize,  m_globalWorkSize);
 						// cerr << "New global work size" << m_globalWorkSize << endl;
 					}
 				}
--- a/libethash-cl/ethash_cl_miner.h
+++ b/libethash-cl/ethash_cl_miner.h
@ -17,7 +17,11 @@
 #include <functional>
 #include <libethash/ethash.h>

-#define CL_DEFAULT_GLOBAL_WORK_SIZE 1024 * 16
+/// Default value of the local work size. Also known as workgroup size.
+#define CL_DEFAULT_LOCAL_WORK_SIZE 64
+/// Default value of the global work size as a multiplier of the local work size
+#define CL_DEFAULT_GLOBAL_WORK_SIZE_MULTIPLIER 512 // * CL_DEFAULT_LOCAL_WORK_SIZE
+/// Default target time in milliseconds per batch (one batch = one global work size)
 #define CL_DEFAULT_MS_PER_BATCH 100

 class ethash_cl_miner
@ -48,6 +52,7 @@ public:
 	static void listDevices();
 	static bool configureGPU(
 		unsigned _platformId,
+		unsigned _localWorkSize,
 		unsigned _globalWorkSize,
 		unsigned _msPerBatch,
 		bool _allowCPU,
@ -58,7 +63,6 @@ public:
 	bool init(
 		uint8_t const* _dag,
 		uint64_t _dagSize,
-		unsigned _workgroupSize = 64,
 		unsigned _platformId = 0,
 		unsigned _deviceId = 0
 	);
@ -81,11 +85,12 @@ private:
 	cl::Buffer m_header;
 	cl::Buffer m_hashBuffer[c_bufferCount];
 	cl::Buffer m_searchBuffer[c_bufferCount];
-	unsigned m_workgroupSize;
 	unsigned m_globalWorkSize;
 	bool m_openclOnePointOne;
 	unsigned m_deviceBits;

+	/// The local work size for the search
+	static unsigned s_workgroupSize;
 	/// The initial global work size for the searches
 	static unsigned s_initialGlobalWorkSize;
 	/// The target milliseconds per batch for the search. If 0, then no adjustment will happen
--- a/libethcore/Ethash.cpp
+++ b/libethcore/Ethash.cpp
@ -373,7 +373,7 @@ void Ethash::GPUMiner::workLoop()
 				this_thread::sleep_for(chrono::milliseconds(500));
 			}
 			bytesConstRef dagData = dag->data();
-			m_miner->init(dagData.data(), dagData.size(), 32, s_platformId, device);
+			m_miner->init(dagData.data(), dagData.size(), s_platformId, device);
 		}

 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
@ -409,7 +409,8 @@ void Ethash::GPUMiner::listDevices()
 }

 bool Ethash::GPUMiner::configureGPU(
-	unsigned _globalWorkSize,
+	unsigned _localWorkSize,
+	unsigned _globalWorkSizeMultiplier,
 	unsigned _msPerBatch,
 	unsigned _platformId,
 	unsigned _deviceId,
@ -420,7 +421,26 @@ bool Ethash::GPUMiner::configureGPU(
 {
 	s_platformId = _platformId;
 	s_deviceId = _deviceId;
-	return ethash_cl_miner::configureGPU(_globalWorkSize, _msPerBatch, _allowCPU, _extraGPUMemory, _currentBlock);
+
+	if (_localWorkSize != 32 && _localWorkSize != 64 && _localWorkSize != 128)
+	{
+		cout << "Given localWorkSize of " << toString(_localWorkSize) << "is invalid. Must be either 32,64, or 128" << endl;
+		return false;
+	}
+	
+	if (!ethash_cl_miner::configureGPU(
+			_localWorkSize,
+			_globalWorkSizeMultiplier * _localWorkSize,
+			_msPerBatch,
+			_allowCPU,
+			_extraGPUMemory,
+			_currentBlock)
+	)
+	{
+		cout << "No GPU device with sufficient memory was found. Can't GPU mine. Remove the -G argument" << endl;
+		return false;
+	}
+	return true;
 }

 #endif
--- a/libethcore/Ethash.h
+++ b/libethcore/Ethash.h
@ -88,7 +88,7 @@ public:
 		static unsigned instances() { return s_numInstances > 0 ? s_numInstances : std::thread::hardware_concurrency(); }
 		static std::string platformInfo();
 		static void listDevices() {}
-		static bool configureGPU(unsigned, unsigned, unsigned, unsigned, bool, unsigned,  boost::optional<uint64_t>) { return false; }
+		static bool configureGPU(unsigned, unsigned, unsigned, unsigned, unsigned, bool, unsigned,  boost::optional<uint64_t>) { return false; }
 		static void setNumInstances(unsigned _instances) { s_numInstances = std::min<unsigned>(_instances, std::thread::hardware_concurrency()); }
 	protected:
 		void kickOff() override
@ -118,7 +118,8 @@ public:
 		static unsigned getNumDevices();
 		static void listDevices();
 		static bool configureGPU(
-			unsigned _globalWorkSize,
+			unsigned _localWorkSize,
+			unsigned _globalWorkSizeMultiplier,
 			unsigned _msPerBatch,
 			unsigned _platformId,
 			unsigned _deviceId,