Merge pull request #2298 from LefterisJP/cl_batch_size_adjust

Adjust CL miner work batch size properly
10 years ago · 80249970c5
5 changed files with 132 additions and 32 deletions
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@ -128,6 +128,33 @@ public:
 				cerr << "Bad " << arg << " option: " << argv[i] << endl;
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
+		else if (arg == "--cl-global-work" && i + 1 < argc)
+			try {
+				m_globalWorkSizeMultiplier = stol(argv[++i]);
+			}
+			catch (...)
+			{
+				cerr << "Bad " << arg << " option: " << argv[i] << endl;
+				BOOST_THROW_EXCEPTION(BadArgument());
+			}
+		else if (arg == "--cl-local-work" && i + 1 < argc)
+			try {
+				m_localWorkSize = stol(argv[++i]);
+			}
+			catch (...)
+			{
+				cerr << "Bad " << arg << " option: " << argv[i] << endl;
+				BOOST_THROW_EXCEPTION(BadArgument());
+			}
+		else if (arg == "--cl-ms-per-batch" && i + 1 < argc)
+			try {
+				m_msPerBatch = stol(argv[++i]);
+			}
+			catch (...)
+			{
+				cerr << "Bad " << arg << " option: " << argv[i] << endl;
+				BOOST_THROW_EXCEPTION(BadArgument());
+			}
 		else if (arg == "--list-devices")
 			m_shouldListDevices = true;
 		else if (arg == "--allow-opencl-cpu")
@ -266,16 +293,16 @@ public:
 		else if (m_minerType == MinerType::GPU)
 		{
 			if (!ProofOfWork::GPUMiner::configureGPU(
+					m_localWorkSize,
+					m_globalWorkSizeMultiplier,
+					m_msPerBatch,
 					m_openclPlatform,
 					m_openclDevice,
 					m_clAllowCPU,
 					m_extraGPUMemory,
 					m_currentBlock
 				))
-			{
-				cout << "No GPU device with sufficient memory was found. Can't GPU mine. Remove the -G argument" << endl;
 				exit(1);
-			}
 			ProofOfWork::GPUMiner::setNumInstances(m_miningThreads);
 		}
 		if (mode == OperationMode::DAGInit)
@ -318,6 +345,9 @@ public:
 			<< "    --list-devices List the detected OpenCL devices and exit." << endl
 			<< "    --current-block Let the miner know the current block number at configuration time. Will help determine DAG size and required GPU memory." << endl
 			<< "    --cl-extragpu-mem Set the memory (in MB) you believe your GPU requires for stuff other than mining. Windows rendering e.t.c.." << endl
+			<< "    --cl-local-work Set the OpenCL local work size. Default is " << toString(dev::eth::Ethash::defaultLocalWorkSize) << endl
+			<< "    --cl-global-work Set the OpenCL global work size as a multiple of the local work size. Default is " << toString(dev::eth::Ethash::defaultGlobalWorkSizeMultiplier) << " * " << toString(dev::eth::Ethash::defaultLocalWorkSize) << endl
+			<< "    --cl-ms-per-batch Set the OpenCL target milliseconds per batch (global workgroup size). Default is " << toString(dev::eth::Ethash::defaultMSPerBatch) << ". If 0 is given then no autoadjustment of global work size will happen" << endl
 			;
 	}

@ -506,6 +536,9 @@ private:
 	unsigned m_miningThreads = UINT_MAX;
 	bool m_shouldListDevices = false;
 	bool m_clAllowCPU = false;
+	unsigned m_globalWorkSizeMultiplier = dev::eth::Ethash::defaultGlobalWorkSizeMultiplier;
+	unsigned m_localWorkSize = dev::eth::Ethash::defaultLocalWorkSize;
+	unsigned m_msPerBatch = dev::eth::Ethash::defaultMSPerBatch;
 	boost::optional<uint64_t> m_currentBlock;
 	// default value is 350MB of GPU memory for other stuff (windows system rendering, e.t.c.)
 	unsigned m_extraGPUMemory = 350000000;
--- a/libethash-cl/ethash_cl_miner.cpp
+++ b/libethash-cl/ethash_cl_miner.cpp
@ -33,6 +33,7 @@
 #include <vector>
 #include <libethash/util.h>
 #include <libethash/ethash.h>
+#include <libethcore/Ethash.h>
 #include <libethash/internal.h>
 #include "ethash_cl_miner.h"
 #include "ethash_cl_miner_kernel.h"
@ -49,6 +50,7 @@
 #undef max

 using namespace std;
+using namespace dev::eth;

 // TODO: If at any point we can use libdevcore in here then we should switch to using a LogChannel
 #define ETHCL_LOG(_contents) cout << "[OPENCL]:" << _contents << endl
@ -140,11 +142,17 @@ unsigned ethash_cl_miner::getNumDevices(unsigned _platformId)

 bool ethash_cl_miner::configureGPU(
 	unsigned _platformId,
+	unsigned _localWorkSize,
+	unsigned _globalWorkSize,
+	unsigned _msPerBatch,
 	bool _allowCPU,
 	unsigned _extraGPUMemory,
 	boost::optional<uint64_t> _currentBlock
 )
 {
+	s_workgroupSize = _localWorkSize;
+	s_initialGlobalWorkSize = _globalWorkSize;
+	s_msPerBatch = _msPerBatch;
 	s_allowCPU = _allowCPU;
 	s_extraRequiredGPUMem = _extraGPUMemory;
 	// by default let's only consider the DAG of the first epoch
@ -175,6 +183,9 @@ bool ethash_cl_miner::configureGPU(

 bool ethash_cl_miner::s_allowCPU = false;
 unsigned ethash_cl_miner::s_extraRequiredGPUMem;
+unsigned ethash_cl_miner::s_msPerBatch = Ethash::defaultMSPerBatch;
+unsigned ethash_cl_miner::s_workgroupSize = Ethash::defaultLocalWorkSize;
+unsigned ethash_cl_miner::s_initialGlobalWorkSize = Ethash::defaultGlobalWorkSizeMultiplier * Ethash::defaultLocalWorkSize;

 bool ethash_cl_miner::searchForAllDevices(function<bool(cl::Device const&)> _callback)
 {
@ -254,7 +265,6 @@ void ethash_cl_miner::finish()
 bool ethash_cl_miner::init(
 	uint8_t const* _dag,
 	uint64_t _dagSize,
-	unsigned _workgroupSize,
 	unsigned _platformId,
 	unsigned _deviceId
 )
@ -299,14 +309,18 @@ bool ethash_cl_miner::init(
 		m_context = cl::Context(vector<cl::Device>(&device, &device + 1));
 		m_queue = cl::CommandQueue(m_context, device);

-		// use requested workgroup size, but we require multiple of 8
-		m_workgroupSize = ((_workgroupSize + 7) / 8) * 8;
+		// make sure that global work size is evenly divisible by the local workgroup size
+		m_globalWorkSize = s_initialGlobalWorkSize;
+		if (m_globalWorkSize % s_workgroupSize != 0)
+			m_globalWorkSize = ((m_globalWorkSize / s_workgroupSize) + 1) * s_workgroupSize;
+		// remember the device's address bits
+		m_deviceBits = device.getInfo<CL_DEVICE_ADDRESS_BITS>();

 		// patch source code
 		// note: ETHASH_CL_MINER_KERNEL is simply ethash_cl_miner_kernel.cl compiled
 		// into a byte array by bin2h.cmake. There is no need to load the file by hand in runtime
 		string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
-		addDefinition(code, "GROUP_SIZE", m_workgroupSize);
+		addDefinition(code, "GROUP_SIZE", s_workgroupSize);
 		addDefinition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
 		addDefinition(code, "ACCESSES", ETHASH_ACCESSES);
 		addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
@ -415,9 +429,8 @@ bool ethash_cl_miner::init(
 	return true;
 }

-void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook, unsigned _msPerBatch)
+void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
 {
-	(void)_msPerBatch;
 	try
 	{
 		struct pending_batch
@ -454,10 +467,9 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 		unsigned buf = 0;
 		random_device engine;
 		uint64_t start_nonce = uniform_int_distribution<uint64_t>()(engine);
-		for (;; start_nonce += m_batchSize)
+		for (;; start_nonce += m_globalWorkSize)
 		{
-//			chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
-
+			auto t = chrono::high_resolution_clock::now();
 			// supply output buffer to kernel
 			m_searchKernel.setArg(0, m_searchBuffer[buf]);
 			if (m_dagChunksCount == 1)
@ -466,7 +478,7 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 				m_searchKernel.setArg(6, start_nonce);

 			// execute it!
-			m_queue.enqueueNDRangeKernel(m_searchKernel, cl::NullRange, m_batchSize, m_workgroupSize);
+			m_queue.enqueueNDRangeKernel(m_searchKernel, cl::NullRange, m_globalWorkSize, s_workgroupSize);

 			pending.push({ start_nonce, buf });
 			buf = (buf + 1) % c_bufferCount;
@ -486,7 +498,7 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook

 				m_queue.enqueueUnmapMemObject(m_searchBuffer[batch.buf], results);
 				bool exit = num_found && hook.found(nonces, num_found);
-				exit |= hook.searched(batch.start_nonce, m_batchSize); // always report searched before exit
+				exit |= hook.searched(batch.start_nonce, m_globalWorkSize); // always report searched before exit
 				if (exit)
 					break;

@ -497,19 +509,31 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 				pending.pop();
 			}

-/*			chrono::high_resolution_clock::duration d = chrono::high_resolution_clock::now() - t;
-			if (d > chrono::milliseconds(_msPerBatch * 10 / 9))
+			// adjust global work size depending on last search time
+			if (s_msPerBatch)
+			{
+				// Global work size must be:
+				//  - less than or equal to 2 ^ DEVICE_BITS - 1
+				//  - divisible by local work size (workgroup size)
+				auto d = chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t);
+				if (d != chrono::milliseconds(0)) // if duration is zero, we did not get in the actual search/or search not finished
+				{
+					if (d > chrono::milliseconds(s_msPerBatch * 10 / 9)) // slower than target (10/9 tolerance): shrink the batch
 					{
-				cerr << "Batch of" << m_batchSize << "took" << chrono::duration_cast<chrono::milliseconds>(d).count() << "ms, >>" << _msPerBatch << "ms.";
-				m_batchSize = max<unsigned>(128, m_batchSize * 9 / 10);
-				cerr << "New batch size" << m_batchSize;
+						// cerr << "Batch of " << m_globalWorkSize << " took " << chrono::duration_cast<chrono::milliseconds>(d).count() << " ms, >> " << s_msPerBatch << " ms." << endl;
+						m_globalWorkSize = max<unsigned>(128, m_globalWorkSize > s_workgroupSize ? m_globalWorkSize - s_workgroupSize : s_workgroupSize); // one workgroup less, no unsigned wrap-around, floor of 128
+						// cerr << "New global work size" << m_globalWorkSize << endl;
 					}
-			else if (d < chrono::milliseconds(_msPerBatch * 9 / 10))
+					else if (d < chrono::milliseconds(s_msPerBatch * 9 / 10)) // faster than target (9/10 tolerance): grow the batch
 					{
-				cerr << "Batch of" << m_batchSize << "took" << chrono::duration_cast<chrono::milliseconds>(d).count() << "ms, <<" << _msPerBatch << "ms.";
-				m_batchSize = m_batchSize * 10 / 9;
-				cerr << "New batch size" << m_batchSize;
-			}*/
+						// cerr << "Batch of " << m_globalWorkSize << " took " << chrono::duration_cast<chrono::milliseconds>(d).count() << " ms, << " << s_msPerBatch << " ms." << endl;
+						// Grow by one workgroup only if the result still fits in 2 ^ DEVICE_BITS - 1 and in an unsigned (checked in 64-bit integer math)
+						if ((uint64_t)m_globalWorkSize + s_workgroupSize <= min<uint64_t>(~0u, m_deviceBits < 64 ? (uint64_t(1) << m_deviceBits) - 1 : ~uint64_t(0)))
+							m_globalWorkSize += s_workgroupSize;
+						// cerr << "New global work size" << m_globalWorkSize << endl;
+					}
+				}
+			}
 		}

 		// not safe to return until this is ready
--- a/libethash-cl/ethash_cl_miner.h
+++ b/libethash-cl/ethash_cl_miner.h
@ -45,6 +45,9 @@ public:
 	static void listDevices();
 	static bool configureGPU(
 		unsigned _platformId,
+		unsigned _localWorkSize,
+		unsigned _globalWorkSize,
+		unsigned _msPerBatch,
 		bool _allowCPU,
 		unsigned _extraGPUMemory,
 		boost::optional<uint64_t> _currentBlock
@ -53,12 +56,11 @@ public:
 	bool init(
 		uint8_t const* _dag,
 		uint64_t _dagSize,
-		unsigned _workgroupSize = 64,
 		unsigned _platformId = 0,
 		unsigned _deviceId = 0
 	);
 	void finish();
-	void search(uint8_t const* _header, uint64_t _target, search_hook& _hook, unsigned _msPerBatch = 100);
+	void search(uint8_t const* _header, uint64_t _target, search_hook& _hook);

 	void hash_chunk(uint8_t* _ret, uint8_t const* _header, uint64_t _nonce, unsigned _count);
 	void search_chunk(uint8_t const*_header, uint64_t _target, search_hook& _hook);
@ -76,10 +78,16 @@ private:
 	cl::Buffer m_header;
 	cl::Buffer m_hashBuffer[c_bufferCount];
 	cl::Buffer m_searchBuffer[c_bufferCount];
-	unsigned m_workgroupSize;
-	unsigned m_batchSize = c_searchBatchSize;
+	unsigned m_globalWorkSize;
 	bool m_openclOnePointOne;
+	unsigned m_deviceBits;

+	/// The local work size for the search
+	static unsigned s_workgroupSize;
+	/// The initial global work size for the searches
+	static unsigned s_initialGlobalWorkSize;
+	/// The target milliseconds per batch for the search. If 0, then no adjustment will happen
+	static unsigned s_msPerBatch;
 	/// Allow CPU to appear as an OpenCL device or not. Default is false
 	static bool s_allowCPU;
 	/// GPU memory required for other things, like window rendering e.t.c.
--- a/libethcore/Ethash.cpp
+++ b/libethcore/Ethash.cpp
@ -54,6 +54,9 @@ namespace dev
 namespace eth
 {

+const unsigned Ethash::defaultLocalWorkSize = 64; // OpenCL local work size (workgroup size)
+const unsigned Ethash::defaultGlobalWorkSizeMultiplier = 512; // multiplied by the local work size to get the global work size
+const unsigned Ethash::defaultMSPerBatch = 100; // target milliseconds per batch
 const Ethash::WorkPackage Ethash::NullWorkPackage = Ethash::WorkPackage();

 std::string Ethash::name()
@ -373,7 +376,7 @@ void Ethash::GPUMiner::workLoop()
 				this_thread::sleep_for(chrono::milliseconds(500));
 			}
 			bytesConstRef dagData = dag->data();
-			m_miner->init(dagData.data(), dagData.size(), 32, s_platformId, device);
+			m_miner->init(dagData.data(), dagData.size(), s_platformId, device);
 		}

 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
@ -409,6 +412,9 @@ void Ethash::GPUMiner::listDevices()
 }

 bool Ethash::GPUMiner::configureGPU(
+	unsigned _localWorkSize,
+	unsigned _globalWorkSizeMultiplier,
+	unsigned _msPerBatch,
 	unsigned _platformId,
 	unsigned _deviceId,
 	bool _allowCPU,
@ -418,7 +424,27 @@ bool Ethash::GPUMiner::configureGPU(
 {
 	s_platformId = _platformId;
 	s_deviceId = _deviceId;
-	return ethash_cl_miner::configureGPU(_platformId, _allowCPU, _extraGPUMemory, _currentBlock);
+	// Reject any local work size other than 32, 64 or 128 before configuring the OpenCL miner.
+	if (_localWorkSize != 32 && _localWorkSize != 64 && _localWorkSize != 128)
+	{
+		cout << "Given localWorkSize of " << toString(_localWorkSize) << " is invalid. Must be either 32,64, or 128" << endl;
+		return false;
+	}
+	// Forward to the OpenCL miner, which takes the absolute global work size rather than the multiplier.
+	if (!ethash_cl_miner::configureGPU(
+			_platformId,
+			_localWorkSize,
+			_globalWorkSizeMultiplier * _localWorkSize,
+			_msPerBatch,
+			_allowCPU,
+			_extraGPUMemory,
+			_currentBlock)
+	)
+	{
+		cout << "No GPU device with sufficient memory was found. Can't GPU mine. Remove the -G argument" << endl;
+		return false;
+	}
+	return true;
 }

 #endif
--- a/libethcore/Ethash.h
+++ b/libethcore/Ethash.h
@ -88,7 +88,7 @@ public:
 		static unsigned instances() { return s_numInstances > 0 ? s_numInstances : std::thread::hardware_concurrency(); }
 		static std::string platformInfo();
 		static void listDevices() {}
-		static bool configureGPU(unsigned, unsigned, bool, unsigned,  boost::optional<uint64_t>) { return false; }
+		static bool configureGPU(unsigned, unsigned, unsigned, unsigned, unsigned, bool, unsigned,  boost::optional<uint64_t>) { return false; }
 		static void setNumInstances(unsigned _instances) { s_numInstances = std::min<unsigned>(_instances, std::thread::hardware_concurrency()); }
 	protected:
 		void kickOff() override
@ -118,6 +118,9 @@ public:
 		static unsigned getNumDevices();
 		static void listDevices();
 		static bool configureGPU(
+			unsigned _localWorkSize,
+			unsigned _globalWorkSizeMultiplier,
+			unsigned _msPerBatch,
 			unsigned _platformId,
 			unsigned _deviceId,
 			bool _allowCPU,
@ -147,6 +150,12 @@ public:
 #else
 	using GPUMiner = CPUMiner;
 #endif
+	/// Default value of the local work size. Also known as workgroup size.
+	static const unsigned defaultLocalWorkSize;
+	/// Default value of the global work size as a multiplier of the local work size
+	static const unsigned defaultGlobalWorkSizeMultiplier;
+	/// Default value of the milliseconds per global work size (per batch)
+	static const unsigned defaultMSPerBatch;
 };

 }