Merge pull request #2050 from LefterisJP/amd_chunk_dag_upload

OpenCL Amd chunk dag upload

Branch: cl-refactor
Gav Wood committed 10 years ago, commit e0e9f929e0
Changed files:
  1. ethminer/MinerAux.h (7)
  2. libethash-cl/ethash_cl_miner.cpp (361)
  3. libethash-cl/ethash_cl_miner.h (15)
  4. libethash-cl/ethash_cl_miner_kernel.cl (142)
  5. libethcore/Ethash.cpp (3)
  6. libethcore/Ethash.h (3)

ethminer/MinerAux.h (7)

@@ -127,6 +127,10 @@ public:
                 cerr << "Bad " << arg << " option: " << argv[i] << endl;
                 throw BadArgument();
             }
+            else if (arg == "--use-chunks")
+            {
+                dagChunks = 4;
+            }
             else if (arg == "--phone-home" && i + 1 < argc)
             {
                 string m = argv[++i];
@@ -264,6 +268,7 @@ public:
             ProofOfWork::GPUMiner::setDefaultPlatform(openclPlatform);
             ProofOfWork::GPUMiner::setDefaultDevice(openclDevice);
             ProofOfWork::GPUMiner::setNumInstances(miningThreads);
+            ProofOfWork::GPUMiner::setDagChunks(dagChunks);
         }
         if (mode == OperationMode::DAGInit)
             doInitDAG(initDAG);
@@ -301,6 +306,7 @@ public:
             << "    --opencl-platform <n>  When mining using -G/--opencl use OpenCL platform n (default: 0)." << endl
             << "    --opencl-device <n>  When mining using -G/--opencl use OpenCL device n (default: 0)." << endl
             << " -t, --mining-threads <n> Limit number of CPU/GPU miners to n (default: use everything available on selected platform)" << endl
+            << "    --use-chunks  When using GPU mining upload the DAG to the GPU in 4 chunks." << endl
             ;
     }
@@ -488,6 +494,7 @@ private:
     unsigned openclPlatform = 0;
     unsigned openclDevice = 0;
     unsigned miningThreads = UINT_MAX;
+    unsigned dagChunks = 1;
     /// DAG initialisation param.
     unsigned initDAG = 0;

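Note on the new option: --use-chunks takes no value. Passing it (e.g. ethminer -G --use-chunks) flips dagChunks from its default of 1 to 4, which is handed to the GPU miner through GPUMiner::setDagChunks() and from there to ethash_cl_miner::init() (see libethcore/Ethash.cpp below). A toy, self-contained sketch of that plumbing; GPUMinerStub is a hypothetical stand-in for ProofOfWork::GPUMiner:

#include <cassert>
#include <cstring>

struct GPUMinerStub
{
    static void setDagChunks(unsigned _dagChunks) { s_dagChunks = _dagChunks; }
    static unsigned s_dagChunks;
};
unsigned GPUMinerStub::s_dagChunks = 1;

int main(int argc, char** argv)
{
    unsigned dagChunks = 1; // default: upload the DAG as one big buffer
    for (int i = 1; i < argc; ++i)
        if (std::strcmp(argv[i], "--use-chunks") == 0)
            dagChunks = 4; // the only chunked layout the .cl kernels support
    GPUMinerStub::setDagChunks(dagChunks);
    assert(GPUMinerStub::s_dagChunks == 1 || GPUMinerStub::s_dagChunks == 4);
    return 0;
}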
libethash-cl/ethash_cl_miner.cpp (361)

@@ -24,6 +24,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <fstream>
+#include <iostream>
 #include <assert.h>
 #include <queue>
@@ -173,195 +174,249 @@ void ethash_cl_miner::finish()
 	m_queue.finish();
 }
 
-bool ethash_cl_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned workgroup_size, unsigned _platformId, unsigned _deviceId)
-{
-	// get all platforms
-	std::vector<cl::Platform> platforms;
-	cl::Platform::get(&platforms);
-	if (platforms.empty())
-	{
-		ETHCL_LOG("No OpenCL platforms found.");
-		return false;
-	}
-	// use selected platform
-	_platformId = std::min<unsigned>(_platformId, platforms.size() - 1);
-	ETHCL_LOG("Using platform: " << platforms[_platformId].getInfo<CL_PLATFORM_NAME>().c_str());
-	// get GPU device of the default platform
-	std::vector<cl::Device> devices;
-	platforms[_platformId].getDevices(CL_DEVICE_TYPE_ALL, &devices);
-	if (devices.empty())
-	{
-		ETHCL_LOG("No OpenCL devices found.");
-		return false;
-	}
-	// use selected device
-	cl::Device& device = devices[std::min<unsigned>(_deviceId, devices.size() - 1)];
-	std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
-	ETHCL_LOG("Using device: " << device.getInfo<CL_DEVICE_NAME>().c_str() << "(" << device_version.c_str() << ")");
-	if (strncmp("OpenCL 1.0", device_version.c_str(), 10) == 0)
-	{
-		ETHCL_LOG("OpenCL 1.0 is not supported.");
-		return false;
-	}
-	if (strncmp("OpenCL 1.1", device_version.c_str(), 10) == 0)
-		m_opencl_1_1 = true;
-	// create context
-	m_context = cl::Context(std::vector<cl::Device>(&device, &device + 1));
-	m_queue = cl::CommandQueue(m_context, device);
-	// use requested workgroup size, but we require multiple of 8
-	m_workgroup_size = ((workgroup_size + 7) / 8) * 8;
-	// patch source code
-	std::string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
-	add_definition(code, "GROUP_SIZE", m_workgroup_size);
-	add_definition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
-	add_definition(code, "ACCESSES", ETHASH_ACCESSES);
-	add_definition(code, "MAX_OUTPUTS", c_max_search_results);
-	//debugf("%s", code.c_str());
-	// create miner OpenCL program
-	cl::Program::Sources sources;
-	sources.push_back({code.c_str(), code.size()});
-	cl::Program program(m_context, sources);
-	try
-	{
-		program.build({device});
-	}
-	catch (cl::Error err)
-	{
-		ETHCL_LOG(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str());
-		return false;
-	}
-	m_hash_kernel = cl::Kernel(program, "ethash_hash");
-	m_search_kernel = cl::Kernel(program, "ethash_search");
-	// create buffer for dag
-	m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize);
-	// create buffer for header
-	m_header = cl::Buffer(m_context, CL_MEM_READ_ONLY, 32);
-	// compute dag on CPU
-	try {
-		m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
-	}
-	catch (...)
-	{
-		// didn't work. shitty driver. try allocating in CPU RAM and manually memcpying it.
-		void* dag_ptr = m_queue.enqueueMapBuffer(m_dag, true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, _dagSize);
-		memcpy(dag_ptr, _dag, _dagSize);
-		m_queue.enqueueUnmapMemObject(m_dag, dag_ptr);
-	}
-	// create mining buffers
-	for (unsigned i = 0; i != c_num_buffers; ++i)
-	{
-		m_hash_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY | (!m_opencl_1_1 ? CL_MEM_HOST_READ_ONLY : 0), 32*c_hash_batch_size);
-		m_search_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_max_search_results + 1) * sizeof(uint32_t));
-	}
-	return true;
-}
+bool ethash_cl_miner::init(
+	uint8_t const* _dag,
+	uint64_t _dagSize,
+	unsigned workgroup_size,
+	unsigned _platformId,
+	unsigned _deviceId,
+	unsigned _dagChunksNum
+)
+{
+	// for now due to the .cl kernels we can only have either 1 big chunk or 4 chunks
+	assert(_dagChunksNum == 1 || _dagChunksNum == 4);
+	// now create the number of chunk buffers
+	m_dagChunksNum = _dagChunksNum;
+	try
+	{
+		std::vector<cl::Platform> platforms;
+		cl::Platform::get(&platforms);
+		if (platforms.empty())
+		{
+			ETHCL_LOG("No OpenCL platforms found.");
+			return false;
+		}
+		// use selected platform
+		_platformId = std::min<unsigned>(_platformId, platforms.size() - 1);
+		ETHCL_LOG("Using platform: " << platforms[_platformId].getInfo<CL_PLATFORM_NAME>().c_str());
+		// get GPU device of the default platform
+		std::vector<cl::Device> devices;
+		platforms[_platformId].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+		if (devices.empty())
+		{
+			ETHCL_LOG("No OpenCL devices found.");
+			return false;
+		}
+		// use selected device
+		cl::Device& device = devices[std::min<unsigned>(_deviceId, devices.size() - 1)];
+		std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
+		ETHCL_LOG("Using device: " << device.getInfo<CL_DEVICE_NAME>().c_str() << "(" << device_version.c_str() << ")");
+		if (strncmp("OpenCL 1.0", device_version.c_str(), 10) == 0)
+		{
+			ETHCL_LOG("OpenCL 1.0 is not supported.");
+			return false;
+		}
+		if (strncmp("OpenCL 1.1", device_version.c_str(), 10) == 0)
+			m_opencl_1_1 = true;
+		// create context
+		m_context = cl::Context(std::vector<cl::Device>(&device, &device + 1));
+		m_queue = cl::CommandQueue(m_context, device);
+		// use requested workgroup size, but we require multiple of 8
+		m_workgroup_size = ((workgroup_size + 7) / 8) * 8;
+		// patch source code
+		// note: ETHASH_CL_MINER_KERNEL is simply ethash_cl_miner_kernel.cl compiled
+		// into a byte array by bin2h.cmake. There is no need to load the file by hand in runtime
+		std::string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
+		add_definition(code, "GROUP_SIZE", m_workgroup_size);
+		add_definition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
+		add_definition(code, "ACCESSES", ETHASH_ACCESSES);
+		add_definition(code, "MAX_OUTPUTS", c_max_search_results);
+		//debugf("%s", code.c_str());
+		// create miner OpenCL program
+		cl::Program::Sources sources;
+		sources.push_back({ code.c_str(), code.size() });
+		cl::Program program(m_context, sources);
+		try
+		{
+			program.build({ device });
+			ETHCL_LOG("Printing program log");
+			ETHCL_LOG(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str());
+		}
+		catch (cl::Error err)
+		{
+			ETHCL_LOG(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str());
+			return false;
+		}
+		if (_dagChunksNum == 1)
+		{
+			ETHCL_LOG("Loading single big chunk kernels");
+			m_hash_kernel = cl::Kernel(program, "ethash_hash");
+			m_search_kernel = cl::Kernel(program, "ethash_search");
+		}
+		else
+		{
+			ETHCL_LOG("Loading chunk kernels");
+			m_hash_kernel = cl::Kernel(program, "ethash_hash_chunks");
+			m_search_kernel = cl::Kernel(program, "ethash_search_chunks");
+		}
+		// create buffer for dag
+		if (_dagChunksNum == 1)
+		{
+			ETHCL_LOG("Creating one big buffer");
+			m_dagChunks.push_back(cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize));
+		}
+		else
+			for (unsigned i = 0; i < _dagChunksNum; i++)
+			{
+				// TODO Note: If we ever change to _dagChunksNum other than 4, then the size would need recalculation
+				ETHCL_LOG("Creating buffer for chunk " << i);
+				m_dagChunks.push_back(cl::Buffer(
+					m_context,
+					CL_MEM_READ_ONLY,
+					(i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7
+				));
+			}
+		// create buffer for header
+		ETHCL_LOG("Creating buffer for header.");
+		m_header = cl::Buffer(m_context, CL_MEM_READ_ONLY, 32);
+		if (_dagChunksNum == 1)
+		{
+			ETHCL_LOG("Mapping one big chunk.");
+			m_queue.enqueueWriteBuffer(m_dagChunks[0], CL_TRUE, 0, _dagSize, _dag);
+		}
+		else
+		{
+			// TODO Note: If we ever change to _dagChunksNum other than 4, then the size would need recalculation
+			void* dag_ptr[4];
+			for (unsigned i = 0; i < _dagChunksNum; i++)
+			{
+				ETHCL_LOG("Mapping chunk " << i);
+				dag_ptr[i] = m_queue.enqueueMapBuffer(m_dagChunks[i], true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
+			}
+			for (unsigned i = 0; i < _dagChunksNum; i++)
+			{
+				memcpy(dag_ptr[i], (char *)_dag + i*((_dagSize >> 9) << 7), (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
+				m_queue.enqueueUnmapMemObject(m_dagChunks[i], dag_ptr[i]);
+			}
+		}
+		// create mining buffers
+		for (unsigned i = 0; i != c_num_buffers; ++i)
+		{
+			ETHCL_LOG("Creating mining buffer " << i);
+			m_hash_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY | (!m_opencl_1_1 ? CL_MEM_HOST_READ_ONLY : 0), 32 * c_hash_batch_size);
+			m_search_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_max_search_results + 1) * sizeof(uint32_t));
+		}
+	}
+	catch (cl::Error err)
+	{
+		ETHCL_LOG(err.what() << "(" << err.err() << ")");
+		return false;
+	}
+	return true;
+}
 
 void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
 {
-	struct pending_batch
-	{
-		uint64_t start_nonce;
-		unsigned buf;
-	};
-	std::queue<pending_batch> pending;
-	uint32_t const c_zero = 0;
-	// update header constant buffer
-	m_queue.enqueueWriteBuffer(m_header, false, 0, 32, header);
-	for (unsigned i = 0; i != c_num_buffers; ++i)
-		m_queue.enqueueWriteBuffer(m_search_buf[i], false, 0, 4, &c_zero);
-#if CL_VERSION_1_2 && 0
-	cl::Event pre_return_event;
-	if (!m_opencl_1_1)
-		m_queue.enqueueBarrierWithWaitList(NULL, &pre_return_event);
-	else
-#endif
-		m_queue.finish();
-	/*
-	__kernel void ethash_combined_search(
-		__global hash32_t* g_hashes,         // 0
-		__constant hash32_t const* g_header, // 1
-		__global hash128_t const* g_dag,     // 2
-		ulong start_nonce,                   // 3
-		ulong target,                        // 4
-		uint isolate                         // 5
-	)
-	*/
-	m_search_kernel.setArg(1, m_header);
-	m_search_kernel.setArg(2, m_dag);
-	// pass these to stop the compiler unrolling the loops
-	m_search_kernel.setArg(4, target);
-	m_search_kernel.setArg(5, ~0u);
-	unsigned buf = 0;
-	std::random_device engine;
-	uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
-	for (; ; start_nonce += c_search_batch_size)
-	{
-		// supply output buffer to kernel
-		m_search_kernel.setArg(0, m_search_buf[buf]);
-		m_search_kernel.setArg(3, start_nonce);
-		// execute it!
-		m_queue.enqueueNDRangeKernel(m_search_kernel, cl::NullRange, c_search_batch_size, m_workgroup_size);
-		pending.push({start_nonce, buf});
-		buf = (buf + 1) % c_num_buffers;
-		// read results
-		if (pending.size() == c_num_buffers)
-		{
-			pending_batch const& batch = pending.front();
-			// could use pinned host pointer instead
-			uint32_t* results = (uint32_t*)m_queue.enqueueMapBuffer(m_search_buf[batch.buf], true, CL_MAP_READ, 0, (1+c_max_search_results) * sizeof(uint32_t));
-			unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
-			uint64_t nonces[c_max_search_results];
-			for (unsigned i = 0; i != num_found; ++i)
-			{
-				nonces[i] = batch.start_nonce + results[i+1];
-			}
-			m_queue.enqueueUnmapMemObject(m_search_buf[batch.buf], results);
-			bool exit = num_found && hook.found(nonces, num_found);
-			exit |= hook.searched(batch.start_nonce, c_search_batch_size); // always report searched before exit
-			if (exit)
-				break;
-			// reset search buffer if we're still going
-			if (num_found)
-				m_queue.enqueueWriteBuffer(m_search_buf[batch.buf], true, 0, 4, &c_zero);
-			pending.pop();
-		}
-	}
-	// not safe to return until this is ready
-#if CL_VERSION_1_2 && 0
-	if (!m_opencl_1_1)
-		pre_return_event.wait();
-#endif
+	try
+	{
+		struct pending_batch
+		{
+			uint64_t start_nonce;
+			unsigned buf;
+		};
+		std::queue<pending_batch> pending;
+		static uint32_t const c_zero = 0;
+		// update header constant buffer
+		m_queue.enqueueWriteBuffer(m_header, false, 0, 32, header);
+		for (unsigned i = 0; i != c_num_buffers; ++i)
+			m_queue.enqueueWriteBuffer(m_search_buf[i], false, 0, 4, &c_zero);
+#if CL_VERSION_1_2 && 0
+		cl::Event pre_return_event;
+		if (!m_opencl_1_1)
+			m_queue.enqueueBarrierWithWaitList(NULL, &pre_return_event);
+		else
+#endif
+			m_queue.finish();
+		unsigned argPos = 2;
+		m_search_kernel.setArg(1, m_header);
+		for (unsigned i = 0; i < m_dagChunksNum; ++i, ++argPos)
+			m_search_kernel.setArg(argPos, m_dagChunks[i]);
+		// pass these to stop the compiler unrolling the loops
+		m_search_kernel.setArg(argPos + 1, target);
+		m_search_kernel.setArg(argPos + 2, ~0u);
+		unsigned buf = 0;
+		std::random_device engine;
+		uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
+		for (;; start_nonce += c_search_batch_size)
+		{
+			// supply output buffer to kernel
+			m_search_kernel.setArg(0, m_search_buf[buf]);
+			if (m_dagChunksNum == 1)
+				m_search_kernel.setArg(3, start_nonce);
+			else
+				m_search_kernel.setArg(6, start_nonce);
+			// execute it!
+			m_queue.enqueueNDRangeKernel(m_search_kernel, cl::NullRange, c_search_batch_size, m_workgroup_size);
+			pending.push({ start_nonce, buf });
+			buf = (buf + 1) % c_num_buffers;
+			// read results
+			if (pending.size() == c_num_buffers)
+			{
+				pending_batch const& batch = pending.front();
+				// could use pinned host pointer instead
+				uint32_t* results = (uint32_t*)m_queue.enqueueMapBuffer(m_search_buf[batch.buf], true, CL_MAP_READ, 0, (1 + c_max_search_results) * sizeof(uint32_t));
+				unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
+				uint64_t nonces[c_max_search_results];
+				for (unsigned i = 0; i != num_found; ++i)
+					nonces[i] = batch.start_nonce + results[i + 1];
+				m_queue.enqueueUnmapMemObject(m_search_buf[batch.buf], results);
+				bool exit = num_found && hook.found(nonces, num_found);
+				exit |= hook.searched(batch.start_nonce, c_search_batch_size); // always report searched before exit
+				if (exit)
+					break;
+				// reset search buffer if we're still going
+				if (num_found)
+					m_queue.enqueueWriteBuffer(m_search_buf[batch.buf], true, 0, 4, &c_zero);
+				pending.pop();
+			}
+		}
+		// not safe to return until this is ready
+#if CL_VERSION_1_2 && 0
+		if (!m_opencl_1_1)
+			pre_return_event.wait();
+#endif
+	}
+	catch (cl::Error err)
+	{
+		ETHCL_LOG(err.what() << "(" << err.err() << ")");
+	}
 }

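A note on the chunk sizing used in init() above, since the shift arithmetic is easy to misread: (_dagSize >> 9) << 7 equals floor(_dagSize / 512) * 128, i.e. roughly a quarter of the DAG rounded down to a multiple of 128 bytes (one hash128_t entry); the first three chunks get that size and the fourth absorbs the remainder. A standalone sketch (not part of the PR) that prints the layout and checks the four chunks exactly cover the DAG:

#include <cassert>
#include <cstdint>
#include <iostream>

int main()
{
    uint64_t const dagSize = 1073739904; // example ~1 GiB DAG size
    uint64_t const quarter = (dagSize >> 9) << 7; // == (dagSize / 512) * 128
    uint64_t total = 0;
    for (unsigned i = 0; i < 4; i++)
    {
        uint64_t const size = (i == 3) ? (dagSize - 3 * quarter) : quarter;
        uint64_t const offset = i * quarter; // matches the memcpy source offset in init()
        std::cout << "chunk " << i << ": offset " << offset << ", size " << size << "\n";
        total += size;
    }
    assert(total == dagSize); // no gaps, no overlap
    return 0;
}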
libethash-cl/ethash_cl_miner.h (15)

@@ -37,10 +37,20 @@ public:
 	static std::string platform_info(unsigned _platformId = 0, unsigned _deviceId = 0);
 	static bool haveSufficientGPUMemory(unsigned _platformId = 0);
 
-	bool init(uint8_t const* _dag, uint64_t _dagSize, unsigned workgroup_size = 64, unsigned _platformId = 0, unsigned _deviceId = 0);
+	bool init(
+		uint8_t const* _dag,
+		uint64_t _dagSize,
+		unsigned workgroup_size = 64,
+		unsigned _platformId = 0,
+		unsigned _deviceId = 0,
+		unsigned _dagChunksNum = 1
+	);
 	void finish();
 	void search(uint8_t const* header, uint64_t target, search_hook& hook);
+
+	void hash_chunk(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
+	void search_chunk(uint8_t const* header, uint64_t target, search_hook& hook);
 
 private:
 	enum { c_max_search_results = 63, c_num_buffers = 2, c_hash_batch_size = 1024, c_search_batch_size = 1024*256 };
@@ -48,7 +58,8 @@ private:
 	cl::CommandQueue m_queue;
 	cl::Kernel m_hash_kernel;
 	cl::Kernel m_search_kernel;
-	cl::Buffer m_dag;
+	unsigned m_dagChunksNum;
+	std::vector<cl::Buffer> m_dagChunks;
 	cl::Buffer m_header;
 	cl::Buffer m_hash_buf[c_num_buffers];
 	cl::Buffer m_search_buf[c_num_buffers];

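For callers the extension is backwards compatible: _dagChunksNum defaults to 1, so existing call sites keep the single-buffer upload. A minimal usage sketch (hypothetical wrapper function, error handling elided; assumes the header above is on the include path):

#include <cstdint>
#include "ethash_cl_miner.h"

bool mineWithChunks(uint8_t const* dag, uint64_t dagSize, uint8_t const* header,
    uint64_t target, ethash_cl_miner::search_hook& hook)
{
    ethash_cl_miner miner;
    // workgroup size 64, platform 0, device 0, DAG uploaded in 4 chunks
    if (!miner.init(dag, dagSize, 64, 0, 0, 4))
        return false;
    miner.search(header, target, hook); // runs until the hook reports it should stop
    return true;
}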
libethash-cl/ethash_cl_miner_kernel.cl (142)

@@ -179,13 +179,13 @@ void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint isolate)
 		// much we try and help the compiler save VGPRs because it seems to throw
 		// that information away, hence the implementation of keccak here
 		// doesn't bother.
 		if (isolate)
 		{
 			keccak_f1600_round((uint2*)a, r++, 25);
 		}
 	}
 	while (r < 23);
 
 	// final round optimised for digest size
 	keccak_f1600_round((uint2*)a, r++, out_size);
 }
@@ -232,7 +232,7 @@ hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
 	hash64_t init;
 	uint const init_size = countof(init.ulongs);
 	uint const hash_size = countof(header->ulongs);
 
 	// sha3_512(header .. nonce)
 	ulong state[25];
 	copy(state, header->ulongs, hash_size);
@@ -243,6 +243,40 @@ hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
 	return init;
 }
 
+uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate)
+{
+	uint4 mix = init;
+
+	// share init0
+	if (thread_id == 0)
+		*share = mix.x;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	uint init0 = *share;
+
+	uint a = 0;
+	do
+	{
+		bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+		#pragma unroll
+		for (uint i = 0; i != 4; ++i)
+		{
+			if (update_share)
+			{
+				uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+				*share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			mix = fnv4(mix, *share>=3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] : *share>=DAG_SIZE / 2 ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] : *share>=DAG_SIZE / 4 ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id] : g_dag[*share].uint4s[thread_id]);
+		}
+	} while ((a += 4) != (ACCESSES & isolate));
+
+	return fnv_reduce(mix);
+}
+
 uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
 {
 	uint4 mix = init;
@@ -276,6 +310,7 @@ uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
 	return fnv_reduce(mix);
 }
 
 hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
 {
 	ulong state[25];
@@ -309,7 +344,7 @@ hash32_t compute_hash_simple(
 	{
 		mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)];
 	}
 
-	uint mix_val = mix.uints[0];
+	uint init0 = mix.uints[0];
 	uint a = 0;
@@ -333,7 +368,7 @@ hash32_t compute_hash_simple(
 	{
 		fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]);
 	}
 
 	return final_hash(&init, &fnv_mix, isolate);
 }
@@ -347,6 +382,7 @@ typedef union
 	hash32_t mix;
 } compute_hash_share;
 
 hash32_t compute_hash(
 	__local compute_hash_share* share,
 	__constant hash32_t const* g_header,
@@ -390,6 +426,53 @@ hash32_t compute_hash(
 	return final_hash(&init, &mix, isolate);
 }
 
+hash32_t compute_hash_chunks(
+	__local compute_hash_share* share,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong nonce,
+	uint isolate
+)
+{
+	uint const gid = get_global_id(0);
+
+	// Compute one init hash per work item.
+	hash64_t init = init_hash(g_header, nonce, isolate);
+
+	// Threads work together in this phase in groups of 8.
+	uint const thread_id = gid % THREADS_PER_HASH;
+	uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+
+	hash32_t mix;
+	uint i = 0;
+	do
+	{
+		// share init with other threads
+		if (i == thread_id)
+			share[hash_id].init = init;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate);
+
+		share[hash_id].mix.uints[thread_id] = thread_mix;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if (i == thread_id)
+			mix = share[hash_id].mix;
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	while (++i != (THREADS_PER_HASH & isolate));
+
+	return final_hash(&init, &mix, isolate);
+}
+
 __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 __kernel void ethash_hash_simple(
 	__global hash32_t* g_hashes,
@@ -415,13 +498,15 @@ __kernel void ethash_search_simple(
 {
 	uint const gid = get_global_id(0);
 	hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
 
-	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+	if (hash.ulongs[countof(hash.ulongs)-1] < target)
 	{
-		uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
+		uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
 		g_output[slot] = gid;
 	}
 }
 
 __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 __kernel void ethash_hash(
 	__global hash32_t* g_hashes,
@@ -458,3 +543,46 @@ __kernel void ethash_search(
 		g_output[slot] = gid;
 	}
 }
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_chunks(
+	__global hash32_t* g_hashes,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong start_nonce,
+	uint isolate
+)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP];
+
+	uint const gid = get_global_id(0);
+	g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_chunks(
+	__global volatile uint* restrict g_output,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong start_nonce,
+	ulong target,
+	uint isolate
+)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP];
+
+	uint const gid = get_global_id(0);
+	hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
+	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+	{
+		uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
+		g_output[slot] = gid;
+	}
+}

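The routing in inner_loop_chunks() is the core of the chunked variant: DAG_SIZE counts 128-byte entries (init() compiles the kernel with DAG_SIZE = _dagSize / ETHASH_MIX_BYTES), and every fetched index is mapped to one of the four buffers plus a chunk-relative offset. A host-side sketch (not in the PR) mirroring that selection arithmetic:

#include <cassert>
#include <cstdint>
#include <utility>

// Map a global DAG entry index to (chunk, index within chunk), mirroring the
// g_dag/g_dag1/g_dag2/g_dag3 ternary chain; dagEntries stands in for DAG_SIZE.
std::pair<unsigned, uint64_t> chunkOf(uint64_t idx, uint64_t dagEntries)
{
    if (idx >= 3 * dagEntries / 4) return { 3, idx - 3 * dagEntries / 4 };
    if (idx >= dagEntries / 2)     return { 2, idx - dagEntries / 2 };
    if (idx >= dagEntries / 4)     return { 1, idx - dagEntries / 4 };
    return { 0, idx };
}

int main()
{
    uint64_t const entries = 8388608; // example DAG with 2^23 128-byte entries
    assert(chunkOf(0, entries).first == 0);
    assert((chunkOf(entries / 2, entries) == std::pair<unsigned, uint64_t>{ 2, 0 }));
    assert(chunkOf(entries - 1, entries).first == 3);
    return 0;
}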
libethcore/Ethash.cpp (3)

@@ -285,6 +285,7 @@ private:
 unsigned Ethash::GPUMiner::s_platformId = 0;
 unsigned Ethash::GPUMiner::s_deviceId = 0;
 unsigned Ethash::GPUMiner::s_numInstances = 0;
+unsigned Ethash::GPUMiner::s_dagChunks = 1;
 
 Ethash::GPUMiner::GPUMiner(ConstructionInfo const& _ci):
 	Miner(_ci),
@@ -345,7 +346,7 @@ void Ethash::GPUMiner::workLoop()
 			this_thread::sleep_for(chrono::milliseconds(500));
 		}
 		bytesConstRef dagData = dag->data();
-		m_miner->init(dagData.data(), dagData.size(), 32, s_platformId, device);
+		m_miner->init(dagData.data(), dagData.size(), 32, s_platformId, device, s_dagChunks);
 	}
 
 	uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);

libethcore/Ethash.h (3)

@@ -89,6 +89,7 @@ public:
 	static std::string platformInfo();
 	static bool haveSufficientGPUMemory() { return false; }
 	static void setDefaultPlatform(unsigned) {}
+	static void setDagChunks(unsigned) {}
 	static void setDefaultDevice(unsigned) {}
 	static void setNumInstances(unsigned _instances) { s_numInstances = std::min<unsigned>(_instances, std::thread::hardware_concurrency()); }
 protected:
@@ -121,6 +122,7 @@ public:
 	static void setDefaultPlatform(unsigned _id) { s_platformId = _id; }
 	static void setDefaultDevice(unsigned _id) { s_deviceId = _id; }
 	static void setNumInstances(unsigned _instances) { s_numInstances = std::min<unsigned>(_instances, getNumDevices()); }
+	static void setDagChunks(unsigned _dagChunks) { s_dagChunks = _dagChunks; }
 protected:
 	void kickOff() override;
@@ -139,6 +141,7 @@ public:
 	static unsigned s_platformId;
 	static unsigned s_deviceId;
 	static unsigned s_numInstances;
+	static unsigned s_dagChunks;
 };
 
 #else
 using GPUMiner = CPUMiner;
