Merge branch 'develop' of github.com:kejace/cpp-ethereum into amd_chunk_dag_upload

10 years ago · 8ca0277e4e
3 changed files with 378 additions and 156 deletions
--- a/libethash-cl/ethash_cl_miner.cpp
+++ b/libethash-cl/ethash_cl_miner.cpp
@ -24,6 +24,7 @@

 #include <cstdio>
 #include <cstdlib>
+#include <fstream>
 #include <iostream>
 #include <assert.h>
 #include <queue>
@ -42,9 +43,13 @@
 #define CL_MEM_HOST_READ_ONLY 0
 #endif

+//#define CHUNKS
+
 #undef min
 #undef max

+//#define CHUNKS
+
 using namespace std;

 static void add_definition(std::string& source, char const* id, unsigned value)
@ -131,6 +136,7 @@ void ethash_cl_miner::finish()
 bool ethash_cl_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned workgroup_size, unsigned _platformId, unsigned _deviceId)
 {
 	// get all platforms
+	try {
 		std::vector<cl::Platform> platforms;
 		cl::Platform::get(&platforms);
 		if (platforms.empty())
@ -175,7 +181,9 @@ bool ethash_cl_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned work
 		m_workgroup_size = ((workgroup_size + 7) / 8) * 8;

 		// patch source code
-	std::string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
+		std::ifstream t("ethash_cl_miner_kernel.cl");
+		std::string code((std::istreambuf_iterator<char>(t)),
+			std::istreambuf_iterator<char>());
 		add_definition(code, "GROUP_SIZE", m_workgroup_size);
 		add_definition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
 		add_definition(code, "ACCESSES", ETHASH_ACCESSES);
@ -190,39 +198,84 @@ bool ethash_cl_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned work
 		try
 		{
 			program.build({ device });
+			cout << "Printing program log" << endl;
+			cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str();
 		}
 		catch (cl::Error err)
 		{
 			cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str();
 			return false;
 		}
+		#ifdef CHUNKS
+		cout << "loading ethash_hash_chunks" << endl;
+		m_hash_kernel = cl::Kernel(program, "ethash_hash_chunks");
+		cout << "loading ethash_search_chunks" << endl;
+		m_search_kernel = cl::Kernel(program, "ethash_search_chunks");
+
+		#else
+		cout << "loading ethash_hash" << endl;
 		m_hash_kernel = cl::Kernel(program, "ethash_hash");
+		cout << "loading ethash_search" << endl;
 		m_search_kernel = cl::Kernel(program, "ethash_search");
+		#endif

 		// create buffer for dag
+		#ifdef CHUNKS
+		for (unsigned i = 0; i < 4; i++){
+            
+            cout << "Creating chunky buffer: " << i << endl;
+			m_dags[i] = cl::Buffer(m_context, CL_MEM_READ_ONLY, (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
+		}
+		#else
+		cout << "Creating one big buffer." << endl;
 		m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize);
+		#endif

 		// create buffer for header
+		cout << "Creating buffer for header." << endl;
 		m_header = cl::Buffer(m_context, CL_MEM_READ_ONLY, 32);

-	// compute dag on CPU
-	try {
-		m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
+		#ifdef CHUNKS
+		void* dag_ptr[4];
+		for (unsigned i = 0; i < 4; i++)
+		{
+			cout << "Mapping chunk " << i << endl;
+			dag_ptr[i] = m_queue.enqueueMapBuffer(m_dags[i], true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
 		}
-	catch (...)
+		for (unsigned i = 0; i < 4; i++)
 		{
-		// didn't work. shitty driver. try allocating in CPU RAM and manually memcpying it.
-		void* dag_ptr = m_queue.enqueueMapBuffer(m_dag, true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, _dagSize);
-		memcpy(dag_ptr, _dag, _dagSize);
-		m_queue.enqueueUnmapMemObject(m_dag, dag_ptr);
+			memcpy(dag_ptr[i], (char *)_dag + i*((_dagSize >> 9) << 7), (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
+			m_queue.enqueueUnmapMemObject(m_dags[i], dag_ptr[i]);
 		}
+        #else
+        cout << "Mapping chunk." << endl;
+        m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
+        #endif
+		// compute dag on CPU
+		/*{
+			m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
+
+			// if this throws then it's because we probably need to subdivide the dag uploads for compatibility
+			//		void* dag_ptr = m_queue.enqueueMapBuffer(m_dag, true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, _dagSize);
+			// memcpying 1GB: horrible... really. horrible. but necessary since we can't mmap *and* gpumap.
+			//		_fillDAG(dag_ptr);
+			//		m_queue.enqueueUnmapMemObject(m_dag, dag_ptr);
+			}*/

 		// create mining buffers
 		for (unsigned i = 0; i != c_num_buffers; ++i)
 		{
+			cout << "Creating minig buffer " << i <<endl;
 			m_hash_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY | (!m_opencl_1_1 ? CL_MEM_HOST_READ_ONLY : 0), 32 * c_hash_batch_size);
 			m_search_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_max_search_results + 1) * sizeof(uint32_t));
 		}
+		
+	}
+	catch (cl::Error err)
+	{
+		std::cout << err.what() << "(" << err.err() << ")" << std::endl;
+	}
+
 	return true;	
 }

@ -248,10 +301,22 @@ void ethash_cl_miner::hash(uint8_t* ret, uint8_t const* header, uint64_t nonce,
 		uint isolate
 		)
 	*/
+    #ifdef CHUNKS
+    cout << "Setting chunk hash arguments." << endl;
+	m_hash_kernel.setArg(1, m_header);
+	m_hash_kernel.setArg(2, m_dags[0]);
+	m_hash_kernel.setArg(3, m_dags[1]);
+	m_hash_kernel.setArg(4, m_dags[2]);
+	m_hash_kernel.setArg(5, m_dags[3]);
+	m_hash_kernel.setArg(6, nonce);
+	m_hash_kernel.setArg(7, ~0u); // have to pass this to stop the compile unrolling the loop
+	#else
+	cout << "Setting hash arguments." << endl;
 	m_hash_kernel.setArg(1, m_header);
 	m_hash_kernel.setArg(2, m_dag);
 	m_hash_kernel.setArg(3, nonce);
 	m_hash_kernel.setArg(4, ~0u); // have to pass this to stop the compile unrolling the loop
+    #endif

 	unsigned buf = 0;
 	for (unsigned i = 0; i < count || !pending.empty(); )
@ -297,6 +362,7 @@ void ethash_cl_miner::hash(uint8_t* ret, uint8_t const* header, uint64_t nonce,

 void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
 {
+	try {
 		struct pending_batch
 		{
 			uint64_t start_nonce;
@ -304,7 +370,7 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 		};
 		std::queue<pending_batch> pending;

-	uint32_t const c_zero = 0;
+		static uint32_t const c_zero = 0;

 		// update header constant buffer
 		m_queue.enqueueWriteBuffer(m_header, false, 0, 32, header);
@ -329,12 +395,28 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 		uint isolate							// 5
 		)
 		*/
+		#ifdef CHUNKS
+		cout << "Setting chunk search arguments." << endl;
+		m_search_kernel.setArg(1, m_header);
+		m_search_kernel.setArg(2, m_dags[0]);
+		m_search_kernel.setArg(3, m_dags[1]);
+		m_search_kernel.setArg(4, m_dags[2]);
+		m_search_kernel.setArg(5, m_dags[3]);
+
+		// pass these to stop the compiler unrolling the loops
+		m_search_kernel.setArg(7, target);
+		m_search_kernel.setArg(8, ~0u);
+
+        #else
+        cout << "Setting search arguments." << endl;
 		m_search_kernel.setArg(1, m_header);
 		m_search_kernel.setArg(2, m_dag);

 		// pass these to stop the compiler unrolling the loops
 		m_search_kernel.setArg(4, target);
 		m_search_kernel.setArg(5, ~0u);
+		#endif
+


 		unsigned buf = 0;
@ -344,7 +426,11 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 		{
 			// supply output buffer to kernel
 			m_search_kernel.setArg(0, m_search_buf[buf]);
+			#ifdef CHUNKS
+			m_search_kernel.setArg(6, start_nonce);
+			#else
 			m_search_kernel.setArg(3, start_nonce);
+			#endif

 			// execute it!
 			m_queue.enqueueNDRangeKernel(m_search_kernel, cl::NullRange, c_search_batch_size, m_workgroup_size);
@ -388,4 +474,8 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 			pre_return_event.wait();
 #endif
 	}
-
+	catch (cl::Error err)
+	{
+		std::cout << err.what() << "(" << err.err() << ")" << std::endl;
+	}
+}
--- a/libethash-cl/ethash_cl_miner.h
+++ b/libethash-cl/ethash_cl_miner.h
@ -41,6 +41,9 @@ public:
 	void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
 	void search(uint8_t const* header, uint64_t target, search_hook& hook);

+	void hash_chunk(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
+	void search_chunk(uint8_t const* header, uint64_t target, search_hook& hook);
+
 private:
 	enum { c_max_search_results = 63, c_num_buffers = 2, c_hash_batch_size = 1024, c_search_batch_size = 1024*256 };

@ -49,6 +52,7 @@ private:
 	cl::Kernel m_hash_kernel;
 	cl::Kernel m_search_kernel;
 	cl::Buffer m_dag;
+	cl::Buffer m_dags[4];
 	cl::Buffer m_header;
 	cl::Buffer m_hash_buf[c_num_buffers];
 	cl::Buffer m_search_buf[c_num_buffers];
--- a/libethash-cl/ethash_cl_miner_kernel.cl
+++ b/libethash-cl/ethash_cl_miner_kernel.cl
@ -243,6 +243,40 @@ hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
 	return init;
 }

+// Chunked variant of inner_loop: performs the DAG mixing rounds for one
+// work-item while reading the DAG through four separate global buffers
+// instead of a single one.
+//
+//   init       this work-item's uint4 slice of the initial hash state
+//   thread_id  index of the work-item within its hash group
+//   share      one __local uint shared by the group; carries init0 and then
+//              the DAG row index chosen for each access
+//   g_dag..g_dag3  the four DAG chunks, in ascending row order
+//   isolate    masks ACCESSES in the loop bound; the host passes ~0u so the
+//              compiler cannot unroll the loop
+//
+// Returns fnv_reduce() of the final mix.
+//
+// Chunk layout: the host uploads the first three chunks with
+// (_dagSize >> 9) << 7 bytes each and puts the remainder in the fourth.
+// Assuming 128-byte DAG rows (DAG_SIZE == _dagSize / 128 -- TODO confirm
+// ETHASH_MIX_BYTES), that is DAG_SIZE / 4 rows per chunk using integer
+// division. All boundaries below are therefore multiples of DAG_SIZE / 4.
+// Using DAG_SIZE / 2 or 3 * DAG_SIZE / 4 instead rounds differently whenever
+// DAG_SIZE % 4 >= 2 and would read past the end of g_dag1 / g_dag2.
+uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate)
+{
+	// Rows held by each of the first three chunks; the last chunk takes the rest.
+	uint const chunk_rows = DAG_SIZE / 4;
+
+	uint4 mix = init;
+
+	// share init0
+	if (thread_id == 0)
+		*share = mix.x;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	uint init0 = *share;
+
+	uint a = 0;
+	do
+	{
+		// One work-item per block of four accesses picks the DAG rows.
+		bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+		#pragma unroll
+		for (uint i = 0; i != 4; ++i)
+		{
+			if (update_share)
+			{
+				uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+				*share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			// Select the chunk that holds the row and rebase the index into it.
+			uint const row = *share;
+			mix = fnv4(mix,
+				row >= 3 * chunk_rows ? g_dag3[row - 3 * chunk_rows].uint4s[thread_id] :
+				row >= 2 * chunk_rows ? g_dag2[row - 2 * chunk_rows].uint4s[thread_id] :
+				row >= chunk_rows ? g_dag1[row - chunk_rows].uint4s[thread_id] :
+				g_dag[row].uint4s[thread_id]);
+		}
+	} while ((a += 4) != (ACCESSES & isolate));
+
+	return fnv_reduce(mix);
+}
+
+
+
+
+
+
 uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
 {
 	uint4 mix = init;
@ -276,6 +310,7 @@ uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash12
 	return fnv_reduce(mix);
 }

+
 hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
 {
 	ulong state[25];
@ -347,6 +382,7 @@ typedef union
 	hash32_t mix;
 } compute_hash_share;

+
 hash32_t compute_hash(
 	__local compute_hash_share* share,
 	__constant hash32_t const* g_header,
@ -390,6 +426,53 @@ hash32_t compute_hash(
 	return final_hash(&init, &mix, isolate);
 }

+/*
+ * Computes the hash32_t for one nonce, reading the DAG through four separate
+ * global buffers (chunked counterpart of compute_hash).
+ *
+ *   share     __local scratch array, indexed by hash_id (one element per
+ *             group of THREADS_PER_HASH work-items)
+ *   g_header  header passed to init_hash
+ *   g_dag, g_dag1, g_dag2, g_dag3
+ *             the four DAG buffers, forwarded unchanged to inner_loop_chunks
+ *   nonce     nonce for this work-item
+ *   isolate   forwarded to init_hash, inner_loop_chunks and final_hash, and
+ *             masked into the loop bound below; the host passes ~0u
+ *
+ * Returns final_hash(&init, &mix, isolate).
+ *
+ * Contains barriers, so it must be reached by every work-item of the
+ * work-group.
+ */
+hash32_t compute_hash_chunks(
+	__local compute_hash_share* share,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong nonce,
+	uint isolate
+	)
+{
+	uint const gid = get_global_id(0);
+
+	// Compute one init hash per work item.
+	hash64_t init = init_hash(g_header, nonce, isolate);
+
+	// Threads work together in this phase in groups of 8.
+	uint const thread_id = gid % THREADS_PER_HASH; // position inside the hash group
+	uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH; // which share[] element this group uses
+
+	hash32_t mix; // assigned in the round where i == thread_id
+	uint i = 0;
+	do
+	{
+		// share init with other threads
+		if (i == thread_id)
+			share[hash_id].init = init;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))]; // each work-item takes one uint4 slice of the published init
+		barrier(CLK_LOCAL_MEM_FENCE); // all reads of .init complete before .mix.uints of the same share element is used as scratch
+
+		uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate);
+
+		share[hash_id].mix.uints[thread_id] = thread_mix; // every work-item contributes one uint of the mix
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if (i == thread_id)
+			mix = share[hash_id].mix; // the work-item whose init was processed this round keeps the assembled mix
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	while (++i != (THREADS_PER_HASH & isolate)); // with isolate == ~0u this is THREADS_PER_HASH rounds
+
+	return final_hash(&init, &mix, isolate);
+}
+
 __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 __kernel void ethash_hash_simple(
 	__global hash32_t* g_hashes,
@ -415,13 +498,15 @@ __kernel void ethash_search_simple(
 {
 	uint const gid = get_global_id(0);
 	hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
-	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+
+	if (hash.ulongs[countof(hash.ulongs)-1] < target)
 	{
-		uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
+		uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
 		g_output[slot] = gid;
 	}
 }

+
 __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 __kernel void ethash_hash(
 	__global hash32_t* g_hashes,
@ -458,3 +543,46 @@ __kernel void ethash_search(
 		g_output[slot] = gid;
 	}
 }
+/*
+ * Kernel: computes one hash per work-item with the DAG supplied as four
+ * buffers, and stores it in g_hashes.
+ *
+ * Argument order matters: the host sets these by index (1 = header,
+ * 2..5 = DAG chunks, 6 = nonce, 7 = isolate when CHUNKS is defined).
+ *
+ *   g_hashes     output; element gid receives the hash for start_nonce + gid
+ *   g_header     header forwarded to compute_hash_chunks
+ *   g_dag..g_dag3  the four DAG chunk buffers
+ *   start_nonce  nonce of work-item 0; each work-item adds its global id
+ *   isolate      forwarded to compute_hash_chunks; the host passes ~0u
+ */
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_chunks(
+	__global hash32_t* g_hashes,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong start_nonce,
+	uint isolate
+	)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP]; // work-group scratch handed to compute_hash_chunks
+
+	uint const gid = get_global_id(0);
+	g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3,start_nonce + gid, isolate);
+}
+/*
+ * Kernel: hashes nonce start_nonce + gid with the DAG supplied as four
+ * buffers and records gid in g_output when the hash is below target.
+ *
+ * Argument order matters: the host sets these by index (0 = output buffer,
+ * 1 = header, 2..5 = DAG chunks, 6 = start nonce, 7 = target, 8 = isolate
+ * when CHUNKS is defined).
+ *
+ *   g_output     g_output[0] is a hit counter incremented atomically;
+ *                g_output[1..MAX_OUTPUTS] receive the global ids of hits
+ *   g_header     header forwarded to compute_hash_chunks
+ *   g_dag..g_dag3  the four DAG chunk buffers
+ *   start_nonce  nonce of work-item 0; each work-item adds its global id
+ *   target       a hit is a hash whose byte-reversed first ulong is < target
+ *   isolate      forwarded to compute_hash_chunks; the host passes ~0u
+ */
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_chunks(
+	__global volatile uint* restrict g_output,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong start_nonce,
+	ulong target,
+	uint isolate
+	)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP]; // work-group scratch handed to compute_hash_chunks
+
+	uint const gid = get_global_id(0);
+	hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
+
+	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target) // swizzle reverses the eight bytes before the comparison
+	{
+		uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1)); // clamp: hits beyond MAX_OUTPUTS all land in the last slot
+		g_output[slot] = gid;
+	}
+}