Browse Source

Merge branch 'develop' of github.com:kejace/cpp-ethereum into amd_chunk_dag_upload

cl-refactor
Lefteris Karapetsas 10 years ago
parent
commit
8ca0277e4e
  1. 126
      libethash-cl/ethash_cl_miner.cpp
  2. 4
      libethash-cl/ethash_cl_miner.h
  3. 132
      libethash-cl/ethash_cl_miner_kernel.cl

126
libethash-cl/ethash_cl_miner.cpp

@ -24,6 +24,7 @@
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <fstream>
#include <iostream> #include <iostream>
#include <assert.h> #include <assert.h>
#include <queue> #include <queue>
@ -42,9 +43,13 @@
#define CL_MEM_HOST_READ_ONLY 0 #define CL_MEM_HOST_READ_ONLY 0
#endif #endif
//#define CHUNKS
#undef min #undef min
#undef max #undef max
//#define CHUNKS
using namespace std; using namespace std;
static void add_definition(std::string& source, char const* id, unsigned value) static void add_definition(std::string& source, char const* id, unsigned value)
@ -131,6 +136,7 @@ void ethash_cl_miner::finish()
bool ethash_cl_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned workgroup_size, unsigned _platformId, unsigned _deviceId) bool ethash_cl_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned workgroup_size, unsigned _platformId, unsigned _deviceId)
{ {
// get all platforms // get all platforms
try {
std::vector<cl::Platform> platforms; std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms); cl::Platform::get(&platforms);
if (platforms.empty()) if (platforms.empty())
@ -175,7 +181,9 @@ bool ethash_cl_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned work
m_workgroup_size = ((workgroup_size + 7) / 8) * 8; m_workgroup_size = ((workgroup_size + 7) / 8) * 8;
// patch source code // patch source code
std::string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE); std::ifstream t("ethash_cl_miner_kernel.cl");
std::string code((std::istreambuf_iterator<char>(t)),
std::istreambuf_iterator<char>());
add_definition(code, "GROUP_SIZE", m_workgroup_size); add_definition(code, "GROUP_SIZE", m_workgroup_size);
add_definition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES)); add_definition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
add_definition(code, "ACCESSES", ETHASH_ACCESSES); add_definition(code, "ACCESSES", ETHASH_ACCESSES);
@ -184,45 +192,90 @@ bool ethash_cl_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned work
// create miner OpenCL program // create miner OpenCL program
cl::Program::Sources sources; cl::Program::Sources sources;
sources.push_back({code.c_str(), code.size()}); sources.push_back({ code.c_str(), code.size() });
cl::Program program(m_context, sources); cl::Program program(m_context, sources);
try try
{ {
program.build({device}); program.build({ device });
cout << "Printing program log" << endl;
cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str();
} }
catch (cl::Error err) catch (cl::Error err)
{ {
cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str(); cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str();
return false; return false;
} }
#ifdef CHUNKS
cout << "loading ethash_hash_chunks" << endl;
m_hash_kernel = cl::Kernel(program, "ethash_hash_chunks");
cout << "loading ethash_search_chunks" << endl;
m_search_kernel = cl::Kernel(program, "ethash_search_chunks");
#else
cout << "loading ethash_hash" << endl;
m_hash_kernel = cl::Kernel(program, "ethash_hash"); m_hash_kernel = cl::Kernel(program, "ethash_hash");
cout << "loading ethash_search" << endl;
m_search_kernel = cl::Kernel(program, "ethash_search"); m_search_kernel = cl::Kernel(program, "ethash_search");
#endif
// create buffer for dag // create buffer for dag
#ifdef CHUNKS
for (unsigned i = 0; i < 4; i++){
cout << "Creating chunky buffer: " << i << endl;
m_dags[i] = cl::Buffer(m_context, CL_MEM_READ_ONLY, (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
}
#else
cout << "Creating one big buffer." << endl;
m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize); m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize);
#endif
// create buffer for header // create buffer for header
cout << "Creating buffer for header." << endl;
m_header = cl::Buffer(m_context, CL_MEM_READ_ONLY, 32); m_header = cl::Buffer(m_context, CL_MEM_READ_ONLY, 32);
// compute dag on CPU #ifdef CHUNKS
try { void* dag_ptr[4];
m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag); for (unsigned i = 0; i < 4; i++)
{
cout << "Mapping chunk " << i << endl;
dag_ptr[i] = m_queue.enqueueMapBuffer(m_dags[i], true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
} }
catch (...) for (unsigned i = 0; i < 4; i++)
{ {
// didn't work. shitty driver. try allocating in CPU RAM and manually memcpying it. memcpy(dag_ptr[i], (char *)_dag + i*((_dagSize >> 9) << 7), (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
void* dag_ptr = m_queue.enqueueMapBuffer(m_dag, true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, _dagSize); m_queue.enqueueUnmapMemObject(m_dags[i], dag_ptr[i]);
memcpy(dag_ptr, _dag, _dagSize);
m_queue.enqueueUnmapMemObject(m_dag, dag_ptr);
} }
#else
cout << "Mapping chunk." << endl;
m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
#endif
// compute dag on CPU
/*{
m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
// if this throws then it's because we probably need to subdivide the dag uploads for compatibility
// void* dag_ptr = m_queue.enqueueMapBuffer(m_dag, true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, _dagSize);
// memcpying 1GB: horrible... really. horrible. but necessary since we can't mmap *and* gpumap.
// _fillDAG(dag_ptr);
// m_queue.enqueueUnmapMemObject(m_dag, dag_ptr);
}*/
// create mining buffers // create mining buffers
for (unsigned i = 0; i != c_num_buffers; ++i) for (unsigned i = 0; i != c_num_buffers; ++i)
{ {
m_hash_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY | (!m_opencl_1_1 ? CL_MEM_HOST_READ_ONLY : 0), 32*c_hash_batch_size); cout << "Creating minig buffer " << i <<endl;
m_hash_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY | (!m_opencl_1_1 ? CL_MEM_HOST_READ_ONLY : 0), 32 * c_hash_batch_size);
m_search_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_max_search_results + 1) * sizeof(uint32_t)); m_search_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_max_search_results + 1) * sizeof(uint32_t));
} }
}
catch (cl::Error err)
{
std::cout << err.what() << "(" << err.err() << ")" << std::endl;
}
return true; return true;
} }
@ -248,10 +301,22 @@ void ethash_cl_miner::hash(uint8_t* ret, uint8_t const* header, uint64_t nonce,
uint isolate uint isolate
) )
*/ */
#ifdef CHUNKS
cout << "Setting chunk hash arguments." << endl;
m_hash_kernel.setArg(1, m_header);
m_hash_kernel.setArg(2, m_dags[0]);
m_hash_kernel.setArg(3, m_dags[1]);
m_hash_kernel.setArg(4, m_dags[2]);
m_hash_kernel.setArg(5, m_dags[3]);
m_hash_kernel.setArg(6, nonce);
m_hash_kernel.setArg(7, ~0u); // have to pass this to stop the compile unrolling the loop
#else
cout << "Setting hash arguments." << endl;
m_hash_kernel.setArg(1, m_header); m_hash_kernel.setArg(1, m_header);
m_hash_kernel.setArg(2, m_dag); m_hash_kernel.setArg(2, m_dag);
m_hash_kernel.setArg(3, nonce); m_hash_kernel.setArg(3, nonce);
m_hash_kernel.setArg(4, ~0u); // have to pass this to stop the compile unrolling the loop m_hash_kernel.setArg(4, ~0u); // have to pass this to stop the compile unrolling the loop
#endif
unsigned buf = 0; unsigned buf = 0;
for (unsigned i = 0; i < count || !pending.empty(); ) for (unsigned i = 0; i < count || !pending.empty(); )
@ -297,6 +362,7 @@ void ethash_cl_miner::hash(uint8_t* ret, uint8_t const* header, uint64_t nonce,
void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook) void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
{ {
try {
struct pending_batch struct pending_batch
{ {
uint64_t start_nonce; uint64_t start_nonce;
@ -304,7 +370,7 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
}; };
std::queue<pending_batch> pending; std::queue<pending_batch> pending;
uint32_t const c_zero = 0; static uint32_t const c_zero = 0;
// update header constant buffer // update header constant buffer
m_queue.enqueueWriteBuffer(m_header, false, 0, 32, header); m_queue.enqueueWriteBuffer(m_header, false, 0, 32, header);
@ -329,27 +395,47 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
uint isolate // 5 uint isolate // 5
) )
*/ */
#ifdef CHUNKS
cout << "Setting chunk search arguments." << endl;
m_search_kernel.setArg(1, m_header);
m_search_kernel.setArg(2, m_dags[0]);
m_search_kernel.setArg(3, m_dags[1]);
m_search_kernel.setArg(4, m_dags[2]);
m_search_kernel.setArg(5, m_dags[3]);
// pass these to stop the compiler unrolling the loops
m_search_kernel.setArg(7, target);
m_search_kernel.setArg(8, ~0u);
#else
cout << "Setting search arguments." << endl;
m_search_kernel.setArg(1, m_header); m_search_kernel.setArg(1, m_header);
m_search_kernel.setArg(2, m_dag); m_search_kernel.setArg(2, m_dag);
// pass these to stop the compiler unrolling the loops // pass these to stop the compiler unrolling the loops
m_search_kernel.setArg(4, target); m_search_kernel.setArg(4, target);
m_search_kernel.setArg(5, ~0u); m_search_kernel.setArg(5, ~0u);
#endif
unsigned buf = 0; unsigned buf = 0;
std::random_device engine; std::random_device engine;
uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine); uint64_t start_nonce = std::uniform_int_distribution<uint64_t>()(engine);
for (; ; start_nonce += c_search_batch_size) for (;; start_nonce += c_search_batch_size)
{ {
// supply output buffer to kernel // supply output buffer to kernel
m_search_kernel.setArg(0, m_search_buf[buf]); m_search_kernel.setArg(0, m_search_buf[buf]);
#ifdef CHUNKS
m_search_kernel.setArg(6, start_nonce);
#else
m_search_kernel.setArg(3, start_nonce); m_search_kernel.setArg(3, start_nonce);
#endif
// execute it! // execute it!
m_queue.enqueueNDRangeKernel(m_search_kernel, cl::NullRange, c_search_batch_size, m_workgroup_size); m_queue.enqueueNDRangeKernel(m_search_kernel, cl::NullRange, c_search_batch_size, m_workgroup_size);
pending.push({start_nonce, buf}); pending.push({ start_nonce, buf });
buf = (buf + 1) % c_num_buffers; buf = (buf + 1) % c_num_buffers;
// read results // read results
@ -358,13 +444,13 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
pending_batch const& batch = pending.front(); pending_batch const& batch = pending.front();
// could use pinned host pointer instead // could use pinned host pointer instead
uint32_t* results = (uint32_t*)m_queue.enqueueMapBuffer(m_search_buf[batch.buf], true, CL_MAP_READ, 0, (1+c_max_search_results) * sizeof(uint32_t)); uint32_t* results = (uint32_t*)m_queue.enqueueMapBuffer(m_search_buf[batch.buf], true, CL_MAP_READ, 0, (1 + c_max_search_results) * sizeof(uint32_t));
unsigned num_found = std::min<unsigned>(results[0], c_max_search_results); unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
uint64_t nonces[c_max_search_results]; uint64_t nonces[c_max_search_results];
for (unsigned i = 0; i != num_found; ++i) for (unsigned i = 0; i != num_found; ++i)
{ {
nonces[i] = batch.start_nonce + results[i+1]; nonces[i] = batch.start_nonce + results[i + 1];
} }
m_queue.enqueueUnmapMemObject(m_search_buf[batch.buf], results); m_queue.enqueueUnmapMemObject(m_search_buf[batch.buf], results);
@ -387,5 +473,9 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
if (!m_opencl_1_1) if (!m_opencl_1_1)
pre_return_event.wait(); pre_return_event.wait();
#endif #endif
}
catch (cl::Error err)
{
std::cout << err.what() << "(" << err.err() << ")" << std::endl;
}
} }

4
libethash-cl/ethash_cl_miner.h

@ -41,6 +41,9 @@ public:
void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count); void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
void search(uint8_t const* header, uint64_t target, search_hook& hook); void search(uint8_t const* header, uint64_t target, search_hook& hook);
void hash_chunk(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
void search_chunk(uint8_t const* header, uint64_t target, search_hook& hook);
private: private:
enum { c_max_search_results = 63, c_num_buffers = 2, c_hash_batch_size = 1024, c_search_batch_size = 1024*256 }; enum { c_max_search_results = 63, c_num_buffers = 2, c_hash_batch_size = 1024, c_search_batch_size = 1024*256 };
@ -49,6 +52,7 @@ private:
cl::Kernel m_hash_kernel; cl::Kernel m_hash_kernel;
cl::Kernel m_search_kernel; cl::Kernel m_search_kernel;
cl::Buffer m_dag; cl::Buffer m_dag;
cl::Buffer m_dags[4];
cl::Buffer m_header; cl::Buffer m_header;
cl::Buffer m_hash_buf[c_num_buffers]; cl::Buffer m_hash_buf[c_num_buffers];
cl::Buffer m_search_buf[c_num_buffers]; cl::Buffer m_search_buf[c_num_buffers];

132
libethash-cl/ethash_cl_miner_kernel.cl

@ -243,6 +243,40 @@ hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
return init; return init;
} }
uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate)
{
uint4 mix = init;
// share init0
if (thread_id == 0)
*share = mix.x;
barrier(CLK_LOCAL_MEM_FENCE);
uint init0 = *share;
uint a = 0;
do
{
bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
#pragma unroll
for (uint i = 0; i != 4; ++i)
{
if (update_share)
{
uint m[4] = { mix.x, mix.y, mix.z, mix.w };
*share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
}
barrier(CLK_LOCAL_MEM_FENCE);
mix = fnv4(mix, *share>=3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] : *share>=DAG_SIZE / 2 ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] : *share>=DAG_SIZE / 4 ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id]:g_dag[*share].uint4s[thread_id]);
}
} while ((a += 4) != (ACCESSES & isolate));
return fnv_reduce(mix);
}
uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate) uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
{ {
uint4 mix = init; uint4 mix = init;
@ -276,6 +310,7 @@ uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash12
return fnv_reduce(mix); return fnv_reduce(mix);
} }
hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate) hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
{ {
ulong state[25]; ulong state[25];
@ -347,6 +382,7 @@ typedef union
hash32_t mix; hash32_t mix;
} compute_hash_share; } compute_hash_share;
hash32_t compute_hash( hash32_t compute_hash(
__local compute_hash_share* share, __local compute_hash_share* share,
__constant hash32_t const* g_header, __constant hash32_t const* g_header,
@ -390,6 +426,53 @@ hash32_t compute_hash(
return final_hash(&init, &mix, isolate); return final_hash(&init, &mix, isolate);
} }
hash32_t compute_hash_chunks(
__local compute_hash_share* share,
__constant hash32_t const* g_header,
__global hash128_t const* g_dag,
__global hash128_t const* g_dag1,
__global hash128_t const* g_dag2,
__global hash128_t const* g_dag3,
ulong nonce,
uint isolate
)
{
uint const gid = get_global_id(0);
// Compute one init hash per work item.
hash64_t init = init_hash(g_header, nonce, isolate);
// Threads work together in this phase in groups of 8.
uint const thread_id = gid % THREADS_PER_HASH;
uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
hash32_t mix;
uint i = 0;
do
{
// share init with other threads
if (i == thread_id)
share[hash_id].init = init;
barrier(CLK_LOCAL_MEM_FENCE);
uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
barrier(CLK_LOCAL_MEM_FENCE);
uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate);
share[hash_id].mix.uints[thread_id] = thread_mix;
barrier(CLK_LOCAL_MEM_FENCE);
if (i == thread_id)
mix = share[hash_id].mix;
barrier(CLK_LOCAL_MEM_FENCE);
}
while (++i != (THREADS_PER_HASH & isolate));
return final_hash(&init, &mix, isolate);
}
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_hash_simple( __kernel void ethash_hash_simple(
__global hash32_t* g_hashes, __global hash32_t* g_hashes,
@ -415,13 +498,15 @@ __kernel void ethash_search_simple(
{ {
uint const gid = get_global_id(0); uint const gid = get_global_id(0);
hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate); hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
if (hash.ulongs[countof(hash.ulongs)-1] < target)
{ {
uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1); uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
g_output[slot] = gid; g_output[slot] = gid;
} }
} }
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_hash( __kernel void ethash_hash(
__global hash32_t* g_hashes, __global hash32_t* g_hashes,
@ -458,3 +543,46 @@ __kernel void ethash_search(
g_output[slot] = gid; g_output[slot] = gid;
} }
} }
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_hash_chunks(
__global hash32_t* g_hashes,
__constant hash32_t const* g_header,
__global hash128_t const* g_dag,
__global hash128_t const* g_dag1,
__global hash128_t const* g_dag2,
__global hash128_t const* g_dag3,
ulong start_nonce,
uint isolate
)
{
__local compute_hash_share share[HASHES_PER_LOOP];
uint const gid = get_global_id(0);
g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3,start_nonce + gid, isolate);
}
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_search_chunks(
__global volatile uint* restrict g_output,
__constant hash32_t const* g_header,
__global hash128_t const* g_dag,
__global hash128_t const* g_dag1,
__global hash128_t const* g_dag2,
__global hash128_t const* g_dag3,
ulong start_nonce,
ulong target,
uint isolate
)
{
__local compute_hash_share share[HASHES_PER_LOOP];
uint const gid = get_global_id(0);
hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
{
uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
g_output[slot] = gid;
}
}
Loading…
Cancel
Save