diff --git a/libethash-cl/ethash_cl_miner.cpp b/libethash-cl/ethash_cl_miner.cpp index 28afc6e58..ec45be1a8 100644 --- a/libethash-cl/ethash_cl_miner.cpp +++ b/libethash-cl/ethash_cl_miner.cpp @@ -401,7 +401,6 @@ bool ethash_cl_miner::init( ETHCL_LOG("Creating one big buffer for the DAG"); m_dagChunks.push_back(cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize)); ETHCL_LOG("Loading single big chunk kernels"); - m_hashKernel = cl::Kernel(program, "ethash_hash"); m_searchKernel = cl::Kernel(program, "ethash_search"); ETHCL_LOG("Mapping one big chunk."); m_queue.enqueueWriteBuffer(m_dagChunks[0], CL_TRUE, 0, _dagSize, _dag); @@ -410,46 +409,6 @@ bool ethash_cl_miner::init( { ETHCL_LOG("Allocating/mapping single buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing."); return false; -#if 0 // Disabling chunking for release since it seems not to work. Never manages to mine a block. TODO: Fix when time is found. - int errCode = err.err(); - if (errCode != CL_INVALID_BUFFER_SIZE || errCode != CL_MEM_OBJECT_ALLOCATION_FAILURE) - ETHCL_LOG("Allocating/mapping single buffer failed with: " << err.what() << "(" << errCode << ")"); - cl_ulong result; - // if we fail midway on the try above make sure we start clean - m_dagChunks.clear(); - device.getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &result); - ETHCL_LOG( - "Failed to allocate 1 big chunk. Max allocateable memory is " - << result << ". Trying to allocate 4 chunks." - ); - // The OpenCL kernel has a hard coded number of 4 chunks at the moment - m_dagChunksCount = 4; - for (unsigned i = 0; i < m_dagChunksCount; i++) - { - // TODO Note: If we ever change to _dagChunksNum other than 4, then the size would need recalculation - ETHCL_LOG("Creating buffer for chunk " << i); - m_dagChunks.push_back(cl::Buffer( - m_context, - CL_MEM_READ_ONLY, - (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7 - )); - } - ETHCL_LOG("Loading chunk kernels"); - m_hashKernel = cl::Kernel(program, "ethash_hash_chunks"); - m_searchKernel = cl::Kernel(program, "ethash_search_chunks"); - // TODO Note: If we ever change to _dagChunksNum other than 4, then the size would need recalculation - void* dag_ptr[4]; - for (unsigned i = 0; i < m_dagChunksCount; i++) - { - ETHCL_LOG("Mapping chunk " << i); - dag_ptr[i] = m_queue.enqueueMapBuffer(m_dagChunks[i], true, m_openclOnePointOne ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7); - } - for (unsigned i = 0; i < m_dagChunksCount; i++) - { - memcpy(dag_ptr[i], (char *)_dag + i*((_dagSize >> 9) << 7), (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7); - m_queue.enqueueUnmapMemObject(m_dagChunks[i], dag_ptr[i]); - } -#endif } // create buffer for header ETHCL_LOG("Creating buffer for header."); @@ -459,7 +418,6 @@ bool ethash_cl_miner::init( for (unsigned i = 0; i != c_bufferCount; ++i) { ETHCL_LOG("Creating mining buffer " << i); - m_hashBuffer[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY | (!m_openclOnePointOne ? CL_MEM_HOST_READ_ONLY : 0), 32 * c_hashBatchSize); m_searchBuffer[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_maxSearchResults + 1) * sizeof(uint32_t)); } } diff --git a/libethash-cl/ethash_cl_miner.h b/libethash-cl/ethash_cl_miner.h index c51c38fb6..4f10ef754 100644 --- a/libethash-cl/ethash_cl_miner.h +++ b/libethash-cl/ethash_cl_miner.h @@ -79,12 +79,10 @@ private: cl::Context m_context; cl::CommandQueue m_queue; - cl::Kernel m_hashKernel; cl::Kernel m_searchKernel; unsigned int m_dagChunksCount; std::vector m_dagChunks; cl::Buffer m_header; - cl::Buffer m_hashBuffer[c_bufferCount]; cl::Buffer m_searchBuffer[c_bufferCount]; unsigned m_globalWorkSize; bool m_openclOnePointOne; diff --git a/libethash-cl/ethash_cl_miner_kernel.cl b/libethash-cl/ethash_cl_miner_kernel.cl index 8ea6df12d..191021ffd 100644 --- a/libethash-cl/ethash_cl_miner_kernel.cl +++ b/libethash-cl/ethash_cl_miner_kernel.cl @@ -112,17 +112,18 @@ static void keccak_f1600_round(uint2* a, uint r, uint out_size) // Chi a[0] = bitselect(b[0] ^ b[2], b[0], b[1]); - a[1] = bitselect(b[1] ^ b[3], b[1], b[2]); - a[2] = bitselect(b[2] ^ b[4], b[2], b[3]); - a[3] = bitselect(b[3] ^ b[0], b[3], b[4]); - if (out_size >= 4) + + if (out_size > 4) { + a[1] = bitselect(b[1] ^ b[3], b[1], b[2]); + a[2] = bitselect(b[2] ^ b[4], b[2], b[3]); + a[3] = bitselect(b[3] ^ b[0], b[3], b[4]); a[4] = bitselect(b[4] ^ b[1], b[4], b[0]); a[5] = bitselect(b[5] ^ b[7], b[5], b[6]); a[6] = bitselect(b[6] ^ b[8], b[6], b[7]); a[7] = bitselect(b[7] ^ b[9], b[7], b[8]); a[8] = bitselect(b[8] ^ b[5], b[8], b[9]); - if (out_size >= 8) + if (out_size > 8) { a[9] = bitselect(b[9] ^ b[6], b[9], b[5]); a[10] = bitselect(b[10] ^ b[12], b[10], b[11]); @@ -227,163 +228,13 @@ typedef union uint4 uint4s[128 / sizeof(uint4)]; } hash128_t; -static hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate) -{ - hash64_t init; - uint const init_size = countof(init.ulongs); - uint const hash_size = countof(header->ulongs); - - // sha3_512(header .. nonce) - ulong state[25]; - copy(state, header->ulongs, hash_size); - state[hash_size] = nonce; - keccak_f1600_no_absorb(state, hash_size + 1, init_size, isolate); - - copy(init.ulongs, state, init_size); - return init; -} - -static uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate) -{ - uint4 mix = init; - - // share init0 - if (thread_id == 0) - *share = mix.x; - barrier(CLK_LOCAL_MEM_FENCE); - uint init0 = *share; - - uint a = 0; - do - { - bool update_share = thread_id == (a/4) % THREADS_PER_HASH; - - #pragma unroll - for (uint i = 0; i != 4; ++i) - { - if (update_share) - { - uint m[4] = { mix.x, mix.y, mix.z, mix.w }; - *share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE; - } - barrier(CLK_LOCAL_MEM_FENCE); - - mix = fnv4(mix, *share>=3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] : *share>=DAG_SIZE / 2 ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] : *share>=DAG_SIZE / 4 ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id]:g_dag[*share].uint4s[thread_id]); - } - } while ((a += 4) != (ACCESSES & isolate)); - - return fnv_reduce(mix); -} - - - -static uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate) -{ - uint4 mix = init; - - // share init0 - if (thread_id == 0) - *share = mix.x; - barrier(CLK_LOCAL_MEM_FENCE); - uint init0 = *share; - - uint a = 0; - do - { - bool update_share = thread_id == (a/4) % THREADS_PER_HASH; - - #pragma unroll - for (uint i = 0; i != 4; ++i) - { - if (update_share) - { - uint m[4] = { mix.x, mix.y, mix.z, mix.w }; - *share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE; - } - barrier(CLK_LOCAL_MEM_FENCE); - - mix = fnv4(mix, g_dag[*share].uint4s[thread_id]); - } - } - while ((a += 4) != (ACCESSES & isolate)); - - return fnv_reduce(mix); -} - - -static hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate) -{ - ulong state[25]; - - hash32_t hash; - uint const hash_size = countof(hash.ulongs); - uint const init_size = countof(init->ulongs); - uint const mix_size = countof(mix->ulongs); - - // keccak_256(keccak_512(header..nonce) .. mix); - copy(state, init->ulongs, init_size); - copy(state + init_size, mix->ulongs, mix_size); - keccak_f1600_no_absorb(state, init_size+mix_size, hash_size, isolate); - - // copy out - copy(hash.ulongs, state, hash_size); - return hash; -} - -static hash32_t compute_hash_simple( - __constant hash32_t const* g_header, - __global hash128_t const* g_dag, - ulong nonce, - uint isolate - ) -{ - hash64_t init = init_hash(g_header, nonce, isolate); - - hash128_t mix; - for (uint i = 0; i != countof(mix.uint4s); ++i) - { - mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)]; - } - - uint mix_val = mix.uints[0]; - uint init0 = mix.uints[0]; - uint a = 0; - do - { - uint pi = fnv(init0 ^ a, mix_val) % DAG_SIZE; - uint n = (a+1) % countof(mix.uints); - - #pragma unroll - for (uint i = 0; i != countof(mix.uints); ++i) - { - mix.uints[i] = fnv(mix.uints[i], g_dag[pi].uints[i]); - mix_val = i == n ? mix.uints[i] : mix_val; - } - } - while (++a != (ACCESSES & isolate)); - - // reduce to output - hash32_t fnv_mix; - for (uint i = 0; i != countof(fnv_mix.uints); ++i) - { - fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]); - } - - return final_hash(&init, &fnv_mix, isolate); -} - -typedef union -{ - struct - { - hash64_t init; - uint pad; // avoid lds bank conflicts - }; - hash32_t mix; +typedef union { + uint4 uint4s[4]; + ulong ulongs[8]; + uint uints[16]; } compute_hash_share; - -static hash32_t compute_hash( +static ulong compute_hash( __local compute_hash_share* share, __constant hash32_t const* g_header, __global hash128_t const* g_dag, @@ -394,132 +245,70 @@ static hash32_t compute_hash( uint const gid = get_global_id(0); // Compute one init hash per work item. - hash64_t init = init_hash(g_header, nonce, isolate); + + // sha3_512(header .. nonce) + ulong state[25]; + copy(state, g_header->ulongs, 4); + state[4] = nonce; + keccak_f1600_no_absorb(state, 5, 8, isolate); + // Threads work together in this phase in groups of 8. - uint const thread_id = gid % THREADS_PER_HASH; - uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH; + uint const thread_id = gid & 7; + uint const hash_id = (gid & (GROUP_SIZE-1)) >> 3; - hash32_t mix; uint i = 0; do { // share init with other threads if (i == thread_id) - share[hash_id].init = init; - barrier(CLK_LOCAL_MEM_FENCE); - - uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))]; - barrier(CLK_LOCAL_MEM_FENCE); - - uint thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uints, g_dag, isolate); + copy(share[hash_id].uint4s, state, 4); - share[hash_id].mix.uints[thread_id] = thread_mix; barrier(CLK_LOCAL_MEM_FENCE); - if (i == thread_id) - mix = share[hash_id].mix; + uint4 mix = share[hash_id].uint4s[thread_id & 3]; barrier(CLK_LOCAL_MEM_FENCE); - } - while (++i != (THREADS_PER_HASH & isolate)); - - return final_hash(&init, &mix, isolate); -} - - -static hash32_t compute_hash_chunks( - __local compute_hash_share* share, - __constant hash32_t const* g_header, - __global hash128_t const* g_dag, - __global hash128_t const* g_dag1, - __global hash128_t const* g_dag2, - __global hash128_t const* g_dag3, - ulong nonce, - uint isolate - ) -{ - uint const gid = get_global_id(0); - // Compute one init hash per work item. - hash64_t init = init_hash(g_header, nonce, isolate); - - // Threads work together in this phase in groups of 8. - uint const thread_id = gid % THREADS_PER_HASH; - uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH; + __local uint *share0 = share[hash_id].uints; - hash32_t mix; - uint i = 0; - do - { - // share init with other threads - if (i == thread_id) - share[hash_id].init = init; + // share init0 + if (thread_id == 0) + *share0 = mix.x; barrier(CLK_LOCAL_MEM_FENCE); + uint init0 = *share0; - uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))]; - barrier(CLK_LOCAL_MEM_FENCE); + uint a = 0; + do + { + bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1)); - uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate); + #pragma unroll + for (uint i = 0; i != 4; ++i) + { + if (update_share) + { + *share0 = fnv(init0 ^ (a + i), ((uint *)&mix)[i]) % DAG_SIZE; + } + barrier(CLK_LOCAL_MEM_FENCE); - share[hash_id].mix.uints[thread_id] = thread_mix; + mix = fnv4(mix, g_dag[*share0].uint4s[thread_id]); + } + } while ((a += 4) != (ACCESSES & isolate)); + + share[hash_id].uints[thread_id] = fnv_reduce(mix); barrier(CLK_LOCAL_MEM_FENCE); if (i == thread_id) - mix = share[hash_id].mix; + copy(state + 8, share[hash_id].ulongs, 4); + barrier(CLK_LOCAL_MEM_FENCE); } while (++i != (THREADS_PER_HASH & isolate)); - return final_hash(&init, &mix, isolate); -} - -__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) -__kernel void ethash_hash_simple( - __global hash32_t* g_hashes, - __constant hash32_t const* g_header, - __global hash128_t const* g_dag, - ulong start_nonce, - uint isolate - ) -{ - uint const gid = get_global_id(0); - g_hashes[gid] = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate); -} - -__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) -__kernel void ethash_search_simple( - __global volatile uint* restrict g_output, - __constant hash32_t const* g_header, - __global hash128_t const* g_dag, - ulong start_nonce, - ulong target, - uint isolate - ) -{ - uint const gid = get_global_id(0); - hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate); - - if (hash.ulongs[countof(hash.ulongs)-1] < target) - { - uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1)); - g_output[slot] = gid; - } -} - - -__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) -__kernel void ethash_hash( - __global hash32_t* g_hashes, - __constant hash32_t const* g_header, - __global hash128_t const* g_dag, - ulong start_nonce, - uint isolate - ) -{ - __local compute_hash_share share[HASHES_PER_LOOP]; + // keccak_256(keccak_512(header..nonce) .. mix); + keccak_f1600_no_absorb(state, 12, 4, isolate); - uint const gid = get_global_id(0); - g_hashes[gid] = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate); + return state[0]; } __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) @@ -535,54 +324,11 @@ __kernel void ethash_search( __local compute_hash_share share[HASHES_PER_LOOP]; uint const gid = get_global_id(0); - hash32_t hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate); + ulong hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate); - if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target) + if (as_ulong(as_uchar8(hash).s76543210) < target) { uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1); g_output[slot] = gid; } } - -__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) -__kernel void ethash_hash_chunks( - __global hash32_t* g_hashes, - __constant hash32_t const* g_header, - __global hash128_t const* g_dag, - __global hash128_t const* g_dag1, - __global hash128_t const* g_dag2, - __global hash128_t const* g_dag3, - ulong start_nonce, - uint isolate - ) -{ - __local compute_hash_share share[HASHES_PER_LOOP]; - - uint const gid = get_global_id(0); - g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3,start_nonce + gid, isolate); -} - -__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) -__kernel void ethash_search_chunks( - __global volatile uint* restrict g_output, - __constant hash32_t const* g_header, - __global hash128_t const* g_dag, - __global hash128_t const* g_dag1, - __global hash128_t const* g_dag2, - __global hash128_t const* g_dag3, - ulong start_nonce, - ulong target, - uint isolate - ) -{ - __local compute_hash_share share[HASHES_PER_LOOP]; - - uint const gid = get_global_id(0); - hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate); - - if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target) - { - uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1)); - g_output[slot] = gid; - } -}