Browse Source

saved 5 opencl regs (14 to go..), fixed compute 3.0

cl-refactor
Genoil 9 years ago
parent
commit
22e3b7f6fc
  1. 2
      CMakeLists.txt
  2. 114
      libethash-cl/ethash_cl_miner_kernel.cl
  3. 2
      libethash-cuda/CMakeLists.txt
  4. BIN
      releases/ethminer-0.9.41-genoil-1.0.5.zip

2
CMakeLists.txt

@ -2,7 +2,7 @@
cmake_minimum_required(VERSION 2.8.12) cmake_minimum_required(VERSION 2.8.12)
set(PROJECT_VERSION "0.9.41") set(PROJECT_VERSION "0.9.41")
set(GENOIL_VERSION "1.0.4") set(GENOIL_VERSION "1.0.5")
if (${CMAKE_VERSION} VERSION_GREATER 3.0) if (${CMAKE_VERSION} VERSION_GREATER 3.0)
cmake_policy(SET CMP0042 OLD) # fix MACOSX_RPATH cmake_policy(SET CMP0042 OLD) # fix MACOSX_RPATH
cmake_policy(SET CMP0048 NEW) # allow VERSION argument in project() cmake_policy(SET CMP0048 NEW) # allow VERSION argument in project()

114
libethash-cl/ethash_cl_miner_kernel.cl

@ -54,11 +54,6 @@ static uint2 ROL2(const uint2 v, const int n)
static void keccak_f1600_round(uint2* a, uint r, uint out_size) static void keccak_f1600_round(uint2* a, uint r, uint out_size)
{ {
#if !__ENDIAN_LITTLE__
for (uint i = 0; i != 25; ++i)
a[i] = a[i].yx;
#endif
uint2 t[5]; uint2 t[5];
uint2 u, v; uint2 u, v;
@ -127,11 +122,14 @@ static void keccak_f1600_round(uint2* a, uint r, uint out_size)
a[10] = ROL2(u, 1); a[10] = ROL2(u, 1);
// Chi // Chi
u = a[0]; v = a[1]; u = a[0];
a[0] = bitselect(a[0] ^ a[2], a[0], a[1]); a[0] = bitselect(a[0] ^ a[2], a[0], a[1]);
// Iota
a[0] ^= Keccak_f1600_RC[r];
if (out_size > 4) if (out_size == 1) return;
{ // Continue Chi
v = a[1];
a[1] = bitselect(a[1] ^ a[3], a[1], a[2]); a[1] = bitselect(a[1] ^ a[3], a[1], a[2]);
a[2] = bitselect(a[2] ^ a[4], a[2], a[3]); a[2] = bitselect(a[2] ^ a[4], a[2], a[3]);
a[3] = bitselect(a[3] ^ u, a[3], a[4]); a[3] = bitselect(a[3] ^ u, a[3], a[4]);
@ -142,8 +140,9 @@ static void keccak_f1600_round(uint2* a, uint r, uint out_size)
a[6] = bitselect(a[6] ^ a[8], a[6], a[7]); a[6] = bitselect(a[6] ^ a[8], a[6], a[7]);
a[7] = bitselect(a[7] ^ a[9], a[7], a[8]); a[7] = bitselect(a[7] ^ a[9], a[7], a[8]);
a[8] = bitselect(a[8] ^ u, a[8], a[9]); a[8] = bitselect(a[8] ^ u, a[8], a[9]);
if (out_size > 8)
{ if (out_size == 8) return;
a[9] = bitselect(a[9] ^ v, a[9], u); a[9] = bitselect(a[9] ^ v, a[9], u);
u = a[10]; v = a[11]; u = a[10]; v = a[11];
@ -167,35 +166,17 @@ static void keccak_f1600_round(uint2* a, uint r, uint out_size)
a[23] = bitselect(a[23] ^ u, a[23], a[24]); a[23] = bitselect(a[23] ^ u, a[23], a[24]);
a[24] = bitselect(a[24] ^ v, a[24], u); a[24] = bitselect(a[24] ^ v, a[24], u);
} }
}
// Iota
a[0] ^= Keccak_f1600_RC[r];
#if !__ENDIAN_LITTLE__
for (uint i = 0; i != 25; ++i)
a[i] = a[i].yx;
#endif
}
static void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint isolate) static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate)
{
for (uint i = in_size; i != 25; ++i)
{ {
a[i] = 0;
}
#if __ENDIAN_LITTLE__
a[in_size] ^= 0x0000000000000001;
a[24-out_size*2] ^= 0x8000000000000000;
#else
a[in_size] ^= 0x0100000000000000;
a[24-out_size*2] ^= 0x0000000000000080;
#endif
// Originally I unrolled the first and last rounds to interface // Originally I unrolled the first and last rounds to interface
// better with surrounding code, however I haven't done this // better with surrounding code, however I haven't done this
// without causing the AMD compiler to blow up the VGPR usage. // without causing the AMD compiler to blow up the VGPR usage.
uint r = 0; uint r = 0;
uint o = 25;
do do
{ {
// This dynamic branch stops the AMD compiler unrolling the loop // This dynamic branch stops the AMD compiler unrolling the loop
@ -207,19 +188,18 @@ static void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint i
// doesn't bother. // doesn't bother.
if (isolate) if (isolate)
{ {
keccak_f1600_round((uint2*)a, r++, 25); keccak_f1600_round(a, r++, o);
if (r == 23) o = out_size;
} }
} }
while (r < 23); while (r < 24);
// final round optimised for digest size // final round optimised for digest size
keccak_f1600_round((uint2*)a, r++, out_size); //keccak_f1600_round(a, 23, out_size);
} }
#define copy(dst, src, count) for (uint i = 0; i != count; ++i) { (dst)[i] = (src)[i]; } #define copy(dst, src, count) for (uint i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
#define countof(x) (sizeof(x) / sizeof(x[0]))
static uint fnv(uint x, uint y) static uint fnv(uint x, uint y)
{ {
return x * FNV_PRIME ^ y; return x * FNV_PRIME ^ y;
@ -235,21 +215,13 @@ static uint fnv_reduce(uint4 v)
return fnv(fnv(fnv(v.x, v.y), v.z), v.w); return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
} }
typedef union typedef struct
{ {
ulong ulongs[32 / sizeof(ulong)]; ulong ulongs[32 / sizeof(ulong)];
uint uints[32 / sizeof(uint)];
} hash32_t; } hash32_t;
typedef union typedef struct
{ {
ulong ulongs[64 / sizeof(ulong)];
uint4 uint4s[64 / sizeof(uint4)];
} hash64_t;
typedef union
{
uint uints[128 / sizeof(uint)];
uint4 uint4s[128 / sizeof(uint4)]; uint4 uint4s[128 / sizeof(uint4)];
} hash128_t; } hash128_t;
@ -259,14 +231,18 @@ typedef union {
uint uints[16]; uint uints[16];
} compute_hash_share; } compute_hash_share;
static ulong compute_hash( __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__local compute_hash_share* share, __kernel void ethash_search(
__global volatile uint* restrict g_output,
__constant hash32_t const* g_header, __constant hash32_t const* g_header,
__global hash128_t const* g_dag, __global hash128_t const* g_dag,
ulong nonce, ulong start_nonce,
ulong target,
uint isolate uint isolate
) )
{ {
__local compute_hash_share share[HASHES_PER_LOOP];
uint const gid = get_global_id(0); uint const gid = get_global_id(0);
// Compute one init hash per work item. // Compute one init hash per work item.
@ -274,9 +250,16 @@ static ulong compute_hash(
// sha3_512(header .. nonce) // sha3_512(header .. nonce)
ulong state[25]; ulong state[25];
copy(state, g_header->ulongs, 4); copy(state, g_header->ulongs, 4);
state[4] = nonce; state[4] = start_nonce + gid;
keccak_f1600_no_absorb(state, 5, 8, isolate);
for (uint i = 6; i != 25; ++i)
{
state[i] = 0;
}
state[5] = 0x0000000000000001;
state[8] = 0x8000000000000000;
keccak_f1600_no_absorb((uint2*)state, 8, isolate);
// Threads work together in this phase in groups of 8. // Threads work together in this phase in groups of 8.
uint const thread_id = gid & 7; uint const thread_id = gid & 7;
@ -326,28 +309,17 @@ static ulong compute_hash(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
// keccak_256(keccak_512(header..nonce) .. mix); for (uint i = 13; i != 25; ++i)
keccak_f1600_no_absorb(state, 12, 4, isolate);
return state[0];
}
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_search(
__global volatile uint* restrict g_output,
__constant hash32_t const* g_header,
__global hash128_t const* g_dag,
ulong start_nonce,
ulong target,
uint isolate
)
{ {
__local compute_hash_share share[HASHES_PER_LOOP]; state[i] = 0;
}
state[12] = 0x0000000000000001;
state[16] = 0x8000000000000000;
uint const gid = get_global_id(0); // keccak_256(keccak_512(header..nonce) .. mix);
ulong hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate); keccak_f1600_no_absorb((uint2*)state, 1, isolate);
if (as_ulong(as_uchar8(hash).s76543210) < target) if (as_ulong(as_uchar8(state[0]).s76543210) < target)
{ {
uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1); uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
g_output[slot] = gid; g_output[slot] = gid;

2
libethash-cuda/CMakeLists.txt

@ -13,7 +13,7 @@ LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
if(COMPUTE AND (COMPUTE GREATER 0)) if(COMPUTE AND (COMPUTE GREATER 0))
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
else(COMPUTE AND (COMPUTE GREATER 0)) else(COMPUTE AND (COMPUTE GREATER 0))
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52)
endif(COMPUTE AND (COMPUTE GREATER 0)) endif(COMPUTE AND (COMPUTE GREATER 0))

BIN
releases/ethminer-0.9.41-genoil-1.0.5.zip

Binary file not shown.
Loading…
Cancel
Save