|
|
@ -12,7 +12,6 @@ |
|
|
|
|
|
|
|
#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; } |
|
|
|
|
|
|
|
#define SHUFFLE_MIN_VER 300 |
|
|
|
|
|
|
|
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER |
|
|
|
#include "keccak_u64.cuh" |
|
|
@ -38,18 +37,18 @@ ethash_search( |
|
|
|
if (cuda_swab64(hash) > d_target) return; |
|
|
|
uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1; |
|
|
|
g_output[index] = gid; |
|
|
|
__threadfence_system(); |
|
|
|
} |
|
|
|
|
|
|
|
void run_ethash_search( |
|
|
|
uint32_t blocks, |
|
|
|
uint32_t threads, |
|
|
|
uint32_t sharedbytes, |
|
|
|
cudaStream_t stream, |
|
|
|
volatile uint32_t* g_output, |
|
|
|
uint64_t start_nonce |
|
|
|
) |
|
|
|
{ |
|
|
|
ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce); |
|
|
|
ethash_search << <blocks, threads, sharedbytes, stream >> >(g_output, start_nonce); |
|
|
|
CUDA_SAFE_CALL(cudaGetLastError()); |
|
|
|
} |
|
|
|
|
|
|
@ -60,9 +59,9 @@ __global__ void |
|
|
|
__launch_bounds__(128, 7) |
|
|
|
ethash_calculate_dag_item(uint32_t start) |
|
|
|
{ |
|
|
|
uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x; |
|
|
|
uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x; |
|
|
|
if (node_index > d_dag_size * 2) return; |
|
|
|
|
|
|
|
|
|
|
|
hash200_t dag_node; |
|
|
|
copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4); |
|
|
|
dag_node.words[0] ^= node_index; |
|
|
@ -72,27 +71,43 @@ ethash_calculate_dag_item(uint32_t start) |
|
|
|
|
|
|
|
for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) { |
|
|
|
uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size; |
|
|
|
|
|
|
|
/* fix this some time. or not. |
|
|
|
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER |
|
|
|
for (unsigned w = 0; w != 4; ++w) { |
|
|
|
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]); |
|
|
|
} |
|
|
|
#else |
|
|
|
for (uint32_t t = 0; t < 4; t++) { |
|
|
|
uint32_t shuffle_index = __shfl(parent_index, t, 4); |
|
|
|
uint4 p4 = d_light[shuffle_index].uint4s[thread_id]; |
|
|
|
if (t == thread_id) { |
|
|
|
for (uint32_t w = 0; w < 4; w++) { |
|
|
|
uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4)); |
|
|
|
|
|
|
|
for (int w = 0; w < 4; w++) { |
|
|
|
uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4)); |
|
|
|
if (t == thread_id) { |
|
|
|
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
*/ |
|
|
|
|
|
|
|
for (unsigned w = 0; w != 4; ++w) { |
|
|
|
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]); |
|
|
|
} |
|
|
|
#endif |
|
|
|
} |
|
|
|
SHA3_512(dag_node.uint2s); |
|
|
|
hash64_t * dag_nodes = (hash64_t *)d_dag; |
|
|
|
copy(dag_nodes[node_index].uint4s, dag_node.uint4s, 4); |
|
|
|
|
|
|
|
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER |
|
|
|
for (uint32_t i = 0; i < 4; i++) { |
|
|
|
dag_nodes[node_index].uint4s[i] = dag_node.uint4s[i]; |
|
|
|
} |
|
|
|
#else |
|
|
|
for (uint32_t t = 0; t < 4; t++) { |
|
|
|
|
|
|
|
uint32_t shuffle_index = __shfl(node_index, t, 4); |
|
|
|
uint4 s[4]; |
|
|
|
for (uint32_t w = 0; w < 4; w++) { |
|
|
|
s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4)); |
|
|
|
} |
|
|
|
dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id]; |
|
|
|
} |
|
|
|
#endif |
|
|
|
} |
|
|
|
|
|
|
|
void ethash_generate_dag( |
|
|
@ -103,7 +118,6 @@ void ethash_generate_dag( |
|
|
|
) |
|
|
|
{ |
|
|
|
uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t)); |
|
|
|
//while (work < blocks * threads) blocks /= 2; |
|
|
|
|
|
|
|
uint32_t fullRuns = work / (blocks * threads); |
|
|
|
uint32_t const restWork = work % (blocks * threads); |
|
|
|