Browse Source

Some fixes; CUDA 8 RC build

cl-refactor
Genoil 9 years ago
parent
commit
ed2e888bed
  1. 4
      CMakeLists.txt
  2. 12906
      libethash-cl/CL/cl.hpp
  3. 2
      libethash-cuda/CMakeLists.txt
  4. 1
      libethash-cuda/cuda_helper.h
  5. 9
      libethash-cuda/dagger_shared.cuh
  6. 13
      libethash-cuda/dagger_shuffled.cuh
  7. 5
      libethash-cuda/ethash_cuda_miner.cpp
  8. 2
      libethash-cuda/ethash_cuda_miner.h
  9. 46
      libethash-cuda/ethash_cuda_miner_kernel.cu
  10. 1
      libethash-cuda/ethash_cuda_miner_kernel.h
  11. 2
      libethash-cuda/ethash_cuda_miner_kernel_globals.h
  12. BIN
      releases/ethminer-0.9.41-genoil-1.1.zip

4
CMakeLists.txt

@ -266,11 +266,13 @@ message("-- MINER Build miner ${MINER}")
message("-- GUI Build GUI components ${GUI}")
message("-- ETHASHCL Build OpenCL components ${ETHASHCL}")
message("-- ETHASHCUDA Build CUDA components ${ETHASHCUDA}")
message("-- ETHSTRATUM Build Stratum components (experimental) ${ETHSTRATUM}")
message("-- ETHSTRATUM Build Stratum components ${ETHSTRATUM}")
message("------------------------------------------------------------------------")
message("")
if (NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC")
set(CMAKE_THREAD_LIBS_INIT pthread)
endif()
include(EthCompilerSettings)
message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")

12906
libethash-cl/CL/cl.hpp

File diff suppressed because it is too large

2
libethash-cuda/CMakeLists.txt

@ -13,7 +13,7 @@ LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
if(COMPUTE AND (COMPUTE GREATER 0))
LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
else(COMPUTE AND (COMPUTE GREATER 0))
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52;-gencode arch=compute_61,code=sm_61)
endif(COMPUTE AND (COMPUTE GREATER 0))

1
libethash-cuda/cuda_helper.h

@ -10,6 +10,7 @@
#include <device_launch_parameters.h>
#define __launch_bounds__(max_tpb, min_blocks)
#define asm("a" : "=l"(result) : "l"(a))
#define __CUDA_ARCH__ 520 // highlight shuffle code by default.
uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);

9
libethash-cuda/dagger_shared.cuh

@ -44,13 +44,12 @@ __device__ uint64_t compute_hash(
for (uint32_t a = 0; a < ACCESSES; a += 4)
{
bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));
int t = bfe(a, 2u, 3u);
for (uint32_t i = 0; i != 4; ++i)
for (uint32_t b = 0; b < 4; b++)
{
if (update_share)
{
*share0 = fnv(init0 ^ (a + i), ((uint32_t *)&mix)[i]) % d_dag_size;
if (thread_id == t) {
*share0 = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
}
__syncthreads();

13
libethash-cuda/dagger_shuffled.cuh

@ -1,9 +1,6 @@
#include "ethash_cuda_miner_kernel_globals.h"
#include "ethash_cuda_miner_kernel.h"
#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
typedef bool compute_hash_share;
#include "cuda_helper.h"
__device__ uint64_t compute_hash(
uint64_t nonce
@ -18,7 +15,6 @@ __device__ uint64_t compute_hash(
// Threads work together in this phase in groups of 8.
const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
const int mix_idx = thread_id & 3;
uint4 mix;
@ -46,11 +42,11 @@ __device__ uint64_t compute_hash(
mix = vectorize2(shuffle[6], shuffle[7]);
}
uint32_t init0 = __shfl(shuffle[0].x, start_lane);
uint32_t init0 = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);
for (uint32_t a = 0; a < ACCESSES; a += 4)
{
int t = ((a >> 2) & (THREADS_PER_HASH - 1));
int t = bfe(a, 2u, 3u);
for (uint32_t b = 0; b < 4; b++)
{
@ -59,8 +55,7 @@ __device__ uint64_t compute_hash(
shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
}
shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH);
mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
mix = fnv4(mix, d_dag[shuffle[0].x].uint4s[thread_id]);
}
}

5
libethash-cuda/ethash_cuda_miner.cpp

@ -253,6 +253,9 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
m_current_nonce = 0;
m_current_index = 0;
m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
cout << "Generating DAG..." << endl;
ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0]);
@ -307,7 +310,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
for (unsigned int j = 0; j < found_count; j++)
nonces[j] = nonce_base + buffer[j + 1];
}
run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce);
run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce);
if (m_current_index >= s_numStreams)
{
exit = found_count && hook.found(nonces, found_count);

2
libethash-cuda/ethash_cuda_miner.h

@ -58,6 +58,8 @@ private:
uint64_t m_current_nonce;
uint64_t m_current_index;
uint32_t m_sharedBytes;
volatile uint32_t ** m_search_buf;
cudaStream_t * m_streams;

46
libethash-cuda/ethash_cuda_miner_kernel.cu

@ -12,7 +12,6 @@
#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
#define SHUFFLE_MIN_VER 300
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
#include "keccak_u64.cuh"
@ -38,18 +37,18 @@ ethash_search(
if (cuda_swab64(hash) > d_target) return;
uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
g_output[index] = gid;
__threadfence_system();
}
void run_ethash_search(
uint32_t blocks,
uint32_t threads,
uint32_t sharedbytes,
cudaStream_t stream,
volatile uint32_t* g_output,
uint64_t start_nonce
)
{
ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce);
ethash_search << <blocks, threads, sharedbytes, stream >> >(g_output, start_nonce);
CUDA_SAFE_CALL(cudaGetLastError());
}
@ -60,9 +59,9 @@ __global__ void
__launch_bounds__(128, 7)
ethash_calculate_dag_item(uint32_t start)
{
uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
if (node_index > d_dag_size * 2) return;
hash200_t dag_node;
copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
dag_node.words[0] ^= node_index;
@ -72,27 +71,43 @@ ethash_calculate_dag_item(uint32_t start)
for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
/* fix this some time. or not.
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
for (unsigned w = 0; w != 4; ++w) {
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
}
#else
for (uint32_t t = 0; t < 4; t++) {
uint32_t shuffle_index = __shfl(parent_index, t, 4);
uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
if (t == thread_id) {
for (uint32_t w = 0; w < 4; w++) {
uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
for (int w = 0; w < 4; w++) {
uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
if (t == thread_id) {
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
}
}
}
*/
for (unsigned w = 0; w != 4; ++w) {
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
}
#endif
}
SHA3_512(dag_node.uint2s);
hash64_t * dag_nodes = (hash64_t *)d_dag;
copy(dag_nodes[node_index].uint4s, dag_node.uint4s, 4);
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
for (uint32_t i = 0; i < 4; i++) {
dag_nodes[node_index].uint4s[i] = dag_node.uint4s[i];
}
#else
for (uint32_t t = 0; t < 4; t++) {
uint32_t shuffle_index = __shfl(node_index, t, 4);
uint4 s[4];
for (uint32_t w = 0; w < 4; w++) {
s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
}
dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
}
#endif
}
void ethash_generate_dag(
@ -103,7 +118,6 @@ void ethash_generate_dag(
)
{
uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t));
//while (work < blocks * threads) blocks /= 2;
uint32_t fullRuns = work / (blocks * threads);
uint32_t const restWork = work % (blocks * threads);

1
libethash-cuda/ethash_cuda_miner_kernel.h

@ -49,6 +49,7 @@ void set_target(
void run_ethash_search(
uint32_t search_batch_size,
uint32_t workgroup_size,
uint32_t sharedbytes,
cudaStream_t stream,
volatile uint32_t* g_output,
uint64_t start_nonce

2
libethash-cuda/ethash_cuda_miner_kernel_globals.h

@ -1,6 +1,8 @@
#ifndef _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_
#define _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_
#define SHUFFLE_MIN_VER 300
//#include "cuda_helper.h"
__constant__ uint32_t d_dag_size;

BIN
releases/ethminer-0.9.41-genoil-1.1.zip

Binary file not shown.
Loading…
Cancel
Save