Some fixes; CUDA 8 RC build

9 years ago · ed2e888bed
12 changed files with 12958 additions and 33 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -266,11 +266,13 @@ message("-- MINER            Build miner                              ${MINER}")
 message("-- GUI              Build GUI components                     ${GUI}")
 message("-- ETHASHCL         Build OpenCL components                  ${ETHASHCL}")
 message("-- ETHASHCUDA       Build CUDA components                    ${ETHASHCUDA}")
-message("-- ETHSTRATUM       Build Stratum components (experimental)  ${ETHSTRATUM}")
+message("-- ETHSTRATUM       Build Stratum components                 ${ETHSTRATUM}")
 message("------------------------------------------------------------------------")
 message("")

+if (NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC")
 set(CMAKE_THREAD_LIBS_INIT pthread)
+endif()

 include(EthCompilerSettings)
 message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
--- a/libethash-cl/CL/cl.hpp
+++ b/libethash-cl/CL/cl.hpp
--- a/libethash-cuda/CMakeLists.txt
+++ b/libethash-cuda/CMakeLists.txt
@ -13,7 +13,7 @@ LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
 if(COMPUTE AND (COMPUTE GREATER 0))
 	LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
 else(COMPUTE AND (COMPUTE GREATER 0))
-	set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52)
+	set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52;-gencode arch=compute_61,code=sm_61)
 endif(COMPUTE AND (COMPUTE GREATER 0))


--- a/libethash-cuda/cuda_helper.h
+++ b/libethash-cuda/cuda_helper.h
@ -10,6 +10,7 @@
 #include <device_launch_parameters.h>
 #define __launch_bounds__(max_tpb, min_blocks)
 #define asm("a" : "=l"(result) : "l"(a))
+#define __CUDA_ARCH__ 520 // editor aid (presumably IDE-only, like the stub macros above): report arch 520 so the shuffle code path is highlighted by default.

 uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
 uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
--- a/libethash-cuda/dagger_shared.cuh
+++ b/libethash-cuda/dagger_shared.cuh
@ -44,13 +44,12 @@ __device__ uint64_t compute_hash(

 		for (uint32_t a = 0; a < ACCESSES; a += 4)
 		{
-			bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));
+			int t = bfe(a, 2u, 3u);

-			for (uint32_t i = 0; i != 4; ++i)
+			for (uint32_t b = 0; b < 4; b++)
 			{
-				if (update_share)
-				{
-					*share0 = fnv(init0 ^ (a + i), ((uint32_t *)&mix)[i]) % d_dag_size;
+				if (thread_id == t) {
+					*share0 = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
 				}
 				__syncthreads();

--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@ -1,9 +1,6 @@
 #include "ethash_cuda_miner_kernel_globals.h"
 #include "ethash_cuda_miner_kernel.h"
-
-#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
-
-typedef bool compute_hash_share;
+#include "cuda_helper.h"

 __device__ uint64_t compute_hash(
 	uint64_t nonce
@ -18,7 +15,6 @@ __device__ uint64_t compute_hash(
 	
 	// Threads work together in this phase in groups of 8.
 	const int thread_id  = threadIdx.x &  (THREADS_PER_HASH - 1);
-	const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
 	const int mix_idx    = thread_id & 3;

 	uint4 mix;
@ -46,11 +42,11 @@ __device__ uint64_t compute_hash(
 				mix = vectorize2(shuffle[6], shuffle[7]);
 		}
 		
-		uint32_t init0 = __shfl(shuffle[0].x, start_lane);
+		uint32_t init0 = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);

 		for (uint32_t a = 0; a < ACCESSES; a += 4)
 		{
-			int t = ((a >> 2) & (THREADS_PER_HASH - 1));
+			int t = bfe(a, 2u, 3u);

 			for (uint32_t b = 0; b < 4; b++)
 			{
@ -59,8 +55,7 @@ __device__ uint64_t compute_hash(
 					shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
 				}
 				shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH);
-
-				mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
+				mix = fnv4(mix, d_dag[shuffle[0].x].uint4s[thread_id]);
 			}
 		}

--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@ -253,6 +253,9 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		m_current_nonce = 0;
 		m_current_index = 0;

+		m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
+
+
 		cout << "Generating DAG..." << endl;
 		ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0]);

@ -307,7 +310,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
 			for (unsigned int j = 0; j < found_count; j++)
 				nonces[j] = nonce_base + buffer[j + 1];
 		}
-		run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce);
+		run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce);
 		if (m_current_index >= s_numStreams)
 		{
 			exit = found_count && hook.found(nonces, found_count);
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@ -58,6 +58,8 @@ private:
 	uint64_t m_current_nonce;
 	uint64_t m_current_index;

+	uint32_t m_sharedBytes;
+
 	volatile uint32_t ** m_search_buf;
 	cudaStream_t  * m_streams;

--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@ -12,7 +12,6 @@

 #define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }

-#define SHUFFLE_MIN_VER 300

 #if __CUDA_ARCH__ < SHUFFLE_MIN_VER
 #include "keccak_u64.cuh"
@ -38,18 +37,18 @@ ethash_search(
 	if (cuda_swab64(hash) > d_target) return;
 	uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
 	g_output[index] = gid;
-	__threadfence_system();
 }

 void run_ethash_search(
 	uint32_t blocks,
 	uint32_t threads,
+	uint32_t sharedbytes,
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
 	uint64_t start_nonce
 )
 {
-	ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce);
+	ethash_search << <blocks, threads, sharedbytes, stream >> >(g_output, start_nonce);
 	CUDA_SAFE_CALL(cudaGetLastError());
 }

@ -60,9 +59,9 @@ __global__ void
 __launch_bounds__(128, 7)
 ethash_calculate_dag_item(uint32_t start)
 {
- 	uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
+	uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
 	if (node_index > d_dag_size * 2) return;
-	
+
 	hash200_t dag_node;
 	copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
 	dag_node.words[0] ^= node_index;
@ -72,27 +71,43 @@ ethash_calculate_dag_item(uint32_t start)

 	for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
 		uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
-		
-		/* fix this some time. or not.
+#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
+		for (unsigned w = 0; w != 4; ++w) {
+			dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
+		}
+#else
 		for (uint32_t t = 0; t < 4; t++) {
 			uint32_t shuffle_index = __shfl(parent_index, t, 4);
 			uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
-			if (t == thread_id) {
-				for (uint32_t w = 0; w < 4; w++) {
-					uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
+
+			for (int w = 0; w < 4; w++) {
+				uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
+				if (t == thread_id) {
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
 				}
 			}
-		}
-		*/

-		for (unsigned w = 0; w != 4; ++w) {
-			dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
 		}
+#endif		
 	}
 	SHA3_512(dag_node.uint2s);
 	hash64_t * dag_nodes = (hash64_t *)d_dag;
-	copy(dag_nodes[node_index].uint4s, dag_node.uint4s, 4);
+
+#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
+	for (uint32_t i = 0; i < 4; i++) {
+		dag_nodes[node_index].uint4s[i] =  dag_node.uint4s[i];
+	}
+#else
+	for (uint32_t t = 0; t < 4; t++) {
+
+		uint32_t shuffle_index = __shfl(node_index, t, 4);
+		uint4 s[4];
+		for (uint32_t w = 0; w < 4; w++) {
+			s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
+		}
+		dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
+	}
+#endif		 
 }

 void ethash_generate_dag(
@ -103,7 +118,6 @@ void ethash_generate_dag(
 	)
 {
 	uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t));
-	//while (work < blocks * threads) blocks /= 2;

 	uint32_t fullRuns = work / (blocks * threads);
 	uint32_t const restWork = work % (blocks * threads);
--- a/libethash-cuda/ethash_cuda_miner_kernel.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel.h
@ -49,6 +49,7 @@ void set_target(
 void run_ethash_search(
 	uint32_t search_batch_size,
 	uint32_t workgroup_size,
+	uint32_t sharedbytes,
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
 	uint64_t start_nonce
--- a/libethash-cuda/ethash_cuda_miner_kernel_globals.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel_globals.h
@ -1,6 +1,8 @@
 #ifndef _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_
 #define _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_

+#define SHUFFLE_MIN_VER 300
+
 //#include "cuda_helper.h"

 __constant__ uint32_t d_dag_size;
--- a/releases/ethminer-0.9.41-genoil-1.1.zip
+++ b/releases/ethminer-0.9.41-genoil-1.1.zip