From dcb0cc96ab8cb92b20b2840abf9ae977e1cdf54a Mon Sep 17 00:00:00 2001
From: Jan Willem Penterman <janwillem.penterman@ddbgroup.nl>
Date: Wed, 2 Sep 2015 11:01:17 +0200
Subject: [PATCH 1/3] fixed multiple GPU support

---
 ethminer/MinerAux.h                  | 19 ++++++++-------
 libethash-cuda/ethash_cuda_miner.cpp | 36 ++++++++++++++++------------
 libethash-cuda/ethash_cuda_miner.h   |  1 +
 libethcore/EthashCUDAMiner.cpp       |  5 ++--
 libethcore/EthashCUDAMiner.h         |  1 -
 5 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h
index 1a2182e25..29a69b3ba 100644
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@@ -375,23 +375,24 @@ public:
 		else if (m_minerType == MinerType::CUDA)
 		{
 #if ETH_ETHASHCUDA || !ETH_TRUE
+			if (m_cudaDeviceCount == 0)
+			{
+				m_cudaDevices[0]  = 0;
+				m_cudaDeviceCount = 1;
+			}
+			EthashCUDAMiner::setDevices(m_cudaDevices, m_cudaDeviceCount);
+			m_miningThreads = m_cudaDeviceCount;
+	
+			EthashCUDAMiner::setNumInstances(m_miningThreads);
 			if (!EthashCUDAMiner::configureGPU(
 				m_localWorkSize,
 				m_globalWorkSizeMultiplier,
 				m_numStreams,
-				m_openclDevice,
 				m_extraGPUMemory,
 				m_cudaHighCPULoad,
 				m_currentBlock
 				))
 				exit(1);
-			if (m_cudaDeviceCount != 0) 
-			{
-				EthashCUDAMiner::setDevices(m_cudaDevices, m_cudaDeviceCount);
-				m_miningThreads = m_cudaDeviceCount;
-			}
-			EthashCUDAMiner::setNumInstances(m_miningThreads);
-			
 #else
 			cerr << "Selected CUDA mining without having compiled with -DETHASHCUDA=1 or -DBUNDLE=cudaminer" << endl;
 			exit(1);
@@ -486,7 +487,7 @@ private:
 		sealers["opencl"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{&EthashGPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashGPUMiner(ci); }};
 #endif
 #if ETH_ETHASHCUDA
-		sealers["cuda"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{ &EthashGPUMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCUDAMiner(ci); } };
+		sealers["cuda"] = GenericFarm<EthashProofOfWork>::SealerDescriptor{ &EthashCUDAMiner::instances, [](GenericMiner<EthashProofOfWork>::ConstructionInfo ci){ return new EthashCUDAMiner(ci); } };
 #endif
 		f.setSealers(sealers);
 		f.onSolutionFound([&](EthashProofOfWork::Solution) { return false; });
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 7308334ac..965cd77fc 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -129,6 +129,7 @@ unsigned ethash_cuda_miner::getNumDevices()
 }
 
 bool ethash_cuda_miner::configureGPU(
+	int *	 _devices,
 	unsigned _blockSize,
 	unsigned _gridSize,
 	unsigned _numStreams,
@@ -148,24 +149,29 @@ bool ethash_cuda_miner::configureGPU(
 	uint64_t requiredSize = dagSize + _extraGPUMemory;
 	for (unsigned int i = 0; i < getNumDevices(); i++)
 	{
-		cudaDeviceProp props;
-		CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, i));
-		if (props.totalGlobalMem >= requiredSize)
+		if (_devices[i] != -1) 
 		{
-			ETHCUDA_LOG(
-				"Found suitable CUDA device [" << string(props.name)
-				<< "] with " << props.totalGlobalMem << " bytes of GPU memory"
-				);
-			return true;
+			cudaDeviceProp props;
+			CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, _devices[i]));
+			if (props.totalGlobalMem >= requiredSize)
+			{
+				ETHCUDA_LOG(
+					"Found suitable CUDA device [" << string(props.name)
+					<< "] with " << props.totalGlobalMem << " bytes of GPU memory"
+					);
+			}
+			else
+			{
+				ETHCUDA_LOG(
+					"CUDA device " << string(props.name)
+					<< " has insufficient GPU memory." << to_string(props.totalGlobalMem) <<
+					" bytes of memory found < " << to_string(requiredSize) << " bytes of memory required"
+					);
+				return false;
+			}
 		}
-
-		ETHCUDA_LOG(
-			"CUDA device " << string(props.name)
-			<< " has insufficient GPU memory." << to_string(props.totalGlobalMem) <<
-			" bytes of memory found < " << to_string(requiredSize) << " bytes of memory required"
-			);
 	}
-	return false;
+	return true;
 }
 
 unsigned ethash_cuda_miner::s_extraRequiredGPUMem;
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 2b97ab294..d89096cfb 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -26,6 +26,7 @@ public:
 	static unsigned getNumDevices();
 	static void listDevices();
 	static bool configureGPU(
+		int *	 _devices,
 		unsigned _blockSize,
 		unsigned _gridSize,
 		unsigned _numStreams,
diff --git a/libethcore/EthashCUDAMiner.cpp b/libethcore/EthashCUDAMiner.cpp
index 69b2338b8..a5d82c430 100644
--- a/libethcore/EthashCUDAMiner.cpp
+++ b/libethcore/EthashCUDAMiner.cpp
@@ -146,7 +146,7 @@ void EthashCUDAMiner::workLoop()
 			delete m_miner;
 			m_miner = new ethash_cuda_miner;
 
-			unsigned device = instances() > 1 ? (s_devices[index()] > -1 ? s_devices[index()] : index()) : s_deviceId;
+			unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index();
 
 			EthashAux::FullType dag;
 			while (true)
@@ -202,19 +202,18 @@ bool EthashCUDAMiner::configureGPU(
 	unsigned _blockSize,
 	unsigned _gridSize,
 	unsigned _numStreams,
-	unsigned _deviceId,
 	unsigned _extraGPUMemory,
 	bool _highcpu,
 	uint64_t _currentBlock
 	)
 {
-	s_deviceId = _deviceId;
 	if (_blockSize != 32 && _blockSize != 64 && _blockSize != 128)
 	{
 		cout << "Given localWorkSize of " << toString(_blockSize) << "is invalid. Must be either 32,64 or 128" << endl;
 		return false;
 	}
 	if (!ethash_cuda_miner::configureGPU(
+		s_devices,
 		_blockSize,
 		_gridSize,
 		_numStreams,
diff --git a/libethcore/EthashCUDAMiner.h b/libethcore/EthashCUDAMiner.h
index b587aad86..565ea5be2 100644
--- a/libethcore/EthashCUDAMiner.h
+++ b/libethcore/EthashCUDAMiner.h
@@ -51,7 +51,6 @@ namespace eth
 			unsigned _blockSize,
 			unsigned _gridSize,
 			unsigned _numStreams,
-			unsigned _deviceId,
 			unsigned _extraGPUMemory,
 			bool	 _highcpu,
 			uint64_t _currentBlock

From 3fbe74bcc308cd2116ceffde7d7362627e292392 Mon Sep 17 00:00:00 2001
From: Jan Willem Penterman <janwillem.penterman@ddbgroup.nl>
Date: Wed, 2 Sep 2015 11:43:24 +0200
Subject: [PATCH 2/3] missing makelist

---
 libethash-cuda/CMakeLists.txt | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 libethash-cuda/CMakeLists.txt

diff --git a/libethash-cuda/CMakeLists.txt b/libethash-cuda/CMakeLists.txt
new file mode 100644
index 000000000..7e11d97fe
--- /dev/null
+++ b/libethash-cuda/CMakeLists.txt
@@ -0,0 +1,29 @@
+set(EXECUTABLE ethash-cuda)
+
+FIND_PACKAGE(CUDA REQUIRED)
+
+file(GLOB SRC_LIST "*.cpp" "*.cu")
+file(GLOB HEADERS "*.h" "*.cuh")
+
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
+
+LIST(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
+LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
+
+if(COMPUTE)
+	LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
+else(COMPUTE)
+	set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_32,code=sm_32;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52)
+endif(COMPUTE)
+
+
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} ethash)
+
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
+

From 59dc0628d652f06a5ae335b09b9a1c8ea12a9587 Mon Sep 17 00:00:00 2001
From: Jan Willem Penterman <janwillem.penterman@ddbgroup.nl>
Date: Wed, 2 Sep 2015 13:11:06 +0200
Subject: [PATCH 3/3] reverted possible speedup.

---
 libethash-cuda/dagger_shuffled.cuh | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh
index 0918ff4f4..db4edb045 100644
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@@ -76,7 +76,7 @@ __device__ uint64_t compute_hash_shuffle(
 		uint32_t thread_mix = fnv_reduce(mix);
 
 		// update mix accross threads
-		/*
+		
 		shuffle[0].x = __shfl(thread_mix, start_lane + 0);
 		shuffle[0].y = __shfl(thread_mix, start_lane + 1);
 		shuffle[1].x = __shfl(thread_mix, start_lane + 2);
@@ -85,30 +85,15 @@ __device__ uint64_t compute_hash_shuffle(
 		shuffle[2].y = __shfl(thread_mix, start_lane + 5);
 		shuffle[3].x = __shfl(thread_mix, start_lane + 6);
 		shuffle[3].y = __shfl(thread_mix, start_lane + 7);
-		*/
+		
 		if (i == thread_id) {
 			//move mix into state:
-			state[8].x = __shfl(thread_mix, start_lane + 0);
-			state[8].y = __shfl(thread_mix, start_lane + 1);
-			state[9].x = __shfl(thread_mix, start_lane + 2);
-			state[9].y = __shfl(thread_mix, start_lane + 3);
-			state[10].x = __shfl(thread_mix, start_lane + 4);
-			state[10].y = __shfl(thread_mix, start_lane + 5);
-			state[11].x = __shfl(thread_mix, start_lane + 6);
-			state[11].y = __shfl(thread_mix, start_lane + 7);
+			state[8] = shuffle[0];
+			state[9] = shuffle[1];
+			state[10] = shuffle[2];
+			state[11] = shuffle[3];
 		}
-		/*
-		else {
-			shuffle[0].x = __shfl(thread_mix, start_lane + 0);
-			shuffle[0].y = __shfl(thread_mix, start_lane + 1);
-			shuffle[1].x = __shfl(thread_mix, start_lane + 2);
-			shuffle[1].y = __shfl(thread_mix, start_lane + 3);
-			shuffle[2].x = __shfl(thread_mix, start_lane + 4);
-			shuffle[2].y = __shfl(thread_mix, start_lane + 5);
-			shuffle[3].x = __shfl(thread_mix, start_lane + 6);
-			shuffle[3].y = __shfl(thread_mix, start_lane + 7);
-		}
-		*/
+
 	}
 	
 	// keccak_256(keccak_512(header..nonce) .. mix);