on-GPU DAG generation

9 years ago · 19459aa2b2
18 changed files with 639 additions and 66 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,7 +2,7 @@
 cmake_minimum_required(VERSION 2.8.12)

 set(PROJECT_VERSION "0.9.41")
-set(GENOIL_VERSION "1.0.8")
+set(GENOIL_VERSION "1.1")
 if (${CMAKE_VERSION} VERSION_GREATER 3.0)
 	cmake_policy(SET CMP0042 OLD) 	# fix MACOSX_RPATH
 	cmake_policy(SET CMP0048 NEW) 	# allow VERSION argument in project()
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@ -362,6 +362,7 @@ public:
 		}
 		else if (arg == "--current-block" && i + 1 < argc)
 			m_currentBlock = stol(argv[++i]);
+		/*
 		else if ((arg == "-R" || arg == "--dag-dir") && i + 1 < argc)
 		{
 			strcpy(s_dagDir, argv[++i]);
@ -395,6 +396,7 @@ public:
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
 		}
+		*/
 		else if ((arg == "-w" || arg == "--check-pow") && i + 4 < argc)
 		{
 			string m;
@ -494,6 +496,7 @@ public:

 	void execute()
 	{
+		/*
 		EthashAux::setDAGDirName(s_dagDir);
 		EthashAux::setDAGEraseMode(m_eraseMode);
 		EthashAux::eraseDAGs();
@ -501,6 +504,7 @@ public:
 		{
 			m_eraseMode = DAGEraseMode::None;
 		}
+		*/

 		if (m_shouldListDevices)
 		{
@ -596,7 +600,7 @@ public:
 			<< "	-FS, --failover-stratum <host:port>  Failover stratum server at host:port" << endl
 			<< "    -O, --userpass <username.workername:password> Stratum login credentials" << endl
 			<< "    -FO, --failover-userpass <username.workername:password> Failover stratum login credentials (optional, will use normal credentials when omitted)" << endl
-			<< "    --work-timeout <n> reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 60. Don't set lower than max. avg. block time" << endl
+			<< "    --work-timeout <n> reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 180. Don't set lower than max. avg. block time" << endl
 #endif
 #if ETH_JSONRPC || ETH_STRATUM || !ETH_TRUE
 			<< "    --farm-recheck <n>  Leave n ms between checks for changed work (default: 500). When using stratum, use a high value (i.e. 2000) to get more stable hashrate output" << endl
@ -615,14 +619,14 @@ public:
 #if ETH_JSONRPC || !ETH_TRUE
 			<< "    --phone-home <on/off>  When benchmarking, publish results (default: off)" << endl
 #endif
-			<< "DAG file management:" << endl
-			<< "    -D,--create-dag <number>  Create the DAG in preparation for mining on given block and exit." << endl
-			<< "    -R <s>, --dag-dir <s> Store/Load DAG files in/from the specified directory. Useful for running multiple instances with different configurations." << endl
-			<< "    -E <mode>, --erase-dags <mode> Erase unneeded DAG files. Default is 'none'. Possible values are:" << endl
-			<< "        none  - don't erase DAG files (default)" << endl
-			<< "        old   - erase all DAG files older than current epoch" << endl
-			<< "		bench - like old, but keep epoch 0 for benchmarking" << endl
-			<< "        all   - erase all DAG files. After deleting all files, setting changes to none." << endl
+//			<< "DAG file management:" << endl
+//			<< "    -D,--create-dag <number>  Create the DAG in preparation for mining on given block and exit." << endl
+//			<< "    -R <s>, --dag-dir <s> Store/Load DAG files in/from the specified directory. Useful for running multiple instances with different configurations." << endl
+//			<< "    -E <mode>, --erase-dags <mode> Erase unneeded DAG files. Default is 'none'. Possible values are:" << endl
+//			<< "        none  - don't erase DAG files (default)" << endl
+//			<< "        old   - erase all DAG files older than current epoch" << endl
+//			<< "		bench - like old, but keep epoch 0 for benchmarking" << endl
+//			<< "        all   - erase all DAG files. After deleting all files, setting changes to none." << endl
 			<< "Mining configuration:" << endl
 			<< "    -C,--cpu  When mining, use the CPU." << endl
 			<< "    -G,--opencl  When mining use the GPU via OpenCL." << endl
@ -691,7 +695,7 @@ private:
 		cout << "Benchmarking on platform: " << platformInfo << endl;

 		cout << "Preparing DAG for block #" << m_benchmarkBlock << endl;
-		genesis.prep();
+		//genesis.prep();

 		genesis.setDifficulty(u256(1) << 63);
 		f.setWork(genesis);
@ -774,7 +778,7 @@ private:
 		cout << "Running mining simulation on platform: " << platformInfo << endl;

 		cout << "Preparing DAG for block #" << m_benchmarkBlock << endl;
-		genesis.prep();
+		//genesis.prep();

 		genesis.setDifficulty(u256(1) << difficulty);
 		f.setWork(genesis);
@ -911,6 +915,7 @@ private:
 					Json::Value v = prpc->eth_getWork();
 					h256 hh(v[0].asString());
 					h256 newSeedHash(v[1].asString());
+					/*
 					if (current.seedHash != newSeedHash)
 					{
 						minelog << "Grabbing DAG for" << newSeedHash;
@ -923,6 +928,7 @@ private:
 					{
 						EthashAux::computeFull(sha3(newSeedHash), true);
 					}
+					*/
 					if (hh != current.headerHash)
 					{
 						x_current.lock();
@ -1104,7 +1110,7 @@ private:

 	/// Benchmarking params
 	bool m_phoneHome = false;
-	unsigned m_benchmarkWarmup = 3;
+	unsigned m_benchmarkWarmup = 15;
 	unsigned m_benchmarkTrial = 3;
 	unsigned m_benchmarkTrials = 5;
 	unsigned m_benchmarkBlock = 0;
@ -1119,7 +1125,7 @@ private:
 	unsigned m_farmRecheckPeriod = 500;
 	unsigned m_defaultStratumFarmRecheckPeriod = 2000;
 	bool m_farmRecheckSet = false;
-	int m_worktimeout = 90;
+	int m_worktimeout = 180;
 	bool m_precompute = true;

 #if ETH_STRATUM || !ETH_TRUE
--- a/libethash-cl/ethash_cl_miner.cpp
+++ b/libethash-cl/ethash_cl_miner.cpp
@ -324,9 +324,11 @@ void ethash_cl_miner::finish()
 		m_queue.finish();
 }

+
 bool ethash_cl_miner::init(
-	uint8_t const* _dag,
-	uint64_t _dagSize,
+	ethash_light_t _light, 
+	uint8_t const* _lightData, 
+	uint64_t _lightSize,
 	unsigned _platformId,
 	unsigned _deviceId
 )
@ -399,12 +401,17 @@ bool ethash_cl_miner::init(
 		if (m_globalWorkSize % s_workgroupSize != 0)
 			m_globalWorkSize = ((m_globalWorkSize / s_workgroupSize) + 1) * s_workgroupSize;

+		uint64_t dagSize = ethash_get_datasize(_light->block_number);
+		uint32_t dagSize128 = (unsigned)(dagSize / ETHASH_MIX_BYTES);
+		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
+
 		// patch source code
 		// note: ETHASH_CL_MINER_KERNEL is simply ethash_cl_miner_kernel.cl compiled
 		// into a byte array by bin2h.cmake. There is no need to load the file by hand in runtime
 		string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
 		addDefinition(code, "GROUP_SIZE", s_workgroupSize);
-		addDefinition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
+		addDefinition(code, "DAG_SIZE", dagSize128);
+		addDefinition(code, "LIGHT_SIZE", lightSize64);
 		addDefinition(code, "ACCESSES", ETHASH_ACCESSES);
 		addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
 		addDefinition(code, "PLATFORM", platformId);
@ -430,16 +437,19 @@ bool ethash_cl_miner::init(
 		// create buffer for dag
 		try
 		{
-			ETHCL_LOG("Creating one big buffer for the DAG");
-			m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize);
-			ETHCL_LOG("Loading single big chunk kernels");
+			ETHCL_LOG("Creating cache buffer");
+			m_light = cl::Buffer(m_context, CL_MEM_READ_ONLY, _lightSize);
+			ETHCL_LOG("Creating DAG buffer");
+			m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, dagSize);
+			ETHCL_LOG("Loading kernels");
 			m_searchKernel = cl::Kernel(program, "ethash_search");
-			ETHCL_LOG("Mapping one big chunk.");
-			m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
+			m_dagKernel = cl::Kernel(program, "ethash_calculate_dag_item");
+			ETHCL_LOG("Writing cache buffer");
+			m_queue.enqueueWriteBuffer(m_light, CL_TRUE, 0, _lightSize, _lightData);
 		}
 		catch (cl::Error const& err)
 		{
-			ETHCL_LOG("Allocating/mapping single buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing.");
+			ETHCL_LOG("Allocating/mapping DAG buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing.");
 			return false;
 		}
 		// create buffer for header
@ -456,6 +466,28 @@ bool ethash_cl_miner::init(
 			ETHCL_LOG("Creating mining buffer " << i);
 			m_searchBuffer[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_maxSearchResults + 1) * sizeof(uint32_t));
 		}
+
+		ETHCL_LOG("Generating DAG data");
+
+		uint32_t const work = (uint32_t)(dagSize / sizeof(node));
+		//while (work < blocks * threads) blocks /= 2;
+
+		uint32_t fullRuns = work / m_globalWorkSize;
+		uint32_t const restWork = work % m_globalWorkSize;
+		if (restWork > 0) fullRuns++;
+
+		m_dagKernel.setArg(1, m_light);
+		m_dagKernel.setArg(2, m_dag);
+		m_dagKernel.setArg(3, ~0u);
+
+		for (uint32_t i = 0; i < fullRuns; i++)
+		{
+			m_dagKernel.setArg(0, i * m_globalWorkSize);
+			m_queue.enqueueNDRangeKernel(m_dagKernel, cl::NullRange, m_globalWorkSize, s_workgroupSize);
+			m_queue.finish();
+			printf("%.0f%%\n", 100.0f * (float)i / (float)fullRuns);
+		}
+
 	}
 	catch (cl::Error const& err)
 	{
--- a/libethash-cl/ethash_cl_miner.h
+++ b/libethash-cl/ethash_cl_miner.h
@ -51,12 +51,13 @@ public:
 		uint64_t _currentBlock
 	);

-	bool init(
-		uint8_t const* _dag,
-		uint64_t _dagSize,
-		unsigned _platformId = 0,
-		unsigned _deviceId = 0
-	);
+	bool ethash_cl_miner::init(
+		ethash_light_t _light,
+		uint8_t const* _lightData,
+		uint64_t _lightSize,
+		unsigned _platformId,
+		unsigned _deviceId
+		);
 	void finish();
 	void search(uint8_t const* _header, uint64_t _target, search_hook& _hook);

@ -74,7 +75,9 @@ private:
 	cl::Context m_context;
 	cl::CommandQueue m_queue;
 	cl::Kernel m_searchKernel;
+	cl::Kernel m_dagKernel;
 	cl::Buffer m_dag;
+	cl::Buffer m_light;
 	cl::Buffer m_header;
 	cl::Buffer m_searchBuffer[c_bufferCount];
 	unsigned m_globalWorkSize;
--- a/libethash-cl/ethash_cl_miner_kernel.cl
+++ b/libethash-cl/ethash_cl_miner_kernel.cl
@ -1,11 +1,12 @@
 #define OPENCL_PLATFORM_UNKNOWN 0
 #define OPENCL_PLATFORM_NVIDIA  1
-#define OPENCL_PLATFORM_AMD			2
+#define OPENCL_PLATFORM_AMD		2

+#define ETHASH_DATASET_PARENTS 256
+#define NODE_WORDS (64/4)

 #define THREADS_PER_HASH (128 / 16)
 #define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
-
 #define FNV_PRIME	0x01000193

 __constant uint2 const Keccak_f1600_RC[24] = {
@ -176,8 +177,6 @@ static void keccak_f1600_round(uint2* a, uint r)

 static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate)
 {
-
-
 	// Originally I unrolled the first and last rounds to interface
 	// better with surrounding code, however I haven't done this
 	// without causing the AMD compiler to blow up the VGPR usage.
@ -227,6 +226,18 @@ typedef struct
 	ulong ulongs[32 / sizeof(ulong)];
 } hash32_t;

+typedef union {
+	uint	 words[64 / sizeof(uint)];
+	uint2	 uint2s[64 / sizeof(uint2)];
+	uint4	 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union {
+	uint	 words[200 / sizeof(uint)];
+	uint2	 uint2s[200 / sizeof(uint2)];
+	uint4	 uint4s[200 / sizeof(uint4)];
+} hash200_t;
+
 typedef struct
 {
 	uint4 uint4s[128 / sizeof(uint4)];
@ -334,3 +345,35 @@ __kernel void ethash_search(
 		g_output[slot] = gid;
 	}
 }
+
+static void SHA3_512(uint2* s, uint isolate)
+{
+	for (uint i = 8; i != 25; ++i)
+	{
+		s[i] = (uint2){ 0, 0 };
+	}
+	s[8].x = 0x00000001;
+	s[8].y = 0x80000000;
+	keccak_f1600_no_absorb(s, 8, isolate);
+}
+
+__kernel void ethash_calculate_dag_item(uint start, __global hash64_t const* g_light, __global hash64_t * g_dag, uint isolate)
+{
+	uint const node_index = start + get_global_id(0);
+	if (node_index > DAG_SIZE * 2) return;
+
+	hash200_t dag_node;
+	copy(dag_node.uint4s, g_light[node_index % LIGHT_SIZE].uint4s, 4);
+	dag_node.words[0] ^= node_index;
+	SHA3_512(dag_node.uint2s, isolate);
+
+	for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
+		uint parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % LIGHT_SIZE;
+
+		for (uint w = 0; w != 4; ++w) {
+			dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
+		}
+	}
+	SHA3_512(dag_node.uint2s, isolate);
+	copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
+}
--- a/libethash-cuda/dagger_shared.cuh
+++ b/libethash-cuda/dagger_shared.cuh
@ -1,9 +1,5 @@
 #include "ethash_cuda_miner_kernel_globals.h"
 #include "ethash_cuda_miner_kernel.h"
-#include "keccak_u64.cuh"
-#include "fnv.cuh"
-
-#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }

 typedef union {
 	uint4	 uint4s[4];
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@ -1,7 +1,5 @@
 #include "ethash_cuda_miner_kernel_globals.h"
 #include "ethash_cuda_miner_kernel.h"
-#include "keccak.cuh"
-#include "fnv.cuh"

 #define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)

--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@ -199,7 +199,7 @@ void ethash_cuda_miner::finish()
 	CUDA_SAFE_CALL(cudaDeviceReset());
 }

-bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
+bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId)
 {
 	try
 	{
@ -224,25 +224,38 @@ bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _d
 		m_search_buf = new volatile uint32_t *[s_numStreams];
 		m_streams = new cudaStream_t[s_numStreams];

-		uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
+		uint64_t dagSize = ethash_get_datasize(_light->block_number);
+		uint32_t dagSize128   = (unsigned)(dagSize / ETHASH_MIX_BYTES);
+		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
+
+		// create buffer for cache
+		hash64_t * light;
+		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
+		// copy dag to CPU.
+		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));

 		// create buffer for dag
 		hash128_t * dag;
-		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), _dagSize));
+		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
 		// copy dag to CPU.
-		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));
+		//CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));

+		
 		// create mining buffers
 		for (unsigned i = 0; i != s_numStreams; ++i)
 		{
 			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
 			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
 		}
-		set_constants(dag, dagSize128);
+		set_constants(dag, dagSize128, light, lightSize64);
 		memset(&m_current_header, 0, sizeof(hash32_t));
 		m_current_target = 0;
 		m_current_nonce = 0;
 		m_current_index = 0;
+
+		cout << "Generating DAG..." << endl;
+		ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0]);
+
 		return true;
 	}
 	catch (runtime_error)
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@ -34,11 +34,13 @@ public:
 		unsigned _scheduleFlag,
 		uint64_t _currentBlock
 		);
-	bool init(
+	/*bool init(
 		uint8_t const* _dag,
 		uint64_t _dagSize,
 		unsigned _deviceId = 0
-		);
+		);*/
+	bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId);
+
 	void finish();
 	void search(uint8_t const* header, uint64_t target, search_hook& hook);

--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@ -8,13 +8,19 @@
 #include "ethash_cuda_miner_kernel_globals.h"
 #include "cuda_helper.h"

+#include "fnv.cuh"
+
+#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
+
 #define SHUFFLE_MIN_VER 300

 #if __CUDA_ARCH__ < SHUFFLE_MIN_VER
+#include "keccak_u64.cuh"
 #include "dagger_shared.cuh"
 #define TPB		128
 #define BPSM	4
 #else
+#include "keccak.cuh"
 #include "dagger_shuffled.cuh"
 #define TPB		896
 #define BPSM	1
@ -43,17 +49,86 @@ void run_ethash_search(
 	uint64_t start_nonce
 )
 {
-	ethash_search << <blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce);
+	ethash_search <<<blocks, threads, (sizeof(compute_hash_share) * threads) / THREADS_PER_HASH, stream >> >(g_output, start_nonce);
+	CUDA_SAFE_CALL(cudaGetLastError());
+}
+
+#define ETHASH_DATASET_PARENTS 256
+#define NODE_WORDS (64/4)
+
+__global__ void
+__launch_bounds__(128, 7)
+ethash_calculate_dag_item(uint32_t start)
+{
+ 	uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
+	if (node_index > d_dag_size * 2) return;
+	
+	hash200_t dag_node;
+	copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
+	dag_node.words[0] ^= node_index;
+	SHA3_512(dag_node.uint2s);
+
+	const int thread_id = threadIdx.x & 3;
+
+	for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
+		uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
+		
+		/* fix this some time. or not.
+		for (uint32_t t = 0; t < 4; t++) {
+			uint32_t shuffle_index = __shfl(parent_index, t, 4);
+			uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
+			if (t == thread_id) {
+				for (uint32_t w = 0; w < 4; w++) {
+					uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
+					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
+				}
+			}
+		}
+		*/
+
+		for (unsigned w = 0; w != 4; ++w) {
+			dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
+		}
+	}
+	SHA3_512(dag_node.uint2s);
+	hash64_t * dag_nodes = (hash64_t *)d_dag;
+	copy(dag_nodes[node_index].uint4s, dag_node.uint4s, 4);
+}
+
+void ethash_generate_dag(
+	uint64_t dag_size,
+	uint32_t blocks,
+	uint32_t threads,
+	cudaStream_t stream
+	)
+{
+	uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t));
+	//while (work < blocks * threads) blocks /= 2;
+
+	uint32_t fullRuns = work / (blocks * threads);
+	uint32_t const restWork = work % (blocks * threads);
+	if (restWork > 0) fullRuns++;
+	for (uint32_t i = 0; i < fullRuns; i++)
+	{
+		ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads);
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+		printf("%.0f%%\n",100.0f * (float)i / (float)fullRuns);
+	}
+	
 	CUDA_SAFE_CALL(cudaGetLastError());
 }

 void set_constants(
 	hash128_t* _dag,
-	uint32_t _dag_size
+	uint32_t _dag_size,
+	hash64_t * _light,
+	uint32_t _light_size
 	)
 {
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *)));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t *)));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
 }

 void set_header(
--- a/libethash-cuda/ethash_cuda_miner_kernel.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel.h
@ -19,10 +19,23 @@ typedef struct
 	uint4	 uint4s[128 / sizeof(uint4)];
 } hash128_t;

+typedef union {
+	uint32_t words[64 / sizeof(uint32_t)];
+	uint2	 uint2s[64 / sizeof(uint2)];
+	uint4	 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union {
+	uint32_t words[200 / sizeof(uint32_t)];
+	uint2	 uint2s[200 / sizeof(uint2)];
+	uint4	 uint4s[200 / sizeof(uint4)];
+} hash200_t;

 void set_constants(
 	hash128_t* _dag,
-	uint32_t _dag_size
+	uint32_t _dag_size,
+	hash64_t * _light,
+	uint32_t _light_size
 	);

 void set_header(
@ -41,6 +54,13 @@ void run_ethash_search(
 	uint64_t start_nonce
 	);

+void ethash_generate_dag(
+	uint64_t dag_size,
+	uint32_t blocks,
+	uint32_t threads,
+	cudaStream_t stream
+	);
+

 #define CUDA_SAFE_CALL(call)								\
 do {														\
--- a/libethash-cuda/ethash_cuda_miner_kernel_globals.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel_globals.h
@ -5,6 +5,8 @@

 __constant__ uint32_t d_dag_size;
 __constant__ hash128_t* d_dag;
+__constant__ uint32_t d_light_size;
+__constant__ hash64_t* d_light;
 __constant__ hash32_t d_header;
 __constant__ uint64_t d_target;

--- a/libethash-cuda/fnv.cuh
+++ b/libethash-cuda/fnv.cuh
@ -1,3 +1,4 @@
+
 #define FNV_PRIME	0x01000193

 #define fnv(x,y) ((x) * FNV_PRIME ^(y))
@ -15,4 +16,5 @@ __device__ uint4 fnv4(uint4 a, uint4 b)
 __device__ uint32_t fnv_reduce(uint4 v)
 {
 	return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
-}
+}
+
--- a/libethash-cuda/keccak.cuh
+++ b/libethash-cuda/keccak.cuh
@ -588,4 +588,190 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* s)
 	/* iota: a[0,0] ^= round constant */
 	//s[0] ^= vectorize(keccak_round_constants[23]);
 	return devectorize(s[0]) ^ keccak_round_constants[23];
+}
+
+__device__ __forceinline__ void SHA3_512(uint2* s) {
+	
+	uint2 t[5], u, v;
+
+	for (uint32_t i = 8; i < 25; i++)
+	{
+		s[i] = make_uint2(0, 0);
+	}
+	s[8].x = 1;
+	s[8].y = 0x80000000;
+
+	for (int i = 0; i < 23; i++)
+	{
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+		t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+		t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+		t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+		t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+		u = ROL2(t[1], 1);
+		s[0] = xor3(s[0], t[4], u);
+		s[5] = xor3(s[5], t[4], u);
+		s[10] = xor3(s[10], t[4], u);
+		s[15] = xor3(s[15], t[4], u);
+		s[20] = xor3(s[20], t[4], u);
+
+		u = ROL2(t[2], 1);
+		s[1] = xor3(s[1], t[0], u);
+		s[6] = xor3(s[6], t[0], u);
+		s[11] = xor3(s[11], t[0], u);
+		s[16] = xor3(s[16], t[0], u);
+		s[21] = xor3(s[21], t[0], u);
+
+		u = ROL2(t[3], 1);
+		s[2] = xor3(s[2], t[1], u);
+		s[7] = xor3(s[7], t[1], u);
+		s[12] = xor3(s[12], t[1], u);
+		s[17] = xor3(s[17], t[1], u);
+		s[22] = xor3(s[22], t[1], u);
+
+		u = ROL2(t[4], 1);
+		s[3] = xor3(s[3], t[2], u);
+		s[8] = xor3(s[8], t[2], u);
+		s[13] = xor3(s[13], t[2], u);
+		s[18] = xor3(s[18], t[2], u);
+		s[23] = xor3(s[23], t[2], u);
+
+
+		u = ROL2(t[0], 1);
+		s[4] = xor3(s[4], t[3], u);
+		s[9] = xor3(s[9], t[3], u);
+		s[14] = xor3(s[14], t[3], u);
+		s[19] = xor3(s[19], t[3], u);
+		s[24] = xor3(s[24], t[3], u);
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		u = s[1];
+
+		s[1] = ROL2(s[6], 44);
+		s[6] = ROL2(s[9], 20);
+		s[9] = ROL2(s[22], 61);
+		s[22] = ROL2(s[14], 39);
+		s[14] = ROL2(s[20], 18);
+		s[20] = ROL2(s[2], 62);
+		s[2] = ROL2(s[12], 43);
+		s[12] = ROL2(s[13], 25);
+		s[13] = ROL2(s[19], 8);
+		s[19] = ROL2(s[23], 56);
+		s[23] = ROL2(s[15], 41);
+		s[15] = ROL2(s[4], 27);
+		s[4] = ROL2(s[24], 14);
+		s[24] = ROL2(s[21], 2);
+		s[21] = ROL2(s[8], 55);
+		s[8] = ROL2(s[16], 45);
+		s[16] = ROL2(s[5], 36);
+		s[5] = ROL2(s[3], 28);
+		s[3] = ROL2(s[18], 21);
+		s[18] = ROL2(s[17], 15);
+		s[17] = ROL2(s[11], 10);
+		s[11] = ROL2(s[7], 6);
+		s[7] = ROL2(s[10], 3);
+		s[10] = ROL2(u, 1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+		u = s[0]; v = s[1];
+		s[0] = chi(s[0], s[1], s[2]);
+		s[1] = chi(s[1], s[2], s[3]);
+		s[2] = chi(s[2], s[3], s[4]);
+		s[3] = chi(s[3], s[4], u);
+		s[4] = chi(s[4], u, v);
+
+		u = s[5]; v = s[6];
+		s[5] = chi(s[5], s[6], s[7]);
+		s[6] = chi(s[6], s[7], s[8]);
+		s[7] = chi(s[7], s[8], s[9]);
+		s[8] = chi(s[8], s[9], u);
+		s[9] = chi(s[9], u, v);
+
+		u = s[10]; v = s[11];
+		s[10] = chi(s[10], s[11], s[12]);
+		s[11] = chi(s[11], s[12], s[13]);
+		s[12] = chi(s[12], s[13], s[14]);
+		s[13] = chi(s[13], s[14], u);
+		s[14] = chi(s[14], u, v);
+
+		u = s[15]; v = s[16];
+		s[15] = chi(s[15], s[16], s[17]);
+		s[16] = chi(s[16], s[17], s[18]);
+		s[17] = chi(s[17], s[18], s[19]);
+		s[18] = chi(s[18], s[19], u);
+		s[19] = chi(s[19], u, v);
+
+		u = s[20]; v = s[21];
+		s[20] = chi(s[20], s[21], s[22]);
+		s[21] = chi(s[21], s[22], s[23]);
+		s[22] = chi(s[22], s[23], s[24]);
+		s[23] = chi(s[23], s[24], u);
+		s[24] = chi(s[24], u, v);
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= vectorize(keccak_round_constants[i]);
+	}
+
+	/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+	t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+	t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+	t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+	t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+	t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+	/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+	/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+	u = ROL2(t[1], 1);
+	s[0] = xor3(s[0], t[4], u);
+	s[10] = xor3(s[10], t[4], u);
+
+	u = ROL2(t[2], 1);
+	s[6] = xor3(s[6], t[0], u);
+	s[16] = xor3(s[16], t[0], u);
+
+	u = ROL2(t[3], 1);
+	s[12] = xor3(s[12], t[1], u);
+	s[22] = xor3(s[22], t[1], u);
+
+	u = ROL2(t[4], 1);
+	s[3] = xor3(s[3], t[2], u);
+	s[18] = xor3(s[18], t[2], u);
+
+	u = ROL2(t[0], 1);
+	s[9] = xor3(s[9], t[3], u);
+	s[24] = xor3(s[24], t[3], u);
+
+	/* rho pi: b[..] = rotl(a[..], ..) */
+	u = s[1];
+
+	s[1] = ROL2(s[6], 44);
+	s[6] = ROL2(s[9], 20);
+	s[9] = ROL2(s[22], 61);
+	s[2] = ROL2(s[12], 43);
+	s[4] = ROL2(s[24], 14);
+	s[8] = ROL2(s[16], 45);
+	s[5] = ROL2(s[3], 28);
+	s[3] = ROL2(s[18], 21);
+	s[7] = ROL2(s[10], 3);
+
+	/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+	u = s[0]; v = s[1];
+	s[0] = chi(s[0], s[1], s[2]);
+	s[1] = chi(s[1], s[2], s[3]);
+	s[2] = chi(s[2], s[3], s[4]);
+	s[3] = chi(s[3], s[4], u);
+	s[4] = chi(s[4], u, v);
+	s[5] = chi(s[5], s[6], s[7]);
+	s[6] = chi(s[6], s[7], s[8]);
+	s[7] = chi(s[7], s[8], s[9]);
+
+	/* iota: a[0,0] ^= round constant */
+	s[0] ^= vectorize(keccak_round_constants[23]);
 }
--- a/libethash-cuda/keccak_u64.cuh
+++ b/libethash-cuda/keccak_u64.cuh
@ -586,4 +586,192 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint64_t* s)
 	/* iota: a[0,0] ^= round constant */
 	//s[0] ^= vectorize(keccak_round_constants[23]);
 	return s[0] ^ keccak_round_constants[23];
+}
+
+__device__ __forceinline__ void SHA3_512(uint2* s2) {
+
+	uint64_t * s = (uint64_t*)s2; //dirty
+
+	uint64_t t[5], u, v;
+
+	for (uint32_t i = 9; i < 25; i++)
+	{
+		s[i] = 0;
+	}
+	s[8] = 0x8000000000000001;
+
+	for (int i = 0; i < 23; i++)
+	{
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+		t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+		t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+		t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+		t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+		u = ROTL64(t[1], 1);
+		s[0] = xor3(s[0], t[4], u);
+		s[5] = xor3(s[5], t[4], u);
+		s[10] = xor3(s[10], t[4], u);
+		s[15] = xor3(s[15], t[4], u);
+		s[20] = xor3(s[20], t[4], u);
+
+		u = ROTL64(t[2], 1);
+		s[1] = xor3(s[1], t[0], u);
+		s[6] = xor3(s[6], t[0], u);
+		s[11] = xor3(s[11], t[0], u);
+		s[16] = xor3(s[16], t[0], u);
+		s[21] = xor3(s[21], t[0], u);
+
+		u = ROTL64(t[3], 1);
+		s[2] = xor3(s[2], t[1], u);
+		s[7] = xor3(s[7], t[1], u);
+		s[12] = xor3(s[12], t[1], u);
+		s[17] = xor3(s[17], t[1], u);
+		s[22] = xor3(s[22], t[1], u);
+
+		u = ROTL64(t[4], 1);
+		s[3] = xor3(s[3], t[2], u);
+		s[8] = xor3(s[8], t[2], u);
+		s[13] = xor3(s[13], t[2], u);
+		s[18] = xor3(s[18], t[2], u);
+		s[23] = xor3(s[23], t[2], u);
+
+
+		u = ROTL64(t[0], 1);
+		s[4] = xor3(s[4], t[3], u);
+		s[9] = xor3(s[9], t[3], u);
+		s[14] = xor3(s[14], t[3], u);
+		s[19] = xor3(s[19], t[3], u);
+		s[24] = xor3(s[24], t[3], u);
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		u = s[1];
+
+		s[1] = ROTL64(s[6], 44);
+		s[6] = ROTL64(s[9], 20);
+		s[9] = ROTL64(s[22], 61);
+		s[22] = ROTL64(s[14], 39);
+		s[14] = ROTL64(s[20], 18);
+		s[20] = ROTL64(s[2], 62);
+		s[2] = ROTL64(s[12], 43);
+		s[12] = ROTL64(s[13], 25);
+		s[13] = ROTL64(s[19], 8);
+		s[19] = ROTL64(s[23], 56);
+		s[23] = ROTL64(s[15], 41);
+		s[15] = ROTL64(s[4], 27);
+		s[4] = ROTL64(s[24], 14);
+		s[24] = ROTL64(s[21], 2);
+		s[21] = ROTL64(s[8], 55);
+		s[8] = ROTL64(s[16], 45);
+		s[16] = ROTL64(s[5], 36);
+		s[5] = ROTL64(s[3], 28);
+		s[3] = ROTL64(s[18], 21);
+		s[18] = ROTL64(s[17], 15);
+		s[17] = ROTL64(s[11], 10);
+		s[11] = ROTL64(s[7], 6);
+		s[7] = ROTL64(s[10], 3);
+		s[10] = ROTL64(u, 1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+		u = s[0]; v = s[1];
+		s[0] = chi(s[0], s[1], s[2]);
+		s[1] = chi(s[1], s[2], s[3]);
+		s[2] = chi(s[2], s[3], s[4]);
+		s[3] = chi(s[3], s[4], u);
+		s[4] = chi(s[4], u, v);
+
+		u = s[5]; v = s[6];
+		s[5] = chi(s[5], s[6], s[7]);
+		s[6] = chi(s[6], s[7], s[8]);
+		s[7] = chi(s[7], s[8], s[9]);
+		s[8] = chi(s[8], s[9], u);
+		s[9] = chi(s[9], u, v);
+
+		u = s[10]; v = s[11];
+		s[10] = chi(s[10], s[11], s[12]);
+		s[11] = chi(s[11], s[12], s[13]);
+		s[12] = chi(s[12], s[13], s[14]);
+		s[13] = chi(s[13], s[14], u);
+		s[14] = chi(s[14], u, v);
+
+		u = s[15]; v = s[16];
+		s[15] = chi(s[15], s[16], s[17]);
+		s[16] = chi(s[16], s[17], s[18]);
+		s[17] = chi(s[17], s[18], s[19]);
+		s[18] = chi(s[18], s[19], u);
+		s[19] = chi(s[19], u, v);
+
+		u = s[20]; v = s[21];
+		s[20] = chi(s[20], s[21], s[22]);
+		s[21] = chi(s[21], s[22], s[23]);
+		s[22] = chi(s[22], s[23], s[24]);
+		s[23] = chi(s[23], s[24], u);
+		s[24] = chi(s[24], u, v);
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= keccak_round_constants[i];
+	}
+
+	/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+	t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+	t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+	t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+	t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+	t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+	/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+	/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+	u = ROTL64(t[1], 1);
+	s[0] = xor3(s[0], t[4], u);
+	s[10] = xor3(s[10], t[4], u);
+
+	u = ROTL64(t[2], 1);
+	s[6] = xor3(s[6], t[0], u);
+	s[16] = xor3(s[16], t[0], u);
+
+	u = ROTL64(t[3], 1);
+	s[12] = xor3(s[12], t[1], u);
+	s[22] = xor3(s[22], t[1], u);
+
+	u = ROTL64(t[4], 1);
+	s[3] = xor3(s[3], t[2], u);
+	s[18] = xor3(s[18], t[2], u);
+
+	u = ROTL64(t[0], 1);
+	s[9] = xor3(s[9], t[3], u);
+	s[24] = xor3(s[24], t[3], u);
+
+	/* rho pi: b[..] = rotl(a[..], ..) */
+	u = s[1];
+
+	s[1] = ROTL64(s[6], 44);
+	s[6] = ROTL64(s[9], 20);
+	s[9] = ROTL64(s[22], 61);
+	s[2] = ROTL64(s[12], 43);
+	s[4] = ROTL64(s[24], 14);
+	s[8] = ROTL64(s[16], 45);
+	s[5] = ROTL64(s[3], 28);
+	s[3] = ROTL64(s[18], 21);
+	s[7] = ROTL64(s[10], 3);
+
+	/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+	u = s[0]; v = s[1];
+	s[0] = chi(s[0], s[1], s[2]);
+	s[1] = chi(s[1], s[2], s[3]);
+	s[2] = chi(s[2], s[3], s[4]);
+	s[3] = chi(s[3], s[4], u);
+	s[4] = chi(s[4], u, v);
+	s[5] = chi(s[5], s[6], s[7]);
+	s[6] = chi(s[6], s[7], s[8]);
+	s[7] = chi(s[7], s[8], s[9]);
+
+	/* iota: a[0,0] ^= round constant */
+	s[0] ^= keccak_round_constants[23];
 }
--- a/libethcore/EthashCUDAMiner.cpp
+++ b/libethcore/EthashCUDAMiner.cpp
@ -112,16 +112,8 @@ EthashCUDAMiner::EthashCUDAMiner(ConstructionInfo const& _ci) :
 	Worker("cudaminer" + toString(index())),
 m_hook( new EthashCUDAHook(this))
 {
-/*
-#if defined(WIN32)
-	SYSTEM_INFO sysinfo;
-	GetSystemInfo(&sysinfo);
-	int num_cpus = sysinfo.dwNumberOfProcessors;
-	SetThreadAffinityMask(GetCurrentThread(), 1 << (index() % num_cpus));
-	SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
-#endif
-*/
 }
+
 EthashCUDAMiner::~EthashCUDAMiner()
 {
 	pause();
@ -149,8 +141,7 @@ void EthashCUDAMiner::workLoop()
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
 		WorkPackage w = work();
-		//cnote << "seedhash" << "#" + m_minerSeed.hex().substr(0, 16);
-		cnote << "set work to" << "#" + w.headerHash.hex().substr(0, 8) + ", target " << "#" + w.boundary.hex().substr(0, 16);
+		cnote << "set work; seed: " << "#" + w.seedHash.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 16);
 		if (!m_miner || m_minerSeed != w.seedHash)
 		{
 			cnote << "Initialising miner...";
@ -161,7 +152,9 @@ void EthashCUDAMiner::workLoop()

 			unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index();

+			/*
 			EthashAux::FullType dag;
+
 			while (true)
 			{
 				if ((dag = EthashAux::full(w.seedHash, true)))
@ -175,8 +168,14 @@ void EthashCUDAMiner::workLoop()
 				cnote << "Awaiting DAG";
 				this_thread::sleep_for(chrono::milliseconds(500));
 			}
-			bytesConstRef dagData = dag->data();
-			m_miner->init(dagData.data(), dagData.size(), device);
+			*/
+
+			EthashAux::LightType light;
+			light = EthashAux::light(w.seedHash);
+			//bytesConstRef dagData = dag->data();
+			bytesConstRef lightData = light->data();
+			
+			m_miner->init(light->light, lightData.data(), lightData.size(), device);
 		}

 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
--- a/libethcore/EthashGPUMiner.cpp
+++ b/libethcore/EthashGPUMiner.cpp
@ -150,6 +150,7 @@ void EthashGPUMiner::workLoop()

 			unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index();

+			/*
 			EthashAux::FullType dag;
 			while (true)
 			{
@ -164,8 +165,13 @@ void EthashGPUMiner::workLoop()
 				cnote << "Awaiting DAG";
 				this_thread::sleep_for(chrono::milliseconds(500));
 			}
-			bytesConstRef dagData = dag->data();
-			m_miner->init(dagData.data(), dagData.size(), s_platformId, device);
+			*/
+
+			EthashAux::LightType light;
+			light = EthashAux::light(w.seedHash);
+			bytesConstRef lightData = light->data();
+
+			m_miner->init(light->light, lightData.data(), lightData.size(), s_platformId,  device);
 		}

 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
--- a/libstratum/EthStratumClient.cpp
+++ b/libstratum/EthStratumClient.cpp
@ -305,7 +305,7 @@ void EthStratumClient::processReponse(Json::Value& responseObject)
 					h256 headerHash = h256(sHeaderHash);
 					EthashAux::FullType dag;

-
+					/*
 					if (seedHash != m_current.seedHash)
 					{
 						cnote << "Grabbing DAG for" << seedHash;
@ -318,6 +318,8 @@ void EthStratumClient::processReponse(Json::Value& responseObject)
 					{
 						EthashAux::computeFull(sha3(seedHash), true);
 					}
+					*/
+
 					if (headerHash != m_current.headerHash)
 					{
 						x_current.lock();