Merge branch '117'

8 years ago · 690a0f75b8
50 changed files with 2577 additions and 791 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,7 +2,7 @@
 cmake_minimum_required(VERSION 2.8.12)

 set(PROJECT_VERSION "0.9.41")
-set(GENOIL_VERSION "1.0.8")
+set(GENOIL_VERSION "1.1.7")
 if (${CMAKE_VERSION} VERSION_GREATER 3.0)
 	cmake_policy(SET CMP0042 OLD) 	# fix MACOSX_RPATH
 	cmake_policy(SET CMP0048 NEW) 	# allow VERSION argument in project()
@ -266,11 +266,13 @@ message("-- MINER            Build miner                              ${MINER}")
 message("-- GUI              Build GUI components                     ${GUI}")
 message("-- ETHASHCL         Build OpenCL components                  ${ETHASHCL}")
 message("-- ETHASHCUDA       Build CUDA components                    ${ETHASHCUDA}")
-message("-- ETHSTRATUM       Build Stratum components (experimental)  ${ETHSTRATUM}")
+message("-- ETHSTRATUM       Build Stratum components                 ${ETHSTRATUM}")
 message("------------------------------------------------------------------------")
 message("")

+if (NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC")
 set(CMAKE_THREAD_LIBS_INIT pthread)
+endif()

 include(EthCompilerSettings)
 message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
--- a/CodingStandards.txt
+++ b/CodingStandards.txt
@ -1,240 +0,0 @@
-0. Formatting
-
-GOLDEN RULE: Never *ever* use spaces for formatting.
-
-a. Use tabs for indentation!
- tab stops are every 4 characters.
- One indentation level -> exactly one byte (i.e. a tab character) in the source file.
- Never use spaces to line up sequential lines: If you have run-on lines, indent as you would for a block.
-b. Line widths:
- Don't worry about having lines of code > 80-char wide.
- Lines of comments should be formatted according to ease of viewing, but simplicity is to be prefered over beauty.
-c. Don't use braces for condition-body one-liners.
-d. Never place condition bodies on same line as condition.
-e. Space between first paren and keyword, but *not* following first paren or preceeding final paren.
-f. No spaces when fewer than intra-expression three parens together; when three or more, space according to clarity.
-g. No spaces for subscripting or unary operators.
-h. No space before ':' but one after it, except in the ternary operator: one on both sides.
-i. Space all other operators.
-j. Braces, when used, always have their own lines and are at same indentation level as "parent" scope.
-
-(WRONG)
-if( a==b[ i ] ) { printf ("Hello\n"); }
-foo->bar(someLongVariableName,
-         anotherLongVariableName,
-         anotherLongVariableName,
-         anotherLongVariableName,
-         anotherLongVariableName);
-
-(RIGHT)
-if (a == b[i])
-    printf("Hello\n");	// NOTE spaces used instead of tab here for clarity - first byte should be '\t'.
-foo->bar(
-    someLongVariableName,
-    anotherLongVariableName,
-    anotherLongVariableName,
-    anotherLongVariableName,
-    anotherLongVariableName
-);
-
-
-
-1. Namespaces;
-
-a. No "using namespace" declarations in header files.
-b. All symbols should be declared in a namespace except for final applications.
-c. Preprocessor symbols should be prefixed with the namespace in all-caps and an underscore.
-
-(WRONG)
-#include <cassert>
-using namespace std;
-tuple<float, float> meanAndSigma(vector<float> const& _v);
-
-(CORRECT)
-#include <cassert>
-std::tuple<float, float> meanAndSigma(std::vector<float> const& _v);
-
-
-
-2. Preprocessor;
-
-a. File comment is always at top, and includes:
- Original author, date.
- Later maintainers (not contributors - they can be seen through VCS log).
- Copyright.
- License (e.g. see COPYING).
-b. Never use #ifdef/#define/#endif file guards. Prefer #pragma once as first line below file comment.
-c. Prefer static const variable to value macros.
-d. Prefer inline constexpr functions to function macros.
-e. Split complex macro on multiple lines with '\'.
-
-
-
-3. Capitalization;
-
-GOLDEN RULE: Preprocessor: ALL_CAPS; C++: camelCase.
-
-a. Use camelCase for splitting words in names, except where obviously extending STL/boost functionality in which case follow those naming conventions.
-b. The following entities' first alpha is upper case:
- Type names.
- Template parameters.
- Enum members.
- static const variables that form an external API.
-c. All preprocessor symbols (macros, macro argments) in full uppercase with underscore word separation.
-
-
-All other entities' first alpha is lower case.
-
-
-
-4. Variable prefixes:
-
-a. Leading underscore "_" to parameter names (both normal and template).
- Exception: "o_parameterName" when it is used exclusively for output. See 6(f).
- Exception: "io_parameterName" when it is used for both input and output. See 6(f).
-b. Leading "c_" to const variables (unless part of an external API).
-c. Leading "g_" to global (non-const) variables.
-d. Leading "s_" to static (non-const, non-global) variables.
-
-
-
-5. Error reporting:
-
- Prefer exception to bool/int return type.
-
-
-
-6. Declarations:
-
-a. {Typename} + {qualifiers} + {name}.
-b. Only one per line.
-c. Associate */& with type, not variable (at ends with parser, but more readable, and safe if in conjunction with (b)).
-d. Favour declarations close to use; don't habitually declare at top of scope ala C.
-e. Always pass non-trivial parameters with a const& suffix.
-f. If a function returns multiple values, use std::tuple (std::pair acceptable). Prefer not using */& arguments, except where efficiency requires.
-g. Never use a macro where adequate non-preprocessor C++ can be written.
-h. Make use of auto whenever type is clear or unimportant:
- Always avoid doubly-stating the type.
- Use to avoid vast and unimportant type declarations.
- However, avoid using auto where type is not immediately obvious from the context, and especially not for arithmetic expressions.
-i. Don't pass bools: prefer enumerations instead.
-j. Prefer enum class to straight enum.
-
-
-(WRONG)
-const double d = 0;
-int i, j;
-char *s;
-float meanAndSigma(std::vector<float> _v, float* _sigma, bool _approximate);
-Derived* x(dynamic_cast<Derived*>(base));
-for (map<ComplexTypeOne, ComplexTypeTwo>::iterator i = l.begin(); i != l.end(); ++l) {}
-
-
-(CORRECT)
-enum class Accuracy
-{
-	Approximate,
-	Exact
-};
-double const d = 0;
-int i;
-int j;
-char* s;
-std::tuple<float, float> meanAndSigma(std::vector<float> const& _v, Accuracy _a);
-auto x = dynamic_cast<Derived*>(base);
-for (auto i = x.begin(); i != x.end(); ++i) {}
-
-
-7. Structs & classes
-
-a. Structs to be used when all members public and no virtual functions.
- In this case, members should be named naturally and not prefixed with 'm_'
-b. Classes to be used in all other circumstances.
-
-
-
-8. Members:
-
-a. One member per line only.
-b. Private, non-static, non-const fields prefixed with m_.
-c. Avoid public fields, except in structs.
-d. Use override, final and const as much as possible.
-e. No implementations with the class declaration, except:
- template or force-inline method (though prefer implementation at bottom of header file).
- one-line implementation (in which case include it in same line as declaration).
-f. For a property 'foo'
- Member: m_foo;
- Getter: foo() [ also: for booleans, isFoo() ];
- Setter: setFoo();
-
-
-
-9. Naming
-
-a. Collection conventions:
- -s means std::vector e.g. using MyTypes = std::vector<MyType>
- -Set means std::set e.g. using MyTypeSet = std::set<MyType>
- -Hash means std::unordered_set e.g. using MyTypeHash = std::unordered_set<MyType>
-b. Class conventions:
- -Face means the interface of some shared concept. (e.g. FooFace might be a pure virtual class.)
-c. Avoid unpronouncable names;
- If you need to shorten a name favour a pronouncable slice of the original to a scatterred set of consonants.
- e.g. Manager shortens to Man rather than Mgr.
-d. Avoid prefixes of initials (e.g. DON'T use IMyInterface, CMyImplementation)
-e. Find short, memorable & (at least semi-) descriptive names for commonly used classes or name-fragments.
- A dictionary and thesaurus are your friends.
- Spell correctly.
- Think carefully about the class's purpose.
- Imagine it as an isolated component to try to decontextualise it when considering its name.
- Don't be trapped into naming it (purely) in terms of its implementation.
-
-
-
-10. Type-definitions
-
-a. Prefer 'using' to 'typedef'. e.g. using ints = std::vector<int>; rather than typedef std::vector<int> ints;
-b. Generally avoid shortening a standard form that already includes all important information:
- e.g. stick to shared_ptr<X> rather than shortening to ptr<X>.
-c. Where there are exceptions to this (due to excessive use and clear meaning), note the change prominently and use it consistently.
- e.g. using Guard = std::lock_guard<std::mutex>; ///< Guard is used throughout the codebase since it's clear in meaning and used commonly. 
-d. In general expressions should be roughly as important/semantically meaningful as the space they occupy.
-
-
-11. Commenting
-
-a. Comments should be doxygen-compilable, using @notation rather than \notation.
-b. Document the interface, not the implementation.
- Documentation should be able to remain completely unchanged, even if the method is reimplemented.
- Comment in terms of the method properties and intended alteration to class state (or what aspects of the state it reports).
- Be careful to scrutinise documentation that extends only to intended purpose and usage.
- Reject documentation that is simply an English transaction of the implementation.
-
-
-
-12. Include Headers
-
-a. Includes should go in order of lower level (STL -> boost -> libdevcore -> libdevcrypto -> libethcore -> libethereum) to higher level. Lower levels are basically dependencies to the higher levels. For example:
-
-#include <string>
-#include <boost/filesystem.hpp>
-
-#include <libdevcore/Common.h>
-#include <libdevcore/CommonData.h>
-#include <libdevcore/Exceptions.h>
-#include <libdevcore/Log.h>
-#include <libdevcrypto/SHA3.h>
-#include <libethereum/Defaults.h>
-
-b. The only exception to the above rule is the top of a .cpp file where its corresponding header should be located.
-
-13. Logging
-
-Logging should be performed at appropriate verbosities depending on the logging message.
-The more likely a message is to repeat (and thus cuase noise) the higher in verbosity it should be.
-Some rules to keep in mind:
-
- - Verbosity == 0 -> Reserved for important stuff that users must see and can understand.
- - Verbosity == 1 -> Reserved for stuff that users don't need to see but can understand.
- - Verbosity >= 2 -> Anything that is or might be displayed more than once every minute
- - Verbosity >= 3 -> Anything that only a developer would understand
- - Verbosity >= 4 -> Anything that is low-level (e.g. peer disconnects, timers being cancelled)
--- a/EthereumMacOSXBundleInfo.plist.in
+++ b/EthereumMacOSXBundleInfo.plist.in
@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-	<key>CFBundleDevelopmentRegion</key>
-	<string>English</string>
-	<key>CFBundleExecutable</key>
-	<string>${MACOSX_BUNDLE_EXECUTABLE_NAME}</string>
-	<key>CFBundleGetInfoString</key>
-	<string>${MACOSX_BUNDLE_INFO_STRING}</string>
-	<key>CFBundleIconFile</key>
-	<string>${MACOSX_BUNDLE_ICON_FILE}</string>
-	<key>CFBundleIdentifier</key>
-	<string>${MACOSX_BUNDLE_GUI_IDENTIFIER}</string>
-	<key>CFBundleInfoDictionaryVersion</key>
-	<string>6.0</string>
-	<key>CFBundleLongVersionString</key>
-	<string>${MACOSX_BUNDLE_LONG_VERSION_STRING}</string>
-	<key>CFBundleName</key>
-	<string>${MACOSX_BUNDLE_BUNDLE_NAME}</string>
-	<key>CFBundlePackageType</key>
-	<string>APPL</string>
-	<key>CFBundleShortVersionString</key>
-	<string>${MACOSX_BUNDLE_SHORT_VERSION_STRING}</string>
-	<key>CFBundleSignature</key>
-	<string>????</string>
-	<key>CFBundleVersion</key>
-	<string>${MACOSX_BUNDLE_BUNDLE_VERSION}</string>
-	<key>CSResourcesFileMapped</key>
-	<true/>
-	<key>LSRequiresCarbon</key>
-	<true/>
-	<key>NSHumanReadableCopyright</key>
-	<string>${MACOSX_BUNDLE_COPYRIGHT}</string>
-	<key>NSHighResolutionCapable</key>
-	<true/>
-</dict>
-</plist>
--- a/appdmg.json.in
+++ b/appdmg.json.in
@ -1,13 +0,0 @@
-{
-    "title": "Ethereum",
-    "icon": "appdmg_icon.icns",
-    "background": "appdmg_background.png",
-    "icon-size": 55,
-    "contents": [
-        { "x": 242, "y": 240, "type": "link", "path": "/Applications" },
-        { "x": 145, "y": 125,  "type": "file", "path": "${ETH_ALETHZERO_APP}" },
-        { "x": 339, "y": 125, "type": "file", "path": "${ETH_MIX_APP}" }
-    ]
-}
-
-
--- a/11
+++ b/11
@ -1,11 +0,0 @@
-style=allman
-indent=force-tab=4
-convert-tabs
-indent-preprocessor
-min-conditional-indent=1
-pad-oper
-pad-header
-unpad-paren
-align-pointer=type
-keep-one-line-blocks
-close-templates
--- a/cmake/scripts/macdeployfix.sh
+++ b/cmake/scripts/macdeployfix.sh
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@ -60,6 +60,7 @@
 #endif
 #if ETH_STRATUM || !ETH_TRUE
 #include <libstratum/EthStratumClient.h>
+#include <libstratum/EthStratumClientV2.h>
 #endif
 using namespace std;
 using namespace dev;
@ -187,6 +188,41 @@ public:
 			if (p + 1 <= userpass.length())
 				m_pass = userpass.substr(p+1);
 		}
+		else if ((arg == "-SC" || arg == "--stratum-client") && i + 1 < argc)
+		{
+			try {
+				m_stratumClientVersion = atoi(argv[++i]);
+				if (m_stratumClientVersion > 2) m_stratumClientVersion = 2;
+				else if (m_stratumClientVersion < 1) m_stratumClientVersion = 1;
+			}
+			catch (...)
+			{
+				cerr << "Bad " << arg << " option: " << argv[i] << endl;
+				BOOST_THROW_EXCEPTION(BadArgument());
+			}
+		}
+		else if ((arg == "-SP" || arg == "--stratum-protocol") && i + 1 < argc)
+		{
+			try {
+				m_stratumProtocol = atoi(argv[++i]);
+			}
+			catch (...)
+			{
+				cerr << "Bad " << arg << " option: " << argv[i] << endl;
+				BOOST_THROW_EXCEPTION(BadArgument());
+			}
+		}
+		else if ((arg == "-SE" || arg == "--stratum-email") && i + 1 < argc)
+		{
+			try {
+				m_email = string(argv[++i]);
+			}
+			catch (...)
+			{
+				cerr << "Bad " << arg << " option: " << argv[i] << endl;
+				BOOST_THROW_EXCEPTION(BadArgument());
+			}
+		}
 		else if ((arg == "-FO" || arg == "--failover-userpass") && i + 1 < argc)
 		{
 			string userpass = string(argv[++i]);
@ -311,6 +347,23 @@ public:
 		else if (arg == "--cuda-streams" && i + 1 < argc)
 			m_numStreams = stol(argv[++i]);
 #endif
+		else if ((arg == "-L" || arg == "--dag-load-mode") && i + 1 < argc)
+		{
+			string mode = argv[++i];
+			if (mode == "parallel") m_dagLoadMode = DAG_LOAD_MODE_PARALLEL;
+			else if (mode == "sequential") m_dagLoadMode = DAG_LOAD_MODE_SEQUENTIAL;
+			else if (mode == "single")
+			{
+				m_dagLoadMode = DAG_LOAD_MODE_SINGLE;
+				m_dagCreateDevice = stol(argv[++i]);
+			}
+			else
+			{
+				cerr << "Bad " << arg << " option: " << argv[i] << endl;
+				BOOST_THROW_EXCEPTION(BadArgument());
+			}
+		}
+		/*
 		else if (arg == "--phone-home" && i + 1 < argc)
 		{
 			string m = argv[++i];
@ -324,6 +377,7 @@ public:
 				BOOST_THROW_EXCEPTION(BadArgument());
 			}
 		}
+		*/
 		else if (arg == "--benchmark-warmup" && i + 1 < argc)
 			try {
 				m_benchmarkWarmup = stol(argv[++i]);
@ -360,77 +414,9 @@ public:
 		{
 			m_minerType = MinerType::CUDA;
 		}
-		else if (arg == "--current-block" && i + 1 < argc)
-			m_currentBlock = stol(argv[++i]);
-		else if ((arg == "-R" || arg == "--dag-dir") && i + 1 < argc)
-		{
-			strcpy(s_dagDir, argv[++i]);
-		}
-		else if ((arg == "-E" || arg == "--erase-dags") && i + 1 < argc)
+		else if (arg == "-X" || arg == "--cuda-opencl")
 		{
-			string m = string(argv[++i]);
-			if (m == "none") m_eraseMode = DAGEraseMode::None;
-			else if (m == "old") m_eraseMode = DAGEraseMode::Old;
-			else if (m == "bench") m_eraseMode = DAGEraseMode::Bench;
-			else if (m == "all") m_eraseMode = DAGEraseMode::All;
-			else
-			{
-				cerr << "Bad " << arg << " option: " << argv[i] << endl;
-				BOOST_THROW_EXCEPTION(BadArgument());
-			}
-		}
-		else if (arg == "--no-precompute")
-			m_precompute = false;
-		else if ((arg == "-D" || arg == "--create-dag") && i + 1 < argc)
-		{
-			string m = boost::to_lower_copy(string(argv[++i]));
-			mode = OperationMode::DAGInit;
-			try
-			{
-				m_initDAG = stol(m);
-			}
-			catch (...)
-			{
-				cerr << "Bad " << arg << " option: " << m << endl;
-				BOOST_THROW_EXCEPTION(BadArgument());
-			}
-		}
-		else if ((arg == "-w" || arg == "--check-pow") && i + 4 < argc)
-		{
-			string m;
-			try
-			{
-				Ethash::BlockHeader bi;
-				m = boost::to_lower_copy(string(argv[++i]));
-				h256 powHash(m);
-				m = boost::to_lower_copy(string(argv[++i]));
-				h256 seedHash;
-				if (m.size() == 64 || m.size() == 66)
-					seedHash = h256(m);
-				else
-					seedHash = EthashAux::seedHash(stol(m));
-				m = boost::to_lower_copy(string(argv[++i]));
-				bi.setDifficulty(u256(m));
-				auto boundary = bi.boundary();
-				m = boost::to_lower_copy(string(argv[++i]));
-				bi.setNonce(h64(m));
-				auto r = EthashAux::eval(seedHash, powHash, bi.nonce());
-				bool valid = r.value < boundary;
-				cout << (valid ? "VALID :-)" : "INVALID :-(") << endl;
-				cout << r.value << (valid ? " < " : " >= ") << boundary << endl;
-				cout << "  where " << boundary << " = 2^256 / " << bi.difficulty() << endl;
-				cout << "  and " << r.value << " = ethash(" << powHash << ", " << bi.nonce() << ")" << endl;
-				cout << "  with seed as " << seedHash << endl;
-				if (valid)
-					cout << "(mixHash = " << r.mixHash << ")" << endl;
-				cout << "SHA3( light(seed) ) = " << sha3(EthashAux::light(bi.seedHash())->data()) << endl;
-				exit(0);
-			}
-			catch (...)
-			{
-				cerr << "Bad " << arg << " option: " << m << endl;
-				BOOST_THROW_EXCEPTION(BadArgument());
-			}
+			m_minerType = MinerType::Mixed;
 		}
 		else if (arg == "-M" || arg == "--benchmark")
 		{
@ -494,22 +480,14 @@ public:

 	void execute()
 	{
-		EthashAux::setDAGDirName(s_dagDir);
-		EthashAux::setDAGEraseMode(m_eraseMode);
-		EthashAux::eraseDAGs();
-		if (m_eraseMode == DAGEraseMode::All)
-		{
-			m_eraseMode = DAGEraseMode::None;
-		}
-
 		if (m_shouldListDevices)
 		{
 #if ETH_ETHASHCL || !ETH_TRUE
-			if (m_minerType == MinerType::CL)
+			if (m_minerType == MinerType::CL || m_minerType == MinerType::Mixed)
 				EthashGPUMiner::listDevices();
 #endif
 #if ETH_ETHASHCUDA || !ETH_TRUE
-			if (m_minerType == MinerType::CUDA)
+			if (m_minerType == MinerType::CUDA || m_minerType == MinerType::Mixed)
 				EthashCUDAMiner::listDevices();
 #endif
 			if (m_minerType == MinerType::CPU)
@ -518,8 +496,11 @@ public:
 		}

 		if (m_minerType == MinerType::CPU)
-			EthashCPUMiner::setNumInstances(m_miningThreads);
-		else if (m_minerType == MinerType::CL)
+		{
+			cout << "CPU mining is no longer supported in this miner. Use -G (opencl) or -U (cuda) flag to select GPU platform." << endl;
+			exit(0);
+		}
+		else if (m_minerType == MinerType::CL || m_minerType == MinerType::Mixed)
 		{
 #if ETH_ETHASHCL || !ETH_TRUE
 			if (m_openclDeviceCount > 0)
@ -535,7 +516,9 @@ public:
 					m_openclDevice,
 					m_clAllowCPU,
 					m_extraGPUMemory,
-					m_currentBlock
+					0,
+					m_dagLoadMode,
+					m_dagCreateDevice
 				))
 				exit(1);
 			EthashGPUMiner::setNumInstances(m_miningThreads);
@ -544,7 +527,7 @@ public:
 			exit(1);
 #endif
 		}
-		else if (m_minerType == MinerType::CUDA)
+		else if (m_minerType == MinerType::CUDA || m_minerType == MinerType::Mixed)
 		{
 #if ETH_ETHASHCUDA || !ETH_TRUE
 			if (m_cudaDeviceCount > 0)
@ -560,7 +543,9 @@ public:
 				m_numStreams,
 				m_extraGPUMemory,
 				m_cudaSchedule,
-				m_currentBlock
+				0,
+				m_dagLoadMode,
+				m_dagCreateDevice
 				))
 				exit(1);
 #else
@ -568,9 +553,7 @@ public:
 			exit(1);
 #endif
 		}
-		if (mode == OperationMode::DAGInit)
-			doInitDAG(m_initDAG);
-		else if (mode == OperationMode::Benchmark)
+		if (mode == OperationMode::Benchmark)
 			doBenchmark(m_minerType, m_phoneHome, m_benchmarkWarmup, m_benchmarkTrial, m_benchmarkTrials);
 		else if (mode == OperationMode::Farm)
 			doFarm(m_minerType, m_activeFarmURL, m_farmRecheckPeriod);
@ -596,14 +579,17 @@ public:
 			<< "	-FS, --failover-stratum <host:port>  Failover stratum server at host:port" << endl
 			<< "    -O, --userpass <username.workername:password> Stratum login credentials" << endl
 			<< "    -FO, --failover-userpass <username.workername:password> Failover stratum login credentials (optional, will use normal credentials when omitted)" << endl
-			<< "    --work-timeout <n> reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 60. Don't set lower than max. avg. block time" << endl
+			<< "    --work-timeout <n> reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 180. Don't set lower than max. avg. block time" << endl
+			<< "    -SC, --stratum-client <n>  Stratum client version. Defaults to 1 (async client). Use 2 to use the new synchronous client." << endl
+			<< "    -SP, --stratum-protocol <n> Choose which stratum protocol to use:" << endl
+			<< "        0: official stratum spec: ethpool, ethermine, coinotron, mph, nanopool (default)" << endl
+			<< "        1: eth-proxy compatible: dwarfpool, f2pool, nanopool" << endl
+			<< "        2: EthereumStratum/1.0.0: nicehash" << endl
+			<< "    -SE, --stratum-email <s> Email address used in eth-proxy (optional)" << endl
 #endif
 #if ETH_JSONRPC || ETH_STRATUM || !ETH_TRUE
 			<< "    --farm-recheck <n>  Leave n ms between checks for changed work (default: 500). When using stratum, use a high value (i.e. 2000) to get more stable hashrate output" << endl
-			<< "    --no-precompute  Don't precompute the next epoch's DAG." << endl
 #endif
-			<< "Ethash verify mode:" << endl
-			<< "    -w,--check-pow <headerHash> <seedHash> <difficulty> <nonce>  Check PoW credentials for validity." << endl
 			<< endl
 			<< "Benchmarking mode:" << endl
 			<< "    -M [<n>],--benchmark [<n>] Benchmark for mining and exit; Optionally specify block number to benchmark against specific DAG." << endl
@ -612,28 +598,20 @@ public:
 			<< "    --benchmark-trials <n>  Set the duration of warmup for the benchmark tests (default: 5)." << endl
 			<< "Simulation mode:" << endl
 			<< "    -Z [<n>],--simulation [<n>] Mining test mode. Used to validate kernel optimizations. Optionally specify block number." << endl
-#if ETH_JSONRPC || !ETH_TRUE
-			<< "    --phone-home <on/off>  When benchmarking, publish results (default: off)" << endl
-#endif
-			<< "DAG file management:" << endl
-			<< "    -D,--create-dag <number>  Create the DAG in preparation for mining on given block and exit." << endl
-			<< "    -R <s>, --dag-dir <s> Store/Load DAG files in/from the specified directory. Useful for running multiple instances with different configurations." << endl
-			<< "    -E <mode>, --erase-dags <mode> Erase unneeded DAG files. Default is 'none'. Possible values are:" << endl
-			<< "        none  - don't erase DAG files (default)" << endl
-			<< "        old   - erase all DAG files older than current epoch" << endl
-			<< "		bench - like old, but keep epoch 0 for benchmarking" << endl
-			<< "        all   - erase all DAG files. After deleting all files, setting changes to none." << endl
 			<< "Mining configuration:" << endl
-			<< "    -C,--cpu  When mining, use the CPU." << endl
 			<< "    -G,--opencl  When mining use the GPU via OpenCL." << endl
 			<< "    -U,--cuda  When mining use the GPU via CUDA." << endl
+			<< "    -X,--cuda-opencl Use OpenCL + CUDA in a system with mixed AMD/Nvidia cards. May require setting --opencl-platform 1" << endl
 			<< "    --opencl-platform <n>  When mining using -G/--opencl use OpenCL platform n (default: 0)." << endl
 			<< "    --opencl-device <n>  When mining using -G/--opencl use OpenCL device n (default: 0)." << endl
 			<< "    --opencl-devices <0 1 ..n> Select which OpenCL devices to mine on. Default is to use all" << endl
 			<< "    -t, --mining-threads <n> Limit number of CPU/GPU miners to n (default: use everything available on selected platform)" << endl
 			<< "    --allow-opencl-cpu Allows CPU to be considered as an OpenCL device if the OpenCL platform supports it." << endl
 			<< "    --list-devices List the detected OpenCL/CUDA devices and exit. Should be combined with -G or -U flag" << endl
-			<< "    --current-block Let the miner know the current block number at configuration time. Will help determine DAG size and required GPU memory." << endl
+			<< "    -L, --dag-load-mode <mode> DAG generation mode." << endl
+			<< "        parallel    - load DAG on all GPUs at the same time (default)" << endl
+			<< "        sequential  - load DAG on GPUs one after another. Use this when the miner crashes during DAG generation" << endl
+			<< "        single <n>  - generate DAG on device n, then copy to other devices" << endl
 #if ETH_ETHASHCL || !ETH_TRUE
 			<< "    --cl-extragpu-mem Set the memory (in MB) you believe your GPU requires for stuff other than mining. default: 0" << endl
 			<< "    --cl-local-work Set the OpenCL local work size. Default is " << toString(ethash_cl_miner::c_defaultLocalWorkSize) << endl
@ -655,7 +633,6 @@ public:
 	}

 	MinerType minerType() const { return m_minerType; }
-	bool shouldPrecompute() const { return m_precompute; }

 private:
 	void doInitDAG(unsigned _n)
@ -691,16 +668,16 @@ private:
 		cout << "Benchmarking on platform: " << platformInfo << endl;

 		cout << "Preparing DAG for block #" << m_benchmarkBlock << endl;
-		genesis.prep();
+		//genesis.prep();

 		genesis.setDifficulty(u256(1) << 63);
 		f.setWork(genesis);
 		if (_m == MinerType::CPU)
-			f.start("cpu");
+			f.start("cpu", false);
 		else if (_m == MinerType::CL)
-			f.start("opencl");
+			f.start("opencl", false);
 		else if (_m == MinerType::CUDA)
-			f.start("cuda");
+			f.start("cuda", false);

 		map<uint64_t, WorkingProgress> results;
 		uint64_t mean = 0;
@ -774,17 +751,17 @@ private:
 		cout << "Running mining simulation on platform: " << platformInfo << endl;

 		cout << "Preparing DAG for block #" << m_benchmarkBlock << endl;
-		genesis.prep();
+		//genesis.prep();

 		genesis.setDifficulty(u256(1) << difficulty);
 		f.setWork(genesis);

 		if (_m == MinerType::CPU)
-			f.start("cpu");
+			f.start("cpu", false);
 		else if (_m == MinerType::CL)
-			f.start("opencl");
+			f.start("opencl", false);
 		else if (_m == MinerType::CUDA)
-			f.start("cuda");
+			f.start("cuda", false);

 		int time = 0;

@ -806,13 +783,7 @@ private:
 				this_thread::sleep_for(chrono::milliseconds(1000));
 				time++;
 			}
-			//cnote << "Solution found";
 			cnote << "Difficulty:" << difficulty << "  Nonce:" << solution.nonce.hex();
-			//cnote << "  Mixhash:" << solution.mixHash.hex();
-			//cnote << "  Header-hash:" << current.headerHash.hex();
-			//cnote << "  Seedhash:" << current.seedHash.hex();
-			//cnote << "  Target: " << h256(current.boundary).hex();
-			//cnote << "  Ethash: " << h256(EthashAux::eval(current.seedHash, current.headerHash, solution.nonce).value).hex();
 			if (EthashAux::eval(current.seedHash, current.headerHash, solution.nonce).value < current.boundary)
 			{
 				cnote << "SUCCESS: GPU gave correct result!";
@ -869,11 +840,11 @@ private:
 		GenericFarm<EthashProofOfWork> f;
 		f.setSealers(sealers);
 		if (_m == MinerType::CPU)
-			f.start("cpu");
+			f.start("cpu", false);
 		else if (_m == MinerType::CL)
-			f.start("opencl");
+			f.start("opencl", false);
 		else if (_m == MinerType::CUDA)
-			f.start("cuda");
+			f.start("cuda", false);
 		EthashProofOfWork::WorkPackage current, previous;
 		boost::mutex x_current;
 		EthashAux::FullType dag;
@ -911,18 +882,7 @@ private:
 					Json::Value v = prpc->eth_getWork();
 					h256 hh(v[0].asString());
 					h256 newSeedHash(v[1].asString());
-					if (current.seedHash != newSeedHash)
-					{
-						minelog << "Grabbing DAG for" << newSeedHash;
-					}
-					if (!(dag = EthashAux::full(newSeedHash, true, [&](unsigned _pc){ cout << "\rCreating DAG. " << _pc << "% done..." << flush; return 0; })))
-					{
-						BOOST_THROW_EXCEPTION(DAGCreationFailure());
-					}
-					if (m_precompute)
-					{
-						EthashAux::computeFull(sha3(newSeedHash), true);
-					}
+
 					if (hh != current.headerHash)
 					{
 						x_current.lock();
@ -933,8 +893,6 @@ private:
 						current.seedHash = newSeedHash;
 						current.boundary = h256(fromHex(v[2].asString()), h256::AlignRight);
 						minelog << "Got work package: #" + current.headerHash.hex().substr(0,8);
-						//minelog << "  Seedhash:" << current.seedHash.hex();
-						//minelog << "  Target: " << h256(current.boundary).hex();
 						f.setWork(current);
 						x_current.unlock();
 					}
@ -942,11 +900,6 @@ private:
 				}
 				cnote << "Solution found; Submitting to" << _remote << "...";
 				cnote << "  Nonce:" << solution.nonce.hex();
-				//cnote << "  Mixhash:" << solution.mixHash.hex();
-				//cnote << "  Header-hash:" << current.headerHash.hex();
-				//cnote << "  Seedhash:" << solved.seedHash.hex();
-				//cnote << "  Target: " << h256(solved.boundary).hex();
-				//cnote << "  Ethash: " << h256(EthashAux::eval(solved.seedHash, solved.headerHash, solution.nonce).value).hex();
 				if (EthashAux::eval(current.seedHash, current.headerHash, solution.nonce).value < current.boundary)
 				{
 					bool ok = prpc->eth_submitWork("0x" + toString(solution.nonce), "0x" + toString(current.headerHash), "0x" + toString(solution.mixHash));
@ -958,6 +911,7 @@ private:
 						cwarn << ":-( Not accepted.";
 						f.rejectedSolution(false);
 					}
+					//exit(0);
 				}
 				else if (EthashAux::eval(previous.seedHash, previous.headerHash, solution.nonce).value < previous.boundary)
 				{
@ -970,6 +924,7 @@ private:
 						cwarn << ":-( Not accepted.";
 						f.rejectedSolution(true);
 					}
+					//exit(0);
 				}
 				else {
 					f.failedSolution();
@ -1030,45 +985,89 @@ private:
 			m_farmRecheckPeriod = m_defaultStratumFarmRecheckPeriod;
 		
 		GenericFarm<EthashProofOfWork> f;
-		EthStratumClient client(&f, m_minerType, m_farmURL, m_port, m_user, m_pass, m_maxFarmRetries, m_worktimeout, m_precompute);
-		if (m_farmFailOverURL != "")
-		{
-			if (m_fuser != "")
+
+		// this is very ugly, but if Stratum Client V2 tunrs out to be a success, V1 will be completely removed anyway
+		if (m_stratumClientVersion == 1) {
+			EthStratumClient client(&f, m_minerType, m_farmURL, m_port, m_user, m_pass, m_maxFarmRetries, m_worktimeout, m_stratumProtocol, m_email);
+			if (m_farmFailOverURL != "")
 			{
-				client.setFailover(m_farmFailOverURL, m_fport, m_fuser, m_fpass);
+				if (m_fuser != "")
+				{
+					client.setFailover(m_farmFailOverURL, m_fport, m_fuser, m_fpass);
+				}
+				else
+				{
+					client.setFailover(m_farmFailOverURL, m_fport);
+				}
 			}
-			else
+			f.setSealers(sealers);
+
+			f.onSolutionFound([&](EthashProofOfWork::Solution sol)
+			{
+				if (client.isConnected()) {
+					client.submit(sol);
+				}
+				else {
+					cwarn << "Can't submit solution: Not connected";
+				}
+				return false;
+			});
+
+			while (client.isRunning())
 			{
-				client.setFailover(m_farmFailOverURL, m_fport);
+				auto mp = f.miningProgress();
+				f.resetMiningProgress();
+				if (client.isConnected())
+				{
+					if (client.current())
+						minelog << "Mining on PoWhash" << "#" + (client.currentHeaderHash().hex().substr(0, 8)) << ": " << mp << f.getSolutionStats();
+					else if (client.waitState() == MINER_WAIT_STATE_WORK)
+						minelog << "Waiting for work package...";
+				}
+				this_thread::sleep_for(chrono::milliseconds(m_farmRecheckPeriod));
 			}
 		}
-		f.setSealers(sealers);
+		else if (m_stratumClientVersion == 2) {
+			EthStratumClientV2 client(&f, m_minerType, m_farmURL, m_port, m_user, m_pass, m_maxFarmRetries, m_worktimeout, m_stratumProtocol, m_email);
+			if (m_farmFailOverURL != "")
+			{
+				if (m_fuser != "")
+				{
+					client.setFailover(m_farmFailOverURL, m_fport, m_fuser, m_fpass);
+				}
+				else
+				{
+					client.setFailover(m_farmFailOverURL, m_fport);
+				}
+			}
+			f.setSealers(sealers);

-		f.onSolutionFound([&](EthashProofOfWork::Solution sol)
-		{
-			client.submit(sol);
-			return false;
-		});
-		 
-		while (client.isRunning())
-		{
-			auto mp = f.miningProgress();
-			f.resetMiningProgress();
-			if (client.isConnected())
+			f.onSolutionFound([&](EthashProofOfWork::Solution sol)
 			{
-				if (client.current())
-					minelog << "Mining on PoWhash" << "#"+(client.currentHeaderHash().hex().substr(0,8)) << ": " << mp << f.getSolutionStats();
-				else if (client.waitState() == MINER_WAIT_STATE_WORK)
-					minelog << "Waiting for work package...";
+				client.submit(sol);
+				return false;
+			});
+
+			while (client.isRunning())
+			{
+				auto mp = f.miningProgress();
+				f.resetMiningProgress();
+				if (client.isConnected())
+				{
+					if (client.current())
+						minelog << "Mining on PoWhash" << "#" + (client.currentHeaderHash().hex().substr(0, 8)) << ": " << mp << f.getSolutionStats();
+					else if (client.waitState() == MINER_WAIT_STATE_WORK)
+						minelog << "Waiting for work package...";
+				}
+				this_thread::sleep_for(chrono::milliseconds(m_farmRecheckPeriod));
 			}
-			this_thread::sleep_for(chrono::milliseconds(m_farmRecheckPeriod));
 		}
+
 	}
 #endif

 	/// Operating mode.
 	OperationMode mode;
-	DAGEraseMode m_eraseMode = DAGEraseMode::None;

 	/// Mining options
 	bool m_running = true;
@ -1094,17 +1093,13 @@ private:
 	unsigned m_numStreams = ethash_cuda_miner::c_defaultNumStreams;
 	unsigned m_cudaSchedule = 4; // sync
 #endif
-	uint64_t m_currentBlock = 0;
-	static char s_dagDir[256];
 	// default value was 350MB of GPU memory for other stuff (windows system rendering, e.t.c.)
 	unsigned m_extraGPUMemory = 0;// 350000000; don't assume miners run desktops...
-
-	/// DAG initialisation param.
-	unsigned m_initDAG = 0;
-
+	unsigned m_dagLoadMode = 0; // parallel
+	unsigned m_dagCreateDevice = 0;
 	/// Benchmarking params
 	bool m_phoneHome = false;
-	unsigned m_benchmarkWarmup = 3;
+	unsigned m_benchmarkWarmup = 15;
 	unsigned m_benchmarkTrial = 3;
 	unsigned m_benchmarkTrials = 5;
 	unsigned m_benchmarkBlock = 0;
@ -1119,17 +1114,17 @@ private:
 	unsigned m_farmRecheckPeriod = 500;
 	unsigned m_defaultStratumFarmRecheckPeriod = 2000;
 	bool m_farmRecheckSet = false;
-	int m_worktimeout = 90;
-	bool m_precompute = true;
+	int m_worktimeout = 180;

 #if ETH_STRATUM || !ETH_TRUE
+	int m_stratumClientVersion = 1;
+	int m_stratumProtocol = STRATUM_PROTOCOL_STRATUM;
 	string m_user;
 	string m_pass;
 	string m_port;
 	string m_fuser = "";
 	string m_fpass = "";
+	string m_email = "";
 #endif
 	string m_fport = "";
 };
-
-char MinerCLI::s_dagDir[256] = ""; 
--- a/ethminer/main.cpp
+++ b/ethminer/main.cpp
@ -72,7 +72,7 @@ int main(int argc, char** argv)
 	cout << "=====================================================================" << endl;
 	cout << "Forked from github.com/ethereum/cpp-ethereum" << endl;
 	cout << "CUDA kernel ported from Tim Hughes' OpenCL kernel" << endl;
-	cout << "With contributions from nerdralph, RoBiK, tpruvot and sp_ " << endl << endl;
+	cout << "With contributions from nicehash, nerdralph, RoBiK and sp_ " << endl << endl;
 	cout << "Please consider a donation to:" << endl;
 	cout << "ETH: 0xeb9310b185455f863f526dab3d245809f6854b4d" << endl << endl;

--- a/extdep/getstuff.bat
+++ b/extdep/getstuff.bat
@ -36,3 +36,4 @@ cd ..

 goto :EOF

+
--- a/install-folder-bg.png
+++ b/install-folder-bg.png
--- a/install-folder-bg@2x.png
+++ b/install-folder-bg@2x.png
--- a/libethash-cl/ethash_cl_miner.cpp
+++ b/libethash-cl/ethash_cl_miner.cpp
@ -205,6 +205,7 @@ bool ethash_cl_miner::configureGPU(
 	s_initialGlobalWorkSize = _globalWorkSize;
 	s_allowCPU = _allowCPU;
 	s_extraRequiredGPUMem = _extraGPUMemory;
+
 	// by default let's only consider the DAG of the first epoch
 	uint64_t dagSize = ethash_get_datasize(_currentBlock);
 	uint64_t requiredSize =  dagSize + _extraGPUMemory;
@ -324,9 +325,11 @@ void ethash_cl_miner::finish()
 		m_queue.finish();
 }

+
 bool ethash_cl_miner::init(
-	uint8_t const* _dag,
-	uint64_t _dagSize,
+	ethash_light_t _light, 
+	uint8_t const* _lightData, 
+	uint64_t _lightSize,
 	unsigned _platformId,
 	unsigned _deviceId
 )
@ -399,12 +402,17 @@ bool ethash_cl_miner::init(
 		if (m_globalWorkSize % s_workgroupSize != 0)
 			m_globalWorkSize = ((m_globalWorkSize / s_workgroupSize) + 1) * s_workgroupSize;

+		uint64_t dagSize = ethash_get_datasize(_light->block_number);
+		uint32_t dagSize128 = (unsigned)(dagSize / ETHASH_MIX_BYTES);
+		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
+
 		// patch source code
 		// note: ETHASH_CL_MINER_KERNEL is simply ethash_cl_miner_kernel.cl compiled
 		// into a byte array by bin2h.cmake. There is no need to load the file by hand in runtime
 		string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
 		addDefinition(code, "GROUP_SIZE", s_workgroupSize);
-		addDefinition(code, "DAG_SIZE", (unsigned)(_dagSize / ETHASH_MIX_BYTES));
+		addDefinition(code, "DAG_SIZE", dagSize128);
+		addDefinition(code, "LIGHT_SIZE", lightSize64);
 		addDefinition(code, "ACCESSES", ETHASH_ACCESSES);
 		addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
 		addDefinition(code, "PLATFORM", platformId);
@ -430,16 +438,19 @@ bool ethash_cl_miner::init(
 		// create buffer for dag
 		try
 		{
-			ETHCL_LOG("Creating one big buffer for the DAG");
-			m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize);
-			ETHCL_LOG("Loading single big chunk kernels");
+			ETHCL_LOG("Creating cache buffer");
+			m_light = cl::Buffer(m_context, CL_MEM_READ_ONLY, _lightSize);
+			ETHCL_LOG("Creating DAG buffer");
+			m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, dagSize);
+			ETHCL_LOG("Loading kernels");
 			m_searchKernel = cl::Kernel(program, "ethash_search");
-			ETHCL_LOG("Mapping one big chunk.");
-			m_queue.enqueueWriteBuffer(m_dag, CL_TRUE, 0, _dagSize, _dag);
+			m_dagKernel = cl::Kernel(program, "ethash_calculate_dag_item");
+			ETHCL_LOG("Writing cache buffer");
+			m_queue.enqueueWriteBuffer(m_light, CL_TRUE, 0, _lightSize, _lightData);
 		}
 		catch (cl::Error const& err)
 		{
-			ETHCL_LOG("Allocating/mapping single buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing.");
+			ETHCL_LOG("Allocating/mapping DAG buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing.");
 			return false;
 		}
 		// create buffer for header
@ -456,6 +467,28 @@ bool ethash_cl_miner::init(
 			ETHCL_LOG("Creating mining buffer " << i);
 			m_searchBuffer[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_maxSearchResults + 1) * sizeof(uint32_t));
 		}
+
+		ETHCL_LOG("Generating DAG data");
+
+		uint32_t const work = (uint32_t)(dagSize / sizeof(node));
+		//while (work < blocks * threads) blocks /= 2;
+
+		uint32_t fullRuns = work / m_globalWorkSize;
+		uint32_t const restWork = work % m_globalWorkSize;
+		if (restWork > 0) fullRuns++;
+
+		m_dagKernel.setArg(1, m_light);
+		m_dagKernel.setArg(2, m_dag);
+		m_dagKernel.setArg(3, ~0u);
+
+		for (uint32_t i = 0; i < fullRuns; i++)
+		{
+			m_dagKernel.setArg(0, i * m_globalWorkSize);
+			m_queue.enqueueNDRangeKernel(m_dagKernel, cl::NullRange, m_globalWorkSize, s_workgroupSize);
+			m_queue.finish();
+			printf("OPENCL#%d: %.0f%%\n", _deviceId, 100.0f * (float)i / (float)fullRuns);
+		}
+
 	}
 	catch (cl::Error const& err)
 	{
@ -471,7 +504,7 @@ typedef struct
 	unsigned buf;
 } pending_batch;

-void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
+void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook, bool _ethStratum, uint64_t _startN)
 {
 	try
 	{
@ -498,7 +531,9 @@ void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook
 		
 		unsigned buf = 0;
 		random_device engine;
-		uint64_t start_nonce = uniform_int_distribution<uint64_t>()(engine);
+		uint64_t start_nonce;
+		if (_ethStratum) start_nonce = _startN;
+		else start_nonce = uniform_int_distribution<uint64_t>()(engine);
 		for (;; start_nonce += m_globalWorkSize)
 		{
 			// supply output buffer to kernel
--- a/libethash-cl/ethash_cl_miner.h
+++ b/libethash-cl/ethash_cl_miner.h
@ -52,13 +52,14 @@ public:
 	);

 	bool init(
-		uint8_t const* _dag,
-		uint64_t _dagSize,
-		unsigned _platformId = 0,
-		unsigned _deviceId = 0
-	);
+		ethash_light_t _light,
+		uint8_t const* _lightData,
+		uint64_t _lightSize,
+		unsigned _platformId,
+		unsigned _deviceId
+		);
 	void finish();
-	void search(uint8_t const* _header, uint64_t _target, search_hook& _hook);
+	void search(uint8_t const* _header, uint64_t _target, search_hook& _hook, bool _ethStratum, uint64_t _startN);

 	/* -- default values -- */
 	/// Default value of the local work size. Also known as workgroup size.
@ -74,7 +75,9 @@ private:
 	cl::Context m_context;
 	cl::CommandQueue m_queue;
 	cl::Kernel m_searchKernel;
+	cl::Kernel m_dagKernel;
 	cl::Buffer m_dag;
+	cl::Buffer m_light;
 	cl::Buffer m_header;
 	cl::Buffer m_searchBuffer[c_bufferCount];
 	unsigned m_globalWorkSize;
--- a/libethash-cl/ethash_cl_miner_kernel.cl
+++ b/libethash-cl/ethash_cl_miner_kernel.cl
@ -1,11 +1,36 @@
 #define OPENCL_PLATFORM_UNKNOWN 0
 #define OPENCL_PLATFORM_NVIDIA  1
-#define OPENCL_PLATFORM_AMD			2
+#define OPENCL_PLATFORM_AMD		2

+#ifndef ACCESSES
+#define ACCESSES 64
+#endif
+
+#ifndef GROUP_SIZE
+#define GROUP_SIZE 128
+#endif
+
+#ifndef MAX_OUTPUTS
+#define MAX_OUTPUTS 63U
+#endif
+
+#ifndef PLATFORM
+#define PLATFORM 2
+#endif
+
+#ifndef DAG_SIZE
+#define DAG_SIZE 8388593
+#endif
+
+#ifndef LIGHT_SIZE
+#define LIGHT_SIZE 262139
+#endif
+
+#define ETHASH_DATASET_PARENTS 256
+#define NODE_WORDS (64/4)

 #define THREADS_PER_HASH (128 / 16)
 #define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
-
 #define FNV_PRIME	0x01000193

 __constant uint2 const Keccak_f1600_RC[24] = {
@ -176,8 +201,6 @@ static void keccak_f1600_round(uint2* a, uint r)

 static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate)
 {
-
-
 	// Originally I unrolled the first and last rounds to interface
 	// better with surrounding code, however I haven't done this
 	// without causing the AMD compiler to blow up the VGPR usage.
@ -227,6 +250,18 @@ typedef struct
 	ulong ulongs[32 / sizeof(ulong)];
 } hash32_t;

+typedef union {
+	uint	 words[64 / sizeof(uint)];
+	uint2	 uint2s[64 / sizeof(uint2)];
+	uint4	 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union {
+	uint	 words[200 / sizeof(uint)];
+	uint2	 uint2s[200 / sizeof(uint2)];
+	uint4	 uint4s[200 / sizeof(uint4)];
+} hash200_t;
+
 typedef struct
 {
 	uint4 uint4s[128 / sizeof(uint4)];
@ -334,3 +369,35 @@ __kernel void ethash_search(
 		g_output[slot] = gid;
 	}
 }
+
+static void SHA3_512(uint2* s, uint isolate)
+{
+	for (uint i = 8; i != 25; ++i)
+	{
+		s[i] = (uint2){ 0, 0 };
+	}
+	s[8].x = 0x00000001;
+	s[8].y = 0x80000000;
+	keccak_f1600_no_absorb(s, 8, isolate);
+}
+
+__kernel void ethash_calculate_dag_item(uint start, __global hash64_t const* g_light, __global hash64_t * g_dag, uint isolate)
+{
+	uint const node_index = start + get_global_id(0);
+	if (node_index > DAG_SIZE * 2) return;
+
+	hash200_t dag_node;
+	copy(dag_node.uint4s, g_light[node_index % LIGHT_SIZE].uint4s, 4);
+	dag_node.words[0] ^= node_index;
+	SHA3_512(dag_node.uint2s, isolate);
+
+	for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
+		uint parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % LIGHT_SIZE;
+
+		for (uint w = 0; w != 4; ++w) {
+			dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
+		}
+	}
+	SHA3_512(dag_node.uint2s, isolate);
+	copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
+}
--- a/libethash-cuda/CMakeLists.txt
+++ b/libethash-cuda/CMakeLists.txt
@ -13,7 +13,7 @@ LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
 if(COMPUTE AND (COMPUTE GREATER 0))
 	LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
 else(COMPUTE AND (COMPUTE GREATER 0))
-	set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52)
+	set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52;-gencode arch=compute_61,code=sm_61)
 endif(COMPUTE AND (COMPUTE GREATER 0))


--- a/libethash-cuda/cuda_helper.h
+++ b/libethash-cuda/cuda_helper.h
@ -10,6 +10,7 @@
 #include <device_launch_parameters.h>
 #define __launch_bounds__(max_tpb, min_blocks)
 #define asm("a" : "=l"(result) : "l"(a))
+#define __CUDA_ARCH__ 520 // highlight shuffle code by default.

 uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
 uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
@ -337,7 +338,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
 }

 #ifndef USE_ROT_ASM_OPT
-#define USE_ROT_ASM_OPT 1
+#define USE_ROT_ASM_OPT 2
 #endif

 // 64-bit ROTATE RIGHT
--- a/libethash-cuda/dagger_shared.cuh
+++ b/libethash-cuda/dagger_shared.cuh
@ -0,0 +1,71 @@
+#include "ethash_cuda_miner_kernel_globals.h"
+#include "ethash_cuda_miner_kernel.h"
+
+typedef union {
+	uint4	 uint4s[4];
+	uint64_t ulongs[8];
+	uint32_t uints[16];
+} compute_hash_share;
+
+
+__device__ uint64_t compute_hash(
+	uint64_t nonce
+	)
+{
+	// sha3_512(header .. nonce)
+	uint64_t state[25];
+	state[4] = nonce;
+	keccak_f1600_init(state);
+	
+	// Threads work together in this phase in groups of 8.
+	const int thread_id  = threadIdx.x &  (THREADS_PER_HASH - 1);
+	const int hash_id = threadIdx.x  >> 3;
+
+	extern __shared__  compute_hash_share share[];
+	
+	for (int i = 0; i < THREADS_PER_HASH; i++)
+	{
+		// share init with other threads
+		if (i == thread_id)
+			copy(share[hash_id].ulongs, state, 8);
+
+		__syncthreads();
+
+		uint4 mix = share[hash_id].uint4s[thread_id & 3];
+		__syncthreads();
+
+		uint32_t *share0 = share[hash_id].uints;
+
+		// share init0
+		if (thread_id == 0)
+			*share0 = mix.x;
+		__syncthreads();
+		uint32_t init0 = *share0;
+
+		for (uint32_t a = 0; a < ACCESSES; a += 4)
+		{
+			int t = bfe(a, 2u, 3u);
+
+			for (uint32_t b = 0; b < 4; b++)
+			{
+				if (thread_id == t) {
+					*share0 = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
+				}
+				__syncthreads();
+
+				mix = fnv4(mix, d_dag[*share0].uint4s[thread_id]);
+			}
+		}
+
+		share[hash_id].uints[thread_id] = fnv_reduce(mix);
+		__syncthreads();
+
+		if (i == thread_id)
+			copy(state + 8, share[hash_id].ulongs, 4);
+
+		__syncthreads();
+	}
+	
+	// keccak_256(keccak_512(header..nonce) .. mix);
+	return keccak_f1600_final(state);
+}
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@ -1,12 +1,8 @@
 #include "ethash_cuda_miner_kernel_globals.h"
 #include "ethash_cuda_miner_kernel.h"
-#include "keccak.cuh"
-#include "fnv.cuh"
+#include "cuda_helper.h"

-#define ACCESSES 64
-#define THREADS_PER_HASH (128 / 16)
-
-__device__ uint64_t compute_hash_shuffle(
+__device__ uint64_t compute_hash(
 	uint64_t nonce
 	)
 {
@ -19,7 +15,6 @@ __device__ uint64_t compute_hash_shuffle(
 	
 	// Threads work together in this phase in groups of 8.
 	const int thread_id  = threadIdx.x &  (THREADS_PER_HASH - 1);
-	const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
 	const int mix_idx    = thread_id & 3;

 	uint4 mix;
@ -47,11 +42,11 @@ __device__ uint64_t compute_hash_shuffle(
 				mix = vectorize2(shuffle[6], shuffle[7]);
 		}
 		
-		uint32_t init0 = __shfl(shuffle[0].x, start_lane);
+		uint32_t init0 = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);

 		for (uint32_t a = 0; a < ACCESSES; a += 4)
 		{
-			int t = ((a >> 2) & (THREADS_PER_HASH - 1));
+			int t = bfe(a, 2u, 3u);

 			for (uint32_t b = 0; b < 4; b++)
 			{
@ -60,8 +55,7 @@ __device__ uint64_t compute_hash_shuffle(
 					shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
 				}
 				shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH);
-
-				mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
+				mix = fnv4(mix, d_dag[shuffle[0].x].uint4s[thread_id]);
 			}
 		}

--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@ -199,7 +199,7 @@ void ethash_cuda_miner::finish()
 	CUDA_SAFE_CALL(cudaDeviceReset());
 }

-bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _deviceId)
+bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG)
 {
 	try
 	{
@ -224,25 +224,61 @@ bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _d
 		m_search_buf = new volatile uint32_t *[s_numStreams];
 		m_streams = new cudaStream_t[s_numStreams];

-		uint32_t dagSize128 = (unsigned)(_dagSize / ETHASH_MIX_BYTES);
+		uint64_t dagSize = ethash_get_datasize(_light->block_number);
+		uint32_t dagSize128   = (unsigned)(dagSize / ETHASH_MIX_BYTES);
+		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
+
+		// create buffer for cache
+		hash64_t * light = NULL;
+
+		if (!*hostDAG)
+		{
+			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
+			// copy dag cache to CPU.
+			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
+		}

 		// create buffer for dag
 		hash128_t * dag;
-		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), _dagSize));
-		// copy dag to CPU.
-		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), _dag, _dagSize, cudaMemcpyHostToDevice));
-
+		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
+		
 		// create mining buffers
 		for (unsigned i = 0; i != s_numStreams; ++i)
 		{
 			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
 			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
 		}
-		set_constants(dag, dagSize128);
+		set_constants(dag, dagSize128, light, lightSize64);
 		memset(&m_current_header, 0, sizeof(hash32_t));
 		m_current_target = 0;
 		m_current_nonce = 0;
 		m_current_index = 0;
+
+		m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
+
+		if (!*hostDAG)
+		{
+			cout << "Generating DAG for GPU #" << device_num << endl;
+			ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);
+
+			if (_cpyToHost)
+			{
+				uint8_t* memoryDAG = new uint8_t[dagSize];
+				if (!memoryDAG) throw std::runtime_error("Failed to init host memory for DAG, not enough memory?");
+
+				cout << "Copying DAG from GPU #" << device_num << " to host" << endl;
+				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
+
+				*hostDAG = (void*)memoryDAG;
+			}
+		}
+		else
+		{
+			cout << "Copying DAG from host to GPU #" << device_num << endl;
+			const void* hdag = (const void*)(*hostDAG);
+			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
+		}
+
 		return true;
 	}
 	catch (runtime_error)
@ -251,7 +287,7 @@ bool ethash_cuda_miner::init(uint8_t const* _dag, uint64_t _dagSize, unsigned _d
 	}
 }

-void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
+void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_hook& hook, bool _ethStratum, uint64_t _startN)
 {
 	bool initialize = false;
 	bool exit = false;
@ -267,14 +303,34 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
 		set_target(m_current_target);
 		initialize = true;
 	}
-	if (initialize)
+	if (_ethStratum)
 	{
-		random_device engine;
-		m_current_nonce = uniform_int_distribution<uint64_t>()(engine);
-		m_current_index = 0;
-		CUDA_SAFE_CALL(cudaDeviceSynchronize());
-		for (unsigned int i = 0; i < s_numStreams; i++)
-			m_search_buf[i][0] = 0;
+		if (initialize)
+		{
+			m_starting_nonce = 0;
+			m_current_index = 0;
+			CUDA_SAFE_CALL(cudaDeviceSynchronize());
+			for (unsigned int i = 0; i < s_numStreams; i++)
+				m_search_buf[i][0] = 0;
+		}
+		if (m_starting_nonce != _startN)
+		{
+			// reset nonce counter
+			m_starting_nonce = _startN;
+			m_current_nonce = m_starting_nonce;
+		}
+	}
+	else
+	{
+		if (initialize)
+		{
+			random_device engine;
+			m_current_nonce = uniform_int_distribution<uint64_t>()(engine);
+			m_current_index = 0;
+			CUDA_SAFE_CALL(cudaDeviceSynchronize());
+			for (unsigned int i = 0; i < s_numStreams; i++)
+				m_search_buf[i][0] = 0;
+		}
 	}
 	uint64_t batch_size = s_gridSize * s_blockSize;
 	for (; !exit; m_current_index++, m_current_nonce += batch_size)
@ -294,7 +350,7 @@ void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_ho
 			for (unsigned int j = 0; j < found_count; j++)
 				nonces[j] = nonce_base + buffer[j + 1];
 		}
-		run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce);
+		run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce);
 		if (m_current_index >= s_numStreams)
 		{
 			exit = found_count && hook.found(nonces, found_count);
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@ -34,13 +34,11 @@ public:
 		unsigned _scheduleFlag,
 		uint64_t _currentBlock
 		);
-	bool init(
-		uint8_t const* _dag,
-		uint64_t _dagSize,
-		unsigned _deviceId = 0
-		);
+
+	bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG);
+
 	void finish();
-	void search(uint8_t const* header, uint64_t target, search_hook& hook);
+	void search(uint8_t const* header, uint64_t target, search_hook& hook, bool _ethStratum, uint64_t _startN);

 	/* -- default values -- */
 	/// Default value of the block size. Also known as workgroup size.
@ -54,8 +52,11 @@ private:
 	hash32_t m_current_header;
 	uint64_t m_current_target;
 	uint64_t m_current_nonce;
+	uint64_t m_starting_nonce;
 	uint64_t m_current_index;

+	uint32_t m_sharedBytes;
+
 	volatile uint32_t ** m_search_buf;
 	cudaStream_t  * m_streams;

--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@ -8,131 +8,142 @@
 #include "ethash_cuda_miner_kernel_globals.h"
 #include "cuda_helper.h"

-#include "keccak.cuh"
 #include "fnv.cuh"

-#define ACCESSES 64
-#define THREADS_PER_HASH (128 / 16)
-
-__device__ uint64_t compute_hash_shuffle(
-	uint64_t nonce
-	)
-{
-	// sha3_512(header .. nonce)
-	uint2 state[25];
-
-	state[4] = vectorize(nonce);
-
-	keccak_f1600_init(state);
-
-	// Threads work together in this phase in groups of 8.
-	const int thread_id = threadIdx.x &  (THREADS_PER_HASH - 1);
-	const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
-	const int mix_idx = thread_id & 3;
-
-	uint4 mix;
-	uint2 shuffle[8];
-
-	for (int i = 0; i < THREADS_PER_HASH; i++)
-	{
-		// share init among threads
-		for (int j = 0; j < 8; j++) {
-			shuffle[j].x = __shfl(state[j].x, i, THREADS_PER_HASH);
-			shuffle[j].y = __shfl(state[j].y, i, THREADS_PER_HASH);
-		}
-
-		// ugly but avoids local reads/writes
-		if (mix_idx < 2) {
-			if (mix_idx == 0)
-				mix = vectorize2(shuffle[0], shuffle[1]);
-			else
-				mix = vectorize2(shuffle[2], shuffle[3]);
-		}
-		else  {
-			if (mix_idx == 2)
-				mix = vectorize2(shuffle[4], shuffle[5]);
-			else
-				mix = vectorize2(shuffle[6], shuffle[7]);
-		}
+#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }

-		uint32_t init0 = __shfl(shuffle[0].x, start_lane);

-		for (uint32_t a = 0; a < ACCESSES; a += 4)
-		{
-			int t = ((a >> 2) & (THREADS_PER_HASH - 1));
-
-			for (uint32_t b = 0; b < 4; b++)
-			{
-				if (thread_id == t)
-				{
-					shuffle[0].x = fnv(init0 ^ (a + b), ((uint32_t *)&mix)[b]) % d_dag_size;
-				}
-				shuffle[0].x = __shfl(shuffle[0].x, t, THREADS_PER_HASH);
-
-				mix = fnv4(mix, (&d_dag[shuffle[0].x])->uint4s[thread_id]);
-			}
-		}
-
-		uint32_t thread_mix = fnv_reduce(mix);
-
-		// update mix accross threads
-
-		shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
-		shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
-		shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
-		shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH);
-		shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH);
-		shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH);
-		shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
-		shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);
-
-		if (i == thread_id) {
-			//move mix into state:
-			state[8] = shuffle[0];
-			state[9] = shuffle[1];
-			state[10] = shuffle[2];
-			state[11] = shuffle[3];
-		}
-	}
-
-	// keccak_256(keccak_512(header..nonce) .. mix);
-	return keccak_f1600_final(state);
-}
+#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
+#include "keccak_u64.cuh"
+#include "dagger_shared.cuh"
+#define TPB		128
+#define BPSM	4
+#else
+#include "keccak.cuh"
+#include "dagger_shuffled.cuh"
+#define TPB		896
+#define BPSM	1
+#endif

 __global__ void 
-__launch_bounds__(896, 1)
+__launch_bounds__(TPB, BPSM)
 ethash_search(
 	volatile uint32_t* g_output,
 	uint64_t start_nonce
 	)
 {
 	uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;	
-	uint64_t hash = compute_hash_shuffle(start_nonce + gid);
+	uint64_t hash = compute_hash(start_nonce + gid);
 	if (cuda_swab64(hash) > d_target) return;
 	uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), SEARCH_RESULT_BUFFER_SIZE - 1) + 1;
 	g_output[index] = gid;
-	__threadfence_system();
 }

 void run_ethash_search(
 	uint32_t blocks,
 	uint32_t threads,
+	uint32_t sharedbytes,
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
 	uint64_t start_nonce
 )
 {
-	ethash_search <<<blocks, threads, 0, stream >>>(g_output, start_nonce);
+	ethash_search << <blocks, threads, sharedbytes, stream >> >(g_output, start_nonce);
+	CUDA_SAFE_CALL(cudaGetLastError());
+}
+
+#define ETHASH_DATASET_PARENTS 256
+#define NODE_WORDS (64/4)
+
+__global__ void
+__launch_bounds__(128, 7)
+ethash_calculate_dag_item(uint32_t start)
+{
+	uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
+	if (node_index > d_dag_size * 2) return;
+
+	hash200_t dag_node;
+	copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
+	dag_node.words[0] ^= node_index;
+	SHA3_512(dag_node.uint2s);
+
+	const int thread_id = threadIdx.x & 3;
+
+	for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
+		uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
+#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
+		for (unsigned w = 0; w != 4; ++w) {
+			dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
+		}
+#else
+		for (uint32_t t = 0; t < 4; t++) {
+			uint32_t shuffle_index = __shfl(parent_index, t, 4);
+			uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
+
+			for (int w = 0; w < 4; w++) {
+				uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
+				if (t == thread_id) {
+					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
+				}
+			}
+
+		}
+#endif		
+	}
+	SHA3_512(dag_node.uint2s);
+	hash64_t * dag_nodes = (hash64_t *)d_dag;
+
+#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
+	for (uint32_t i = 0; i < 4; i++) {
+		dag_nodes[node_index].uint4s[i] =  dag_node.uint4s[i];
+	}
+#else
+	for (uint32_t t = 0; t < 4; t++) {
+
+		uint32_t shuffle_index = __shfl(node_index, t, 4);
+		uint4 s[4];
+		for (uint32_t w = 0; w < 4; w++) {
+			s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
+		}
+		dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
+	}
+#endif		 
+}
+
+void ethash_generate_dag(
+	uint64_t dag_size,
+	uint32_t blocks,
+	uint32_t threads,
+	cudaStream_t stream,
+	int device
+	)
+{
+	uint32_t const work = (uint32_t)(dag_size / sizeof(hash64_t));
+
+	uint32_t fullRuns = work / (blocks * threads);
+	uint32_t const restWork = work % (blocks * threads);
+	if (restWork > 0) fullRuns++;
+	for (uint32_t i = 0; i < fullRuns; i++)
+	{
+		ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads);
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+		printf("CUDA#%d: %.0f%%\n",device, 100.0f * (float)i / (float)fullRuns);
+	}
+	//printf("GPU#%d 100%%\n");
 	CUDA_SAFE_CALL(cudaGetLastError());
 }

 void set_constants(
 	hash128_t* _dag,
-	uint32_t _dag_size
+	uint32_t _dag_size,
+	hash64_t * _light,
+	uint32_t _light_size
 	)
 {
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t *)));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t *)));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
 }

 void set_header(
--- a/libethash-cuda/ethash_cuda_miner_kernel.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel.h
@ -6,6 +6,8 @@
 #include <cuda_runtime.h>

 #define SEARCH_RESULT_BUFFER_SIZE 64
+#define ACCESSES 64
+#define THREADS_PER_HASH (128 / 16)

 typedef struct
 {
@ -17,10 +19,23 @@ typedef struct
 	uint4	 uint4s[128 / sizeof(uint4)];
 } hash128_t;

+typedef union {
+	uint32_t words[64 / sizeof(uint32_t)];
+	uint2	 uint2s[64 / sizeof(uint2)];
+	uint4	 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union {
+	uint32_t words[200 / sizeof(uint32_t)];
+	uint2	 uint2s[200 / sizeof(uint2)];
+	uint4	 uint4s[200 / sizeof(uint4)];
+} hash200_t;

 void set_constants(
 	hash128_t* _dag,
-	uint32_t _dag_size
+	uint32_t _dag_size,
+	hash64_t * _light,
+	uint32_t _light_size
 	);

 void set_header(
@ -34,11 +49,20 @@ void set_target(
 void run_ethash_search(
 	uint32_t search_batch_size,
 	uint32_t workgroup_size,
+	uint32_t sharedbytes,
 	cudaStream_t stream,
 	volatile uint32_t* g_output,
 	uint64_t start_nonce
 	);

+void ethash_generate_dag(
+	uint64_t dag_size,
+	uint32_t blocks,
+	uint32_t threads,
+	cudaStream_t stream,
+	int device
+	);
+

 #define CUDA_SAFE_CALL(call)								\
 do {														\
--- a/libethash-cuda/ethash_cuda_miner_kernel_globals.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel_globals.h
@ -1,10 +1,14 @@
 #ifndef _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_
 #define _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_

+#define SHUFFLE_MIN_VER 300
+
 //#include "cuda_helper.h"

 __constant__ uint32_t d_dag_size;
 __constant__ hash128_t* d_dag;
+__constant__ uint32_t d_light_size;
+__constant__ hash64_t* d_light;
 __constant__ hash32_t d_header;
 __constant__ uint64_t d_target;

--- a/libethash-cuda/fnv.cuh
+++ b/libethash-cuda/fnv.cuh
@ -1,3 +1,4 @@
+
 #define FNV_PRIME	0x01000193

 #define fnv(x,y) ((x) * FNV_PRIME ^(y))
@ -15,4 +16,5 @@ __device__ uint4 fnv4(uint4 a, uint4 b)
 __device__ uint32_t fnv_reduce(uint4 v)
 {
 	return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
-}
+}
+
--- a/libethash-cuda/keccak.cuh
+++ b/libethash-cuda/keccak.cuh
@ -588,4 +588,190 @@ __device__ __forceinline__ uint64_t keccak_f1600_final(uint2* s)
 	/* iota: a[0,0] ^= round constant */
 	//s[0] ^= vectorize(keccak_round_constants[23]);
 	return devectorize(s[0]) ^ keccak_round_constants[23];
+}
+
+__device__ __forceinline__ void SHA3_512(uint2* s) {
+	
+	uint2 t[5], u, v;
+
+	for (uint32_t i = 8; i < 25; i++)
+	{
+		s[i] = make_uint2(0, 0);
+	}
+	s[8].x = 1;
+	s[8].y = 0x80000000;
+
+	for (int i = 0; i < 23; i++)
+	{
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+		t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+		t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+		t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+		t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+		u = ROL2(t[1], 1);
+		s[0] = xor3(s[0], t[4], u);
+		s[5] = xor3(s[5], t[4], u);
+		s[10] = xor3(s[10], t[4], u);
+		s[15] = xor3(s[15], t[4], u);
+		s[20] = xor3(s[20], t[4], u);
+
+		u = ROL2(t[2], 1);
+		s[1] = xor3(s[1], t[0], u);
+		s[6] = xor3(s[6], t[0], u);
+		s[11] = xor3(s[11], t[0], u);
+		s[16] = xor3(s[16], t[0], u);
+		s[21] = xor3(s[21], t[0], u);
+
+		u = ROL2(t[3], 1);
+		s[2] = xor3(s[2], t[1], u);
+		s[7] = xor3(s[7], t[1], u);
+		s[12] = xor3(s[12], t[1], u);
+		s[17] = xor3(s[17], t[1], u);
+		s[22] = xor3(s[22], t[1], u);
+
+		u = ROL2(t[4], 1);
+		s[3] = xor3(s[3], t[2], u);
+		s[8] = xor3(s[8], t[2], u);
+		s[13] = xor3(s[13], t[2], u);
+		s[18] = xor3(s[18], t[2], u);
+		s[23] = xor3(s[23], t[2], u);
+
+
+		u = ROL2(t[0], 1);
+		s[4] = xor3(s[4], t[3], u);
+		s[9] = xor3(s[9], t[3], u);
+		s[14] = xor3(s[14], t[3], u);
+		s[19] = xor3(s[19], t[3], u);
+		s[24] = xor3(s[24], t[3], u);
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		u = s[1];
+
+		s[1] = ROL2(s[6], 44);
+		s[6] = ROL2(s[9], 20);
+		s[9] = ROL2(s[22], 61);
+		s[22] = ROL2(s[14], 39);
+		s[14] = ROL2(s[20], 18);
+		s[20] = ROL2(s[2], 62);
+		s[2] = ROL2(s[12], 43);
+		s[12] = ROL2(s[13], 25);
+		s[13] = ROL2(s[19], 8);
+		s[19] = ROL2(s[23], 56);
+		s[23] = ROL2(s[15], 41);
+		s[15] = ROL2(s[4], 27);
+		s[4] = ROL2(s[24], 14);
+		s[24] = ROL2(s[21], 2);
+		s[21] = ROL2(s[8], 55);
+		s[8] = ROL2(s[16], 45);
+		s[16] = ROL2(s[5], 36);
+		s[5] = ROL2(s[3], 28);
+		s[3] = ROL2(s[18], 21);
+		s[18] = ROL2(s[17], 15);
+		s[17] = ROL2(s[11], 10);
+		s[11] = ROL2(s[7], 6);
+		s[7] = ROL2(s[10], 3);
+		s[10] = ROL2(u, 1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+		u = s[0]; v = s[1];
+		s[0] = chi(s[0], s[1], s[2]);
+		s[1] = chi(s[1], s[2], s[3]);
+		s[2] = chi(s[2], s[3], s[4]);
+		s[3] = chi(s[3], s[4], u);
+		s[4] = chi(s[4], u, v);
+
+		u = s[5]; v = s[6];
+		s[5] = chi(s[5], s[6], s[7]);
+		s[6] = chi(s[6], s[7], s[8]);
+		s[7] = chi(s[7], s[8], s[9]);
+		s[8] = chi(s[8], s[9], u);
+		s[9] = chi(s[9], u, v);
+
+		u = s[10]; v = s[11];
+		s[10] = chi(s[10], s[11], s[12]);
+		s[11] = chi(s[11], s[12], s[13]);
+		s[12] = chi(s[12], s[13], s[14]);
+		s[13] = chi(s[13], s[14], u);
+		s[14] = chi(s[14], u, v);
+
+		u = s[15]; v = s[16];
+		s[15] = chi(s[15], s[16], s[17]);
+		s[16] = chi(s[16], s[17], s[18]);
+		s[17] = chi(s[17], s[18], s[19]);
+		s[18] = chi(s[18], s[19], u);
+		s[19] = chi(s[19], u, v);
+
+		u = s[20]; v = s[21];
+		s[20] = chi(s[20], s[21], s[22]);
+		s[21] = chi(s[21], s[22], s[23]);
+		s[22] = chi(s[22], s[23], s[24]);
+		s[23] = chi(s[23], s[24], u);
+		s[24] = chi(s[24], u, v);
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= vectorize(keccak_round_constants[i]);
+	}
+
+	/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+	t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+	t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+	t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+	t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+	t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+	/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+	/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+	u = ROL2(t[1], 1);
+	s[0] = xor3(s[0], t[4], u);
+	s[10] = xor3(s[10], t[4], u);
+
+	u = ROL2(t[2], 1);
+	s[6] = xor3(s[6], t[0], u);
+	s[16] = xor3(s[16], t[0], u);
+
+	u = ROL2(t[3], 1);
+	s[12] = xor3(s[12], t[1], u);
+	s[22] = xor3(s[22], t[1], u);
+
+	u = ROL2(t[4], 1);
+	s[3] = xor3(s[3], t[2], u);
+	s[18] = xor3(s[18], t[2], u);
+
+	u = ROL2(t[0], 1);
+	s[9] = xor3(s[9], t[3], u);
+	s[24] = xor3(s[24], t[3], u);
+
+	/* rho pi: b[..] = rotl(a[..], ..) */
+	u = s[1];
+
+	s[1] = ROL2(s[6], 44);
+	s[6] = ROL2(s[9], 20);
+	s[9] = ROL2(s[22], 61);
+	s[2] = ROL2(s[12], 43);
+	s[4] = ROL2(s[24], 14);
+	s[8] = ROL2(s[16], 45);
+	s[5] = ROL2(s[3], 28);
+	s[3] = ROL2(s[18], 21);
+	s[7] = ROL2(s[10], 3);
+
+	/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+	u = s[0]; v = s[1];
+	s[0] = chi(s[0], s[1], s[2]);
+	s[1] = chi(s[1], s[2], s[3]);
+	s[2] = chi(s[2], s[3], s[4]);
+	s[3] = chi(s[3], s[4], u);
+	s[4] = chi(s[4], u, v);
+	s[5] = chi(s[5], s[6], s[7]);
+	s[6] = chi(s[6], s[7], s[8]);
+	s[7] = chi(s[7], s[8], s[9]);
+
+	/* iota: a[0,0] ^= round constant */
+	s[0] ^= vectorize(keccak_round_constants[23]);
 }
--- a/libethash-cuda/keccak_u64.cuh
+++ b/libethash-cuda/keccak_u64.cuh
@ -0,0 +1,777 @@
+#include "cuda_helper.h"
+
+__device__ __constant__ uint64_t const keccak_round_constants[24] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
+	0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
+	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
+	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
+	0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
+	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+	0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
+	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+__device__ __forceinline__
+uint64_t xor5(const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t d, const uint64_t e) {
+	return a ^ b ^ c ^ d ^ e;
+}
+__device__ __forceinline__
+uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) {
+	return a ^ b ^ c;
+}
+
+__device__ __forceinline__
+uint64_t chi(const uint64_t a, const uint64_t b, const uint64_t c) {
+	return a ^ (~b) & c;
+}
+
+__device__ __forceinline__ void keccak_f1600_init(uint64_t * s)
+{
+	uint64_t t[5], u, v;
+
+	devectorize4(d_header.uint4s[0], s[0], s[1]);
+	devectorize4(d_header.uint4s[1], s[2], s[3]);
+
+	for (uint32_t i = 5; i < 25; i++)
+	{
+		s[i] = 0;
+	}
+	s[5] = 0x0000000000000001;
+	s[8] = 0x8000000000000000;
+
+	/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+	t[0] = s[0] ^ s[5];
+	t[1] = s[1];
+	t[2] = s[2];
+	t[3] = s[3] ^ s[8];
+	t[4] = s[4];
+
+	/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+	/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+	u = ROTL64(t[1], 1);
+	s[0] = xor3(s[0], t[4], u);
+	s[5] = xor3(s[5], t[4], u);
+	s[10] = xor3(s[10], t[4], u);
+	s[15] = xor3(s[15], t[4], u);
+	s[20] = xor3(s[20], t[4], u);
+
+	u = ROTL64(t[2], 1);
+	s[1] = xor3(s[1], t[0], u);
+	s[6] = xor3(s[6], t[0], u);
+	s[11] = xor3(s[11], t[0], u);
+	s[16] = xor3(s[16], t[0], u);
+	s[21] = xor3(s[21], t[0], u);
+
+	u = ROTL64(t[3], 1);
+	s[2] = xor3(s[2], t[1], u);
+	s[7] = xor3(s[7], t[1], u);
+	s[12] = xor3(s[12], t[1], u);
+	s[17] = xor3(s[17], t[1], u);
+	s[22] = xor3(s[22], t[1], u);
+
+	u = ROTL64(t[4], 1);
+	s[3] = xor3(s[3], t[2], u);
+	s[8] = xor3(s[8], t[2], u);
+	s[13] = xor3(s[13], t[2], u);
+	s[18] = xor3(s[18], t[2], u);
+	s[23] = xor3(s[23], t[2], u);
+
+	u = ROTL64(t[0], 1);
+	s[4] = xor3(s[4], t[3], u);
+	s[9] = xor3(s[9], t[3], u);
+	s[14] = xor3(s[14], t[3], u);
+	s[19] = xor3(s[19], t[3], u);
+	s[24] = xor3(s[24], t[3], u);
+
+	/* rho pi: b[..] = rotl(a[..], ..) */
+	u = s[1];
+
+	s[1] = ROTL64(s[6], 44);
+	s[6] = ROTL64(s[9], 20);
+	s[9] = ROTL64(s[22], 61);
+	s[22] = ROTL64(s[14], 39);
+	s[14] = ROTL64(s[20], 18);
+	s[20] = ROTL64(s[2], 62);
+	s[2] = ROTL64(s[12], 43);
+	s[12] = ROTL64(s[13], 25);
+	s[13] = ROTL64(s[19], 8);
+	s[19] = ROTL64(s[23], 56);
+	s[23] = ROTL64(s[15], 41);
+	s[15] = ROTL64(s[4], 27);
+	s[4] = ROTL64(s[24], 14);
+	s[24] = ROTL64(s[21], 2);
+	s[21] = ROTL64(s[8], 55);
+	s[8] = ROTL64(s[16], 45);
+	s[16] = ROTL64(s[5], 36);
+	s[5] = ROTL64(s[3], 28);
+	s[3] = ROTL64(s[18], 21);
+	s[18] = ROTL64(s[17], 15);
+	s[17] = ROTL64(s[11], 10);
+	s[11] = ROTL64(s[7], 6);
+	s[7] = ROTL64(s[10], 3);
+	s[10] = ROTL64(u, 1);
+
+	/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+	u = s[0]; v = s[1];
+	s[0] = chi(s[0], s[1], s[2]);
+	s[1] = chi(s[1], s[2], s[3]);
+	s[2] = chi(s[2], s[3], s[4]);
+	s[3] = chi(s[3], s[4], u);
+	s[4] = chi(s[4], u, v);
+
+	u = s[5]; v = s[6];
+	s[5] = chi(s[5], s[6], s[7]);
+	s[6] = chi(s[6], s[7], s[8]);
+	s[7] = chi(s[7], s[8], s[9]);
+	s[8] = chi(s[8], s[9], u);
+	s[9] = chi(s[9], u, v);
+
+	u = s[10]; v = s[11];
+	s[10] = chi(s[10], s[11], s[12]);
+	s[11] = chi(s[11], s[12], s[13]);
+	s[12] = chi(s[12], s[13], s[14]);
+	s[13] = chi(s[13], s[14], u);
+	s[14] = chi(s[14], u, v);
+
+	u = s[15]; v = s[16];
+	s[15] = chi(s[15], s[16], s[17]);
+	s[16] = chi(s[16], s[17], s[18]);
+	s[17] = chi(s[17], s[18], s[19]);
+	s[18] = chi(s[18], s[19], u);
+	s[19] = chi(s[19], u, v);
+
+	u = s[20]; v = s[21];
+	s[20] = chi(s[20], s[21], s[22]);
+	s[21] = chi(s[21], s[22], s[23]);
+	s[22] = chi(s[22], s[23], s[24]);
+	s[23] = chi(s[23], s[24], u);
+	s[24] = chi(s[24], u, v);
+
+	/* iota: a[0,0] ^= round constant */
+	s[0] ^= keccak_round_constants[0];
+
+	for (int i = 1; i < 23; i++)
+	{
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = xor5(s[0] , s[5] , s[10] , s[15] , s[20]);
+		t[1] = xor5(s[1] , s[6] , s[11] , s[16] , s[21]);
+		t[2] = xor5(s[2] , s[7] , s[12] , s[17] , s[22]);
+		t[3] = xor5(s[3] , s[8] , s[13] , s[18] , s[23]);
+		t[4] = xor5(s[4] , s[9] , s[14] , s[19] , s[24]);
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+		u = ROTL64(t[1], 1);
+		s[0]  = xor3(s[0], t[4], u);
+		s[5]  = xor3(s[5], t[4], u);
+		s[10] = xor3(s[10], t[4], u);
+		s[15] = xor3(s[15], t[4], u);
+		s[20] = xor3(s[20], t[4], u);
+
+		u = ROTL64(t[2], 1);
+		s[1] = xor3(s[1], t[0], u);
+		s[6] = xor3(s[6], t[0], u);
+		s[11] = xor3(s[11], t[0], u);
+		s[16] = xor3(s[16], t[0], u);
+		s[21] = xor3(s[21], t[0], u);
+
+		u = ROTL64(t[3], 1);
+		s[2] = xor3(s[2], t[1], u);
+		s[7] = xor3(s[7], t[1], u);
+		s[12] = xor3(s[12], t[1], u);
+		s[17] = xor3(s[17], t[1], u);
+		s[22] = xor3(s[22], t[1], u);
+
+		u = ROTL64(t[4], 1);
+		s[3] = xor3(s[3], t[2], u);
+		s[8] = xor3(s[8], t[2], u);
+		s[13] = xor3(s[13], t[2], u);
+		s[18] = xor3(s[18], t[2], u);
+		s[23] = xor3(s[23], t[2], u);
+
+
+		u = ROTL64(t[0], 1);
+		s[4] = xor3(s[4], t[3], u);
+		s[9] = xor3(s[9], t[3], u);
+		s[14] = xor3(s[14], t[3], u);
+		s[19] = xor3(s[19], t[3], u);
+		s[24] = xor3(s[24], t[3], u);
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		u = s[1];
+
+		s[1] = ROTL64(s[6], 44);
+		s[6] = ROTL64(s[9], 20);
+		s[9] = ROTL64(s[22], 61);
+		s[22] = ROTL64(s[14], 39);
+		s[14] = ROTL64(s[20], 18);
+		s[20] = ROTL64(s[2], 62);
+		s[2] = ROTL64(s[12], 43);
+		s[12] = ROTL64(s[13], 25);
+		s[13] = ROTL64(s[19], 8);
+		s[19] = ROTL64(s[23], 56);
+		s[23] = ROTL64(s[15], 41);
+		s[15] = ROTL64(s[4], 27);
+		s[4] = ROTL64(s[24], 14);
+		s[24] = ROTL64(s[21], 2);
+		s[21] = ROTL64(s[8], 55);
+		s[8] = ROTL64(s[16], 45);
+		s[16] = ROTL64(s[5], 36);
+		s[5] = ROTL64(s[3], 28);
+		s[3] = ROTL64(s[18], 21);
+		s[18] = ROTL64(s[17], 15);
+		s[17] = ROTL64(s[11], 10);
+		s[11] = ROTL64(s[7], 6);
+		s[7] = ROTL64(s[10], 3);
+		s[10] = ROTL64(u, 1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+			
+		u = s[0]; v = s[1];
+		s[0] = chi(s[0], s[1], s[2]);
+		s[1] = chi(s[1], s[2], s[3]);
+		s[2] = chi(s[2], s[3], s[4]);
+		s[3] = chi(s[3], s[4], u);
+		s[4] = chi(s[4], u, v);
+
+		u = s[5]; v = s[6]; 
+		s[5] = chi(s[5], s[6], s[7]);
+		s[6] = chi(s[6], s[7], s[8]);
+		s[7] = chi(s[7], s[8], s[9]);
+		s[8] = chi(s[8], s[9], u);
+		s[9] = chi(s[9], u, v);
+
+		u = s[10]; v = s[11]; 
+		s[10] = chi(s[10], s[11], s[12]);
+		s[11] = chi(s[11], s[12], s[13]);
+		s[12] = chi(s[12], s[13], s[14]);
+		s[13] = chi(s[13], s[14], u);
+		s[14] = chi(s[14], u, v);
+
+		u = s[15]; v = s[16];
+		s[15] = chi(s[15], s[16], s[17]);
+		s[16] = chi(s[16], s[17], s[18]);
+		s[17] = chi(s[17], s[18], s[19]);
+		s[18] = chi(s[18], s[19], u);
+		s[19] = chi(s[19], u, v);
+
+		u = s[20]; v = s[21];
+		s[20] = chi(s[20], s[21], s[22]);
+		s[21] = chi(s[21], s[22], s[23]);
+		s[22] = chi(s[22], s[23], s[24]);
+		s[23] = chi(s[23], s[24], u);
+		s[24] = chi(s[24], u, v);
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= keccak_round_constants[i];
+	}
+
+	/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+	t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+	t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+	t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+	t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+	t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+	/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+	/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+	u = ROTL64(t[1], 1);
+	s[0] = xor3(s[0], t[4], u);
+	s[10] = xor3(s[10], t[4], u);
+
+	u = ROTL64(t[2], 1);
+	s[6] = xor3(s[6], t[0], u);
+	s[16] = xor3(s[16], t[0], u);
+
+	u = ROTL64(t[3], 1);
+	s[12] = xor3(s[12], t[1], u);
+	s[22] = xor3(s[22], t[1], u);
+
+	u = ROTL64(t[4], 1);
+	s[3] = xor3(s[3], t[2], u);
+	s[18] = xor3(s[18], t[2], u);
+
+	u = ROTL64(t[0], 1);
+	s[9] = xor3(s[9], t[3], u);
+	s[24] = xor3(s[24], t[3], u);
+
+	/* rho pi: b[..] = rotl(a[..], ..) */
+	u = s[1];
+
+	s[1] = ROTL64(s[6], 44);
+	s[6] = ROTL64(s[9], 20);
+	s[9] = ROTL64(s[22], 61);
+	s[2] = ROTL64(s[12], 43);
+	s[4] = ROTL64(s[24], 14);
+	s[8] = ROTL64(s[16], 45);
+	s[5] = ROTL64(s[3], 28);
+	s[3] = ROTL64(s[18], 21);
+	s[7] = ROTL64(s[10], 3);
+
+	/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+	u = s[0]; v = s[1];
+	s[0] = chi(s[0], s[1], s[2]);
+	s[1] = chi(s[1], s[2], s[3]);
+	s[2] = chi(s[2], s[3], s[4]);
+	s[3] = chi(s[3], s[4], u);
+	s[4] = chi(s[4], u, v);
+	s[5] = chi(s[5], s[6], s[7]);
+	s[6] = chi(s[6], s[7], s[8]);
+	s[7] = chi(s[7], s[8], s[9]);
+
+	/* iota: a[0,0] ^= round constant */
+	s[0] ^= keccak_round_constants[23];
+}
+
+__device__ __forceinline__ uint64_t keccak_f1600_final(uint64_t* s)
+{
+	uint64_t t[5], u, v;
+
+	for (uint32_t i = 12; i < 25; i++)
+	{
+		s[i] = 0;
+	}
+	s[12] = 0x0000000000000001;
+	s[16] = 0x8000000000000000;
+	
+	/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+	t[0] = xor3(s[0], s[5], s[10]);
+	t[1] = xor3(s[1], s[6], s[11]) ^ s[16];
+	t[2] = xor3(s[2], s[7], s[12]);
+	t[3] = s[3] ^ s[8];
+	t[4] = s[4] ^ s[9];
+
+	/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+	/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+	u = ROTL64(t[1], 1);
+	s[0] = xor3(s[0], t[4], u);
+	s[5] = xor3(s[5], t[4], u);
+	s[10] = xor3(s[10], t[4], u);
+	s[15] = xor3(s[15], t[4], u);
+	s[20] = xor3(s[20], t[4], u);
+
+	u = ROTL64(t[2], 1);
+	s[1] = xor3(s[1], t[0], u);
+	s[6] = xor3(s[6], t[0], u);
+	s[11] = xor3(s[11], t[0], u);
+	s[16] = xor3(s[16], t[0], u);
+	s[21] = xor3(s[21], t[0], u);
+
+	u = ROTL64(t[3], 1);
+	s[2] = xor3(s[2], t[1], u);
+	s[7] = xor3(s[7], t[1], u);
+	s[12] = xor3(s[12], t[1], u);
+	s[17] = xor3(s[17], t[1], u);
+	s[22] = xor3(s[22], t[1], u);
+
+	u = ROTL64(t[4], 1);
+	s[3] = xor3(s[3], t[2], u);
+	s[8] = xor3(s[8], t[2], u);
+	s[13] = xor3(s[13], t[2], u);
+	s[18] = xor3(s[18], t[2], u);
+	s[23] = xor3(s[23], t[2], u);
+
+
+	u = ROTL64(t[0], 1);
+	s[4] = xor3(s[4], t[3], u);
+	s[9] = xor3(s[9], t[3], u);
+	s[14] = xor3(s[14], t[3], u);
+	s[19] = xor3(s[19], t[3], u);
+	s[24] = xor3(s[24], t[3], u);
+
+	/* rho pi: b[..] = rotl(a[..], ..) */
+	u = s[1];
+
+	s[1] = ROTL64(s[6], 44);
+	s[6] = ROTL64(s[9], 20);
+	s[9] = ROTL64(s[22], 61);
+	s[22] = ROTL64(s[14], 39);
+	s[14] = ROTL64(s[20], 18);
+	s[20] = ROTL64(s[2], 62);
+	s[2] = ROTL64(s[12], 43);
+	s[12] = ROTL64(s[13], 25);
+	s[13] = ROTL64(s[19], 8);
+	s[19] = ROTL64(s[23], 56);
+	s[23] = ROTL64(s[15], 41);
+	s[15] = ROTL64(s[4], 27);
+	s[4] = ROTL64(s[24], 14);
+	s[24] = ROTL64(s[21], 2);
+	s[21] = ROTL64(s[8], 55);
+	s[8] = ROTL64(s[16], 45);
+	s[16] = ROTL64(s[5], 36);
+	s[5] = ROTL64(s[3], 28);
+	s[3] = ROTL64(s[18], 21);
+	s[18] = ROTL64(s[17], 15);
+	s[17] = ROTL64(s[11], 10);
+	s[11] = ROTL64(s[7], 6);
+	s[7] = ROTL64(s[10], 3);
+	s[10] = ROTL64(u, 1);
+
+	/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+	u = s[0]; v = s[1];
+	s[0] = chi(s[0], s[1], s[2]);
+	s[1] = chi(s[1], s[2], s[3]);
+	s[2] = chi(s[2], s[3], s[4]);
+	s[3] = chi(s[3], s[4], u);
+	s[4] = chi(s[4], u, v);
+
+	u = s[5]; v = s[6];
+	s[5] = chi(s[5], s[6], s[7]);
+	s[6] = chi(s[6], s[7], s[8]);
+	s[7] = chi(s[7], s[8], s[9]);
+	s[8] = chi(s[8], s[9], u);
+	s[9] = chi(s[9], u, v);
+
+	u = s[10]; v = s[11];
+	s[10] = chi(s[10], s[11], s[12]);
+	s[11] = chi(s[11], s[12], s[13]);
+	s[12] = chi(s[12], s[13], s[14]);
+	s[13] = chi(s[13], s[14], u);
+	s[14] = chi(s[14], u, v);
+
+	u = s[15]; v = s[16];
+	s[15] = chi(s[15], s[16], s[17]);
+	s[16] = chi(s[16], s[17], s[18]);
+	s[17] = chi(s[17], s[18], s[19]);
+	s[18] = chi(s[18], s[19], u);
+	s[19] = chi(s[19], u, v);
+
+	u = s[20]; v = s[21];
+	s[20] = chi(s[20], s[21], s[22]);
+	s[21] = chi(s[21], s[22], s[23]);
+	s[22] = chi(s[22], s[23], s[24]);
+	s[23] = chi(s[23], s[24], u);
+	s[24] = chi(s[24], u, v);
+
+	/* iota: a[0,0] ^= round constant */
+	s[0] ^= keccak_round_constants[0];
+
+	for (int i = 1; i < 23; i++)
+	{
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+		t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+		t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+		t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+		t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+		u = ROTL64(t[1], 1);
+		s[0] = xor3(s[0], t[4], u);
+		s[5] = xor3(s[5], t[4], u);
+		s[10] = xor3(s[10], t[4], u);
+		s[15] = xor3(s[15], t[4], u);
+		s[20] = xor3(s[20], t[4], u);
+
+		u = ROTL64(t[2], 1);
+		s[1] = xor3(s[1], t[0], u);
+		s[6] = xor3(s[6], t[0], u);
+		s[11] = xor3(s[11], t[0], u);
+		s[16] = xor3(s[16], t[0], u);
+		s[21] = xor3(s[21], t[0], u);
+
+		u = ROTL64(t[3], 1);
+		s[2] = xor3(s[2], t[1], u);
+		s[7] = xor3(s[7], t[1], u);
+		s[12] = xor3(s[12], t[1], u);
+		s[17] = xor3(s[17], t[1], u);
+		s[22] = xor3(s[22], t[1], u);
+
+		u = ROTL64(t[4], 1);
+		s[3] = xor3(s[3], t[2], u);
+		s[8] = xor3(s[8], t[2], u);
+		s[13] = xor3(s[13], t[2], u);
+		s[18] = xor3(s[18], t[2], u);
+		s[23] = xor3(s[23], t[2], u);
+
+
+		u = ROTL64(t[0], 1);
+		s[4] = xor3(s[4], t[3], u);
+		s[9] = xor3(s[9], t[3], u);
+		s[14] = xor3(s[14], t[3], u);
+		s[19] = xor3(s[19], t[3], u);
+		s[24] = xor3(s[24], t[3], u);
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		u = s[1];
+
+		s[1] = ROTL64(s[6], 44);
+		s[6] = ROTL64(s[9], 20);
+		s[9] = ROTL64(s[22], 61);
+		s[22] = ROTL64(s[14], 39);
+		s[14] = ROTL64(s[20], 18);
+		s[20] = ROTL64(s[2], 62);
+		s[2] = ROTL64(s[12], 43);
+		s[12] = ROTL64(s[13], 25);
+		s[13] = ROTL64(s[19], 8);
+		s[19] = ROTL64(s[23], 56);
+		s[23] = ROTL64(s[15], 41);
+		s[15] = ROTL64(s[4], 27);
+		s[4] = ROTL64(s[24], 14);
+		s[24] = ROTL64(s[21], 2);
+		s[21] = ROTL64(s[8], 55);
+		s[8] = ROTL64(s[16], 45);
+		s[16] = ROTL64(s[5], 36);
+		s[5] = ROTL64(s[3], 28);
+		s[3] = ROTL64(s[18], 21);
+		s[18] = ROTL64(s[17], 15);
+		s[17] = ROTL64(s[11], 10);
+		s[11] = ROTL64(s[7], 6);
+		s[7] = ROTL64(s[10], 3);
+		s[10] = ROTL64(u, 1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+		u = s[0]; v = s[1];
+		s[0] = chi(s[0], s[1], s[2]);	
+		s[1] = chi(s[1], s[2], s[3]);
+		s[2] = chi(s[2], s[3], s[4]);
+		s[3] = chi(s[3], s[4], u);
+		s[4] = chi(s[4], u, v);
+
+		u = s[5]; v = s[6];
+		s[5] = chi(s[5], s[6], s[7]);
+		s[6] = chi(s[6], s[7], s[8]);
+		s[7] = chi(s[7], s[8], s[9]);
+		s[8] = chi(s[8], s[9], u);
+		s[9] = chi(s[9], u, v);
+
+		u = s[10]; v = s[11];
+		s[10] = chi(s[10], s[11], s[12]);
+		s[11] = chi(s[11], s[12], s[13]);
+		s[12] = chi(s[12], s[13], s[14]);
+		s[13] = chi(s[13], s[14], u);
+		s[14] = chi(s[14], u, v);
+
+		u = s[15]; v = s[16];
+		s[15] = chi(s[15], s[16], s[17]);
+		s[16] = chi(s[16], s[17], s[18]);
+		s[17] = chi(s[17], s[18], s[19]);
+		s[18] = chi(s[18], s[19], u);
+		s[19] = chi(s[19], u, v);
+
+		u = s[20]; v = s[21];
+		s[20] = chi(s[20], s[21], s[22]);
+		s[21] = chi(s[21], s[22], s[23]);
+		s[22] = chi(s[22], s[23], s[24]);
+		s[23] = chi(s[23], s[24], u);
+		s[24] = chi(s[24], u, v);
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= keccak_round_constants[i];
+	}
+
+	t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+	t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+	t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+	t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+	t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+	s[0] = xor3(s[0], t[4], ROTL64(t[1], 1));
+	s[6] = xor3(s[6], t[0], ROTL64(t[2], 1));
+	s[12] = xor3(s[12], t[1], ROTL64(t[3], 1));
+
+	s[1] = ROTL64(s[6], 44);
+	s[2] = ROTL64(s[12], 43);
+
+	s[0] = chi(s[0], s[1], s[2]);
+
+	/* iota: a[0,0] ^= round constant */
+	//s[0] ^= vectorize(keccak_round_constants[23]);
+	return s[0] ^ keccak_round_constants[23];
+}
+
+__device__ __forceinline__ void SHA3_512(uint2* s2) {
+
+	uint64_t * s = (uint64_t*)s2; //dirty
+
+	uint64_t t[5], u, v;
+
+	for (uint32_t i = 9; i < 25; i++)
+	{
+		s[i] = 0;
+	}
+	s[8] = 0x8000000000000001;
+
+	for (int i = 0; i < 23; i++)
+	{
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+		t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+		t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+		t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+		t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+		u = ROTL64(t[1], 1);
+		s[0] = xor3(s[0], t[4], u);
+		s[5] = xor3(s[5], t[4], u);
+		s[10] = xor3(s[10], t[4], u);
+		s[15] = xor3(s[15], t[4], u);
+		s[20] = xor3(s[20], t[4], u);
+
+		u = ROTL64(t[2], 1);
+		s[1] = xor3(s[1], t[0], u);
+		s[6] = xor3(s[6], t[0], u);
+		s[11] = xor3(s[11], t[0], u);
+		s[16] = xor3(s[16], t[0], u);
+		s[21] = xor3(s[21], t[0], u);
+
+		u = ROTL64(t[3], 1);
+		s[2] = xor3(s[2], t[1], u);
+		s[7] = xor3(s[7], t[1], u);
+		s[12] = xor3(s[12], t[1], u);
+		s[17] = xor3(s[17], t[1], u);
+		s[22] = xor3(s[22], t[1], u);
+
+		u = ROTL64(t[4], 1);
+		s[3] = xor3(s[3], t[2], u);
+		s[8] = xor3(s[8], t[2], u);
+		s[13] = xor3(s[13], t[2], u);
+		s[18] = xor3(s[18], t[2], u);
+		s[23] = xor3(s[23], t[2], u);
+
+
+		u = ROTL64(t[0], 1);
+		s[4] = xor3(s[4], t[3], u);
+		s[9] = xor3(s[9], t[3], u);
+		s[14] = xor3(s[14], t[3], u);
+		s[19] = xor3(s[19], t[3], u);
+		s[24] = xor3(s[24], t[3], u);
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		u = s[1];
+
+		s[1] = ROTL64(s[6], 44);
+		s[6] = ROTL64(s[9], 20);
+		s[9] = ROTL64(s[22], 61);
+		s[22] = ROTL64(s[14], 39);
+		s[14] = ROTL64(s[20], 18);
+		s[20] = ROTL64(s[2], 62);
+		s[2] = ROTL64(s[12], 43);
+		s[12] = ROTL64(s[13], 25);
+		s[13] = ROTL64(s[19], 8);
+		s[19] = ROTL64(s[23], 56);
+		s[23] = ROTL64(s[15], 41);
+		s[15] = ROTL64(s[4], 27);
+		s[4] = ROTL64(s[24], 14);
+		s[24] = ROTL64(s[21], 2);
+		s[21] = ROTL64(s[8], 55);
+		s[8] = ROTL64(s[16], 45);
+		s[16] = ROTL64(s[5], 36);
+		s[5] = ROTL64(s[3], 28);
+		s[3] = ROTL64(s[18], 21);
+		s[18] = ROTL64(s[17], 15);
+		s[17] = ROTL64(s[11], 10);
+		s[11] = ROTL64(s[7], 6);
+		s[7] = ROTL64(s[10], 3);
+		s[10] = ROTL64(u, 1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+		u = s[0]; v = s[1];
+		s[0] = chi(s[0], s[1], s[2]);
+		s[1] = chi(s[1], s[2], s[3]);
+		s[2] = chi(s[2], s[3], s[4]);
+		s[3] = chi(s[3], s[4], u);
+		s[4] = chi(s[4], u, v);
+
+		u = s[5]; v = s[6];
+		s[5] = chi(s[5], s[6], s[7]);
+		s[6] = chi(s[6], s[7], s[8]);
+		s[7] = chi(s[7], s[8], s[9]);
+		s[8] = chi(s[8], s[9], u);
+		s[9] = chi(s[9], u, v);
+
+		u = s[10]; v = s[11];
+		s[10] = chi(s[10], s[11], s[12]);
+		s[11] = chi(s[11], s[12], s[13]);
+		s[12] = chi(s[12], s[13], s[14]);
+		s[13] = chi(s[13], s[14], u);
+		s[14] = chi(s[14], u, v);
+
+		u = s[15]; v = s[16];
+		s[15] = chi(s[15], s[16], s[17]);
+		s[16] = chi(s[16], s[17], s[18]);
+		s[17] = chi(s[17], s[18], s[19]);
+		s[18] = chi(s[18], s[19], u);
+		s[19] = chi(s[19], u, v);
+
+		u = s[20]; v = s[21];
+		s[20] = chi(s[20], s[21], s[22]);
+		s[21] = chi(s[21], s[22], s[23]);
+		s[22] = chi(s[22], s[23], s[24]);
+		s[23] = chi(s[23], s[24], u);
+		s[24] = chi(s[24], u, v);
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= keccak_round_constants[i];
+	}
+
+	/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+	t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+	t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+	t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+	t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+	t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+	/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+	/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+	u = ROTL64(t[1], 1);
+	s[0] = xor3(s[0], t[4], u);
+	s[10] = xor3(s[10], t[4], u);
+
+	u = ROTL64(t[2], 1);
+	s[6] = xor3(s[6], t[0], u);
+	s[16] = xor3(s[16], t[0], u);
+
+	u = ROTL64(t[3], 1);
+	s[12] = xor3(s[12], t[1], u);
+	s[22] = xor3(s[22], t[1], u);
+
+	u = ROTL64(t[4], 1);
+	s[3] = xor3(s[3], t[2], u);
+	s[18] = xor3(s[18], t[2], u);
+
+	u = ROTL64(t[0], 1);
+	s[9] = xor3(s[9], t[3], u);
+	s[24] = xor3(s[24], t[3], u);
+
+	/* rho pi: b[..] = rotl(a[..], ..) */
+	u = s[1];
+
+	s[1] = ROTL64(s[6], 44);
+	s[6] = ROTL64(s[9], 20);
+	s[9] = ROTL64(s[22], 61);
+	s[2] = ROTL64(s[12], 43);
+	s[4] = ROTL64(s[24], 14);
+	s[8] = ROTL64(s[16], 45);
+	s[5] = ROTL64(s[3], 28);
+	s[3] = ROTL64(s[18], 21);
+	s[7] = ROTL64(s[10], 3);
+
+	/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+	u = s[0]; v = s[1];
+	s[0] = chi(s[0], s[1], s[2]);
+	s[1] = chi(s[1], s[2], s[3]);
+	s[2] = chi(s[2], s[3], s[4]);
+	s[3] = chi(s[3], s[4], u);
+	s[4] = chi(s[4], u, v);
+	s[5] = chi(s[5], s[6], s[7]);
+	s[6] = chi(s[6], s[7], s[8]);
+	s[7] = chi(s[7], s[8], s[9]);
+
+	/* iota: a[0,0] ^= round constant */
+	s[0] ^= keccak_round_constants[23];
+}
--- a/libethcore/EthashAux.cpp
+++ b/libethcore/EthashAux.cpp
@ -52,6 +52,8 @@ const unsigned EthashProofOfWork::defaultGlobalWorkSizeMultiplier = 4096; // * C
 const unsigned EthashProofOfWork::defaultMSPerBatch = 0;
 const EthashProofOfWork::WorkPackage EthashProofOfWork::NullWorkPackage = EthashProofOfWork::WorkPackage();

+//unsigned EthashProofOfWork::s_dagLoadMode = 0;
+
 EthashAux::~EthashAux()
 {
 }
--- a/libethcore/EthashAux.h
+++ b/libethcore/EthashAux.h
@ -32,6 +32,7 @@ namespace dev
 namespace eth
 {

+
 struct DAGChannel: public LogChannel { static const char* name(); static const int verbosity = 1; };

 /// Proof of work definition for Ethash.
@ -52,17 +53,20 @@ struct EthashProofOfWork
 	struct WorkPackage
 	{
 		WorkPackage() = default;
-		WorkPackage(Ethash::BlockHeader const& _bh):
+		WorkPackage(Ethash::BlockHeader const& _bh) :
 			boundary(_bh.boundary()),
 			headerHash(_bh.hashWithout()),
 			seedHash(_bh.seedHash())
-		{}
+		{ }
 		void reset() { headerHash = h256(); }
 		operator bool() const { return headerHash != h256(); }

 		h256 boundary;
 		h256 headerHash;	///< When h256() means "pause until notified a new work package is available".
 		h256 seedHash;
+
+		uint64_t startNonce = 0;
+		int exSizeBits = -1;
 	};

 	static const WorkPackage NullWorkPackage;
@ -73,6 +77,7 @@ struct EthashProofOfWork
 	static const unsigned defaultGlobalWorkSizeMultiplier;
 	/// Default value of the milliseconds per global work size (per batch)
 	static const unsigned defaultMSPerBatch;
+
 };

 enum class DAGEraseMode
--- a/libethcore/EthashCUDAMiner.cpp
+++ b/libethcore/EthashCUDAMiner.cpp
@ -112,16 +112,8 @@ EthashCUDAMiner::EthashCUDAMiner(ConstructionInfo const& _ci) :
 	Worker("cudaminer" + toString(index())),
 m_hook( new EthashCUDAHook(this))
 {
-/*
-#if defined(WIN32)
-	SYSTEM_INFO sysinfo;
-	GetSystemInfo(&sysinfo);
-	int num_cpus = sysinfo.dwNumberOfProcessors;
-	SetThreadAffinityMask(GetCurrentThread(), 1 << (index() % num_cpus));
-	SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
-#endif
-*/
 }
+
 EthashCUDAMiner::~EthashCUDAMiner()
 {
 	pause();
@ -149,38 +141,65 @@ void EthashCUDAMiner::workLoop()
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
 		WorkPackage w = work();
-		//cnote << "seedhash" << "#" + m_minerSeed.hex().substr(0, 16);
-		cnote << "set work to" << "#" + w.headerHash.hex().substr(0, 8) + ", target " << "#" + w.boundary.hex().substr(0, 16);
+		cnote << "set work; seed: " << "#" + w.seedHash.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
 		if (!m_miner || m_minerSeed != w.seedHash)
 		{
+			unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index();
+
+			if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
+			{
+				while (s_dagLoadIndex < index()) {
+					this_thread::sleep_for(chrono::seconds(1));
+				}
+			}
+			else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+			{
+				if (device != s_dagCreateDevice)
+				{
+					// wait until DAG is created on selected device
+					while (s_dagInHostMemory == NULL) {
+						this_thread::sleep_for(chrono::seconds(1));
+					}
+				}
+				else
+				{
+					// reset load index
+					s_dagLoadIndex = 0;
+				}
+			}
+
 			cnote << "Initialising miner...";
 			m_minerSeed = w.seedHash;

 			delete m_miner;
 			m_miner = new ethash_cuda_miner;

-			unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index();
+			EthashAux::LightType light;
+			light = EthashAux::light(w.seedHash);
+			//bytesConstRef dagData = dag->data();
+			bytesConstRef lightData = light->data();
+			
+			m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory);
+			s_dagLoadIndex++;

-			EthashAux::FullType dag;
-			while (true)
+			if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
 			{
-				if ((dag = EthashAux::full(w.seedHash, true)))
-					break;
-				if (shouldStop())
+				if (s_dagLoadIndex >= s_numInstances && s_dagInHostMemory)
 				{
-					delete m_miner;
-					m_miner = nullptr;
-					return;
+					// all devices have loaded DAG, we can free now
+					delete[] s_dagInHostMemory;
+					s_dagInHostMemory = NULL;
+
+					cout << "Freeing DAG from host" << endl;
 				}
-				cnote << "Awaiting DAG";
-				this_thread::sleep_for(chrono::milliseconds(500));
 			}
-			bytesConstRef dagData = dag->data();
-			m_miner->init(dagData.data(), dagData.size(), device);
 		}

 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
-		m_miner->search(w.headerHash.data(), upper64OfBoundary, *m_hook);
+		uint64_t startN;
+		if (w.exSizeBits >= 0)
+			startN = w.startNonce | ((uint64_t)index() << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
+		m_miner->search(w.headerHash.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
 	}
 	catch (std::runtime_error const& _e)
 	{
@ -217,17 +236,15 @@ bool EthashCUDAMiner::configureGPU(
 	unsigned _numStreams,
 	unsigned _extraGPUMemory,
 	unsigned _scheduleFlag,
-	uint64_t _currentBlock
+	uint64_t _currentBlock,
+	unsigned _dagLoadMode,
+	unsigned _dagCreateDevice
 	)
 {
+	s_dagLoadMode = _dagLoadMode;
+	s_dagCreateDevice = _dagCreateDevice;
 	_blockSize = ((_blockSize + 7) / 8) * 8;
-	/*
-	if (_blockSize != 32 && _blockSize != 64 && _blockSize != 128)
-	{
-		cout << "Given localWorkSize of " << toString(_blockSize) << "is invalid. Must be either 32,64 or 128" << endl;
-		return false;
-	}
-	*/
+
 	if (!ethash_cuda_miner::configureGPU(
 		s_devices,
 		_blockSize,
--- a/libethcore/EthashCUDAMiner.h
+++ b/libethcore/EthashCUDAMiner.h
@ -53,7 +53,9 @@ namespace eth
 			unsigned _numStreams,
 			unsigned _extraGPUMemory,
 			unsigned _scheduleFlag,
-			uint64_t _currentBlock
+			uint64_t _currentBlock,
+			unsigned _dagLoadMode,
+			unsigned _dagCreateDevice
 			);
 		static void setNumInstances(unsigned _instances) 
 		{ 
@ -84,6 +86,7 @@ namespace eth
 		static unsigned s_deviceId;
 		static unsigned s_numInstances;
 		static int s_devices[16];
+
 	};
 }
 }
--- a/libethcore/EthashGPUMiner.cpp
+++ b/libethcore/EthashGPUMiner.cpp
@ -107,7 +107,7 @@ int EthashGPUMiner::s_devices[16] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

 EthashGPUMiner::EthashGPUMiner(ConstructionInfo const& _ci):
 	GenericMiner<EthashProofOfWork>(_ci),
-	Worker("gpuminer" + toString(index())),
+	Worker("openclminer" + toString(index())),
 	m_hook(new EthashCLHook(this))
 {
 }
@ -139,9 +139,16 @@ void EthashGPUMiner::workLoop()
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
 		WorkPackage w = work();
-		cnote << "set work to:" << "#" + w.headerHash.hex().substr(0, 8) + ", target " << "#" + w.boundary.hex().substr(0, 16);
+		cnote << "set work; seed: " << "#" + w.seedHash.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
 		if (!m_miner || m_minerSeed != w.seedHash)
 		{
+			if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
+			{
+				while (s_dagLoadIndex < index()) {
+					this_thread::sleep_for(chrono::seconds(1));
+				}
+			}
+
 			cnote << "Initialising miner...";
 			m_minerSeed = w.seedHash;

@ -150,6 +157,7 @@ void EthashGPUMiner::workLoop()

 			unsigned device = s_devices[index()] > -1 ? s_devices[index()] : index();

+			/*
 			EthashAux::FullType dag;
 			while (true)
 			{
@ -164,12 +172,21 @@ void EthashGPUMiner::workLoop()
 				cnote << "Awaiting DAG";
 				this_thread::sleep_for(chrono::milliseconds(500));
 			}
-			bytesConstRef dagData = dag->data();
-			m_miner->init(dagData.data(), dagData.size(), s_platformId, device);
+			*/
+
+			EthashAux::LightType light;
+			light = EthashAux::light(w.seedHash);
+			bytesConstRef lightData = light->data();
+
+			m_miner->init(light->light, lightData.data(), lightData.size(), s_platformId,  device);
+			s_dagLoadIndex++;
 		}

 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
-		m_miner->search(w.headerHash.data(), upper64OfBoundary, *m_hook);
+		uint64_t startN;
+		if (w.exSizeBits >= 0)
+			startN = w.startNonce | ((uint64_t)index() << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
+		m_miner->search(w.headerHash.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
 	}
 	catch (cl::Error const& _e)
 	{
@ -207,27 +224,27 @@ bool EthashGPUMiner::configureGPU(
 	unsigned _deviceId,
 	bool _allowCPU,
 	unsigned _extraGPUMemory,
-	uint64_t _currentBlock
+	uint64_t _currentBlock,
+	unsigned _dagLoadMode,
+	unsigned _dagCreateDevice
 )
 {
+	s_dagLoadMode = _dagLoadMode;
+	s_dagCreateDevice = _dagCreateDevice;
+
 	s_platformId = _platformId;
 	s_deviceId = _deviceId;

 	_localWorkSize = ((_localWorkSize + 7) / 8) * 8;
-	/*
-	if (_localWorkSize != 32 && _localWorkSize != 64 && _localWorkSize != 128 && _localWorkSize != 256)
-	{
-		cout << "Given localWorkSize of " << toString(_localWorkSize) << "is invalid. Must be either 32,64,128 or 256" << endl;
-		return false;
-	}
-	*/ 
+
 	if (!ethash_cl_miner::configureGPU(
 			_platformId,
 			_localWorkSize,
 			_globalWorkSizeMultiplier * _localWorkSize,
 			_allowCPU,
 			_extraGPUMemory,
-			_currentBlock)
+			_currentBlock
+			)
 	)
 	{
 		cout << "No GPU device with sufficient memory was found. Can't GPU mine. Remove the -G argument" << endl;
--- a/libethcore/EthashGPUMiner.h
+++ b/libethcore/EthashGPUMiner.h
@ -52,7 +52,9 @@ public:
 		unsigned _deviceId,
 		bool _allowCPU,
 		unsigned _extraGPUMemory,
-		uint64_t _currentBlock
+		uint64_t _currentBlock,
+		unsigned _dagLoadMode,
+		unsigned _dagCreateDevice
 	);
 	static void setNumInstances(unsigned _instances) { s_numInstances = std::min<unsigned>(_instances, getNumDevices()); }
 	static void setDevices(unsigned * _devices, unsigned _selectedDeviceCount)
@ -81,6 +83,7 @@ private:
 	static unsigned s_deviceId;
 	static unsigned s_numInstances;
 	static int s_devices[16];
+
 };

 }
--- a/libethcore/EthashSealEngine.cpp
+++ b/libethcore/EthashSealEngine.cpp
@ -59,7 +59,7 @@ void EthashSealEngine::generateSeal(BlockInfo const& _bi)
 {
 	m_sealing = Ethash::BlockHeader(_bi);
 	m_farm.setWork(m_sealing);
-	m_farm.start(m_sealer);
+	m_farm.start(m_sealer, false);
 	m_farm.setWork(m_sealing);		// TODO: take out one before or one after...
 	bytes shouldPrecompute = option("precomputeDAG");
 	if (!shouldPrecompute.empty() && shouldPrecompute[0] == 1)
--- a/libethcore/Farm.h
+++ b/libethcore/Farm.h
@ -67,7 +67,7 @@ public:
 	void setWork(WorkPackage const& _wp)
 	{
 		WriteGuard l(x_minerWork);
-		if (_wp.headerHash == m_work.headerHash)
+		if (_wp.headerHash == m_work.headerHash && _wp.startNonce == m_work.startNonce)
 			return;
 		m_work = _wp;
 		for (auto const& m: m_miners)
@ -80,7 +80,7 @@ public:
 	/**
 	 * @brief Start a number of miners.
 	 */
-	bool start(std::string const& _sealer)
+	bool start(std::string const& _sealer, bool mixed)
 	{
 		WriteGuard l(x_minerWork);
 		if (!m_miners.empty() && m_lastSealer == _sealer)
@ -88,10 +88,23 @@ public:
 		if (!m_sealers.count(_sealer))
 			return false;

-		m_miners.clear();
+		if (!mixed)
+		{
+			m_miners.clear();
+		}
 		auto ins = m_sealers[_sealer].instances();
-		m_miners.reserve(ins);
-		for (unsigned i = 0; i < ins; ++i)
+		unsigned start = 0;
+		if (!mixed)
+		{
+			m_miners.reserve(ins);
+		}
+		else
+		{
+			start = m_miners.size();
+			ins += start;
+			m_miners.reserve(ins);
+		}
+		for (unsigned i = start; i < ins; ++i)
 		{
 			m_miners.push_back(std::shared_ptr<Miner>(m_sealers[_sealer].create(std::make_pair(this, i))));
 			m_miners.back()->setWork(m_work);
--- a/libethcore/Miner.cpp
+++ b/libethcore/Miner.cpp
@ -0,0 +1,19 @@
+#include "Miner.h"
+#include "EthashAux.h"
+
+using namespace dev;
+using namespace eth;
+
+template <>
+unsigned dev::eth::GenericMiner<dev::eth::EthashProofOfWork>::s_dagLoadMode = 0;
+
+template <>
+volatile unsigned dev::eth::GenericMiner<dev::eth::EthashProofOfWork>::s_dagLoadIndex = 0;
+
+template <>
+unsigned dev::eth::GenericMiner<dev::eth::EthashProofOfWork>::s_dagCreateDevice = 0;
+
+template <>
+volatile void* dev::eth::GenericMiner<dev::eth::EthashProofOfWork>::s_dagInHostMemory = NULL;
+
+
--- a/libethcore/Miner.h
+++ b/libethcore/Miner.h
@ -24,6 +24,7 @@
 #include <thread>
 #include <list>
 #include <atomic>
+#include <string>
 #include <boost/timer.hpp>
 #include <libdevcore/Common.h>
 #include <libdevcore/Log.h>
@ -34,6 +35,24 @@
 #define MINER_WAIT_STATE_WORK	 1
 #define MINER_WAIT_STATE_DAG	 2

+
+#define DAG_LOAD_MODE_PARALLEL	 0
+#define DAG_LOAD_MODE_SEQUENTIAL 1
+#define DAG_LOAD_MODE_SINGLE	 2
+
+#define STRATUM_PROTOCOL_STRATUM		 0
+#define STRATUM_PROTOCOL_ETHPROXY		 1
+#define STRATUM_PROTOCOL_ETHEREUMSTRATUM 2
+
+using namespace std;
+
+typedef struct {
+	string host;
+	string port;
+	string user;
+	string pass;
+} cred_t;
+
 namespace dev
 {

@ -44,7 +63,8 @@ enum class MinerType
 {
 	CPU,
 	CL,
-	CUDA
+	CUDA,
+	Mixed
 };

 struct MineInfo: public WorkingProgress {};
@ -91,6 +111,7 @@ inline std::ostream& operator<<(std::ostream& os, SolutionStats s)

 template <class PoW> class GenericMiner;

+
 /**
 * @brief Class for hosting one or more Miners.
 * @warning Must be implemented in a threadsafe manner since it will be called from multiple
@ -162,6 +183,7 @@ public:

 protected:

+
 	// REQUIRED TO BE REIMPLEMENTED BY A SUBCLASS:

 	/**
@ -199,6 +221,10 @@ protected:

 	void accumulateHashes(unsigned _n) { m_hashCount += _n; }

+	static unsigned s_dagLoadMode;
+	static volatile unsigned s_dagLoadIndex;
+	static unsigned s_dagCreateDevice;
+	static volatile void* s_dagInHostMemory;
 private:
 	FarmFace* m_farm = nullptr;
 	unsigned m_index;
@ -207,6 +233,9 @@ private:

 	WorkPackage m_work;
 	mutable Mutex x_work;
+
+	
+	bool m_dagLoaded = false;
 };

 }
--- a/libstratum/EthStratumClient.cpp
+++ b/libstratum/EthStratumClient.cpp
@ -1,10 +1,33 @@
-
+ 
 #include "EthStratumClient.h"
 #include <libdevcore/Log.h>
+#include <libethash/endian.h>
 using boost::asio::ip::tcp;


-EthStratumClient::EthStratumClient(GenericFarm<EthashProofOfWork> * f, MinerType m, string const & host, string const & port, string const & user, string const & pass, int const & retries, int const & worktimeout, bool const & precompute)
+static void diffToTarget(uint32_t *target, double diff)
+{
+	uint32_t target2[8];
+	uint64_t m;
+	int k;
+
+	for (k = 6; k > 0 && diff > 1.0; k--)
+		diff /= 4294967296.0;
+	m = (uint64_t)(4294901760.0 / diff);
+	if (m == 0 && k == 6)
+		memset(target2, 0xff, 32);
+	else {
+		memset(target2, 0, 32);
+		target2[k] = (uint32_t)m;
+		target2[k + 1] = (uint32_t)(m >> 32);
+	}
+
+	for (int i = 0; i < 32; i++)
+		((uint8_t*)target)[31 - i] = ((uint8_t*)target2)[i];
+}
+
+
+EthStratumClient::EthStratumClient(GenericFarm<EthashProofOfWork> * f, MinerType m, string const & host, string const & port, string const & user, string const & pass, int const & retries, int const & worktimeout, int const & protocol, string const & email)
 	: m_socket(m_io_service)
 {
 	m_minerType = m;
@ -17,11 +40,13 @@ EthStratumClient::EthStratumClient(GenericFarm<EthashProofOfWork> * f, MinerType

 	m_authorized = false;
 	m_connected = false;
-	m_precompute = precompute;
 	m_pending = 0;
 	m_maxRetries = retries;
 	m_worktimeout = worktimeout;

+	m_protocol = protocol;
+	m_email = email;
+
 	p_farm = f;
 	p_worktimer = nullptr;
 	connect();
@ -65,25 +90,16 @@ void EthStratumClient::connect()

 void EthStratumClient::reconnect()
 {
-	/*
-	if (p_farm->isMining())
-	{
-		cnote << "Stopping farm";
-		p_farm->stop();
-	}
-	*/
 	if (p_worktimer) {
 		p_worktimer->cancel();
 		p_worktimer = nullptr;
 	}

 	m_io_service.reset();
-	m_socket.close();
+	//m_socket.close(); // leads to crashes on Linux
 	m_authorized = false;
 	m_connected = false;
-	
-
-	
+		
 	if (!m_failover.host.empty())
 	{
 		m_retries++;
@ -153,15 +169,46 @@ void EthStratumClient::connect_handler(const boost::system::error_code& ec, tcp:
 		{
 			cnote << "Starting farm";
 			if (m_minerType == MinerType::CPU)
-				p_farm->start("cpu");
+				p_farm->start("cpu", false);
 			else if (m_minerType == MinerType::CL)
-				p_farm->start("opencl");
+				p_farm->start("opencl", false);
 			else if (m_minerType == MinerType::CUDA)
-				p_farm->start("cuda");
+				p_farm->start("cuda", false);
+			else if (m_minerType == MinerType::Mixed) {
+				p_farm->start("cuda", false);
+				p_farm->start("opencl", true);
+			}
 		}
 		std::ostream os(&m_requestBuffer);
-		os << "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}\n";

+		string user;
+		size_t p;
+
+		switch (m_protocol) {
+			case STRATUM_PROTOCOL_STRATUM:
+				os << "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}\n";
+				break;
+			case STRATUM_PROTOCOL_ETHPROXY:
+				p = p_active->user.find_first_of(".");
+				user = p_active->user.substr(0, p);
+				if (p + 1 <= p_active->user.length())
+					m_worker = p_active->user.substr(p + 1);
+				else
+					m_worker = "";
+
+				if (m_email.empty())
+				{
+					os << "{\"id\": 1, \"worker\":\"" << m_worker << "\", \"method\": \"eth_submitLogin\", \"params\": [\"" << user << "\"]}\n";
+				}
+				else
+				{
+					os << "{\"id\": 1, \"worker\":\"" << m_worker << "\", \"method\": \"eth_submitLogin\", \"params\": [\"" << user << "\", \"" << m_email << "\"]}\n";
+				}
+				break;
+			case STRATUM_PROTOCOL_ETHEREUMSTRATUM:
+				os << "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"ethminer/" << ETH_PROJECT_VERSION << "\",\"EthereumStratum/1.0.0\"]}\n";
+				break;
+		}
 		
 		async_write(m_socket, m_requestBuffer,
 			boost::bind(&EthStratumClient::handleResponse, this,
@ -213,7 +260,7 @@ void EthStratumClient::readResponse(const boost::system::error_code& ec, std::si
 		std::string response;
 		getline(is, response);

-		if (response.front() == '{' && response.back() == '}') 
+		if (!response.empty() && response.front() == '{' && response.back() == '}') 
 		{
 			Json::Value responseObject;
 			Json::Reader reader;
@ -222,12 +269,12 @@ void EthStratumClient::readResponse(const boost::system::error_code& ec, std::si
 				processReponse(responseObject);
 				m_response = response;
 			}
-			else
+			else 
 			{
 				cwarn << "Parse response failed: " << reader.getFormattedErrorMessages();
 			}
 		}
-		else
+		else if (m_protocol != STRATUM_PROTOCOL_ETHPROXY)
 		{
 			cwarn << "Discarding incomplete response";
 		}
@ -242,6 +289,16 @@ void EthStratumClient::readResponse(const boost::system::error_code& ec, std::si
 	}
 }

+void EthStratumClient::processExtranonce(std::string& enonce)
+{
+	m_extraNonceHexSize = enonce.length();
+
+	cnote << "Extranonce set to " << enonce;
+
+	for (int i = enonce.length(); i < 16; ++i) enonce += "0";
+	m_extraNonce = h64(enonce);
+}
+
 void EthStratumClient::processReponse(Json::Value& responseObject)
 {
 	Json::Value error = responseObject.get("error", new Json::Value);
@ -255,16 +312,37 @@ void EthStratumClient::processReponse(Json::Value& responseObject)
 	int id = responseObject.get("id", Json::Value::null).asInt();
 	switch (id)
 	{
-	case 1:
-		cnote << "Subscribed to stratum server";
-
-		os << "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"" << p_active->user << "\",\"" << p_active->pass << "\"]}\n";
+		case 1:
+		if (m_protocol == STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+		{
+			m_nextWorkDifficulty = 1;
+			params = responseObject.get("result", Json::Value::null);
+			if (params.isArray())
+			{
+				std::string enonce = params.get((Json::Value::ArrayIndex)1, "").asString();
+				processExtranonce(enonce);
+			}

+			os << "{\"id\": 2, \"method\": \"mining.extranonce.subscribe\", \"params\": []}\n";
+		}
+		if (m_protocol != STRATUM_PROTOCOL_ETHPROXY)
+		{
+			cnote << "Subscribed to stratum server";
+			os << "{\"id\": 3, \"method\": \"mining.authorize\", \"params\": [\"" << p_active->user << "\",\"" << p_active->pass << "\"]}\n";
+		}
+		else
+		{
+			m_authorized = true;
+			os << "{\"id\": 5, \"method\": \"eth_getWork\", \"params\": []}\n"; // not strictly required but it does speed up initialization
+		}
 		async_write(m_socket, m_requestBuffer,
 			boost::bind(&EthStratumClient::handleResponse, this,
 			boost::asio::placeholders::error));
 		break;
 	case 2:
+		// nothing to do...
+		break;
+	case 3:
 		m_authorized = responseObject.get("result", Json::Value::null).asBool();
 		if (!m_authorized)
 		{
@ -285,76 +363,121 @@ void EthStratumClient::processReponse(Json::Value& responseObject)
 		}
 		break;
 	default:
-		string method = responseObject.get("method", "").asString();
+		string method, workattr;
+		unsigned index;
+		if (m_protocol != STRATUM_PROTOCOL_ETHPROXY)
+		{
+			method = responseObject.get("method", "").asString();
+			workattr = "params";
+			index = 1;
+		}
+		else
+		{
+			method = "mining.notify";
+			workattr = "result";
+			index = 0;
+		}
+
 		if (method == "mining.notify")
 		{
-			params = responseObject.get("params", Json::Value::null);
+			params = responseObject.get(workattr.c_str(), Json::Value::null);
 			if (params.isArray())
 			{
 				string job = params.get((Json::Value::ArrayIndex)0, "").asString();
-				string sHeaderHash = params.get((Json::Value::ArrayIndex)1, "").asString();
-				string sSeedHash = params.get((Json::Value::ArrayIndex)2, "").asString();
-				string sShareTarget = params.get((Json::Value::ArrayIndex)3, "").asString();
-				//bool cleanJobs = params.get((Json::Value::ArrayIndex)4, "").asBool();
-				
-				// coinmine.pl fix
-				int l = sShareTarget.length();
-				if (l < 66)
-					sShareTarget = "0x" + string(66 - l, '0') + sShareTarget.substr(2);
-								
-
-				if (sHeaderHash != "" && sSeedHash != "" && sShareTarget != "")
-				{
-					cnote << "Received new job #" + job.substr(0,8);
-					//cnote << "Header hash: " + sHeaderHash;
-					//cnote << "Seed hash: " + sSeedHash;
-					//cnote << "Share target: " + sShareTarget;
-
-					h256 seedHash = h256(sSeedHash);
-					h256 headerHash = h256(sHeaderHash);
-					EthashAux::FullType dag;

+				if (m_protocol == STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+				{
+					string sSeedHash = params.get((Json::Value::ArrayIndex)1, "").asString();
+					string sHeaderHash = params.get((Json::Value::ArrayIndex)2, "").asString();

-					if (seedHash != m_current.seedHash)
-					{
-						cnote << "Grabbing DAG for" << seedHash;
-					}
-					if (!(dag = EthashAux::full(seedHash, true, [&](unsigned _pc){ m_waitState = _pc < 100 ? MINER_WAIT_STATE_DAG : MINER_WAIT_STATE_WORK;  cnote << "Creating DAG. " << _pc << "% done..."; return 0; })))
-					{
-						BOOST_THROW_EXCEPTION(DAGCreationFailure());
-					}
-					if (m_precompute)
+					if (sHeaderHash != "" && sSeedHash != "")
 					{
-						EthashAux::computeFull(sha3(seedHash), true);
-					}
-					if (headerHash != m_current.headerHash)
-					{
-						//x_current.lock();
-						if (p_worktimer)
-							p_worktimer->cancel();
+						cnote << "Received new job #" + job;
+
+						h256 seedHash = h256(sSeedHash);
+						h256 headerHash = h256(sHeaderHash);

 						m_previous.headerHash = m_current.headerHash;
 						m_previous.seedHash = m_current.seedHash;
 						m_previous.boundary = m_current.boundary;
+						m_previous.startNonce = m_current.startNonce;
+						m_previous.exSizeBits = m_previous.exSizeBits;
 						m_previousJob = m_job;

 						m_current.headerHash = h256(sHeaderHash);
 						m_current.seedHash = seedHash;
-						m_current.boundary = h256(sShareTarget);// , h256::AlignRight);
+						m_current.boundary = h256();
+						diffToTarget((uint32_t*)m_current.boundary.data(), m_nextWorkDifficulty);
+						m_current.startNonce = ethash_swap_u64(*((uint64_t*)m_extraNonce.data()));
+						m_current.exSizeBits = m_extraNonceHexSize * 4;
 						m_job = job;

 						p_farm->setWork(m_current);
-						//x_current.unlock();
-						p_worktimer = new boost::asio::deadline_timer(m_io_service, boost::posix_time::seconds(m_worktimeout));
-						p_worktimer->async_wait(boost::bind(&EthStratumClient::work_timeout_handler, this, boost::asio::placeholders::error));
+					}
+				}
+				else
+				{
+					string sHeaderHash = params.get((Json::Value::ArrayIndex)index++, "").asString();
+					string sSeedHash = params.get((Json::Value::ArrayIndex)index++, "").asString();
+					string sShareTarget = params.get((Json::Value::ArrayIndex)index++, "").asString();
+
+					// coinmine.pl fix
+					int l = sShareTarget.length();
+					if (l < 66)
+						sShareTarget = "0x" + string(66 - l, '0') + sShareTarget.substr(2);
+
+
+					if (sHeaderHash != "" && sSeedHash != "" && sShareTarget != "")
+					{
+						cnote << "Received new job #" + job.substr(0, 8);
+
+						h256 seedHash = h256(sSeedHash);
+						h256 headerHash = h256(sHeaderHash);

+						if (headerHash != m_current.headerHash)
+						{
+							//x_current.lock();
+							if (p_worktimer)
+								p_worktimer->cancel();
+
+							m_previous.headerHash = m_current.headerHash;
+							m_previous.seedHash = m_current.seedHash;
+							m_previous.boundary = m_current.boundary;
+							m_previousJob = m_job;
+
+							m_current.headerHash = h256(sHeaderHash);
+							m_current.seedHash = seedHash;
+							m_current.boundary = h256(sShareTarget);
+							m_job = job;
+
+							p_farm->setWork(m_current);
+							//x_current.unlock();
+							p_worktimer = new boost::asio::deadline_timer(m_io_service, boost::posix_time::seconds(m_worktimeout));
+							p_worktimer->async_wait(boost::bind(&EthStratumClient::work_timeout_handler, this, boost::asio::placeholders::error));
+
+						}
 					}
 				}
 			}
 		}
-		else if (method == "mining.set_difficulty")
+		else if (method == "mining.set_difficulty" && m_protocol == STRATUM_PROTOCOL_ETHEREUMSTRATUM)
 		{
-
+			params = responseObject.get("params", Json::Value::null);
+			if (params.isArray())
+			{
+				m_nextWorkDifficulty = params.get((Json::Value::ArrayIndex)0, 1).asDouble();
+				if (m_nextWorkDifficulty <= 0.0001) m_nextWorkDifficulty = 0.0001;
+				cnote << "Difficulty set to " << m_nextWorkDifficulty;
+			}
+		}
+		else if (method == "mining.set_extranonce" && m_protocol == STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+		{
+			params = responseObject.get("params", Json::Value::null);
+			if (params.isArray())
+			{
+				std::string enonce = params.get((Json::Value::ArrayIndex)0, "").asString();
+				processExtranonce(enonce);
+			}
 		}
 		else if (method == "client.get_version")
 		{
@ -384,11 +507,30 @@ bool EthStratumClient::submit(EthashProofOfWork::Solution solution) {
 	x_current.unlock();

 	cnote << "Solution found; Submitting to" << p_active->host << "...";
-	cnote << "  Nonce:" << "0x" + solution.nonce.hex();
+
+	string minernonce;
+	if (m_protocol != STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+		cnote << "  Nonce:" << "0x" + solution.nonce.hex();
+	else
+		minernonce = solution.nonce.hex().substr(m_extraNonceHexSize, 16 - m_extraNonceHexSize);
+

 	if (EthashAux::eval(tempWork.seedHash, tempWork.headerHash, solution.nonce).value < tempWork.boundary)
 	{
-		string json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_job + "\",\"0x" + solution.nonce.hex() + "\",\"0x" + tempWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+		string json;
+
+		switch (m_protocol) {
+			case STRATUM_PROTOCOL_STRATUM:
+				json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_job + "\",\"0x" + solution.nonce.hex() + "\",\"0x" + tempWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+				break;
+			case STRATUM_PROTOCOL_ETHPROXY:
+				json = "{\"id\": 4, \"worker\":\"" + m_worker + "\", \"method\": \"eth_submitWork\", \"params\": [\"0x" + solution.nonce.hex() + "\",\"0x" + tempWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+				break;
+			case STRATUM_PROTOCOL_ETHEREUMSTRATUM:
+				json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_job + "\",\"" + minernonce + "\"]}\n";
+				break;
+		}
+
 		std::ostream os(&m_requestBuffer);
 		os << json;
 		m_stale = false;
@ -399,7 +541,20 @@ bool EthStratumClient::submit(EthashProofOfWork::Solution solution) {
 	}
 	else if (EthashAux::eval(tempPreviousWork.seedHash, tempPreviousWork.headerHash, solution.nonce).value < tempPreviousWork.boundary)
 	{
-		string json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_previous_job + "\",\"0x" + solution.nonce.hex() + "\",\"0x" + tempPreviousWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+		string json;
+
+		switch (m_protocol) {
+		case STRATUM_PROTOCOL_STRATUM:
+			json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_previous_job + "\",\"0x" + solution.nonce.hex() + "\",\"0x" + tempPreviousWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+			break;
+		case STRATUM_PROTOCOL_ETHPROXY:
+			json = "{\"id\": 4, \"worker\":\"" + m_worker + "\", \"method\": \"eth_submitWork\", \"params\": [\"0x" + solution.nonce.hex() + "\",\"0x" + tempPreviousWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+			break;
+		case STRATUM_PROTOCOL_ETHEREUMSTRATUM:
+			json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_previous_job + "\",\"" + minernonce + "\"]}\n";
+			break;
+		}
+
 		std::ostream os(&m_requestBuffer);
 		os << json;
 		m_stale = true;
--- a/libstratum/EthStratumClient.h
+++ b/libstratum/EthStratumClient.h
@ -17,24 +17,18 @@ using boost::asio::ip::tcp;
 using namespace dev;
 using namespace dev::eth;

-typedef struct {
-	string host;
-	string port;
-	string user;
-	string pass;
-} cred_t;

 class EthStratumClient
 {
 public:
-	EthStratumClient(GenericFarm<EthashProofOfWork> * f, MinerType m, string const & host, string const & port, string const & user, string const & pass, int const & retries, int const & worktimeout, bool const & precompute);
+	EthStratumClient(GenericFarm<EthashProofOfWork> * f, MinerType m, string const & host, string const & port, string const & user, string const & pass, int const & retries, int const & worktimeout, int const & protocol, string const & email);
 	~EthStratumClient();

 	void setFailover(string const & host, string const & port);
 	void setFailover(string const & host, string const & port, string const & user, string const & pass);

 	bool isRunning() { return m_running; }
-	bool isConnected() { return m_connected; }
+	bool isConnected() { return m_connected && m_authorized; }
 	h256 currentHeaderHash() { return m_current.headerHash; }
 	bool current() { return m_current; }
 	unsigned waitState() { return m_waitState; }
@ -59,9 +53,10 @@ private:
 	cred_t m_primary;
 	cred_t m_failover;

+	string m_worker; // eth-proxy only;
+
 	bool m_authorized;
 	bool m_connected;
-	bool m_precompute;
 	bool m_running = true;

 	int	m_retries = 0;
@ -93,4 +88,13 @@ private:

 	boost::asio::deadline_timer * p_worktimer;

+	int m_protocol;
+	string m_email;
+
+	double m_nextWorkDifficulty;
+
+	h64 m_extraNonce;
+	int m_extraNonceHexSize;
+
+	void processExtranonce(std::string& enonce);
 };
--- a/libstratum/EthStratumClientV2.cpp
+++ b/libstratum/EthStratumClientV2.cpp
@ -0,0 +1,515 @@
+
+#include "EthStratumClientV2.h"
+#include <libdevcore/Log.h>
+#include <libethash/endian.h>
+using boost::asio::ip::tcp;
+
+
+static void diffToTarget(uint32_t *target, double diff)
+{
+	uint32_t target2[8];
+	uint64_t m;
+	int k;
+
+	for (k = 6; k > 0 && diff > 1.0; k--)
+		diff /= 4294967296.0;
+	m = (uint64_t)(4294901760.0 / diff);
+	if (m == 0 && k == 6)
+		memset(target2, 0xff, 32);
+	else {
+		memset(target2, 0, 32);
+		target2[k] = (uint32_t)m;
+		target2[k + 1] = (uint32_t)(m >> 32);
+	}
+
+	for (int i = 0; i < 32; i++)
+		((uint8_t*)target)[31 - i] = ((uint8_t*)target2)[i];
+}
+
+
+EthStratumClientV2::EthStratumClientV2(GenericFarm<EthashProofOfWork> * f, MinerType m, string const & host, string const & port, string const & user, string const & pass, int const & retries, int const & worktimeout, int const & protocol, string const & email)
+	: Worker("stratum"), 
+	  m_socket(m_io_service)
+{
+	m_minerType = m;
+	m_primary.host = host;
+	m_primary.port = port;
+	m_primary.user = user;
+	m_primary.pass = pass;
+
+	p_active = &m_primary;
+
+	m_authorized = false;
+	m_connected = false;
+	m_maxRetries = retries;
+	m_worktimeout = worktimeout;
+
+	m_protocol = protocol;
+	m_email = email;
+
+	p_farm = f;
+	p_worktimer = nullptr;
+	startWorking();
+}
+
+EthStratumClientV2::~EthStratumClientV2()
+{
+
+}
+
+void EthStratumClientV2::setFailover(string const & host, string const & port)
+{
+	setFailover(host, port, p_active->user, p_active->pass);
+}
+
+void EthStratumClientV2::setFailover(string const & host, string const & port, string const & user, string const & pass)
+{
+	m_failover.host = host;
+	m_failover.port = port;
+	m_failover.user = user;
+	m_failover.pass = pass;
+}
+
+void EthStratumClientV2::workLoop() 
+{
+	while (m_running)
+	{
+		try {
+			if (!m_connected)
+			{
+				//m_io_service.run();
+				//boost::thread t(boost::bind(&boost::asio::io_service::run, &m_io_service));
+				connect();
+				
+			}
+			read_until(m_socket, m_responseBuffer, "\n");
+			std::istream is(&m_responseBuffer);
+			std::string response;
+			getline(is, response);
+
+			if (!response.empty() && response.front() == '{' && response.back() == '}')
+			{
+				Json::Value responseObject;
+				Json::Reader reader;
+				if (reader.parse(response.c_str(), responseObject))
+				{
+					processReponse(responseObject);
+					m_response = response;
+				}
+				else
+				{
+					cwarn << "Parse response failed: " << reader.getFormattedErrorMessages();
+				}
+			}
+			else if (m_protocol != STRATUM_PROTOCOL_ETHPROXY)
+			{
+				cwarn << "Discarding incomplete response";
+			}
+		}
+		catch (std::exception const& _e) {
+			cwarn << _e.what();
+			reconnect();
+		}
+	}
+}
+
+
+void EthStratumClientV2::connect()
+{
+	cnote << "Connecting to stratum server " << p_active->host + ":" + p_active->port;
+
+	tcp::resolver r(m_io_service);
+	tcp::resolver::query q(p_active->host, p_active->port);
+	tcp::resolver::iterator endpoint_iterator = r.resolve(q);
+	tcp::resolver::iterator end;
+
+	boost::system::error_code error = boost::asio::error::host_not_found;
+	while (error && endpoint_iterator != end)
+	{
+		m_socket.close();
+		m_socket.connect(*endpoint_iterator++, error);
+	}
+	if (error)
+	{
+		cerr << "Could not connect to stratum server " << p_active->host + ":" + p_active->port + ", " << error.message();
+		reconnect();
+	}
+	else
+	{
+		cnote << "Connected!";
+		m_connected = true;
+		if (!p_farm->isMining())
+		{
+			cnote << "Starting farm";
+			if (m_minerType == MinerType::CPU)
+				p_farm->start("cpu", false);
+			else if (m_minerType == MinerType::CL)
+				p_farm->start("opencl", false);
+			else if (m_minerType == MinerType::CUDA)
+				p_farm->start("cuda", false);
+			else if (m_minerType == MinerType::Mixed) {
+				p_farm->start("cuda", false);
+				p_farm->start("opencl", true);
+			}
+		}
+		std::ostream os(&m_requestBuffer);
+
+		string user;
+		size_t p;
+
+
+		switch (m_protocol) {
+			case STRATUM_PROTOCOL_STRATUM:
+				os << "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}\n";
+				break;
+			case STRATUM_PROTOCOL_ETHPROXY:
+				p = p_active->user.find_first_of(".");
+				user = p_active->user.substr(0, p);
+				if (p + 1 <= p_active->user.length())
+					m_worker = p_active->user.substr(p + 1);
+				else
+					m_worker = "";
+
+				if (m_email.empty())
+				{
+					os << "{\"id\": 1, \"worker\":\"" << m_worker << "\", \"method\": \"eth_submitLogin\", \"params\": [\"" << user << "\"]}\n";
+				}
+				else
+				{
+					os << "{\"id\": 1, \"worker\":\"" << m_worker << "\", \"method\": \"eth_submitLogin\", \"params\": [\"" << user << "\", \"" << m_email << "\"]}\n";
+				}
+				break;
+			case STRATUM_PROTOCOL_ETHEREUMSTRATUM:
+				os << "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"ethminer/" << ETH_PROJECT_VERSION << "\",\"EthereumStratum/1.0.0\"]}\n";
+				break;
+		}
+
+		write(m_socket, m_requestBuffer);
+	}
+}
+
+void EthStratumClientV2::reconnect()
+{
+	if (p_worktimer) {
+		p_worktimer->cancel();
+		p_worktimer = nullptr;
+	}
+
+	//m_io_service.reset();
+	//m_socket.close(); // leads to crashes on Linux
+	m_authorized = false;
+	m_connected = false;
+		
+	if (!m_failover.host.empty())
+	{
+		m_retries++;
+
+		if (m_retries > m_maxRetries)
+		{
+			if (m_failover.host == "exit") {
+				disconnect();
+				return;
+			}
+			else if (p_active == &m_primary)
+			{
+				p_active = &m_failover;
+			}
+			else {
+				p_active = &m_primary;
+			}
+			m_retries = 0;
+		}
+	}
+	
+	cnote << "Reconnecting in 3 seconds...";
+	boost::asio::deadline_timer     timer(m_io_service, boost::posix_time::seconds(3));
+	timer.wait();
+}
+
+void EthStratumClientV2::disconnect()
+{
+	cnote << "Disconnecting";
+	m_connected = false;
+	m_running = false;
+	if (p_farm->isMining())
+	{
+		cnote << "Stopping farm";
+		p_farm->stop();
+	}
+	m_socket.close();
+	//m_io_service.stop();
+}
+
+void EthStratumClientV2::processExtranonce(std::string& enonce)
+{
+	m_extraNonceHexSize = enonce.length();
+
+	cnote << "Extranonce set to " << enonce;
+
+	for (int i = enonce.length(); i < 16; ++i) enonce += "0";
+	m_extraNonce = h64(enonce);
+}
+
+void EthStratumClientV2::processReponse(Json::Value& responseObject)
+{
+	Json::Value error = responseObject.get("error", new Json::Value);
+	if (error.isArray())
+	{
+		string msg = error.get(1, "Unknown error").asString();
+		cnote << msg;
+	}
+	std::ostream os(&m_requestBuffer);
+	Json::Value params;
+	int id = responseObject.get("id", Json::Value::null).asInt();
+	switch (id)
+	{
+		case 1:
+
+		if (m_protocol == STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+		{
+			m_nextWorkDifficulty = 1;
+			params = responseObject.get("result", Json::Value::null);
+			if (params.isArray())
+			{
+				std::string enonce = params.get((Json::Value::ArrayIndex)1, "").asString();
+				processExtranonce(enonce);
+			}
+
+			os << "{\"id\": 2, \"method\": \"mining.extranonce.subscribe\", \"params\": []}\n";
+		}
+		if (m_protocol != STRATUM_PROTOCOL_ETHPROXY)
+		{
+			cnote << "Subscribed to stratum server";
+			os << "{\"id\": 3, \"method\": \"mining.authorize\", \"params\": [\"" << p_active->user << "\",\"" << p_active->pass << "\"]}\n";
+			write(m_socket, m_requestBuffer);
+		}
+		else
+		{
+			m_authorized = true;
+			os << "{\"id\": 5, \"method\": \"eth_getWork\", \"params\": []}\n"; // not strictly required but it does speed up initialization
+			write(m_socket, m_requestBuffer);
+		}
+		break;
+	case 2:
+		// nothing to do...
+		break;
+	case 3:
+		m_authorized = responseObject.get("result", Json::Value::null).asBool();
+		if (!m_authorized)
+		{
+			cnote << "Worker not authorized:" << p_active->user;
+			disconnect();
+			return;
+		}
+		cnote << "Authorized worker " << p_active->user;
+		break;
+	case 4:
+		if (responseObject.get("result", false).asBool()) {
+			cnote << "B-) Submitted and accepted.";
+			p_farm->acceptedSolution(m_stale);
+		}
+		else {
+			cwarn << ":-( Not accepted.";
+			p_farm->rejectedSolution(m_stale);
+		}
+		break;
+	default:
+		string method, workattr;
+		unsigned index;
+		if (m_protocol != STRATUM_PROTOCOL_ETHPROXY)
+		{
+			method = responseObject.get("method", "").asString();
+			workattr = "params";
+			index = 1;
+		}
+		else
+		{
+			method = "mining.notify";
+			workattr = "result";
+			index = 0;
+		}
+
+		if (method == "mining.notify")
+		{
+			params = responseObject.get(workattr, Json::Value::null);
+			if (params.isArray())
+			{
+				string job = params.get((Json::Value::ArrayIndex)0, "").asString();
+
+				if (m_protocol == STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+				{
+					string job = params.get((Json::Value::ArrayIndex)0, "").asString();
+					string sSeedHash = params.get((Json::Value::ArrayIndex)1, "").asString();
+					string sHeaderHash = params.get((Json::Value::ArrayIndex)2, "").asString();
+
+					if (sHeaderHash != "" && sSeedHash != "")
+					{
+						cnote << "Received new job #" + job;
+
+						h256 seedHash = h256(sSeedHash);
+						h256 headerHash = h256(sHeaderHash);
+
+						m_previous.headerHash = m_current.headerHash;
+						m_previous.seedHash = m_current.seedHash;
+						m_previous.boundary = m_current.boundary;
+						m_previous.startNonce = m_current.startNonce;
+						m_previous.exSizeBits = m_previous.exSizeBits;
+						m_previousJob = m_job;
+
+						m_current.headerHash = h256(sHeaderHash);
+						m_current.seedHash = seedHash;
+						m_current.boundary = h256();
+						diffToTarget((uint32_t*)m_current.boundary.data(), m_nextWorkDifficulty);
+						m_current.startNonce = ethash_swap_u64(*((uint64_t*)m_extraNonce.data()));
+						m_current.exSizeBits = m_extraNonceHexSize * 4;
+						m_job = job;
+
+						p_farm->setWork(m_current);
+					}
+				}
+				else
+				{
+					string sHeaderHash = params.get((Json::Value::ArrayIndex)index++, "").asString();
+					string sSeedHash = params.get((Json::Value::ArrayIndex)index++, "").asString();
+					string sShareTarget = params.get((Json::Value::ArrayIndex)index++, "").asString();
+
+					// coinmine.pl fix
+					int l = sShareTarget.length();
+					if (l < 66)
+						sShareTarget = "0x" + string(66 - l, '0') + sShareTarget.substr(2);
+
+
+					if (sHeaderHash != "" && sSeedHash != "" && sShareTarget != "")
+					{
+						cnote << "Received new job #" + job.substr(0, 8);
+
+						h256 seedHash = h256(sSeedHash);
+						h256 headerHash = h256(sHeaderHash);
+
+						if (headerHash != m_current.headerHash)
+						{
+							//x_current.lock();
+							//if (p_worktimer)
+							//	p_worktimer->cancel();
+
+							m_previous.headerHash = m_current.headerHash;
+							m_previous.seedHash = m_current.seedHash;
+							m_previous.boundary = m_current.boundary;
+							m_previousJob = m_job;
+
+							m_current.headerHash = h256(sHeaderHash);
+							m_current.seedHash = seedHash;
+							m_current.boundary = h256(sShareTarget);
+							m_job = job;
+
+							p_farm->setWork(m_current);
+							//x_current.unlock();
+							//p_worktimer = new boost::asio::deadline_timer(m_io_service, boost::posix_time::seconds(m_worktimeout));
+							//p_worktimer->async_wait(boost::bind(&EthStratumClientV2::work_timeout_handler, this, boost::asio::placeholders::error));
+						}
+					}
+				}
+			}
+		}
+		else if (method == "mining.set_difficulty" && m_protocol == STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+		{
+			params = responseObject.get("params", Json::Value::null);
+			if (params.isArray())
+			{
+				m_nextWorkDifficulty = params.get((Json::Value::ArrayIndex)0, 1).asDouble();
+				if (m_nextWorkDifficulty <= 0.0001) m_nextWorkDifficulty = 0.0001;
+				cnote << "Difficulty set to " << m_nextWorkDifficulty;
+			}
+		}
+		else if (method == "mining.set_extranonce" && m_protocol == STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+		{
+			params = responseObject.get("params", Json::Value::null);
+			if (params.isArray())
+			{
+				std::string enonce = params.get((Json::Value::ArrayIndex)0, "").asString();
+				processExtranonce(enonce);
+			}
+		}
+		else if (method == "client.get_version")
+		{
+			os << "{\"error\": null, \"id\" : " << id << ", \"result\" : \"" << ETH_PROJECT_VERSION << "\"}\n";
+			write(m_socket, m_requestBuffer);
+		}
+		break;
+	}
+
+}
+
+void EthStratumClientV2::work_timeout_handler(const boost::system::error_code& ec) {
+	if (!ec) {
+		cnote << "No new work received in" << m_worktimeout << "seconds.";
+		reconnect();
+	}
+}
+
+bool EthStratumClientV2::submit(EthashProofOfWork::Solution solution) {
+	x_current.lock();
+	EthashProofOfWork::WorkPackage tempWork(m_current);
+	string temp_job = m_job;
+	EthashProofOfWork::WorkPackage tempPreviousWork(m_previous);
+	string temp_previous_job = m_previousJob;
+	x_current.unlock();
+
+	cnote << "Solution found; Submitting to" << p_active->host << "...";
+
+	string minernonce;
+	if (m_protocol != STRATUM_PROTOCOL_ETHEREUMSTRATUM)
+		cnote << "  Nonce:" << "0x" + solution.nonce.hex();
+	else
+		minernonce = solution.nonce.hex().substr(m_extraNonceHexSize, 16 - m_extraNonceHexSize);
+
+
+	if (EthashAux::eval(tempWork.seedHash, tempWork.headerHash, solution.nonce).value < tempWork.boundary)
+	{
+		string json;
+		switch (m_protocol) {
+		case STRATUM_PROTOCOL_STRATUM:
+			json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_job + "\",\"0x" + solution.nonce.hex() + "\",\"0x" + tempWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+			break;
+		case STRATUM_PROTOCOL_ETHPROXY:
+			json = "{\"id\": 4, \"worker\":\"" + m_worker + "\", \"method\": \"eth_submitWork\", \"params\": [\"0x" + solution.nonce.hex() + "\",\"0x" + tempWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+			break;
+		case STRATUM_PROTOCOL_ETHEREUMSTRATUM:
+			json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_job + "\",\"" + minernonce + "\"]}\n";
+			break;
+		}
+		std::ostream os(&m_requestBuffer);
+		os << json;
+		m_stale = false;
+		write(m_socket, m_requestBuffer);
+		return true;
+	}
+	else if (EthashAux::eval(tempPreviousWork.seedHash, tempPreviousWork.headerHash, solution.nonce).value < tempPreviousWork.boundary)
+	{
+		string json;
+		switch (m_protocol) {
+		case STRATUM_PROTOCOL_STRATUM:
+			json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_previous_job + "\",\"0x" + solution.nonce.hex() + "\",\"0x" + tempPreviousWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+			break;
+		case STRATUM_PROTOCOL_ETHPROXY:
+			json = "{\"id\": 4, \"worker\":\"" + m_worker + "\", \"method\": \"eth_submitWork\", \"params\": [\"0x" + solution.nonce.hex() + "\",\"0x" + tempPreviousWork.headerHash.hex() + "\",\"0x" + solution.mixHash.hex() + "\"]}\n";
+			break;
+		case STRATUM_PROTOCOL_ETHEREUMSTRATUM:
+			json = "{\"id\": 4, \"method\": \"mining.submit\", \"params\": [\"" + p_active->user + "\",\"" + temp_previous_job + "\",\"" + minernonce + "\"]}\n";
+			break;
+		}		std::ostream os(&m_requestBuffer);
+		os << json;
+		m_stale = true;
+		cwarn << "Submitting stale solution.";
+		write(m_socket, m_requestBuffer);
+		return true;
+	}
+	else {
+		m_stale = false;
+		cwarn << "FAILURE: GPU gave incorrect result!";
+		p_farm->failedSolution();
+	}
+
+	return false;
+}
+
--- a/libstratum/EthStratumClientV2.h
+++ b/libstratum/EthStratumClientV2.h
@ -0,0 +1,95 @@
+#include <iostream>
+#include <boost/array.hpp>
+#include <boost/asio.hpp>
+#include <boost/bind.hpp>
+#include <json/json.h>
+#include <libdevcore/Log.h>
+#include <libdevcore/FixedHash.h>
+#include <libdevcore/Worker.h>
+#include <libethcore/Farm.h>
+#include <libethcore/EthashAux.h>
+#include <libethcore/Miner.h>
+
+#include "BuildInfo.h"
+
+
+using namespace std;
+using namespace boost::asio;
+using boost::asio::ip::tcp;
+using namespace dev;
+using namespace dev::eth;
+
+class EthStratumClientV2 : public Worker
+{
+public:
+	EthStratumClientV2(GenericFarm<EthashProofOfWork> * f, MinerType m, string const & host, string const & port, string const & user, string const & pass, int const & retries, int const & worktimeout, int const & protocol, string const & email);
+	~EthStratumClientV2();
+
+	void setFailover(string const & host, string const & port);
+	void setFailover(string const & host, string const & port, string const & user, string const & pass);
+
+	bool isRunning() { return m_running; }
+	bool isConnected() { return m_connected && m_authorized; }
+	h256 currentHeaderHash() { return m_current.headerHash; }
+	bool current() { return m_current; }
+	unsigned waitState() { return m_waitState; }
+	bool submit(EthashProofOfWork::Solution solution);
+	void reconnect();
+private:
+	void workLoop() override;
+	void connect();
+	
+	void disconnect();
+	void work_timeout_handler(const boost::system::error_code& ec);
+
+	void processReponse(Json::Value& responseObject);
+	
+	MinerType m_minerType;
+
+	cred_t * p_active;
+	cred_t m_primary;
+	cred_t m_failover;
+
+	string m_worker; // eth-proxy only;
+
+	bool m_authorized;
+	bool m_connected;
+	bool m_running = true;
+
+	int	m_retries = 0;
+	int	m_maxRetries;
+	int m_worktimeout = 60;
+
+	int m_waitState = MINER_WAIT_STATE_WORK;
+
+	string m_response;
+
+	GenericFarm<EthashProofOfWork> * p_farm;
+	mutex x_current;
+	EthashProofOfWork::WorkPackage m_current;
+	EthashProofOfWork::WorkPackage m_previous;
+
+	bool m_stale = false;
+
+	string m_job;
+	string m_previousJob;
+	EthashAux::FullType m_dag;
+
+	boost::asio::io_service m_io_service;
+	tcp::socket m_socket;
+
+	boost::asio::streambuf m_requestBuffer;
+	boost::asio::streambuf m_responseBuffer;
+
+	boost::asio::deadline_timer * p_worktimer;
+
+	int m_protocol;
+	string m_email;
+
+	double m_nextWorkDifficulty;
+
+	h64 m_extraNonce;
+	int m_extraNonceHexSize;
+
+	void processExtranonce(std::string& enonce);
+};
--- a/package.sh
+++ b/package.sh
@ -1,19 +0,0 @@
-#!/bin/bash
-
-opwd="$PWD"
-br=$(git branch | grep '\*' | sed 's/^..//')
-
-n=cpp-ethereum-src-$(date "+%Y%m%d%H%M%S" --date="1970-01-01 $(git log -1 --date=short --pretty=format:%ct) sec GMT")-$(grep "Version = " libdevcore/Common.cpp | sed 's/^[^"]*"//' | sed 's/".*$//')-$(git rev-parse HEAD | cut -c1-6)
-
-cd /tmp
-git clone "$opwd" $n
-cd $n
-git checkout $br
-rm -f package.sh
-cd ..
-tar c $n | bzip2 -- > $opwd/../${n}.tar.bz2
-rm -rf $n
-cd $opwd
-
-echo "SHA1(${n}.tar.bz2) = $(shasum $opwd/../${n}.tar.bz2 | cut -d' '  -f 1)"
-
--- a/releases/ethminer-0.9.41-genoil-1.0.1.zip
+++ b/releases/ethminer-0.9.41-genoil-1.0.1.zip
--- a/releases/ethminer-0.9.41-genoil-1.0.2.zip
+++ b/releases/ethminer-0.9.41-genoil-1.0.2.zip
--- a/releases/ethminer-0.9.41-genoil-1.0.3.zip
+++ b/releases/ethminer-0.9.41-genoil-1.0.3.zip
--- a/releases/ethminer-0.9.41-genoil-1.0.4b3.zip
+++ b/releases/ethminer-0.9.41-genoil-1.0.4b3.zip
--- a/releases/ethminer-0.9.41-genoil-1.0.5.zip
+++ b/releases/ethminer-0.9.41-genoil-1.0.5.zip
--- a/releases/ethminer-0.9.41-genoil-1.0.6.zip
+++ b/releases/ethminer-0.9.41-genoil-1.0.6.zip
--- a/releases/ethminer-0.9.41-genoil-1.0.7.zip
+++ b/releases/ethminer-0.9.41-genoil-1.0.7.zip
--- a/releases/ethminer-0.9.41-genoil-1.1.6.zip
+++ b/releases/ethminer-0.9.41-genoil-1.1.6.zip
--- a/releases/ethminer-0.9.41-genoil-1.1.7.zip
+++ b/releases/ethminer-0.9.41-genoil-1.1.7.zip