diff --git a/CMakeLists.txt b/CMakeLists.txt
index e87d6ac18..4dbca5af7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,7 @@
 cmake_minimum_required(VERSION 2.8.12)
 
 set(PROJECT_VERSION "0.9.41")
-set(GENOIL_VERSION "1.0.6")
+set(GENOIL_VERSION "1.0.7")
 if (${CMAKE_VERSION} VERSION_GREATER 3.0)
 	cmake_policy(SET CMP0042 OLD) 	# fix MACOSX_RPATH
 	cmake_policy(SET CMP0048 NEW) 	# allow VERSION argument in project()
diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h
index 58cca14e1..a5b97e78e 100644
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@@ -835,7 +835,7 @@ private:
 			f.start("cuda");
 		EthashProofOfWork::WorkPackage current;
 		EthashAux::FullType dag;
-		while (true)
+		while (m_running)
 			try
 			{
 				bool completed = false;
@@ -909,13 +909,20 @@ private:
 			}
 			catch (jsonrpc::JsonRpcException&)
 			{
-				for (auto i = 3; --i; this_thread::sleep_for(chrono::seconds(1)))
-					cerr << "JSON-RPC problem. Probably couldn't connect. Retrying in " << i << "... \r";
-				cerr << endl;
+				if (m_maxFarmRetries > 0)
+				{
+					for (auto i = 3; --i; this_thread::sleep_for(chrono::seconds(1)))
+						cerr << "JSON-RPC problem. Probably couldn't connect. Retrying in " << i << "... \r";
+					cerr << endl;
+				}
+				else
+				{
+					cerr << "JSON-RPC problem. Probably couldn't connect." << endl;
+				}
 				if (m_farmFailOverURL != "")
 				{
 					m_farmRetries++;
-					if (m_farmRetries == m_maxFarmRetries)
+					if (m_farmRetries >= m_maxFarmRetries)
 					{
 						if (_remote == m_farmURL) {
 							_remote = m_farmFailOverURL;
@@ -927,6 +934,10 @@ private:
 						}
 						m_farmRetries = 0;
 					}
+					if (_remote == "exit")
+					{
+						m_running = false;
+					}
 				}
 			}
 #endif
@@ -976,6 +987,7 @@ private:
 	DAGEraseMode m_eraseMode = DAGEraseMode::None;
 
 	/// Mining options
+	bool m_running = true;
 	MinerType m_minerType = MinerType::CPU;
 	unsigned m_openclPlatform = 0;
 	unsigned m_openclDevice = 0;
diff --git a/libethash-cl/ethash_cl_miner_kernel.cl b/libethash-cl/ethash_cl_miner_kernel.cl
index 2bb9a1c59..29ae6a487 100644
--- a/libethash-cl/ethash_cl_miner_kernel.cl
+++ b/libethash-cl/ethash_cl_miner_kernel.cl
@@ -1,6 +1,6 @@
 #define OPENCL_PLATFORM_UNKNOWN 0
 #define OPENCL_PLATFORM_NVIDIA  1
-#define OPENCL_PLATFORM_AMD		2
+#define OPENCL_PLATFORM_AMD			2
 
 
 #define THREADS_PER_HASH (128 / 16)
@@ -79,10 +79,19 @@ static uint2 ROL2(const uint2 v, const int n)
 }
 #endif
 
+static void chi(uint2 * a, const uint n, const uint2 * t) // Chi step for one group of five lanes: reads t[n..n+4], writes a[n..n+4].
+{ // NOTE(review): later lines read t[] at indices already written in a[], so a and t are presumably never the same array - confirm at call sites.
+	a[n+0] = bitselect(t[n + 0] ^ t[n + 2], t[n + 0], t[n + 1]); // per bit: where t[n+1] is set take t[n+0], otherwise t[n+0] ^ t[n+2]
+	a[n+1] = bitselect(t[n + 1] ^ t[n + 3], t[n + 1], t[n + 2]); // same pattern shifted by one lane: selector is the next lane, XOR partner is two lanes ahead
+	a[n+2] = bitselect(t[n + 2] ^ t[n + 4], t[n + 2], t[n + 3]); // last lane whose selector and partner both stay inside n..n+4 without wrapping
+	a[n+3] = bitselect(t[n + 3] ^ t[n + 0], t[n + 3], t[n + 4]); // XOR partner wraps around to the first lane of the group (n+0)
+	a[n+4] = bitselect(t[n + 4] ^ t[n + 1], t[n + 4], t[n + 0]); // selector wraps to n+0 and XOR partner to n+1
+}
+
 static void keccak_f1600_round(uint2* a, uint r)
 {
 	uint2 t[25];
-	uint2 u, v;
+	uint2 u;
 
 	// Theta
 	t[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20];
@@ -122,66 +131,47 @@ static void keccak_f1600_round(uint2* a, uint r)
 	a[24] ^= u;
 
 	// Rho Pi
-	u = a[1];
-	t[0] = a[0];
-	t[1] = ROL2(a[6], 44);
-	t[6] = ROL2(a[9], 20);
-	t[9] = ROL2(a[22], 61);
-	t[22] = ROL2(a[14], 39);
-	t[14] = ROL2(a[20], 18);
+
+	t[0]  = a[0];
+	t[10] = ROL2(a[1], 1);
 	t[20] = ROL2(a[2], 62);
-	t[2] = ROL2(a[12], 43);
-	t[12] = ROL2(a[13], 25);
-	t[13] = ROL2(a[19], 8);
-	t[19] = ROL2(a[23], 56);
-	t[23] = ROL2(a[15], 41);
+	t[5]  = ROL2(a[3], 28);
 	t[15] = ROL2(a[4], 27);
-	t[4] = ROL2(a[24], 14);
-	t[24] = ROL2(a[21], 2);
-	t[21] = ROL2(a[8], 55);
-	t[8] = ROL2(a[16], 45);
+	
 	t[16] = ROL2(a[5], 36);
-	t[5] = ROL2(a[3], 28);
-	t[3] = ROL2(a[18], 21);
-	t[18] = ROL2(a[17], 15);
+	t[1]  = ROL2(a[6], 44);
+  t[11] = ROL2(a[7], 6);
+	t[21] = ROL2(a[8], 55);
+	t[6]  = ROL2(a[9], 20);
+	
+	t[7]  = ROL2(a[10], 3);
 	t[17] = ROL2(a[11], 10);
-	t[11] = ROL2(a[7], 6);
-	t[7] = ROL2(a[10], 3);
-	t[10] = ROL2(u, 1);
+	t[2]  = ROL2(a[12], 43);
+	t[12] = ROL2(a[13], 25);
+	t[22] = ROL2(a[14], 39);
+	
+	t[23] = ROL2(a[15], 41);
+	t[8]  = ROL2(a[16], 45);
+	t[18] = ROL2(a[17], 15);
+	t[3]  = ROL2(a[18], 21);
+	t[13] = ROL2(a[19], 8);
+	
+	t[14] = ROL2(a[20], 18);
+	t[24] = ROL2(a[21], 2);
+	t[9]  = ROL2(a[22], 61);
+	t[19] = ROL2(a[23], 56);
+	t[4]  = ROL2(a[24], 14);
 
 	// Chi
-	a[0] = bitselect(t[0] ^ t[2], t[0], t[1]);
-	a[1] = bitselect(t[1] ^ t[3], t[1], t[2]);
-	a[2] = bitselect(t[2] ^ t[4], t[2], t[3]);
-	a[3] = bitselect(t[3] ^ t[0], t[3], t[4]);
-	a[4] = bitselect(t[4] ^ t[1], t[4], t[0]);
+	chi(a, 0, t);
 
 	// Iota
 	a[0] ^= Keccak_f1600_RC[r];
 
-	a[5] = bitselect(t[5] ^ t[7], t[5], t[6]);
-	a[6] = bitselect(t[6] ^ t[8], t[6], t[7]);
-	a[7] = bitselect(t[7] ^ t[9], t[7], t[8]);
-	a[8] = bitselect(t[8] ^ t[5], t[8], t[9]);
-	a[9] = bitselect(t[9] ^ t[6], t[9], t[5]);
-
-	a[10] = bitselect(t[10] ^ t[12], t[10], t[11]);
-	a[11] = bitselect(t[11] ^ t[13], t[11], t[12]);
-	a[12] = bitselect(t[12] ^ t[14], t[12], t[13]);
-	a[13] = bitselect(t[13] ^ t[10], t[13], t[14]);
-	a[14] = bitselect(t[14] ^ t[11], t[14], t[10]);
-
-	a[15] = bitselect(t[15] ^ t[17], t[15], t[16]);
-	a[16] = bitselect(t[16] ^ t[18], t[16], t[17]);
-	a[17] = bitselect(t[17] ^ t[19], t[17], t[18]);
-	a[18] = bitselect(t[18] ^ t[15], t[18], t[19]);
-	a[19] = bitselect(t[19] ^ t[16], t[19], t[15]);
-
-	a[20] = bitselect(t[20] ^ t[22], t[20], t[21]);
-	a[21] = bitselect(t[21] ^ t[23], t[21], t[22]);
-	a[22] = bitselect(t[22] ^ t[24], t[22], t[23]);
-	a[23] = bitselect(t[23] ^ t[20], t[23], t[24]);
-	a[24] = bitselect(t[24] ^ t[21], t[24], t[20]);
+	chi(a, 5, t);
+	chi(a, 10, t);
+	chi(a, 15, t);
+	chi(a, 20, t);
 }
 
 static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate)
@@ -192,9 +182,9 @@ static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate)
 	// better with surrounding code, however I haven't done this
 	// without causing the AMD compiler to blow up the VGPR usage.
 
-	uint r = 0;
-	uint o = 25;
-	do
+	
+	//uint o = 25;
+	for (uint r = 0; r < 24;)
 	{
 		// This dynamic branch stops the AMD compiler unrolling the loop
 		// and additionally saves about 33% of the VGPRs, enough to gain another
@@ -206,10 +196,10 @@ static void keccak_f1600_no_absorb(uint2* a, uint out_size, uint isolate)
 		if (isolate)
 		{
 			keccak_f1600_round(a, r++);
-			if (r == 23) o = out_size;
+			//if (r == 23) o = out_size;
 		}
 	} 
-	while (r < 24);
+	
 
 	// final round optimised for digest size
 	//keccak_f1600_round(a, 23, out_size);
diff --git a/libethcore/EthashCUDAMiner.cpp b/libethcore/EthashCUDAMiner.cpp
index 65777efca..3438d1d10 100644
--- a/libethcore/EthashCUDAMiner.cpp
+++ b/libethcore/EthashCUDAMiner.cpp
@@ -219,11 +219,13 @@ bool EthashCUDAMiner::configureGPU(
 	uint64_t _currentBlock
 	)
 {
+	
 	if (_blockSize != 32 && _blockSize != 64 && _blockSize != 128)
 	{
 		cout << "Given localWorkSize of " << toString(_blockSize) << "is invalid. Must be either 32,64 or 128" << endl;
 		return false;
 	}
+	
 	if (!ethash_cuda_miner::configureGPU(
 		s_devices,
 		_blockSize,