diff --git a/libethash-cl/ethash_cl_miner.cpp b/libethash-cl/ethash_cl_miner.cpp
index 28afc6e58..ec45be1a8 100644
--- a/libethash-cl/ethash_cl_miner.cpp
+++ b/libethash-cl/ethash_cl_miner.cpp
@@ -401,7 +401,6 @@ bool ethash_cl_miner::init(
 			ETHCL_LOG("Creating one big buffer for the DAG");
 			m_dagChunks.push_back(cl::Buffer(m_context, CL_MEM_READ_ONLY, _dagSize));
 			ETHCL_LOG("Loading single big chunk kernels");
-			m_hashKernel = cl::Kernel(program, "ethash_hash");
 			m_searchKernel = cl::Kernel(program, "ethash_search");
 			ETHCL_LOG("Mapping one big chunk.");
 			m_queue.enqueueWriteBuffer(m_dagChunks[0], CL_TRUE, 0, _dagSize, _dag);
@@ -410,46 +409,6 @@ bool ethash_cl_miner::init(
 		{
 			ETHCL_LOG("Allocating/mapping single buffer failed with: " << err.what() << "(" << err.err() << "). GPU can't allocate the DAG in a single chunk. Bailing.");
 			return false;
-#if 0		// Disabling chunking for release since it seems not to work. Never manages to mine a block. TODO: Fix when time is found.
-			int errCode = err.err();
-			if (errCode != CL_INVALID_BUFFER_SIZE || errCode != CL_MEM_OBJECT_ALLOCATION_FAILURE)
-				ETHCL_LOG("Allocating/mapping single buffer failed with: " << err.what() << "(" << errCode << ")");
-			cl_ulong result;
-			// if we fail midway on the try above make sure we start clean
-			m_dagChunks.clear();
-			device.getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &result);
-			ETHCL_LOG(
-				"Failed to allocate 1 big chunk. Max allocateable memory is "
-				<< result << ". Trying to allocate 4 chunks."
-			);
-			// The OpenCL kernel has a hard coded number of 4 chunks at the moment
-			m_dagChunksCount = 4;
-			for (unsigned i = 0; i < m_dagChunksCount; i++)
-			{
-				// TODO Note: If we ever change to _dagChunksNum other than 4, then the size would need recalculation
-				ETHCL_LOG("Creating buffer for chunk " << i);
-				m_dagChunks.push_back(cl::Buffer(
-					m_context,
-					CL_MEM_READ_ONLY,
-					(i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7
-				));
-			}
-			ETHCL_LOG("Loading chunk kernels");
-			m_hashKernel = cl::Kernel(program, "ethash_hash_chunks");
-			m_searchKernel = cl::Kernel(program, "ethash_search_chunks");
-			// TODO Note: If we ever change to _dagChunksNum other than 4, then the size would need recalculation
-			void* dag_ptr[4];
-			for (unsigned i = 0; i < m_dagChunksCount; i++)
-			{
-				ETHCL_LOG("Mapping chunk " << i);
-				dag_ptr[i] = m_queue.enqueueMapBuffer(m_dagChunks[i], true, m_openclOnePointOne ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
-			}
-			for (unsigned i = 0; i < m_dagChunksCount; i++)
-			{
-				memcpy(dag_ptr[i], (char *)_dag + i*((_dagSize >> 9) << 7), (i == 3) ? (_dagSize - 3 * ((_dagSize >> 9) << 7)) : (_dagSize >> 9) << 7);
-				m_queue.enqueueUnmapMemObject(m_dagChunks[i], dag_ptr[i]);
-			}
-#endif
 		}
 		// create buffer for header
 		ETHCL_LOG("Creating buffer for header.");
@@ -459,7 +418,6 @@ bool ethash_cl_miner::init(
 		for (unsigned i = 0; i != c_bufferCount; ++i)
 		{
 			ETHCL_LOG("Creating mining buffer " << i);
-			m_hashBuffer[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY | (!m_openclOnePointOne ? CL_MEM_HOST_READ_ONLY : 0), 32 * c_hashBatchSize);
 			m_searchBuffer[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_maxSearchResults + 1) * sizeof(uint32_t));
 		}
 	}
diff --git a/libethash-cl/ethash_cl_miner.h b/libethash-cl/ethash_cl_miner.h
index c51c38fb6..4f10ef754 100644
--- a/libethash-cl/ethash_cl_miner.h
+++ b/libethash-cl/ethash_cl_miner.h
@@ -79,12 +79,10 @@ private:
 
 	cl::Context m_context;
 	cl::CommandQueue m_queue;
-	cl::Kernel m_hashKernel;
 	cl::Kernel m_searchKernel;
 	unsigned int m_dagChunksCount;
 	std::vector<cl::Buffer> m_dagChunks;
 	cl::Buffer m_header;
-	cl::Buffer m_hashBuffer[c_bufferCount];
 	cl::Buffer m_searchBuffer[c_bufferCount];
 	unsigned m_globalWorkSize;
 	bool m_openclOnePointOne;
diff --git a/libethash-cl/ethash_cl_miner_kernel.cl b/libethash-cl/ethash_cl_miner_kernel.cl
index 8ea6df12d..191021ffd 100644
--- a/libethash-cl/ethash_cl_miner_kernel.cl
+++ b/libethash-cl/ethash_cl_miner_kernel.cl
@@ -112,17 +112,18 @@ static void keccak_f1600_round(uint2* a, uint r, uint out_size)
 
 	// Chi
 	a[0] = bitselect(b[0] ^ b[2], b[0], b[1]);
-	a[1] = bitselect(b[1] ^ b[3], b[1], b[2]);
-	a[2] = bitselect(b[2] ^ b[4], b[2], b[3]);
-	a[3] = bitselect(b[3] ^ b[0], b[3], b[4]);
-	if (out_size >= 4)
+	
+	if (out_size > 4)
 	{
+		a[1] = bitselect(b[1] ^ b[3], b[1], b[2]);
+		a[2] = bitselect(b[2] ^ b[4], b[2], b[3]);
+		a[3] = bitselect(b[3] ^ b[0], b[3], b[4]);
 		a[4] = bitselect(b[4] ^ b[1], b[4], b[0]);
 		a[5] = bitselect(b[5] ^ b[7], b[5], b[6]);
 		a[6] = bitselect(b[6] ^ b[8], b[6], b[7]);
 		a[7] = bitselect(b[7] ^ b[9], b[7], b[8]);
 		a[8] = bitselect(b[8] ^ b[5], b[8], b[9]);
-		if (out_size >= 8)
+		if (out_size > 8)
 		{
 			a[9] = bitselect(b[9] ^ b[6], b[9], b[5]);
 			a[10] = bitselect(b[10] ^ b[12], b[10], b[11]);
@@ -227,163 +228,13 @@ typedef union
 	uint4 uint4s[128 / sizeof(uint4)];
 } hash128_t;
 
-static hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
-{
-	hash64_t init;
-	uint const init_size = countof(init.ulongs);
-	uint const hash_size = countof(header->ulongs);
-
-	// sha3_512(header .. nonce)
-	ulong state[25];
-	copy(state, header->ulongs, hash_size);
-	state[hash_size] = nonce;
-	keccak_f1600_no_absorb(state, hash_size + 1, init_size, isolate);
-
-	copy(init.ulongs, state, init_size);
-	return init;
-}
-
-static uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate)
-{
-	uint4 mix = init;
-
-	// share init0
-	if (thread_id == 0)
-		*share = mix.x;
-	barrier(CLK_LOCAL_MEM_FENCE);
-	uint init0 = *share;
-
-	uint a = 0;
-	do
-	{
-		bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
-
-		#pragma unroll
-		for (uint i = 0; i != 4; ++i)
-		{
-			if (update_share)
-			{
-				uint m[4] = { mix.x, mix.y, mix.z, mix.w };
-				*share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
-			}
-			barrier(CLK_LOCAL_MEM_FENCE);
-
-			mix = fnv4(mix, *share>=3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] : *share>=DAG_SIZE / 2 ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] : *share>=DAG_SIZE / 4 ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id]:g_dag[*share].uint4s[thread_id]);
-		}
-	} while ((a += 4) != (ACCESSES & isolate));
-
-	return fnv_reduce(mix);
-}
-
-
-
-static uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
-{
-	uint4 mix = init;
-
-	// share init0
-	if (thread_id == 0)
-		*share = mix.x;
-	barrier(CLK_LOCAL_MEM_FENCE);
-	uint init0 = *share;
-
-	uint a = 0;
-	do
-	{
-		bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
-
-		#pragma unroll
-		for (uint i = 0; i != 4; ++i)
-		{
-			if (update_share)
-			{
-				uint m[4] = { mix.x, mix.y, mix.z, mix.w };
-				*share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
-			}
-			barrier(CLK_LOCAL_MEM_FENCE);
-
-			mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
-		}
-	}
-	while ((a += 4) != (ACCESSES & isolate));
-
-	return fnv_reduce(mix);
-}
-
-
-static hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
-{
-	ulong state[25];
-
-	hash32_t hash;
-	uint const hash_size = countof(hash.ulongs);
-	uint const init_size = countof(init->ulongs);
-	uint const mix_size = countof(mix->ulongs);
-
-	// keccak_256(keccak_512(header..nonce) .. mix);
-	copy(state, init->ulongs, init_size);
-	copy(state + init_size, mix->ulongs, mix_size);
-	keccak_f1600_no_absorb(state, init_size+mix_size, hash_size, isolate);
-
-	// copy out
-	copy(hash.ulongs, state, hash_size);
-	return hash;
-}
-
-static hash32_t compute_hash_simple(
-	__constant hash32_t const* g_header,
-	__global hash128_t const* g_dag,
-	ulong nonce,
-	uint isolate
-	)
-{
-	hash64_t init = init_hash(g_header, nonce, isolate);
-
-	hash128_t mix;
-	for (uint i = 0; i != countof(mix.uint4s); ++i)
-	{
-		mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)];
-	}
-
-	uint mix_val = mix.uints[0];
-	uint init0 = mix.uints[0];
-	uint a = 0;
-	do
-	{
-		uint pi = fnv(init0 ^ a, mix_val) % DAG_SIZE;
-		uint n = (a+1) % countof(mix.uints);
-
-		#pragma unroll
-		for (uint i = 0; i != countof(mix.uints); ++i)
-		{
-			mix.uints[i] = fnv(mix.uints[i], g_dag[pi].uints[i]);
-			mix_val = i == n ? mix.uints[i] : mix_val;
-		}
-	}
-	while (++a != (ACCESSES & isolate));
-
-	// reduce to output
-	hash32_t fnv_mix;
-	for (uint i = 0; i != countof(fnv_mix.uints); ++i)
-	{
-		fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]);
-	}
-
-	return final_hash(&init, &fnv_mix, isolate);
-}
-
-typedef union
-{
-	struct
-	{
-		hash64_t init;
-		uint pad; // avoid lds bank conflicts
-	};
-	hash32_t mix;
+typedef union {
+	uint4 uint4s[4];
+	ulong ulongs[8];
+	uint  uints[16];
 } compute_hash_share;
 
-
-static hash32_t compute_hash(
+static ulong compute_hash(
 	__local compute_hash_share* share,
 	__constant hash32_t const* g_header,
 	__global hash128_t const* g_dag,
@@ -394,132 +245,70 @@ static hash32_t compute_hash(
 	uint const gid = get_global_id(0);
 
 	// Compute one init hash per work item.
-	hash64_t init = init_hash(g_header, nonce, isolate);
+
+	// sha3_512(header .. nonce)
+	ulong state[25];
+	copy(state, g_header->ulongs, 4);
+	state[4] = nonce;
+	keccak_f1600_no_absorb(state, 5, 8, isolate);
+	 
 
 	// Threads work together in this phase in groups of 8.
-	uint const thread_id = gid % THREADS_PER_HASH;
-	uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+	uint const thread_id = gid & 7;
+	uint const hash_id = (gid & (GROUP_SIZE-1)) >> 3;
 
-	hash32_t mix;
 	uint i = 0;
 	do
 	{
 		// share init with other threads
 		if (i == thread_id)
-			share[hash_id].init = init;
-		barrier(CLK_LOCAL_MEM_FENCE);
-
-		uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
-		barrier(CLK_LOCAL_MEM_FENCE);
-
-		uint thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uints, g_dag, isolate);
+			copy(share[hash_id].uint4s, state, 4);
 
-		share[hash_id].mix.uints[thread_id] = thread_mix;
 		barrier(CLK_LOCAL_MEM_FENCE);
 
-		if (i == thread_id)
-			mix = share[hash_id].mix;
+		uint4 mix = share[hash_id].uint4s[thread_id & 3];
 		barrier(CLK_LOCAL_MEM_FENCE);
-	}
-	while (++i != (THREADS_PER_HASH & isolate));
-
-	return final_hash(&init, &mix, isolate);
-}
-
-
-static hash32_t compute_hash_chunks(
-	__local compute_hash_share* share,
-	__constant hash32_t const* g_header,
-	__global hash128_t const* g_dag,
-	__global hash128_t const* g_dag1,
-	__global hash128_t const* g_dag2,
-	__global hash128_t const* g_dag3,
-	ulong nonce,
-	uint isolate
-	)
-{
-	uint const gid = get_global_id(0);
 
-	// Compute one init hash per work item.
-	hash64_t init = init_hash(g_header, nonce, isolate);
-
-	// Threads work together in this phase in groups of 8.
-	uint const thread_id = gid % THREADS_PER_HASH;
-	uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+		__local uint *share0 = share[hash_id].uints;
 
-	hash32_t mix;
-	uint i = 0;
-	do
-	{
-		// share init with other threads
-		if (i == thread_id)
-			share[hash_id].init = init;
+		// share init0
+		if (thread_id == 0)
+			*share0 = mix.x;
 		barrier(CLK_LOCAL_MEM_FENCE);
+		uint init0 = *share0;
 
-		uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
-		barrier(CLK_LOCAL_MEM_FENCE);
+		uint a = 0;
+		do
+		{
+			bool update_share = thread_id == ((a >> 2) & (THREADS_PER_HASH - 1));
 
-		uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate);
+			#pragma unroll
+			for (uint i = 0; i != 4; ++i)
+			{
+				if (update_share)
+				{
+					*share0 = fnv(init0 ^ (a + i), ((uint *)&mix)[i]) % DAG_SIZE;
+				}
+				barrier(CLK_LOCAL_MEM_FENCE);
 
-		share[hash_id].mix.uints[thread_id] = thread_mix;
+				mix = fnv4(mix, g_dag[*share0].uint4s[thread_id]);
+			}
+		} while ((a += 4) != (ACCESSES & isolate));
+
+		share[hash_id].uints[thread_id] = fnv_reduce(mix);
 		barrier(CLK_LOCAL_MEM_FENCE);
 
 		if (i == thread_id)
-			mix = share[hash_id].mix;
+			copy(state + 8, share[hash_id].ulongs, 4);
+
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 	while (++i != (THREADS_PER_HASH & isolate));
 
-	return final_hash(&init, &mix, isolate);
-}
-
-__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
-__kernel void ethash_hash_simple(
-	__global hash32_t* g_hashes,
-	__constant hash32_t const* g_header,
-	__global hash128_t const* g_dag,
-	ulong start_nonce,
-	uint isolate
-	)
-{
-	uint const gid = get_global_id(0);
-	g_hashes[gid] = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
-}
-
-__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
-__kernel void ethash_search_simple(
-	__global volatile uint* restrict g_output,
-	__constant hash32_t const* g_header,
-	__global hash128_t const* g_dag,
-	ulong start_nonce,
-	ulong target,
-	uint isolate
-	)
-{
-	uint const gid = get_global_id(0);
-	hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
-
-	if (hash.ulongs[countof(hash.ulongs)-1] < target)
-	{
-		uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
-		g_output[slot] = gid;
-	}
-}
-
-
-__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
-__kernel void ethash_hash(
-	__global hash32_t* g_hashes,
-	__constant hash32_t const* g_header,
-	__global hash128_t const* g_dag,
-	ulong start_nonce,
-	uint isolate
-	)
-{
-	__local compute_hash_share share[HASHES_PER_LOOP];
+	// keccak_256(keccak_512(header..nonce) .. mix);
+	keccak_f1600_no_absorb(state, 12, 4, isolate);
 
-	uint const gid = get_global_id(0);
-	g_hashes[gid] = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+	return state[0];
 }
 
 __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
@@ -535,54 +324,11 @@ __kernel void ethash_search(
 	__local compute_hash_share share[HASHES_PER_LOOP];
 
 	uint const gid = get_global_id(0);
-	hash32_t hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+	ulong hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
 
-	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+	if (as_ulong(as_uchar8(hash).s76543210) < target)
 	{
 		uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
 		g_output[slot] = gid;
 	}
 }
-
-__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
-__kernel void ethash_hash_chunks(
-	__global hash32_t* g_hashes,
-	__constant hash32_t const* g_header,
-	__global hash128_t const* g_dag,
-	__global hash128_t const* g_dag1,
-	__global hash128_t const* g_dag2,
-	__global hash128_t const* g_dag3,
-	ulong start_nonce,
-	uint isolate
-	)
-{
-	__local compute_hash_share share[HASHES_PER_LOOP];
-
-	uint const gid = get_global_id(0);
-	g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3,start_nonce + gid, isolate);
-}
-
-__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
-__kernel void ethash_search_chunks(
-	__global volatile uint* restrict g_output,
-	__constant hash32_t const* g_header,
-	__global hash128_t const* g_dag,
-	__global hash128_t const* g_dag1,
-	__global hash128_t const* g_dag2,
-	__global hash128_t const* g_dag3,
-	ulong start_nonce,
-	ulong target,
-	uint isolate
-	)
-{
-	__local compute_hash_share share[HASHES_PER_LOOP];
-
-	uint const gid = get_global_id(0);
-	hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
-
-	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
-	{
-		uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
-		g_output[slot] = gid;
-	}
-}