Add more files.

10 years ago · 0b665270b6
6 changed files with 4982 additions and 0 deletions
--- a/libethash-cl/CMakeLists.txt
+++ b/libethash-cl/CMakeLists.txt
@ -0,0 +1,47 @@
+cmake_minimum_required(VERSION 2.8)
+
+set(LIBRARY ethash-cl)
+#set(CMAKE_BUILD_TYPE Release)
+
+include(bin2h.cmake)
+bin2h(SOURCE_FILE ethash_cl_miner_kernel.cl
+      VARIABLE_NAME ethash_cl_miner_kernel
+      HEADER_FILE ${CMAKE_CURRENT_BINARY_DIR}/ethash_cl_miner_kernel.h)
+
+if (NOT MSVC)
+	# Initialize CXXFLAGS for c++11
+	set(CMAKE_CXX_FLAGS                "-Wall -std=c++11")
+	set(CMAKE_CXX_FLAGS_DEBUG          "-O0 -g")
+	set(CMAKE_CXX_FLAGS_MINSIZEREL     "-Os -DNDEBUG")
+	set(CMAKE_CXX_FLAGS_RELEASE        "-O4 -DNDEBUG")
+	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g")
+
+    # Compiler-specific C++11 activation.
+    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+        execute_process(
+            COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+        if (NOT (GCC_VERSION VERSION_GREATER 4.7 OR GCC_VERSION VERSION_EQUAL 4.7))
+            message(FATAL_ERROR "${PROJECT_NAME} requires g++ 4.7 or greater.")
+        endif ()
+    elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
+    else ()
+        message(FATAL_ERROR "Your C++ compiler does not support C++11.")
+    endif ()
+endif()
+
+set(OpenCL_FOUND TRUE)
+set(OpenCL_INCLUDE_DIRS /usr/include/CL)
+set(OpenCL_LIBRARIES -lOpenCL)
+
+if (NOT OpenCL_FOUND)
+	find_package(OpenCL)
+endif()
+
+if (OpenCL_FOUND)
+	set(CMAKE_CXX_FLAGS "-std=c++11 -Wall -Wno-unknown-pragmas -Wextra -Werror -pedantic -fPIC ${CMAKE_CXX_FLAGS}")
+	include_directories(${OpenCL_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR})
+	include_directories(..)
+	add_library(${LIBRARY} ethash_cl_miner.cpp ethash_cl_miner.h cl.hpp)
+	TARGET_LINK_LIBRARIES(${LIBRARY} ${OpenCL_LIBRARIES} ethash)
+endif()
--- a/libethash-cl/bin2h.cmake
+++ b/libethash-cl/bin2h.cmake
@ -0,0 +1,86 @@
+# https://gist.github.com/sivachandran/3a0de157dccef822a230
+include(CMakeParseArguments)
+
+# Function to wrap a given string into multiple lines at the given column position.
+# Parameters:
+#   VARIABLE    - The name of the CMake variable holding the string.
+#   AT_COLUMN   - The column position at which string will be wrapped.
+function(WRAP_STRING)
+    set(oneValueArgs VARIABLE AT_COLUMN)
+    cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})
+
+    string(LENGTH ${${WRAP_STRING_VARIABLE}} stringLength)
+    math(EXPR offset "0")
+
+    while(stringLength GREATER 0)
+
+        if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
+            math(EXPR length "${WRAP_STRING_AT_COLUMN}")
+        else()
+            math(EXPR length "${stringLength}")
+        endif()
+
+        string(SUBSTRING ${${WRAP_STRING_VARIABLE}} ${offset} ${length} line)
+        set(lines "${lines}\n${line}")
+
+        math(EXPR stringLength "${stringLength} - ${length}")
+        math(EXPR offset "${offset} + ${length}")
+    endwhile()
+
+    set(${WRAP_STRING_VARIABLE} "${lines}" PARENT_SCOPE)
+endfunction()
+
+# Function to embed contents of a file as byte array in C/C++ header file(.h). The header file
+# will contain a byte array and integer variable holding the size of the array.
+# Parameters
+#   SOURCE_FILE     - The path of source file whose contents will be embedded in the header file.
+#   VARIABLE_NAME   - The name of the variable for the byte array. The string "_SIZE" will be append
+#                     to this name and will be used a variable name for size variable.
+#   HEADER_FILE     - The path of header file.
+#   APPEND          - If specified appends to the header file instead of overwriting it
+#   NULL_TERMINATE  - If specified a null byte(zero) will be append to the byte array. This will be
+#                     useful if the source file is a text file and we want to use the file contents
+#                     as string. But the size variable holds size of the byte array without this
+#                     null byte.
+# Usage:
+#   bin2h(SOURCE_FILE "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
+function(BIN2H)
+    set(options APPEND NULL_TERMINATE)
+    set(oneValueArgs SOURCE_FILE VARIABLE_NAME HEADER_FILE)
+    cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}" "" ${ARGN})
+
+    # reads source file contents as hex string
+    file(READ ${BIN2H_SOURCE_FILE} hexString HEX)
+    string(LENGTH ${hexString} hexStringLength)
+
+    # appends null byte if asked
+    if(BIN2H_NULL_TERMINATE)
+        set(hexString "${hexString}00")
+    endif()
+
+    # wraps the hex string into multiple lines at column 32(i.e. 16 bytes per line)
+    wrap_string(VARIABLE hexString AT_COLUMN 32)
+    math(EXPR arraySize "${hexStringLength} / 2")
+
+    # adds '0x' prefix and comma suffix before and after every byte respectively
+    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " arrayValues ${hexString})
+    # removes trailing comma
+    string(REGEX REPLACE ", $" "" arrayValues ${arrayValues})
+
+    # converts the variable name into proper C identifier
+    IF (${CMAKE_VERSION} GREATER 2.8.10) # fix for legacy cmake
+        string(MAKE_C_IDENTIFIER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
+    ENDIF()
+    string(TOUPPER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
+
+    # declares byte array and the length variables
+    set(arrayDefinition "const unsigned char ${BIN2H_VARIABLE_NAME}[] = { ${arrayValues} };")
+    set(arraySizeDefinition "const size_t ${BIN2H_VARIABLE_NAME}_SIZE = ${arraySize};")
+
+    set(declarations "${arrayDefinition}\n\n${arraySizeDefinition}\n\n")
+    if(BIN2H_APPEND)
+        file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
+    else()
+        file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
+    endif()
+endfunction()
--- a/libethash-cl/cl.hpp
+++ b/libethash-cl/cl.hpp
--- a/libethash-cl/ethash_cl_miner.cpp
+++ b/libethash-cl/ethash_cl_miner.cpp
@ -0,0 +1,332 @@
+/*
+  This file is part of c-ethash.
+
+  c-ethash is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  c-ethash is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with cpp-ethereum.  If not, see <http://www.gnu.org/licenses/>.
+*/
+/** @file ethash_cl_miner.cpp
+* @author Tim Hughes <tim@twistedfury.com>
+* @date 2015
+*/
+
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <cstdio>
+#include <cstdlib>
+#include <assert.h>
+#include <queue>
+#include <vector>
+#include <libethash/util.h>
+#include <libethash/ethash.h>
+#include "ethash_cl_miner.h"
+#include "ethash_cl_miner_kernel.h"
+
+#define ETHASH_BYTES 32
+
+// workaround lame platforms
+#if !CL_VERSION_1_2
+#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE
+#define CL_MEM_HOST_READ_ONLY 0
+#endif
+
+#undef min
+#undef max
+
+static void add_definition(std::string& source, char const* id, unsigned value)
+{
+	char buf[256];
+	sprintf(buf, "#define %s %uu\n", id, value);
+	source.insert(source.begin(), buf, buf + strlen(buf));
+}
+
+ethash_cl_miner::ethash_cl_miner()
+:	m_opencl_1_1()
+{
+}
+
+void ethash_cl_miner::finish()
+{
+	if (m_queue())
+	{
+		m_queue.finish();
+	}
+}
+
+bool ethash_cl_miner::init(ethash_params const& params, std::function<void(void*)> _fillDAG, unsigned workgroup_size)
+{
+	// store params
+	m_params = params;
+
+	// get all platforms
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+	if (platforms.empty())
+	{
+		debugf("No OpenCL platforms found.\n");
+		return false;
+	}
+
+	// use default platform
+	fprintf(stderr, "Using platform: %s\n", platforms[0].getInfo<CL_PLATFORM_NAME>().c_str());
+
+    // get GPU device of the default platform
+    std::vector<cl::Device> devices;
+    platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+    if (devices.empty())
+	{
+		debugf("No OpenCL devices found.\n");
+		return false;
+	}
+
+	// use default device
+	unsigned device_num = 0;
+	cl::Device& device = devices[device_num];
+	std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
+	fprintf(stderr, "Using device: %s (%s)\n", device.getInfo<CL_DEVICE_NAME>().c_str(),device_version.c_str());
+
+	if (strncmp("OpenCL 1.0", device_version.c_str(), 10) == 0)
+	{
+		debugf("OpenCL 1.0 is not supported.\n");
+		return false;
+	}
+	if (strncmp("OpenCL 1.1", device_version.c_str(), 10) == 0)
+	{
+		m_opencl_1_1 = true;
+	}
+
+	// create context
+	m_context = cl::Context(std::vector<cl::Device>(&device, &device + 1));
+	m_queue = cl::CommandQueue(m_context, device);
+
+	// use requested workgroup size, but we require multiple of 8
+	m_workgroup_size = ((workgroup_size + 7) / 8) * 8;
+
+	// patch source code
+	std::string code(ETHASH_CL_MINER_KERNEL, ETHASH_CL_MINER_KERNEL + ETHASH_CL_MINER_KERNEL_SIZE);
+	add_definition(code, "GROUP_SIZE", m_workgroup_size);
+	add_definition(code, "DAG_SIZE", (unsigned)(params.full_size / ETHASH_MIX_BYTES));
+	add_definition(code, "ACCESSES", ETHASH_ACCESSES);
+	add_definition(code, "MAX_OUTPUTS", c_max_search_results);
+	//debugf("%s", code.c_str());
+
+	// create miner OpenCL program
+	cl::Program::Sources sources;
+	sources.push_back({code.c_str(), code.size()});
+
+	cl::Program program(m_context, sources);
+	try
+	{
+		program.build({device});
+	}
+	catch (cl::Error err)
+	{
+		debugf("%s\n", program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str());
+		return false;
+	}
+	m_hash_kernel = cl::Kernel(program, "ethash_hash");
+	m_search_kernel = cl::Kernel(program, "ethash_search");
+
+	// create buffer for dag
+	m_dag = cl::Buffer(m_context, CL_MEM_READ_ONLY, params.full_size);
+	
+	// create buffer for header
+	m_header = cl::Buffer(m_context, CL_MEM_READ_ONLY, 32);
+
+	// compute dag on CPU
+	{
+		// if this throws then it's because we probably need to subdivide the dag uploads for compatibility
+		void* dag_ptr = m_queue.enqueueMapBuffer(m_dag, true, m_opencl_1_1 ? CL_MAP_WRITE : CL_MAP_WRITE_INVALIDATE_REGION, 0, params.full_size);
+		// memcpying 1GB: horrible... really. horrible. but necessary since we can't mmap *and* gpumap.
+		_fillDAG(dag_ptr);
+		m_queue.enqueueUnmapMemObject(m_dag, dag_ptr);
+	}
+
+	// create mining buffers
+	for (unsigned i = 0; i != c_num_buffers; ++i)
+	{
+		m_hash_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY | (!m_opencl_1_1 ? CL_MEM_HOST_READ_ONLY : 0), 32*c_hash_batch_size);
+		m_search_buf[i] = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, (c_max_search_results + 1) * sizeof(uint32_t));
+	}
+	return true;
+}
+
+void ethash_cl_miner::hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count)
+{
+	struct pending_batch
+	{
+		unsigned base;
+		unsigned count;
+		unsigned buf;
+	};
+	std::queue<pending_batch> pending;
+	
+	// update header constant buffer
+	m_queue.enqueueWriteBuffer(m_header, true, 0, 32, header);
+
+	/*
+	__kernel void ethash_combined_hash(
+		__global hash32_t* g_hashes,
+		__constant hash32_t const* g_header,
+		__global hash128_t const* g_dag,
+		ulong start_nonce,
+		uint isolate
+		)
+	*/
+	m_hash_kernel.setArg(1, m_header);
+	m_hash_kernel.setArg(2, m_dag);
+	m_hash_kernel.setArg(3, nonce);
+	m_hash_kernel.setArg(4, ~0u); // have to pass this to stop the compile unrolling the loop
+
+	unsigned buf = 0;
+	for (unsigned i = 0; i < count || !pending.empty(); )
+	{
+		// how many this batch
+		if (i < count)
+		{
+			unsigned const this_count = std::min<unsigned>(count - i, c_hash_batch_size);
+			unsigned const batch_count = std::max<unsigned>(this_count, m_workgroup_size);
+
+			// supply output hash buffer to kernel
+			m_hash_kernel.setArg(0, m_hash_buf[buf]);
+
+			// execute it!
+			m_queue.enqueueNDRangeKernel(
+				m_hash_kernel,
+				cl::NullRange,
+				cl::NDRange(batch_count),
+				cl::NDRange(m_workgroup_size)
+				);
+			m_queue.flush();
+		
+			pending.push({i, this_count, buf});
+			i += this_count;
+			buf = (buf + 1) % c_num_buffers;
+		}
+
+		// read results
+		if (i == count || pending.size() == c_num_buffers)
+		{
+			pending_batch const& batch = pending.front();
+
+			// could use pinned host pointer instead, but this path isn't that important.
+			uint8_t* hashes = (uint8_t*)m_queue.enqueueMapBuffer(m_hash_buf[batch.buf], true, CL_MAP_READ, 0, batch.count * ETHASH_BYTES);
+			memcpy(ret + batch.base*ETHASH_BYTES, hashes, batch.count*ETHASH_BYTES);
+			m_queue.enqueueUnmapMemObject(m_hash_buf[batch.buf], hashes);
+
+			pending.pop();
+		}
+	}
+}
+
+
+void ethash_cl_miner::search(uint8_t const* header, uint64_t target, search_hook& hook)
+{
+	struct pending_batch
+	{
+		uint64_t start_nonce;
+		unsigned buf;
+	};
+	std::queue<pending_batch> pending;
+
+	static uint32_t const c_zero = 0;
+
+	// update header constant buffer
+	m_queue.enqueueWriteBuffer(m_header, false, 0, 32, header);
+	for (unsigned i = 0; i != c_num_buffers; ++i)
+	{
+		m_queue.enqueueWriteBuffer(m_search_buf[i], false, 0, 4, &c_zero);
+	}
+
+#if CL_VERSION_1_2 && 0
+	cl::Event pre_return_event;
+	if (!m_opencl_1_1)
+	{
+		m_queue.enqueueBarrierWithWaitList(NULL, &pre_return_event);
+	}
+	else
+#endif
+	{
+		m_queue.finish();
+	}
+
+	/*
+	__kernel void ethash_combined_search(
+		__global hash32_t* g_hashes,			// 0
+		__constant hash32_t const* g_header,	// 1
+		__global hash128_t const* g_dag,		// 2
+		ulong start_nonce,						// 3
+		ulong target,							// 4
+		uint isolate							// 5
+	)
+	*/
+	m_search_kernel.setArg(1, m_header);
+	m_search_kernel.setArg(2, m_dag);
+
+	// pass these to stop the compiler unrolling the loops
+	m_search_kernel.setArg(4, target);
+	m_search_kernel.setArg(5, ~0u);
+
+
+	unsigned buf = 0;
+	for (uint64_t start_nonce = 0; ; start_nonce += c_search_batch_size)
+	{
+		// supply output buffer to kernel
+		m_search_kernel.setArg(0, m_search_buf[buf]);
+		m_search_kernel.setArg(3, start_nonce);
+
+		// execute it!
+		m_queue.enqueueNDRangeKernel(m_search_kernel, cl::NullRange, c_search_batch_size, m_workgroup_size);
+		
+		pending.push({start_nonce, buf});
+		buf = (buf + 1) % c_num_buffers;
+
+		// read results
+		if (pending.size() == c_num_buffers)
+		{
+			pending_batch const& batch = pending.front();
+
+			// could use pinned host pointer instead
+			uint32_t* results = (uint32_t*)m_queue.enqueueMapBuffer(m_search_buf[batch.buf], true, CL_MAP_READ, 0, (1+c_max_search_results) * sizeof(uint32_t));
+			unsigned num_found = std::min<unsigned>(results[0], c_max_search_results);
+
+			uint64_t nonces[c_max_search_results];
+			for (unsigned i = 0; i != num_found; ++i)
+			{
+				nonces[i] = batch.start_nonce + results[i+1];
+			}
+
+			m_queue.enqueueUnmapMemObject(m_search_buf[batch.buf], results);
+			
+			bool exit = num_found && hook.found(nonces, num_found);
+			exit |= hook.searched(batch.start_nonce, c_search_batch_size); // always report searched before exit
+			if (exit)
+				break;
+
+			// reset search buffer if we're still going
+			if (num_found)
+				m_queue.enqueueWriteBuffer(m_search_buf[batch.buf], true, 0, 4, &c_zero);
+
+			pending.pop();
+		}
+	}
+
+	// not safe to return until this is ready
+#if CL_VERSION_1_2 && 0
+	if (!m_opencl_1_1)
+	{
+		pre_return_event.wait();
+	}
+#endif
+}
+
--- a/libethash-cl/ethash_cl_miner.h
+++ b/libethash-cl/ethash_cl_miner.h
@ -0,0 +1,43 @@
+#pragma once
+
+#define __CL_ENABLE_EXCEPTIONS 
+#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#include "cl.hpp"
+#include <time.h>
+#include <functional>
+#include <libethash/ethash.h>
+
+class ethash_cl_miner
+{
+public:
+	struct search_hook
+	{
+		// reports progress, return true to abort
+		virtual bool found(uint64_t const* nonces, uint32_t count) = 0;
+		virtual bool searched(uint64_t start_nonce, uint32_t count) = 0;
+	};
+
+public:
+	ethash_cl_miner();
+
+	bool init(ethash_params const& params, std::function<void(void*)> _fillDAG, unsigned workgroup_size = 64);
+
+	void finish();
+	void hash(uint8_t* ret, uint8_t const* header, uint64_t nonce, unsigned count);
+	void search(uint8_t const* header, uint64_t target, search_hook& hook);
+
+private:
+	enum { c_max_search_results = 63, c_num_buffers = 2, c_hash_batch_size = 1024, c_search_batch_size = 1024*256 };
+
+	ethash_params m_params;
+	cl::Context m_context;
+	cl::CommandQueue m_queue;
+	cl::Kernel m_hash_kernel;
+	cl::Kernel m_search_kernel;
+	cl::Buffer m_dag;
+	cl::Buffer m_header;
+	cl::Buffer m_hash_buf[c_num_buffers];
+	cl::Buffer m_search_buf[c_num_buffers];
+	unsigned m_workgroup_size;
+	bool m_opencl_1_1;
+};
--- a/libethash-cl/ethash_cl_miner_kernel.cl
+++ b/libethash-cl/ethash_cl_miner_kernel.cl
@ -0,0 +1,460 @@
+// author Tim Hughes <tim@twistedfury.com>
+// Tested on Radeon HD 7850
+// Hashrate: 15940347 hashes/s
+// Bandwidth: 124533 MB/s
+// search kernel should fit in <= 84 VGPRS (3 wavefronts)
+
+#define THREADS_PER_HASH (128 / 16)
+#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
+
+#define FNV_PRIME	0x01000193
+
+__constant uint2 const Keccak_f1600_RC[24] = {
+	(uint2)(0x00000001, 0x00000000),
+	(uint2)(0x00008082, 0x00000000),
+	(uint2)(0x0000808a, 0x80000000),
+	(uint2)(0x80008000, 0x80000000),
+	(uint2)(0x0000808b, 0x00000000),
+	(uint2)(0x80000001, 0x00000000),
+	(uint2)(0x80008081, 0x80000000),
+	(uint2)(0x00008009, 0x80000000),
+	(uint2)(0x0000008a, 0x00000000),
+	(uint2)(0x00000088, 0x00000000),
+	(uint2)(0x80008009, 0x00000000),
+	(uint2)(0x8000000a, 0x00000000),
+	(uint2)(0x8000808b, 0x00000000),
+	(uint2)(0x0000008b, 0x80000000),
+	(uint2)(0x00008089, 0x80000000),
+	(uint2)(0x00008003, 0x80000000),
+	(uint2)(0x00008002, 0x80000000),
+	(uint2)(0x00000080, 0x80000000),
+	(uint2)(0x0000800a, 0x00000000),
+	(uint2)(0x8000000a, 0x80000000),
+	(uint2)(0x80008081, 0x80000000),
+	(uint2)(0x00008080, 0x80000000),
+	(uint2)(0x80000001, 0x00000000),
+	(uint2)(0x80008008, 0x80000000),
+};
+
+void keccak_f1600_round(uint2* a, uint r, uint out_size)
+{
+   #if !__ENDIAN_LITTLE__
+	for (uint i = 0; i != 25; ++i)
+		a[i] = a[i].yx;
+   #endif
+
+	uint2 b[25];
+	uint2 t;
+
+	// Theta
+	b[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20];
+	b[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21];
+	b[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22];
+	b[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23];
+	b[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24];
+	t = b[4] ^ (uint2)(b[1].x << 1 | b[1].y >> 31, b[1].y << 1 | b[1].x >> 31);
+	a[0] ^= t;
+	a[5] ^= t;
+	a[10] ^= t;
+	a[15] ^= t;
+	a[20] ^= t;
+	t = b[0] ^ (uint2)(b[2].x << 1 | b[2].y >> 31, b[2].y << 1 | b[2].x >> 31);
+	a[1] ^= t;
+	a[6] ^= t;
+	a[11] ^= t;
+	a[16] ^= t;
+	a[21] ^= t;
+	t = b[1] ^ (uint2)(b[3].x << 1 | b[3].y >> 31, b[3].y << 1 | b[3].x >> 31);
+	a[2] ^= t;
+	a[7] ^= t;
+	a[12] ^= t;
+	a[17] ^= t;
+	a[22] ^= t;
+	t = b[2] ^ (uint2)(b[4].x << 1 | b[4].y >> 31, b[4].y << 1 | b[4].x >> 31);
+	a[3] ^= t;
+	a[8] ^= t;
+	a[13] ^= t;
+	a[18] ^= t;
+	a[23] ^= t;
+	t = b[3] ^ (uint2)(b[0].x << 1 | b[0].y >> 31, b[0].y << 1 | b[0].x >> 31);
+	a[4] ^= t;
+	a[9] ^= t;
+	a[14] ^= t;
+	a[19] ^= t;
+	a[24] ^= t;
+
+	// Rho Pi
+	b[0] = a[0];
+	b[10] = (uint2)(a[1].x << 1 | a[1].y >> 31, a[1].y << 1 | a[1].x >> 31);
+	b[7] = (uint2)(a[10].x << 3 | a[10].y >> 29, a[10].y << 3 | a[10].x >> 29);
+	b[11] = (uint2)(a[7].x << 6 | a[7].y >> 26, a[7].y << 6 | a[7].x >> 26);
+	b[17] = (uint2)(a[11].x << 10 | a[11].y >> 22, a[11].y << 10 | a[11].x >> 22);
+	b[18] = (uint2)(a[17].x << 15 | a[17].y >> 17, a[17].y << 15 | a[17].x >> 17);
+	b[3] = (uint2)(a[18].x << 21 | a[18].y >> 11, a[18].y << 21 | a[18].x >> 11);
+	b[5] = (uint2)(a[3].x << 28 | a[3].y >> 4, a[3].y << 28 | a[3].x >> 4);
+	b[16] = (uint2)(a[5].y << 4 | a[5].x >> 28, a[5].x << 4 | a[5].y >> 28);
+	b[8] = (uint2)(a[16].y << 13 | a[16].x >> 19, a[16].x << 13 | a[16].y >> 19);
+	b[21] = (uint2)(a[8].y << 23 | a[8].x >> 9, a[8].x << 23 | a[8].y >> 9);
+	b[24] = (uint2)(a[21].x << 2 | a[21].y >> 30, a[21].y << 2 | a[21].x >> 30);
+	b[4] = (uint2)(a[24].x << 14 | a[24].y >> 18, a[24].y << 14 | a[24].x >> 18);
+	b[15] = (uint2)(a[4].x << 27 | a[4].y >> 5, a[4].y << 27 | a[4].x >> 5);
+	b[23] = (uint2)(a[15].y << 9 | a[15].x >> 23, a[15].x << 9 | a[15].y >> 23);
+	b[19] = (uint2)(a[23].y << 24 | a[23].x >> 8, a[23].x << 24 | a[23].y >> 8);
+	b[13] = (uint2)(a[19].x << 8 | a[19].y >> 24, a[19].y << 8 | a[19].x >> 24);
+	b[12] = (uint2)(a[13].x << 25 | a[13].y >> 7, a[13].y << 25 | a[13].x >> 7);
+	b[2] = (uint2)(a[12].y << 11 | a[12].x >> 21, a[12].x << 11 | a[12].y >> 21);
+	b[20] = (uint2)(a[2].y << 30 | a[2].x >> 2, a[2].x << 30 | a[2].y >> 2);
+	b[14] = (uint2)(a[20].x << 18 | a[20].y >> 14, a[20].y << 18 | a[20].x >> 14);
+	b[22] = (uint2)(a[14].y << 7 | a[14].x >> 25, a[14].x << 7 | a[14].y >> 25);
+	b[9] = (uint2)(a[22].y << 29 | a[22].x >> 3, a[22].x << 29 | a[22].y >> 3);
+	b[6] = (uint2)(a[9].x << 20 | a[9].y >> 12, a[9].y << 20 | a[9].x >> 12);
+	b[1] = (uint2)(a[6].y << 12 | a[6].x >> 20, a[6].x << 12 | a[6].y >> 20);
+
+	// Chi
+	a[0] = bitselect(b[0] ^ b[2], b[0], b[1]);
+	a[1] = bitselect(b[1] ^ b[3], b[1], b[2]);
+	a[2] = bitselect(b[2] ^ b[4], b[2], b[3]);
+	a[3] = bitselect(b[3] ^ b[0], b[3], b[4]);
+	if (out_size >= 4)
+	{
+		a[4] = bitselect(b[4] ^ b[1], b[4], b[0]);
+		a[5] = bitselect(b[5] ^ b[7], b[5], b[6]);
+		a[6] = bitselect(b[6] ^ b[8], b[6], b[7]);
+		a[7] = bitselect(b[7] ^ b[9], b[7], b[8]);
+		a[8] = bitselect(b[8] ^ b[5], b[8], b[9]);
+		if (out_size >= 8)
+		{
+			a[9] = bitselect(b[9] ^ b[6], b[9], b[5]);
+			a[10] = bitselect(b[10] ^ b[12], b[10], b[11]);
+			a[11] = bitselect(b[11] ^ b[13], b[11], b[12]);
+			a[12] = bitselect(b[12] ^ b[14], b[12], b[13]);
+			a[13] = bitselect(b[13] ^ b[10], b[13], b[14]);
+			a[14] = bitselect(b[14] ^ b[11], b[14], b[10]);
+			a[15] = bitselect(b[15] ^ b[17], b[15], b[16]);
+			a[16] = bitselect(b[16] ^ b[18], b[16], b[17]);
+			a[17] = bitselect(b[17] ^ b[19], b[17], b[18]);
+			a[18] = bitselect(b[18] ^ b[15], b[18], b[19]);
+			a[19] = bitselect(b[19] ^ b[16], b[19], b[15]);
+			a[20] = bitselect(b[20] ^ b[22], b[20], b[21]);
+			a[21] = bitselect(b[21] ^ b[23], b[21], b[22]);
+			a[22] = bitselect(b[22] ^ b[24], b[22], b[23]);
+			a[23] = bitselect(b[23] ^ b[20], b[23], b[24]);
+			a[24] = bitselect(b[24] ^ b[21], b[24], b[20]);
+		}
+	}
+
+	// Iota
+	a[0] ^= Keccak_f1600_RC[r];
+
+   #if !__ENDIAN_LITTLE__
+	for (uint i = 0; i != 25; ++i)
+		a[i] = a[i].yx;
+   #endif
+}
+
+void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint isolate)
+{
+	for (uint i = in_size; i != 25; ++i)
+	{
+		a[i] = 0;
+	}
+#if __ENDIAN_LITTLE__
+	a[in_size] ^= 0x0000000000000001;
+	a[24-out_size*2] ^= 0x8000000000000000;
+#else
+	a[in_size] ^= 0x0100000000000000;
+	a[24-out_size*2] ^= 0x0000000000000080;
+#endif
+
+	// Originally I unrolled the first and last rounds to interface
+	// better with surrounding code, however I haven't done this
+	// without causing the AMD compiler to blow up the VGPR usage.
+	uint r = 0;
+	do
+	{
+		// This dynamic branch stops the AMD compiler unrolling the loop
+		// and additionally saves about 33% of the VGPRs, enough to gain another
+		// wavefront. Ideally we'd get 4 in flight, but 3 is the best I can
+		// massage out of the compiler. It doesn't really seem to matter how
+		// much we try and help the compiler save VGPRs because it seems to throw
+		// that information away, hence the implementation of keccak here
+		// doesn't bother.
+		if (isolate) 
+		{
+			keccak_f1600_round((uint2*)a, r++, 25);
+		}
+	}
+	while (r < 23);
+	
+	// final round optimised for digest size
+	keccak_f1600_round((uint2*)a, r++, out_size);
+}
+
+#define copy(dst, src, count) for (uint i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
+
+#define countof(x) (sizeof(x) / sizeof(x[0]))
+
+uint fnv(uint x, uint y)
+{
+	return x * FNV_PRIME ^ y;
+}
+
+uint4 fnv4(uint4 x, uint4 y)
+{
+	return x * FNV_PRIME ^ y;
+}
+
+uint fnv_reduce(uint4 v)
+{
+	return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
+}
+
+typedef union
+{
+	ulong ulongs[32 / sizeof(ulong)];
+	uint uints[32 / sizeof(uint)];
+} hash32_t;
+
+typedef union
+{
+	ulong ulongs[64 / sizeof(ulong)];
+	uint4 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union
+{
+	uint uints[128 / sizeof(uint)];
+	uint4 uint4s[128 / sizeof(uint4)];
+} hash128_t;
+
+hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
+{
+	hash64_t init;
+	uint const init_size = countof(init.ulongs);
+	uint const hash_size = countof(header->ulongs);
+	
+	// sha3_512(header .. nonce)
+	ulong state[25];
+	copy(state, header->ulongs, hash_size);
+	state[hash_size] = nonce;
+	keccak_f1600_no_absorb(state, hash_size + 1, init_size, isolate);
+
+	copy(init.ulongs, state, init_size);
+	return init;
+}
+
+uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
+{
+	uint4 mix = init;
+
+	// share init0
+	if (thread_id == 0)
+		*share = mix.x;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	uint init0 = *share;
+
+	uint a = 0;
+	do
+	{
+		bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+		#pragma unroll
+		for (uint i = 0; i != 4; ++i)
+		{
+			if (update_share)
+			{
+				uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+				*share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+		}
+	}
+	while ((a += 4) != (ACCESSES & isolate));
+
+	return fnv_reduce(mix);
+}
+
+hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
+{
+	ulong state[25];
+
+	hash32_t hash;
+	uint const hash_size = countof(hash.ulongs);
+	uint const init_size = countof(init->ulongs);
+	uint const mix_size = countof(mix->ulongs);
+
+	// keccak_256(keccak_512(header..nonce) .. mix);
+	copy(state, init->ulongs, init_size);
+	copy(state + init_size, mix->ulongs, mix_size);
+	keccak_f1600_no_absorb(state, init_size+mix_size, hash_size, isolate);
+
+	// copy out
+	copy(hash.ulongs, state, hash_size);
+	return hash;
+}
+
+hash32_t compute_hash_simple(
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong nonce,
+	uint isolate
+	)
+{
+	hash64_t init = init_hash(g_header, nonce, isolate);
+
+	hash128_t mix;
+	for (uint i = 0; i != countof(mix.uint4s); ++i)
+	{
+		mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)];
+	}
+	
+	uint mix_val = mix.uints[0];
+	uint init0 = mix.uints[0];
+	uint a = 0;
+	do
+	{
+		uint pi = fnv(init0 ^ a, mix_val) % DAG_SIZE;
+		uint n = (a+1) % countof(mix.uints);
+
+		#pragma unroll
+		for (uint i = 0; i != countof(mix.uints); ++i)
+		{
+			mix.uints[i] = fnv(mix.uints[i], g_dag[pi].uints[i]);
+			mix_val = i == n ? mix.uints[i] : mix_val;
+		}
+	}
+	while (++a != (ACCESSES & isolate));
+
+	// reduce to output
+	hash32_t fnv_mix;
+	for (uint i = 0; i != countof(fnv_mix.uints); ++i)
+	{
+		fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]);
+	}
+	
+	return final_hash(&init, &fnv_mix, isolate);
+}
+
+typedef union
+{
+	struct
+	{
+		hash64_t init;
+		uint pad; // avoid lds bank conflicts
+	};
+	hash32_t mix;
+} compute_hash_share;
+
+hash32_t compute_hash(
+	__local compute_hash_share* share,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong nonce,
+	uint isolate
+	)
+{
+	uint const gid = get_global_id(0);
+
+	// Compute one init hash per work item.
+	hash64_t init = init_hash(g_header, nonce, isolate);
+
+	// Threads work together in this phase in groups of 8.
+	uint const thread_id = gid % THREADS_PER_HASH;
+	uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+
+	hash32_t mix;
+	uint i = 0;
+	do
+	{
+		// share init with other threads
+		if (i == thread_id)
+			share[hash_id].init = init;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uints, g_dag, isolate);
+
+		share[hash_id].mix.uints[thread_id] = thread_mix;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if (i == thread_id)
+			mix = share[hash_id].mix;
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	while (++i != (THREADS_PER_HASH & isolate));
+
+	return final_hash(&init, &mix, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_simple(
+	__global hash32_t* g_hashes,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong start_nonce,
+	uint isolate
+	)
+{
+	uint const gid = get_global_id(0);
+	g_hashes[gid] = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_simple(
+	__global volatile uint* restrict g_output,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong start_nonce,
+	ulong target,
+	uint isolate
+	)
+{
+	uint const gid = get_global_id(0);
+	hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
+	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+	{
+		uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
+		g_output[slot] = gid;
+	}
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash(
+	__global hash32_t* g_hashes,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong start_nonce,
+	uint isolate
+	)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP];
+
+	uint const gid = get_global_id(0);
+	g_hashes[gid] = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search(
+	__global volatile uint* restrict g_output,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong start_nonce,
+	ulong target,
+	uint isolate
+	)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP];
+
+	uint const gid = get_global_id(0);
+	hash32_t hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+
+	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+	{
+		uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
+		g_output[slot] = gid;
+	}
+}