diff --git a/libethash-cl/ethash_cl_miner.cpp b/libethash-cl/ethash_cl_miner.cpp
index 44ae5efe7..201bfce53 100644
--- a/libethash-cl/ethash_cl_miner.cpp
+++ b/libethash-cl/ethash_cl_miner.cpp
@@ -42,6 +42,10 @@
 
 #define ETHASH_BYTES 32
 
+#define OPENCL_PLATFORM_UNKNOWN 0
+#define OPENCL_PLATFORM_NVIDIA  1
+#define OPENCL_PLATFORM_AMD		2
+
 // workaround lame platforms
 #if !CL_VERSION_1_2
 #define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE
@@ -331,14 +335,14 @@ bool ethash_cl_miner::init(
 		string platformName = platforms[_platformId].getInfo<CL_PLATFORM_NAME>();
 		ETHCL_LOG("Using platform: " << platformName.c_str());
 
-		int platformId = 0;
+		int platformId = OPENCL_PLATFORM_UNKNOWN;
 		if (platformName == "NVIDIA CUDA")
 		{
-			platformId = 1;
+			platformId = OPENCL_PLATFORM_NVIDIA;
 		}
 		else if (platformName == "AMD Accelerated Parallel Processing")
 		{
-			platformId = 2;
+			platformId = OPENCL_PLATFORM_AMD;
 		}
 		// get GPU device of the default platform
 		vector<cl::Device> devices = getDevices(platforms, _platformId);
@@ -361,6 +365,21 @@ bool ethash_cl_miner::init(
 		if (strncmp("OpenCL 1.1", device_version.c_str(), 10) == 0)
 			m_openclOnePointOne = true;
 
+
+		char options[256];
+		int computeCapability = 0;
+		if (platformId == OPENCL_PLATFORM_NVIDIA) {
+			cl_uint computeCapabilityMajor;
+			cl_uint computeCapabilityMinor;
+			clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
+			clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &computeCapabilityMinor, NULL);
+
+			computeCapability = computeCapabilityMajor * 10 + computeCapabilityMinor;
+			int maxregs = computeCapability >= 35 ? 72 : 63;
+			//printf("-cl-nv-maxrregcount=%d -cl-nv-arch sm_%d -cl-nv-verbose", maxregs, computeCapability);
+			sprintf(options, "-cl-nv-verbose -cl-nv-maxrregcount=%d", maxregs);// , computeCapability);
+		}
+
 		// create context
 		m_context = cl::Context(vector<cl::Device>(&device, &device + 1));
 		m_queue = cl::CommandQueue(m_context, device);
@@ -379,9 +398,12 @@ bool ethash_cl_miner::init(
 		addDefinition(code, "ACCESSES", ETHASH_ACCESSES);
 		addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
 		addDefinition(code, "PLATFORM", platformId);
+		addDefinition(code, "COMPUTE", computeCapability);
 
 		//debugf("%s", code.c_str());
 
+
+		
 		// create miner OpenCL program
 		cl::Program::Sources sources;
 		sources.push_back({ code.c_str(), code.size() });
@@ -389,7 +411,7 @@ bool ethash_cl_miner::init(
 		cl::Program program(m_context, sources);
 		try
 		{
-			program.build({ device });
+			program.build({ device }, options);
 			ETHCL_LOG("Printing program log");
 			ETHCL_LOG(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str());
 		}
diff --git a/libethash-cl/ethash_cl_miner_kernel.cl b/libethash-cl/ethash_cl_miner_kernel.cl
index c1eed75c6..77bb2d2b8 100644
--- a/libethash-cl/ethash_cl_miner_kernel.cl
+++ b/libethash-cl/ethash_cl_miner_kernel.cl
@@ -1,8 +1,7 @@
-// author Tim Hughes <tim@twistedfury.com>
-// Tested on Radeon HD 7850
-// Hashrate: 15940347 hashes/s
-// Bandwidth: 124533 MB/s
-// search kernel should fit in <= 84 VGPRS (3 wavefronts)
+#define OPENCL_PLATFORM_UNKNOWN 0
+#define OPENCL_PLATFORM_NVIDIA  1
+#define OPENCL_PLATFORM_AMD		2
+
 
 #define THREADS_PER_HASH (128 / 16)
 #define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
@@ -36,7 +35,7 @@ __constant uint2 const Keccak_f1600_RC[24] = {
 	(uint2)(0x80008008, 0x80000000),
 };
 
-#if PLATFORM == 1 // CUDA
+#if PLATFORM == OPENCL_PLATFORM_NVIDIA && COMPUTE >= 35
 static uint2 ROL2(const uint2 a, const int offset) {
 	uint2 result;
 	if (offset >= 32) {
@@ -49,7 +48,7 @@ static uint2 ROL2(const uint2 a, const int offset) {
 	}
 	return result;
 }
-#elif PLATFORM == 2 // APP
+#elif PLATFORM == OPENCL_PLATFORM_AMD
 #pragma OPENCL EXTENSION cl_amd_media_ops : enable
 static uint2 ROL2(const uint2 vv, const int r)
 {
@@ -259,7 +258,9 @@ typedef union {
 	uint  uints[16];
 } compute_hash_share;
 
+#if PLATFORM != OPENCL_PLATFORM_NVIDIA // use maxrregs on nv
 __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+#endif
 __kernel void ethash_search(
 	__global volatile uint* restrict g_output,
 	__constant hash32_t const* g_header,