diff --git a/libethash-cl/ethash_cl_miner.cpp b/libethash-cl/ethash_cl_miner.cpp index 44ae5efe7..201bfce53 100644 --- a/libethash-cl/ethash_cl_miner.cpp +++ b/libethash-cl/ethash_cl_miner.cpp @@ -42,6 +42,10 @@ #define ETHASH_BYTES 32 +#define OPENCL_PLATFORM_UNKNOWN 0 +#define OPENCL_PLATFORM_NVIDIA 1 +#define OPENCL_PLATFORM_AMD 2 + // workaround lame platforms #if !CL_VERSION_1_2 #define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE @@ -331,14 +335,14 @@ bool ethash_cl_miner::init( string platformName = platforms[_platformId].getInfo(); ETHCL_LOG("Using platform: " << platformName.c_str()); - int platformId = 0; + int platformId = OPENCL_PLATFORM_UNKNOWN; if (platformName == "NVIDIA CUDA") { - platformId = 1; + platformId = OPENCL_PLATFORM_NVIDIA; } else if (platformName == "AMD Accelerated Parallel Processing") { - platformId = 2; + platformId = OPENCL_PLATFORM_AMD; } // get GPU device of the default platform vector devices = getDevices(platforms, _platformId); @@ -361,6 +365,21 @@ bool ethash_cl_miner::init( if (strncmp("OpenCL 1.1", device_version.c_str(), 10) == 0) m_openclOnePointOne = true; + + char options[256]; + int computeCapability = 0; + if (platformId == OPENCL_PLATFORM_NVIDIA) { + cl_uint computeCapabilityMajor; + cl_uint computeCapabilityMinor; + clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL); + clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &computeCapabilityMinor, NULL); + + computeCapability = computeCapabilityMajor * 10 + computeCapabilityMinor; + int maxregs = computeCapability >= 35 ? 72 : 63; + //printf("-cl-nv-maxrregcount=%d -cl-nv-arch sm_%d -cl-nv-verbose", maxregs, computeCapability); + sprintf(options, "-cl-nv-verbose -cl-nv-maxrregcount=%d", maxregs);// , computeCapability); + } + // create context m_context = cl::Context(vector(&device, &device + 1)); m_queue = cl::CommandQueue(m_context, device); @@ -379,9 +398,12 @@ bool ethash_cl_miner::init( addDefinition(code, "ACCESSES", ETHASH_ACCESSES); addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults); addDefinition(code, "PLATFORM", platformId); + addDefinition(code, "COMPUTE", computeCapability); //debugf("%s", code.c_str()); + + // create miner OpenCL program cl::Program::Sources sources; sources.push_back({ code.c_str(), code.size() }); @@ -389,7 +411,7 @@ bool ethash_cl_miner::init( cl::Program program(m_context, sources); try { - program.build({ device }); + program.build({ device }, options); ETHCL_LOG("Printing program log"); ETHCL_LOG(program.getBuildInfo(device).c_str()); } diff --git a/libethash-cl/ethash_cl_miner_kernel.cl b/libethash-cl/ethash_cl_miner_kernel.cl index c1eed75c6..77bb2d2b8 100644 --- a/libethash-cl/ethash_cl_miner_kernel.cl +++ b/libethash-cl/ethash_cl_miner_kernel.cl @@ -1,8 +1,7 @@ -// author Tim Hughes -// Tested on Radeon HD 7850 -// Hashrate: 15940347 hashes/s -// Bandwidth: 124533 MB/s -// search kernel should fit in <= 84 VGPRS (3 wavefronts) +#define OPENCL_PLATFORM_UNKNOWN 0 +#define OPENCL_PLATFORM_NVIDIA 1 +#define OPENCL_PLATFORM_AMD 2 + #define THREADS_PER_HASH (128 / 16) #define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH) @@ -36,7 +35,7 @@ __constant uint2 const Keccak_f1600_RC[24] = { (uint2)(0x80008008, 0x80000000), }; -#if PLATFORM == 1 // CUDA +#if PLATFORM == OPENCL_PLATFORM_NVIDIA && COMPUTE >= 35 static uint2 ROL2(const uint2 a, const int offset) { uint2 result; if (offset >= 32) { @@ -49,7 +48,7 @@ static uint2 ROL2(const uint2 a, const int offset) { } return result; } -#elif PLATFORM == 2 // APP +#elif PLATFORM == OPENCL_PLATFORM_AMD #pragma OPENCL EXTENSION cl_amd_media_ops : enable static uint2 ROL2(const uint2 vv, const int r) { @@ -259,7 +258,9 @@ typedef union { uint uints[16]; } compute_hash_share; +#if PLATFORM != OPENCL_PLATFORM_NVIDIA // use maxrregs on nv __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +#endif __kernel void ethash_search( __global volatile uint* restrict g_output, __constant hash32_t const* g_header,