Browse Source

nvidia maxregs

cl-refactor
Genoil 9 years ago
parent
commit
a5be252960
  1. 30
      libethash-cl/ethash_cl_miner.cpp
  2. 15
      libethash-cl/ethash_cl_miner_kernel.cl

30
libethash-cl/ethash_cl_miner.cpp

@ -42,6 +42,10 @@
#define ETHASH_BYTES 32 #define ETHASH_BYTES 32
#define OPENCL_PLATFORM_UNKNOWN 0
#define OPENCL_PLATFORM_NVIDIA 1
#define OPENCL_PLATFORM_AMD 2
// workaround lame platforms // workaround lame platforms
#if !CL_VERSION_1_2 #if !CL_VERSION_1_2
#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE #define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE
@ -331,14 +335,14 @@ bool ethash_cl_miner::init(
string platformName = platforms[_platformId].getInfo<CL_PLATFORM_NAME>(); string platformName = platforms[_platformId].getInfo<CL_PLATFORM_NAME>();
ETHCL_LOG("Using platform: " << platformName.c_str()); ETHCL_LOG("Using platform: " << platformName.c_str());
int platformId = 0; int platformId = OPENCL_PLATFORM_UNKNOWN;
if (platformName == "NVIDIA CUDA") if (platformName == "NVIDIA CUDA")
{ {
platformId = 1; platformId = OPENCL_PLATFORM_NVIDIA;
} }
else if (platformName == "AMD Accelerated Parallel Processing") else if (platformName == "AMD Accelerated Parallel Processing")
{ {
platformId = 2; platformId = OPENCL_PLATFORM_AMD;
} }
// get GPU device of the default platform // get GPU device of the default platform
vector<cl::Device> devices = getDevices(platforms, _platformId); vector<cl::Device> devices = getDevices(platforms, _platformId);
@ -361,6 +365,21 @@ bool ethash_cl_miner::init(
if (strncmp("OpenCL 1.1", device_version.c_str(), 10) == 0) if (strncmp("OpenCL 1.1", device_version.c_str(), 10) == 0)
m_openclOnePointOne = true; m_openclOnePointOne = true;
char options[256];
int computeCapability = 0;
if (platformId == OPENCL_PLATFORM_NVIDIA) {
cl_uint computeCapabilityMajor;
cl_uint computeCapabilityMinor;
clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &computeCapabilityMinor, NULL);
computeCapability = computeCapabilityMajor * 10 + computeCapabilityMinor;
int maxregs = computeCapability >= 35 ? 72 : 63;
//printf("-cl-nv-maxrregcount=%d -cl-nv-arch sm_%d -cl-nv-verbose", maxregs, computeCapability);
sprintf(options, "-cl-nv-verbose -cl-nv-maxrregcount=%d", maxregs);// , computeCapability);
}
// create context // create context
m_context = cl::Context(vector<cl::Device>(&device, &device + 1)); m_context = cl::Context(vector<cl::Device>(&device, &device + 1));
m_queue = cl::CommandQueue(m_context, device); m_queue = cl::CommandQueue(m_context, device);
@ -379,9 +398,12 @@ bool ethash_cl_miner::init(
addDefinition(code, "ACCESSES", ETHASH_ACCESSES); addDefinition(code, "ACCESSES", ETHASH_ACCESSES);
addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults); addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
addDefinition(code, "PLATFORM", platformId); addDefinition(code, "PLATFORM", platformId);
addDefinition(code, "COMPUTE", computeCapability);
//debugf("%s", code.c_str()); //debugf("%s", code.c_str());
// create miner OpenCL program // create miner OpenCL program
cl::Program::Sources sources; cl::Program::Sources sources;
sources.push_back({ code.c_str(), code.size() }); sources.push_back({ code.c_str(), code.size() });
@ -389,7 +411,7 @@ bool ethash_cl_miner::init(
cl::Program program(m_context, sources); cl::Program program(m_context, sources);
try try
{ {
program.build({ device }); program.build({ device }, options);
ETHCL_LOG("Printing program log"); ETHCL_LOG("Printing program log");
ETHCL_LOG(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str()); ETHCL_LOG(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str());
} }

15
libethash-cl/ethash_cl_miner_kernel.cl

@ -1,8 +1,7 @@
// author Tim Hughes <tim@twistedfury.com> #define OPENCL_PLATFORM_UNKNOWN 0
// Tested on Radeon HD 7850 #define OPENCL_PLATFORM_NVIDIA 1
// Hashrate: 15940347 hashes/s #define OPENCL_PLATFORM_AMD 2
// Bandwidth: 124533 MB/s
// search kernel should fit in <= 84 VGPRS (3 wavefronts)
#define THREADS_PER_HASH (128 / 16) #define THREADS_PER_HASH (128 / 16)
#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH) #define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
@ -36,7 +35,7 @@ __constant uint2 const Keccak_f1600_RC[24] = {
(uint2)(0x80008008, 0x80000000), (uint2)(0x80008008, 0x80000000),
}; };
#if PLATFORM == 1 // CUDA #if PLATFORM == OPENCL_PLATFORM_NVIDIA && COMPUTE >= 35
static uint2 ROL2(const uint2 a, const int offset) { static uint2 ROL2(const uint2 a, const int offset) {
uint2 result; uint2 result;
if (offset >= 32) { if (offset >= 32) {
@ -49,7 +48,7 @@ static uint2 ROL2(const uint2 a, const int offset) {
} }
return result; return result;
} }
#elif PLATFORM == 2 // APP #elif PLATFORM == OPENCL_PLATFORM_AMD
#pragma OPENCL EXTENSION cl_amd_media_ops : enable #pragma OPENCL EXTENSION cl_amd_media_ops : enable
static uint2 ROL2(const uint2 vv, const int r) static uint2 ROL2(const uint2 vv, const int r)
{ {
@ -259,7 +258,9 @@ typedef union {
uint uints[16]; uint uints[16];
} compute_hash_share; } compute_hash_share;
#if PLATFORM != OPENCL_PLATFORM_NVIDIA // use maxrregs on nv
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
#endif
__kernel void ethash_search( __kernel void ethash_search(
__global volatile uint* restrict g_output, __global volatile uint* restrict g_output,
__constant hash32_t const* g_header, __constant hash32_t const* g_header,

Loading…
Cancel
Save