diff --git a/libethash-cuda/CMakeLists.txt b/libethash-cuda/CMakeLists.txt index 7a3f8b753..91d77848d 100644 --- a/libethash-cuda/CMakeLists.txt +++ b/libethash-cuda/CMakeLists.txt @@ -10,11 +10,11 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--std=c++11;--disable-warnings;--ptxas-op LIST(APPEND CUDA_NVCC_FLAGS_RELEASE -O3) LIST(APPEND CUDA_NVCC_FLAGS_DEBUG -G) -if(COMPUTE) +if(COMPUTE AND (COMPUTE GREATER 0)) LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) -else(COMPUTE) +else(COMPUTE AND (COMPUTE GREATER 0)) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_32,code=sm_32;-gencode arch=compute_35,code=sm_35;-gencode arch=compute_50,code=sm_50;-gencode arch=compute_52,code=sm_52) -endif(COMPUTE) +endif(COMPUTE AND (COMPUTE GREATER 0)) diff --git a/libethash-cuda/cuda_helper.h b/libethash-cuda/cuda_helper.h index 1b75d2ffd..b410be1f9 100644 --- a/libethash-cuda/cuda_helper.h +++ b/libethash-cuda/cuda_helper.h @@ -248,6 +248,7 @@ uint64_t xor1(const uint64_t a, const uint64_t b) #define xor1(a,b) (a ^ b) #endif +/* #if USE_XOR_ASM_OPTS // device asm for whirpool __device__ __forceinline__ @@ -256,13 +257,14 @@ uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) uint64_t result; asm("xor.b64 %0, %2, %3;\n\t" "xor.b64 %0, %0, %1;\n\t" - /* output : input registers */ + //output : input registers : "=l"(result) : "l"(a), "l"(b), "l"(c)); return result; } #else #define xor3(a,b,c) (a ^ b ^ c) #endif +*/ #if USE_XOR_ASM_OPTS // device asm for whirpool diff --git a/libethash-cuda/keccak.cuh b/libethash-cuda/keccak.cuh index 8e770f50e..42373195a 100644 --- a/libethash-cuda/keccak.cuh +++ b/libethash-cuda/keccak.cuh @@ -14,7 +14,7 @@ __device__ __constant__ uint64_t const keccak_round_constants[24] = { #if __CUDA_ARCH__ >= 500 __device__ __forceinline__ -uint2 lop3xor(const uint2 a, const uint2 b, const uint2 c) { +uint2 xor3(const uint2 a, const uint2 b, const uint2 c) { uint2 result; asm("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x), "r"(c.x)); asm("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y), "r"(c.y)); @@ -23,8 +23,8 @@ uint2 lop3xor(const uint2 a, const uint2 b, const uint2 c) { __device__ __forceinline__ uint2 xor5(const uint2 a, const uint2 b, const uint2 c, const uint2 d, const uint2 e) { - uint2 f = lop3xor(a,b,c); - return lop3xor(d,e,f); + uint2 f = xor3(a,b,c); + return xor3(d,e,f); } #else __device__ __forceinline__ @@ -32,7 +32,7 @@ uint2 xor5(const uint2 a, const uint2 b, const uint2 c, const uint2 d, const uin return a ^ b ^ c ^ d ^ e; } __device__ __forceinline__ -uint2 lop3xor(const uint2 a, const uint2 b, const uint2 c) { +uint2 xor3(const uint2 a, const uint2 b, const uint2 c) { return a ^ b ^ c; } #endif @@ -69,52 +69,42 @@ __device__ __forceinline__ void keccak_f1600_block(uint2* s, uint32_t out_size) /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - u = ROL2(t[1], 1); - //s[0] ^= u; s[5] ^= u; s[10] ^= u; s[15] ^= u; s[20] ^= u; - s[0] = lop3xor(s[0], t[4], u); - s[5] = lop3xor(s[5], t[4], u); - s[10] = lop3xor(s[10], t[4], u); - s[15] = lop3xor(s[15], t[4], u); - s[20] = lop3xor(s[20], t[4], u); + u = ROL2(t[1], 1); + s[0] = xor3(s[0], t[4], u); + s[5] = xor3(s[5], t[4], u); + s[10] = xor3(s[10], t[4], u); + s[15] = xor3(s[15], t[4], u); + s[20] = xor3(s[20], t[4], u); u = ROL2(t[2], 1); - //s[1] ^= u; s[6] ^= u; s[11] ^= u; s[16] ^= u; s[21] ^= u; - - s[1] = lop3xor(s[1], t[0], u); - s[6] = lop3xor(s[6], t[0], u); - s[11] = lop3xor(s[11], t[0], u); - s[16] = lop3xor(s[16], t[0], u); - s[21] = lop3xor(s[21], t[0], u); + s[1] = xor3(s[1], t[0], u); + s[6] = xor3(s[6], t[0], u); + s[11] = xor3(s[11], t[0], u); + s[16] = xor3(s[16], t[0], u); + s[21] = xor3(s[21], t[0], u); u = ROL2(t[3], 1); - //s[2] ^= u; s[7] ^= u; s[12] ^= u; s[17] ^= u; s[22] ^= u; - - s[2] = lop3xor(s[2], t[1], u); - s[7] = lop3xor(s[7], t[1], u); - s[12] = lop3xor(s[12], t[1], u); - s[17] = lop3xor(s[17], t[1], u); - s[22] = lop3xor(s[22], t[1], u); + s[2] = xor3(s[2], t[1], u); + s[7] = xor3(s[7], t[1], u); + s[12] = xor3(s[12], t[1], u); + s[17] = xor3(s[17], t[1], u); + s[22] = xor3(s[22], t[1], u); u = ROL2(t[4], 1); - - //s[3] ^= u; s[8] ^= u; s[13] ^= u; s[18] ^= u; s[23] ^= u; - - s[3] = lop3xor(s[3], t[2], u); - s[8] = lop3xor(s[8], t[2], u); - s[13] = lop3xor(s[13], t[2], u); - s[18] = lop3xor(s[18], t[2], u); - s[23] = lop3xor(s[23], t[2], u); + s[3] = xor3(s[3], t[2], u); + s[8] = xor3(s[8], t[2], u); + s[13] = xor3(s[13], t[2], u); + s[18] = xor3(s[18], t[2], u); + s[23] = xor3(s[23], t[2], u); u = ROL2(t[0], 1); - //s[4] ^= u; s[9] ^= u; s[14] ^= u; s[19] ^= u; s[24] ^= u; - - s[4] = lop3xor(s[4], t[3], u); - s[9] = lop3xor(s[9], t[3], u); - s[14] = lop3xor(s[14], t[3], u); - s[19] = lop3xor(s[19], t[3], u); - s[24] = lop3xor(s[24], t[3], u); + s[4] = xor3(s[4], t[3], u); + s[9] = xor3(s[9], t[3], u); + s[14] = xor3(s[14], t[3], u); + s[19] = xor3(s[19], t[3], u); + s[24] = xor3(s[24], t[3], u); /* rho pi: b[..] = rotl(a[..], ..) */ u = s[1];