From 5c49ab12a071df4d0fd4b5f0d91c4c0c3180b74d Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 20 Oct 2023 10:54:25 +0200 Subject: [PATCH] Optimized randomx_reciprocal Also limited it to 32 bit because it's supposed to work only with 32-bit values, according to the specs. --- src/assembly_generator_x86.cpp | 2 +- src/bytecode_machine.cpp | 2 +- src/jit_compiler_a64.cpp | 19 ++++--------------- src/jit_compiler_rv64.cpp | 2 +- src/jit_compiler_x86.cpp | 2 +- src/reciprocal.c | 32 ++++++++++++-------------------- src/reciprocal.h | 4 ++-- src/tests/perf-simulation.cpp | 2 +- 8 files changed, 23 insertions(+), 42 deletions(-) diff --git a/src/assembly_generator_x86.cpp b/src/assembly_generator_x86.cpp index e7e5258..1ce31dd 100644 --- a/src/assembly_generator_x86.cpp +++ b/src/assembly_generator_x86.cpp @@ -445,7 +445,7 @@ namespace randomx { } void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; asmCode << "\tmov rax, " << randomx_reciprocal(divisor) << std::endl; diff --git a/src/bytecode_machine.cpp b/src/bytecode_machine.cpp index 7d8e902..1d00d09 100644 --- a/src/bytecode_machine.cpp +++ b/src/bytecode_machine.cpp @@ -243,7 +243,7 @@ namespace randomx { } if (opcode < ceil_IMUL_RCP) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::IMUL_R; diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index 75ea8cc..5be8f6e 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -686,7 +686,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) { - const uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (isZeroOrPowerOf2(divisor)) return; @@ -695,22 +695,11 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg = 20; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint64_t N = 1ULL << 63; - const uint64_t q = N / divisor; - const uint64_t r = N % divisor; -#ifdef __GNUC__ - const uint64_t shift = 64 - __builtin_clzll(divisor); -#else - uint64_t shift = 32; - for (uint64_t k = 1U << 31; (k & divisor) == 0; k >>= 1) - --shift; -#endif - const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); - literalPos -= sizeof(uint64_t); - const uint64_t randomx_reciprocal = (q << shift) + ((r << shift) / divisor); - memcpy(code + literalPos, &randomx_reciprocal, sizeof(randomx_reciprocal)); + + const uint64_t reciprocal = randomx_reciprocal_fast(divisor); + memcpy(code + literalPos, &reciprocal, sizeof(reciprocal)); if (literal_id < 12) { diff --git a/src/jit_compiler_rv64.cpp b/src/jit_compiler_rv64.cpp index 301c294..6f0842e 100644 --- a/src/jit_compiler_rv64.cpp +++ b/src/jit_compiler_rv64.cpp @@ -776,7 +776,7 @@ namespace randomx { } static void v1_IMUL_RCP(HANDLER_ARGS) { - uint64_t divisor = isn.getImm32(); + const uint32_t divisor = isn.getImm32(); if (!isZeroOrPowerOf2(divisor)) { state.registerUsage[isn.dst] = i; if (state.rcpCount < 4) { diff --git a/src/jit_compiler_x86.cpp b/src/jit_compiler_x86.cpp index 96c6492..785ce5f 100644 --- a/src/jit_compiler_x86.cpp +++ b/src/jit_compiler_x86.cpp @@ -618,7 +618,7 @@ namespace randomx { } void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; emit(MOV_RAX_I); diff --git a/src/reciprocal.c b/src/reciprocal.c index 22620f5..074d184 100644 --- a/src/reciprocal.c +++ b/src/reciprocal.c @@ -44,36 +44,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ret */ -uint64_t randomx_reciprocal(uint64_t divisor) { +uint64_t randomx_reciprocal(uint32_t divisor) { assert(divisor != 0); const uint64_t p2exp63 = 1ULL << 63; + const uint64_t q = p2exp63 / divisor; + const uint64_t r = p2exp63 % divisor; - uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor; +#ifdef __GNUC__ + const uint32_t shift = 64 - __builtin_clzll(divisor); +#else + uint32_t shift = 32; + for (uint32_t k = 1U << 31; (k & divisor) == 0; k >>= 1) + --shift; +#endif - unsigned bsr = 0; //highest set bit in divisor - - for (uint64_t bit = divisor; bit > 0; bit >>= 1) - bsr++; - - for (unsigned shift = 0; shift < bsr; shift++) { - if (remainder >= divisor - remainder) { - quotient = quotient * 2 + 1; - remainder = remainder * 2 - divisor; - } - else { - quotient = quotient * 2; - remainder = remainder * 2; - } - } - - return quotient; + return (q << shift) + ((r << shift) / divisor); } #if !RANDOMX_HAVE_FAST_RECIPROCAL -uint64_t randomx_reciprocal_fast(uint64_t divisor) { +uint64_t randomx_reciprocal_fast(uint32_t divisor) { return randomx_reciprocal(divisor); } diff --git a/src/reciprocal.h b/src/reciprocal.h index 8858df2..90bd9b6 100644 --- a/src/reciprocal.h +++ b/src/reciprocal.h @@ -40,8 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. extern "C" { #endif -uint64_t randomx_reciprocal(uint64_t); -uint64_t randomx_reciprocal_fast(uint64_t); +uint64_t randomx_reciprocal(uint32_t); +uint64_t randomx_reciprocal_fast(uint32_t); #if defined(__cplusplus) } diff --git a/src/tests/perf-simulation.cpp b/src/tests/perf-simulation.cpp index 1068a40..27f34d8 100644 --- a/src/tests/perf-simulation.cpp +++ b/src/tests/perf-simulation.cpp @@ -477,7 +477,7 @@ int analyze(randomx::Program& p) { } if (opcode < randomx::ceil_IMUL_RCP) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!randomx::isZeroOrPowerOf2(divisor)) { instr.dst = instr.dst % randomx::RegistersCount; instr.opcode |= DST_INT;