From d9bc6cfedaa12f5478193bd4316c242bca151ba1 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 24 Feb 2019 17:24:06 +0100 Subject: [PATCH] Updated JIT compiler and assembly generator for new int -> float conversion --- doc/isa-ops.md | 5 ++-- src/AssemblyGeneratorX86.cpp | 4 +-- src/JitCompilerX86.cpp | 10 +++---- src/asm/program_loop_load.inc | 12 ++++++--- src/asm/program_loop_store.inc | 8 +++--- src/asm/program_prologue_linux.inc | 6 ++--- src/asm/program_prologue_win64.inc | 6 ++--- src/asm/program_xmm_constants.inc | 10 +++---- src/executeProgram-win64.asm | 42 ++++++++++++++++-------------- src/main.cpp | 2 +- 10 files changed, 56 insertions(+), 49 deletions(-) diff --git a/doc/isa-ops.md b/doc/isa-ops.md index d403bda..d98d5f0 100644 --- a/doc/isa-ops.md +++ b/doc/isa-ops.md @@ -40,6 +40,8 @@ For floating point instructions, the destination can be a group F or group E reg Memory operands are loaded as 8-byte values from the address indicated by `src`. The 8 byte value is interpreted as two 32-bit signed integers and implicitly converted to floating point format. The lower and upper memory operands are marked as `[src][0]` and `[src][1]`. +Memory operands for group E registers are loaded as described above, then their sign bit is cleared and their exponent value is set to `0x30F` (corresponds to 2<sup>-240</sup>). 
+ |frequency|instruction|dst|src|operation| |-|-|-|-|-| |8/256|FSWAP_R|F+E|-|`(dst0, dst1) = (dst1, dst0)`| @@ -58,8 +60,7 @@ This instruction negates the number and multiplies it by 2x "a2" ; xmm11 -> "a3" ; xmm12 -> temporary - ; xmm13 -> DBL_MIN - ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff - ; xmm15 -> sign mask 0x80000000000000008000000000000000 + ; xmm13 -> mantissa mask = 0x000fffffffffffff000fffffffffffff + ; xmm14 -> exponent 2**-240 = 0x30f000000000000030f0000000000000 + ; xmm15 -> scale mask = 0x81f000000000000081f0000000000000 */ @@ -165,7 +165,7 @@ namespace RandomX { static const uint8_t JMP = 0xe9; static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; - static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0f, 0x54, 0xe6 }; + static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 }; static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f }; static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 }; @@ -556,8 +556,6 @@ namespace RandomX { emit(REX_ANDPS_XMM12); emit(REX_DIVPD); emitByte(0xe4 + 8 * instr.dst); - emit(REX_MAXPD); - emitByte(0xe5 + 8 * instr.dst); } void JitCompilerX86::h_FSQRT_R(Instruction& instr) { diff --git a/src/asm/program_loop_load.inc b/src/asm/program_loop_load.inc index 76b8f3d..6ef67ec 100644 --- a/src/asm/program_loop_load.inc +++ b/src/asm/program_loop_load.inc @@ -22,7 +22,11 @@ cvtdq2pd xmm5, qword ptr [rcx+40] cvtdq2pd xmm6, qword ptr [rcx+48] cvtdq2pd xmm7, qword ptr [rcx+56] - andps xmm4, xmm14 - andps xmm5, xmm14 - andps xmm6, xmm14 - andps xmm7, xmm14 + andps xmm4, xmm13 + andps xmm5, xmm13 + andps xmm6, xmm13 + andps xmm7, xmm13 + orps xmm4, xmm14 + orps xmm5, xmm14 + orps xmm6, xmm14 + orps xmm7, xmm14 diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc index a0acebc..1ba1635 100644 --- a/src/asm/program_loop_store.inc +++ b/src/asm/program_loop_store.inc @@ -8,10 +8,10 @@ mov 
qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 pop rcx - mulpd xmm0, xmm4 - mulpd xmm1, xmm5 - mulpd xmm2, xmm6 - mulpd xmm3, xmm7 + xorpd xmm0, xmm4 + xorpd xmm1, xmm5 + xorpd xmm2, xmm6 + xorpd xmm3, xmm7 movapd xmmword ptr [rcx+0], xmm0 movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index e487c58..c798ce7 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -32,8 +32,8 @@ movapd xmm9, xmmword ptr [rcx+88] movapd xmm10, xmmword ptr [rcx+104] movapd xmm11, xmmword ptr [rcx+120] - movapd xmm13, xmmword ptr minDbl[rip] - movapd xmm14, xmmword ptr absMask[rip] - movapd xmm15, xmmword ptr signMask[rip] + movapd xmm13, xmmword ptr mantissaMask[rip] + movapd xmm14, xmmword ptr exp240[rip] + movapd xmm15, xmmword ptr scaleMask[rip] jmp DECL(randomx_program_loop_begin) \ No newline at end of file diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index f91cca2..5a666a3 100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -45,8 +45,8 @@ movapd xmm9, xmmword ptr [rcx+88] movapd xmm10, xmmword ptr [rcx+104] movapd xmm11, xmmword ptr [rcx+120] - movapd xmm13, xmmword ptr [minDbl] - movapd xmm14, xmmword ptr [absMask] - movapd xmm15, xmmword ptr [signMask] + movapd xmm13, xmmword ptr [mantissaMask] + movapd xmm14, xmmword ptr [exp240] + movapd xmm15, xmmword ptr [scaleMask] jmp randomx_program_loop_begin \ No newline at end of file diff --git a/src/asm/program_xmm_constants.inc b/src/asm/program_xmm_constants.inc index 79d05a4..5c2600b 100644 --- a/src/asm/program_xmm_constants.inc +++ b/src/asm/program_xmm_constants.inc @@ -1,6 +1,6 @@ -minDbl: - db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 -absMask: - db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 -signMask: +mantissaMask: + db 255, 255, 255, 255, 255, 255, 15, 0, 
255, 255, 255, 255, 255, 255, 15, 0 +exp240: + db 0, 0, 0, 0, 0, 0, 240, 48, 0, 0, 0, 0, 0, 0, 240, 48 +scaleMask: db 0, 0, 0, 0, 0, 0, 240, 129, 0, 0, 0, 0, 0, 0, 240, 129 \ No newline at end of file diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index d7d6f87..37392cd 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -52,9 +52,9 @@ executeProgram PROC ; xmm10 -> "a2" ; xmm11 -> "a3" ; xmm12 -> temporary - ; xmm13 -> DBL_MIN - ; xmm14 -> absolute value mask - ; xmm15 -> sign mask + ; xmm13 -> mantissa mask = 0x000fffffffffffff000fffffffffffff + ; xmm14 -> exponent 2**-240 = 0x30f000000000000030f0000000000000 + ; xmm15 -> scale mask = 0x81f000000000000081f0000000000000 ; store callee-saved registers push rbx @@ -103,18 +103,18 @@ executeProgram PROC movapd xmm9, xmmword ptr [rcx+88] movapd xmm10, xmmword ptr [rcx+104] movapd xmm11, xmmword ptr [rcx+120] - movapd xmm13, xmmword ptr [minDbl] - movapd xmm14, xmmword ptr [absMask] - movapd xmm15, xmmword ptr [signMask] + movapd xmm13, xmmword ptr [mantissaMask] + movapd xmm14, xmmword ptr [exp240] + movapd xmm15, xmmword ptr [scaleMask] jmp program_begin ALIGN 64 -minDbl: - db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 -absMask: - db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 -signMask: +mantissaMask: + db 255, 255, 255, 255, 255, 255, 15, 0, 255, 255, 255, 255, 255, 255, 15, 0 +exp240: + db 0, 0, 0, 0, 0, 0, 240, 48, 0, 0, 0, 0, 0, 0, 240, 48 +scaleMask: db 0, 0, 0, 0, 0, 0, 240, 129, 0, 0, 0, 0, 0, 0, 240, 129 ALIGN 64 @@ -145,10 +145,14 @@ program_begin: cvtdq2pd xmm5, qword ptr [rcx+40] cvtdq2pd xmm6, qword ptr [rcx+48] cvtdq2pd xmm7, qword ptr [rcx+56] - andps xmm4, xmm14 - andps xmm5, xmm14 - andps xmm6, xmm14 - andps xmm7, xmm14 + andps xmm4, xmm13 + andps xmm5, xmm13 + andps xmm6, xmm13 + andps xmm7, xmm13 + orps xmm4, xmm14 + orps xmm5, xmm14 + orps xmm6, xmm14 + orps xmm7, xmm14 ;# 256 instructions include 
program.inc @@ -181,10 +185,10 @@ IF 1 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 pop rcx - mulpd xmm0, xmm4 - mulpd xmm1, xmm5 - mulpd xmm2, xmm6 - mulpd xmm3, xmm7 + xorpd xmm0, xmm4 + xorpd xmm1, xmm5 + xorpd xmm2, xmm6 + xorpd xmm3, xmm7 movapd xmmword ptr [rcx+0], xmm0 movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 diff --git a/src/main.cpp b/src/main.cpp index 0b6a0fa..ac63dce 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -341,7 +341,7 @@ int main(int argc, char** argv) { std::cout << "Calculated result: "; result.print(std::cout); if(programCount == 1000) - std::cout << "Reference result: d3ae5a9365196ed48bb98ebfc3316498e29443ea7f056ecbd272f749c6af7730" << std::endl; + std::cout << "Reference result: e1b4144293ff9ab5aa4c98f2389bb18950d8c3fd874891ac64628e028a286006" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl; }