Updated JIT compiler and assembly generator for new int -> float conversion

2024-12-22 07:48:54 +00:00 · 2019-02-24 17:24:06 +01:00 · 2019-02-24 17:24:06 +01:00 · d9bc6cfeda
commit d9bc6cfeda
parent 790b382eda
10 changed files with 56 additions and 49 deletions
--- a/doc/isa-ops.md
+++ b/doc/isa-ops.md
@@ -40,6 +40,8 @@ For floating point instructions, the destination can be a group F or group E reg

 Memory operands are loaded as 8-byte values from the address indicated by `src`. The 8 byte value is interpreted as two 32-bit signed integers and implicitly converted to floating point format. The lower and upper memory operands are marked as `[src][0]` and `[src][1]`.

+Memory operands for group E registers are loaded as described above, then their sign bit is cleared and their exponent value is set to `0x30F` (corresponds to 2<sup>-240</sup>).
+
 |frequency|instruction|dst|src|operation|
 |-|-|-|-|-|
 |8/256|FSWAP_R|F+E|-|`(dst0, dst1) = (dst1, dst0)`|
@@ -58,8 +60,7 @@ This instruction negates the number and multiplies it by <code>2<sup>x</sup></co
 The mathematical operation described above is equivalent to a bitwise XOR of the binary representation with the value of `0x81F0000000000000`.

 #### Denormal and NaN values
-Due to restrictions on the values of the floating point registers, no operation results in `NaN`.
-`FDIV_M` can produce a denormal result. In that case, the result is set to `DBL_MIN = 2.22507385850720138309e-308`, which is the smallest positive normal number.
+Due to restrictions on the values of the floating point registers, no operation results in `NaN` or a denormal number.

 #### Rounding
 All floating point instructions give correctly rounded results. The rounding mode depends on the value of the `fprc` register:
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -385,9 +385,9 @@ namespace RandomX {
 		instr.dst %= 4;
 		genAddressReg(instr);
 		asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl;
-		asmCode << "\tandps xmm12, xmm14" << std::endl;
+		asmCode << "\tandps xmm12, xmm13" << std::endl;
+		asmCode << "\torps xmm12, xmm14" << std::endl;
 		asmCode << "\tdivpd " << regE[instr.dst] << ", xmm12" << std::endl;
-		asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl;
 		traceflt(instr);
 	}

--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -73,9 +73,9 @@ namespace RandomX {
 	; xmm10 -> "a2"
 	; xmm11 -> "a3"
 	; xmm12 -> temporary
-	; xmm13 -> DBL_MIN
-	; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff
-	; xmm15 -> sign mask           0x80000000000000008000000000000000
+	; xmm13 -> mantissa mask    = 0x000fffffffffffff000fffffffffffff
+	; xmm14 -> exponent 2**-240 = 0x30f000000000000030f0000000000000
+	; xmm15 -> scale mask       = 0x81f000000000000081f0000000000000

 	*/

@@ -165,7 +165,7 @@ namespace RandomX {
 	static const uint8_t JMP = 0xe9;
 	static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
 	static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
-	static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0f, 0x54, 0xe6 };
+	static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
 	static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
 	static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };

@@ -556,8 +556,6 @@ namespace RandomX {
 		emit(REX_ANDPS_XMM12);
 		emit(REX_DIVPD);
 		emitByte(0xe4 + 8 * instr.dst);
-		emit(REX_MAXPD);
-		emitByte(0xe5 + 8 * instr.dst);
 	}

 	void JitCompilerX86::h_FSQRT_R(Instruction& instr) {
--- a/src/asm/program_loop_load.inc
+++ b/src/asm/program_loop_load.inc
@@ -22,7 +22,11 @@
 	cvtdq2pd xmm5, qword ptr [rcx+40]
 	cvtdq2pd xmm6, qword ptr [rcx+48]
 	cvtdq2pd xmm7, qword ptr [rcx+56]
-	andps xmm4, xmm14
-	andps xmm5, xmm14
-	andps xmm6, xmm14
-	andps xmm7, xmm14
+	andps xmm4, xmm13
+	andps xmm5, xmm13
+	andps xmm6, xmm13
+	andps xmm7, xmm13
+	orps xmm4, xmm14
+	orps xmm5, xmm14
+	orps xmm6, xmm14
+	orps xmm7, xmm14
--- a/src/asm/program_loop_store.inc
+++ b/src/asm/program_loop_store.inc
@@ -8,10 +8,10 @@
 	mov qword ptr [rcx+48], r14
 	mov qword ptr [rcx+56], r15
 	pop rcx
-	mulpd xmm0, xmm4
-	mulpd xmm1, xmm5
-	mulpd xmm2, xmm6
-	mulpd xmm3, xmm7
+	xorpd xmm0, xmm4
+	xorpd xmm1, xmm5
+	xorpd xmm2, xmm6
+	xorpd xmm3, xmm7
 	movapd xmmword ptr [rcx+0], xmm0
 	movapd xmmword ptr [rcx+16], xmm1
 	movapd xmmword ptr [rcx+32], xmm2
--- a/src/asm/program_prologue_linux.inc
+++ b/src/asm/program_prologue_linux.inc
@@ -32,8 +32,8 @@
 	movapd xmm9, xmmword ptr [rcx+88]
 	movapd xmm10, xmmword ptr [rcx+104]
 	movapd xmm11, xmmword ptr [rcx+120]
-	movapd xmm13, xmmword ptr minDbl[rip]
-	movapd xmm14, xmmword ptr absMask[rip]
-	movapd xmm15, xmmword ptr signMask[rip]
+	movapd xmm13, xmmword ptr mantissaMask[rip]
+	movapd xmm14, xmmword ptr exp240[rip]
+	movapd xmm15, xmmword ptr scaleMask[rip]

 	jmp DECL(randomx_program_loop_begin)
--- a/src/asm/program_prologue_win64.inc
+++ b/src/asm/program_prologue_win64.inc
@@ -45,8 +45,8 @@
 	movapd xmm9, xmmword ptr [rcx+88]
 	movapd xmm10, xmmword ptr [rcx+104]
 	movapd xmm11, xmmword ptr [rcx+120]
-	movapd xmm13, xmmword ptr [minDbl]
-	movapd xmm14, xmmword ptr [absMask]
-	movapd xmm15, xmmword ptr [signMask]
+	movapd xmm13, xmmword ptr [mantissaMask]
+	movapd xmm14, xmmword ptr [exp240]
+	movapd xmm15, xmmword ptr [scaleMask]

 	jmp randomx_program_loop_begin
--- a/src/asm/program_xmm_constants.inc
+++ b/src/asm/program_xmm_constants.inc
@@ -1,6 +1,6 @@
-minDbl:
-	db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
-absMask:
-	db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
-signMask:
+mantissaMask:
+	db 255, 255, 255, 255, 255, 255, 15, 0, 255, 255, 255, 255, 255, 255, 15, 0
+exp240:
+	db 0, 0, 0, 0, 0, 0, 240, 48, 0, 0, 0, 0, 0, 0, 240, 48
+scaleMask:
 	db 0, 0, 0, 0, 0, 0, 240, 129, 0, 0, 0, 0, 0, 0, 240, 129
--- a/src/executeProgram-win64.asm
+++ b/src/executeProgram-win64.asm
@@ -52,9 +52,9 @@ executeProgram PROC
 	; xmm10 -> "a2"
 	; xmm11 -> "a3"
 	; xmm12 -> temporary
-	; xmm13 -> DBL_MIN
-	; xmm14 -> absolute value mask
-	; xmm15 -> sign mask
+	; xmm13 -> mantissa mask    = 0x000fffffffffffff000fffffffffffff
+	; xmm14 -> exponent 2**-240 = 0x30f000000000000030f0000000000000
+	; xmm15 -> scale mask       = 0x81f000000000000081f0000000000000

 	; store callee-saved registers
 	push rbx
@@ -103,18 +103,18 @@ executeProgram PROC
 	movapd xmm9, xmmword ptr [rcx+88]
 	movapd xmm10, xmmword ptr [rcx+104]
 	movapd xmm11, xmmword ptr [rcx+120]
-	movapd xmm13, xmmword ptr [minDbl]
-	movapd xmm14, xmmword ptr [absMask]
-	movapd xmm15, xmmword ptr [signMask]
+	movapd xmm13, xmmword ptr [mantissaMask]
+	movapd xmm14, xmmword ptr [exp240]
+	movapd xmm15, xmmword ptr [scaleMask]

 	jmp program_begin

 ALIGN 64
-minDbl:
-	db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
-absMask:
-	db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
-signMask:
+mantissaMask:
+	db 255, 255, 255, 255, 255, 255, 15, 0, 255, 255, 255, 255, 255, 255, 15, 0
+exp240:
+	db 0, 0, 0, 0, 0, 0, 240, 48, 0, 0, 0, 0, 0, 0, 240, 48
+scaleMask:
 	db 0, 0, 0, 0, 0, 0, 240, 129, 0, 0, 0, 0, 0, 0, 240, 129

 ALIGN 64
@@ -145,10 +145,14 @@ program_begin:
 	cvtdq2pd xmm5, qword ptr [rcx+40]
 	cvtdq2pd xmm6, qword ptr [rcx+48]
 	cvtdq2pd xmm7, qword ptr [rcx+56]
-	andps xmm4, xmm14
-	andps xmm5, xmm14
-	andps xmm6, xmm14
-	andps xmm7, xmm14
+	andps xmm4, xmm13
+	andps xmm5, xmm13
+	andps xmm6, xmm13
+	andps xmm7, xmm13
+	orps xmm4, xmm14
+	orps xmm5, xmm14
+	orps xmm6, xmm14
+	orps xmm7, xmm14

 	;# 256 instructions
 	include program.inc
@@ -181,10 +185,10 @@ IF 1
 	mov qword ptr [rcx+48], r14
 	mov qword ptr [rcx+56], r15
 	pop rcx
-	mulpd xmm0, xmm4
-	mulpd xmm1, xmm5
-	mulpd xmm2, xmm6
-	mulpd xmm3, xmm7
+	xorpd xmm0, xmm4
+	xorpd xmm1, xmm5
+	xorpd xmm2, xmm6
+	xorpd xmm3, xmm7
 	movapd xmmword ptr [rcx+0], xmm0
 	movapd xmmword ptr [rcx+16], xmm1
 	movapd xmmword ptr [rcx+32], xmm2
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -341,7 +341,7 @@ int main(int argc, char** argv) {
 		std::cout << "Calculated result: ";
 		result.print(std::cout);
 		if(programCount == 1000)
-		std::cout << "Reference result:  d3ae5a9365196ed48bb98ebfc3316498e29443ea7f056ecbd272f749c6af7730" << std::endl;
+		std::cout << "Reference result:  e1b4144293ff9ab5aa4c98f2389bb18950d8c3fd874891ac64628e028a286006" << std::endl;
 		if (!miningMode) {
 			std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl;
 		}