Vector FPU instructions

JitCompilerX86 - static code written in asm; updated ALU/FPU tests; updated instruction weights
2024-12-22 15:58:53 +00:00 · 2018-12-31 19:06:45 +01:00 · 2018-12-31 19:06:45 +01:00 · 3caecc7646
commit 3caecc7646
parent a09bee8d60
30 changed files with 3757 additions and 3618 deletions
--- a/6
+++ b/6
@ -12,6 +12,9 @@ OBJDIR=obj
 LDFLAGS=-lpthread
 TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
 ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o)
+ifeq ($(PLATFORM),x86_64)
+    ROBJS += $(OBJDIR)/JitCompilerX86-static.o
+endif

 all: release test

@ -57,6 +60,9 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) |
 $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@

+$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_r.inc read_f.inc)) | $(OBJDIR)
+	$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@
+
 $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@

--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@ -54,7 +54,7 @@ namespace RandomX {
 		(this->*generator)(instr, i);
 	}

-	void AssemblyGeneratorX86::gena(Instruction& instr) {
+	void AssemblyGeneratorX86::genar(Instruction& instr) {
 		asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
 		switch (instr.loca & 7)
 		{
@ -63,7 +63,7 @@ namespace RandomX {
 		case 2:
 		case 3:
 			asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
-			asmCode << "\tcall rx_read_dataset" << std::endl;
+			asmCode << "\tcall rx_read_dataset_r" << std::endl;
 			return;

 		case 4:
@ -80,6 +80,33 @@ namespace RandomX {
 		}
 	}

+
+	void AssemblyGeneratorX86::genaf(Instruction& instr) { // Emits the assembly text that loads operand A for the FPU handlers (h_FPADD/FPSUB/FPMUL/FPDIV/FPSQRT); the scratchpad paths leave two doubles in xmm0.
+		asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; // xor register rega with the immediate addra (printed as hex with a leading 0 and an 'h' suffix)
+		switch (instr.loca & 7) // the low three bits of loca choose where operand A is read from
+		{
+		case 0:
+		case 1:
+		case 2:
+		case 3:
+			asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; // pass the low 32 bits of rega in ecx
+			asmCode << "\tcall rx_read_dataset_f" << std::endl; // FPU flavour of the dataset read routine; presumably returns the value in xmm0 - confirm in program_read_f.inc
+			return;
+
+		case 4:
+			asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
+			asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; // mask the index with ScratchpadL2 - 1
+			asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi + rax * 8]" << std::endl; // convert the two packed int32 halves of the 8-byte word at rsi + index*8 into two doubles
+			return;
+
+		default:
+			asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
+			asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; // same as case 4 but masked with ScratchpadL1 - 1
+			asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi + rax * 8]" << std::endl; // convert the two packed int32 halves of the 8-byte word into two doubles
+			return;
+		}
+	}
+
 	void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) {
 		switch (instr.locb & 7)
 		{
@ -87,8 +114,6 @@ namespace RandomX {
 		case 1:
 		case 2:
 		case 3:
-		case 4:
-		case 5:
 			asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl;
 			asmCode << "\t" << instrx86 << " rax, cl" << std::endl;
 			return;
@ -133,26 +158,7 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::genbf(Instruction& instr, const char* instrx86) {
-		asmCode << "\tand rax, -2048" << std::endl;
-		asmCode << "\tcvtsi2sd xmm0, rax" << std::endl;
-		switch (instr.locb & 7)
-		{
-		case 0:
-		case 1:
-		case 2:
-		case 3:
-		case 4:
-		case 5:
 		asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
-			return;
-		default:
-			convertible_t bimm;
-			bimm.f64 = (double)instr.imm32;
-			asmCode << "\tmov rax, " << bimm.i64 << std::endl;
-			asmCode << "\tmovd xmm1, rax" << std::endl;
-			asmCode << "\t" << instrx86 << " xmm0, xmm1" << std::endl;
-			return;
-		}
 	}

 	void AssemblyGeneratorX86::gencr(Instruction& instr) {
@ -165,7 +171,7 @@ namespace RandomX {
 			asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
 			asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
 			if (trace) {
-				asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl;
+				asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl;
 			}
 			return;

@ -178,76 +184,75 @@ namespace RandomX {
 			asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
 			asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
 			if (trace) {
-				asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl;
+				asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl;
 			}
 			return;

 		default:
 			asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl;
 			if (trace) {
-				asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl;
+				asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl;
 			}
 		}
 	}

-	void AssemblyGeneratorX86::gencf(Instruction& instr) {
+	void AssemblyGeneratorX86::gencf(Instruction& instr, bool alwaysLow = false) { // Emits the write-back of an FPU result to register regc and, for locc & 7 >= 4, to the scratchpad; alwaysLow = true (used by h_FPROUND) skips the xmm0 copy and always stores the low lane. NOTE(review): the default argument is given here on the out-of-class definition, the header declaration has none - confirm this is intended.
+		if(!alwaysLow)
+			asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; // copy the full 128-bit result from xmm0 into register regc
+		const char* store = (!alwaysLow && (instr.locc & 8)) ? "movhpd" : "movlpd"; // bit 3 of locc picks the high (movhpd) or low (movlpd) 64-bit lane for the memory store
 		switch (instr.locc & 7) // values 0-3 match no case: nothing is stored to the scratchpad
 		{
-		case 0:
+			case 4: // scratchpad store, index masked with ScratchpadL2 - 1
 				asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
 				asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
 				asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
-			asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl;
+				asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; // store the selected lane of regc
 				break;

-		case 1:
-		case 2:
-		case 3:
+			case 5: // cases 5-7: scratchpad store, index masked with ScratchpadL1 - 1
+			case 6:
+			case 7:
 				asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
 				asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
 				asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
-			asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl;
-			break;
-
-		default:
-			asmCode << "\tmovsd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
+				asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; // store the selected lane of regc
 				break;
 		}
 		if (trace) {
-			asmCode << "\tmovd qword ptr [rsi + rdi * 8 + 262144], xmm0" << std::endl;
+			asmCode << "\t" << store << " qword ptr [rsi + rdi * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl; // trace build: also record the stored lane at rsi + rdi*8 + 262136
 		}
 	}

 	void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tadd rax, ";
 		genbr1(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tadd eax, ";
 		genbr132(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tsub rax, ";
 		genbr1(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tsub eax, ";
 		genbr132(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\timul rax, ";
 		if ((instr.locb & 7) >= 6) {
 			asmCode << "rax, ";
@ -257,7 +262,7 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tmov rcx, ";
 		genbr1(instr);
 		asmCode << "\tmul rcx" << std::endl;
@ -266,7 +271,7 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::h_MUL_32(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tmov ecx, eax" << std::endl;
 		asmCode << "\tmov eax, ";
 		genbr132(instr);
@ -275,7 +280,7 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tmovsxd rcx, eax" << std::endl;
 		if ((instr.locb & 7) >= 6) {
 			asmCode << "\tmov rax, " << instr.imm32 << std::endl;
@ -288,7 +293,7 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tmov rcx, ";
 		genbr1(instr);
 		asmCode << "\timul rcx" << std::endl;
@ -297,7 +302,7 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		if ((instr.locb & 7) >= 6) {
 			if (instr.imm32 == 0) {
 				asmCode << "\tmov ecx, 1" << std::endl;
@ -318,7 +323,7 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tmov edx, ";
 		genbr132(instr);
 		asmCode << "\tcmp edx, -1" << std::endl;
@ -339,123 +344,125 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tand rax, ";
 		genbr1(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tand eax, ";
 		genbr132(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tor rax, ";
 		genbr1(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tor eax, ";
 		genbr132(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\txor rax, ";
 		genbr1(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\txor eax, ";
 		genbr132(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		genbr0(instr, "shl");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		genbr0(instr, "shr");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		genbr0(instr, "sar");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		genbr0(instr, "rol");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		genbr0(instr, "ror");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_FPADD(Instruction& instr, int i) {
-		gena(instr);
-		genbf(instr, "addsd");
+		genaf(instr);
+		genbf(instr, "addpd");
 		gencf(instr);
 	}

 	void AssemblyGeneratorX86::h_FPSUB(Instruction& instr, int i) {
-		gena(instr);
-		genbf(instr, "subsd");
+		genaf(instr);
+		genbf(instr, "subpd");
 		gencf(instr);
 	}

 	void AssemblyGeneratorX86::h_FPMUL(Instruction& instr, int i) { // Emits assembly for the packed-double multiply: A (via genaf) times register regb, NaN lanes zeroed, then written back by gencf.
-		gena(instr);
-		asmCode << "\tor rax, 2048" << std::endl;
-		genbf(instr, "mulsd");
+		genaf(instr);
+		genbf(instr, "mulpd"); // xmm0 = xmm0 * regb on both lanes
+		asmCode << "\tmovaps xmm1, xmm0" << std::endl; // work on a copy so xmm0 keeps the product
+		asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl; // a lane compares equal to itself unless it is NaN: all-ones mask for non-NaN lanes, zero for NaN lanes
+		asmCode << "\tandps xmm0, xmm1" << std::endl; // NaN lanes become +0.0, other lanes are unchanged
 		gencf(instr);
 	}

 	void AssemblyGeneratorX86::h_FPDIV(Instruction& instr, int i) { // Emits assembly for the packed-double divide: A (via genaf) divided by register regb, NaN lanes zeroed, then written back by gencf.
-		gena(instr);
-		asmCode << "\tor rax, 2048" << std::endl;
-		genbf(instr, "divsd");
+		genaf(instr);
+		genbf(instr, "divpd"); // xmm0 = xmm0 / regb on both lanes
+		asmCode << "\tmovaps xmm1, xmm0" << std::endl; // work on a copy so xmm0 keeps the quotient
+		asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl; // all-ones mask for lanes that are not NaN, zero for NaN lanes
+		asmCode << "\tandps xmm0, xmm1" << std::endl; // NaN lanes become +0.0, other lanes are unchanged
 		gencf(instr);
 	}

 	void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) { // Emits assembly for the packed-double square root of |A|; operand B is not used.
-		gena(instr);
-		asmCode << "\tmov rcx, 9223372036854773760" << std::endl;
-		asmCode << "\tand rax, rcx" << std::endl;
-		asmCode << "\tcvtsi2sd xmm0, rax" << std::endl;
-		asmCode << "\tsqrtsd xmm0, xmm0" << std::endl;
+		genaf(instr);
+		asmCode << "\tandps xmm0, xmm10" << std::endl; // and with xmm10; the JIT register-allocation notes describe xmm10 as the absolute-value mask, which clears both sign bits
+		asmCode << "\tsqrtpd xmm0, xmm0" << std::endl; // square root of both lanes
 		gencf(instr);
 	}

 	void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { // Emits assembly that converts integer operand A to a double in the low lane of regc and reloads MXCSR with a rounding mode taken from A.
-		gena(instr);
+		genar(instr); // integer operand load; the code below expects the value in rax
 		asmCode << "\tmov rcx, rax" << std::endl; // rcx keeps the value to convert, eax is reused for the MXCSR word
 		asmCode << "\tshl eax, 13" << std::endl; // move bits 0-1 of A up to bit positions 13-14
 		asmCode << "\tand rcx, -2048" << std::endl; // clear the low 11 bits of the value to convert
 		asmCode << "\tand eax, 24576" << std::endl; // 24576 = 0x6000: keep only bits 13-14 (the MXCSR rounding-control field)
-		asmCode << "\tcvtsi2sd xmm0, rcx" << std::endl;
+		asmCode << "\tcvtsi2sd " << regF[instr.regc % RegistersCount] << ", rcx" << std::endl; // scalar convert: writes only the low lane of regc
 		asmCode << "\tor eax, 40896" << std::endl; // 40896 = 0x9fc0: the remaining MXCSR bits
 		asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl; // ldmxcsr takes a memory operand, so spill the word just below rsp
 		asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl;
-		gencf(instr);
+		gencf(instr, true); // alwaysLow: regc already holds the result, so skip the xmm0 copy and store the low lane only
 	}

 	static inline const char* jumpCondition(Instruction& instr, bool invert = false) {
@ -481,7 +488,7 @@ namespace RandomX {
 	}

 	void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl;
 		asmCode << "\t" << jumpCondition(instr);
 		asmCode << " short taken_call_" << i << std::endl;
@ -489,14 +496,14 @@ namespace RandomX {
 		asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl;
 		asmCode << "taken_call_" << i << ":" << std::endl;
 		if (trace) {
-			asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl;
+			asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl;
 		}
 		asmCode << "\tpush rax" << std::endl;
 		asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl;
 	}

 	void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) {
-		gena(instr);
+		genar(instr);
 		asmCode << "\tcmp rsp, rbp" << std::endl;
 		asmCode << "\tje short not_taken_ret_" << i << std::endl;
 		asmCode << "\txor rax, qword ptr [rsp + 8]" << std::endl;
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@ -38,13 +38,14 @@ namespace RandomX {
 		static InstructionGenerator engine[256];
 		std::stringstream asmCode;

-		void gena(Instruction&);
+		void genar(Instruction&);
+		void genaf(Instruction&);
 		void genbr0(Instruction&, const char*);
 		void genbr1(Instruction&);
 		void genbr132(Instruction&);
 		void genbf(Instruction&, const char*);
 		void gencr(Instruction&);
-		void gencf(Instruction&);
+		void gencf(Instruction&, bool);

 		void generateCode(Instruction&, int);

--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@ -26,9 +26,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 namespace RandomX {

 	CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) {
-#if !defined(_M_X64) && !defined(__x86_64__)
-		throw std::runtime_error("Compiled VM only supports x86-64 CPUs");
-#endif
+
 	}

 	void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) {
@ -51,7 +49,7 @@ namespace RandomX {
 	void CompiledVirtualMachine::execute() {
 		//executeProgram(reg, mem, scratchpad, readDataset);
 		compiler.getProgramFunc()(reg, mem, scratchpad);
-#ifdef TRACE
+#ifdef TRACEVM
 		for (int32_t i = InstructionCount - 1; i >= 0; --i) {
 			std::cout << std::hex << tracepad[i].u64 << std::endl;
 		}
--- a/src/CompiledVirtualMachine.hpp
+++ b/src/CompiledVirtualMachine.hpp
@ -18,7 +18,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

 #pragma once
-//#define TRACE
+//#define TRACEVM
 #include "VirtualMachine.hpp"
 #include "JitCompilerX86.hpp"

@ -34,7 +34,7 @@ namespace RandomX {
 			return compiler.getCode();
 		}
 	private:
-#ifdef TRACE
+#ifdef TRACEVM
 		convertible_t tracepad[InstructionCount];
 #endif
 		JitCompilerX86 compiler;
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@ -44,9 +44,11 @@ namespace RandomX {
 			*(((uint32_t*)&reg) + i) = gen();
 		}
 		FPINIT();
-		for (int i = 0; i < 8; ++i) {
-			reg.f[i].f64 = (double)reg.f[i].i64;
+		for (int i = 0; i < RegistersCount; ++i) {
+			reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
+			reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
 		}
+		//std::cout << reg;
 		p.initialize(gen);
 		mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7;
 		mem.mx = *(((uint32_t*)seed) + 5);
@ -119,9 +121,9 @@ namespace RandomX {
 			case 1:
 			case 2:
 			case 3:
+				return reg.r[inst.regb % RegistersCount];
 			case 4:
 			case 5:
-			return reg.r[inst.regb % RegistersCount];
 			case 6:
 			case 7:
 				convertible_t temp;
@ -130,22 +132,6 @@ namespace RandomX {
 		}
 	}

-	double InterpretedVirtualMachine::loadbf(Instruction& inst) {
-		switch (inst.locb & 7)
-		{
-		case 0:
-		case 1:
-		case 2:
-		case 3:
-		case 4:
-		case 5:
-			return reg.f[inst.regb % RegistersCount].f64;
-		case 6:
-		case 7:
-			return (double)inst.imm32;
-		}
-	}
-
 	convertible_t& InterpretedVirtualMachine::getcr(Instruction& inst) {
 		addr_t addr;
 		switch (inst.locc & 7)
@ -168,25 +154,43 @@ namespace RandomX {
 		}
 	}

-	convertible_t& InterpretedVirtualMachine::getcf(Instruction& inst) {
+	void InterpretedVirtualMachine::writecf(Instruction& inst, fpu_reg_t& regc) { // Copies one 64-bit half of the FPU result regc to the scratchpad when locc & 7 >= 4; bit 3 of locc selects hi or lo. Does nothing for locc & 7 in 0-3.
 		addr_t addr;
 		switch (inst.locc & 7)
 		{
-		case 0:
-			addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
-			return scratchpad[addr % ScratchpadL2];
-
-		case 1:
-		case 2:
-		case 3:
-			addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
-			return scratchpad[addr % ScratchpadL1];
-
 			case 4:
+				addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; // address = low 32 bits of integer register regc xor addrc
+				scratchpad[addr % ScratchpadL2] = (inst.locc & 8) ? regc.hi : regc.lo; // index wrapped modulo ScratchpadL2
+				break;
+
 			case 5:
 			case 6:
 			case 7:
-			return reg.f[inst.regc % RegistersCount];
+				addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
+				scratchpad[addr % ScratchpadL1] = (inst.locc & 8) ? regc.hi : regc.lo; // index wrapped modulo ScratchpadL1; falls through into default, which only breaks
+
+			default:
+				break;
+		}
+	}
+
+	void InterpretedVirtualMachine::writecflo(Instruction& inst, fpu_reg_t& regc) { // Like writecf but always stores the low half regc.lo, ignoring bit 3 of locc; used by the FPROUND retire macro.
+		addr_t addr;
+		switch (inst.locc & 7) // values 0-3 write nothing to the scratchpad
+		{
+			case 4:
+				addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; // address = low 32 bits of integer register regc xor addrc
+				scratchpad[addr % ScratchpadL2] = regc.lo; // index wrapped modulo ScratchpadL2
+				break;
+
+			case 5:
+			case 6:
+			case 7:
+				addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
+				scratchpad[addr % ScratchpadL1] = regc.lo; // index wrapped modulo ScratchpadL1; falls through into default, which only breaks
+
+			default:
+				break;
+		}
+	}

@ -194,22 +198,18 @@ namespace RandomX {
 	if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;

 #define FPU_RETIRE(x) x(a, b, c); \
+	writecf(inst, c); \
 	if(trace) { \
-		convertible_t bc; \
-		bc.f64 = b; \
-		std::cout << std::hex << /*a.u64 << " " << bc.u64 << " " <<*/ c.u64 << std::endl; \
+		std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl; \
 	} \
 	if(fpuCheck) { \
-		convertible_t bc; \
-		if(c.f64 != c.f64)  { \
+		if(c.hi.f64 != c.hi.f64 || c.lo.f64 != c.lo.f64)  { \
 			std::stringstream ss; \
-			bc.f64 = b; \
-			ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << bc.u64 << ") = " << c.u64; \
+			ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
 			throw std::runtime_error(ss.str()); \
-		} else if (std::fpclassify(c.f64) == FP_SUBNORMAL) {\
+		} else if (std::fpclassify(c.hi.f64) == FP_SUBNORMAL || std::fpclassify(c.lo.f64) == FP_SUBNORMAL) {\
 			std::stringstream ss; \
-			bc.f64 = b; \
-			ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << bc.u64 << ") = " << c.u64; \
+			ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
 			throw std::runtime_error(ss.str()); \
 		} \
 	}
@ -220,8 +220,13 @@ namespace RandomX {
 #define INC_COUNT(x)
 #endif

-#define FPU_RETIRE_NB(x) x(a, b, c); \
-	if(trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl;
+#define FPU_RETIRE_FPSQRT(x) FPSQRT(a, b, c); \
+	writecf(inst, c); \
+	if(trace) std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl;
+
+#define FPU_RETIRE_FPROUND(x) FPROUND(a, b, c); \
+	writecflo(inst, c); \
+	if(trace) std::cout << std::hex << c.lo.u64 << std::endl;

 #define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
 	INC_COUNT(x) \
@ -242,17 +247,17 @@ namespace RandomX {
 #define FPU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
 	INC_COUNT(x) \
 	convertible_t a = loada(inst); \
-	double b = loadbf(inst); \
-	convertible_t& c = getcf(inst); \
+	fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \
+	fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
 	FPU_RETIRE(x) \
 	}

 #define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
 	INC_COUNT(x) \
 	convertible_t a = loada(inst); \
-	convertible_t b; \
-	convertible_t& c = getcf(inst); \
-	FPU_RETIRE_NB(x) \
+	fpu_reg_t b; \
+	fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
+	FPU_RETIRE_##x(x) \
 	}

 	ALU_INST(ADD_64)
--- a/src/InterpretedVirtualMachine.hpp
+++ b/src/InterpretedVirtualMachine.hpp
@ -18,7 +18,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

 #pragma once
-#define STATS
+//#define STATS
 #include "VirtualMachine.hpp"
 #include "Program.hpp"
 #include <vector>
@ -88,9 +88,9 @@ namespace RandomX {
 		convertible_t loada(Instruction&);
 		convertible_t loadbr0(Instruction&);
 		convertible_t loadbr1(Instruction&);
-		double loadbf(Instruction&);
 		convertible_t& getcr(Instruction&);
-		convertible_t& getcf(Instruction&);
+		void writecf(Instruction&, fpu_reg_t&);
+		void writecflo(Instruction&, fpu_reg_t&);

 		void stackPush(convertible_t& c) {
 			stack.push_back(c);
--- a/src/JitCompilerX86-static.S
+++ b/src/JitCompilerX86-static.S
@ -0,0 +1,58 @@
+;# Copyright (c) 2018 tevador
+;#
+;# This file is part of RandomX.
+;#
+;# RandomX is free software: you can redistribute it and/or modify
+;# it under the terms of the GNU General Public License as published by
+;# the Free Software Foundation, either version 3 of the License, or
+;# (at your option) any later version.
+;#
+;# RandomX is distributed in the hope that it will be useful,
+;# but WITHOUT ANY WARRANTY; without even the implied warranty of
+;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;# GNU General Public License for more details.
+;#
+;# You should have received a copy of the GNU General Public License
+;# along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+
+.intel_syntax noprefix
+#if defined(__APPLE__)
+.text
+#else
+.section .text
+#endif
+#if defined(__WIN32__) || defined(__APPLE__)
+#define DECL(x) _##x
+#else
+#define DECL(x) x
+#endif
+.global DECL(randomx_program_prologue)
+.global DECL(randomx_program_begin)
+.global DECL(randomx_program_epilogue)
+.global DECL(randomx_program_read_r)
+.global DECL(randomx_program_read_f)
+.global DECL(randomx_program_end)
+
+.align 64
+DECL(randomx_program_prologue):
+	#include "asm/program_prologue_linux.inc"
+
+.align 64
+DECL(randomx_program_begin):
+	nop
+
+.align 64
+DECL(randomx_program_epilogue):
+	#include "asm/program_epilogue_linux.inc"
+
+.align 64
+DECL(randomx_program_read_r):
+	#include "asm/program_read_r.inc"
+
+.align 64
+DECL(randomx_program_read_f):
+	#include "asm/program_read_f.inc"
+
+.align 64
+DECL(randomx_program_end):
+	nop
--- a/src/JitCompilerX86-static.asm
+++ b/src/JitCompilerX86-static.asm
@ -0,0 +1,59 @@
+;# Copyright (c) 2018 tevador
+;#
+;# This file is part of RandomX.
+;#
+;# RandomX is free software: you can redistribute it and/or modify
+;# it under the terms of the GNU General Public License as published by
+;# the Free Software Foundation, either version 3 of the License, or
+;# (at your option) any later version.
+;#
+;# RandomX is distributed in the hope that it will be useful,
+;# but WITHOUT ANY WARRANTY; without even the implied warranty of
+;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;# GNU General Public License for more details.
+;#
+;# You should have received a copy of the GNU General Public License
+;# along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+
+_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
+
+PUBLIC randomx_program_prologue
+PUBLIC randomx_program_begin
+PUBLIC randomx_program_epilogue
+PUBLIC randomx_program_read_r
+PUBLIC randomx_program_read_f
+PUBLIC randomx_program_end
+
+ALIGN 64
+randomx_program_prologue PROC
+	include asm/program_prologue_win64.inc
+randomx_program_prologue ENDP
+
+ALIGN 64
+randomx_program_begin PROC
+	nop
+randomx_program_begin ENDP
+
+ALIGN 64
+randomx_program_epilogue PROC
+	include asm/program_epilogue_win64.inc
+randomx_program_epilogue ENDP
+
+ALIGN 64
+randomx_program_read_r PROC
+	include asm/program_read_r.inc
+randomx_program_read_r ENDP
+
+ALIGN 64
+randomx_program_read_f PROC
+	include asm/program_read_f.inc
+randomx_program_read_f ENDP
+
+ALIGN 64
+randomx_program_end PROC
+	nop
+randomx_program_end ENDP
+
+_RANDOMX_JITX86_STATIC ENDS
+
+END
--- a/src/JitCompilerX86-static.hpp
+++ b/src/JitCompilerX86-static.hpp
@ -0,0 +1,27 @@
+/*
+Copyright (c) 2018 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+extern "C" { // Labels exported by the static assembly (JitCompilerX86-static.S / .asm); JitCompilerX86.cpp only takes their addresses to measure and copy the code between them.
+  void randomx_program_prologue(); // start of the prologue code
+  void randomx_program_begin(); // first label after the prologue
+  void randomx_program_epilogue(); // start of the epilogue code
+  void randomx_program_read_r(); // start of the code included from asm/program_read_r.inc
+  void randomx_program_read_f(); // start of the code included from asm/program_read_f.inc
+  void randomx_program_end(); // label after the last routine
+}
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@ -34,6 +34,16 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 namespace RandomX {

+#if !defined(_M_X64) && !defined(__x86_64__)
+	JitCompilerX86::JitCompilerX86() { // Stub compiled when neither _M_X64 nor __x86_64__ is defined: construction always fails.
+		throw std::runtime_error("JIT compiler only supports x86-64 CPUs");
+	}
+
+	void JitCompilerX86::generateProgram(Pcg32& gen) { // Empty stub for non-x86-64 builds; the constructor in this branch always throws.
+
+	}
+#else
+
 	/*
 	 REGISTER ALLOCATION:

@ -41,7 +51,7 @@ namespace RandomX {
 	 rbx -> MemoryRegisters& memory
 	 rcx -> temporary
 	 rdx -> temporary
-	 rsi -> convertible_t& scratchpad
+	 rsi -> convertible_t* scratchpad
 	 rdi -> "ic" (instruction counter)
 	 rbp -> beginning of VM stack
 	 rsp -> end of VM stack
@ -63,6 +73,7 @@ namespace RandomX {
 	 xmm7 -> "f7"
 	 xmm8 -> "f0"
 	 xmm9 -> "f1"
+	 xmm10 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff

 	 STACK STRUCTURE:

@ -81,127 +92,23 @@ namespace RandomX {

 	*/

-	constexpr uint8_t ic3 = ((InstructionCount + 1) >> 24);
-	constexpr uint8_t ic2 = ((InstructionCount + 1) >> 16);
-	constexpr uint8_t ic1 = ((InstructionCount + 1) >> 8);
-	constexpr uint8_t ic0 = ((InstructionCount + 1) >> 0);
+#include "JitCompilerX86-static.hpp"

-	const uint8_t prologue[] = {
-	   0x53,                                        //push   rbx
-	   0x55,                                        //push   rbp
-#ifdef _WIN32
-	   0x57,                                        //push   rdi
-	   0x56,                                        //push   rsi
-#endif
-	   0x41, 0x54,                                  //push   r12
-	   0x41, 0x55,                                  //push   r13
-	   0x41, 0x56,                                  //push   r14
-	   0x41, 0x57,                                  //push   r15
-#ifdef _WIN32
-	   0x48, 0x83, 0xec, 0x48,                      //sub    rsp,0x48
-	   0xf3, 0x0f, 0x7f, 0x74, 0x24, 0x30,          //movdqu XMMWORD PTR[rsp + 0x30],xmm6
-	   0xf3, 0x0f, 0x7f, 0x7c, 0x24, 0x20,          //movdqu XMMWORD PTR[rsp + 0x20],xmm7
-	   0xf3, 0x44, 0x0f, 0x7f, 0x44, 0x24, 0x10,    //movdqu XMMWORD PTR[rsp + 0x10],xmm8
-	   0xf3, 0x44, 0x0f, 0x7f, 0x0c, 0x24,          //movdqu XMMWORD PTR[rsp],xmm9
-	   0x51,                                        //push   rcx
-	   0x48, 0x8b, 0xda,                            //mov    rbx,rdx
-	   0x49, 0x8b, 0xf0,                            //mov    rsi,r8
-#else
-	   0x57,                                        //push   rdi
-	   0x48, 0x8b, 0xde,                            //mov    rbx, rsi
-	   0x48, 0x8b, 0xf2,                            //mov    rsi, rdx
-	   0x48, 0x8b, 0xcf,                            //mov    rcx, rdi
-#endif
-	   0x48, 0x8b, 0xec,                            //mov    rbp,rsp
-	   0x48, 0xc7, 0xc7, ic0, ic1, ic2, ic3,        //mov    rdi, "InstructionCount"
-	   0x4c, 0x8b, 0x01,                            //mov    r8,QWORD PTR[rcx]
-	   0x4c, 0x8b, 0x49, 0x08,                      //mov    r9,QWORD PTR[rcx+0x8]
-	   0x4c, 0x8b, 0x51, 0x10,                      //mov    r10,QWORD PTR[rcx+0x10]
-	   0x4c, 0x8b, 0x59, 0x18,                      //mov    r11,QWORD PTR[rcx+0x18]
-	   0x4c, 0x8b, 0x61, 0x20,                      //mov    r12,QWORD PTR[rcx+0x20]
-	   0x4c, 0x8b, 0x69, 0x28,                      //mov    r13,QWORD PTR[rcx+0x28]
-	   0x4c, 0x8b, 0x71, 0x30,                      //mov    r14,QWORD PTR[rcx+0x30]
-	   0x4c, 0x8b, 0x79, 0x38,                      //mov    r15,QWORD PTR[rcx+0x38]
-	   0xc7, 0x44, 0x24, 0xf8, 0xc0, 0x9f, 0x00,    //mov    DWORD PTR[rsp-0x8],0x9fc0
-	   0x00,
-	   0x0f, 0xae, 0x54, 0x24, 0xf8,                //ldmxcsr DWORD PTR[rsp-0x8]
-	   0xf2, 0x4c, 0x0f, 0x2a, 0x41, 0x40,          //cvtsi2sd xmm8,QWORD PTR[rcx+0x40]
-	   0xf2, 0x4c, 0x0f, 0x2a, 0x49, 0x48,          //cvtsi2sd xmm9,QWORD PTR[rcx+0x48]
-	   0xf2, 0x48, 0x0f, 0x2a, 0x51, 0x50,          //cvtsi2sd xmm2,QWORD PTR[rcx+0x50]
-	   0xf2, 0x48, 0x0f, 0x2a, 0x59, 0x58,          //cvtsi2sd xmm3,QWORD PTR[rcx+0x58]
-	   0xf2, 0x48, 0x0f, 0x2a, 0x61, 0x60,          //cvtsi2sd xmm4,QWORD PTR[rcx+0x60]
-	   0xf2, 0x48, 0x0f, 0x2a, 0x69, 0x68,          //cvtsi2sd xmm5,QWORD PTR[rcx+0x68]
-	   0xf2, 0x48, 0x0f, 0x2a, 0x71, 0x70,          //cvtsi2sd xmm6,QWORD PTR[rcx+0x70]
-	   0xf2, 0x48, 0x0f, 0x2a, 0x79, 0x78,          //cvtsi2sd xmm7,QWORD PTR[rcx+0x78]
-	};
+	const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; // byte pointers to the labels of the static assembly code, in the order the labels are declared
+	const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin; // end of the prologue
+	const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
+	const uint8_t* codeReadDatasetR = (uint8_t*)&randomx_program_read_r;
+	const uint8_t* codeReadDatasetF = (uint8_t*)&randomx_program_read_f;
+	const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; // end of the last routine

-	const uint8_t epilogue[] = {
-	   0x48, 0x8b, 0xe5,                         //mov    rsp,rbp
-	   0x59,                                     //pop    rcx
-	   0x4c, 0x89, 0x01,                         //mov    QWORD PTR [rcx],r8
-	   0x4c, 0x89, 0x49, 0x08,                   //mov    QWORD PTR [rcx+0x8],r9
-	   0x4c, 0x89, 0x51, 0x10,                   //mov    QWORD PTR [rcx+0x10],r10
-	   0x4c, 0x89, 0x59, 0x18,                   //mov    QWORD PTR [rcx+0x18],r11
-	   0x4c, 0x89, 0x61, 0x20,                   //mov    QWORD PTR [rcx+0x20],r12
-	   0x4c, 0x89, 0x69, 0x28,                   //mov    QWORD PTR [rcx+0x28],r13
-	   0x4c, 0x89, 0x71, 0x30,                   //mov    QWORD PTR [rcx+0x30],r14
-	   0x4c, 0x89, 0x79, 0x38,                   //mov    QWORD PTR [rcx+0x38],r15
-	   0x66, 0x4c, 0x0f, 0x7e, 0x41, 0x40,       //movq   QWORD PTR [rcx+0x40],xmm8
-	   0x66, 0x4c, 0x0f, 0x7e, 0x49, 0x48,       //movq   QWORD PTR [rcx+0x48],xmm9
-	   0x66, 0x48, 0x0f, 0x7e, 0x51, 0x50,       //movq   QWORD PTR [rcx+0x50],xmm2
-	   0x66, 0x48, 0x0f, 0x7e, 0x59, 0x58,       //movq   QWORD PTR [rcx+0x58],xmm3
-	   0x66, 0x48, 0x0f, 0x7e, 0x61, 0x60,       //movq   QWORD PTR [rcx+0x60],xmm4
-	   0x66, 0x48, 0x0f, 0x7e, 0x69, 0x68,       //movq   QWORD PTR [rcx+0x68],xmm5
-	   0x66, 0x48, 0x0f, 0x7e, 0x71, 0x70,       //movq   QWORD PTR [rcx+0x70],xmm6
-	   0x66, 0x48, 0x0f, 0x7e, 0x79, 0x78,       //movq   QWORD PTR [rcx+0x78],xmm7
-#ifdef _WIN32
-	   0xf3, 0x44, 0x0f, 0x6f, 0x0c, 0x24,       //movdqu xmm9,XMMWORD PTR [rsp]
-	   0xf3, 0x44, 0x0f, 0x6f, 0x44, 0x24, 0x10, //movdqu xmm8,XMMWORD PTR [rsp+0x10]
-	   0xf3, 0x0f, 0x6f, 0x7c, 0x24, 0x20,       //movdqu xmm7,XMMWORD PTR [rsp+0x20]
-	   0xf3, 0x0f, 0x6f, 0x74, 0x24, 0x30,       //movdqu xmm6,XMMWORD PTR [rsp+0x30]
-	   0x48, 0x83, 0xc4, 0x48,                   //add    rsp,0x48
-#endif
-	   0x41, 0x5f,                               //pop    r15
-	   0x41, 0x5e,                               //pop    r14
-	   0x41, 0x5d,                               //pop    r13
-	   0x41, 0x5c,                               //pop    r12
-#ifdef _WIN32
-	   0x5e,                                     //pop    rsi
-	   0x5f,                                     //pop    rdi
-#endif
-	   0x5d,                                     //pop    rbp
-	   0x5b,                                     //pop    rbx
-	   0xc3,                                     //ret
-	};
+	const int32_t prologueSize = codeProgramBegin - codePrologue; // each size is the distance to the next label, so it relies on the label order in the static assembly and includes any alignment padding
+	const int32_t epilogueSize = codeReadDatasetR - codeEpilogue;
+	const int32_t readDatasetRSize = codeReadDatasetF - codeReadDatasetR;
+	const int32_t readDatasetFSize = codeProgramEnd - codeReadDatasetF;

-	//41 bytes -> 1 cache line
-	const uint8_t readDatasetSub[] = {
-	   0x8b, 0x13,                                  //mov    edx,DWORD PTR [rbx]
-	   0x48, 0x8b, 0x43, 0x08,                      //mov    rax,QWORD PTR [rbx+0x8]
-	   0x48, 0x8b, 0x04, 0x10,                      //mov    rax,QWORD PTR [rax+rdx*1]
-	   0x83, 0x03, 0x08,                            //add    DWORD PTR [rbx],0x8
-	   0x33, 0x4b, 0x04,                            //xor    ecx,DWORD PTR [rbx+0x4]
-	   0x89, 0x4b, 0x04,                            //mov    DWORD PTR [rbx+0x4],ecx
-	   0xf7, 0xc1, 0xf8, 0xff, 0x00, 0x00,          //test   ecx,0xfff8
-	   0x75, 0x0d,                                  //jne
-	   0x83, 0xe1, 0xf8,                            //and    ecx,0xfffffff8
-	   0x89, 0x0b,                                  //mov    DWORD PTR [rbx],ecx
-	   0x48, 0x8b, 0x53, 0x08,                      //mov    rdx,QWORD PTR [rbx+0x8]
-	   0x0f, 0x18, 0x0c, 0x0a,                      //prefetcht0 BYTE PTR [rdx+rcx*1]
-	   0xc3,                                        //ret
-	};
-
-	constexpr int getNumCacheLines(size_t size) {
-		return (size + (CacheLineSize - 1)) / CacheLineSize;
-	}
-
-	constexpr int32_t align(int32_t pos, int32_t align) {
-		return ((pos - 1) / align + 1) * align;
-	}
-
-	constexpr int32_t readDatasetSubOffset = CodeSize - CacheLineSize * getNumCacheLines(sizeof(readDatasetSub));
-	constexpr int32_t epilogueOffset = readDatasetSubOffset - CacheLineSize * getNumCacheLines(sizeof(epilogue));
-	constexpr int32_t startOffsetAligned = align(sizeof(prologue), CacheLineSize);
+	const int32_t readDatasetFOffset = CodeSize - readDatasetFSize; // layout at the end of the code buffer: epilogue, then read_r, then read_f ending exactly at CodeSize
+	const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize;
+	const int32_t epilogueOffset = readDatasetROffset - epilogueSize; // also the target of the jump emitted by generateCode

 	JitCompilerX86::JitCompilerX86() {
 #ifdef _WIN32
@ -213,24 +120,16 @@ namespace RandomX {
 		if (code == (uint8_t*)-1)
 			throw std::runtime_error("mmap failed");
 #endif
-		memcpy(code, prologue, sizeof(prologue));
-		codePos = sizeof(prologue);
-		if (startOffsetAligned - codePos > 4) {
-			emitByte(0xeb);
-			emitByte(startOffsetAligned - (codePos + 1));
-		}
-		else {
-			while (codePos < startOffsetAligned)
-				emitByte(0x90); //nop
-		}
-		memcpy(code + readDatasetSubOffset, readDatasetSub, sizeof(readDatasetSub));
-		memcpy(code + epilogueOffset, epilogue, sizeof(epilogue));
+		memcpy(code, codePrologue, prologueSize);
+		memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize);
+		memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize);
+		memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize);
 	}

 	void JitCompilerX86::generateProgram(Pcg32& gen) {
 		instructionOffsets.clear();
 		callOffsets.clear();
-		codePos = startOffsetAligned;
+		codePos = prologueSize;
 		Instruction instr;
 		for (unsigned i = 0; i < ProgramLength; ++i) {
 			for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) {
@ -247,7 +146,6 @@ namespace RandomX {
 		instructionOffsets.push_back(codePos);
 		emit(0x840fcfff); //dec edi; jz <epilogue>
 		emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
-		gena(instr);
 		auto generator = engine[instr.opcode];
 		(this->*generator)(instr, i);
 	}
@ -258,11 +156,10 @@ namespace RandomX {
 		}
 	}

-	void JitCompilerX86::gena(Instruction& instr) {
+	void JitCompilerX86::genar(Instruction& instr) {
 		emit(uint16_t(0x8149)); //xor
 		emitByte(0xf0 + (instr.rega % RegistersCount));
 		emit(instr.addra);
-		int32_t pc;
 		switch (instr.loca & 7)
 		{
 			case 0:
@ -272,7 +169,7 @@ namespace RandomX {
 				emit(uint16_t(0x8b41)); //mov
 				emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
 				emitByte(0xe8); //call
-				emit(readDatasetSubOffset - (codePos + 4));
+				emit(readDatasetROffset - (codePos + 4));
 				return;

 			case 4:
@ -293,8 +190,44 @@ namespace RandomX {
 		}
 	}

+	//Emits the code that fetches operand A of a floating point instruction into xmm0.
+	//Register rega is first xored with the immediate addra. For loca 0-3 the
+	//register value is passed in ecx to the dataset read routine located at
+	//readDatasetFOffset; for loca 4 the whole scratchpad is addressed and for
+	//loca 5-7 only its first 16 KiB. Scratchpad reads use cvtdq2pd.
+	void JitCompilerX86::genaf(Instruction& instr) {
+		const auto srcReg = instr.rega % RegistersCount;
+		const auto location = instr.loca & 7;
+
+		emit(uint16_t(0x8149)); //xor
+		emitByte(0xf0 + srcReg);
+		emit(instr.addra);
+
+		//every variant continues with a 32-bit mov from rega
+		emit(uint16_t(0x8b41)); //mov
+
+		if (location <= 3) {
+			emitByte(0xc8 + srcReg); //ecx, rega
+			emitByte(0xe8); //call
+			emit(readDatasetFOffset - (codePos + 4));
+			return;
+		}
+
+		emitByte(0xc0 + srcReg); //eax, rega
+		emitByte(0x25); //and
+		if (location == 4) {
+			emit(ScratchpadL2 - 1); //whole scratchpad
+		}
+		else {
+			emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
+		}
+		emitByte(0xf3);
+		emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
+	}
+
 	void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
-		if ((instr.locb & 7) <= 5) {
+		if ((instr.locb & 7) <= 3) {
 			emit(uint16_t(0x8b49)); //mov
 			emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb
 			emitByte(0x48); //REX.W
@ -330,12 +263,8 @@ namespace RandomX {
 	}

 	void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) {
-		emit(0x48f2fffff8002548); //and rax,0xfffffffffffff800; cvtsi2sd xmm0,rax
-		emit(uint16_t(0x2a0f));
-		emitByte(0xc0);
-		if ((instr.locb & 7) <= 5) {
 		int regb = (instr.regb % RegistersCount);
-			emitByte(0xf2); //xxxsd  xmm0,regb
+		emitByte(0x66); //xxxpd  xmm0,regb
 		if (regb <= 1) {
 			emitByte(0x41); //REX
 		}
@ -343,44 +272,30 @@ namespace RandomX {
 		emitByte(opcode);
 		emitByte(0xc0 + regb);
 	}
-		else {
-			convertible_t bimm;
-			bimm.f64 = (double)instr.imm32;
-			emit(uint16_t(0xb848)); //movabs rax,imm64
-			emit(bimm.i64);
-			emitByte(0x66); //movq xmm1,rax
-			emit(0xc86e0f48);
-			emit(uint16_t(0x0ff2)); //xxxsd xmm0,xmm1
-			emitByte(opcode);
-			emitByte(0xc1);
-		}
+
+
+	void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) { //emits a store of rax to scratchpad slot ((regc ^ addrc) & (scratchpadSize - 1))
+		emit(0x41c88b48); //mov rcx, rax; REX (the trailing 0x41 is the prefix of the next mov)
+		emitByte(0x8b); // mov
+		emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
+		emitByte(0x35); // xor eax
+		emit(instr.addrc); //imm32 operand of the xor
+		emitByte(0x25); //and
+		emit(scratchpadSize - 1); //index mask; assumes scratchpadSize is a power of two - TODO confirm
+		emit(0xc60c8948); // mov    QWORD PTR [rsi+rax*8],rcx
 	}

 	void JitCompilerX86::gencr(Instruction& instr) {
 		switch (instr.locc & 7)
 		{
 			case 0:
-			emit(0x41c88b48); //mov rcx, rax; REX
-			emitByte(0x8b); // mov
-			emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
-			emitByte(0x35); // xor eax
-			emit(instr.addrc);
-			emitByte(0x25); //and
-			emit(ScratchpadL2 - 1); //whole scratchpad
-			emit(0xc60c8948); // mov    QWORD PTR [rsi+rax*8],rcx
+				scratchpadStoreR(instr, ScratchpadL2);
 				break;

 			case 1:
 			case 2:
 			case 3:
-			emit(0x41c88b48); //mov rcx, rax; REX
-			emitByte(0x8b); // mov
-			emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
-			emitByte(0x35); // xor eax
-			emit(instr.addrc);
-			emitByte(0x25); //and
-			emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
-			emit(0xc60c8948); // mov    QWORD PTR [rsi+rax*8],rcx
+				scratchpadStoreR(instr, ScratchpadL1);
 				break;

 			default:
@ -390,66 +305,75 @@ namespace RandomX {
 		}
 	}

-	void JitCompilerX86::gencf(Instruction& instr) {
-		int regc = (instr.regc % RegistersCount);
-		switch (instr.locc & 7)
-		{
-		case 0:
+	void JitCompilerX86::scratchpadStoreF(Instruction& instr, int regc, uint32_t scratchpadSize, bool storeHigh) {
 		emit(uint16_t(0x8b41)); //mov
 		emitByte(0xc0 + regc); //eax, regc
 		emitByte(0x35); // xor eax
 		emit(instr.addrc);
 		emitByte(0x25); //and
-			emit(ScratchpadL2 - 1); //whole scratchpad
-			emit(uint16_t(0x4866)); //prefix
-			emit(0xc6047e0f); // movq   QWORD PTR [rsi+rax*8],xmm0
-			break;
-
-		case 1:
-		case 2:
-		case 3:
-			emit(uint16_t(0x8b41)); //mov
-			emitByte(0xc0 + regc); //eax, regc
-			emitByte(0x35); // xor eax
-			emit(instr.addrc);
-			emitByte(0x25); //and
-			emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
-			emit(uint16_t(0x4866)); //prefix
-			emit(0xc6047e0f); // movq   QWORD PTR [rsi+rax*8],xmm0
-			break;
-
-		default:
-			emitByte(0xf2);
+		emit(scratchpadSize - 1);
+		emitByte(0x66); //movhpd/movlpd QWORD PTR [rsi+rax*8], regc
 		if (regc <= 1) {
 			emitByte(0x44); //REX
 		}
-			emit(uint16_t(0x100f)); //movsd
+		emitByte(0x0f);
+		emitByte(storeHigh ? 0x17 : 0x13);
+		emitByte(4 + 8 * regc);
+		emitByte(0xc6);
+	}
+
+	void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) { //writes back the FP result; alwaysLow skips the xmm0 copy and forces a low-half scratchpad store
+		int regc = (instr.regc % RegistersCount);
+		if (!alwaysLow) { //result is in xmm0 and must be copied to the destination register
+			if (regc <= 1) {
+				emitByte(0x44); //REX (REX.R, so regc 0/1 address xmm8/xmm9)
+			}
+			emit(uint16_t(0x280f)); //movaps
 			emitByte(0xc0 + 8 * regc); // regc, xmm0
+		}
+		switch (instr.locc & 7)
+		{
+			case 4: //store within the whole scratchpad
+				scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8)); //bit 3 of locc selects the high half
+				break;
+
+			case 5:
+			case 6:
+			case 7: //store within the ScratchpadL1 range
+				scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8)); //bit 3 of locc selects the high half
+				break;
+
+			default: //locc 0-3: no scratchpad store is emitted
 				break;
 		}
 	}

 	void JitCompilerX86::h_ADD_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr1(instr, 0x0349, 0x0548);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_ADD_32(Instruction& instr, int i) {
+		genar(instr);
 		genbr132(instr, 0x0341, 0x05);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SUB_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr1(instr, 0x2b49, 0x2d48);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SUB_32(Instruction& instr, int i) {
+		genar(instr);
 		genbr132(instr, 0x2b41, 0x2d);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_MUL_64(Instruction& instr, int i) {
+		genar(instr);
 		if ((instr.locb & 7) <= 5) {
 			emitByte(0x49); //REX
 			emit(uint16_t(0xaf0f)); // imul rax, r64
@ -464,6 +388,7 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_MULH_64(Instruction& instr, int i) {
+		genar(instr);
 		if ((instr.locb & 7) <= 5) {
 			emit(uint16_t(0x8b49)); //mov rcx, r64
 			emitByte(0xc8 + (instr.regb % RegistersCount));
@ -481,6 +406,7 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_MUL_32(Instruction& instr, int i) {
+		genar(instr);
 		emit(uint16_t(0xc88b)); //mov ecx, eax
 		if ((instr.locb & 7) <= 5) {
 			emit(uint16_t(0x8b41)); // mov eax, r32
@ -495,6 +421,7 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) {
+		genar(instr);
 		emitByte(0x48);
 		emit(uint16_t(0xc863)); //movsxd rcx,eax
 		if ((instr.locb & 7) <= 5) {
@ -511,6 +438,7 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) {
+		genar(instr);
 		if ((instr.locb & 7) <= 5) {
 			emit(uint16_t(0x8b49)); //mov rcx, r64
 			emitByte(0xc8 + (instr.regb % RegistersCount));
@ -528,6 +456,7 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_DIV_64(Instruction& instr, int i) {
+		genar(instr);
 		if ((instr.locb & 7) <= 5) {
 			emitByte(0xb9); //mov ecx, 1
 			emit(1);
@ -546,6 +475,7 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) {
+		genar(instr);
 		if ((instr.locb & 7) <= 5) {
 			emit(uint16_t(0x8b41)); //mov edx, r32
 			emitByte(0xd0 + (instr.regb % RegistersCount));
@ -563,100 +493,127 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_AND_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr1(instr, 0x2349, 0x2548);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_AND_32(Instruction& instr, int i) {
+		genar(instr);
 		genbr132(instr, 0x2341, 0x25);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_OR_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr1(instr, 0x0b49, 0x0d48);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_OR_32(Instruction& instr, int i) {
+		genar(instr);
 		genbr132(instr, 0x0b41, 0x0d);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_XOR_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr1(instr, 0x3349, 0x3548);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_XOR_32(Instruction& instr, int i) {
+		genar(instr);
 		genbr132(instr, 0x3341, 0x35);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SHL_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr0(instr, 0xe0d3, 0xe0c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SHR_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr0(instr, 0xe8d3, 0xe8c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SAR_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr0(instr, 0xf8d3, 0xf8c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_ROL_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr0(instr, 0xc0d3, 0xc0c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_ROR_64(Instruction& instr, int i) {
+		genar(instr);
 		genbr0(instr, 0xc8d3, 0xc8c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_FPADD(Instruction& instr, int i) {
+		genaf(instr);
 		genbf(instr, 0x58);
 		gencf(instr);
 	}

 	void JitCompilerX86::h_FPSUB(Instruction& instr, int i) {
+		genaf(instr);
 		genbf(instr, 0x5c);
 		gencf(instr);
 	}

 	void JitCompilerX86::h_FPMUL(Instruction& instr, int i) {
-		emit(uint16_t(0x0d48)); //or rax,0x800
-		emit(0x00000800);
+		genaf(instr);
 		genbf(instr, 0x59);
+		emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
+		emit(uint16_t(0x540f)); //andps  xmm0,xmm1
+		emitByte(0xc1);
 		gencf(instr);
 	}

 	void JitCompilerX86::h_FPDIV(Instruction& instr, int i) {
-		emit(uint16_t(0x0d48)); //or rax,0x800
-		emit(0x00000800);
+		genaf(instr);
 		genbf(instr, 0x5e);
+		emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
+		emit(uint16_t(0x540f)); //andps  xmm0,xmm1
+		emitByte(0xc1);
 		gencf(instr);
 	}

 	void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) {
-		emit(uint16_t(0xb948)); //or movabs rcx, imm64
-		emit(0x7ffffffffffff800);
-		emit(0xc02a0f48f2c12348); //and rax,rcx; cvtsi2sd xmm0,rax
-		emit(0xc0510ff2); //sqrtsd xmm0,xmm0
+		genaf(instr);
+		emit(0xc0510f66c2540f41); //andps  xmm0,xmm10; sqrtpd xmm0,xmm0
 		gencf(instr);
 	}

 	void JitCompilerX86::h_FPROUND(Instruction& instr, int i) {
+		genar(instr);
 		emit(0x81480de0c1c88b48);
 		emit(0x600025fffff800e1);
-		emit(0x0dc12a0f48f20000);
+		emit(uint16_t(0x0000));
+		emitByte(0xf2);
+		int regc = (instr.regc % RegistersCount);
+		if (regc <= 1) {
+			emitByte(0x4c); //REX
+		}
+		else {
+			emitByte(0x48); //REX
+		}
+		emit(uint16_t(0x2a0f));
+		emitByte(0xc1 + 8 * regc);
+		emitByte(0x0d);
 		emit(0xf824448900009fc0);
 		emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8]
 		emitByte(0xf8);
-		gencf(instr);
+		gencf(instr, true);
 	}

 	static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) {
@ -682,6 +639,7 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_CALL(Instruction& instr, int i) {
+		genar(instr);
 		emit(uint16_t(0x8141)); //cmp regb, imm32
 		emitByte(0xf8 + (instr.regb % RegistersCount));
 		emit(instr.imm32);
@ -707,6 +665,7 @@ namespace RandomX {
 	}

 	void JitCompilerX86::h_RET(Instruction& instr, int i) {
+		genar(instr);
 		int crlen = 0;
 		if ((instr.locc & 7) <= 3) {
 			crlen = 17;
@ -756,4 +715,6 @@ namespace RandomX {
 		INST_HANDLE(CALL)
 		INST_HANDLE(RET)
 	};
+
+#endif
 }
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@ -58,13 +58,16 @@ namespace RandomX {
 		std::vector<int32_t> instructionOffsets;
 		std::vector<CallOffset> callOffsets;

-		void gena(Instruction&);
+		void genar(Instruction&);
+		void genaf(Instruction&);
 		void genbr0(Instruction&, uint16_t, uint16_t);
 		void genbr1(Instruction&, uint16_t, uint16_t);
 		void genbr132(Instruction&, uint16_t, uint8_t);
 		void genbf(Instruction&, uint8_t);
+		void scratchpadStoreR(Instruction&, uint32_t);
+		void scratchpadStoreF(Instruction&, int, uint32_t, bool);
 		void gencr(Instruction&);
-		void gencf(Instruction&);
+		void gencf(Instruction&, bool);
 		void generateCode(Instruction&, int);
 		void fixCallOffsets();

--- a/src/TestAluFpu.cpp
+++ b/src/TestAluFpu.cpp
@ -21,33 +21,36 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <iomanip>
 #include <limits>
 #include "instructions.hpp"
-#include "Pcg32.hpp"
 //#define DEBUG

 using namespace RandomX;

-typedef void(*VmOperation)(convertible_t&, convertible_t&, convertible_t&);
-
-uint64_t rxRound(uint32_t mode, int64_t x, int64_t y, VmOperation op) {
-	convertible_t a, b, c;
-	a.u64 = mode;
-	FPROUND(a, b, c);
-#ifdef DEBUG
-	a.f64 = convertToDouble(x);
-	b.f64 = convertToDouble(y);
-	std::cout << std::hex << (uint64_t)x << " -> " << a.u64 << std::endl;
-	std::cout << std::hex << (uint64_t)y << " -> " << b.u64 << std::endl;
-	std::cout << std::dec;
-#endif
-	a.i64 = x;
-	b.i64 = y;
-	op(a, b, c);
-	return c.u64;
-}
+typedef void(*FpuOperation)(convertible_t&, fpu_reg_t&, fpu_reg_t&);

 #define CATCH_CONFIG_MAIN
 #include "catch.hpp"

+//Test helper: invokes FPROUND with `mode` as its operand (presumably selecting
+//the rounding mode), then runs `op` on x and y and returns the bits of the low
+//half of the result. With hiEqualsLo, x is written to both 32-bit halves of the
+//integer operand and the two result halves are CHECKed to be equal; otherwise
+//x is passed as a full 64-bit value. y is always copied to both halves of b.
+uint64_t rxRound(uint32_t mode, int64_t x, int64_t y, FpuOperation op, bool hiEqualsLo = true) {
+	convertible_t a;
+	fpu_reg_t b, c;
+
+	a.u64 = mode;
+	FPROUND(a, b, c);
+
+	if (!hiEqualsLo) {
+		a.i64 = x;
+	}
+	else {
+		a.i32lo = x;
+		a.i32hi = a.i32lo;
+	}
+	b.lo.i64 = y;
+	b.hi.i64 = b.lo.i64;
+
+	op(a, b, c);
+
+	const uint64_t lowBits = c.lo.u64;
+	if (hiEqualsLo) {
+		CHECK(c.lo.u64 == c.hi.u64);
+	}
+	return lowBits;
+}
+
 #define RX_EXECUTE_U64(va, vb, INST) do { \
 	a.u64 = va; \
 	b.u64 = vb; \
@ -273,118 +276,126 @@ TEST_CASE("Circular right shift (64-bit)", "[ROR_64]") {

 TEST_CASE("Denormal results are not produced", "[FTZ]") {
 	FPINIT();
-	convertible_t a, b, c;
-	a.i64 = 2048;
-	FPDIV(a, DBL_MAX, c);
+	convertible_t a;
+	//value-initialise: only b.lo is assigned below, and b is passed whole to
+	//FPDIV/FPSUB, so b.hi must not be left indeterminate
+	fpu_reg_t b{};
+	a.i64 = 1;
+	b.lo.f64 = DBL_MAX;
+	FPDIV(a, b, b); //1 / DBL_MAX would be subnormal unless flushed
 #ifdef DEBUG
-	std::cout << a.i64 << " / " << DBL_MAX << " = " << std::hex << c.u64 << std::endl;
+	std::cout << a.i64 << " / " << DBL_MAX << " = " << std::hex << b.lo.u64 << std::endl;
 #endif
-	REQUIRE(std::fpclassify(c.f64) != FP_SUBNORMAL);
-	b.f64 = c.f64;
+	CHECK(std::fpclassify(b.lo.f64) != FP_SUBNORMAL);
 	a.i64 = 0;
-	FPSUB_64(a, b, c);
+	FPSUB(a, b, b); //feeds the previous result back in as the second operand
 #ifdef DEBUG
-	std::cout << a.i64 << " - " << b.f64 << " = " << std::hex << c.u64 << std::endl;
+	std::cout << a.i64 << " - " << b.lo.f64 << " = " << std::hex << b.lo.u64 << std::endl;
 #endif
-	CHECK(std::fpclassify(c.f64) != FP_SUBNORMAL);
+	CHECK(std::fpclassify(b.lo.f64) != FP_SUBNORMAL);
 }

 TEST_CASE("NaN results are not produced", "[NAN]") {
 	FPINIT();
-	convertible_t a, c;
+	convertible_t a;
+	//value-initialise: only b.lo is assigned below, and b is passed whole to
+	//FPDIV/FPMUL, so b.hi must not be left indeterminate
+	fpu_reg_t b{};
 	a.i64 = 0;
-	FPDIV(a, 0, c);
-	CHECK(std::fpclassify(c.f64) != FP_NAN);
-	FPMUL(a, std::numeric_limits<double>::infinity(), c);
-	CHECK(std::fpclassify(c.f64) != FP_NAN);
+	b.lo.f64 = 0;
+	FPDIV(a, b, b); //0 / 0
+	CHECK(std::fpclassify(b.lo.f64) != FP_NAN);
+	b.lo.f64 = std::numeric_limits<double>::infinity();
+	FPMUL(a, b, b); //0 * infinity
+	CHECK(std::fpclassify(b.lo.f64) != FP_NAN);
 }

-volatile int64_t fpAdda = 7379480244170225589;
-volatile int64_t fpAddb = -438072579179686797;
-volatile int64_t fpSuba = 2939258788088626026;
-volatile int64_t fpSubb = 4786131045320678734;
-volatile int64_t fpMula1 = 8399833736388895639;
-volatile int64_t fpMulb1 = 5671608020317594922;
-volatile int64_t fpMula2 = -7094299423744805450;
-volatile int64_t fpMulb2 = 4982086006202596504;
-volatile int64_t fpDiva1 = 8399833736388895639;
-volatile int64_t fpDivb1 = 5671608020317594922;
-volatile int64_t fpDiva2 = -7434878587645025912;
-volatile int64_t fpDivb2 = 5266243837734830806;
-volatile int64_t fpSqrta = -7594301562963134542;
+volatile int64_t fpRounda = 7379480244170225589;
+volatile int32_t fpAdda = -2110701072;
+volatile int64_t fpAddb = 5822431907862180274;
+volatile int32_t fpSuba = -1651770302;
+volatile int64_t fpSubb = 4982086006202596504;
+volatile int32_t fpMula1 = 122885310;
+volatile int64_t fpMulb1 = 6036690890763685020;
+volatile int32_t fpMula2 = -1952486466;
+volatile int64_t fpMulb2 = 5693689137909219638;
+volatile int32_t fpDiva1 = -1675630642;
+volatile int64_t fpDivb1 = -3959960229647489051;
+volatile int32_t fpDiva2 = -1651770302;
+volatile int64_t fpDivb2 = 4982086006202596504;
+volatile int32_t fpSqrta1 = 440505508;
+volatile int32_t fpSqrta2 = -2147483648;

 TEST_CASE("IEEE-754 compliance", "[FPU]") {
 	FPINIT();
-	convertible_t a, b, c;
+	convertible_t a;
+	fpu_reg_t b, c;
+	b.lo.f64 = 0.0;

-	a.i64 = 2048;
-	FPDIV(a, 0, c);
-	CHECK(c.f64 == std::numeric_limits<double>::infinity());
+	a.i64 = 1;
+	FPDIV(a, b, c);
+	CHECK(c.lo.f64 == std::numeric_limits<double>::infinity());

-	a.i64 = -2048;
-	FPDIV(a, 0, c);
-	CHECK(c.f64 == -std::numeric_limits<double>::infinity());
+	a.i64 = -1;
+	FPDIV(a, b, c);
+	CHECK(c.lo.f64 == -std::numeric_limits<double>::infinity());

 #ifdef DEBUG
 	std::cout << "FPROUND" << std::endl;
 #endif
-	CHECK(rxRound(RoundToNearest, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
-	CHECK(rxRound(RoundDown, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
-	CHECK(rxRound(RoundUp, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
-	CHECK(rxRound(RoundToZero, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
-
-	CHECK(rxRound(RoundToNearest, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
-	CHECK(rxRound(RoundDown, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
-	CHECK(rxRound(RoundUp, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
-	CHECK(rxRound(RoundToZero, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
+	CHECK(rxRound(RoundToNearest, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
+	CHECK(rxRound(RoundDown, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
+	CHECK(rxRound(RoundUp, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
+	CHECK(rxRound(RoundToZero, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);

 #ifdef DEBUG
 	std::cout << "FPADD" << std::endl;
 #endif
-	CHECK(rxRound(RoundToNearest, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d473U);
-	CHECK(rxRound(RoundDown, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d473U);
-	CHECK(rxRound(RoundUp, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d472U);
-	CHECK(rxRound(RoundToZero, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d472U);
+	CHECK(rxRound(RoundToNearest, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b2U);
+	CHECK(rxRound(RoundDown, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b1U);
+	CHECK(rxRound(RoundUp, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b2U);
+	CHECK(rxRound(RoundToZero, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b1U);

 #ifdef DEBUG
 	std::cout << "FPSUB" << std::endl;
 #endif
-	CHECK(rxRound(RoundToNearest, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c49U);
-	CHECK(rxRound(RoundDown, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c48U);
-	CHECK(rxRound(RoundUp, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c49U);
-	CHECK(rxRound(RoundToZero, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c48U);
+	CHECK(rxRound(RoundToNearest, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c99U);
+	CHECK(rxRound(RoundDown, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c99U);
+	CHECK(rxRound(RoundUp, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c98U);
+	CHECK(rxRound(RoundToZero, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c98U);

 #ifdef DEBUG
 	std::cout << "FPMUL" << std::endl;
 #endif
-	CHECK(rxRound(RoundToNearest, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e9U);
-	CHECK(rxRound(RoundDown, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e8U);
-	CHECK(rxRound(RoundUp, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e9U);
-	CHECK(rxRound(RoundToZero, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e8U);
+	CHECK(rxRound(RoundToNearest, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24542U);
+	CHECK(rxRound(RoundDown, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24541U);
+	CHECK(rxRound(RoundUp, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24542U);
+	CHECK(rxRound(RoundToZero, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24541U);

-	CHECK(rxRound(RoundToNearest, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c583U);
-	CHECK(rxRound(RoundDown, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c583U);
-	CHECK(rxRound(RoundUp, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c582U);
-	CHECK(rxRound(RoundToZero, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c582U);
+	CHECK(rxRound(RoundToNearest, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a7470U);
+	CHECK(rxRound(RoundDown, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a7470U);
+	CHECK(rxRound(RoundUp, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a746fU);
+	CHECK(rxRound(RoundToZero, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a746fU);

 #ifdef DEBUG
 	std::cout << "FPDIV" << std::endl;
 #endif
-	CHECK(rxRound(RoundToNearest, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81cU);
-	CHECK(rxRound(RoundDown, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81bU);
-	CHECK(rxRound(RoundUp, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81cU);
-	CHECK(rxRound(RoundToZero, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81bU);
+	CHECK(rxRound(RoundToNearest, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb0aU);
+	CHECK(rxRound(RoundDown, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb09U);
+	CHECK(rxRound(RoundUp, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb0aU);
+	CHECK(rxRound(RoundToZero, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb09U);

-	CHECK(rxRound(RoundToNearest, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fccU);
-	CHECK(rxRound(RoundDown, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fccU);
-	CHECK(rxRound(RoundUp, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fcbU);
-	CHECK(rxRound(RoundToZero, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fcbU);
+	CHECK(rxRound(RoundToNearest, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71cU);
+	CHECK(rxRound(RoundDown, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71cU);
+	CHECK(rxRound(RoundUp, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71bU);
+	CHECK(rxRound(RoundToZero, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71bU);

 #ifdef DEBUG
 	std::cout << "FPSQRT" << std::endl;
 #endif
-	CHECK(rxRound(RoundToNearest, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2dU);
-	CHECK(rxRound(RoundDown, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2cU);
-	CHECK(rxRound(RoundUp, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2dU);
-	CHECK(rxRound(RoundToZero, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2cU);
+	CHECK(rxRound(RoundToNearest, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19dU);
+	CHECK(rxRound(RoundDown, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19cU);
+	CHECK(rxRound(RoundUp, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19dU);
+	CHECK(rxRound(RoundToZero, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19cU);
+
+	CHECK(rxRound(RoundToNearest, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bcdU);
+	CHECK(rxRound(RoundDown, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bccU);
+	CHECK(rxRound(RoundUp, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bcdU);
+	CHECK(rxRound(RoundToZero, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bccU);
 }
--- a/src/VirtualMachine.cpp
+++ b/src/VirtualMachine.cpp
@ -24,8 +24,19 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "t1ha/t1ha.h"
 #include "blake2/blake2.h"
 #include <cstring>
+#include <iomanip>
+
+//Dumps a register file: the integer registers r0.. in hexadecimal, followed by
+//the floating point registers f0.. (high half first, then low half), each half
+//printed as its raw bits in hexadecimal and as a double in parentheses.
+//The stream is switched back to decimal after every register.
+std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
+	for (int idx = 0; idx < RandomX::RegistersCount; ++idx) {
+		const auto& intReg = rf.r[idx];
+		os << std::hex << "r" << idx << " = " << intReg.u64 << std::endl;
+		os << std::dec;
+	}
+	for (int idx = 0; idx < RandomX::RegistersCount; ++idx) {
+		const auto& fpReg = rf.f[idx];
+		os << std::hex << "f" << idx << " = " << fpReg.hi.u64 << " (" << fpReg.hi.f64 << ")" << std::endl;
+		os << "   = " << fpReg.lo.u64 << " (" << fpReg.lo.f64 << ")" << std::endl;
+		os << std::dec;
+	}
+	return os;
+}

 namespace RandomX {
+
 	VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) {
 		mem.ds.dataset = nullptr;
 	}
@ -83,9 +94,10 @@ namespace RandomX {
 	}

 	void VirtualMachine::getResult(void* out) {
-		uint64_t smallState[sizeof(RegisterFile) / sizeof(uint64_t) + 2];
+		constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 2;
+		uint64_t smallState[smallStateLength];
 		memcpy(smallState, &reg, sizeof(RegisterFile));
-		smallState[17] = t1ha2_atonce128(&smallState[16], scratchpad, ScratchpadSize, reg.r[0].u64);
+		smallState[smallStateLength - 1] = t1ha2_atonce128(&smallState[smallStateLength - 2], scratchpad, ScratchpadSize, reg.r[0].u64);
 		blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0);
 	}
 }
--- a/src/VirtualMachine.hpp
+++ b/src/VirtualMachine.hpp
@ -32,11 +32,14 @@ namespace RandomX {
 		virtual void initializeProgram(const void* seed) = 0;
 		virtual void execute() = 0;
 		void getResult(void*);
+		//Read-only view of the VM register file. Const-qualified so that it
+		//can also be called through a const VirtualMachine reference.
+		const RegisterFile& getRegisterFile() const {
+			return reg;
+		}
 	protected:
 		bool softAes, lightClient;
-		RegisterFile reg;
-		MemoryRegisters mem;
 		DatasetReadFunc readDataset;
+		alignas(16) RegisterFile reg;
+		MemoryRegisters mem;
 		alignas(16) convertible_t scratchpad[ScratchpadLength];
 	};
 }
--- a/src/asm/program_epilogue_linux.inc
+++ b/src/asm/program_epilogue_linux.inc
@ -0,0 +1,12 @@
+	#include "program_epilogue_store.inc"
+
+	;# restore callee-saved registers - System V AMD64 ABI
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rbp
+	pop rbx
+
+	;# program finished
+	ret 0
--- a/src/asm/program_epilogue_store.inc
+++ b/src/asm/program_epilogue_store.inc
@ -0,0 +1,22 @@
+	;# unwind VM stack
+	mov rsp, rbp    ;# rbp was set to the start of the VM stack by the prologue
+
+	;# save VM register values
+	pop rcx    ;# RegisterFile pointer pushed by the prologue
+	mov qword ptr [rcx+0], r8    ;# integer registers r0-r7 are held in r8-r15
+	mov qword ptr [rcx+8], r9
+	mov qword ptr [rcx+16], r10
+	mov qword ptr [rcx+24], r11
+	mov qword ptr [rcx+32], r12
+	mov qword ptr [rcx+40], r13
+	mov qword ptr [rcx+48], r14
+	mov qword ptr [rcx+56], r15
+	movdqa xmmword ptr [rcx+64], xmm8    ;# f0 (aligned store: the register file must be 16-byte aligned)
+	movdqa xmmword ptr [rcx+80], xmm9    ;# f1
+	movdqa xmmword ptr [rcx+96], xmm2    ;# f2
+	movdqa xmmword ptr [rcx+112], xmm3    ;# f3
+	lea rcx, [rcx+64]    ;# rebase the pointer; presumably keeps the remaining displacements within 8 bits
+	movdqa xmmword ptr [rcx+64], xmm4    ;# f4
+	movdqa xmmword ptr [rcx+80], xmm5    ;# f5
+	movdqa xmmword ptr [rcx+96], xmm6    ;# f6
+	movdqa xmmword ptr [rcx+112], xmm7    ;# f7
--- a/src/asm/program_epilogue_win64.inc
+++ b/src/asm/program_epilogue_win64.inc
@ -0,0 +1,20 @@
+	include program_epilogue_store.inc
+
+	;# restore callee-saved registers - Microsoft x64 calling convention
+	movdqu xmm10, xmmword ptr [rsp]
+	movdqu xmm9, xmmword ptr [rsp+16]
+	movdqu xmm8, xmmword ptr [rsp+32]
+	movdqu xmm7, xmmword ptr [rsp+48]
+	movdqu xmm6, xmmword ptr [rsp+64]
+	add rsp, 80
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rsi
+	pop rdi
+	pop rbp
+	pop rbx
+
+	;# program finished
+	ret	0
--- a/src/asm/program_prologue_linux.inc
+++ b/src/asm/program_prologue_linux.inc
@ -0,0 +1,17 @@
+	;# callee-saved registers - System V AMD64 ABI
+	push rbx
+	push rbp
+	push r12
+	push r13
+	push r14
+	push r15
+
+	;# function arguments
+	push rdi        ;# RegisterFile& registerFile
+	mov rbx, rsi    ;# MemoryRegisters& memory
+	mov rsi, rdx    ;# convertible_t* scratchpad
+	mov rcx, rdi
+
+	#include "program_prologue_load.inc"
+
+	jmp randomx_program_begin
--- a/src/asm/program_prologue_load.inc
+++ b/src/asm/program_prologue_load.inc
@ -0,0 +1,63 @@
+	mov rbp, rsp      ;# beginning of VM stack
+	mov rdi, 1048577  ;# number of VM instructions to execute + 1
+
+	xorps xmm10, xmm10
+	cmpeqpd xmm10, xmm10
+	psrlq xmm10, 1    ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
+
+	;# reset rounding mode
+	mov dword ptr [rsp-8], 40896
+	ldmxcsr dword ptr [rsp-8]
+
+	;# load integer registers
+	mov r8, qword ptr [rcx+0]
+	mov r9, qword ptr [rcx+8]
+	mov r10, qword ptr [rcx+16]
+	mov r11, qword ptr [rcx+24]
+	mov r12, qword ptr [rcx+32]
+	mov r13, qword ptr [rcx+40]
+	mov r14, qword ptr [rcx+48]
+	mov r15, qword ptr [rcx+56]
+
+	;# initialize floating point registers
+	xorps xmm8, xmm8
+	cvtsi2sd xmm8, qword ptr [rcx+72]
+	pslldq xmm8, 8
+	cvtsi2sd xmm8, qword ptr [rcx+64]
+
+	xorps xmm9, xmm9
+	cvtsi2sd xmm9, qword ptr [rcx+88]
+	pslldq xmm9, 8
+	cvtsi2sd xmm9, qword ptr [rcx+80]
+
+	xorps xmm2, xmm2
+	cvtsi2sd xmm2, qword ptr [rcx+104]
+	pslldq xmm2, 8
+	cvtsi2sd xmm2, qword ptr [rcx+96]
+
+	xorps xmm3, xmm3
+	cvtsi2sd xmm3, qword ptr [rcx+120]
+	pslldq xmm3, 8
+	cvtsi2sd xmm3, qword ptr [rcx+112]
+
+	lea rcx, [rcx+64]
+
+	xorps xmm4, xmm4
+	cvtsi2sd xmm4, qword ptr [rcx+72]
+	pslldq xmm4, 8
+	cvtsi2sd xmm4, qword ptr [rcx+64]
+
+	xorps xmm5, xmm5
+	cvtsi2sd xmm5, qword ptr [rcx+88]
+	pslldq xmm5, 8
+	cvtsi2sd xmm5, qword ptr [rcx+80]
+
+	xorps xmm6, xmm6
+	cvtsi2sd xmm6, qword ptr [rcx+104]
+	pslldq xmm6, 8
+	cvtsi2sd xmm6, qword ptr [rcx+96]
+
+	xorps xmm7, xmm7
+	cvtsi2sd xmm7, qword ptr [rcx+120]
+	pslldq xmm7, 8
+	cvtsi2sd xmm7, qword ptr [rcx+112]
--- a/src/asm/program_prologue_win64.inc
+++ b/src/asm/program_prologue_win64.inc
@ -0,0 +1,24 @@
+	;# callee-saved registers - Microsoft x64 calling convention
+	push rbx
+	push rbp
+	push rdi
+	push rsi
+	push r12
+	push r13
+	push r14
+	push r15
+	sub rsp, 80
+	movdqu xmmword ptr [rsp+64], xmm6
+	movdqu xmmword ptr [rsp+48], xmm7
+	movdqu xmmword ptr [rsp+32], xmm8
+	movdqu xmmword ptr [rsp+16], xmm9
+	movdqu xmmword ptr [rsp+0], xmm10
+
+	;# function arguments
+	push rcx        ;# RegisterFile& registerFile
+	mov rbx, rdx    ;# MemoryRegisters& memory
+	mov rsi, r8     ;# convertible_t* scratchpad
+
+	include program_prologue_load.inc
+
+	jmp randomx_program_begin
--- a/src/asm/program_read_f.inc
+++ b/src/asm/program_read_f.inc
@ -0,0 +1,13 @@
+	mov edx, dword ptr [rbx]      ;# ma
+	mov rax, qword ptr [rbx+8]    ;# dataset
+	cvtdq2pd xmm0, qword ptr [rax+rdx]    ;# xmm0 = two int32 values at dataset + ma, converted to doubles
+	add dword ptr [rbx], 8    ;# ma += 8
+	xor ecx, dword ptr [rbx+4]    ;# mx
+	mov dword ptr [rbx+4], ecx    ;# mx = mx ^ (value passed in ecx)
+	test ecx, 65528    ;# 65528 = 0xfff8
+	jne short rx_read_dataset_f_ret    ;# return unless all of the tested bits of mx are zero
+	and ecx, -8    ;# clear the low 3 bits
+	mov dword ptr [rbx], ecx    ;# ma = mx with the low 3 bits cleared
+	prefetcht0 byte ptr [rax+rcx]    ;# prefetch the dataset at the new ma
+rx_read_dataset_f_ret:
+	ret 0
--- a/src/asm/program_read_r.inc
+++ b/src/asm/program_read_r.inc
@ -0,0 +1,13 @@
+	mov eax, dword ptr [rbx]      ;# ma
+	mov rdx, qword ptr [rbx+8]    ;# dataset
+	mov rax, qword ptr [rdx+rax]    ;# rax = 64-bit value at dataset + ma
+	add dword ptr [rbx], 8    ;# ma += 8
+	xor ecx, dword ptr [rbx+4]    ;# mx
+	mov dword ptr [rbx+4], ecx    ;# mx = mx ^ (value passed in ecx)
+	test ecx, 65528    ;# 65528 = 0xfff8
+	jne short rx_read_dataset_r_ret    ;# return unless all of the tested bits of mx are zero
+	and ecx, -8    ;# clear the low 3 bits
+	mov dword ptr [rbx], ecx    ;# ma = mx with the low 3 bits cleared
+	prefetcht0 byte ptr [rdx+rcx]    ;# prefetch the dataset at the new ma
+rx_read_dataset_r_ret:
+	ret 0
--- a/src/common.hpp
+++ b/src/common.hpp
@ -20,6 +20,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #pragma once

 #include <cstdint>
+#include <iostream>

 namespace RandomX {

@ -59,6 +60,15 @@ namespace RandomX {
 		uint64_t u64;
 		int32_t i32;
 		uint32_t u32;
+		//Two consecutive 32-bit signed fields: i32lo comes first in memory, i32hi second.
+		//On a little-endian target these are the low and high halves of the 64-bit members.
+		//NOTE(review): an unnamed struct member is a compiler extension in C++ - confirm every target compiler accepts it.
+		struct {
+			int32_t i32lo;
+			int32_t i32hi;
+		};
+	};
+
+	//128-bit floating point register made of two values: lo first, hi second.
+	//Aligned to 16 bytes because the SSE2 code paths access it with aligned 128-bit
+	//loads and stores (_mm_load_pd, _mm_store_pd, movdqa), which fault on an address
+	//that is not a multiple of 16. The size is unchanged by the alignment.
+	struct alignas(16) fpu_reg_t {
+		convertible_t lo;
+		convertible_t hi;
 	};

 	constexpr int ProgramLength = 512;
@ -96,10 +106,10 @@ namespace RandomX {

 	struct RegisterFile {
+		//Integer registers first, then the floating point registers (a lo/hi pair each).
 		convertible_t r[RegistersCount];
-		convertible_t f[RegistersCount];
+		fpu_reg_t f[RegistersCount];
 	};

-	static_assert(sizeof(RegisterFile) == 2 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
+	//Each register index accounts for one convertible_t in r and two in f, hence the factor 3.
+	//The check rejects any padding, which would shift the fixed offsets the assembly code uses.
+	static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");

 	typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&);

@ -109,3 +119,5 @@ namespace RandomX {
 		void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, DatasetReadFunc);
 	}
 }
+
+//Stream insertion for a whole register file. This is a declaration only; it sits at
+//global scope, outside namespace RandomX.
+std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf);
--- a/src/executeProgram-win64.asm
+++ b/src/executeProgram-win64.asm
@ -1,19 +1,19 @@
-; Copyright (c) 2018 tevador
-;
-; This file is part of RandomX.
-;
-; RandomX is free software: you can redistribute it and/or modify
-; it under the terms of the GNU General Public License as published by
-; the Free Software Foundation, either version 3 of the License, or
-; (at your option) any later version.
-;
-; RandomX is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+;# Copyright (c) 2018 tevador
+;#
+;# This file is part of RandomX.
+;#
+;# RandomX is free software: you can redistribute it and/or modify
+;# it under the terms of the GNU General Public License as published by
+;# the Free Software Foundation, either version 3 of the License, or
+;# (at your option) any later version.
+;#
+;# RandomX is distributed in the hope that it will be useful,
+;# but WITHOUT ANY WARRANTY; without even the implied warranty of
+;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;# GNU General Public License for more details.
+;#
+;# You should have received a copy of the GNU General Public License
+;# along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 PUBLIC executeProgram

@ -47,6 +47,7 @@ executeProgram PROC
 	; xmm7 -> "f7"
 	; xmm8 -> "f0"
 	; xmm9 -> "f1"
+	; xmm10 -> absolute value mask

 	; STACK STRUCTURE:
 	;   |
@ -71,11 +72,12 @@ executeProgram PROC
 	push r13
 	push r14
 	push r15
-	sub rsp, 64
-	movdqu xmmword ptr [rsp+48], xmm6
-	movdqu xmmword ptr [rsp+32], xmm7
-	movdqu xmmword ptr [rsp+16], xmm8
-	movdqu xmmword ptr [rsp+0], xmm9
+	sub rsp, 80
+	movdqu xmmword ptr [rsp+64], xmm6
+	movdqu xmmword ptr [rsp+48], xmm7
+	movdqu xmmword ptr [rsp+32], xmm8
+	movdqu xmmword ptr [rsp+16], xmm9
+	movdqu xmmword ptr [rsp+0], xmm10

 	; function arguments
 	push rcx				; RegisterFile& registerFile
@ -86,7 +88,15 @@ executeProgram PROC
 	mov rbp, rsp			; beginning of VM stack
 	mov rdi, 1048577	; number of VM instructions to execute + 1

-	; load VM register values
+	; build the absolute value mask in a register: zero xmm10 so it cannot hold a NaN,
+	; compare it with itself (equal -> all bits set), then shift each 64-bit lane
+	; right by one to clear the sign bits
+	xorps xmm10, xmm10
+	cmpeqpd xmm10, xmm10
+	psrlq xmm10, 1		; mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
+
+	; reset rounding mode
+	; 40896 = 0x9FC0: flush to zero, denormals are zero, all exceptions masked, round to nearest
+	; NOTE(review): [rsp-8] is below the stack pointer and the Microsoft x64 convention
+	; defines no red zone - confirm nothing can overwrite this slot before ldmxcsr reads it
+	mov dword ptr [rsp-8], 40896
+	ldmxcsr dword ptr [rsp-8]
+
+	; load integer registers
 	mov r8, qword ptr [rcx+0]
 	mov r9, qword ptr [rcx+8]
 	mov r10, qword ptr [rcx+16]
@ -95,16 +105,56 @@ executeProgram PROC
 	mov r13, qword ptr [rcx+40]
 	mov r14, qword ptr [rcx+48]
 	mov r15, qword ptr [rcx+56]
-	mov dword ptr [rsp-8], 40896
-	ldmxcsr dword ptr [rsp-8]
+
+	; load register f0 hi, lo
+	xorps xmm8, xmm8
+	cvtsi2sd xmm8, qword ptr [rcx+72]
+	pslldq xmm8, 8
 	cvtsi2sd xmm8, qword ptr [rcx+64]
-	cvtsi2sd xmm9, qword ptr [rcx+72]
-	cvtsi2sd xmm2, qword ptr [rcx+80]
-	cvtsi2sd xmm3, qword ptr [rcx+88]
-	cvtsi2sd xmm4, qword ptr [rcx+96]
-	cvtsi2sd xmm5, qword ptr [rcx+104]
-	cvtsi2sd xmm6, qword ptr [rcx+112]
+
+	; load register f1 hi, lo
+	xorps xmm9, xmm9
+	cvtsi2sd xmm9, qword ptr [rcx+88]
+	pslldq xmm9, 8
+	cvtsi2sd xmm9, qword ptr [rcx+80]
+
+	; load register f2 hi, lo
+	xorps xmm2, xmm2
+	cvtsi2sd xmm2, qword ptr [rcx+104]
+	pslldq xmm2, 8
+	cvtsi2sd xmm2, qword ptr [rcx+96]
+
+	; load register f3 hi, lo
+	xorps xmm3, xmm3
+	cvtsi2sd xmm3, qword ptr [rcx+120]
+	pslldq xmm3, 8
+	cvtsi2sd xmm3, qword ptr [rcx+112]
+
+	; f4..f7 start 128 bytes into the register file; advance the base by 64 so the same
+	; displacements as for f0..f3 (64..120) address them. rcx no longer points at the
+	; start of the register file after this.
+	lea rcx, [rcx+64]
+
+	; load register f4 hi, lo
+	; pattern: convert hi into the low lane, shift it up into the high lane,
+	; then convert lo into the low lane (cvtsi2sd leaves the high lane untouched)
+	xorps xmm4, xmm4
+	cvtsi2sd xmm4, qword ptr [rcx+72]
+	pslldq xmm4, 8
+	cvtsi2sd xmm4, qword ptr [rcx+64]
+
+	; load register f5 hi, lo
+	xorps xmm5, xmm5
+	cvtsi2sd xmm5, qword ptr [rcx+88]
+	pslldq xmm5, 8
+	cvtsi2sd xmm5, qword ptr [rcx+80]
+
+	; load register f6 hi, lo
+	xorps xmm6, xmm6
+	cvtsi2sd xmm6, qword ptr [rcx+104]
+	pslldq xmm6, 8
+	cvtsi2sd xmm6, qword ptr [rcx+96]
+
+	; load register f7 hi, lo
+	xorps xmm7, xmm7
 	cvtsi2sd xmm7, qword ptr [rcx+120]
+	pslldq xmm7, 8
+	cvtsi2sd xmm7, qword ptr [rcx+112]

 	; program body

@ -125,21 +175,23 @@ rx_finish:
 	mov qword ptr [rcx+40], r13
 	mov qword ptr [rcx+48], r14
 	mov qword ptr [rcx+56], r15
-	movd qword ptr [rcx+64], xmm8
-	movd qword ptr [rcx+72], xmm9
-	movd qword ptr [rcx+80], xmm2
-	movd qword ptr [rcx+88], xmm3
-	movd qword ptr [rcx+96], xmm4
-	movd qword ptr [rcx+104], xmm5
-	movd qword ptr [rcx+112], xmm6
-	movd qword ptr [rcx+120], xmm7
+	; store the eight 128-bit f registers (both lanes, raw bits) back to the register file
+	; movdqa faults unless the destination is 16-byte aligned, so rcx must be 16-byte aligned here
+	movdqa xmmword ptr [rcx+64], xmm8
+	movdqa xmmword ptr [rcx+80], xmm9
+	movdqa xmmword ptr [rcx+96], xmm2
+	movdqa xmmword ptr [rcx+112], xmm3
+	; advance the base by 64 so the same displacements reach f4..f7 (offsets 128..176)
+	lea rcx, [rcx+64]
+	movdqa xmmword ptr [rcx+64], xmm4
+	movdqa xmmword ptr [rcx+80], xmm5
+	movdqa xmmword ptr [rcx+96], xmm6
+	movdqa xmmword ptr [rcx+112], xmm7

 	; load callee-saved registers
-	movdqu xmm9, xmmword ptr [rsp]
-	movdqu xmm8, xmmword ptr [rsp+16]
-	movdqu xmm7, xmmword ptr [rsp+32]
-	movdqu xmm6, xmmword ptr [rsp+48]
-	add rsp, 64
+	movdqu xmm10, xmmword ptr [rsp]
+	movdqu xmm9, xmmword ptr [rsp+16]
+	movdqu xmm8, xmmword ptr [rsp+32]
+	movdqu xmm7, xmmword ptr [rsp+48]
+	movdqu xmm6, xmmword ptr [rsp+64]
+	add rsp, 80
 	pop r15
 	pop r14
 	pop r13
@ -171,7 +223,7 @@ rx_read_dataset:
 	pop r8
 	ret 0

-rx_read_dataset_full:
+rx_read_dataset_r:
 	mov edx, dword ptr [rbx]	; ma
 	mov rax, qword ptr [rbx+8]	; dataset
 	mov rax, qword ptr [rax+rdx]
@ -179,12 +231,27 @@ rx_read_dataset_full:
 	xor ecx, dword ptr [rbx+4]	; mx
 	mov dword ptr [rbx+4], ecx
 	test ecx, 0FFF8h
-	jne short rx_read_dataset_full_ret
+	jne short rx_read_dataset_r_ret
 	and ecx, -8
 	mov dword ptr [rbx], ecx
 	mov rdx, qword ptr [rbx+8]
 	prefetcht0 byte ptr [rdx+rcx]
-rx_read_dataset_full_ret:
+rx_read_dataset_r_ret:
+	ret 0
+
+; Dataset read, floating point variant.
+; In:  rbx -> memory registers: dword [rbx] = ma, dword [rbx+4] = mx, qword [rbx+8] = dataset pointer
+;      ecx = value XORed into mx (supplied by the caller)
+; Out: xmm0 = the two 32-bit integers at dataset+ma, each converted to double
+; Clobbers rax, rcx, rdx and the flags.
+rx_read_dataset_f:
+	mov edx, dword ptr [rbx]	; ma
+	mov rax, qword ptr [rbx+8]	; dataset
+	cvtdq2pd xmm0, qword ptr [rax+rdx]
+	add dword ptr [rbx], 8		; ma += 8
+	xor ecx, dword ptr [rbx+4]	; mx
+	mov dword ptr [rbx+4], ecx
+	test ecx, 0FFF8h		; bits 3..15 of the updated mx
+	jne short rx_read_dataset_f_ret
+	; bits 3..15 are all zero: set ma = mx & -8 and prefetch the dataset at that offset
+	and ecx, -8
+	mov dword ptr [rbx], ecx
+	prefetcht0 byte ptr [rax+rcx]
+rx_read_dataset_f_ret:
 	ret 0
 executeProgram ENDP

--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@ -19,15 +19,15 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 #pragma once

-#define WT_ADD_64 10
+#define WT_ADD_64 11
 #define WT_ADD_32 2
-#define WT_SUB_64 10
+#define WT_SUB_64 11
 #define WT_SUB_32 2
-#define WT_MUL_64 21
+#define WT_MUL_64 23
 #define WT_MULH_64 10
 #define WT_MUL_32 15
 #define WT_IMUL_32 15
-#define WT_IMULH_64 10
+#define WT_IMULH_64 6
 #define WT_DIV_64 1
 #define WT_IDIV_64 1
 #define WT_AND_64 4
@ -47,8 +47,9 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #define WT_FPDIV 8
 #define WT_FPSQRT 6
 #define WT_FPROUND 2
-#define WT_CALL 24
-#define WT_RET 18
+#define WT_CALL 20
+#define WT_RET 22
+

 constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \
 WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \
@ -60,6 +61,7 @@ WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \
 static_assert(wtSum == 256,
 	"Sum of instruction weights must be 256");

+#define REP0(x)
 #define REP1(x) x,
 #define REP2(x) REP1(x) x,
 #define REP3(x) REP2(x) x,
@ -86,6 +88,16 @@ static_assert(wtSum == 256,
 #define REP24(x) REP23(x) x,
 #define REP25(x) REP24(x) x,
 #define REP26(x) REP25(x) x,
+//REP27..REP33 continue the chain, each appending one more "x," to the previous macro.
+//REP40, REP128 and REP256 are composed from smaller blocks instead.
+//No REP34..REP39 are defined in this run, so REPN cannot be used with those counts.
+#define REP27(x) REP26(x) x,
+#define REP28(x) REP27(x) x,
+#define REP29(x) REP28(x) x,
+#define REP30(x) REP29(x) x,
+#define REP31(x) REP30(x) x,
+#define REP32(x) REP31(x) x,
+#define REP33(x) REP32(x) x,
+#define REP40(x) REP32(x) REP8(x)
+#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
+#define REP256(x) REP128(x) REP128(x)
 #define REPNX(x,N) REP##N(x)
 #define REPN(x,N) REPNX(x,N)
 #define NUM(x) x
--- a/src/instructions.hpp
+++ b/src/instructions.hpp
@ -22,16 +22,10 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 namespace RandomX {

-	inline double convertToDouble(int64_t x) {
-		return (double)(x &-2048L);
-	}
-
-	inline double convertToDoubleNonZero(int64_t x) {
-		return (double)((x & -2048L) | 2048);
-	}
-
-	inline double convertToDoubleNonNegative(int64_t x) {
-		return (double)(x & 9223372036854773760L);
+	//Converts a signed 64-bit integer to double after zeroing its 11 low bits.
+	//What remains is a multiple of 2048 with magnitude at most 2^63, which a double
+	//represents exactly, so the conversion never rounds.
+	inline double convertSigned52(int64_t x) {
+		const int64_t truncated = x & ~static_cast<int64_t>(2047);
+		return static_cast<double>(truncated);
 	}

 	extern "C" {
@ -59,27 +53,11 @@ namespace RandomX {
 		void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
 		bool JMP_COND(uint8_t, convertible_t&, int32_t);
 		void FPINIT();
-		void FPADD(convertible_t& a, double b, convertible_t& c);
-		void FPSUB(convertible_t& a, double b, convertible_t& c);
-		void FPMUL(convertible_t& a, double b, convertible_t& c);
-		void FPDIV(convertible_t& a, double b, convertible_t& c);
-		void FPSQRT(convertible_t& a, convertible_t& b, convertible_t& c);
-		void FPROUND(convertible_t& a, convertible_t& b, convertible_t& c);
-
-		inline void FPADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			FPADD(a, b.f64, c);
-		}
-
-		inline void FPSUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			FPSUB(a, b.f64, c);
-		}
-
-		inline void FPMUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			FPMUL(a, b.f64, c);
-		}
-
-		inline void FPDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			FPDIV(a, b.f64, c);
-		}
+		//Vector FPU instructions. In the portable implementations of FPADD, FPSUB, FPMUL,
+		//FPDIV and FPSQRT, 'a' supplies two 32-bit signed integers (i32lo, i32hi) that are
+		//converted to double and 'c' receives the lo/hi results. 'b' is the floating point
+		//operand of the four arithmetic instructions; FPSQRT does not read it.
+		//FPROUND shares the signature but converts the 64-bit value of 'a'.
+		void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
+		void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
+		void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
+		void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
+		void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
+		void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
 	}
 }
--- a/src/instructionsPortable.cpp
+++ b/src/instructionsPortable.cpp
@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License
 along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */
 //#define DEBUG
-//#define FTZ
 #include "instructions.hpp"
 #include "intrinPortable.h"
 #pragma STDC FENV_ACCESS on
@ -154,19 +153,17 @@ static inline int32_t safeSub(int32_t a, int32_t b) {
 	#define subOverflow __subOverflow
 #endif

-static double FlushDenormal(double x) {
-	if (std::fpclassify(x) == FP_SUBNORMAL) {
-		return 0;
+//Returns 0.0 when x is subnormal or NaN; every other value is returned unchanged.
+static inline double FlushDenormalNaN(double x) {
+	int fpc = std::fpclassify(x);
+	if (fpc == FP_SUBNORMAL || fpc == FP_NAN) {
+		return 0.0;
 	}
 	return x;
 }

-#ifdef FTZ
-#undef FTZ
-#define FTZ(x) FlushDenormal(x)
-#else
-#define FTZ(x) x
-#endif
+//Maps NaN to 0.0; every other value (infinities and signed zeros included) passes through unchanged.
+static inline double FlushNaN(double x) {
+	//NaN is the only value that does not compare equal to itself
+	if (x == x) {
+		return x;
+	}
+	return 0.0;
+}

 namespace RandomX {

@ -286,37 +283,95 @@ namespace RandomX {
 		}

 		void FPINIT() {
-			setRoundMode(FE_TONEAREST);
-		}
-
-		void FPADD(convertible_t& a, double b, convertible_t& c) {
-			c.f64 = FTZ(convertToDouble(a.i64) + b);
-		}
-
-		void FPSUB(convertible_t& a, double b, convertible_t& c) {
-			c.f64 = FTZ(convertToDouble(a.i64) - b);
-		}
-
-		void FPMUL(convertible_t& a, double b, convertible_t& c) {
-			c.f64 = FTZ(convertToDoubleNonZero(a.i64) * b);
-		}
-
-		void FPDIV(convertible_t& a, double b, convertible_t& c) {
-			c.f64 = FTZ(convertToDoubleNonZero(a.i64) / b);
-		}
-
-		void FPSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
 #ifdef __SSE2__
-			double d = convertToDoubleNonNegative(a.i64);
-			c.f64 = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_pd(&d)));
+			_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
 #else
-			c.f64 = FTZ(sqrt(convertToDoubleNonNegative(a.i64)));
+			setRoundMode(FE_TONEAREST);
 #endif
-
 		}

-		void FPROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.f64 = convertToDouble(a.i64);
+		//c.lo = (double)a.i32lo + b.lo, c.hi = (double)a.i32hi + b.hi
+		void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
+#ifdef __SSE2__
+			//both operands are fully loaded before c is written
+			const __m128d lhs = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
+			const __m128d rhs = _mm_load_pd(&b.lo.f64);
+			_mm_store_pd(&c.lo.f64, _mm_add_pd(lhs, rhs));
+#else
+			//read both halves of 'a' before the first write to 'c'
+			const double aLow = (double)a.i32lo;
+			const double aHigh = (double)a.i32hi;
+			c.lo.f64 = aLow + b.lo.f64;
+			c.hi.f64 = aHigh + b.hi.f64;
+#endif
+		}
+
+		//c.lo = (double)a.i32lo - b.lo, c.hi = (double)a.i32hi - b.hi
+		void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
+#ifdef __SSE2__
+			//both operands are fully loaded before c is written
+			const __m128d minuend = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
+			const __m128d subtrahend = _mm_load_pd(&b.lo.f64);
+			_mm_store_pd(&c.lo.f64, _mm_sub_pd(minuend, subtrahend));
+#else
+			//read both halves of 'a' before the first write to 'c'
+			const double aLow = (double)a.i32lo;
+			const double aHigh = (double)a.i32hi;
+			c.lo.f64 = aLow - b.lo.f64;
+			c.hi.f64 = aHigh - b.hi.f64;
+#endif
+		}
+
+		//c.lo = (double)a.i32lo * b.lo, c.hi = (double)a.i32hi * b.hi, with NaN results replaced by 0.0
+		void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
+#ifdef __SSE2__
+			const __m128d lhs = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
+			const __m128d rhs = _mm_load_pd(&b.lo.f64);
+			const __m128d product = _mm_mul_pd(lhs, rhs);
+			//a lane compares equal to itself unless it is NaN, so the AND zeroes NaN lanes only
+			const __m128d notNaN = _mm_cmpeq_pd(product, product);
+			_mm_store_pd(&c.lo.f64, _mm_and_pd(product, notNaN));
+#else
+			//read both halves of 'a' before the first write to 'c'
+			const double aLow = (double)a.i32lo;
+			const double aHigh = (double)a.i32hi;
+			c.lo.f64 = FlushNaN(aLow * b.lo.f64);
+			c.hi.f64 = FlushNaN(aHigh * b.hi.f64);
+#endif
+		}
+
+		//c.lo = (double)a.i32lo / b.lo, c.hi = (double)a.i32hi / b.hi, with NaN results replaced by 0.0.
+		//The portable path also flushes subnormal results to 0.0 explicitly; the SSE2 path
+		//contains no such step of its own.
+		void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
+#ifdef __SSE2__
+			const __m128d dividend = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
+			const __m128d divisor = _mm_load_pd(&b.lo.f64);
+			const __m128d quotient = _mm_div_pd(dividend, divisor);
+			//a lane compares equal to itself unless it is NaN, so the AND zeroes NaN lanes only
+			const __m128d notNaN = _mm_cmpeq_pd(quotient, quotient);
+			_mm_store_pd(&c.lo.f64, _mm_and_pd(quotient, notNaN));
+#else
+			//read both halves of 'a' before the first write to 'c'
+			const double aLow = (double)a.i32lo;
+			const double aHigh = (double)a.i32hi;
+			c.lo.f64 = FlushDenormalNaN(aLow / b.lo.f64);
+			c.hi.f64 = FlushDenormalNaN(aHigh / b.hi.f64);
+#endif
+		}
+
+		//c.lo = sqrt(|(double)a.i32lo|), c.hi = sqrt(|(double)a.i32hi|). 'b' is not used.
+		void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
+#ifdef __SSE2__
+			__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
+			__m128d ad = _mm_cvtepi32_pd(ai);
+			//Every bit except the sign bit of each 64-bit lane. Written as a literal because
+			//shifting 1LL left by 63 moves a bit into the sign position, which is not
+			//portable before C++20.
+			const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL));
+			//clearing the sign bits takes the absolute value, so the square root is never of a negative number
+			ad = _mm_and_pd(ad, absmask);
+			__m128d cd = _mm_sqrt_pd(ad);
+			_mm_store_pd(&c.lo.f64, cd);
+#else
+			//read both halves of 'a' before the first write to 'c'
+			double alo = (double)a.i32lo;
+			double ahi = (double)a.i32hi;
+			c.lo.f64 = sqrt(std::abs(alo));
+			c.hi.f64 = sqrt(std::abs(ahi));
+#endif
+		}
+
+		void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
+			c.lo.f64 = convertSigned52(a.i64);
 			switch (a.u64 & 3) {
 				case RoundDown:
 #ifdef DEBUG
--- a/src/main.cpp
+++ b/src/main.cpp
@ -79,14 +79,6 @@ void readInt(int argc, char** argv, int& out, int defaultValue) {
 	out = defaultValue;
 }

-std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
-	for (int i = 0; i < RandomX::RegistersCount; ++i)
-		os << std::hex << "r" << i << " = " << rf.r[i].u64 << std::endl << std::dec;
-	for (int i = 0; i < RandomX::RegistersCount; ++i)
-		os << std::hex << "f" << i << " = " << rf.f[i].u64 << " (" << rf.f[i].f64 << ")" << std::endl << std::dec;
-	return os;
-}
-
 class AtomicHash {
 public:
 	AtomicHash() {
@ -282,7 +274,7 @@ int main(int argc, char** argv) {
 		std::cout << "Calculated result: ";
 		result.print(std::cout);
 		if(programCount == 1000)
-		std::cout << "Reference result:  f6bf06465d5fa1b1dc919140b9e9f9e210b07ae6d662988458a172e9a267eb3f" << std::endl;
+		std::cout << "Reference result:  3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
 		std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
 		/*if (threadCount == 1 && !compiled) {
 			auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0];
--- a/src/program.inc
+++ b/src/program.inc