Added ISWAP instruction

Scratchpad -> 2 MiB; new scratchpad initialization; new dataset initialization
2024-12-22 15:58:53 +00:00 · 2019-02-04 17:07:00 +01:00 · 2019-02-04 17:07:00 +01:00 · 1ee94bef2a
commit 1ee94bef2a
parent 20eb549725
23 changed files with 528 additions and 290 deletions
--- a/5
+++ b/5
@ -13,7 +13,7 @@ LDFLAGS=-lpthread
 TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
 ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o hashAes1Rx4.o)
 ifeq ($(PLATFORM),x86_64)
-    ROBJS += $(OBJDIR)/JitCompilerX86-static.o
+    ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
 endif

 all: release test
@ -77,6 +77,9 @@ $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompile
 $(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR)
 	$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@

+$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc))  | $(OBJDIR)
+	$(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@
+
 $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@

--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@ -72,16 +72,16 @@ namespace RandomX {

 	void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") {
 		asmCode << "\tmov " << reg << ", " << regR32[instr.src] << std::endl;
-		asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
+		asmCode << "\tand " << reg << ", " << ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
 	}

 	void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) {
 		asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl;
-		asmCode << "\tand eax" << ", " << ((instr.alt % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl;
+		asmCode << "\tand eax" << ", " << ((instr.mod % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl;
 	}

 	int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) {
-		return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
+		return instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
 	}

 	//1 uOP
@ -348,6 +348,13 @@ namespace RandomX {
 		}
 	}

+	//2 uOPs; emits nothing when source and destination are the same register
+	void AssemblyGeneratorX86::h_ISWAP_R(Instruction& instr, int i) {
+		if (instr.src == instr.dst)
+			return;
+		asmCode << "\txchg " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
+	}
+
 	//1 uOPs
 	void AssemblyGeneratorX86::h_FPSWAP_R(Instruction& instr, int i) {
 		asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl;
@ -431,7 +438,7 @@ namespace RandomX {
 	//6 uOPs
 	void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) {
 		asmCode << "\tmov rax, " << regR[instr.src] << std::endl;
-		int rotate = (13 - (instr.alt & 63)) & 63;
+		int rotate = (13 - (instr.imm32 & 63)) & 63;
 		if (rotate != 0)
 			asmCode << "\trol rax, " << rotate << std::endl;
 		asmCode << "\tand eax, 24576" << std::endl;
@ -441,7 +448,7 @@ namespace RandomX {
 	}

 	static inline const char* condition(Instruction& instr, bool invert = false) {
-		switch (((instr.alt >> 2) & 7) ^ invert)
+		switch (((instr.mod >> 2) & 7) ^ invert)
 		{
 			case 0:
 				return "be";
@ -519,6 +526,7 @@ namespace RandomX {
 		INST_HANDLE(IXOR_M)
 		INST_HANDLE(IROR_R)
 		INST_HANDLE(IROL_R)
+		INST_HANDLE(ISWAP_R)

 		//Common floating point
 		INST_HANDLE(FPSWAP_R)
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@ -63,6 +63,7 @@ namespace RandomX {
 		void  h_IXOR_M(Instruction&, int);
 		void  h_IROR_R(Instruction&, int);
 		void  h_IROL_R(Instruction&, int);
+		void  h_ISWAP_R(Instruction&, int);
 		void  h_FPSWAP_R(Instruction&, int);
 		void  h_FPADD_R(Instruction&, int);
 		void  h_FPADD_M(Instruction&, int);
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@ -57,7 +57,7 @@ namespace RandomX {
 		for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) {
 			*(((uint32_t*)&reg) + i) = gen();
 		}
-		FPINIT();
+		initFpu();
 		/*for (int i = 0; i < RegistersCount / 2; ++i) {
 			reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
 			reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@ -29,15 +29,15 @@ namespace RandomX {
 	}

 	void Instruction::genAddressReg(std::ostream& os) const {
-		os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
+		os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
 	}

 	void Instruction::genAddressRegDst(std::ostream& os) const {
-		os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)dst << "]";
+		os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)dst << "]";
 	}

 	void Instruction::genAddressImm(std::ostream& os) const {
-		os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
+		os << ((mod % 4) ? "L1" : "L2") << "[" << (imm32 & ((mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
 	}

 	void Instruction::h_IADD_R(std::ostream& os) const {
@ -211,6 +211,10 @@ namespace RandomX {
 		os << "r" << (int)dst << ", " << imm32 << std::endl;
 	}

+	void Instruction::h_ISWAP_R(std::ostream& os) const {
+		os << "r" << (int)dst << ", r" << (int)src << std::endl;
+	}
+
 	void Instruction::h_FPSWAP_R(std::ostream& os) const {
 		const char reg = (dst >= 4) ? 'e' : 'f';
 		auto dstIndex = dst % 4;
@ -280,7 +284,7 @@ namespace RandomX {
 	}

 	void Instruction::h_CFROUND(std::ostream& os) const {
-		os << "r" << (int)src << ", " << (alt & 63) << std::endl;
+		os << "r" << (int)src << ", " << (imm32 & 63) << std::endl;
 	}

 	static inline const char* condition(int index) {
@ -306,11 +310,11 @@ namespace RandomX {
 	}

 	void Instruction::h_COND_R(std::ostream& os) const {
-		os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl;
+		os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl;
 	}

 	void Instruction::h_COND_M(std::ostream& os) const {
-		os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(";
+		os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(";
 		genAddressReg(os);
 		os << ", " << imm32 << ")" << std::endl;
 	}
@ -356,6 +360,7 @@ namespace RandomX {
 		INST_NAME(IXOR_M)
 		INST_NAME(IROR_R)
 		INST_NAME(IROL_R)
+		INST_NAME(ISWAP_R)

 		//Common floating point
 		INST_NAME(FPSWAP_R)
@ -406,6 +411,7 @@ namespace RandomX {
 		INST_HANDLE(IXOR_M)
 		INST_HANDLE(IROR_R)
 		INST_HANDLE(IROL_R)
+		INST_HANDLE(ISWAP_R)

 		//Common floating point
 		INST_HANDLE(FPSWAP_R)
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@ -28,12 +28,52 @@ namespace RandomX {

 	typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const;

+	namespace InstructionType { //numeric ids 0..36, one per instruction kind; NOTE(review): presumably must follow the order of the INST_NAME/INST_HANDLE tables - confirm
+		constexpr int IADD_R = 0; //ids 0-19: integer instructions
+		constexpr int IADD_M = 1;
+		constexpr int IADD_RC = 2;
+		constexpr int ISUB_R = 3;
+		constexpr int ISUB_M = 4;
+		constexpr int IMUL_9C = 5;
+		constexpr int IMUL_R = 6;
+		constexpr int IMUL_M = 7;
+		constexpr int IMULH_R = 8;
+		constexpr int IMULH_M = 9;
+		constexpr int ISMULH_R = 10;
+		constexpr int ISMULH_M = 11;
+		constexpr int IDIV_C = 12;
+		constexpr int ISDIV_C = 13;
+		constexpr int INEG_R = 14;
+		constexpr int IXOR_R = 15;
+		constexpr int IXOR_M = 16;
+		constexpr int IROR_R = 17;
+		constexpr int IROL_R = 18;
+		constexpr int ISWAP_R = 19;
+		constexpr int FPSWAP_R = 20; //ids 20-30: floating point instructions
+		constexpr int FPADD_R = 21;
+		constexpr int FPADD_M = 22;
+		constexpr int FPSUB_R = 23;
+		constexpr int FPSUB_M = 24;
+		constexpr int FPNEG_R = 25;
+		constexpr int FPMUL_R = 26;
+		constexpr int FPMUL_M = 27;
+		constexpr int FPDIV_R = 28;
+		constexpr int FPDIV_M = 29;
+		constexpr int FPSQRT_R = 30;
+		constexpr int COND_R = 31; //ids 31-36: conditions, rounding control, stores and NOP
+		constexpr int COND_M = 32;
+		constexpr int CFROUND = 33;
+		constexpr int ISTORE = 34;
+		constexpr int FSTORE = 35;
+		constexpr int NOP = 36;
+	}
+
 	class Instruction {
 	public:
 		uint8_t opcode;
 		uint8_t dst;
 		uint8_t src;
-		uint8_t alt;
+		uint8_t mod;
 		int32_t imm32;
 		const char* getName() const {
 			return names[opcode];
@ -70,6 +110,7 @@ namespace RandomX {
 		void  h_IXOR_M(std::ostream&) const;
 		void  h_IROR_R(std::ostream&) const;
 		void  h_IROL_R(std::ostream&) const;
+		void  h_ISWAP_R(std::ostream&) const;
 		void  h_FPSWAP_R(std::ostream&) const;
 		void  h_FPADD_R(std::ostream&) const;
 		void  h_FPADD_M(std::ostream&) const;
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@ -30,6 +30,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <sstream>
 #include <cmath>
 #include <thread>
+#include "intrinPortable.h"
 #ifdef STATS
 #include <algorithm>
 #endif
@ -98,7 +99,7 @@ namespace RandomX {
 		for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) {
 			*(((uint32_t*)&reg) + i) = gen();
 		}
-		FPINIT();
+		initFpu();
 		for (int i = 0; i < RegistersCount; ++i) {
 			reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
 			reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
@ -114,24 +115,32 @@ namespace RandomX {
 	}

 	void InterpretedVirtualMachine::execute() {
-		while (ic > 0) {
-#ifdef STATS
-			count_instructions[pc]++;
-#endif
-			auto& inst = p(pc);
-			if(trace) std::cout << inst.getName() << " (" << std::dec << pc << ")" << std::endl;
-			pc = (pc + 1) % ProgramLength;
-			auto handler = engine[inst.opcode];
-			(this->*handler)(inst);
-			ic--;
+		for(int i = 0; i < InstructionCount; ++i) { //replays the whole byteCode array InstructionCount times
+			for (int j = 0; j < ProgramLength; ++j) {
+				auto& ibc = byteCode[j];
+				switch (ibc.type) //only CFROUND is handled so far; any other type leaves the switch without doing anything
+				{
+					case InstructionType::CFROUND: {
+						uint64_t rcFlag = rotr(ibc.isrc->u64, ibc.imm.i32); //source value rotated right by the immediate
+						setRoundMode(rcFlag); //setRoundMode takes a uint32_t, so the low 32 bits are passed on
+						}
+						break;
+				}
+			}
 		}
-#ifdef STATS
-		count_endstack += stack.size();
-#endif
+
 	}

 #include "instructionWeights.hpp"
-#define INST_HANDLE(x) REPN(&InterpretedVirtualMachine::h_##x, WT(x))
+
+	void InterpretedVirtualMachine::executeInstruction(Instruction& instr) {
+		switch (instr.opcode)
+		{
+			CASE_REP(IADD_R)
+
+				break;
+		}
+	}

 	InstructionHandler InterpretedVirtualMachine::engine[256] = {

--- a/src/InterpretedVirtualMachine.hpp
+++ b/src/InterpretedVirtualMachine.hpp
@ -33,10 +33,24 @@ namespace RandomX {
 		virtual std::ostream& printCxx(std::ostream&) const = 0;
 	};

+	struct InstructionByteCode;
 	class InterpretedVirtualMachine;

 	typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&);

+	struct alignas(64) InstructionByteCode { //pre-decoded instruction record used by the interpreter loop, aligned to 64 bytes
+		convertible_t* idst; //NOTE(review): presumably the integer destination operand - confirm
+		convertible_t* isrc; //integer source operand; execute() reads isrc->u64 for CFROUND
+		convertible_t imm; //immediate value; execute() reads imm.i32 for CFROUND
+		fpu_reg_t* fdst; //NOTE(review): presumably the floating point destination operand - confirm
+		fpu_reg_t* fsrc; //NOTE(review): presumably the floating point source operand - confirm
+		uint32_t condition;
+		uint32_t memMask;
+		uint32_t type; //one of the InstructionType ids; execute() switches on it
+	};
+
+	static_assert(sizeof(InstructionByteCode) == 64, "InstructionByteCode must occupy exactly one 64-byte block");
+
 	class InterpretedVirtualMachine : public VirtualMachine {
 	public:
 		InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {}
@ -53,6 +67,7 @@ namespace RandomX {
 		static const ITransform* addressTransformations[TransformationCount];
 		bool softAes, asyncWorker;
 		Program p;
+		InstructionByteCode byteCode[ProgramLength];
 		std::vector<convertible_t> stack;
 		uint64_t pc, ic;
 		const ITransform* currentTransform;
@ -106,7 +121,7 @@ namespace RandomX {
 		int count_FPMUL_nop2 = 0;
 		int datasetAccess[256] = { 0 };
 #endif
-
+		void executeInstruction(Instruction&);
 		convertible_t loada(Instruction&);
 		convertible_t loadbiashift(Instruction&);
 		convertible_t loadbiadiv(Instruction&);
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@ -176,6 +176,7 @@ namespace RandomX {
 	static const uint8_t JNZ[] = { 0x0f, 0x85 };
 	static const uint8_t JMP = 0xe9;
 	static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
+	static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };

 	size_t JitCompilerX86::getCodeSize() {
 		return codePos - prologueSize;
@ -248,7 +249,7 @@ namespace RandomX {
 			emitByte(AND_EAX_I);
 		else
 			emit(AND_ECX_I);
-		emit32((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
+		emit32((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
 	}

 	void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) {
@ -257,11 +258,11 @@ namespace RandomX {
 		emitByte(AND_EAX_I);
 		int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask;
 		int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask;
-		emit32((instr.alt % 4) ? maskL1 : maskL2);
+		emit32((instr.mod % 4) ? maskL1 : maskL2);
 	}

 	void JitCompilerX86::genAddressImm(Instruction& instr) {
-		emit32(instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask));
+		emit32(instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask));
 	}

 	void JitCompilerX86::h_IADD_R(Instruction& instr) {
@ -595,6 +596,13 @@ namespace RandomX {
 		}
 	}

+	void JitCompilerX86::h_ISWAP_R(Instruction& instr) {
+		if (instr.src == instr.dst)
+			return; //swapping a register with itself needs no code
+		emit(REX_XCHG);
+		emitByte(0xc0 + 8 * instr.src + instr.dst); //ModRM byte: register-direct form, reg field = src, rm field = dst
+	}
+
 	void JitCompilerX86::h_FPSWAP_R(Instruction& instr) {
 		emit(SHUFPD);
 		emitByte(0xc0 + 9 * instr.dst);
@ -682,7 +690,7 @@ namespace RandomX {
 	void JitCompilerX86::h_CFROUND(Instruction& instr) {
 		emit(REX_MOV_RR64);
 		emitByte(0xc0 + instr.src);	
-		int rotate = (13 - (instr.alt & 63)) & 63;
+		int rotate = (13 - (instr.imm32 & 63)) & 63;
 		if (rotate != 0) {
 			emit(ROL_RAX);
 			emitByte(rotate);
@ -691,7 +699,7 @@ namespace RandomX {
 	}

 	static inline uint8_t condition(Instruction& instr, bool invert = false) {
-		switch ((instr.alt & 7) ^ invert)
+		switch ((instr.mod & 7) ^ invert)
 		{
 			case 0:
 				return 0x96; //setbe
@ -777,6 +785,7 @@ namespace RandomX {
 		INST_HANDLE(IXOR_M)
 		INST_HANDLE(IROR_R)
 		INST_HANDLE(IROL_R)
+		INST_HANDLE(ISWAP_R)
 		INST_HANDLE(FPSWAP_R)
 		INST_HANDLE(FPADD_R)
 		INST_HANDLE(FPADD_M)
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@ -109,6 +109,7 @@ namespace RandomX {
 		void  h_IXOR_M(Instruction&);
 		void  h_IROR_R(Instruction&);
 		void  h_IROL_R(Instruction&);
+		void  h_ISWAP_R(Instruction&);
 		void  h_FPSWAP_R(Instruction&);
 		void  h_FPADD_R(Instruction&);
 		void  h_FPADD_M(Instruction&);
--- a/src/asm/program_loop_load.inc
+++ b/src/asm/program_loop_load.inc
@ -1,5 +1,5 @@
 	mov rdx, rax
-	and eax, 1048512
+	and eax, 2097088
 	lea rcx, [rsi+rax]
 	push rcx
 	xor r8,  qword ptr [rcx+0]
@ -11,7 +11,7 @@
 	xor r14, qword ptr [rcx+48]
 	xor r15, qword ptr [rcx+56]
 	ror rdx, 32
-	and edx, 1048512
+	and edx, 2097088
 	lea rcx, [rsi+rdx]
 	push rcx
 	cvtdq2pd xmm0, qword ptr [rcx+0]
--- a/src/asm/squareHash.inc
+++ b/src/asm/squareHash.inc
@ -0,0 +1,87 @@
+	mov rax, 1613783669344650115 ;# additive constant; the input value is read from rcx
+	add rax, rcx ;# rax = input + constant
+	mul rax ;# unsigned square: rdx:rax = rax * rax (rdx is overwritten)
+	sub rax, rdx ;# 1
+	mul rax
+	sub rax, rdx ;# 2
+	mul rax
+	sub rax, rdx ;# 3
+	mul rax
+	sub rax, rdx ;# 4
+	mul rax
+	sub rax, rdx ;# 5
+	mul rax
+	sub rax, rdx ;# 6
+	mul rax
+	sub rax, rdx ;# 7
+	mul rax
+	sub rax, rdx ;# 8
+	mul rax
+	sub rax, rdx ;# 9
+	mul rax
+	sub rax, rdx ;# 10
+	mul rax
+	sub rax, rdx ;# 11
+	mul rax
+	sub rax, rdx ;# 12
+	mul rax
+	sub rax, rdx ;# 13
+	mul rax
+	sub rax, rdx ;# 14
+	mul rax
+	sub rax, rdx ;# 15
+	mul rax
+	sub rax, rdx ;# 16
+	mul rax
+	sub rax, rdx ;# 17
+	mul rax
+	sub rax, rdx ;# 18
+	mul rax
+	sub rax, rdx ;# 19
+	mul rax
+	sub rax, rdx ;# 20
+	mul rax
+	sub rax, rdx ;# 21
+	mul rax
+	sub rax, rdx ;# 22
+	mul rax
+	sub rax, rdx ;# 23
+	mul rax
+	sub rax, rdx ;# 24
+	mul rax
+	sub rax, rdx ;# 25
+	mul rax
+	sub rax, rdx ;# 26
+	mul rax
+	sub rax, rdx ;# 27
+	mul rax
+	sub rax, rdx ;# 28
+	mul rax
+	sub rax, rdx ;# 29
+	mul rax
+	sub rax, rdx ;# 30
+	mul rax
+	sub rax, rdx ;# 31
+	mul rax
+	sub rax, rdx ;# 32
+	mul rax
+	sub rax, rdx ;# 33
+	mul rax
+	sub rax, rdx ;# 34
+	mul rax
+	sub rax, rdx ;# 35
+	mul rax
+	sub rax, rdx ;# 36
+	mul rax
+	sub rax, rdx ;# 37
+	mul rax
+	sub rax, rdx ;# 38
+	mul rax
+	sub rax, rdx ;# 39
+	mul rax
+	sub rax, rdx ;# 40
+	mul rax
+	sub rax, rdx ;# 41
+	mul rax
+	sub rax, rdx ;# 42
+	ret ;# the result of the 42 square-and-subtract rounds is left in rax
--- a/src/common.hpp
+++ b/src/common.hpp
@ -26,11 +26,6 @@ namespace RandomX {

 	using addr_t = uint32_t;

-	constexpr int RoundToNearest = 0;
-	constexpr int RoundDown = 1;
-	constexpr int RoundUp = 2;
-	constexpr int RoundToZero = 3;
-
 	constexpr int SeedSize = 32;
 	constexpr int ResultSize = 32;

@ -46,7 +41,7 @@ namespace RandomX {
 	constexpr int CacheBlockCount = CacheSize / CacheLineSize;
 	constexpr int BlockExpansionRatio = DatasetSize / CacheSize;
 	constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount;
-	constexpr int DatasetIterations = 3;
+	constexpr int DatasetIterations = 10;


 #ifdef TRACE
@ -72,12 +67,12 @@ namespace RandomX {
 		convertible_t hi;
 	};

-	constexpr int ProgramLength = 128;
+	constexpr int ProgramLength = 256;
 	constexpr uint32_t InstructionCount = 1024;
-	constexpr uint32_t ScratchpadSize = 1024 * 1024;
+	constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024;
 	constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
-	constexpr uint32_t ScratchpadL1 = ScratchpadSize / 64 / sizeof(convertible_t);
-	constexpr uint32_t ScratchpadL2 = ScratchpadSize / 4 / sizeof(convertible_t);
+	constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t);
+	constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(convertible_t);
 	constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t);
 	constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8;
 	constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8;
@ -133,6 +128,8 @@ namespace RandomX {

 	typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);

+	typedef bool(*Condition)(convertible_t&, convertible_t&);
+
 	extern "C" {
 		void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
 	}
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@ -28,10 +28,11 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "Cache.hpp"
 #include "virtualMemory.hpp"
 #include "softAes.h"
+#include "squareHash.h"

 #if defined(__SSE2__)
 #include <wmmintrin.h>
-#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA)
+#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
 #else
-#define PREFETCH(memory)
+#define PREFETCHNTA(x) /* no-op when SSE2 is unavailable, so callers compile on every platform */
 #endif
@ -49,42 +50,37 @@ namespace RandomX {

 	template<bool soft>
 	void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) {
-		__m128i x0, x1, x2, x3;
+		uint64_t r0, r1, r2, r3, r4, r5, r6, r7; //eight 64-bit accumulators = one 64-byte output block; the new body no longer uses 'keys' or 'soft'
 
-		__m128i* xit = (__m128i*)intermediate;
-		__m128i* xout = (__m128i*)out;
+		r0 = 4ULL * blockNumber; //r0 doubles as the running hash state and the source of the next read offset
+		r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0;
 
-		x0 = _mm_cvtsi32_si128(blockNumber);
-		constexpr int mask = (CacheSize / CacheLineSize) - 1;
+		constexpr int mask = (CacheSize - 1) & -64; //byte offset mask, 64-byte aligned; assumes CacheSize is a power of two - TODO confirm
 
 		for (auto i = 0; i < DatasetIterations; ++i) {
-			x0 = aesenc<soft>(x0, keys[0]);
-			//x0 = aesenc<soft>(x0, keys[1]);
-			x1 = aesenc<soft>(x0, keys[2]);
-			//x1 = aesenc<soft>(x1, keys[3]);
-			x2 = aesenc<soft>(x1, keys[4]);
-			//x2 = aesenc<soft>(x2, keys[5]);
-			x3 = aesenc<soft>(x2, keys[6]);
-			//x3 = aesenc<soft>(x3, keys[7]);
-
-			int index = _mm_cvtsi128_si32(x3);
-			index &= mask;
-
-			__m128i t0 = _mm_load_si128(xit + 4 * index + 0);
-			__m128i t1 = _mm_load_si128(xit + 4 * index + 1);
-			__m128i t2 = _mm_load_si128(xit + 4 * index + 2);
-			__m128i t3 = _mm_load_si128(xit + 4 * index + 3);
-
-			x0 = _mm_xor_si128(x0, t0);
-			x1 = _mm_xor_si128(x1, t1);
-			x2 = _mm_xor_si128(x2, t2);
-			x3 = _mm_xor_si128(x3, t3);
+			uint64_t* mix = (uint64_t*)(intermediate + (r0 & mask)); //64-byte line of 'intermediate' selected by the current r0
+			PREFETCHNTA(mix); //request the line before squareHash runs, so the hash work overlaps the memory access
+			r0 = squareHash(r0);
+			r0 ^= mix[0]; //fold all eight quadwords of the selected line into the accumulators
+			r1 ^= mix[1];
+			r2 ^= mix[2];
+			r3 ^= mix[3];
+			r4 ^= mix[4];
+			r5 ^= mix[5];
+			r6 ^= mix[6];
+			r7 ^= mix[7];
 		}
 
-		_mm_store_si128(xout + 0, x0);
-		_mm_store_si128(xout + 1, x1);
-		_mm_store_si128(xout + 2, x2);
-		_mm_store_si128(xout + 3, x3);
+		uint64_t* out64 = (uint64_t*)out; //'out' receives 64 bytes
+
+		out64[0] = r0;
+		out64[1] = r1;
+		out64[2] = r2;
+		out64[3] = r3;
+		out64[4] = r4;
+		out64[5] = r5;
+		out64[6] = r6;
+		out64[7] = r7;
 	}

 	template
@ -98,7 +94,7 @@ namespace RandomX {
 		memory.mx ^= addr;
 		memory.mx &= -64; //align to cache line
 		std::swap(memory.mx, memory.ma);
-		PREFETCH(memory);
+		PREFETCHNTA(memory.ds.dataset + memory.ma);
 		for (int i = 0; i < RegistersCount; ++i)
 			reg.r[i].u64 ^= datasetLine[i];
 	}
--- a/src/hashAes1Rx4.cpp
+++ b/src/hashAes1Rx4.cpp
@ -71,3 +71,44 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {

 template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash);
 template void hashAes1Rx4<true>(const void *input, size_t inputSize, void *hash);
+
+template<bool softAes>
+void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { //fills 'buffer' from the 64-byte 'state' and writes the advanced state back
+	uint8_t* outptr = (uint8_t*)buffer; //written through below, so it must not be a pointer to const
+	const uint8_t* outputEnd = outptr + outputSize;
+
+	__m128i state0, state1, state2, state3;
+	__m128i key0, key1, key2, key3;
+
+	key0 = _mm_set_epi32(0x9274f206, 0x79498d2f, 0x7d2de6ab, 0x67a04d26);
+	key1 = _mm_set_epi32(0xe1f7af05, 0x2a3a6f1d, 0x86658a15, 0x4f719812);
+	key2 = _mm_set_epi32(0xd1b1f791, 0x9e2ec914, 0x14c77bce, 0xba90750e);
+	key3 = _mm_set_epi32(0x179d0fd9, 0x6e57883c, 0xa53bbe4f, 0xaa07621f);
+
+	state0 = _mm_load_si128((__m128i*)state + 0); //aligned loads: 'state' must be 16-byte aligned
+	state1 = _mm_load_si128((__m128i*)state + 1);
+	state2 = _mm_load_si128((__m128i*)state + 2);
+	state3 = _mm_load_si128((__m128i*)state + 3);
+
+	while (outptr < outputEnd) { //every pass writes 64 bytes: outputSize must be a multiple of 64 or the last pass writes past the end
+		state0 = aesdec<softAes>(state0, key0); //columns 0 and 2 use a decryption round, columns 1 and 3 an encryption round
+		state1 = aesenc<softAes>(state1, key1);
+		state2 = aesdec<softAes>(state2, key2);
+		state3 = aesenc<softAes>(state3, key3);
+
+		_mm_store_si128((__m128i*)outptr + 0, state0); //aligned stores: 'buffer' must be 16-byte aligned
+		_mm_store_si128((__m128i*)outptr + 1, state1);
+		_mm_store_si128((__m128i*)outptr + 2, state2);
+		_mm_store_si128((__m128i*)outptr + 3, state3);
+
+		outptr += 64;
+	}
+
+	_mm_store_si128((__m128i*)state + 0, state0); //hand the advanced state back so a later call continues the stream
+	_mm_store_si128((__m128i*)state + 1, state1);
+	_mm_store_si128((__m128i*)state + 2, state2);
+	_mm_store_si128((__m128i*)state + 3, state3);
+}
+
+template void fillAes1Rx4<true>(void *state, size_t outputSize, void *buffer);
+template void fillAes1Rx4<false>(void *state, size_t outputSize, void *buffer);
--- a/src/hashAes1Rx4.hpp
+++ b/src/hashAes1Rx4.hpp
@ -21,3 +21,6 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 template<bool softAes>
 void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
+
+template<bool softAes>
+void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@ -37,8 +37,9 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #define WT_INEG_R 2
 #define WT_IXOR_R 12
 #define WT_IXOR_M 3
-#define WT_IROR_R 12
-#define WT_IROL_R 12
+#define WT_IROR_R 10
+#define WT_IROL_R 10
+#define WT_ISWAP_R 4

 //Common floating point
 #define WT_FPSWAP_R 8
@ -72,7 +73,7 @@ constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
 WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \
 WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
 WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
-WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
+WT_ISWAP_R + WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
 WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \
 WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP;

--- a/src/instructionsPortable.cpp
+++ b/src/instructionsPortable.cpp
@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License
 along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */
 //#define DEBUG
-#include "instructions.hpp"
 #include "intrinPortable.h"
 #pragma STDC FENV_ACCESS on
 #include <cfenv>
@ -29,14 +28,14 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #if defined(__SIZEOF_INT128__)
 	typedef unsigned __int128 uint128_t;
 	typedef __int128 int128_t;
-	static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
+	uint64_t mulh(uint64_t a, uint64_t b) {
 		return ((uint128_t)a * b) >> 64;
 	}
-	static inline uint64_t __imulhi64(int64_t a, int64_t b) {
+	int64_t smulh(int64_t a, int64_t b) {
 		return ((int128_t)a * b) >> 64;
 	}
-	#define umulhi64 __umulhi64
-	#define imulhi64 __imulhi64
+	#define HAVE_MULH
+	#define HAVE_SMULH
 #endif

 #if defined(_MSC_VER)
@ -44,62 +43,62 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 	#define EVAL_DEFINE(X) HAS_VALUE(X)
 	#include <intrin.h>
 	#include <stdlib.h>
-	#define ror64 _rotr64
-	#define rol64 _rotl64
+
+	uint64_t rotl(uint64_t x, int c) {
+		return _rotl64(x, c);
+	}
+	uint64_t rotr(uint64_t x , int c) {
+		return _rotr64(x, c);
+	}
+	#define HAVE_ROTL
+	#define HAVE_ROTR
+
 	#if EVAL_DEFINE(__MACHINEARM64_X64(1))
-		#define umulhi64 __umulh
+		uint64_t mulh(uint64_t a, uint64_t b) {
+			return __umulh(a, b);
+		}
+		#define HAVE_MULH
 	#endif
+
 	#if EVAL_DEFINE(__MACHINEX64(1))
-		static inline uint64_t __imulhi64(int64_t a, int64_t b) {
+		int64_t smulh(int64_t a, int64_t b) {
 			int64_t hi;
 			_mul128(a, b, &hi);
 			return hi;
 		}
-		#define imulhi64 __imulhi64
+		#define HAVE_SMULH
 	#endif
-	static inline uint32_t _setRoundMode(uint32_t mode) {
-		return _controlfp(mode, _MCW_RC);
+
+	static void setRoundMode__(uint32_t mode) {
+		_controlfp(mode, _MCW_RC);
 	}
-	#define setRoundMode _setRoundMode
+	#define HAVE_SETROUNDMODE_IMPL
 #endif

-#ifndef setRoundMode
-	#define setRoundMode fesetround
+#ifndef HAVE_SETROUNDMODE_IMPL
+	static void setRoundMode__(uint32_t mode) {
+		fesetround(mode);
+	}
 #endif

-#ifndef ror64
-	static inline uint64_t __ror64(uint64_t a, int b) {
-		return (a >> b) | (a << (64 - b));
+#ifndef HAVE_ROTR
+	uint64_t rotr(uint64_t a, int b) {
+		return (a >> (b & 63)) | (a << ((64 - b) & 63)); //both counts masked to 0..63: a rotate by 0 no longer shifts by 64 (undefined)
 	}
-	#define ror64 __ror64
+	#define HAVE_ROTR
 #endif

-#ifndef rol64
-	static inline uint64_t __rol64(uint64_t a, int b) {
-		return (a << b) | (a >> (64 - b));
+#ifndef HAVE_ROTL
+	uint64_t rotl(uint64_t a, int b) {
+		return (a << (b & 63)) | (a >> ((64 - b) & 63)); //both counts masked to 0..63: a rotate by 0 no longer shifts by 64 (undefined)
 	}
-	#define rol64 __rol64
+	#define HAVE_ROTL
 #endif

-#ifndef sar64
-	#include <type_traits>
-	constexpr int64_t builtintShr64(int64_t value, int shift) noexcept {
-		return value >> shift;
-	}
-
-	struct UsesArithmeticShift : std::integral_constant<bool, builtintShr64(-1LL, 1) == -1LL> {
-	};
-
-	static inline int64_t __sar64(int64_t a, int b) {
-		return UsesArithmeticShift::value ? builtintShr64(a, b) : (a < 0 ? ~(~a >> b) : a >> b);
-	}
-	#define sar64 __sar64
-#endif
-
-#ifndef umulhi64
+#ifndef HAVE_MULH
 	#define LO(x) ((x)&0xffffffff)
 	#define HI(x) ((x)>>32)
-	static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
+	uint64_t mulh(uint64_t a, uint64_t b) {
 		uint64_t ah = HI(a), al = LO(a);
 		uint64_t bh = HI(b), bl = LO(b);
 		uint64_t x00 = al * bl;
@ -112,17 +111,17 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 		return (m3 << 32) + LO(m2);
 	}
-	#define umulhi64 __umulhi64
+	#define HAVE_MULH
 #endif

-#ifndef imulhi64
-	static inline int64_t __imulhi64(int64_t a, int64_t b) {
-		int64_t hi = umulhi64(a, b);
+#ifndef HAVE_SMULH
+	int64_t smulh(int64_t a, int64_t b) {
+		int64_t hi = mulh(a, b);
 		if (a < 0LL) hi -= b;
 		if (b < 0LL) hi -= a;
 		return hi;
 	}
-	#define imulhi64 __imulhi64
+	#define HAVE_SMULH
 #endif

 // avoid undefined behavior of signed overflow
@ -137,20 +136,20 @@ static inline int32_t safeSub(int32_t a, int32_t b) {

 #if defined(__has_builtin)
 #if __has_builtin(__builtin_sub_overflow)
-	static inline bool __subOverflow(int32_t a, int32_t b) {
+	static inline bool subOverflow__(int32_t a, int32_t b) {
 		int32_t temp;
 		return __builtin_sub_overflow(a, b, &temp);
 	}
-	#define subOverflow __subOverflow
+	#define HAVE_SUB_OVERFLOW
 #endif
 #endif

-#ifndef subOverflow
-	static inline bool __subOverflow(int32_t a, int32_t b) {
+#ifndef HAVE_SUB_OVERFLOW
+	static inline bool subOverflow__(int32_t a, int32_t b) {
 		auto c = safeSub(a, b);
 		return (c < a) != (b > 0);
 	}
-	#define subOverflow __subOverflow
+	#define HAVE_SUB_OVERFLOW
 #endif

 static inline double FlushDenormalNaN(double x) {
@ -165,47 +164,57 @@ static inline double FlushNaN(double x) {
 	return x != x ? 0.0 : x;
 }

+void setRoundMode(uint32_t rcflag) { //selects the floating point rounding mode from the two lowest bits of rcflag
+	switch (rcflag & 3) { //NOTE(review): the Round* constants are removed from common.hpp by this change; presumably intrinPortable.h provides them now - confirm
+	case RoundDown:
+		setRoundMode__(FE_DOWNWARD);
+		break;
+	case RoundUp:
+		setRoundMode__(FE_UPWARD);
+		break;
+	case RoundToZero:
+		setRoundMode__(FE_TOWARDZERO);
+		break;
+	default: //every remaining flag value selects round-to-nearest
+		setRoundMode__(FE_TONEAREST);
+		break;
+	}
+}
+
+bool condition(uint32_t type, int32_t value, int32_t imm32) { //evaluates the comparison selected by the three lowest bits of 'type'
+	switch (type & 7)
+	{
+	case 0:
+		return (uint32_t)value <= (uint32_t)imm32; //unsigned below-or-equal
+	case 1:
+		return (uint32_t)value > (uint32_t)imm32; //unsigned above
+	case 2:
+		return safeSub(value, imm32) < 0; //sign of the wrapped difference
+	case 3:
+		return safeSub(value, imm32) >= 0;
+	case 4:
+		return subOverflow__(value, imm32); //signed overflow of value - imm32
+	case 5:
+		return !subOverflow__(value, imm32);
+	case 6:
+		return value < imm32; //signed less-than
+	default: //only 7 remains after the mask; using 'default' guarantees every path returns a value
+		return value >= imm32;
+	}
+}
+
+void initFpu() { //puts the floating point unit into its initial state
+#ifdef __SSE2__
+	_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
+#else
+	setRoundMode__(FE_TONEAREST); //FE_* constants go to the raw wrapper; setRoundMode() expects a Round* flag instead
+#endif
+}
+
 namespace RandomX {

 	extern "C" {
-
-		void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 + b.u64;
-		}
-
-		void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 + b.u32;
-		}
-
-		void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 - b.u64;
-		}
-
-		void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 - b.u32;
-		}
-
-		void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 * b.u64;
-		}
-
-		void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = umulhi64(a.u64, b.u64);
-		}
-
-		void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = (uint64_t)a.u32 * b.u32;
-		}
-
-		void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.i64 = (int64_t)a.i32 * b.i32;
-		}
-
-		void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.i64 = imulhi64(a.i64, b.i64);
-		}
-
-		void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+		/*void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
 			c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
 		}

@ -216,80 +225,6 @@ namespace RandomX {
 				c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1);
 		}

-		void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 & b.u64;
-		}
-
-		void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 & b.u32;
-		}
-
-		void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 | b.u64;
-		}
-
-		void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 | b.u32;
-		}
-
-		void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 ^ b.u64;
-		}
-
-		void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 ^ b.u32;
-		}
-
-		void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 << (b.u64 & 63);
-		}
-
-		void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 >> (b.u64 & 63);
-		}
-
-		void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = sar64(a.i64, b.u64 & 63);
-		}
-
-		void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = rol64(a.u64, (b.u64 & 63));
-		}
-
-		void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = ror64(a.u64, (b.u64 & 63));
-		}
-
-		bool JMP_COND(uint8_t type, convertible_t& regb, int32_t imm32) {
-			switch (type & 7)
-			{
-				case 0:
-					return regb.u32 <= (uint32_t)imm32;
-				case 1:
-					return regb.u32 > (uint32_t)imm32;
-				case 2:
-					return safeSub(regb.i32, imm32) < 0;
-				case 3:
-					return safeSub(regb.i32, imm32) >= 0;
-				case 4:
-					return subOverflow(regb.i32, imm32);
-				case 5:
-					return !subOverflow(regb.i32, imm32);
-				case 6:
-					return regb.i32 < imm32;
-				case 7:
-					return regb.i32 >= imm32;
-			}
-		}
-
-		void FPINIT() {
-#ifdef __SSE2__
-			_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
-#else
-			setRoundMode(FE_TONEAREST);
-#endif
-		}
-
 		void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
 #ifdef __SSE2__
 			__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
@ -368,48 +303,8 @@ namespace RandomX {
 			c.lo.f64 = sqrt(std::abs(alo));
 			c.hi.f64 = sqrt(std::abs(ahi));
 #endif
-		}
+		}*/
+

-		void FPROUND(convertible_t a, uint8_t rot) {
-			uint64_t flag = ror64(a.u64, rot);
-			switch (flag & 3) {
-				case RoundDown:
-#ifdef DEBUG
-					std::cout << "Round FE_DOWNWARD (" << FE_DOWNWARD << ") = " <<
-#endif
-					setRoundMode(FE_DOWNWARD);
-#ifdef DEBUG
-					std::cout << std::endl;
-#endif
-					break;
-				case RoundUp:
-#ifdef DEBUG
-					std::cout << "Round FE_UPWARD (" << FE_UPWARD << ") = " <<
-#endif
-					setRoundMode(FE_UPWARD);
-#ifdef DEBUG
-					std::cout << std::endl;
-#endif
-					break;
-				case RoundToZero:
-#ifdef DEBUG
-					std::cout << "Round FE_TOWARDZERO (" << FE_TOWARDZERO << ") = " <<
-#endif
-					setRoundMode(FE_TOWARDZERO);
-#ifdef DEBUG
-					std::cout << std::endl;
-#endif
-					break;
-				default:
-#ifdef DEBUG
-					std::cout << "Round FE_TONEAREST (" << FE_TONEAREST << ") = " <<
-#endif
-					setRoundMode(FE_TONEAREST);
-#ifdef DEBUG
-					std::cout << std::endl;
-#endif
-					break;
-			}
-		}
 	}
 }
--- a/src/intrinPortable.h
+++ b/src/intrinPortable.h
@ -19,6 +19,8 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 #pragma once

+#include <cstdint>
+
 #if defined(_MSC_VER)
 #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
 #define __SSE2__ 1
@ -45,6 +47,18 @@ typedef union {
 	uint8_t u8[16];
 } __m128i;

+typedef struct { // pair of doubles; _mm_load_pd fills it from two consecutive doubles
+	double lo; // first element  (read from pd + 0)
+	double hi; // second element (read from pd + 1)
+} __m128d;
+
+// Reads two consecutive doubles starting at pd and returns them as the
+// lo (pd[0]) and hi (pd[1]) members of a __m128d.
+inline __m128d _mm_load_pd(const double* pd) {
+	__m128d result;
+	result.lo = pd[0];
+	result.hi = pd[1];
+	return result;
+}
+
 static const char* platformError = "Platform doesn't support hardware AES";

 inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {
@ -132,3 +146,16 @@ inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {
 }

 #endif
+
+constexpr int RoundToNearest = 0; // rounding-mode codes 0..3; NOTE(review): presumably the values setRoundMode() switches on -- confirm
+constexpr int RoundDown = 1;
+constexpr int RoundUp = 2;
+constexpr int RoundToZero = 3;
+
+uint64_t mulh(uint64_t, uint64_t); // prototypes only from here on; the definitions are not in this header
+int64_t smulh(int64_t, int64_t);
+uint64_t rotl(uint64_t, int);
+uint64_t rotr(uint64_t, int);
+void initFpu();
+void setRoundMode(uint32_t);
+bool condition(uint32_t, int32_t, int32_t); // arguments are (type, value, imm32)
--- a/src/main.cpp
+++ b/src/main.cpp
@ -35,6 +35,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "dataset.hpp"
 #include "Cache.hpp"
 #include "Pcg32.hpp"
+#include "hashAes1Rx4.hpp"

 const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };

@ -153,7 +154,7 @@ void generateNative(int nonce) {
 }

 void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) {
-	uint64_t hash[4];
+	alignas(16) uint64_t hash[8];
 	unsigned char blockTemplate[] = {
 		0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14,
 		0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e,
@ -167,8 +168,8 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
 		//std::cout << "Thread " << thread << " nonce " << nonce << std::endl;
 		*noncePtr = nonce;
 		blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
-		int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8);
-		vm->initializeScratchpad(scratchpad, spIndex);
+		fillAes1Rx4<false>((void*)hash, RandomX::ScratchpadSize, scratchpad);
+		//vm->initializeScratchpad(scratchpad, spIndex);
 		vm->setScratchpad(scratchpad);
 		//dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt");
 		for (int chain = 0; chain < 16; ++chain) {
@ -309,7 +310,7 @@ int main(int argc, char** argv) {
 		}
 		uint8_t* scratchpadMem;
 		if (largePages) {
-			scratchpadMem = (uint8_t*)allocLargePagesMemory(RandomX::ScratchpadSize * (threadCount + 1) / 2);
+			scratchpadMem = (uint8_t*)allocLargePagesMemory(threadCount * RandomX::ScratchpadSize);
 		}
 		else {
 			scratchpadMem = (uint8_t*)_mm_malloc(threadCount * RandomX::ScratchpadSize, RandomX::CacheLineSize);
--- a/src/squareHash.S
+++ b/src/squareHash.S
@ -0,0 +1,17 @@
+.intel_syntax noprefix
+#if defined(__APPLE__)
+.text
+#else
+.section .text
+#endif
+#if defined(__WIN32__) || defined(__APPLE__)
+#define DECL(x) _##x
+#else
+#define DECL(x) x
+#endif
+
+.global DECL(squareHash)
+
+/*
+ * uint64_t squareHash(uint64_t) -- GNU assembler entry point.
+ *
+ * This wrapper loads rcx before including the shared body, and the MASM
+ * wrapper includes the same body without touching any register, so the
+ * body is taken to read its input from rcx. Under the System V AMD64
+ * calling convention the first (and here only) integer argument arrives
+ * in rdi, so that is the register to copy from; rsi would be a second
+ * argument, which this function does not have.
+ */
+DECL(squareHash):
+	mov rcx, rdi
+	#include "asm/squareHash.inc"
--- a/src/squareHash.asm
+++ b/src/squareHash.asm
@ -0,0 +1,9 @@
+PUBLIC squareHash
+
+.code
+
+squareHash PROC ; exported via PUBLIC above; the whole body comes from the include on the next line
+	include asm/squareHash.inc
+squareHash ENDP
+
+END
--- a/src/squareHash.h
+++ b/src/squareHash.h
@ -0,0 +1,71 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include <stdint.h>
+
+#if !defined(_M_X64) && !defined(__x86_64__)
+
+typedef struct { /* 128-bit unsigned value held as two 64-bit halves; returned by square128() */
+	uint64_t lo; /* low-order 64 bits */
+	uint64_t hi; /* high-order 64 bits */
+} uint128_t;
+
+#define LO(x) ((x)&0xffffffff)
+#define HI(x) ((x)>>32)
+/*
+ * Computes the full 128-bit square of a 64-bit value using only 64-bit
+ * arithmetic, by splitting x into 32-bit halves xh:xl and combining the
+ * partial products xl*xl, xl*xh and xh*xh.
+ *
+ * Returns x*x with the low 64 bits in .lo and the high 64 bits in .hi.
+ */
+static inline uint128_t square128(uint64_t x) {
+	uint64_t xh = HI(x), xl = LO(x);
+	uint64_t xll = xl * xl;
+	uint64_t xlh = xl * xh;
+	uint64_t xhh = xh * xh;
+	/* the cross product xl*xh appears twice in (xh*2^32 + xl)^2, hence the factor 2 */
+	uint64_t m1 = 2 * LO(xlh) + HI(xll);          /* result bits 32..63, carry in the upper half */
+	uint64_t m2 = 2 * HI(xlh) + LO(xhh) + HI(m1); /* result bits 64..95, carry in the upper half */
+	uint64_t m3 = HI(xhh) + HI(m2);               /* result bits 96..127 */
+
+	uint128_t x2;
+
+	x2.lo = (m1 << 32) + LO(xll);
+	x2.hi = (m3 << 32) + LO(m2);
+
+	return x2;
+}
+/* #undef takes the bare macro name; a parameter list after it is a stray-token diagnostic */
+#undef LO
+#undef HI
+
+/*
+ * Portable squareHash: adds a fixed constant to x, then runs 42 rounds of
+ * "square to 128 bits and subtract the high half from the low half"
+ * (unsigned arithmetic, so every step wraps modulo 2^64).
+ *
+ * Declared static like square128(): a plain `inline` function in C needs a
+ * separate external definition to link, and in C++ an external-linkage
+ * inline function must not depend on the internal-linkage square128().
+ */
+static inline uint64_t squareHash(uint64_t x) {
+	x += 1613783669344650115ULL;
+	for (int i = 0; i < 42; ++i) {
+		uint128_t x2 = square128(x);
+		x = x2.lo - x2.hi;
+	}
+	return x;
+}
+
+#else
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+uint64_t squareHash(uint64_t);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif