From 005c67f64c5670f6b87c3bca36f497622ead63cf Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 27 Jan 2019 10:52:30 +0100 Subject: [PATCH] Added explicit STORE instructions JIT compiler --- src/AssemblyGeneratorX86.cpp | 22 +- src/AssemblyGeneratorX86.hpp | 12 +- src/CompiledVirtualMachine.cpp | 6 +- src/Instruction.cpp | 24 +- src/Instruction.hpp | 3 + src/JitCompilerX86-static.S | 45 +- src/JitCompilerX86-static.asm | 57 +- src/JitCompilerX86-static.hpp | 17 +- src/JitCompilerX86.cpp | 1217 ++++++++++++----------- src/JitCompilerX86.hpp | 126 +-- src/asm/program_epilogue_store.inc | 20 +- src/asm/program_epilogue_win64.inc | 8 +- src/asm/program_load_flt.inc | 14 + src/asm/program_load_int.inc | 10 + src/asm/program_prologue_linux.inc | 11 +- src/asm/program_prologue_load.inc | 43 +- src/asm/program_prologue_win64.inc | 21 +- src/asm/program_read.inc | 20 - src/asm/program_read_dataset.inc | 16 + src/asm/program_store_flt.inc | 11 + src/asm/program_store_int.inc | 10 + src/asm/program_xmm_constants.inc | 6 + src/common.hpp | 4 +- src/executeProgram-win64.asm | 21 +- src/instructionWeights.hpp | 63 +- src/main.cpp | 2 +- src/program.inc | 1460 ++++++++++++++-------------- 27 files changed, 1751 insertions(+), 1518 deletions(-) create mode 100644 src/asm/program_load_flt.inc create mode 100644 src/asm/program_load_int.inc delete mode 100644 src/asm/program_read.inc create mode 100644 src/asm/program_read_dataset.inc create mode 100644 src/asm/program_store_flt.inc create mode 100644 src/asm/program_store_int.inc create mode 100644 src/asm/program_xmm_constants.inc diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index f1c3de8..11bb3f0 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -75,6 +75,11 @@ namespace RandomX { asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; } + void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) { + asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl; + asmCode << "\tand eax" << ", " << ((instr.alt % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl; + } + int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); } @@ -425,7 +430,7 @@ namespace RandomX { //6 uOPs void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) { - asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmov rax, " << regR[instr.src] << std::endl; int rotate = (13 - (instr.alt & 63)) & 63; if (rotate != 0) asmCode << "\trol rax, " << rotate << std::endl; @@ -474,6 +479,18 @@ namespace RandomX { asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; } + //3 uOPs + void AssemblyGeneratorX86::h_ISTORE(Instruction& instr, int i) { + genAddressRegDst(instr); + asmCode << "\tmov qword ptr [rsi+rax], " << regR[instr.src] << std::endl; + } + + //3 uOPs + void AssemblyGeneratorX86::h_FSTORE(Instruction& instr, int i) { + genAddressRegDst(instr, 16); + asmCode << "\tmovapd xmmword ptr [rsi+rax], " << regFE[instr.src] << std::endl; + } + #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x)) @@ -520,5 +537,8 @@ namespace RandomX { INST_HANDLE(COND_R) INST_HANDLE(COND_M) INST_HANDLE(CFROUND) + + INST_HANDLE(ISTORE) + INST_HANDLE(FSTORE) }; } \ No newline at end of file diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 2d3c9a6..5c22142 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -38,16 +38,8 @@ namespace RandomX { static InstructionGenerator engine[256]; std::stringstream asmCode; - void gena(Instruction&, int); - void genar(Instruction&, int); - void genaf(Instruction&, int); - void genbiashift(Instruction&, const char*); - void genbia(Instruction&); - void genbia32(Instruction&); - void genbf(Instruction&, const char*); - void gencr(Instruction&, bool); - void gencf(Instruction&, bool); void genAddressReg(Instruction&, const char*); + void genAddressRegDst(Instruction&, int); int32_t genAddressImm(Instruction&); void generateCode(Instruction&, int); @@ -85,5 +77,7 @@ namespace RandomX { void h_COND_R(Instruction&, int); void h_COND_M(Instruction&, int); void h_CFROUND(Instruction&, int); + void h_ISTORE(Instruction&, int); + void h_FSTORE(Instruction&, int); }; } \ No newline at end of file diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index f0a63d1..f5d33d0 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -71,14 +71,14 @@ namespace RandomX { reg.a[i].hi.u64 = getSmallPositiveFloatBits(reg.f[i].hi.u64); } compiler.generateProgram(gen); - mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; + mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & -64; mem.mx = *(((uint32_t*)seed) + 5); } void CompiledVirtualMachine::execute() { - executeProgram(reg, mem, scratchpad, InstructionCount); + //executeProgram(reg, mem, scratchpad, InstructionCount); totalSize += compiler.getCodeSize(); - //compiler.getProgramFunc()(reg, mem, scratchpad); + compiler.getProgramFunc()(reg, mem, scratchpad, InstructionCount); #ifdef TRACEVM for (int32_t i = InstructionCount - 1; i >= 0; --i) { std::cout << std::hex << tracepad[i].u64 << std::endl; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index c766ffd..13cfc1d 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -32,6 +32,10 @@ namespace RandomX { os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]"; } + void Instruction::genAddressRegDst(std::ostream& os) const { + os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)dst << "]"; + } + void Instruction::genAddressImm(std::ostream& os) const { os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; } @@ -276,7 +280,7 @@ namespace RandomX { } void Instruction::h_CFROUND(std::ostream& os) const { - os << "r" << (int)dst << ", " << (alt & 63) << std::endl; + os << "r" << (int)src << ", " << (alt & 63) << std::endl; } static inline const char* condition(int index) { @@ -311,6 +315,18 @@ namespace RandomX { os << ", " << imm32 << ")" << std::endl; } + void Instruction::h_ISTORE(std::ostream& os) const { + genAddressRegDst(os); + os << ", r" << (int)src << std::endl; + } + + void Instruction::h_FSTORE(std::ostream& os) const { + const char reg = (src >= 4) ? 'e' : 'f'; + genAddressRegDst(os); + auto srcIndex = src % 4; + os << ", " << reg << srcIndex << std::endl; + } + #include "instructionWeights.hpp" #define INST_NAME(x) REPN(#x, WT(x)) #define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x)) @@ -358,6 +374,9 @@ namespace RandomX { INST_NAME(COND_R) INST_NAME(COND_M) INST_NAME(CFROUND) + + INST_NAME(ISTORE) + INST_NAME(FSTORE) }; InstructionVisualizer Instruction::engine[256] = { @@ -403,6 +422,9 @@ namespace RandomX { INST_HANDLE(COND_R) INST_HANDLE(COND_M) INST_HANDLE(CFROUND) + + INST_HANDLE(ISTORE) + INST_HANDLE(FSTORE) }; } \ No newline at end of file diff --git a/src/Instruction.hpp b/src/Instruction.hpp index becb983..017d92f 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -49,6 +49,7 @@ namespace RandomX { void genAddressReg(std::ostream& os) const; void genAddressImm(std::ostream& os) const; + void genAddressRegDst(std::ostream&) const; void h_IADD_R(std::ostream&) const; void h_IADD_M(std::ostream&) const; @@ -83,6 +84,8 @@ namespace RandomX { void h_COND_R(std::ostream&) const; void h_COND_M(std::ostream&) const; void h_CFROUND(std::ostream&) const; + void h_ISTORE(std::ostream&) const; + void h_FSTORE(std::ostream&) const; }; static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction"); diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index e0e8f62..a799e11 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -27,11 +27,16 @@ #define DECL(x) x #endif .global DECL(randomx_program_prologue) -.global DECL(randomx_program_begin) +.global DECL(randomx_loop_begin) +.global DECL(randomx_program_load_int) +.global DECL(randomx_program_load_flt) +.global DECL(randomx_program_start) +.global DECL(randomx_program_read_dataset) +.global DECL(randomx_program_store_int) +.global DECL(randomx_program_store_flt) +.global DECL(randomx_program_loop_end) .global DECL(randomx_program_epilogue) -.global DECL(randomx_program_read) .global DECL(randomx_program_end) -.global DECL(randomx_program_transform) #define db .byte @@ -40,21 +45,37 @@ DECL(randomx_program_prologue): #include "asm/program_prologue_linux.inc" .align 64 -DECL(randomx_program_begin): + #include "asm/program_xmm_constants.inc" + +.align 64 +DECL(randomx_loop_begin): + nop + +DECL(randomx_program_load_int): + #include "asm/program_load_int.inc" + +DECL(randomx_program_load_flt): + #include "asm/program_load_flt.inc" + +DECL(randomx_program_start): + nop + +DECL(randomx_program_read_dataset): + #include "asm/program_read_dataset.inc" + +DECL(randomx_program_store_int): + #include "asm/program_store_int.inc" + +DECL(randomx_program_store_flt): + #include "asm/program_store_flt.inc" + +DECL(randomx_program_loop_end): nop .align 64 DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" -.align 64 -DECL(randomx_program_read): - #include "asm/program_read.inc" - .align 64 DECL(randomx_program_end): nop - -.align 8 -DECL(randomx_program_transform): - #include "asm/program_transform_address.inc" diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index 031c2e4..8d5a4fe 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -20,12 +20,16 @@ IFDEF RAX _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue -PUBLIC randomx_program_begin +PUBLIC randomx_loop_begin +PUBLIC randomx_program_load_int +PUBLIC randomx_program_load_flt +PUBLIC randomx_program_start +PUBLIC randomx_program_read_dataset +PUBLIC randomx_program_store_int +PUBLIC randomx_program_store_flt +PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue -PUBLIC randomx_program_read PUBLIC randomx_program_end -PUBLIC randomx_program_transform - ALIGN 64 randomx_program_prologue PROC @@ -33,30 +37,51 @@ randomx_program_prologue PROC randomx_program_prologue ENDP ALIGN 64 -randomx_program_begin PROC + include asm/program_xmm_constants.inc + +ALIGN 64 +randomx_loop_begin PROC nop -randomx_program_begin ENDP +randomx_loop_begin ENDP + +randomx_program_load_int PROC + include asm/program_load_int.inc +randomx_program_load_int ENDP + +randomx_program_load_flt PROC + include asm/program_load_flt.inc +randomx_program_load_flt ENDP + +randomx_program_start PROC + nop +randomx_program_start ENDP + +randomx_program_read_dataset PROC + include asm/program_read_dataset.inc +randomx_program_read_dataset ENDP + +randomx_program_store_int PROC + include asm/program_store_int.inc +randomx_program_store_int ENDP + +randomx_program_store_flt PROC + include asm/program_store_flt.inc +randomx_program_store_flt ENDP + +randomx_program_loop_end PROC + nop +randomx_program_loop_end ENDP ALIGN 64 randomx_program_epilogue PROC include asm/program_epilogue_win64.inc randomx_program_epilogue ENDP -ALIGN 64 -randomx_program_read PROC - include asm/program_read.inc -randomx_program_read ENDP - ALIGN 64 randomx_program_end PROC nop randomx_program_end ENDP -ALIGN 8 -randomx_program_transform PROC - include asm/program_transform_address.inc -randomx_program_transform ENDP - _RANDOMX_JITX86_STATIC ENDS ENDIF diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index e72244a..df5cd28 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -18,10 +18,15 @@ along with RandomX. If not, see. */ extern "C" { - void randomx_program_prologue(); - void randomx_program_begin(); - void randomx_program_epilogue(); - void randomx_program_transform(); - void randomx_program_read(); - void randomx_program_end(); + void randomx_program_prologue(); + void randomx_loop_begin(); + void randomx_program_load_int(); + void randomx_program_load_flt(); + void randomx_program_start(); + void randomx_program_read_dataset(); + void randomx_program_store_int(); + void randomx_program_store_flt(); + void randomx_program_loop_end(); + void randomx_program_epilogue(); + void randomx_program_end(); } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 8776d61..e001464 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -38,7 +38,7 @@ along with RandomX. If not, see. namespace RandomX { -#if true || !defined(_M_X64) && !defined(__x86_64__) +#if !defined(_M_X64) && !defined(__x86_64__) JitCompilerX86::JitCompilerX86() { //throw std::runtime_error("JIT compiler only supports x86-64 CPUs"); } @@ -53,69 +53,132 @@ namespace RandomX { #else /* - REGISTER ALLOCATION: - rax -> temporary - rbx -> "ic" - rcx -> temporary - rdx -> temporary - rsi -> convertible_t* scratchpad - rdi -> beginning of VM stack - rbp -> "ma", "mx" - rsp -> end of VM stack - r8 -> "r0" - r9 -> "r1" - r10 -> "r2" - r11 -> "r3" - r12 -> "r4" - r13 -> "r5" - r14 -> "r6" - r15 -> "r7" - xmm0 -> temporary - xmm1 -> temporary - xmm2 -> "f2" - xmm3 -> "f3" - xmm4 -> "f4" - xmm5 -> "f5" - xmm6 -> "f6" - xmm7 -> "f7" - xmm8 -> "f0" - xmm9 -> "f1" - xmm10 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff + REGISTER ALLOCATION: - STACK STRUCTURE: - - | - | - | saved registers - | - v - [rdi+8] RegisterFile& registerFile - [rdi] uint8_t* dataset - | - | - | VM stack - | - v - [rsp] last element of VM stack + ; rax -> temporary + ; rbx -> loop counter "lc" + ; rcx -> temporary + ; rdx -> temporary + ; rsi -> scratchpad pointer + ; rdi -> dataset pointer + ; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits) + ; rsp -> stack pointer + ; r8 -> "r0" + ; r9 -> "r1" + ; r10 -> "r2" + ; r11 -> "r3" + ; r12 -> "r4" + ; r13 -> "r5" + ; r14 -> "r6" + ; r15 -> "r7" + ; xmm0 -> "f0" + ; xmm1 -> "f1" + ; xmm2 -> "f2" + ; xmm3 -> "f3" + ; xmm4 -> "e0" + ; xmm5 -> "e1" + ; xmm6 -> "e2" + ; xmm7 -> "e3" + ; xmm8 -> "a0" + ; xmm9 -> "a1" + ; xmm10 -> "a2" + ; xmm11 -> "a3" + ; xmm12 -> temporary + ; xmm13 -> DBL_MIN + ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff + ; xmm15 -> sign mask 0x80000000000000008000000000000000 */ #include "JitCompilerX86-static.hpp" const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; - const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin; + const uint8_t* codeLoopBegin = (uint8_t*)&randomx_loop_begin; + const uint8_t* codeLoadInt = (uint8_t*)&randomx_program_load_int; + const uint8_t* codeLoadFlt = (uint8_t*)&randomx_program_load_flt; + const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; + const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; + const uint8_t* codeStoreInt = (uint8_t*)&randomx_program_store_int; + const uint8_t* codeStoreFlt = (uint8_t*)&randomx_program_store_flt; + const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; - const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; - const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform; - const int32_t prologueSize = codeProgramBegin - codePrologue; - const int32_t epilogueSize = codeReadDataset - codeEpilogue; - const int32_t readDatasetSize = codeProgramEnd - codeReadDataset; + const int32_t prologueSize = codeLoopBegin - codePrologue; + const int32_t epilogueSize = codeProgramEnd - codeEpilogue; - const int32_t readDatasetOffset = CodeSize - readDatasetSize; - const int32_t epilogueOffset = readDatasetOffset - epilogueSize; + const int32_t loadIntSize = codeLoadFlt - codeLoadInt; + const int32_t loadFltSize = codeProgamStart - codeLoadFlt; + const int32_t readDatasetSize = codeStoreInt - codeReadDataset; + const int32_t storeIntSize = codeStoreFlt - codeStoreInt; + const int32_t storeFltSize = codeLoopEnd - codeStoreFlt; + + const int32_t epilogueOffset = CodeSize - epilogueSize; + + static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; + static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 }; + static const uint8_t REX_SUB_RR[] = { 0x4d, 0x2b }; + static const uint8_t REX_SUB_RM[] = { 0x4c, 0x2b }; + static const uint8_t REX_MOV_RR[] = { 0x41, 0x8b }; + static const uint8_t REX_MOV_RR64[] = { 0x49, 0x8b }; + static const uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b }; + static const uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf }; + static const uint8_t REX_IMUL_RRI[] = { 0x4d, 0x69 }; + static const uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf }; + static const uint8_t REX_MUL_R[] = { 0x49, 0xf7 }; + static const uint8_t REX_MUL_M[] = { 0x48, 0xf7 }; + static const uint8_t REX_81[] = { 0x49, 0x81 }; + static const uint8_t AND_EAX_I = 0x25; + static const uint8_t MOV_EAX_I = 0xb8; + static const uint8_t MOV_RAX_I[] = { 0x48, 0xb8 }; + static const uint8_t MOV_RCX_I[] = { 0x48, 0xb9 }; + static const uint8_t REX_LEA[] = { 0x4f, 0x8d }; + static const uint8_t REX_MUL_MEM[] = { 0x48, 0xf7, 0x24, 0x0e }; + static const uint8_t REX_IMUL_MEM[] = { 0x48, 0xf7, 0x2c, 0x0e }; + static const uint8_t REX_SHR_RAX[] = { 0x48, 0xc1, 0xe8 }; + static const uint8_t RAX_ADD_SBB_1[] = { 0x48, 0x83, 0xC0, 0x01, 0x48, 0x83, 0xD8, 0x00 }; + static const uint8_t MUL_RCX[] = { 0x48, 0xf7, 0xe1 }; + static const uint8_t REX_SHR_RDX[] = { 0x48, 0xc1, 0xea }; + static const uint8_t REX_SH[] = { 0x49, 0xc1 }; + static const uint8_t MOV_RCX_RAX_SAR_RCX_63[] = { 0x48, 0x89, 0xc1, 0x48, 0xc1, 0xf9, 0x3f }; + static const uint8_t AND_ECX_I[] = { 0x81, 0xe1 }; + static const uint8_t ADD_RAX_RCX[] = { 0x48, 0x01, 0xC8 }; + static const uint8_t SAR_RAX_I8[] = { 0x48, 0xC1, 0xF8 }; + static const uint8_t NEG_RAX[] = { 0x48, 0xF7, 0xD8 }; + static const uint8_t ADD_R_RAX[] = { 0x49, 0x01 }; + static const uint8_t XOR_EAX_EAX[] = { 0x31, 0xC0 }; + static const uint8_t ADD_RDX_R[] = { 0x4c, 0x01 }; + static const uint8_t SUB_RDX_R[] = { 0x4c, 0x29 }; + static const uint8_t SAR_RDX_I8[] = { 0x48, 0xC1, 0xFA }; + static const uint8_t TEST_RDX_RDX[] = { 0x48, 0x85, 0xD2 }; + static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x01, 0xC2 }; + static const uint8_t REX_NEG[] = { 0x49, 0xF7 }; + static const uint8_t REX_XOR_RR[] = { 0x4D, 0x33 }; + static const uint8_t REX_XOR_RI[] = { 0x49, 0x81 }; + static const uint8_t REX_XOR_RM[] = { 0x4c, 0x33 }; + static const uint8_t REX_ROT_CL[] = { 0x49, 0xd3 }; + static const uint8_t REX_ROT_I8[] = { 0x49, 0xc1 }; + static const uint8_t SHUFPD[] = { 0x66, 0x0f, 0xc6 }; + static const uint8_t REX_ADDPD[] = { 0x66, 0x41, 0x0f, 0x58 }; + static const uint8_t REX_CVTDQ2PD_XMM12[] = { 0xf3, 0x44, 0x0f, 0xe6, 0x24, 0x06 }; + static const uint8_t REX_SUBPD[] = { 0x66, 0x41, 0x0f, 0x5c }; + static const uint8_t REX_XORPS[] = { 0x41, 0x0f, 0x57 }; + static const uint8_t REX_MULPD[] = { 0x66, 0x41, 0x0f, 0x59 }; + static const uint8_t REX_MAXPD[] = { 0x66, 0x41, 0x0f, 0x5f }; + static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e }; + static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; + static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xF8, 0x0F, 0xAE, 0x54, 0x24, 0xF8 }; + static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; + static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; + static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; + static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 }; + static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 }; + static const uint8_t REX_MOV_MR[] = { 0x4c, 0x89 }; + static const uint8_t REX_XOR_EAX[] = { 0x41, 0x33 }; + static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 }; + static const uint8_t JNZ[] = { 0x0f, 0x85 }; + static const uint8_t JMP = 0xe9; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize + readDatasetSize; @@ -132,687 +195,613 @@ namespace RandomX { throw std::runtime_error("mmap failed"); #endif memcpy(code, codePrologue, prologueSize); - memcpy(code + CodeSize - epilogueSize - readDatasetSize, codeEpilogue, epilogueSize); - memcpy(code + CodeSize - readDatasetSize, codeReadDataset, readDatasetSize); + memcpy(code + CodeSize - epilogueSize, codeEpilogue, epilogueSize); } void JitCompilerX86::generateProgram(Pcg32& gen) { - instructionOffsets.clear(); - callOffsets.clear(); + auto addressRegisters = gen(); + int readReg1 = addressRegisters & 1; + addressRegisters >>= 1; + int readReg2 = 2 + (addressRegisters & 1); + addressRegisters >>= 1; + int writeReg1 = 4 + (addressRegisters & 1); + addressRegisters >>= 1; + int writeReg2 = 6 + (addressRegisters & 1); codePos = prologueSize; + emit(REX_XOR_EAX); + emitByte(0xc0 + readReg1); + memcpy(code + codePos, codeLoadInt, loadIntSize); + codePos += loadIntSize; + emit(REX_XOR_EAX); + emitByte(0xc0 + readReg2); + memcpy(code + codePos, codeLoadFlt, loadFltSize); + codePos += loadFltSize; Instruction instr; for (unsigned i = 0; i < ProgramLength; ++i) { for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { *(((uint32_t*)&instr) + j) = gen(); } - generateCode(instr, i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr); } - emitByte(0xe9); - emit(instructionOffsets[0] - (codePos + 4)); - fixCallOffsets(); - uint32_t transform = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; - *reinterpret_cast(code + readDatasetOffset) = transform; + emit(REX_MOV_RR); + emitByte(0xc0 + readReg1); + emit(REX_XOR_EAX); + emitByte(0xc0 + readReg2); + memcpy(code + codePos, codeReadDataset, readDatasetSize); + codePos += readDatasetSize; + emit(REX_MOV_RR); + emitByte(0xc0 + writeReg1); + memcpy(code + codePos, codeStoreInt, storeIntSize); + codePos += storeIntSize; + emit(REX_XOR_EAX); + emitByte(0xc0 + writeReg2); + memcpy(code + codePos, codeStoreFlt, storeFltSize); + codePos += storeFltSize; + emit(SUB_EBX); + emit(JNZ); + emit32(prologueSize - codePos - 4); + emitByte(JMP); + emit32(epilogueOffset - codePos - 4); + emitByte(0x90); } - void JitCompilerX86::generateCode(Instruction& instr, int i) { - instructionOffsets.push_back(codePos); - emit(0x840fcbff); //dec ebx; jz - emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative) + void JitCompilerX86::generateCode(Instruction& instr) { auto generator = engine[instr.opcode]; - (this->*generator)(instr, i); + (this->*generator)(instr); } - void JitCompilerX86::fixCallOffsets() { - for (CallOffset& co : callOffsets) { - *reinterpret_cast(code + co.pos) = instructionOffsets[co.index] - (co.pos + 4); - } + void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { + emit(REX_MOV_RR); + emitByte((rax ? 0xc0 : 0xc8) + instr.src); + if (rax) + emitByte(AND_EAX_I); + else + emit(AND_ECX_I); + emit32((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); } - void JitCompilerX86::gena(Instruction& instr) { - emit(uint16_t(0x8149)); //xor - emitByte(0xf0 + (instr.rega % RegistersCount)); - emit(instr.addra); - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emit(0x753fc3f6); //test bl,0x3f; jne - emit(uint16_t(0xe805)); - emit(readDatasetOffset - (codePos + 4)); - if ((instr.loca & 192) == 0) { //A.LOC.X - emit(uint16_t(0x3348)); - emitByte(0xe8); //xor rbp, rax - } - emitByte(0x25); //and eax, - //if (instr.loca & 15) { - if (instr.loca & 3) { - emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad - } - else { - emit(ScratchpadL2 - 1); //first 256 KiB of scratchpad - } - /*} - else { - emit(ScratchpadL3 - 1); //whole scratchpad - }*/ + void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) { + emit(REX_MOV_RR); + emitByte(0xc0 + instr.dst); + emitByte(AND_EAX_I); + int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask; + int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask; + emit32((instr.alt % 4) ? maskL1 : maskL2); } - void JitCompilerX86::genar(Instruction& instr) { - gena(instr); - emit(0xc6048b48); //mov rax,QWORD PTR [rsi+rax*8] + void JitCompilerX86::genAddressImm(Instruction& instr) { + emit32(instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)); } - void JitCompilerX86::genaf(Instruction& instr) { - gena(instr); - emitByte(0xf3); - emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8] - } - - void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { - if (instr.locb & 1) { - emit(uint16_t(0x8b49)); //mov - emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb - emitByte(0x48); //REX.W - emit(opcodeReg); //xxx rax, cl + void JitCompilerX86::h_IADD_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_ADD_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); } else { - emitByte(0x48); //REX.W - emit(opcodeImm); //xxx rax, imm8 - emitByte((instr.imm8 & 63)); + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.imm32); } } - void JitCompilerX86::genbia(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { - if (instr.locb & 3) { - emit(opcodeReg); // xxx rax, r64 - emitByte(0xc0 + (instr.regb % RegistersCount)); + void JitCompilerX86::h_IADD_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_ADD_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); } else { - emit(opcodeImm); // xxx rax, imm32 - emit(instr.imm32); + emit(REX_ADD_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); } } - void JitCompilerX86::genbia32(Instruction& instr, uint16_t opcodeReg, uint8_t opcodeImm) { - if (instr.locb & 3) { - emit(opcodeReg); // xxx eax, r32 - emitByte(0xc0 + (instr.regb % RegistersCount)); + void JitCompilerX86::genSIB(int scale, int index, int base) { + emitByte((scale << 5) | (index << 3) | base); + } + + void JitCompilerX86::h_IADD_RC(Instruction& instr) { + emit(REX_LEA); + emitByte(0x84 + 8 * instr.dst); + genSIB(0, instr.src, instr.dst); + emit32(instr.imm32); + } + + void JitCompilerX86::h_ISUB_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_SUB_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); } else { - emitByte(opcodeImm); // xxx eax, imm32 - emit(instr.imm32); + emit(REX_81); + emitByte(0xe8 + instr.dst); + genAddressImm(instr); } } - void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) { - int regb = (instr.regb % RegistersCount); - emitByte(0x66); //xxxpd xmm0,regb - if (regb <= 1) { - emitByte(0x41); //REX - } - emitByte(0x0f); - emitByte(opcode); - emitByte(0xc0 + regb); - } - - - void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize, bool rax) { - if (rax) { - emit(0x41c88b48); //mov rcx, rax; REX + void JitCompilerX86::h_ISUB_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_SUB_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); } else { - emitByte(0x41); + emit(REX_SUB_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); } - emitByte(0x8b); // mov - emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc - emitByte(0x35); // xor eax - emit(instr.addrc); - emitByte(0x25); //and - emit(scratchpadSize - 1); - emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx } - void JitCompilerX86::gencr(Instruction& instr, bool rax = true) { - if (instr.locc & 8) { //write to register - emit(uint16_t(0x8b4c)); //mov - if (rax) { - emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax - } - else { - emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx - } + void JitCompilerX86::h_IMUL_9C(Instruction& instr) { + emit(REX_LEA); + emitByte(0x84 + 8 * instr.dst); + genSIB(3, instr.src, instr.dst); + emit32(instr.imm32); + } + + void JitCompilerX86::h_IMUL_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_IMUL_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); } else { - //if (instr.locc & 7) { - if (instr.locc & 1) { - scratchpadStoreR(instr, ScratchpadL1, rax); + emit(REX_IMUL_RRI); + emitByte(0xc0 + 9 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IMUL_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_IMUL_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_IMUL_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IMULH_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + else { + emitByte(MOV_EAX_I); + emit32(instr.imm32); + emit(REX_MUL_R); + emitByte(0xe0 + instr.dst); + emit(REX_ADD_RM); + emitByte(0xc2 + 8 * instr.dst); + } + } + + void JitCompilerX86::h_IMULH_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr, false); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_MEM); + } + else { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_M); + emitByte(0xa6); + genAddressImm(instr); + } + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_ISMULH_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + else { + emitByte(MOV_EAX_I); + emit32(instr.imm32); + emit(REX_MUL_R); + emitByte(0xe8 + instr.dst); + emit(REX_ADD_RM); + emitByte(0xc2 + 8 * instr.dst); + } + } + + void JitCompilerX86::h_ISMULH_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr, false); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_IMUL_MEM); + } + else { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_M); + emitByte(0xae); + genAddressImm(instr); + } + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_IDIV_C(Instruction& instr) { + if (instr.imm32 != 0) { + uint32_t divisor = instr.imm32; + if (divisor & (divisor - 1)) { + magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + if (mi.pre_shift == 0 && !mi.increment) { + emit(MOV_RAX_I); + emit64(mi.multiplier); + emit(REX_MUL_R); + emitByte(0xe0 + instr.dst); } else { - scratchpadStoreR(instr, ScratchpadL2, rax); - } - /*} - else { - scratchpadStoreR(instr, ScratchpadL3, rax); - }*/ - } - } - - void JitCompilerX86::scratchpadStoreF(Instruction& instr, int regc, uint32_t scratchpadSize, bool storeHigh) { - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + regc); //eax, regc - emitByte(0x35); // xor eax - emit(instr.addrc); - emitByte(0x25); //and - emit(scratchpadSize - 1); - emitByte(0x66); //movhpd/movlpd QWORD PTR [rsi+rax*8], regc - if (regc <= 1) { - emitByte(0x44); //REX - } - emitByte(0x0f); - emitByte(storeHigh ? 0x17 : 0x13); - emitByte(4 + 8 * regc); - emitByte(0xc6); - } - - void JitCompilerX86::gencf(Instruction& instr) { - int regc = (instr.regc % RegistersCount); - if (regc <= 1) { - emitByte(0x44); //REX - } - emit(uint16_t(0x280f)); //movaps - emitByte(0xc0 + 8 * regc); // regc, xmm0 - if (instr.locc & 8) { //write to scratchpad - //if (instr.locc & 7) { - if (instr.locc & 1) { //C.LOC.W - scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad - } - else { - scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //first 256 KiB of scratchpad - } - //} - /*else { - scratchpadStoreF(instr, regc, ScratchpadL3, (instr.locc & 128)); //whole scratchpad - }*/ - } - } - - void JitCompilerX86::h_ADD_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x0349, 0x0548); - gencr(instr); - } - - void JitCompilerX86::h_ADD_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x0341, 0x05); - gencr(instr); - } - - void JitCompilerX86::h_SUB_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x2b49, 0x2d48); - gencr(instr); - } - - void JitCompilerX86::h_SUB_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x2b41, 0x2d); - gencr(instr); - } - - void JitCompilerX86::h_MUL_64(Instruction& instr, int i) { - genar(instr); - if ((instr.locb & 7) <= 5) { - emitByte(0x49); //REX - emit(uint16_t(0xaf0f)); // imul rax, r64 - emitByte(0xc0 + (instr.regb % RegistersCount)); - } - else { - emitByte(0x48); //REX - emit(uint16_t(0xc069)); // imul rax, rax, imm32 - emit(instr.imm32); - } - gencr(instr); - } - - void JitCompilerX86::h_MULH_64(Instruction& instr, int i) { - genar(instr); - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x8b49)); //mov rcx, r64 - emitByte(0xc8 + (instr.regb % RegistersCount)); - } - else { - emitByte(0x48); - emit(uint16_t(0xc1c7)); // mov rcx, imm32 - emit(instr.imm32); - } - emitByte(0x48); - emit(uint16_t(0xe1f7)); // mul rcx - emitByte(0x48); - emit(uint16_t(0xc28b)); // mov rax,rdx - gencr(instr); - } - - void JitCompilerX86::h_MUL_32(Instruction& instr, int i) { - genar(instr); - emit(uint16_t(0xc88b)); //mov ecx, eax - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x8b41)); // mov eax, r32 - emitByte(0xc0 + (instr.regb % RegistersCount)); - } - else { - emitByte(0xb8); // mov eax, imm32 - emit(instr.imm32); - } - emit(0xc1af0f48); //imul rax,rcx - gencr(instr); - } - - void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) { - genar(instr); - emitByte(0x48); - emit(uint16_t(0xc863)); //movsxd rcx,eax - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x6349)); //movsxd rax,r32 - emitByte(0xc0 + (instr.regb % RegistersCount)); - } - else { - emitByte(0x48); - emit(uint16_t(0xc0c7)); // mov rax, imm32 - emit(instr.imm32); - } - emit(0xc1af0f48); //imul rax,rcx - gencr(instr); - } - - void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) { - genar(instr); - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x8b49)); //mov rcx, r64 - emitByte(0xc8 + (instr.regb % RegistersCount)); - } - else { - emitByte(0x48); - emit(uint16_t(0xc1c7)); // mov rcx, imm32 - emit(instr.imm32); - } - emitByte(0x48); - emit(uint16_t(0xe9f7)); // imul rcx - emitByte(0x48); - emit(uint16_t(0xc28b)); // mov rax,rdx - gencr(instr); - } - - void JitCompilerX86::h_DIV_64(Instruction& instr, int i) { - genar(instr); - if (instr.locb & 7) { -#ifdef MAGIC_DIVISION - if (instr.imm32 != 0) { - uint32_t divisor = instr.imm32; - if (divisor & (divisor - 1)) { - magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); if (mi.pre_shift > 0) { - if (mi.pre_shift == 1) { - emitByte(0x48); - emit(uint16_t(0xe8d1)); //shr rax,1 - } - else { - emit(0x00e8c148 | (mi.pre_shift << 24)); //shr rax, pre_shift - } + emit(REX_SHR_RAX); + emitByte(mi.pre_shift); } if (mi.increment) { - emit(0x00d8834801c08348); //add rax,1; sbb rax,0 + emit(RAX_ADD_SBB_1); } - emit(uint16_t(0xb948)); //movabs rcx, multiplier - emit(mi.multiplier); - emit(0x48e1f748); //mul rcx; REX - emit(uint16_t(0xc28b)); //mov rax,rdx - if (mi.post_shift > 0) - emit(0x00e8c148 | (mi.post_shift << 24)); //shr rax, post_shift - } - else { //divisor is a power of two - int shift = 0; - while (divisor >>= 1) - ++shift; - if (shift > 0) - emit(0x00e8c148 | (shift << 24)); //shr rax, shift + emit(MOV_RCX_I); + emit64(mi.multiplier); + emit(MUL_RCX); } - } -#else - emitByte(0xb9); //mov ecx, imm32 - emit(instr.imm32 != 0 ? instr.imm32 : 1); -#endif - } - else { - emitByte(0xb9); //mov ecx, 1 - emit(1); - emit(uint16_t(0x8b41)); //mov edx, r32 - emitByte(0xd0 + (instr.regb % RegistersCount)); - emit(0x450fd285); //test edx, edx; cmovne ecx,edx - emitByte(0xca); -#ifdef MAGIC_DIVISION - emit(0xf748d233); //xor edx,edx; div rcx - emitByte(0xf1); -#endif - } -#ifndef MAGIC_DIVISION - emit(0xf748d233); //xor edx,edx; div rcx - emitByte(0xf1); -#endif - gencr(instr); - } - - void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) { - genar(instr); - if (instr.locb & 7) { -#ifdef MAGIC_DIVISION - int64_t divisor = instr.imm32; - if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { - // +/- power of two - bool negative = divisor < 0; - if (negative) - divisor = -divisor; + if (mi.post_shift > 0) { + emit(REX_SHR_RDX); + emitByte(mi.post_shift); + } + emit(REX_ADD_RR); + emitByte(0xc2 + 8 * instr.dst); + } + else { //divisor is a power of two int shift = 0; - uint64_t unsignedDivisor = divisor; - while (unsignedDivisor >>= 1) + while (divisor >>= 1) ++shift; if (shift > 0) { - emitByte(0x48); - emit(uint16_t(0xc88b)); //mov rcx, rax - emit(0x3ff9c148); //sar rcx, 63 - uint32_t mask = (1ULL << shift) - 1; - emit(uint16_t(0xe181)); //and ecx, mask - emit(mask); - emitByte(0x48); - emit(uint16_t(0xc103)); //add rax, rcx - emit(0x00f8c148 | (shift << 24)); //sar rax, shift - } - if (negative) { - emitByte(0x48); - emit(uint16_t(0xd8f7)); //neg rax + emit(REX_SH); + emitByte(0xe8 + instr.dst); } } - else if (divisor != 0) { - magics_info mi = compute_signed_magic_info(divisor); - if ((divisor >= 0) != (mi.multiplier >= 0)) { - emitByte(0x48); - emit(uint16_t(0xc88b)); //mov rcx, rax - } - emit(uint16_t(0xba48)); //movabs rdx, multiplier - emit(mi.multiplier); - emit(0xd233c28b48eaf748); //imul rdx; mov rax,rdx; xor edx,edx - bool haveSF = false; - if (divisor > 0 && mi.multiplier < 0) { - emitByte(0x48); - emit(uint16_t(0xc103)); //add rax, rcx - haveSF = true; - } - if (divisor < 0 && mi.multiplier > 0) { - emitByte(0x48); - emit(uint16_t(0xc12b)); //sub rax, rcx - haveSF = true; - } - if (mi.shift > 0) { - emit(0x00f8c148 | (mi.shift << 24)); //sar rax, shift - haveSF = true; - } - if (!haveSF) { - emitByte(0x48); - emit(uint16_t(0x85c0)); - } - emit(0x48c2980f); //sets dl; add rax, rdx - emit(uint16_t(0xc203)); + } + } + + void JitCompilerX86::h_ISDIV_C(Instruction& instr) { + int64_t divisor = instr.imm32; + if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + // +/- power of two + bool negative = divisor < 0; + if (negative) + divisor = -divisor; + int shift = 0; + uint64_t unsignedDivisor = divisor; + while (unsignedDivisor >>= 1) + ++shift; + if (shift > 0) { + emit(MOV_RCX_RAX_SAR_RCX_63); + uint32_t mask = (1ULL << shift) - 1; + emit(AND_ECX_I); + emit32(mask); + emit(ADD_RAX_RCX); + emit(SAR_RAX_I8); + emitByte(shift); } -#else - emitByte(0xba); // mov edx, imm32 - emit(instr.imm32); -#endif + if (negative) + emit(NEG_RAX); + emit(ADD_R_RAX); + emitByte(0xc0 + instr.dst); + } + else if (divisor != 0) { + magics_info mi = compute_signed_magic_info(divisor); + emit(MOV_RAX_I); + emit64(mi.multiplier); + emit(REX_MUL_R); + emitByte(0xe8 + instr.dst); + emit(XOR_EAX_EAX); + bool haveSF = false; + if (divisor > 0 && mi.multiplier < 0) { + emit(ADD_RDX_R); + emitByte(0xc2 + 8 * instr.dst); + haveSF = true; + } + if (divisor < 0 && mi.multiplier > 0) { + emit(SUB_RDX_R); + emitByte(0xc2 + 8 * instr.dst); + haveSF = true; + } + if (mi.shift > 0) { + emit(SAR_RDX_I8); + emitByte(mi.shift); + haveSF = true; + } + if (!haveSF) + emit(TEST_RDX_RDX); + emit(SETS_AL_ADD_RDX_RAX); + emit(ADD_R_RAX); + emitByte(0xd0 + instr.dst); + } + } + + void JitCompilerX86::h_INEG_R(Instruction& instr) { + emit(REX_NEG); + emitByte(0xd8 + instr.dst); + } + + void JitCompilerX86::h_IXOR_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_XOR_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); } else { - emit(uint16_t(0x8b41)); //mov edx, r32 - emitByte(0xd0 + (instr.regb % RegistersCount)); -#ifndef MAGIC_DIVISION + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.imm32); } -#endif - emit(0xd8f7480575fffa83); //cmp edx,-1 - emit(uint16_t(0x12eb)); //jmp result - emit(0x0fd28500000001b9); - emit(0x489948c96348ca45); - emit(uint16_t(0xf9f7)); //idiv rcx -#ifdef MAGIC_DIVISION - } -#endif - gencr(instr); } - void JitCompilerX86::h_AND_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x2349, 0x2548); - gencr(instr); + void JitCompilerX86::h_IXOR_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_XOR_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_XOR_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } } - void JitCompilerX86::h_AND_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x2341, 0x25); - gencr(instr); + void JitCompilerX86::h_IROR_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_MOV_RR); + emitByte(0xc8 + instr.src); + emit(REX_ROT_CL); + emitByte(0xc8 + instr.dst); + } + else { + emit(REX_ROT_I8); + emitByte(0xc8 + instr.dst); + emitByte(instr.imm32 & 63); + } } - void JitCompilerX86::h_OR_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x0b49, 0x0d48); - gencr(instr); + void JitCompilerX86::h_IROL_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_MOV_RR); + emitByte(0xc8 + instr.src); + emit(REX_ROT_CL); + emitByte(0xc0 + instr.dst); + } + else { + emit(REX_ROT_I8); + emitByte(0xc0 + instr.dst); + emitByte(instr.imm32 & 63); + } } - void JitCompilerX86::h_OR_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x0b41, 0x0d); - gencr(instr); + void JitCompilerX86::h_FPSWAP_R(Instruction& instr) { + emit(SHUFPD); + emitByte(0xc0 + 9 * instr.dst); + emitByte(1); } - void JitCompilerX86::h_XOR_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x3349, 0x3548); - gencr(instr); + void JitCompilerX86::h_FPADD_R(Instruction& instr) { + instr.dst %= 4; + instr.src %= 4; + emit(REX_ADDPD); + emitByte(0xc0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_XOR_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x3341, 0x35); - gencr(instr); + void JitCompilerX86::h_FPADD_M(Instruction& instr) { + instr.dst %= 4; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_ADDPD); + emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_SHL_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xe0d3, 0xe0c1); - gencr(instr); + void JitCompilerX86::h_FPSUB_R(Instruction& instr) { + instr.dst %= 4; + instr.src %= 4; + emit(REX_SUBPD); + emitByte(0xc0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_SHR_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xe8d3, 0xe8c1); - gencr(instr); + void JitCompilerX86::h_FPSUB_M(Instruction& instr) { + instr.dst %= 4; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_SUBPD); + emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_SAR_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xf8d3, 0xf8c1); - gencr(instr); + void JitCompilerX86::h_FPNEG_R(Instruction& instr) { + instr.dst %= 4; + emit(REX_XORPS); + emitByte(0xc7 + 8 * instr.dst); } - void JitCompilerX86::h_ROL_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xc0d3, 0xc0c1); - gencr(instr); + void JitCompilerX86::h_FPMUL_R(Instruction& instr) { + instr.dst %= 4; + instr.src %= 4; + emit(REX_MULPD); + emitByte(0xe0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_ROR_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xc8d3, 0xc8c1); - gencr(instr); + void JitCompilerX86::h_FPMUL_M(Instruction& instr) { + instr.dst %= 4; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_MULPD); + emitByte(0xe4 + 8 * instr.dst); + emit(REX_MAXPD); + emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPADD(Instruction& instr, int i) { - genaf(instr); - genbf(instr, 0x58); - gencf(instr); + void JitCompilerX86::h_FPDIV_R(Instruction& instr) { + instr.dst %= 4; + instr.src %= 4; + emit(REX_DIVPD); + emitByte(0xe0 + instr.src + 8 * instr.dst); + emit(REX_MAXPD); + emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPSUB(Instruction& instr, int i) { - genaf(instr); - genbf(instr, 0x5c); - gencf(instr); + void JitCompilerX86::h_FPDIV_M(Instruction& instr) { + instr.dst %= 4; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_DIVPD); + emitByte(0xe4 + 8 * instr.dst); + emit(REX_MAXPD); + emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPMUL(Instruction& instr, int i) { - genaf(instr); - genbf(instr, 0x59); - emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1 - emit(uint16_t(0x540f)); //andps xmm0,xmm1 - emitByte(0xc1); - gencf(instr); + void JitCompilerX86::h_FPSQRT_R(Instruction& instr) { + instr.dst %= 4; + emit(SQRTPD); + emitByte(0xe4 + 9 * instr.dst); } - void JitCompilerX86::h_FPDIV(Instruction& instr, int i) { - genaf(instr); - genbf(instr, 0x5e); - emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1 - emit(uint16_t(0x540f)); //andps xmm0,xmm1 - emitByte(0xc1); - gencf(instr); - } - - void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) { - genaf(instr); - emit(0xc0510f66c2540f41); //andps xmm0,xmm10; sqrtpd xmm0,xmm0 - gencf(instr); - } - - void JitCompilerX86::h_FPROUND(Instruction& instr, int i) { - genar(instr); - emitByte(0x48); - emit(uint16_t(0xc88b)); //mov rcx,rax - int rotate = (13 - (instr.imm8 & 63)) & 63; + void JitCompilerX86::h_CFROUND(Instruction& instr) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.src); + int rotate = (13 - (instr.alt & 63)) & 63; if (rotate != 0) { - emitByte(0x48); - emit(uint16_t(0xc0c1)); //rol rax + emit(ROL_RAX); emitByte(rotate); } - emit(uint16_t(0x0025)); - emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0 - emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8] - emitByte(0xf8); - gencr(instr, false); //result in rcx + emit(AND_OR_MOV_LDMXCSR); } - static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) { - switch ((instr.locb & 7) ^ invert) + static inline uint8_t condition(Instruction& instr, bool invert = false) { + switch ((instr.alt & 7) ^ invert) { case 0: - return 0x76; //jbe + return 0x96; //setbe case 1: - return 0x77; //ja + return 0x97; //seta case 2: - return 0x78; //js + return 0x98; //sets case 3: - return 0x79; //jns + return 0x99; //setns case 4: - return 0x70; //jo + return 0x90; //seto case 5: - return 0x71; //jno + return 0x91; //setno case 6: - return 0x7c; //jl + return 0x9c; //setl case 7: - return 0x7d; //jge + return 0x9d; //setge } } - void JitCompilerX86::h_JUMP(Instruction& instr, int i) { - genar(instr); - gencr(instr); - emit(uint16_t(0x8141)); //cmp regb, imm32 - emitByte(0xf8 + (instr.regb % RegistersCount)); - emit(instr.imm32); - emitByte(0x0f); //near jump - emitByte(jumpCondition(instr) + 0x10); - i = wrapInstr(i + (instr.imm8 & 127) + 2); - if (i < instructionOffsets.size()) { - emit(instructionOffsets[i] - (codePos + 4)); - } - else { - callOffsets.push_back(CallOffset(codePos, i)); - codePos += 4; - } + void JitCompilerX86::h_COND_R(Instruction& instr) { + emit(XOR_ECX_ECX); + emit(REX_CMP_R32I); + emitByte(0xf8 + instr.src); + emit32(instr.imm32); + emitByte(0x0f); + emitByte(condition(instr)); + emitByte(0xc1); + emit(REX_ADD_RM); + emitByte(0xc1 + 8 * instr.dst); } - void JitCompilerX86::h_CALL(Instruction& instr, int i) { - genar(instr); - gencr(instr); - emit(uint16_t(0x8141)); //cmp regb, imm32 - emitByte(0xf8 + (instr.regb % RegistersCount)); - emit(instr.imm32); - emitByte(jumpCondition(instr, true)); - emitByte(0x05); - emitByte(0xe8); //call - i = wrapInstr(i + (instr.imm8 & 127) + 2); - if (i < instructionOffsets.size()) { - emit(instructionOffsets[i] - (codePos + 4)); - } - else { - callOffsets.push_back(CallOffset(codePos, i)); - codePos += 4; - } + void JitCompilerX86::h_COND_M(Instruction& instr) { + emit(XOR_ECX_ECX); + genAddressReg(instr); + emit(REX_CMP_M32I); + emit32(instr.imm32); + emitByte(0x0f); + emitByte(condition(instr)); + emitByte(0xc1); + emit(REX_ADD_RM); + emitByte(0xc1 + 8 * instr.dst); } - void JitCompilerX86::h_RET(Instruction& instr, int i) { - genar(instr); - int crlen = 0; - if ((instr.locc & 7) <= 3) { - crlen = 17; - } - emit(0x74e73b48); //cmp rsp, rdi; je - emitByte(0x01); - emitByte(0xc3); //ret + void JitCompilerX86::h_ISTORE(Instruction& instr) { + genAddressRegDst(instr); + emit(REX_MOV_MR); + emitByte(0x04 + 8 * instr.src); + emitByte(0x06); } - void JitCompilerX86::h_NOP(Instruction& instr, int i) { - genar(instr); + void JitCompilerX86::h_FSTORE(Instruction& instr) { + genAddressRegDst(instr, true); + emit(MOVAPD); + emitByte(0x04 + 8 * instr.src); + emitByte(0x06); } #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) InstructionGeneratorX86 JitCompilerX86::engine[256] = { - INST_HANDLE(ADD_64) - INST_HANDLE(ADD_32) - INST_HANDLE(SUB_64) - INST_HANDLE(SUB_32) - INST_HANDLE(MUL_64) - INST_HANDLE(MULH_64) - INST_HANDLE(MUL_32) - INST_HANDLE(IMUL_32) - INST_HANDLE(IMULH_64) - INST_HANDLE(DIV_64) - INST_HANDLE(IDIV_64) - INST_HANDLE(AND_64) - INST_HANDLE(AND_32) - INST_HANDLE(OR_64) - INST_HANDLE(OR_32) - INST_HANDLE(XOR_64) - INST_HANDLE(XOR_32) - INST_HANDLE(SHL_64) - INST_HANDLE(SHR_64) - INST_HANDLE(SAR_64) - INST_HANDLE(ROL_64) - INST_HANDLE(ROR_64) - INST_HANDLE(FPADD) - INST_HANDLE(FPSUB) - INST_HANDLE(FPMUL) - INST_HANDLE(FPDIV) - INST_HANDLE(FPSQRT) - INST_HANDLE(FPROUND) - INST_HANDLE(JUMP) - INST_HANDLE(CALL) - INST_HANDLE(RET) - INST_HANDLE(NOP) + INST_HANDLE(IADD_R) + INST_HANDLE(IADD_M) + INST_HANDLE(IADD_RC) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_9C) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IDIV_C) + INST_HANDLE(ISDIV_C) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(FPSWAP_R) + INST_HANDLE(FPADD_R) + INST_HANDLE(FPADD_M) + INST_HANDLE(FPSUB_R) + INST_HANDLE(FPSUB_M) + INST_HANDLE(FPNEG_R) + INST_HANDLE(FPMUL_R) + INST_HANDLE(FPMUL_M) + INST_HANDLE(FPDIV_R) + INST_HANDLE(FPDIV_M) + INST_HANDLE(FPSQRT_R) + INST_HANDLE(COND_R) + INST_HANDLE(COND_M) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(FSTORE) }; + #endif } \ No newline at end of file diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index e6a7e6d..fa5aa93 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -30,16 +30,10 @@ namespace RandomX { class JitCompilerX86; - typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); + typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&); constexpr uint32_t CodeSize = 64 * 1024; - struct CallOffset { - CallOffset(int32_t p, int32_t i) : pos(p), index(i) {} - int32_t pos; - int32_t index; - }; - class JitCompilerX86 { public: JitCompilerX86(); @@ -55,66 +49,82 @@ namespace RandomX { static InstructionGeneratorX86 engine[256]; uint8_t* code; int32_t codePos; - std::vector instructionOffsets; - std::vector callOffsets; - void gena(Instruction&); - void genar(Instruction&); - void genaf(Instruction&); - void genbiashift(Instruction&, uint16_t, uint16_t); - void genbia(Instruction&, uint16_t, uint16_t); - void genbia32(Instruction&, uint16_t, uint8_t); - void genbf(Instruction&, uint8_t); - void scratchpadStoreR(Instruction&, uint32_t, bool); - void scratchpadStoreF(Instruction&, int, uint32_t, bool); - void gencr(Instruction&, bool); - void gencf(Instruction&); - void generateCode(Instruction&, int); - void fixCallOffsets(); + void genAddressReg(Instruction&, bool); + void genAddressRegDst(Instruction&, bool); + void genAddressImm(Instruction&); + void genSIB(int scale, int index, int base); + + void generateCode(Instruction&); void emitByte(uint8_t val) { code[codePos] = val; codePos++; } - template - void emit(T val) { - *reinterpret_cast(code + codePos) = val; - codePos += sizeof(T); + void emit32(uint32_t val) { + code[codePos + 0] = val; + code[codePos + 1] = val >> 8; + code[codePos + 2] = val >> 16; + code[codePos + 3] = val >> 24; + codePos += 4; } - void h_ADD_64(Instruction&, int); - void h_ADD_32(Instruction&, int); - void h_SUB_64(Instruction&, int); - void h_SUB_32(Instruction&, int); - void h_MUL_64(Instruction&, int); - void h_MULH_64(Instruction&, int); - void h_MUL_32(Instruction&, int); - void h_IMUL_32(Instruction&, int); - void h_IMULH_64(Instruction&, int); - void h_DIV_64(Instruction&, int); - void h_IDIV_64(Instruction&, int); - void h_AND_64(Instruction&, int); - void h_AND_32(Instruction&, int); - void h_OR_64(Instruction&, int); - void h_OR_32(Instruction&, int); - void h_XOR_64(Instruction&, int); - void h_XOR_32(Instruction&, int); - void h_SHL_64(Instruction&, int); - void h_SHR_64(Instruction&, int); - void h_SAR_64(Instruction&, int); - void h_ROL_64(Instruction&, int); - void h_ROR_64(Instruction&, int); - void h_FPADD(Instruction&, int); - void h_FPSUB(Instruction&, int); - void h_FPMUL(Instruction&, int); - void h_FPDIV(Instruction&, int); - void h_FPSQRT(Instruction&, int); - void h_FPROUND(Instruction&, int); - void h_JUMP(Instruction&, int); - void h_CALL(Instruction&, int); - void h_RET(Instruction&, int); - void h_NOP(Instruction&, int); + void emit64(uint64_t val) { + code[codePos + 0] = val; + code[codePos + 1] = val >> 8; + code[codePos + 2] = val >> 16; + code[codePos + 3] = val >> 24; + code[codePos + 4] = val >> 32; + code[codePos + 5] = val >> 40; + code[codePos + 6] = val >> 48; + code[codePos + 7] = val >> 56; + codePos += 8; + } + + template + void emit(const uint8_t (&src)[N]) { + for (int i = 0; i < N; ++i) { + code[codePos + i] = src[i]; + } + codePos += N; + } + + void h_IADD_R(Instruction&); + void h_IADD_M(Instruction&); + void h_IADD_RC(Instruction&); + void h_ISUB_R(Instruction&); + void h_ISUB_M(Instruction&); + void h_IMUL_9C(Instruction&); + void h_IMUL_R(Instruction&); + void h_IMUL_M(Instruction&); + void h_IMULH_R(Instruction&); + void h_IMULH_M(Instruction&); + void h_ISMULH_R(Instruction&); + void h_ISMULH_M(Instruction&); + void h_IDIV_C(Instruction&); + void h_ISDIV_C(Instruction&); + void h_INEG_R(Instruction&); + void h_IXOR_R(Instruction&); + void h_IXOR_M(Instruction&); + void h_IROR_R(Instruction&); + void h_IROL_R(Instruction&); + void h_FPSWAP_R(Instruction&); + void h_FPADD_R(Instruction&); + void h_FPADD_M(Instruction&); + void h_FPSUB_R(Instruction&); + void h_FPSUB_M(Instruction&); + void h_FPNEG_R(Instruction&); + void h_FPMUL_R(Instruction&); + void h_FPMUL_M(Instruction&); + void h_FPDIV_R(Instruction&); + void h_FPDIV_M(Instruction&); + void h_FPSQRT_R(Instruction&); + void h_COND_R(Instruction&); + void h_COND_M(Instruction&); + void h_CFROUND(Instruction&); + void h_ISTORE(Instruction&); + void h_FSTORE(Instruction&); }; } \ No newline at end of file diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc index 95a4752..b94fa4d 100644 --- a/src/asm/program_epilogue_store.inc +++ b/src/asm/program_epilogue_store.inc @@ -1,9 +1,5 @@ - ;# unroll VM stack - mov rsp, rdi - ;# save VM register values pop rcx - pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 @@ -12,12 +8,12 @@ mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - movapd xmmword ptr [rcx+64], xmm8 - movapd xmmword ptr [rcx+80], xmm9 - movapd xmmword ptr [rcx+96], xmm2 - movapd xmmword ptr [rcx+112], xmm3 + movdqa xmmword ptr [rcx+64], xmm0 + movdqa xmmword ptr [rcx+80], xmm1 + movdqa xmmword ptr [rcx+96], xmm2 + movdqa xmmword ptr [rcx+112], xmm3 lea rcx, [rcx+64] - movapd xmmword ptr [rcx+64], xmm4 - movapd xmmword ptr [rcx+80], xmm5 - movapd xmmword ptr [rcx+96], xmm6 - movapd xmmword ptr [rcx+112], xmm7 \ No newline at end of file + movdqa xmmword ptr [rcx+64], xmm4 + movdqa xmmword ptr [rcx+80], xmm5 + movdqa xmmword ptr [rcx+96], xmm6 + movdqa xmmword ptr [rcx+112], xmm7 \ No newline at end of file diff --git a/src/asm/program_epilogue_win64.inc b/src/asm/program_epilogue_win64.inc index 220bed8..f2e4b44 100644 --- a/src/asm/program_epilogue_win64.inc +++ b/src/asm/program_epilogue_win64.inc @@ -1,6 +1,12 @@ include program_epilogue_store.inc ;# restore callee-saved registers - Microsoft x64 calling convention + movdqu xmm15, xmmword ptr [rsp] + movdqu xmm14, xmmword ptr [rsp+16] + movdqu xmm13, xmmword ptr [rsp+32] + movdqu xmm12, xmmword ptr [rsp+48] + movdqu xmm11, xmmword ptr [rsp+64] + add rsp, 80 movdqu xmm10, xmmword ptr [rsp] movdqu xmm9, xmmword ptr [rsp+16] movdqu xmm8, xmmword ptr [rsp+32] @@ -17,4 +23,4 @@ pop rbx ;# program finished - ret 0 \ No newline at end of file + ret diff --git a/src/asm/program_load_flt.inc b/src/asm/program_load_flt.inc new file mode 100644 index 0000000..af6f1b7 --- /dev/null +++ b/src/asm/program_load_flt.inc @@ -0,0 +1,14 @@ + and eax, 262080 + lea rcx, [rsi+rax] + cvtdq2pd xmm0, qword ptr [rcx+0] + cvtdq2pd xmm1, qword ptr [rcx+8] + cvtdq2pd xmm2, qword ptr [rcx+16] + cvtdq2pd xmm3, qword ptr [rcx+24] + cvtdq2pd xmm4, qword ptr [rcx+32] + cvtdq2pd xmm5, qword ptr [rcx+40] + cvtdq2pd xmm6, qword ptr [rcx+48] + cvtdq2pd xmm7, qword ptr [rcx+56] + andps xmm4, xmm14 + andps xmm5, xmm14 + andps xmm6, xmm14 + andps xmm7, xmm14 diff --git a/src/asm/program_load_int.inc b/src/asm/program_load_int.inc new file mode 100644 index 0000000..d139549 --- /dev/null +++ b/src/asm/program_load_int.inc @@ -0,0 +1,10 @@ + and eax, 262080 + lea rcx, [rsi+rax] + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index 6bc3bd2..67a967d 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -7,13 +7,14 @@ push r15 ;# function arguments + mov rbx, rcx ;# loop counter push rdi ;# RegisterFile& registerFile - mov rbp, qword ptr [rsi] ;# "mx", "ma" - mov rax, qword ptr [rsi+8] ;# uint8_t* dataset - push rax - mov rsi, rdx ;# convertible_t* scratchpad mov rcx, rdi + mov rbp, qword ptr [rsi] ;# "mx", "ma" + mov eax, ebp ;# "mx" + mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset + mov rsi, rdx ;# convertible_t* scratchpad #include "program_prologue_load.inc" - jmp randomx_program_begin \ No newline at end of file + jmp DECL(randomx_loop_begin) \ No newline at end of file diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index 9ceeed6..ecdd4f9 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -1,27 +1,20 @@ - mov rdi, rsp ;# beginning of VM stack - mov ebx, 262145 ;# number of VM instructions to execute + 1 + ;# zero integer registers + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 - xorps xmm10, xmm10 - cmpeqpd xmm10, xmm10 - psrlq xmm10, 1 ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff + ;# load constant registers + lea rcx, [rcx+120] + movapd xmm8, xmmword ptr [rcx+72] + movapd xmm9, xmmword ptr [rcx+88] + movapd xmm10, xmmword ptr [rcx+104] + movapd xmm11, xmmword ptr [rcx+120] + movapd xmm13, xmmword ptr [minDbl] + movapd xmm14, xmmword ptr [absMask] + movapd xmm15, xmmword ptr [signMask] - ;# load integer registers - mov r8, qword ptr [rcx+0] - mov r9, qword ptr [rcx+8] - mov r10, qword ptr [rcx+16] - mov r11, qword ptr [rcx+24] - mov r12, qword ptr [rcx+32] - mov r13, qword ptr [rcx+40] - mov r14, qword ptr [rcx+48] - mov r15, qword ptr [rcx+56] - - ;# load floating point registers - movapd xmm8, xmmword ptr [rcx+64] - movapd xmm9, xmmword ptr [rcx+80] - movapd xmm2, xmmword ptr [rcx+96] - movapd xmm3, xmmword ptr [rcx+112] - lea rcx, [rcx+64] - movapd xmm4, xmmword ptr [rcx+64] - movapd xmm5, xmmword ptr [rcx+80] - movapd xmm6, xmmword ptr [rcx+96] - movapd xmm7, xmmword ptr [rcx+112] diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index bbf7851..83ae2a5 100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -13,14 +13,21 @@ movdqu xmmword ptr [rsp+32], xmm8 movdqu xmmword ptr [rsp+16], xmm9 movdqu xmmword ptr [rsp+0], xmm10 + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm11 + movdqu xmmword ptr [rsp+48], xmm12 + movdqu xmmword ptr [rsp+32], xmm13 + movdqu xmmword ptr [rsp+16], xmm14 + movdqu xmmword ptr [rsp+0], xmm15 - ;# function arguments - push rcx ;# RegisterFile& registerFile - mov rbp, qword ptr [rdx] ;# "mx", "ma" - mov rax, qword ptr [rdx+8] ;# uint8_t* dataset - push rax - mov rsi, r8 ;# convertible_t* scratchpad + ; function arguments + push rcx ; RegisterFile& registerFile + mov rbp, qword ptr [rdx] ; "mx", "ma" + mov eax, ebp ; "mx" + mov rdi, qword ptr [rdx+8] ; uint8_t* dataset + mov rsi, r8 ; convertible_t* scratchpad + mov rbx, r9 ; loop counter include program_prologue_load.inc - jmp randomx_program_begin \ No newline at end of file + jmp randomx_loop_begin \ No newline at end of file diff --git a/src/asm/program_read.inc b/src/asm/program_read.inc deleted file mode 100644 index c7650ea..0000000 --- a/src/asm/program_read.inc +++ /dev/null @@ -1,20 +0,0 @@ - db 0, 0, 0, 0 ;# TransformAddress placeholder - mov rcx, qword ptr [rdi] ;# load the dataset address - xor rbp, rax ;# modify "mx" - ;# prefetch cacheline "mx" - and rbp, -64 ;# align "mx" to the start of a cache line - mov edx, ebp ;# edx = mx - prefetchnta byte ptr [rcx+rdx] - ;# read cacheline "ma" - ror rbp, 32 ;# swap "ma" and "mx" - mov edx, ebp ;# edx = ma - lea rcx, [rcx+rdx] ;# dataset cache line - xor r8, qword ptr [rcx+0] - xor r9, qword ptr [rcx+8] - xor r10, qword ptr [rcx+16] - xor r11, qword ptr [rcx+24] - xor r12, qword ptr [rcx+32] - xor r13, qword ptr [rcx+40] - xor r14, qword ptr [rcx+48] - xor r15, qword ptr [rcx+56] - ret \ No newline at end of file diff --git a/src/asm/program_read_dataset.inc b/src/asm/program_read_dataset.inc new file mode 100644 index 0000000..bae4817 --- /dev/null +++ b/src/asm/program_read_dataset.inc @@ -0,0 +1,16 @@ + xor rbp, rax ;# modify "mx" + and rbp, -64 ;# align "mx" to the start of a cache line + mov edx, ebp ;# edx = mx + prefetchnta byte ptr [rdi+rdx] + ror rbp, 32 ;# swap "ma" and "mx" + mov edx, ebp ;# edx = ma + lea rcx, [rdi+rdx] ;# dataset cache line + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + \ No newline at end of file diff --git a/src/asm/program_store_flt.inc b/src/asm/program_store_flt.inc new file mode 100644 index 0000000..d6ca7f1 --- /dev/null +++ b/src/asm/program_store_flt.inc @@ -0,0 +1,11 @@ + and eax, 262080 + lea rcx, [rsi+rax] + mulpd xmm0, xmm4 + mulpd xmm1, xmm5 + mulpd xmm2, xmm6 + mulpd xmm3, xmm7 + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 + diff --git a/src/asm/program_store_int.inc b/src/asm/program_store_int.inc new file mode 100644 index 0000000..75c973f --- /dev/null +++ b/src/asm/program_store_int.inc @@ -0,0 +1,10 @@ + and eax, 262080 + lea rcx, [rsi+rax] + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 diff --git a/src/asm/program_xmm_constants.inc b/src/asm/program_xmm_constants.inc new file mode 100644 index 0000000..38c897c --- /dev/null +++ b/src/asm/program_xmm_constants.inc @@ -0,0 +1,6 @@ +minDbl: + db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 +absMask: + db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 +signMask: + db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index bf235ec..053f2a1 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -81,6 +81,8 @@ namespace RandomX { constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t); constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8; constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; + constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; + constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; @@ -129,7 +131,7 @@ namespace RandomX { typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&); - typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*); + typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); extern "C" { void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 17e593d..be3bc82 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -21,14 +21,6 @@ _RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE PUBLIC executeProgram -ALIGN 16 -minDbl: -db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 -absMask: -db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 -signMask: -db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 - executeProgram PROC ; REGISTER ALLOCATION: ; rax -> temporary @@ -114,6 +106,17 @@ executeProgram PROC movapd xmm14, xmmword ptr [absMask] movapd xmm15, xmmword ptr [signMask] + jmp program_begin + +ALIGN 64 +minDbl: + db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 +absMask: + db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 +signMask: + db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 + +ALIGN 64 program_begin: xor eax, r8d ;# read address register 1 and eax, 262080 @@ -144,7 +147,7 @@ program_begin: ;# 256 instructions include program.inc - + mov eax, r8d ;# read address register 1 xor eax, r9d ;# read address register 2 xor rbp, rax ;# modify "mx" diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 242b5bd..86285de 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -22,21 +22,21 @@ along with RandomX. If not, see. //Integer #define WT_IADD_R 10 #define WT_IADD_M 3 -#define WT_IADD_RC 12 +#define WT_IADD_RC 10 #define WT_ISUB_R 10 #define WT_ISUB_M 3 -#define WT_IMUL_9C 12 -#define WT_IMUL_R 24 -#define WT_IMUL_M 8 +#define WT_IMUL_9C 10 +#define WT_IMUL_R 20 +#define WT_IMUL_M 6 #define WT_IMULH_R 6 #define WT_IMULH_M 2 #define WT_ISMULH_R 6 #define WT_ISMULH_M 2 #define WT_IDIV_C 4 -#define WT_ISDIV_C 2 -#define WT_INEG_R 4 -#define WT_IXOR_R 15 -#define WT_IXOR_M 5 +#define WT_ISDIV_C 4 +#define WT_INEG_R 2 +#define WT_IXOR_R 12 +#define WT_IXOR_M 4 #define WT_IROR_R 10 #define WT_IROL_R 10 @@ -58,10 +58,14 @@ along with RandomX. If not, see. #define WT_FPSQRT_R 6 //Control -#define WT_COND_R 15 -#define WT_COND_M 5 +#define WT_COND_R 12 +#define WT_COND_M 4 #define WT_CFROUND 1 +//Store +#define WT_ISTORE 12 +#define WT_FSTORE 6 + #define WT_NOP 0 constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \ @@ -70,7 +74,7 @@ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \ WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \ -WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_NOP; +WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; static_assert(wtSum == 256, "Sum of instruction weights must be 256"); @@ -116,3 +120,40 @@ static_assert(wtSum == 256, #define REPN(x,N) REPNX(x,N) #define NUM(x) x #define WT(x) NUM(WT_##x) + +#define REPCASE0(x) +#define REPCASE1(x) case __COUNTER__: +#define REPCASE2(x) REPCASE1(x) case __COUNTER__: +#define REPCASE3(x) REPCASE2(x) case __COUNTER__: +#define REPCASE4(x) REPCASE3(x) case __COUNTER__: +#define REPCASE5(x) REPCASE4(x) case __COUNTER__: +#define REPCASE6(x) REPCASE5(x) case __COUNTER__: +#define REPCASE7(x) REPCASE6(x) case __COUNTER__: +#define REPCASE8(x) REPCASE7(x) case __COUNTER__: +#define REPCASE9(x) REPCASE8(x) case __COUNTER__: +#define REPCASE10(x) REPCASE9(x) case __COUNTER__: +#define REPCASE11(x) REPCASE10(x) case __COUNTER__: +#define REPCASE12(x) REPCASE11(x) case __COUNTER__: +#define REPCASE13(x) REPCASE12(x) case __COUNTER__: +#define REPCASE14(x) REPCASE13(x) case __COUNTER__: +#define REPCASE15(x) REPCASE14(x) case __COUNTER__: +#define REPCASE16(x) REPCASE15(x) case __COUNTER__: +#define REPCASE17(x) REPCASE16(x) case __COUNTER__: +#define REPCASE18(x) REPCASE17(x) case __COUNTER__: +#define REPCASE19(x) REPCASE18(x) case __COUNTER__: +#define REPCASE20(x) REPCASE19(x) case __COUNTER__: +#define REPCASE21(x) REPCASE20(x) case __COUNTER__: +#define REPCASE22(x) REPCASE21(x) case __COUNTER__: +#define REPCASE23(x) REPCASE22(x) case __COUNTER__: +#define REPCASE24(x) REPCASE23(x) case __COUNTER__: +#define REPCASE25(x) REPCASE24(x) case __COUNTER__: +#define REPCASE26(x) REPCASE25(x) case __COUNTER__: +#define REPCASE27(x) REPCASE26(x) case __COUNTER__: +#define REPCASE28(x) REPCASE27(x) case __COUNTER__: +#define REPCASE29(x) REPCASE28(x) case __COUNTER__: +#define REPCASE30(x) REPCASE29(x) case __COUNTER__: +#define REPCASE31(x) REPCASE30(x) case __COUNTER__: +#define REPCASE32(x) REPCASE31(x) case __COUNTER__: +#define REPCASENX(x,N) REPCASE##N(x) +#define REPCASEN(x,N) REPCASENX(x,N) +#define CASE_REP(x) REPCASEN(x, WT(x)) \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 0b09a74..12e9cdb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -174,7 +174,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash for (int chain = 0; chain < 16; ++chain) { vm->initializeProgram(hash); int segment = hash[3] & 3; - vm->setScratchpad(scratchpad);// +segment * RandomX::ScratchpadSize / 4); + vm->setScratchpad(scratchpad + segment * RandomX::ScratchpadSize / 4); vm->execute(); vm->getResult(nullptr, 0, hash); } diff --git a/src/program.inc b/src/program.inc index a91240e..21f7d0b 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,745 +1,793 @@ - ; ISUB_R r0, r4 - sub r8, r12 - ; IROR_R r5, 15 - ror r13, 15 - ; ISUB_M r6, L1[r5] - mov eax, r13d - and eax, 16376 - sub r14, qword ptr [rsi+rax] - ; IMUL_R r7, r6 - imul r15, r14 - ; FPADD_R f3, a1 - addpd xmm3, xmm9 - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IMUL_R r2, r4 - imul r10, r12 - ; IADD_RC r4, r5, 1789610138 - lea r12, [r12+r13+1789610138] - ; IADD_R r1, r4 - add r9, r12 - ; IADD_R r6, r0 - add r14, r8 - ; IXOR_R r7, r2 - xor r15, r10 - ; ISMULH_M r6, L1[6816] - mov rax, r14 - imul qword ptr [rsi+6816] - mov r14, rdx - ; ISUB_R r0, r4 - sub r8, r12 - ; IXOR_R r7, r2 - xor r15, r10 - ; INEG_R r4 - neg r12 - ; IROL_R r3, r0 - mov ecx, r8d - rol r11, cl - ; IADD_RC r2, r5, -1667142135 - lea r10, [r10+r13-1667142135] - ; ISUB_R r6, r2 - sub r14, r10 - ; IDIV_C r3, 2650709570 - mov rax, 3736177069856446853 - mul r11 - shr rdx, 29 - add r11, rdx - ; IMULH_R r3, r0 - mov rax, r11 - mul r8 - mov r11, rdx - ; FPSUB_R f0, a2 - subpd xmm0, xmm10 - ; FPADD_M f3, L2[r4] - mov eax, r12d + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IADD_RC r2, r5, -1621224194 + lea r10, [r10+r13-1621224194] + ; ISTORE L2[r2], r7 + mov eax, r10d and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; FPMUL_M e1, L1[r5] - mov eax, r13d + mov qword ptr [rsi+rax], r15 + ; FPMUL_R e2, a2 + mulpd xmm6, xmm10 + ; IMUL_R r6, r3 + imul r14, r11 + ; FPMUL_R e1, a0 + mulpd xmm5, xmm8 + ; IROR_R r5, r3 + mov ecx, r11d + ror r13, cl + ; FPMUL_R e2, a0 + mulpd xmm6, xmm8 + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; IXOR_R r0, r4 + xor r8, r12 + ; ISMULH_R r3, r7 + mov rax, r11 + imul r15 + mov r11, rdx + ; FPSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; ISMULH_R r6, r0 + mov rax, r14 + imul r8 + mov r14, rdx + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; ISUB_R r3, r4 + sub r11, r12 + ; IADD_R r7, -1138617760 + add r15, -1138617760 + ; IROR_R r2, r6 + mov ecx, r14d + ror r10, cl + ; FPMUL_R e2, a1 + mulpd xmm6, xmm9 + ; IROR_R r7, r1 + mov ecx, r9d + ror r15, cl + ; COND_M r2, lt(L1[r7], -41618808) + xor ecx, ecx + mov eax, r15d and eax, 16376 + cmp dword ptr [rsi+rax], -41618808 + setl cl + add r10, rcx + ; FPMUL_M e3, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; CFROUND r1, 43 + mov rax, r9 + rol rax, 34 + and eax, 24576 + or eax, 40896 + mov dword ptr [rsp-8], eax + ldmxcsr dword ptr [rsp-8] + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; FSTORE L1[r6], f2 + mov eax, r14d + and eax, 16368 + movapd xmmword ptr [rsi+rax], xmm2 + ; IMUL_9C r6, -45112665 + lea r14, [r14+r14*8-45112665] + ; IADD_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + add r8, qword ptr [rsi+rax] + ; ISTORE L1[r4], r3 + mov eax, r12d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; ISTORE L1[r6], r6 + mov eax, r14d + and eax, 16376 + mov qword ptr [rsi+rax], r14 + ; COND_R r4, sg(r1, -1189096105) + xor ecx, ecx + cmp r9d, -1189096105 + sets cl + add r12, rcx + ; IXOR_R r2, r5 + xor r10, r13 + ; COND_R r1, be(r5, -965180434) + xor ecx, ecx + cmp r13d, -965180434 + setbe cl + add r9, rcx + ; FPMUL_M e1, L2[r3] + mov eax, r11d + and eax, 262136 cvtdq2pd xmm12, qword ptr [rsi+rax] mulpd xmm5, xmm12 maxpd xmm5, xmm13 - ; IMUL_9C r7, -778247271 - lea r15, [r15+r15*8-778247271] - ; IXOR_R r4, 1846379510 - xor r12, 1846379510 - ; COND_M r6, of(L1[r1], -397786451) + ; IMULH_R r7, r6 + mov rax, r15 + mul r14 + mov r15, rdx + ; ISMULH_M r0, L1[r4] + mov ecx, r12d + and ecx, 16376 + mov rax, r8 + imul qword ptr [rsi+rcx] + mov r8, rdx + ; IMUL_R r5, r3 + imul r13, r11 + ; COND_R r2, of(r0, -1045938770) xor ecx, ecx - mov eax, r9d - and eax, 16376 - cmp dword ptr [rsi+rax], -397786451 + cmp r8d, -1045938770 seto cl - add r14, rcx - ; COND_R r6, of(r3, -1033710571) - xor ecx, ecx - cmp r11d, -1033710571 - seto cl - add r14, rcx - ; COND_M r6, sg(L1[r6], 1413230028) - xor ecx, ecx - mov eax, r14d - and eax, 16376 - cmp dword ptr [rsi+rax], 1413230028 - sets cl - add r14, rcx - ; IDIV_C r0, 2791108943 - mov rax, 1774119268816201525 - mul r8 - shr rdx, 28 - add r8, rdx - ; FPSUB_M f1, L1[r6] - mov eax, r14d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; FPSWAP_R f0 - shufpd xmm0, xmm0, 1 - ; IADD_RC r6, r5, -640194892 - lea r14, [r14+r13-640194892] - ; FPADD_M f0, L1[r2] - mov eax, r10d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm0, xmm12 - ; IMUL_R r6, r5 - imul r14, r13 - ; IROL_R r4, r1 - mov ecx, r9d - rol r12, cl - ; FPDIV_R e2, a0 - divpd xmm6, xmm8 - maxpd xmm6, xmm13 - ; IADD_RC r0, r2, -487084195 - lea r8, [r8+r10-487084195] - ; FPADD_R f0, a0 - addpd xmm0, xmm8 - ; IXOR_R r5, r3 - xor r13, r11 - ; IMUL_R r2, r4 - imul r10, r12 - ; FPMUL_R e0, a0 - mulpd xmm4, xmm8 - ; FPSUB_R f3, a3 - subpd xmm3, xmm11 - ; IMUL_M r4, L1[4856] - imul r12, qword ptr [rsi+4856] - ; IMUL_9C r2, 7951348 - lea r10, [r10+r10*8+7951348] - ; COND_R r3, ab(r7, 984532162) - xor ecx, ecx - cmp r15d, 984532162 - seta cl - add r11, rcx - ; IXOR_M r7, L1[r4] + add r10, rcx + ; FPADD_M f3, L1[r4] mov eax, r12d and eax, 16376 - xor r15, qword ptr [rsi+rax] - ; IMUL_R r4, 248971329 - imul r12, 248971329 - ; IXOR_R r3, r1 - xor r11, r9 - ; IMUL_R r3, 2098482639 - imul r11, 2098482639 - ; IXOR_R r6, r3 - xor r14, r11 - ; IXOR_R r5, r4 - xor r13, r12 - ; IADD_R r5, r4 - add r13, r12 - ; IMUL_9C r7, 66530302 - lea r15, [r15+r15*8+66530302] - ; IMULH_R r0, r5 - mov rax, r8 - mul r13 - mov r8, rdx - ; IMUL_R r2, r7 - imul r10, r15 - ; IMUL_R r1, 770985098 - imul r9, 770985098 - ; COND_R r7, be(r5, 58538265) - xor ecx, ecx - cmp r13d, 58538265 - setbe cl - add r15, rcx - ; IMUL_9C r3, 245704334 - lea r11, [r11+r11*8+245704334] - ; ISMULH_R r2, r4 - mov rax, r10 - imul r12 - mov r10, rdx - ; FPDIV_R e3, a3 - divpd xmm7, xmm11 - maxpd xmm7, xmm13 - ; IMULH_R r5, r2 - mov rax, r13 - mul r10 - mov r13, rdx - ; ISUB_M r7, L1[r5] - mov eax, r13d + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 + ; IADD_R r3, r2 + add r11, r10 + ; FPADD_R f1, a0 + addpd xmm1, xmm8 + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; FPSUB_R f0, a1 + subpd xmm0, xmm9 + ; IMUL_M r5, L1[r6] + mov eax, r14d and eax, 16376 - sub r15, qword ptr [rsi+rax] - ; FPMUL_R e3, a3 - mulpd xmm7, xmm11 - ; IMUL_R r3, r4 - imul r11, r12 - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; IMULH_R r1, 633797287 - mov eax, 633797287 - mul r9 - add r9, rdx - ; IADD_R r4, r3 - add r12, r11 - ; IROR_R r2, r7 - mov ecx, r15d - ror r10, cl - ; FPSUB_R f0, a2 - subpd xmm0, xmm10 - ; FPSUB_R f2, a2 - subpd xmm2, xmm10 - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IMUL_M r4, L1[r3] - mov eax, r11d - and eax, 16376 - imul r12, qword ptr [rsi+rax] - ; IMUL_9C r1, -1901091890 - lea r9, [r9+r9*8-1901091890] - ; IROR_R r2, r6 - mov ecx, r14d - ror r10, cl - ; IMULH_R r5, r3 - mov rax, r13 - mul r11 - mov r13, rdx - ; FPSUB_M f1, L1[r7] + imul r13, qword ptr [rsi+rax] + ; ISUB_R r1, r2 + sub r9, r10 + ; IMUL_R r4, r6 + imul r12, r14 + ; FPSWAP_R e3 + shufpd xmm7, xmm7, 1 + ; IMUL_M r0, L1[r7] mov eax, r15d and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; IMUL_M r2, L1[r1] - mov eax, r9d - and eax, 16376 - imul r10, qword ptr [rsi+rax] - ; IMUL_R r6, r0 - imul r14, r8 - ; IADD_R r7, r6 - add r15, r14 - ; FPSUB_R f2, a3 - subpd xmm2, xmm11 - ; COND_R r5, no(r2, -1589295370) - xor ecx, ecx - cmp r10d, -1589295370 - setno cl - add r13, rcx - ; IMUL_9C r7, 420978486 - lea r15, [r15+r15*8+420978486] - ; IROL_R r4, r2 - mov ecx, r10d - rol r12, cl - ; IMUL_9C r0, -1084530831 - lea r8, [r8+r8*8-1084530831] - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; IROR_R r6, r4 - mov ecx, r12d - ror r14, cl - ; IROL_R r4, r5 - mov ecx, r13d - rol r12, cl - ; FPSUB_R f2, a3 - subpd xmm2, xmm11 - ; FPMUL_R e2, a2 - mulpd xmm6, xmm10 - ; ISMULH_M r6, L2[98600] - mov rax, r14 - imul qword ptr [rsi+98600] - mov r14, rdx - ; IXOR_R r0, r6 - xor r8, r14 - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; COND_R r1, ab(r3, -991705199) - xor ecx, ecx - cmp r11d, -991705199 - seta cl - add r9, rcx - ; IMULH_M r4, L2[r2] - mov ecx, r10d - and ecx, 262136 - mov rax, r12 - mul qword ptr [rsi+rcx] - mov r12, rdx - ; IROR_R r2, r6 - mov ecx, r14d - ror r10, cl - ; FPDIV_R e0, a1 - divpd xmm4, xmm9 - maxpd xmm4, xmm13 - ; IMUL_R r1, r7 - imul r9, r15 - ; COND_R r6, ns(r2, 939392855) - xor ecx, ecx - cmp r10d, 939392855 - setns cl - add r14, rcx - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; COND_R r2, ab(r2, -499266314) - xor ecx, ecx - cmp r10d, -499266314 - seta cl - add r10, rcx - ; COND_M r7, lt(L1[r1], -1624420482) - xor ecx, ecx - mov eax, r9d - and eax, 16376 - cmp dword ptr [rsi+rax], -1624420482 - setl cl - add r15, rcx - ; COND_R r1, lt(r1, 1525413977) - xor ecx, ecx - cmp r9d, 1525413977 - setl cl - add r9, rcx - ; IMUL_R r4, r5 - imul r12, r13 - ; IMUL_R r4, r2 - imul r12, r10 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; ISUB_R r2, r6 - sub r10, r14 - ; FPDIV_R e1, a0 - divpd xmm5, xmm8 - maxpd xmm5, xmm13 - ; FPMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IADD_R r6, 671627590 - add r14, 671627590 - ; COND_M r6, sg(L1[r4], -780452820) - xor ecx, ecx - mov eax, r12d - and eax, 16376 - cmp dword ptr [rsi+rax], -780452820 - sets cl - add r14, rcx - ; IMULH_R r4, r7 - mov rax, r12 - mul r15 - mov r12, rdx - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; FPADD_R f0, a0 - addpd xmm0, xmm8 - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; IMUL_R r7, r3 - imul r15, r11 - ; IROL_R r0, r7 - mov ecx, r15d - rol r8, cl - ; IMUL_R r1, r7 - imul r9, r15 - ; COND_R r0, no(r7, 449007464) - xor ecx, ecx - cmp r15d, 449007464 - setno cl - add r8, rcx - ; ISMULH_M r6, L2[134288] - mov rax, r14 - imul qword ptr [rsi+134288] - mov r14, rdx - ; IMULH_R r5, r2 - mov rax, r13 - mul r10 - mov r13, rdx - ; IMULH_R r7, r4 - mov rax, r15 - mul r12 - mov r15, rdx - ; FPDIV_R e3, a0 - divpd xmm7, xmm8 - maxpd xmm7, xmm13 - ; IXOR_R r3, r4 - xor r11, r12 - ; IDIV_C r1, 72349044 - mov rax, 8555331009525020641 - mul r9 - shr rdx, 25 - add r9, rdx - ; IADD_R r5, r4 - add r13, r12 - ; IROR_R r2, r4 - mov ecx, r12d - ror r10, cl - ; FPSUB_M f1, L1[r2] - mov eax, r10d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; FPMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IADD_R r5, r6 - add r13, r14 - ; IXOR_M r1, L1[r4] - mov eax, r12d - and eax, 16376 - xor r9, qword ptr [rsi+rax] - ; ISUB_R r2, -1544880589 - sub r10, -1544880589 - ; FPNEG_R f0 - xorps xmm0, xmm15 + imul r8, qword ptr [rsi+rax] ; IROR_R r1, r6 mov ecx, r14d ror r9, cl - ; IMUL_R r6, r4 - imul r14, r12 - ; IMULH_M r4, L2[r1] - mov ecx, r9d - and ecx, 262136 - mov rax, r12 - mul qword ptr [rsi+rcx] - mov r12, rdx - ; IXOR_R r3, r0 - xor r11, r8 - ; FPSWAP_R f0 - shufpd xmm0, xmm0, 1 - ; FPSWAP_R f0 - shufpd xmm0, xmm0, 1 - ; COND_R r0, ns(r2, -308295242) - xor ecx, ecx - cmp r10d, -308295242 - setns cl - add r8, rcx - ; IMUL_9C r1, 591587965 - lea r9, [r9+r9*8+591587965] - ; FPADD_R f3, a1 - addpd xmm3, xmm9 - ; IMUL_R r5, r4 - imul r13, r12 - ; IMUL_M r7, L1[r0] - mov eax, r8d - and eax, 16376 - imul r15, qword ptr [rsi+rax] - ; COND_R r6, sg(r5, -1119525789) - xor ecx, ecx - cmp r13d, -1119525789 - sets cl - add r14, rcx - ; IMUL_M r0, L1[r1] - mov eax, r9d - and eax, 16376 - imul r8, qword ptr [rsi+rax] - ; IADD_M r3, L2[r7] - mov eax, r15d - and eax, 262136 - add r11, qword ptr [rsi+rax] - ; IADD_R r0, r1 - add r8, r9 - ; FPSUB_R f2, a1 - subpd xmm2, xmm9 - ; IXOR_M r0, L2[r7] - mov eax, r15d - and eax, 262136 - xor r8, qword ptr [rsi+rax] - ; COND_R r6, be(r6, 1481939391) - xor ecx, ecx - cmp r14d, 1481939391 - setbe cl - add r14, rcx - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; IXOR_R r3, r2 - xor r11, r10 - ; FPSUB_R f0, a1 - subpd xmm0, xmm9 - ; IXOR_R r7, r3 - xor r15, r11 - ; IXOR_M r6, L1[r4] - mov eax, r12d - and eax, 16376 - xor r14, qword ptr [rsi+rax] - ; IMULH_R r2, r7 - mov rax, r10 - mul r15 - mov r10, rdx - ; ISUB_R r5, r1 - sub r13, r9 - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; FPADD_R f3, a2 - addpd xmm3, xmm10 - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; FPSUB_R f1, a3 - subpd xmm1, xmm11 - ; FPSUB_M f0, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 - ; FPMUL_R e1, a2 - mulpd xmm5, xmm10 - ; FPADD_R f3, a0 - addpd xmm3, xmm8 - ; IROL_R r2, r4 + ; IROR_R r2, r4 mov ecx, r12d - rol r10, cl - ; COND_M r7, ab(L2[r7], -2012390318) - xor ecx, ecx - mov eax, r15d - and eax, 262136 - cmp dword ptr [rsi+rax], -2012390318 - seta cl - add r15, rcx - ; IMUL_9C r4, -38079585 - lea r12, [r12+r12*8-38079585] - ; IXOR_R r0, r1 - xor r8, r9 - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; FPMUL_R e1, a1 - mulpd xmm5, xmm9 - ; FPSUB_R f1, a2 - subpd xmm1, xmm10 - ; IMUL_9C r4, -847745598 - lea r12, [r12+r12*8-847745598] - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; IADD_R r7, r6 - add r15, r14 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; IADD_R r7, r6 - add r15, r14 - ; IROL_R r2, r5 - mov ecx, r13d - rol r10, cl - ; IADD_RC r4, r2, 1338806320 - lea r12, [r12+r10+1338806320] - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IMUL_R r5, r0 - imul r13, r8 - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; INEG_R r6 - neg r14 - ; IXOR_M r6, L1[r2] - mov eax, r10d - and eax, 16376 - xor r14, qword ptr [rsi+rax] - ; FPSUB_R f2, a2 - subpd xmm2, xmm10 - ; FPADD_R f2, a2 - addpd xmm2, xmm10 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; COND_R r3, be(r4, 174667458) - xor ecx, ecx - cmp r12d, 174667458 - setbe cl - add r11, rcx - ; INEG_R r6 - neg r14 - ; IXOR_R r6, r3 - xor r14, r11 - ; COND_M r5, sg(L1[r0], -864345921) - xor ecx, ecx - mov eax, r8d - and eax, 16376 - cmp dword ptr [rsi+rax], -864345921 - sets cl - add r13, rcx - ; IROL_R r7, r3 - mov ecx, r11d - rol r15, cl - ; FPSUB_R f1, a2 - subpd xmm1, xmm10 - ; IADD_M r1, L1[r0] - mov eax, r8d - and eax, 16376 - add r9, qword ptr [rsi+rax] - ; IMULH_R r1, r3 - mov rax, r9 - mul r11 - mov r9, rdx - ; IMUL_R r0, -1489192296 - imul r8, -1489192296 - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; COND_R r1, ge(r1, -1358904097) - xor ecx, ecx - cmp r9d, -1358904097 - setge cl - add r9, rcx - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; FPADD_R f2, a3 - addpd xmm2, xmm11 - ; IROR_R r4, r7 - mov ecx, r15d - ror r12, cl - ; ISDIV_C r1, -1368098113 - mov rax, -7238896260565957085 - imul r9 - xor eax, eax - sar rdx, 29 - sets al - add rdx, rax - add r9, rdx - ; IADD_M r4, L1[r1] - mov eax, r9d - and eax, 16376 - add r12, qword ptr [rsi+rax] - ; IMUL_R r0, -1011605520 - imul r8, -1011605520 + ror r10, cl ; FPSUB_R f3, a1 subpd xmm3, xmm9 - ; IADD_RC r1, r4, 272540736 - lea r9, [r9+r12+272540736] + ; FSTORE L1[r0], e1 + mov eax, r8d + and eax, 16368 + movapd xmmword ptr [rsi+rax], xmm5 + ; COND_R r2, sg(r3, 1269153133) + xor ecx, ecx + cmp r11d, 1269153133 + sets cl + add r10, rcx ; FPSWAP_R f2 shufpd xmm2, xmm2, 1 - ; IROR_R r3, r2 - mov ecx, r10d - ror r11, cl - ; IMUL_R r3, 2085105439 - imul r11, 2085105439 - ; FPMUL_R e0, a0 - mulpd xmm4, xmm8 - ; IMUL_9C r6, -483723153 - lea r14, [r14+r14*8-483723153] - ; FPSUB_M f3, L1[r7] - mov eax, r15d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; IMUL_R r3, r2 - imul r11, r10 - ; ISMULH_R r7, r1 - mov rax, r15 - imul r9 - mov r15, rdx - ; COND_R r1, of(r7, 778804236) + ; IADD_R r7, r5 + add r15, r13 + ; COND_R r0, be(r4, -1486502150) xor ecx, ecx - cmp r15d, 778804236 - seto cl - add r9, rcx - ; FPSUB_R f3, a2 - subpd xmm3, xmm10 - ; IROL_R r5, r7 - mov ecx, r15d - rol r13, cl - ; FPADD_R f1, a0 - addpd xmm1, xmm8 - ; FPADD_R f2, a3 - addpd xmm2, xmm11 - ; IMUL_R r6, r0 - imul r14, r8 - ; ISUB_M r2, L2[r4] - mov eax, r12d - and eax, 262136 - sub r10, qword ptr [rsi+rax] - ; IXOR_R r0, r6 - xor r8, r14 - ; INEG_R r6 - neg r14 - ; FPMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IADD_RC r4, r6, -1312075035 - lea r12, [r12+r14-1312075035] - ; IMUL_R r1, r5 - imul r9, r13 - ; IXOR_M r7, L2[r6] - mov eax, r14d - and eax, 262136 - xor r15, qword ptr [rsi+rax] - ; IROR_R r2, 23 - ror r10, 23 - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; ISMULH_M r5, L1[r2] - mov ecx, r10d - and ecx, 16376 - mov rax, r13 - imul qword ptr [rsi+rcx] - mov r13, rdx - ; ISUB_M r7, L1[r4] - mov eax, r12d - and eax, 16376 - sub r15, qword ptr [rsi+rax] - ; COND_R r0, sg(r2, 1538841628) - xor ecx, ecx - cmp r10d, 1538841628 - sets cl + cmp r12d, -1486502150 + setbe cl add r8, rcx - ; IMUL_R r6, r2 - imul r14, r10 - ; ISUB_R r0, r1 - sub r8, r9 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; FPADD_R f0, a3 + addpd xmm0, xmm11 + ; IADD_R r2, r0 + add r10, r8 + ; FSTORE L1[r3], e2 + mov eax, r11d + and eax, 16368 + movapd xmmword ptr [rsi+rax], xmm6 + ; IXOR_R r1, r7 + xor r9, r15 ; IMUL_R r5, r7 imul r13, r15 - ; IADD_RC r1, r0, 516706834 - lea r9, [r9+r8+516706834] - ; INEG_R r5 - neg r13 + ; IXOR_R r7, 266992378 + xor r15, 266992378 + ; COND_R r7, no(r4, 1983804692) + xor ecx, ecx + cmp r12d, 1983804692 + setno cl + add r15, rcx + ; IMUL_M r2, L2[r0] + mov eax, r8d + and eax, 262136 + imul r10, qword ptr [rsi+rax] + ; FPDIV_R e3, a2 + divpd xmm7, xmm10 + maxpd xmm7, xmm13 + ; IMUL_M r0, L2[r6] + mov eax, r14d + and eax, 262136 + imul r8, qword ptr [rsi+rax] + ; ISTORE L1[r0], r7 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; IROR_R r5, r4 + mov ecx, r12d + ror r13, cl + ; ISTORE L2[r7], r2 + mov eax, r15d + and eax, 262136 + mov qword ptr [rsi+rax], r10 + ; FPSWAP_R e2 + shufpd xmm6, xmm6, 1 + ; FPADD_M f3, L1[r2] + mov eax, r10d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 + ; IDIV_C r5, 2218798981 + mov rax, 17853839665672790751 + mul r13 + shr rdx, 31 + add r13, rdx + ; IADD_RC r0, r4, -1321374359 + lea r8, [r8+r12-1321374359] + ; CFROUND r6, 28 + mov rax, r14 + rol rax, 49 + and eax, 24576 + or eax, 40896 + mov dword ptr [rsp-8], eax + ldmxcsr dword ptr [rsp-8] + ; FPADD_R f2, a2 + addpd xmm2, xmm10 + ; IROL_R r7, r6 + mov ecx, r14d + rol r15, cl + ; ISUB_R r2, r4 + sub r10, r12 + ; IMULH_M r0, L1[12400] + mov rax, r8 + mul qword ptr [rsi+12400] + mov r8, rdx + ; IADD_R r2, r3 + add r10, r11 + ; COND_R r6, lt(r1, -1124202227) + xor ecx, ecx + cmp r9d, -1124202227 + setl cl + add r14, rcx + ; IROR_R r7, r4 + mov ecx, r12d + ror r15, cl + ; IMUL_R r4, r2 + imul r12, r10 + ; ISUB_R r3, r7 + sub r11, r15 + ; IADD_R r2, r7 + add r10, r15 ; FPSQRT_R e3 sqrtpd xmm7, xmm7 - ; IADD_RC r5, r4, -1679394922 - lea r13, [r13+r12-1679394922] - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; IMUL_R r0, r2 - imul r8, r10 - ; ISUB_R r3, r2 - sub r11, r10 + ; ISUB_R r6, 540663146 + sub r14, 540663146 + ; IROL_R r5, 58 + rol r13, 58 + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; FPADD_R f2, a2 + addpd xmm2, xmm10 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; IADD_R r5, r3 + add r13, r11 + ; IADD_M r7, L1[880] + add r15, qword ptr [rsi+880] + ; ISUB_R r7, r0 + sub r15, r8 + ; ISTORE L2[r0], r7 + mov eax, r8d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; IDIV_C r2, 1014940364 + mov rax, r10 + shr rax, 2 + mov rcx, 1219717022984988185 + mul rcx + shr rdx, 24 + add r10, rdx + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IDIV_C r2, 3059159304 + mov rax, 12949335853590502915 + mul r10 + shr rdx, 31 + add r10, rdx + ; IADD_R r0, r3 + add r8, r11 + ; IMUL_9C r7, -2124093035 + lea r15, [r15+r15*8-2124093035] + ; FPSUB_R f2, a0 + subpd xmm2, xmm8 + ; FPDIV_R e0, a2 + divpd xmm4, xmm10 + maxpd xmm4, xmm13 + ; FPSUB_R f2, a3 + subpd xmm2, xmm11 + ; IMUL_R r1, r2 + imul r9, r10 + ; ISMULH_R r7, r5 + mov rax, r15 + imul r13 + mov r15, rdx + ; IMULH_R r3, r2 + mov rax, r11 + mul r10 + mov r11, rdx + ; IXOR_M r1, L2[r0] + mov eax, r8d + and eax, 262136 + xor r9, qword ptr [rsi+rax] + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; ISUB_R r4, 1456841848 + sub r12, 1456841848 + ; IXOR_M r3, L2[r2] + mov eax, r10d + and eax, 262136 + xor r11, qword ptr [rsi+rax] + ; COND_M r0, of(L1[r4], 1678513610) + xor ecx, ecx + mov eax, r12d + and eax, 16376 + cmp dword ptr [rsi+rax], 1678513610 + seto cl + add r8, rcx + ; IDIV_C r4, 2674394209 + mov rax, 925772300223658071 + mul r12 + shr rdx, 27 + add r12, rdx + ; IMUL_R r4, r1 + imul r12, r9 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; FPSUB_R f2, a0 + subpd xmm2, xmm8 + ; FPMUL_M e1, L2[r6] + mov eax, r14d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; FPSUB_M f0, L2[r3] + mov eax, r11d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; IROR_R r0, r7 + mov ecx, r15d + ror r8, cl + ; FSTORE L2[r1], e0 + mov eax, r9d + and eax, 262128 + movapd xmmword ptr [rsi+rax], xmm4 + ; IROR_R r7, r6 + mov ecx, r14d + ror r15, cl + ; IMUL_9C r2, 266593902 + lea r10, [r10+r10*8+266593902] + ; IMUL_R r4, r6 + imul r12, r14 + ; FPSUB_R f2, a2 + subpd xmm2, xmm10 + ; FPMUL_R e3, a0 + mulpd xmm7, xmm8 + ; IXOR_M r7, L1[r2] + mov eax, r10d + and eax, 16376 + xor r15, qword ptr [rsi+rax] + ; IROR_R r0, r5 + mov ecx, r13d + ror r8, cl + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; FPADD_R f3, a1 + addpd xmm3, xmm9 + ; FPADD_R f1, a0 + addpd xmm1, xmm8 + ; COND_M r2, ge(L2[r2], -226330940) + xor ecx, ecx + mov eax, r10d + and eax, 262136 + cmp dword ptr [rsi+rax], -226330940 + setge cl + add r10, rcx + ; FPDIV_R e2, a3 + divpd xmm6, xmm11 + maxpd xmm6, xmm13 + ; FPMUL_R e2, a1 + mulpd xmm6, xmm9 + ; FPSUB_R f1, a0 + subpd xmm1, xmm8 + ; IMUL_R r7, r5 + imul r15, r13 + ; IMUL_R r0, r1 + imul r8, r9 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; IROL_R r3, r5 + mov ecx, r13d + rol r11, cl + ; IADD_RC r5, r2, 795784298 + lea r13, [r13+r10+795784298] + ; ISUB_R r0, r4 + sub r8, r12 + ; IMUL_R r5, r4 + imul r13, r12 + ; FPSUB_R f0, a2 + subpd xmm0, xmm10 + ; FPMUL_R e3, a1 + mulpd xmm7, xmm9 + ; ISDIV_C r3, 1662492575 + mov rax, 2978515652703905219 + imul r11 + xor eax, eax + sar rdx, 28 + sets al + add rdx, rax + add r11, rdx + ; ISMULH_R r5, r0 + mov rax, r13 + imul r8 + mov r13, rdx + ; ISDIV_C r4, 1963597892 + mov rax, -8359627607928540073 + imul r12 + xor eax, eax + add rdx, r12 + sar rdx, 30 + sets al + add rdx, rax + add r12, rdx + ; IMUL_R r7, r0 + imul r15, r8 + ; IMULH_M r0, L1[r3] + mov ecx, r11d + and ecx, 16376 + mov rax, r8 + mul qword ptr [rsi+rcx] + mov r8, rdx + ; IXOR_R r3, r7 + xor r11, r15 + ; IDIV_C r4, 1146125335 + mov rax, 8640870253760721727 + mul r12 + shr rdx, 29 + add r12, rdx + ; FPSWAP_R f3 + shufpd xmm3, xmm3, 1 + ; IXOR_M r2, L1[r0] + mov eax, r8d + and eax, 16376 + xor r10, qword ptr [rsi+rax] + ; IROR_R r0, r1 + mov ecx, r9d + ror r8, cl + ; IXOR_R r7, r4 + xor r15, r12 + ; ISMULH_R r6, r2 + mov rax, r14 + imul r10 + mov r14, rdx + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IADD_RC r4, r2, 1704868083 + lea r12, [r12+r10+1704868083] + ; FPSUB_R f2, a0 + subpd xmm2, xmm8 + ; ISTORE L1[r0], r0 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FPSUB_R f0, a3 + subpd xmm0, xmm11 ; FPDIV_R e0, a3 divpd xmm4, xmm11 maxpd xmm4, xmm13 - ; ISUB_R r1, r5 - sub r9, r13 - ; COND_M r2, be(L2[r2], 1840094725) + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; ISUB_R r7, 1302457878 + sub r15, 1302457878 + ; IMUL_9C r1, 1330165941 + lea r9, [r9+r9*8+1330165941] + ; FPMUL_R e1, a3 + mulpd xmm5, xmm11 + ; IROL_R r0, r4 + mov ecx, r12d + rol r8, cl + ; FPSUB_M f1, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; IROL_R r5, r6 + mov ecx, r14d + rol r13, cl + ; COND_M r0, ab(L1[r1], -310933871) xor ecx, ecx + mov eax, r9d + and eax, 16376 + cmp dword ptr [rsi+rax], -310933871 + seta cl + add r8, rcx + ; CFROUND r7, 39 + mov rax, r15 + rol rax, 38 + and eax, 24576 + or eax, 40896 + mov dword ptr [rsp-8], eax + ldmxcsr dword ptr [rsp-8] + ; FPDIV_R e0, a1 + divpd xmm4, xmm9 + maxpd xmm4, xmm13 + ; IMUL_M r1, L1[r3] + mov eax, r11d + and eax, 16376 + imul r9, qword ptr [rsi+rax] + ; IMUL_9C r3, 1573236728 + lea r11, [r11+r11*8+1573236728] + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; COND_R r1, lt(r4, -1805702334) + xor ecx, ecx + cmp r12d, -1805702334 + setl cl + add r9, rcx + ; FPSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IADD_R r7, -1421188024 + add r15, -1421188024 + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; FPSUB_M f2, L2[r7] + mov eax, r15d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; ISUB_R r2, r4 + sub r10, r12 + ; ISMULH_R r4, r5 + mov rax, r12 + imul r13 + mov r12, rdx + ; COND_R r1, of(r7, 1294727006) + xor ecx, ecx + cmp r15d, 1294727006 + seto cl + add r9, rcx + ; IADD_M r5, L2[r2] mov eax, r10d and eax, 262136 - cmp dword ptr [rsi+rax], 1840094725 - setbe cl + add r13, qword ptr [rsi+rax] + ; IMUL_9C r4, 401020510 + lea r12, [r12+r12*8+401020510] + ; IROL_R r3, r0 + mov ecx, r8d + rol r11, cl + ; ISTORE L1[r7], r0 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FPSUB_R f2, a1 + subpd xmm2, xmm9 + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; IMUL_R r3, 720965215 + imul r11, 720965215 + ; IMUL_R r6, r2 + imul r14, r10 + ; ISTORE L1[r7], r3 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IROR_R r2, r6 + mov ecx, r14d + ror r10, cl + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; IMUL_9C r4, 788211341 + lea r12, [r12+r12*8+788211341] + ; IMUL_9C r3, -67993446 + lea r11, [r11+r11*8-67993446] + ; FPSWAP_R e3 + shufpd xmm7, xmm7, 1 + ; IMUL_M r2, L1[r6] + mov eax, r14d + and eax, 16376 + imul r10, qword ptr [rsi+rax] + ; COND_M r2, ge(L1[r2], -1892157506) + xor ecx, ecx + mov eax, r10d + and eax, 16376 + cmp dword ptr [rsi+rax], -1892157506 + setge cl add r10, rcx + ; FPADD_M f1, L1[r3] + mov eax, r11d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm1, xmm12 + ; IADD_M r7, L1[r0] + mov eax, r8d + and eax, 16376 + add r15, qword ptr [rsi+rax] + ; ISDIV_C r1, 624867857 + mov rax, 7924491717200811467 + imul r9 + xor eax, eax + sar rdx, 28 + sets al + add rdx, rax + add r9, rdx + ; FPADD_R f0, a1 + addpd xmm0, xmm9 + ; ISUB_R r5, r7 + sub r13, r15 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; IMUL_R r6, r2 + imul r14, r10 + ; FPMUL_M e3, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; IADD_R r0, r4 + add r8, r12 + ; FPSUB_M f3, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; FPMUL_R e2, a0 + mulpd xmm6, xmm8 + ; INEG_R r2 + neg r10 + ; FPMUL_R e2, a2 + mulpd xmm6, xmm10 + ; FPSUB_M f3, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; FPADD_R f1, a3 + addpd xmm1, xmm11 + ; IMULH_R r3, r2 + mov rax, r11 + mul r10 + mov r11, rdx + ; FPSUB_R f0, a3 + subpd xmm0, xmm11 + ; IDIV_C r5, 2887845607 + mov rax, 13717520480010955377 + mul r13 + shr rdx, 31 + add r13, rdx + ; ISMULH_M r6, L1[r2] + mov ecx, r10d + and ecx, 16376 + mov rax, r14 + imul qword ptr [rsi+rcx] + mov r14, rdx + ; FPSUB_R f3, a3 + subpd xmm3, xmm11 ; IMUL_M r6, L1[r7] mov eax, r15d and eax, 16376 imul r14, qword ptr [rsi+rax] - ; IMULH_M r6, L1[r5] - mov ecx, r13d - and ecx, 16376 - mov rax, r14 - mul qword ptr [rsi+rcx] - mov r14, rdx - ; IMUL_9C r7, -1048659408 - lea r15, [r15+r15*8-1048659408] - ; IMUL_R r6, r3 - imul r14, r11 - ; FPADD_R f3, a0 - addpd xmm3, xmm8 - ; IMULH_R r0, r3 - mov rax, r8 - mul r11 - mov r8, rdx - ; FPSWAP_R f0 - shufpd xmm0, xmm0, 1 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; FPMUL_R e2, a0 + mulpd xmm6, xmm8 + ; IMUL_9C r6, 295130073 + lea r14, [r14+r14*8+295130073] + ; FPADD_R f1, a1 + addpd xmm1, xmm9 + ; IXOR_R r0, r5 + xor r8, r13 + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; FPSWAP_R e3 + shufpd xmm7, xmm7, 1 ; FPSQRT_R e3 sqrtpd xmm7, xmm7 - ; IMULH_R r2, r0 - mov rax, r10 - mul r8 - mov r10, rdx - ; FPDIV_R e1, a1 - divpd xmm5, xmm9 - maxpd xmm5, xmm13 + ; IADD_RC r3, r6, -1317630728 + lea r11, [r11+r14-1317630728] + ; IMUL_M r2, L1[r3] + mov eax, r11d + and eax, 16376 + imul r10, qword ptr [rsi+rax] + ; IADD_RC r1, r4, 894105694 + lea r9, [r9+r12+894105694] + ; IMUL_R r7, r0 + imul r15, r8 + ; FPSUB_R f1, a0 + subpd xmm1, xmm8 + ; IMUL_M r7, L1[r1] + mov eax, r9d + and eax, 16376 + imul r15, qword ptr [rsi+rax] + ; IXOR_R r2, r4 + xor r10, r12 + ; ISUB_M r0, L1[r1] + mov eax, r9d + and eax, 16376 + sub r8, qword ptr [rsi+rax] + ; INEG_R r4 + neg r12 + ; IMUL_9C r4, -285272388 + lea r12, [r12+r12*8-285272388] + ; IMUL_R r7, r4 + imul r15, r12 + ; IMULH_M r5, L1[r7] + mov ecx, r15d + and ecx, 16376 + mov rax, r13 + mul qword ptr [rsi+rcx] + mov r13, rdx + ; IROL_R r1, r7 + mov ecx, r15d + rol r9, cl + ; IXOR_R r4, -757532727 + xor r12, -757532727 + ; IMUL_R r3, 1863959234 + imul r11, 1863959234 + ; IROL_R r4, 59 + rol r12, 59 + ; ISMULH_R r1, 2122681086 + mov rax, 2122681086 + imul r9 + add r9, rdx + ; ISTORE L2[r6], r7 + mov eax, r14d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; ISTORE L1[r1], r5 + mov eax, r9d + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; COND_R r2, ns(r1, 486049737) + xor ecx, ecx + cmp r9d, 486049737 + setns cl + add r10, rcx + ; FPMUL_M e0, L2[r7] + mov eax, r15d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm4, xmm12 + maxpd xmm4, xmm13 + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IROL_R r5, r2 + mov ecx, r10d + rol r13, cl + ; IADD_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + add r8, qword ptr [rsi+rax]