From 8f2abd6c05bf1be3c8667ca84dd7683c1cb9cde1 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 27 Jan 2019 18:19:49 +0100 Subject: [PATCH] NOP instruction register load/store from L3 --- src/AssemblyGeneratorX86.cpp | 6 + src/AssemblyGeneratorX86.hpp | 1 + src/Instruction.cpp | 8 + src/Instruction.hpp | 1 + src/JitCompilerX86.cpp | 7 +- src/JitCompilerX86.hpp | 1 + src/asm/program_load_flt.inc | 2 +- src/asm/program_load_int.inc | 2 +- src/asm/program_store_flt.inc | 2 +- src/asm/program_store_int.inc | 2 +- src/common.hpp | 2 +- src/executeProgram-win64.asm | 8 +- src/instructionWeights.hpp | 51 +-- src/main.cpp | 4 +- src/program.inc | 760 ++++++++-------------------------- 15 files changed, 233 insertions(+), 624 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 11bb3f0..a46fe5d 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -491,6 +491,10 @@ namespace RandomX { asmCode << "\tmovapd xmmword ptr [rsi+rax], " << regFE[instr.src] << std::endl; } + void AssemblyGeneratorX86::h_NOP(Instruction& instr, int i) { + asmCode << "\tnop" << std::endl; + } + #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x)) @@ -540,5 +544,7 @@ namespace RandomX { INST_HANDLE(ISTORE) INST_HANDLE(FSTORE) + + INST_HANDLE(NOP) }; } \ No newline at end of file diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 5c22142..6b0c505 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -79,5 +79,6 @@ namespace RandomX { void h_CFROUND(Instruction&, int); void h_ISTORE(Instruction&, int); void h_FSTORE(Instruction&, int); + void h_NOP(Instruction&, int); }; } \ No newline at end of file diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 13cfc1d..0aa0289 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -327,6 +327,10 @@ namespace RandomX { os << ", " << reg << srcIndex << std::endl; } + void Instruction::h_NOP(std::ostream& os) const { + os << std::endl; + } + #include "instructionWeights.hpp" #define INST_NAME(x) REPN(#x, WT(x)) #define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x)) @@ -377,6 +381,8 @@ namespace RandomX { INST_NAME(ISTORE) INST_NAME(FSTORE) + + INST_NAME(NOP) }; InstructionVisualizer Instruction::engine[256] = { @@ -425,6 +431,8 @@ namespace RandomX { INST_HANDLE(ISTORE) INST_HANDLE(FSTORE) + + INST_HANDLE(NOP) }; } \ No newline at end of file diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 017d92f..ffa3880 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -86,6 +86,7 @@ namespace RandomX { void h_CFROUND(std::ostream&) const; void h_ISTORE(std::ostream&) const; void h_FSTORE(std::ostream&) const; + void h_NOP(std::ostream&) const; }; static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction"); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index e001464..30c6f73 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -181,7 +181,7 @@ namespace RandomX { static const uint8_t JMP = 0xe9; size_t JitCompilerX86::getCodeSize() { - return codePos - prologueSize + readDatasetSize; + return codePos - prologueSize; } JitCompilerX86::JitCompilerX86() { @@ -761,6 +761,10 @@ namespace RandomX { emitByte(0x06); } + void JitCompilerX86::h_NOP(Instruction& instr) { + emitByte(0x90); + } + #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) @@ -800,6 +804,7 @@ namespace RandomX { INST_HANDLE(CFROUND) INST_HANDLE(ISTORE) INST_HANDLE(FSTORE) + INST_HANDLE(NOP) }; diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index fa5aa93..0aef990 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -125,6 +125,7 @@ namespace RandomX { void h_CFROUND(Instruction&); void h_ISTORE(Instruction&); void h_FSTORE(Instruction&); + void h_NOP(Instruction&); }; } \ No newline at end of file diff --git a/src/asm/program_load_flt.inc b/src/asm/program_load_flt.inc index af6f1b7..2c631ce 100644 --- a/src/asm/program_load_flt.inc +++ b/src/asm/program_load_flt.inc @@ -1,4 +1,4 @@ - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] diff --git a/src/asm/program_load_int.inc b/src/asm/program_load_int.inc index d139549..d9277ed 100644 --- a/src/asm/program_load_int.inc +++ b/src/asm/program_load_int.inc @@ -1,4 +1,4 @@ - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] xor r8, qword ptr [rcx+0] xor r9, qword ptr [rcx+8] diff --git a/src/asm/program_store_flt.inc b/src/asm/program_store_flt.inc index d6ca7f1..4bbab9f 100644 --- a/src/asm/program_store_flt.inc +++ b/src/asm/program_store_flt.inc @@ -1,4 +1,4 @@ - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] mulpd xmm0, xmm4 mulpd xmm1, xmm5 diff --git a/src/asm/program_store_int.inc b/src/asm/program_store_int.inc index 75c973f..03dd31a 100644 --- a/src/asm/program_store_int.inc +++ b/src/asm/program_store_int.inc @@ -1,4 +1,4 @@ - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 diff --git a/src/common.hpp b/src/common.hpp index 053f2a1..bbd5a2b 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -72,7 +72,7 @@ namespace RandomX { convertible_t hi; }; - constexpr int ProgramLength = 256; + constexpr int ProgramLength = 128; constexpr uint32_t InstructionCount = 1024; constexpr uint32_t ScratchpadSize = 1024 * 1024; constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index be3bc82..e9bc30a 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -119,7 +119,7 @@ signMask: ALIGN 64 program_begin: xor eax, r8d ;# read address register 1 - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] xor r8, qword ptr [rcx+0] xor r9, qword ptr [rcx+8] @@ -130,7 +130,7 @@ program_begin: xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] xor eax, r9d ;# read address register 2 - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] @@ -166,7 +166,7 @@ program_begin: xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] mov eax, r12d ;# write address register 1 - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 @@ -177,7 +177,7 @@ program_begin: mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 xor eax, r13d ;# write address register 2 - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] mulpd xmm0, xmm4 mulpd xmm1, xmm5 diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 86285de..55c9b79 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -20,51 +20,51 @@ along with RandomX. If not, see. #pragma once //Integer -#define WT_IADD_R 10 +#define WT_IADD_R 12 #define WT_IADD_M 3 -#define WT_IADD_RC 10 -#define WT_ISUB_R 10 +#define WT_IADD_RC 12 +#define WT_ISUB_R 12 #define WT_ISUB_M 3 #define WT_IMUL_9C 10 -#define WT_IMUL_R 20 -#define WT_IMUL_M 6 -#define WT_IMULH_R 6 -#define WT_IMULH_M 2 -#define WT_ISMULH_R 6 -#define WT_ISMULH_M 2 +#define WT_IMUL_R 16 +#define WT_IMUL_M 4 +#define WT_IMULH_R 4 +#define WT_IMULH_M 1 +#define WT_ISMULH_R 4 +#define WT_ISMULH_M 1 #define WT_IDIV_C 4 #define WT_ISDIV_C 4 #define WT_INEG_R 2 #define WT_IXOR_R 12 -#define WT_IXOR_M 4 -#define WT_IROR_R 10 -#define WT_IROL_R 10 +#define WT_IXOR_M 3 +#define WT_IROR_R 12 +#define WT_IROL_R 12 //Common floating point -#define WT_FPSWAP_R 6 +#define WT_FPSWAP_R 8 //Floating point group F -#define WT_FPADD_R 18 -#define WT_FPADD_M 3 -#define WT_FPSUB_R 18 -#define WT_FPSUB_M 3 -#define WT_FPNEG_R 5 +#define WT_FPADD_R 20 +#define WT_FPADD_M 5 +#define WT_FPSUB_R 20 +#define WT_FPSUB_M 5 +#define WT_FPNEG_R 6 //Floating point group E -#define WT_FPMUL_R 18 -#define WT_FPMUL_M 3 -#define WT_FPDIV_R 6 +#define WT_FPMUL_R 16 +#define WT_FPMUL_M 4 +#define WT_FPDIV_R 7 #define WT_FPDIV_M 1 #define WT_FPSQRT_R 6 //Control -#define WT_COND_R 12 -#define WT_COND_M 4 +#define WT_COND_R 7 +#define WT_COND_M 1 #define WT_CFROUND 1 //Store -#define WT_ISTORE 12 -#define WT_FSTORE 6 +#define WT_ISTORE 18 +#define WT_FSTORE 0 #define WT_NOP 0 @@ -115,6 +115,7 @@ static_assert(wtSum == 256, #define REP33(x) REP32(x) x, #define REP40(x) REP32(x) REP8(x) #define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x) +#define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x) #define REP256(x) REP128(x) REP128(x) #define REPNX(x,N) REP##N(x) #define REPN(x,N) REPNX(x,N) diff --git a/src/main.cpp b/src/main.cpp index 12e9cdb..4f5a021 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -169,12 +169,10 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); vm->initializeScratchpad(scratchpad, spIndex); - //vm->initializeProgram(hash); + vm->setScratchpad(scratchpad); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); for (int chain = 0; chain < 16; ++chain) { vm->initializeProgram(hash); - int segment = hash[3] & 3; - vm->setScratchpad(scratchpad + segment * RandomX::ScratchpadSize / 4); vm->execute(); vm->getResult(nullptr, 0, hash); } diff --git a/src/program.inc b/src/program.inc index 21f7d0b..d901e9a 100644 --- a/src/program.inc +++ b/src/program.inc @@ -10,54 +10,54 @@ mulpd xmm6, xmm10 ; IMUL_R r6, r3 imul r14, r11 - ; FPMUL_R e1, a0 - mulpd xmm5, xmm8 - ; IROR_R r5, r3 + ; FPSUB_M f1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; IROL_R r5, r3 mov ecx, r11d - ror r13, cl + rol r13, cl ; FPMUL_R e2, a0 mulpd xmm6, xmm8 - ; FPNEG_R f3 - xorps xmm3, xmm15 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 ; IXOR_R r0, r4 xor r8, r12 - ; ISMULH_R r3, r7 + ; ISMULH_M r3, L1[r7] + mov ecx, r15d + and ecx, 16376 mov rax, r11 - imul r15 + imul qword ptr [rsi+rcx] mov r11, rdx ; FPSWAP_R f2 shufpd xmm2, xmm2, 1 - ; ISMULH_R r6, r0 - mov rax, r14 - imul r8 - mov r14, rdx + ; IDIV_C r6, 1248528248 + mov rax, 15864311168205210203 + mul r14 + shr rdx, 30 + add r14, rdx ; FPMUL_R e0, a2 mulpd xmm4, xmm10 - ; ISUB_R r3, r4 - sub r11, r12 + ; IADD_RC r3, r4, -52260428 + lea r11, [r11+r12-52260428] ; IADD_R r7, -1138617760 add r15, -1138617760 - ; IROR_R r2, r6 + ; IROL_R r2, r6 mov ecx, r14d - ror r10, cl - ; FPMUL_R e2, a1 - mulpd xmm6, xmm9 + rol r10, cl + ; FPNEG_R f2 + xorps xmm2, xmm15 ; IROR_R r7, r1 mov ecx, r9d ror r15, cl - ; COND_M r2, lt(L1[r7], -41618808) + ; COND_R r2, lt(r7, -41618808) xor ecx, ecx - mov eax, r15d - and eax, 16376 - cmp dword ptr [rsi+rax], -41618808 + cmp r15d, -41618808 setl cl add r10, rcx - ; FPMUL_M e3, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 + ; FPMUL_R e3, a0 + mulpd xmm7, xmm8 ; CFROUND r1, 43 mov rax, r9 rol rax, 34 @@ -67,14 +67,17 @@ ldmxcsr dword ptr [rsp-8] ; FPADD_R f2, a1 addpd xmm2, xmm9 - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; FSTORE L1[r6], f2 + ; FPSUB_M f0, L1[r7] + mov eax, r15d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; ISTORE L1[r6], r2 mov eax, r14d - and eax, 16368 - movapd xmmword ptr [rsi+rax], xmm2 - ; IMUL_9C r6, -45112665 - lea r14, [r14+r14*8-45112665] + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; ISUB_R r6, r5 + sub r14, r13 ; IADD_M r0, L1[r4] mov eax, r12d and eax, 16376 @@ -87,41 +90,30 @@ mov eax, r14d and eax, 16376 mov qword ptr [rsi+rax], r14 - ; COND_R r4, sg(r1, -1189096105) - xor ecx, ecx - cmp r9d, -1189096105 - sets cl - add r12, rcx + ; FPSQRT_R e0 + sqrtpd xmm4, xmm4 ; IXOR_R r2, r5 xor r10, r13 - ; COND_R r1, be(r5, -965180434) - xor ecx, ecx - cmp r13d, -965180434 - setbe cl - add r9, rcx - ; FPMUL_M e1, L2[r3] - mov eax, r11d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm5, xmm12 - maxpd xmm5, xmm13 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; FPMUL_R e1, a3 + mulpd xmm5, xmm11 ; IMULH_R r7, r6 mov rax, r15 mul r14 mov r15, rdx - ; ISMULH_M r0, L1[r4] - mov ecx, r12d - and ecx, 16376 - mov rax, r8 - imul qword ptr [rsi+rcx] - mov r8, rdx + ; ISDIV_C r0, -1706892622 + mov rax, -5802075764249827661 + imul r8 + xor eax, eax + sar rdx, 29 + sets al + add rdx, rax + add r8, rdx ; IMUL_R r5, r3 imul r13, r11 - ; COND_R r2, of(r0, -1045938770) - xor ecx, ecx - cmp r8d, -1045938770 - seto cl - add r10, rcx + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 ; FPADD_M f3, L1[r4] mov eax, r12d and eax, 16376 @@ -131,18 +123,19 @@ add r11, r10 ; FPADD_R f1, a0 addpd xmm1, xmm8 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 + ; FPDIV_R e3, a2 + divpd xmm7, xmm10 + maxpd xmm7, xmm13 ; FPSUB_R f0, a1 subpd xmm0, xmm9 ; IMUL_M r5, L1[r6] mov eax, r14d and eax, 16376 imul r13, qword ptr [rsi+rax] - ; ISUB_R r1, r2 - sub r9, r10 - ; IMUL_R r4, r6 - imul r12, r14 + ; IADD_RC r1, r2, -1263285243 + lea r9, [r9+r10-1263285243] + ; IMUL_9C r4, 1994773931 + lea r12, [r12+r12*8+1994773931] ; FPSWAP_R e3 shufpd xmm7, xmm7, 1 ; IMUL_M r0, L1[r7] @@ -152,69 +145,72 @@ ; IROR_R r1, r6 mov ecx, r14d ror r9, cl - ; IROR_R r2, r4 + ; IROL_R r2, r4 mov ecx, r12d - ror r10, cl + rol r10, cl ; FPSUB_R f3, a1 subpd xmm3, xmm9 - ; FSTORE L1[r0], e1 + ; ISTORE L1[r0], r5 mov eax, r8d - and eax, 16368 - movapd xmmword ptr [rsi+rax], xmm5 - ; COND_R r2, sg(r3, 1269153133) - xor ecx, ecx - cmp r11d, 1269153133 - sets cl - add r10, rcx + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FPDIV_M e2, L2[r3] + mov eax, r11d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + divpd xmm6, xmm12 + maxpd xmm6, xmm13 ; FPSWAP_R f2 shufpd xmm2, xmm2, 1 ; IADD_R r7, r5 add r15, r13 - ; COND_R r0, be(r4, -1486502150) - xor ecx, ecx - cmp r12d, -1486502150 - setbe cl - add r8, rcx - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 + ; FPDIV_M e0, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + divpd xmm4, xmm12 + maxpd xmm4, xmm13 + ; FPADD_M f3, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 ; FPADD_R f0, a3 addpd xmm0, xmm11 ; IADD_R r2, r0 add r10, r8 - ; FSTORE L1[r3], e2 + ; ISTORE L1[r3], r6 mov eax, r11d - and eax, 16368 - movapd xmmword ptr [rsi+rax], xmm6 - ; IXOR_R r1, r7 - xor r9, r15 - ; IMUL_R r5, r7 - imul r13, r15 + and eax, 16376 + mov qword ptr [rsi+rax], r14 + ; IROR_R r1, r7 + mov ecx, r15d + ror r9, cl + ; IMUL_9C r5, 301671287 + lea r13, [r13+r13*8+301671287] ; IXOR_R r7, 266992378 xor r15, 266992378 - ; COND_R r7, no(r4, 1983804692) - xor ecx, ecx - cmp r12d, 1983804692 - setno cl - add r15, rcx + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 ; IMUL_M r2, L2[r0] mov eax, r8d and eax, 262136 imul r10, qword ptr [rsi+rax] - ; FPDIV_R e3, a2 - divpd xmm7, xmm10 - maxpd xmm7, xmm13 - ; IMUL_M r0, L2[r6] - mov eax, r14d - and eax, 262136 - imul r8, qword ptr [rsi+rax] + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IMUL_R r0, r6 + imul r8, r14 ; ISTORE L1[r0], r7 mov eax, r8d and eax, 16376 mov qword ptr [rsi+rax], r15 - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; FPADD_M f3, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 ; IROR_R r5, r4 mov ecx, r12d ror r13, cl @@ -222,17 +218,20 @@ mov eax, r15d and eax, 262136 mov qword ptr [rsi+rax], r10 - ; FPSWAP_R e2 - shufpd xmm6, xmm6, 1 + ; FPADD_R f2, a3 + addpd xmm2, xmm11 ; FPADD_M f3, L1[r2] mov eax, r10d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm3, xmm12 - ; IDIV_C r5, 2218798981 - mov rax, 17853839665672790751 - mul r13 - shr rdx, 31 + ; ISDIV_C r5, -2076168315 + mov rax, -4770095103914078469 + imul r13 + xor eax, eax + sar rdx, 29 + sets al + add rdx, rax add r13, rdx ; IADD_RC r0, r4, -1321374359 lea r8, [r8+r12-1321374359] @@ -250,28 +249,26 @@ rol r15, cl ; ISUB_R r2, r4 sub r10, r12 - ; IMULH_M r0, L1[12400] - mov rax, r8 - mul qword ptr [rsi+12400] - mov r8, rdx + ; ISMULH_R r0, -1500893068 + mov rax, -1500893068 + imul r8 + add r8, rdx ; IADD_R r2, r3 add r10, r11 - ; COND_R r6, lt(r1, -1124202227) - xor ecx, ecx - cmp r9d, -1124202227 - setl cl - add r14, rcx - ; IROR_R r7, r4 + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IROL_R r7, r4 mov ecx, r12d - ror r15, cl + rol r15, cl ; IMUL_R r4, r2 imul r12, r10 ; ISUB_R r3, r7 sub r11, r15 ; IADD_R r2, r7 add r10, r15 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 + ; FPDIV_R e3, a0 + divpd xmm7, xmm8 + maxpd xmm7, xmm13 ; ISUB_R r6, 540663146 sub r14, 540663146 ; IROL_R r5, 58 @@ -280,67 +277,65 @@ addpd xmm2, xmm9 ; FPADD_R f2, a2 addpd xmm2, xmm10 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 + ; FPDIV_R e1, a2 + divpd xmm5, xmm10 + maxpd xmm5, xmm13 ; FPADD_R f1, a2 addpd xmm1, xmm10 ; IADD_R r5, r3 add r13, r11 - ; IADD_M r7, L1[880] - add r15, qword ptr [rsi+880] + ; IADD_R r7, -1780268176 + add r15, -1780268176 ; ISUB_R r7, r0 sub r15, r8 ; ISTORE L2[r0], r7 mov eax, r8d and eax, 262136 mov qword ptr [rsi+rax], r15 - ; IDIV_C r2, 1014940364 - mov rax, r10 - shr rax, 2 - mov rcx, 1219717022984988185 - mul rcx - shr rdx, 24 - add r10, rdx - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IDIV_C r2, 3059159304 - mov rax, 12949335853590502915 - mul r10 - shr rdx, 31 - add r10, rdx + ; INEG_R r2 + neg r10 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; INEG_R r2 + neg r10 ; IADD_R r0, r3 add r8, r11 ; IMUL_9C r7, -2124093035 lea r15, [r15+r15*8-2124093035] - ; FPSUB_R f2, a0 - subpd xmm2, xmm8 - ; FPDIV_R e0, a2 - divpd xmm4, xmm10 + ; FPADD_M f2, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 + ; FPMUL_M e0, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm4, xmm12 maxpd xmm4, xmm13 ; FPSUB_R f2, a3 subpd xmm2, xmm11 ; IMUL_R r1, r2 imul r9, r10 - ; ISMULH_R r7, r5 - mov rax, r15 - imul r13 - mov r15, rdx + ; IDIV_C r7, 3214009572 + mov rax, 12325439725582798855 + mul r15 + shr rdx, 31 + add r15, rdx ; IMULH_R r3, r2 mov rax, r11 mul r10 mov r11, rdx - ; IXOR_M r1, L2[r0] - mov eax, r8d - and eax, 262136 - xor r9, qword ptr [rsi+rax] + ; IROR_R r1, r0 + mov ecx, r8d + ror r9, cl ; FPMUL_R e0, a1 mulpd xmm4, xmm9 - ; ISUB_R r4, 1456841848 - sub r12, 1456841848 - ; IXOR_M r3, L2[r2] - mov eax, r10d - and eax, 262136 - xor r11, qword ptr [rsi+rax] + ; IADD_RC r4, r4, 1456841848 + lea r12, [r12+r12+1456841848] + ; IROR_R r3, r2 + mov ecx, r10d + ror r11, cl ; COND_M r0, of(L1[r4], 1678513610) xor ecx, ecx mov eax, r12d @@ -348,446 +343,39 @@ cmp dword ptr [rsi+rax], 1678513610 seto cl add r8, rcx - ; IDIV_C r4, 2674394209 - mov rax, 925772300223658071 - mul r12 - shr rdx, 27 - add r12, rdx + ; INEG_R r4 + neg r12 ; IMUL_R r4, r1 imul r12, r9 ; FPADD_R f1, a2 addpd xmm1, xmm10 ; FPSUB_R f2, a0 subpd xmm2, xmm8 - ; FPMUL_M e1, L2[r6] - mov eax, r14d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm5, xmm12 - maxpd xmm5, xmm13 - ; FPSUB_M f0, L2[r3] - mov eax, r11d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 + ; FPMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FPSUB_R f0, a3 + subpd xmm0, xmm11 ; IROR_R r0, r7 mov ecx, r15d ror r8, cl - ; FSTORE L2[r1], e0 + ; ISTORE L2[r1], r4 mov eax, r9d - and eax, 262128 - movapd xmmword ptr [rsi+rax], xmm4 - ; IROR_R r7, r6 + and eax, 262136 + mov qword ptr [rsi+rax], r12 + ; IROL_R r7, r6 mov ecx, r14d - ror r15, cl + rol r15, cl ; IMUL_9C r2, 266593902 lea r10, [r10+r10*8+266593902] ; IMUL_R r4, r6 imul r12, r14 ; FPSUB_R f2, a2 subpd xmm2, xmm10 - ; FPMUL_R e3, a0 - mulpd xmm7, xmm8 - ; IXOR_M r7, L1[r2] - mov eax, r10d - and eax, 16376 - xor r15, qword ptr [rsi+rax] + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; IROR_R r7, r2 + mov ecx, r10d + ror r15, cl ; IROR_R r0, r5 mov ecx, r13d ror r8, cl - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; FPADD_R f3, a1 - addpd xmm3, xmm9 - ; FPADD_R f1, a0 - addpd xmm1, xmm8 - ; COND_M r2, ge(L2[r2], -226330940) - xor ecx, ecx - mov eax, r10d - and eax, 262136 - cmp dword ptr [rsi+rax], -226330940 - setge cl - add r10, rcx - ; FPDIV_R e2, a3 - divpd xmm6, xmm11 - maxpd xmm6, xmm13 - ; FPMUL_R e2, a1 - mulpd xmm6, xmm9 - ; FPSUB_R f1, a0 - subpd xmm1, xmm8 - ; IMUL_R r7, r5 - imul r15, r13 - ; IMUL_R r0, r1 - imul r8, r9 - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 - ; IROL_R r3, r5 - mov ecx, r13d - rol r11, cl - ; IADD_RC r5, r2, 795784298 - lea r13, [r13+r10+795784298] - ; ISUB_R r0, r4 - sub r8, r12 - ; IMUL_R r5, r4 - imul r13, r12 - ; FPSUB_R f0, a2 - subpd xmm0, xmm10 - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; ISDIV_C r3, 1662492575 - mov rax, 2978515652703905219 - imul r11 - xor eax, eax - sar rdx, 28 - sets al - add rdx, rax - add r11, rdx - ; ISMULH_R r5, r0 - mov rax, r13 - imul r8 - mov r13, rdx - ; ISDIV_C r4, 1963597892 - mov rax, -8359627607928540073 - imul r12 - xor eax, eax - add rdx, r12 - sar rdx, 30 - sets al - add rdx, rax - add r12, rdx - ; IMUL_R r7, r0 - imul r15, r8 - ; IMULH_M r0, L1[r3] - mov ecx, r11d - and ecx, 16376 - mov rax, r8 - mul qword ptr [rsi+rcx] - mov r8, rdx - ; IXOR_R r3, r7 - xor r11, r15 - ; IDIV_C r4, 1146125335 - mov rax, 8640870253760721727 - mul r12 - shr rdx, 29 - add r12, rdx - ; FPSWAP_R f3 - shufpd xmm3, xmm3, 1 - ; IXOR_M r2, L1[r0] - mov eax, r8d - and eax, 16376 - xor r10, qword ptr [rsi+rax] - ; IROR_R r0, r1 - mov ecx, r9d - ror r8, cl - ; IXOR_R r7, r4 - xor r15, r12 - ; ISMULH_R r6, r2 - mov rax, r14 - imul r10 - mov r14, rdx - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IADD_RC r4, r2, 1704868083 - lea r12, [r12+r10+1704868083] - ; FPSUB_R f2, a0 - subpd xmm2, xmm8 - ; ISTORE L1[r0], r0 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; FPSUB_R f0, a3 - subpd xmm0, xmm11 - ; FPDIV_R e0, a3 - divpd xmm4, xmm11 - maxpd xmm4, xmm13 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; ISUB_R r7, 1302457878 - sub r15, 1302457878 - ; IMUL_9C r1, 1330165941 - lea r9, [r9+r9*8+1330165941] - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IROL_R r0, r4 - mov ecx, r12d - rol r8, cl - ; FPSUB_M f1, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; IROL_R r5, r6 - mov ecx, r14d - rol r13, cl - ; COND_M r0, ab(L1[r1], -310933871) - xor ecx, ecx - mov eax, r9d - and eax, 16376 - cmp dword ptr [rsi+rax], -310933871 - seta cl - add r8, rcx - ; CFROUND r7, 39 - mov rax, r15 - rol rax, 38 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp-8], eax - ldmxcsr dword ptr [rsp-8] - ; FPDIV_R e0, a1 - divpd xmm4, xmm9 - maxpd xmm4, xmm13 - ; IMUL_M r1, L1[r3] - mov eax, r11d - and eax, 16376 - imul r9, qword ptr [rsi+rax] - ; IMUL_9C r3, 1573236728 - lea r11, [r11+r11*8+1573236728] - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; COND_R r1, lt(r4, -1805702334) - xor ecx, ecx - cmp r12d, -1805702334 - setl cl - add r9, rcx - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; IADD_R r7, -1421188024 - add r15, -1421188024 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; FPSUB_M f2, L2[r7] - mov eax, r15d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; ISUB_R r2, r4 - sub r10, r12 - ; ISMULH_R r4, r5 - mov rax, r12 - imul r13 - mov r12, rdx - ; COND_R r1, of(r7, 1294727006) - xor ecx, ecx - cmp r15d, 1294727006 - seto cl - add r9, rcx - ; IADD_M r5, L2[r2] - mov eax, r10d - and eax, 262136 - add r13, qword ptr [rsi+rax] - ; IMUL_9C r4, 401020510 - lea r12, [r12+r12*8+401020510] - ; IROL_R r3, r0 - mov ecx, r8d - rol r11, cl - ; ISTORE L1[r7], r0 - mov eax, r15d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; FPSUB_R f2, a1 - subpd xmm2, xmm9 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IMUL_R r3, 720965215 - imul r11, 720965215 - ; IMUL_R r6, r2 - imul r14, r10 - ; ISTORE L1[r7], r3 - mov eax, r15d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; IROR_R r2, r6 - mov ecx, r14d - ror r10, cl - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IMUL_9C r4, 788211341 - lea r12, [r12+r12*8+788211341] - ; IMUL_9C r3, -67993446 - lea r11, [r11+r11*8-67993446] - ; FPSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; IMUL_M r2, L1[r6] - mov eax, r14d - and eax, 16376 - imul r10, qword ptr [rsi+rax] - ; COND_M r2, ge(L1[r2], -1892157506) - xor ecx, ecx - mov eax, r10d - and eax, 16376 - cmp dword ptr [rsi+rax], -1892157506 - setge cl - add r10, rcx - ; FPADD_M f1, L1[r3] - mov eax, r11d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm1, xmm12 - ; IADD_M r7, L1[r0] - mov eax, r8d - and eax, 16376 - add r15, qword ptr [rsi+rax] - ; ISDIV_C r1, 624867857 - mov rax, 7924491717200811467 - imul r9 - xor eax, eax - sar rdx, 28 - sets al - add rdx, rax - add r9, rdx - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; ISUB_R r5, r7 - sub r13, r15 - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; IMUL_R r6, r2 - imul r14, r10 - ; FPMUL_M e3, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 - ; IADD_R r0, r4 - add r8, r12 - ; FPSUB_M f3, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; FPMUL_R e2, a0 - mulpd xmm6, xmm8 - ; INEG_R r2 - neg r10 - ; FPMUL_R e2, a2 - mulpd xmm6, xmm10 - ; FPSUB_M f3, L1[r6] - mov eax, r14d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; FPADD_R f1, a3 - addpd xmm1, xmm11 - ; IMULH_R r3, r2 - mov rax, r11 - mul r10 - mov r11, rdx - ; FPSUB_R f0, a3 - subpd xmm0, xmm11 - ; IDIV_C r5, 2887845607 - mov rax, 13717520480010955377 - mul r13 - shr rdx, 31 - add r13, rdx - ; ISMULH_M r6, L1[r2] - mov ecx, r10d - and ecx, 16376 - mov rax, r14 - imul qword ptr [rsi+rcx] - mov r14, rdx - ; FPSUB_R f3, a3 - subpd xmm3, xmm11 - ; IMUL_M r6, L1[r7] - mov eax, r15d - and eax, 16376 - imul r14, qword ptr [rsi+rax] - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; FPMUL_R e2, a0 - mulpd xmm6, xmm8 - ; IMUL_9C r6, 295130073 - lea r14, [r14+r14*8+295130073] - ; FPADD_R f1, a1 - addpd xmm1, xmm9 - ; IXOR_R r0, r5 - xor r8, r13 - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; FPSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IADD_RC r3, r6, -1317630728 - lea r11, [r11+r14-1317630728] - ; IMUL_M r2, L1[r3] - mov eax, r11d - and eax, 16376 - imul r10, qword ptr [rsi+rax] - ; IADD_RC r1, r4, 894105694 - lea r9, [r9+r12+894105694] - ; IMUL_R r7, r0 - imul r15, r8 - ; FPSUB_R f1, a0 - subpd xmm1, xmm8 - ; IMUL_M r7, L1[r1] - mov eax, r9d - and eax, 16376 - imul r15, qword ptr [rsi+rax] - ; IXOR_R r2, r4 - xor r10, r12 - ; ISUB_M r0, L1[r1] - mov eax, r9d - and eax, 16376 - sub r8, qword ptr [rsi+rax] - ; INEG_R r4 - neg r12 - ; IMUL_9C r4, -285272388 - lea r12, [r12+r12*8-285272388] - ; IMUL_R r7, r4 - imul r15, r12 - ; IMULH_M r5, L1[r7] - mov ecx, r15d - and ecx, 16376 - mov rax, r13 - mul qword ptr [rsi+rcx] - mov r13, rdx - ; IROL_R r1, r7 - mov ecx, r15d - rol r9, cl - ; IXOR_R r4, -757532727 - xor r12, -757532727 - ; IMUL_R r3, 1863959234 - imul r11, 1863959234 - ; IROL_R r4, 59 - rol r12, 59 - ; ISMULH_R r1, 2122681086 - mov rax, 2122681086 - imul r9 - add r9, rdx - ; ISTORE L2[r6], r7 - mov eax, r14d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; ISTORE L1[r1], r5 - mov eax, r9d - and eax, 16376 - mov qword ptr [rsi+rax], r13 - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; COND_R r2, ns(r1, 486049737) - xor ecx, ecx - cmp r9d, 486049737 - setns cl - add r10, rcx - ; FPMUL_M e0, L2[r7] - mov eax, r15d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm4, xmm12 - maxpd xmm4, xmm13 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IROL_R r5, r2 - mov ecx, r10d - rol r13, cl - ; IADD_M r0, L1[r4] - mov eax, r12d - and eax, 16376 - add r8, qword ptr [rsi+rax]