From b417fd08eaf8036fafc0faeb85c6d88d807b09cc Mon Sep 17 00:00:00 2001 From: tevador Date: Tue, 5 Feb 2019 23:06:44 +0100 Subject: [PATCH] 16 -> 8 chained programs constant address loads are always from L3 --- src/AssemblyGeneratorX86.cpp | 2 +- src/Instruction.cpp | 2 +- src/JitCompilerX86.cpp | 2 +- src/common.hpp | 3 +- src/instructionWeights.hpp | 4 +- src/main.cpp | 2 +- src/program.inc | 1095 +++++++++++++++++++++++----------- 7 files changed, 749 insertions(+), 361 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 3092e4d..70d396b 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -81,7 +81,7 @@ namespace RandomX { } int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { - return instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + return instr.imm32 & ScratchpadL3Mask; } //1 uOP diff --git a/src/Instruction.cpp b/src/Instruction.cpp index ce75f43..8a175fc 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -37,7 +37,7 @@ namespace RandomX { } void Instruction::genAddressImm(std::ostream& os) const { - os << ((mod % 4) ? "L1" : "L2") << "[" << (imm32 & ((mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; + os << "L3" << "[" << (imm32 & ScratchpadL3Mask) << "]"; } void Instruction::h_IADD_R(std::ostream& os) const { diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index d8e7a42..e926e4a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -262,7 +262,7 @@ namespace RandomX { } void JitCompilerX86::genAddressImm(Instruction& instr) { - emit32(instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)); + emit32(instr.imm32 & ScratchpadL3Mask); } void JitCompilerX86::h_IADD_R(Instruction& instr) { diff --git a/src/common.hpp b/src/common.hpp index e52dbc2..ea67ff9 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -68,7 +68,7 @@ namespace RandomX { }; constexpr int ProgramLength = 256; - constexpr uint32_t InstructionCount = 1024; + constexpr uint32_t InstructionCount = 2048; constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024; constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t); @@ -78,6 +78,7 @@ namespace RandomX { constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; + constexpr int ScratchpadL3Mask = (ScratchpadLength - 1) * 8; constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index d24800e..0bb26ff 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -25,7 +25,7 @@ along with RandomX. If not, see. #define WT_IADD_RC 12 #define WT_ISUB_R 12 #define WT_ISUB_M 3 -#define WT_IMUL_9C 10 +#define WT_IMUL_9C 9 #define WT_IMUL_R 16 #define WT_IMUL_M 4 #define WT_IMULH_R 4 @@ -36,7 +36,7 @@ along with RandomX. If not, see. #define WT_ISDIV_C 4 #define WT_INEG_R 2 #define WT_IXOR_R 12 -#define WT_IXOR_M 3 +#define WT_IXOR_M 4 #define WT_IROR_R 10 #define WT_IROL_R 10 #define WT_ISWAP_R 4 diff --git a/src/main.cpp b/src/main.cpp index c761b97..58b381c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -172,7 +172,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash //vm->initializeScratchpad(scratchpad, spIndex); vm->setScratchpad(scratchpad); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); - for (int chain = 0; chain < 16; ++chain) { + for (int chain = 0; chain < 8; ++chain) { vm->initializeProgram(hash); vm->execute(); vm->getResult(nullptr, 0, hash); diff --git a/src/program.inc b/src/program.inc index d901e9a..e4de06f 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,381 +1,768 @@ - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IADD_RC r2, r5, -1621224194 - lea r10, [r10+r13-1621224194] - ; ISTORE L2[r2], r7 - mov eax, r10d + ; IMUL_R r0, r7 + imul r8, r15 + ; ISMULH_R r2, r1 + mov rax, r10 + imul r9 + mov r10, rdx + ; IMUL_R r2, r4 + imul r10, r12 + ; IADD_R r7, r0 + add r15, r8 + ; FPSQRT_R e0 + sqrtpd xmm4, xmm4 + ; IMUL_R r3, r6 + imul r11, r14 + ; FPMUL_R e3, a1 + mulpd xmm7, xmm9 + ; IMULH_M r6, L1[r3] + mov ecx, r11d + and ecx, 16376 + mov rax, r14 + mul qword ptr [rsi+rcx] + mov r14, rdx + ; IMUL_R r5, r1 + imul r13, r9 + ; FPADD_M f0, L2[r6] + mov eax, r14d and eax, 262136 - mov qword ptr [rsi+rax], r15 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 + ; IROR_R r4, r3 + mov ecx, r11d + ror r12, cl + ; IXOR_M r4, L3[984888] + xor r12, qword ptr [rsi+984888] + ; IROR_R r0, r3 + mov ecx, r11d + ror r8, cl + ; IROR_R r0, r4 + mov ecx, r12d + ror r8, cl + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; IMUL_R r0, r2 + imul r8, r10 + ; ISUB_M r0, L1[r3] + mov eax, r11d + and eax, 16376 + sub r8, qword ptr [rsi+rax] + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; ISWAP_R r7, r4 + xchg r15, r12 + ; IDIV_C r1, 3690475308 + mov rax, r9 + shr rax, 2 + mov rcx, 5367070356934653253 + mul rcx + shr rdx, 28 + add r9, rdx + ; IROL_R r4, r2 + mov ecx, r10d + rol r12, cl + ; IMUL_M r5, L1[r4] + mov eax, r12d + and eax, 16376 + imul r13, qword ptr [rsi+rax] + ; IROL_R r4, r7 + mov ecx, r15d + rol r12, cl + ; ISUB_R r3, r1 + sub r11, r9 + ; IADD_R r7, r0 + add r15, r8 + ; IADD_M r1, L1[r3] + mov eax, r11d + and eax, 16376 + add r9, qword ptr [rsi+rax] ; FPMUL_R e2, a2 mulpd xmm6, xmm10 - ; IMUL_R r6, r3 - imul r14, r11 - ; FPSUB_M f1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; IROL_R r5, r3 - mov ecx, r11d - rol r13, cl - ; FPMUL_R e2, a0 - mulpd xmm6, xmm8 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; IXOR_R r0, r4 - xor r8, r12 - ; ISMULH_M r3, L1[r7] - mov ecx, r15d - and ecx, 16376 - mov rax, r11 - imul qword ptr [rsi+rcx] - mov r11, rdx - ; FPSWAP_R f2 - shufpd xmm2, xmm2, 1 - ; IDIV_C r6, 1248528248 - mov rax, 15864311168205210203 - mul r14 - shr rdx, 30 - add r14, rdx - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IADD_RC r3, r4, -52260428 - lea r11, [r11+r12-52260428] - ; IADD_R r7, -1138617760 - add r15, -1138617760 - ; IROL_R r2, r6 - mov ecx, r14d - rol r10, cl - ; FPNEG_R f2 - xorps xmm2, xmm15 - ; IROR_R r7, r1 - mov ecx, r9d - ror r15, cl - ; COND_R r2, lt(r7, -41618808) - xor ecx, ecx - cmp r15d, -41618808 - setl cl - add r10, rcx - ; FPMUL_R e3, a0 - mulpd xmm7, xmm8 - ; CFROUND r1, 43 - mov rax, r9 - rol rax, 34 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp-8], eax - ldmxcsr dword ptr [rsp-8] - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; FPSUB_M f0, L1[r7] - mov eax, r15d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 - ; ISTORE L1[r6], r2 - mov eax, r14d - and eax, 16376 - mov qword ptr [rsi+rax], r10 - ; ISUB_R r6, r5 - sub r14, r13 - ; IADD_M r0, L1[r4] - mov eax, r12d - and eax, 16376 - add r8, qword ptr [rsi+rax] + ; IADD_R r6, -1115286770 + add r14, -1115286770 + ; FPDIV_R e2, a3 + divpd xmm6, xmm11 + maxpd xmm6, xmm13 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; IXOR_R r3, r7 + xor r11, r15 ; ISTORE L1[r4], r3 mov eax, r12d and eax, 16376 mov qword ptr [rsi+rax], r11 - ; ISTORE L1[r6], r6 - mov eax, r14d - and eax, 16376 - mov qword ptr [rsi+rax], r14 - ; FPSQRT_R e0 - sqrtpd xmm4, xmm4 - ; IXOR_R r2, r5 - xor r10, r13 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IMULH_R r7, r6 - mov rax, r15 - mul r14 - mov r15, rdx - ; ISDIV_C r0, -1706892622 - mov rax, -5802075764249827661 - imul r8 - xor eax, eax - sar rdx, 29 - sets al - add rdx, rax - add r8, rdx - ; IMUL_R r5, r3 - imul r13, r11 - ; FPSQRT_R e2 - sqrtpd xmm6, xmm6 - ; FPADD_M f3, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; IADD_R r3, r2 - add r11, r10 - ; FPADD_R f1, a0 - addpd xmm1, xmm8 - ; FPDIV_R e3, a2 - divpd xmm7, xmm10 - maxpd xmm7, xmm13 - ; FPSUB_R f0, a1 - subpd xmm0, xmm9 - ; IMUL_M r5, L1[r6] - mov eax, r14d - and eax, 16376 - imul r13, qword ptr [rsi+rax] - ; IADD_RC r1, r2, -1263285243 - lea r9, [r9+r10-1263285243] - ; IMUL_9C r4, 1994773931 - lea r12, [r12+r12*8+1994773931] - ; FPSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; IMUL_M r0, L1[r7] - mov eax, r15d - and eax, 16376 - imul r8, qword ptr [rsi+rax] - ; IROR_R r1, r6 + ; IROR_R r3, r6 mov ecx, r14d - ror r9, cl - ; IROL_R r2, r4 - mov ecx, r12d - rol r10, cl - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 - ; ISTORE L1[r0], r5 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r13 - ; FPDIV_M e2, L2[r3] - mov eax, r11d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - divpd xmm6, xmm12 - maxpd xmm6, xmm13 - ; FPSWAP_R f2 - shufpd xmm2, xmm2, 1 - ; IADD_R r7, r5 - add r15, r13 - ; FPDIV_M e0, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - divpd xmm4, xmm12 - maxpd xmm4, xmm13 - ; FPADD_M f3, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; FPADD_R f0, a3 - addpd xmm0, xmm11 - ; IADD_R r2, r0 - add r10, r8 - ; ISTORE L1[r3], r6 - mov eax, r11d - and eax, 16376 - mov qword ptr [rsi+rax], r14 - ; IROR_R r1, r7 - mov ecx, r15d - ror r9, cl - ; IMUL_9C r5, 301671287 - lea r13, [r13+r13*8+301671287] - ; IXOR_R r7, 266992378 - xor r15, 266992378 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IMUL_M r2, L2[r0] - mov eax, r8d - and eax, 262136 - imul r10, qword ptr [rsi+rax] + ror r11, cl + ; ISMULH_R r0, r6 + mov rax, r8 + imul r14 + mov r8, rdx + ; IROR_R r6, r5 + mov ecx, r13d + ror r14, cl + ; IMULH_M r6, L2[r0] + mov ecx, r8d + and ecx, 262136 + mov rax, r14 + mul qword ptr [rsi+rcx] + mov r14, rdx + ; ISUB_R r2, 1512125960 + sub r10, 1512125960 + ; IMUL_R r7, r6 + imul r15, r14 + ; IMULH_R r6, r7 + mov rax, r14 + mul r15 + mov r14, rdx + ; ISUB_R r4, r1 + sub r12, r9 ; FPMUL_R e3, a2 mulpd xmm7, xmm10 - ; IMUL_R r0, r6 - imul r8, r14 - ; ISTORE L1[r0], r7 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r15 - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; FPADD_M f3, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; IROR_R r5, r4 - mov ecx, r12d - ror r13, cl - ; ISTORE L2[r7], r2 - mov eax, r15d - and eax, 262136 - mov qword ptr [rsi+rax], r10 - ; FPADD_R f2, a3 - addpd xmm2, xmm11 - ; FPADD_M f3, L1[r2] - mov eax, r10d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; ISDIV_C r5, -2076168315 - mov rax, -4770095103914078469 - imul r13 - xor eax, eax - sar rdx, 29 - sets al - add rdx, rax - add r13, rdx - ; IADD_RC r0, r4, -1321374359 - lea r8, [r8+r12-1321374359] - ; CFROUND r6, 28 - mov rax, r14 - rol rax, 49 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp-8], eax - ldmxcsr dword ptr [rsp-8] - ; FPADD_R f2, a2 - addpd xmm2, xmm10 - ; IROL_R r7, r6 - mov ecx, r14d - rol r15, cl - ; ISUB_R r2, r4 - sub r10, r12 - ; ISMULH_R r0, -1500893068 - mov rax, -1500893068 - imul r8 - add r8, rdx - ; IADD_R r2, r3 - add r10, r11 - ; FPSQRT_R e2 - sqrtpd xmm6, xmm6 - ; IROL_R r7, r4 - mov ecx, r12d - rol r15, cl - ; IMUL_R r4, r2 - imul r12, r10 - ; ISUB_R r3, r7 - sub r11, r15 - ; IADD_R r2, r7 - add r10, r15 - ; FPDIV_R e3, a0 - divpd xmm7, xmm8 - maxpd xmm7, xmm13 - ; ISUB_R r6, 540663146 - sub r14, 540663146 - ; IROL_R r5, 58 - rol r13, 58 - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; FPADD_R f2, a2 - addpd xmm2, xmm10 - ; FPDIV_R e1, a2 - divpd xmm5, xmm10 - maxpd xmm5, xmm13 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; IADD_R r5, r3 - add r13, r11 - ; IADD_R r7, -1780268176 - add r15, -1780268176 - ; ISUB_R r7, r0 - sub r15, r8 - ; ISTORE L2[r0], r7 - mov eax, r8d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; INEG_R r2 - neg r10 - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; INEG_R r2 - neg r10 - ; IADD_R r0, r3 - add r8, r11 - ; IMUL_9C r7, -2124093035 - lea r15, [r15+r15*8-2124093035] + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; IXOR_R r5, r2 + xor r13, r10 ; FPADD_M f2, L1[r0] mov eax, r8d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm2, xmm12 - ; FPMUL_M e0, L1[r6] + ; IMULH_R r6, r1 + mov rax, r14 + mul r9 + mov r14, rdx + ; ISUB_M r5, L1[r0] + mov eax, r8d + and eax, 16376 + sub r13, qword ptr [rsi+rax] + ; FPMUL_R e2, a3 + mulpd xmm6, xmm11 + ; IMUL_R r4, r6 + imul r12, r14 + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; ISUB_R r3, r2 + sub r11, r10 + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IROL_R r7, r0 + mov ecx, r8d + rol r15, cl + ; FPSUB_R f3, a2 + subpd xmm3, xmm10 + ; IROL_R r3, r7 + mov ecx, r15d + rol r11, cl + ; ISWAP_R r5, r7 + xchg r13, r15 + ; IDIV_C r5, 749951529 + mov rax, 13205547200481862341 + mul r13 + shr rdx, 29 + add r13, rdx + ; FPADD_R f3, a0 + addpd xmm3, xmm8 + ; IMUL_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + imul r8, qword ptr [rsi+rax] + ; FPADD_R f1, a1 + addpd xmm1, xmm9 + ; IROR_R r2, 60 + ror r10, 60 + ; IROR_R r5, r4 + mov ecx, r12d + ror r13, cl + ; FPADD_R f2, a0 + addpd xmm2, xmm8 + ; IXOR_M r4, L1[r6] mov eax, r14d and eax, 16376 + xor r12, qword ptr [rsi+rax] + ; IXOR_R r2, r6 + xor r10, r14 + ; FPADD_M f3, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 + ; ISUB_R r7, r6 + sub r15, r14 + ; IMUL_9C r2, -962375579 + lea r10, [r10+r10*8-962375579] + ; FPSUB_R f3, a2 + subpd xmm3, xmm10 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 + ; IMUL_R r1, r5 + imul r9, r13 + ; IMUL_R r6, r4 + imul r14, r12 + ; ISWAP_R r0, r2 + xchg r8, r10 + ; ISUB_R r6, r5 + sub r14, r13 + ; FPSUB_R f2, a1 + subpd xmm2, xmm9 + ; ISDIV_C r6, 652931802 + mov rax, -3278972671018643631 + imul r14 + xor eax, eax + add rdx, r14 + sar rdx, 29 + sets al + add rdx, rax + add r14, rdx + ; IMUL_9C r5, -1142924545 + lea r13, [r13+r13*8-1142924545] + ; ISUB_R r7, 1085161834 + sub r15, 1085161834 + ; IMUL_R r4, r6 + imul r12, r14 + ; FPMUL_M e1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; FPMUL_M e3, L2[r1] + mov eax, r9d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; COND_R r2, lt(r5, 1635027096) + xor ecx, ecx + cmp r13d, 1635027096 + setl cl + add r10, rcx + ; IMUL_R r5, -1219696062 + imul r13, -1219696062 + ; IXOR_R r5, r0 + xor r13, r8 + ; FPNEG_R f2 + xorps xmm2, xmm15 + ; FPADD_R f3, a2 + addpd xmm3, xmm10 + ; FPSUB_R f1, a3 + subpd xmm1, xmm11 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; FPDIV_R e1, a3 + divpd xmm5, xmm11 + maxpd xmm5, xmm13 + ; IXOR_M r6, L1[r0] + mov eax, r8d + and eax, 16376 + xor r14, qword ptr [rsi+rax] + ; ISUB_R r7, r4 + sub r15, r12 + ; ISUB_M r6, L1[r1] + mov eax, r9d + and eax, 16376 + sub r14, qword ptr [rsi+rax] + ; ISTORE L1[r5], r3 + mov eax, r13d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IMUL_R r5, r1 + imul r13, r9 + ; IROR_R r3, r2 + mov ecx, r10d + ror r11, cl + ; IMUL_R r4, r7 + imul r12, r15 + ; ISDIV_C r6, -54134756 + mov rax, 7012869325244995177 + imul r14 + xor eax, eax + sub rdx, r14 + sar rdx, 25 + sets al + add rdx, rax + add r14, rdx + ; FPMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FPSUB_M f2, L2[r4] + mov eax, r12d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; IMUL_R r0, r5 + imul r8, r13 + ; FPMUL_R e3, a0 + mulpd xmm7, xmm8 + ; COND_R r5, be(r4, 1545677311) + xor ecx, ecx + cmp r12d, 1545677311 + setbe cl + add r13, rcx + ; IMUL_R r6, r3 + imul r14, r11 + ; IROL_R r6, r2 + mov ecx, r10d + rol r14, cl + ; FPDIV_R e3, a1 + divpd xmm7, xmm9 + maxpd xmm7, xmm13 + ; IXOR_M r5, L1[r1] + mov eax, r9d + and eax, 16376 + xor r13, qword ptr [rsi+rax] + ; COND_R r3, ab(r2, 1734636060) + xor ecx, ecx + cmp r10d, 1734636060 + seta cl + add r11, rcx + ; ISTORE L1[r2], r7 + mov eax, r10d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; IADD_R r5, r6 + add r13, r14 + ; FPSUB_R f1, a2 + subpd xmm1, xmm10 + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; FPSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IROL_R r2, r6 + mov ecx, r14d + rol r10, cl + ; IMUL_R r0, r4 + imul r8, r12 + ; FPSUB_R f0, a2 + subpd xmm0, xmm10 + ; ISUB_R r6, r7 + sub r14, r15 + ; IROL_R r4, r7 + mov ecx, r15d + rol r12, cl + ; FPMUL_R e2, a0 + mulpd xmm6, xmm8 + ; ISUB_R r1, r3 + sub r9, r11 + ; FPDIV_R e0, a1 + divpd xmm4, xmm9 + maxpd xmm4, xmm13 + ; FPADD_R f0, a1 + addpd xmm0, xmm9 + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; FPSUB_R f2, a2 + subpd xmm2, xmm10 + ; FPSUB_M f2, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; FPMUL_R e0, a0 + mulpd xmm4, xmm8 + ; IXOR_M r4, L2[r7] + mov eax, r15d + and eax, 262136 + xor r12, qword ptr [rsi+rax] + ; FPSUB_R f3, a3 + subpd xmm3, xmm11 + ; ISMULH_R r1, r6 + mov rax, r9 + imul r14 + mov r9, rdx + ; COND_R r4, be(r7, 224524971) + xor ecx, ecx + cmp r15d, 224524971 + setbe cl + add r12, rcx + ; FPADD_M f2, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 + ; IMUL_R r5, r4 + imul r13, r12 + ; IADD_RC r1, r5, 370966979 + lea r9, [r9+r13+370966979] + ; IADD_RC r7, r3, -1762209698 + lea r15, [r15+r11-1762209698] + ; FPMUL_M e3, L2[r2] + mov eax, r10d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; ISUB_R r2, r7 + sub r10, r15 + ; IMUL_9C r3, 171157280 + lea r11, [r11+r11*8+171157280] + ; ISUB_R r3, r5 + sub r11, r13 + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; FPNEG_R f2 + xorps xmm2, xmm15 + ; ISTORE L1[r4], r1 + mov eax, r12d + and eax, 16376 + mov qword ptr [rsi+rax], r9 + ; IADD_R r0, r2 + add r8, r10 + ; IXOR_R r7, r6 + xor r15, r14 + ; IROR_R r0, r4 + mov ecx, r12d + ror r8, cl + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IXOR_M r4, L1[r7] + mov eax, r15d + and eax, 16376 + xor r12, qword ptr [rsi+rax] + ; ISTORE L1[r5], r7 + mov eax, r13d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; IMUL_9C r7, -1206742834 + lea r15, [r15+r15*8-1206742834] + ; ISMULH_R r0, r4 + mov rax, r8 + imul r12 + mov r8, rdx + ; FPADD_R f2, a0 + addpd xmm2, xmm8 + ; FPSUB_R f1, a0 + subpd xmm1, xmm8 + ; INEG_R r7 + neg r15 + ; COND_M r0, of(L1[r5], -2056260506) + xor ecx, ecx + mov eax, r13d + and eax, 16376 + cmp dword ptr [rsi+rax], -2056260506 + seto cl + add r8, rcx + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IMUL_R r3, r4 + imul r11, r12 + ; FPNEG_R f1 + xorps xmm1, xmm15 + ; FPADD_M f2, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; FPMUL_M e3, L2[r5] + mov eax, r13d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; ISTORE L1[r2], r2 + mov eax, r10d + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; IMUL_M r3, L2[r4] + mov eax, r12d + and eax, 262136 + imul r11, qword ptr [rsi+rax] + ; IROL_R r5, r6 + mov ecx, r14d + rol r13, cl + ; IADD_RC r4, r3, -904431293 + lea r12, [r12+r11-904431293] + ; FPSUB_R f1, a1 + subpd xmm1, xmm9 + ; IROL_R r7, r0 + mov ecx, r8d + rol r15, cl + ; ISTORE L2[r1], r7 + mov eax, r9d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; IROL_R r4, r3 + mov ecx, r11d + rol r12, cl + ; IADD_R r5, r2 + add r13, r10 + ; COND_R r3, ge(r6, -444806705) + xor ecx, ecx + cmp r14d, -444806705 + setge cl + add r11, rcx + ; FPADD_R f0, a1 + addpd xmm0, xmm9 + ; IROL_R r0, 57 + rol r8, 57 + ; IADD_R r0, r2 + add r8, r10 + ; IADD_R r7, r4 + add r15, r12 + ; IROL_R r1, r7 + mov ecx, r15d + rol r9, cl + ; IXOR_M r7, L2[r5] + mov eax, r13d + and eax, 262136 + xor r15, qword ptr [rsi+rax] + ; ISTORE L1[r2], r0 + mov eax, r10d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; ISUB_R r1, r4 + sub r9, r12 + ; IXOR_R r5, r0 + xor r13, r8 + ; IXOR_M r7, L2[r1] + mov eax, r9d + and eax, 262136 + xor r15, qword ptr [rsi+rax] + ; FPSUB_R f0, a0 + subpd xmm0, xmm8 + ; IXOR_M r1, L1[r4] + mov eax, r12d + and eax, 16376 + xor r9, qword ptr [rsi+rax] + ; FPMUL_R e3, a0 + mulpd xmm7, xmm8 + ; ISDIV_C r1, 1473744194 + mov rax, -5006799265644655925 + imul r9 + xor eax, eax + add rdx, r9 + sar rdx, 30 + sets al + add rdx, rax + add r9, rdx + ; IMUL_9C r1, 1626151459 + lea r9, [r9+r9*8+1626151459] + ; IXOR_M r6, L1[r4] + mov eax, r12d + and eax, 16376 + xor r14, qword ptr [rsi+rax] + ; FPADD_R f0, a0 + addpd xmm0, xmm8 + ; FPADD_R f3, a2 + addpd xmm3, xmm10 + ; ISUB_R r6, r7 + sub r14, r15 + ; IADD_RC r1, r5, 2075955307 + lea r9, [r9+r13+2075955307] + ; IROL_R r6, r3 + mov ecx, r11d + rol r14, cl + ; IMULH_R r2, -1135671124 + mov eax, -1135671124 + mul r10 + add r10, rdx + ; ISUB_R r5, r2 + sub r13, r10 + ; IMULH_R r3, r5 + mov rax, r11 + mul r13 + mov r11, rdx + ; IADD_M r4, L3[386040] + add r12, qword ptr [rsi+386040] + ; COND_R r6, ge(r4, 1518758207) + xor ecx, ecx + cmp r12d, 1518758207 + setge cl + add r14, rcx + ; FPDIV_R e3, a1 + divpd xmm7, xmm9 + maxpd xmm7, xmm13 + ; FPNEG_R f2 + xorps xmm2, xmm15 + ; FPADD_M f1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm1, xmm12 + ; FPMUL_M e0, L1[r4] + mov eax, r12d + and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] mulpd xmm4, xmm12 maxpd xmm4, xmm13 - ; FPSUB_R f2, a3 - subpd xmm2, xmm11 - ; IMUL_R r1, r2 - imul r9, r10 - ; IDIV_C r7, 3214009572 - mov rax, 12325439725582798855 - mul r15 - shr rdx, 31 - add r15, rdx - ; IMULH_R r3, r2 - mov rax, r11 - mul r10 - mov r11, rdx - ; IROR_R r1, r0 + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IROL_R r5, r1 + mov ecx, r9d + rol r13, cl + ; FPADD_R f3, a0 + addpd xmm3, xmm8 + ; IROL_R r3, r0 mov ecx, r8d - ror r9, cl - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; IADD_RC r4, r4, 1456841848 - lea r12, [r12+r12+1456841848] - ; IROR_R r3, r2 - mov ecx, r10d - ror r11, cl - ; COND_M r0, of(L1[r4], 1678513610) - xor ecx, ecx - mov eax, r12d - and eax, 16376 - cmp dword ptr [rsi+rax], 1678513610 - seto cl - add r8, rcx - ; INEG_R r4 - neg r12 - ; IMUL_R r4, r1 - imul r12, r9 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; FPSUB_R f2, a0 - subpd xmm2, xmm8 - ; FPMUL_R e1, a2 - mulpd xmm5, xmm10 - ; FPSUB_R f0, a3 - subpd xmm0, xmm11 + rol r11, cl + ; FPMUL_R e3, a1 + mulpd xmm7, xmm9 ; IROR_R r0, r7 mov ecx, r15d ror r8, cl - ; ISTORE L2[r1], r4 - mov eax, r9d + ; FPADD_R f2, a2 + addpd xmm2, xmm10 + ; IXOR_R r7, r0 + xor r15, r8 + ; ISTORE L1[r4], r1 + mov eax, r12d + and eax, 16376 + mov qword ptr [rsi+rax], r9 + ; ISTORE L2[r0], r4 + mov eax, r8d and eax, 262136 mov qword ptr [rsi+rax], r12 - ; IROL_R r7, r6 - mov ecx, r14d - rol r15, cl - ; IMUL_9C r2, 266593902 - lea r10, [r10+r10*8+266593902] - ; IMUL_R r4, r6 - imul r12, r14 - ; FPSUB_R f2, a2 - subpd xmm2, xmm10 - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; IROR_R r7, r2 - mov ecx, r10d + ; FPDIV_R e3, a3 + divpd xmm7, xmm11 + maxpd xmm7, xmm13 + ; ISTORE L2[r4], r6 + mov eax, r12d + and eax, 262136 + mov qword ptr [rsi+rax], r14 + ; IMUL_R r3, r1 + imul r11, r9 + ; IXOR_R r2, r4 + xor r10, r12 + ; ISTORE L2[r3], r5 + mov eax, r11d + and eax, 262136 + mov qword ptr [rsi+rax], r13 + ; FPMUL_M e2, L2[r4] + mov eax, r12d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm6, xmm12 + maxpd xmm6, xmm13 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 + ; COND_R r1, ab(r7, -229570354) + xor ecx, ecx + cmp r15d, -229570354 + seta cl + add r9, rcx + ; IROR_R r7, r3 + mov ecx, r11d ror r15, cl - ; IROR_R r0, r5 - mov ecx, r13d - ror r8, cl + ; FPDIV_R e2, a0 + divpd xmm6, xmm8 + maxpd xmm6, xmm13 + ; IADD_R r2, r5 + add r10, r13 + ; FPDIV_R e1, a3 + divpd xmm5, xmm11 + maxpd xmm5, xmm13 + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 + ; ISUB_R r3, r7 + sub r11, r15 + ; FPADD_R f0, a0 + addpd xmm0, xmm8 + ; IMUL_M r0, L3[98136] + imul r8, qword ptr [rsi+98136] + ; IMUL_9C r5, -895487055 + lea r13, [r13+r13*8-895487055] + ; IMULH_R r2, r7 + mov rax, r10 + mul r15 + mov r10, rdx + ; IADD_R r4, r1 + add r12, r9 + ; ISDIV_C r0, 494395999 + mov rax, 5007888582388710937 + imul r8 + xor eax, eax + sar rdx, 27 + sets al + add rdx, rax + add r8, rdx + ; FPSWAP_R e0 + shufpd xmm4, xmm4, 1 + ; IXOR_R r1, r5 + xor r9, r13 + ; COND_R r2, ab(r3, 1932234501) + xor ecx, ecx + cmp r11d, 1932234501 + seta cl + add r10, rcx + ; FPMUL_R e1, a0 + mulpd xmm5, xmm8 + ; FPSUB_M f1, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; FPSUB_R f0, a0 + subpd xmm0, xmm8 + ; IROL_R r1, r7 + mov ecx, r15d + rol r9, cl + ; IADD_RC r0, r5, -2051588680 + lea r8, [r8+r13-2051588680] + ; COND_R r6, of(r5, -795593984) + xor ecx, ecx + cmp r13d, -795593984 + seto cl + add r14, rcx + ; FPADD_R f1, a0 + addpd xmm1, xmm8 + ; IMULH_R r7, r3 + mov rax, r15 + mul r11 + mov r15, rdx + ; ISUB_R r7, r4 + sub r15, r12 + ; IROL_R r0, r6 + mov ecx, r14d + rol r8, cl + ; ISDIV_C r1, -675825513 + mov rax, -7326980207007250257 + imul r9 + xor eax, eax + sar rdx, 28 + sets al + add rdx, rax + add r9, rdx + ; ISTORE L1[r6], r3 + mov eax, r14d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IROR_R r4, r3 + mov ecx, r11d + ror r12, cl + ; IDIV_C r4, 3919226376 + mov rax, r12 + shr rax, 3 + mov rcx, 2526906936258851663 + mul rcx + shr rdx, 26 + add r12, rdx + ; FPSUB_R f1, a1 + subpd xmm1, xmm9 + ; FPSUB_R f0, a0 + subpd xmm0, xmm8 + ; IADD_R r0, r2 + add r8, r10 + ; IADD_M r4, L1[r2] + mov eax, r10d + and eax, 16376 + add r12, qword ptr [rsi+rax] + ; ISTORE L1[r7], r2 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; IADD_R r5, r4 + add r13, r12 + ; IXOR_R r6, r7 + xor r14, r15 + ; ISMULH_R r4, r7 + mov rax, r12 + imul r15 + mov r12, rdx + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5