diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
index 3092e4d..70d396b 100644
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -81,7 +81,7 @@ namespace RandomX {
}
int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) {
- return instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
+ return instr.imm32 & ScratchpadL3Mask;
}
//1 uOP
diff --git a/src/Instruction.cpp b/src/Instruction.cpp
index ce75f43..8a175fc 100644
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@@ -37,7 +37,7 @@ namespace RandomX {
}
void Instruction::genAddressImm(std::ostream& os) const {
- os << ((mod % 4) ? "L1" : "L2") << "[" << (imm32 & ((mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
+ os << "L3" << "[" << (imm32 & ScratchpadL3Mask) << "]";
}
void Instruction::h_IADD_R(std::ostream& os) const {
diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp
index d8e7a42..e926e4a 100644
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -262,7 +262,7 @@ namespace RandomX {
}
void JitCompilerX86::genAddressImm(Instruction& instr) {
- emit32(instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask));
+ emit32(instr.imm32 & ScratchpadL3Mask);
}
void JitCompilerX86::h_IADD_R(Instruction& instr) {
diff --git a/src/common.hpp b/src/common.hpp
index e52dbc2..ea67ff9 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -68,7 +68,7 @@ namespace RandomX {
};
constexpr int ProgramLength = 256;
- constexpr uint32_t InstructionCount = 1024;
+ constexpr uint32_t InstructionCount = 2048;
constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024;
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t);
@@ -78,6 +78,7 @@ namespace RandomX {
constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8;
constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16;
constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16;
+ constexpr int ScratchpadL3Mask = (ScratchpadLength - 1) * 8;
constexpr uint32_t TransformationCount = 90;
constexpr int RegistersCount = 8;
diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp
index d24800e..0bb26ff 100644
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@@ -25,7 +25,7 @@ along with RandomX. If not, see <http://www.gnu.org/licenses/>.
#define WT_IADD_RC 12
#define WT_ISUB_R 12
#define WT_ISUB_M 3
-#define WT_IMUL_9C 10
+#define WT_IMUL_9C 9
#define WT_IMUL_R 16
#define WT_IMUL_M 4
#define WT_IMULH_R 4
@@ -36,7 +36,7 @@ along with RandomX. If not, see <http://www.gnu.org/licenses/>.
#define WT_ISDIV_C 4
#define WT_INEG_R 2
#define WT_IXOR_R 12
-#define WT_IXOR_M 3
+#define WT_IXOR_M 4
#define WT_IROR_R 10
#define WT_IROL_R 10
#define WT_ISWAP_R 4
diff --git a/src/main.cpp b/src/main.cpp
index c761b97..58b381c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -172,7 +172,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
//vm->initializeScratchpad(scratchpad, spIndex);
vm->setScratchpad(scratchpad);
//dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt");
- for (int chain = 0; chain < 16; ++chain) {
+ for (int chain = 0; chain < 8; ++chain) {
vm->initializeProgram(hash);
vm->execute();
vm->getResult(nullptr, 0, hash);
diff --git a/src/program.inc b/src/program.inc
index d901e9a..e4de06f 100644
--- a/src/program.inc
+++ b/src/program.inc
@@ -1,381 +1,768 @@
- ; FPMUL_R e0, a2
- mulpd xmm4, xmm10
- ; IADD_RC r2, r5, -1621224194
- lea r10, [r10+r13-1621224194]
- ; ISTORE L2[r2], r7
- mov eax, r10d
+ ; IMUL_R r0, r7
+ imul r8, r15
+ ; ISMULH_R r2, r1
+ mov rax, r10
+ imul r9
+ mov r10, rdx
+ ; IMUL_R r2, r4
+ imul r10, r12
+ ; IADD_R r7, r0
+ add r15, r8
+ ; FPSQRT_R e0
+ sqrtpd xmm4, xmm4
+ ; IMUL_R r3, r6
+ imul r11, r14
+ ; FPMUL_R e3, a1
+ mulpd xmm7, xmm9
+ ; IMULH_M r6, L1[r3]
+ mov ecx, r11d
+ and ecx, 16376
+ mov rax, r14
+ mul qword ptr [rsi+rcx]
+ mov r14, rdx
+ ; IMUL_R r5, r1
+ imul r13, r9
+ ; FPADD_M f0, L2[r6]
+ mov eax, r14d
and eax, 262136
- mov qword ptr [rsi+rax], r15
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm0, xmm12
+ ; IROR_R r4, r3
+ mov ecx, r11d
+ ror r12, cl
+ ; IXOR_M r4, L3[984888]
+ xor r12, qword ptr [rsi+984888]
+ ; IROR_R r0, r3
+ mov ecx, r11d
+ ror r8, cl
+ ; IROR_R r0, r4
+ mov ecx, r12d
+ ror r8, cl
+ ; FPMUL_R e0, a1
+ mulpd xmm4, xmm9
+ ; IMUL_R r0, r2
+ imul r8, r10
+ ; ISUB_M r0, L1[r3]
+ mov eax, r11d
+ and eax, 16376
+ sub r8, qword ptr [rsi+rax]
+ ; FPSUB_R f3, a1
+ subpd xmm3, xmm9
+ ; ISWAP_R r7, r4
+ xchg r15, r12
+ ; IDIV_C r1, 3690475308
+ mov rax, r9
+ shr rax, 2
+ mov rcx, 5367070356934653253
+ mul rcx
+ shr rdx, 28
+ add r9, rdx
+ ; IROL_R r4, r2
+ mov ecx, r10d
+ rol r12, cl
+ ; IMUL_M r5, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ imul r13, qword ptr [rsi+rax]
+ ; IROL_R r4, r7
+ mov ecx, r15d
+ rol r12, cl
+ ; ISUB_R r3, r1
+ sub r11, r9
+ ; IADD_R r7, r0
+ add r15, r8
+ ; IADD_M r1, L1[r3]
+ mov eax, r11d
+ and eax, 16376
+ add r9, qword ptr [rsi+rax]
; FPMUL_R e2, a2
mulpd xmm6, xmm10
- ; IMUL_R r6, r3
- imul r14, r11
- ; FPSUB_M f1, L1[r4]
- mov eax, r12d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm1, xmm12
- ; IROL_R r5, r3
- mov ecx, r11d
- rol r13, cl
- ; FPMUL_R e2, a0
- mulpd xmm6, xmm8
- ; FPSUB_R f3, a0
- subpd xmm3, xmm8
- ; IXOR_R r0, r4
- xor r8, r12
- ; ISMULH_M r3, L1[r7]
- mov ecx, r15d
- and ecx, 16376
- mov rax, r11
- imul qword ptr [rsi+rcx]
- mov r11, rdx
- ; FPSWAP_R f2
- shufpd xmm2, xmm2, 1
- ; IDIV_C r6, 1248528248
- mov rax, 15864311168205210203
- mul r14
- shr rdx, 30
- add r14, rdx
- ; FPMUL_R e0, a2
- mulpd xmm4, xmm10
- ; IADD_RC r3, r4, -52260428
- lea r11, [r11+r12-52260428]
- ; IADD_R r7, -1138617760
- add r15, -1138617760
- ; IROL_R r2, r6
- mov ecx, r14d
- rol r10, cl
- ; FPNEG_R f2
- xorps xmm2, xmm15
- ; IROR_R r7, r1
- mov ecx, r9d
- ror r15, cl
- ; COND_R r2, lt(r7, -41618808)
- xor ecx, ecx
- cmp r15d, -41618808
- setl cl
- add r10, rcx
- ; FPMUL_R e3, a0
- mulpd xmm7, xmm8
- ; CFROUND r1, 43
- mov rax, r9
- rol rax, 34
- and eax, 24576
- or eax, 40896
- mov dword ptr [rsp-8], eax
- ldmxcsr dword ptr [rsp-8]
- ; FPADD_R f2, a1
- addpd xmm2, xmm9
- ; FPSUB_M f0, L1[r7]
- mov eax, r15d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm0, xmm12
- ; ISTORE L1[r6], r2
- mov eax, r14d
- and eax, 16376
- mov qword ptr [rsi+rax], r10
- ; ISUB_R r6, r5
- sub r14, r13
- ; IADD_M r0, L1[r4]
- mov eax, r12d
- and eax, 16376
- add r8, qword ptr [rsi+rax]
+ ; IADD_R r6, -1115286770
+ add r14, -1115286770
+ ; FPDIV_R e2, a3
+ divpd xmm6, xmm11
+ maxpd xmm6, xmm13
+ ; FPADD_R f1, a2
+ addpd xmm1, xmm10
+ ; IXOR_R r3, r7
+ xor r11, r15
; ISTORE L1[r4], r3
mov eax, r12d
and eax, 16376
mov qword ptr [rsi+rax], r11
- ; ISTORE L1[r6], r6
- mov eax, r14d
- and eax, 16376
- mov qword ptr [rsi+rax], r14
- ; FPSQRT_R e0
- sqrtpd xmm4, xmm4
- ; IXOR_R r2, r5
- xor r10, r13
- ; FPSQRT_R e1
- sqrtpd xmm5, xmm5
- ; FPMUL_R e1, a3
- mulpd xmm5, xmm11
- ; IMULH_R r7, r6
- mov rax, r15
- mul r14
- mov r15, rdx
- ; ISDIV_C r0, -1706892622
- mov rax, -5802075764249827661
- imul r8
- xor eax, eax
- sar rdx, 29
- sets al
- add rdx, rax
- add r8, rdx
- ; IMUL_R r5, r3
- imul r13, r11
- ; FPSQRT_R e2
- sqrtpd xmm6, xmm6
- ; FPADD_M f3, L1[r4]
- mov eax, r12d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm3, xmm12
- ; IADD_R r3, r2
- add r11, r10
- ; FPADD_R f1, a0
- addpd xmm1, xmm8
- ; FPDIV_R e3, a2
- divpd xmm7, xmm10
- maxpd xmm7, xmm13
- ; FPSUB_R f0, a1
- subpd xmm0, xmm9
- ; IMUL_M r5, L1[r6]
- mov eax, r14d
- and eax, 16376
- imul r13, qword ptr [rsi+rax]
- ; IADD_RC r1, r2, -1263285243
- lea r9, [r9+r10-1263285243]
- ; IMUL_9C r4, 1994773931
- lea r12, [r12+r12*8+1994773931]
- ; FPSWAP_R e3
- shufpd xmm7, xmm7, 1
- ; IMUL_M r0, L1[r7]
- mov eax, r15d
- and eax, 16376
- imul r8, qword ptr [rsi+rax]
- ; IROR_R r1, r6
+ ; IROR_R r3, r6
mov ecx, r14d
- ror r9, cl
- ; IROL_R r2, r4
- mov ecx, r12d
- rol r10, cl
- ; FPSUB_R f3, a1
- subpd xmm3, xmm9
- ; ISTORE L1[r0], r5
- mov eax, r8d
- and eax, 16376
- mov qword ptr [rsi+rax], r13
- ; FPDIV_M e2, L2[r3]
- mov eax, r11d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- divpd xmm6, xmm12
- maxpd xmm6, xmm13
- ; FPSWAP_R f2
- shufpd xmm2, xmm2, 1
- ; IADD_R r7, r5
- add r15, r13
- ; FPDIV_M e0, L1[r4]
- mov eax, r12d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- divpd xmm4, xmm12
- maxpd xmm4, xmm13
- ; FPADD_M f3, L1[r5]
- mov eax, r13d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm3, xmm12
- ; FPADD_R f0, a3
- addpd xmm0, xmm11
- ; IADD_R r2, r0
- add r10, r8
- ; ISTORE L1[r3], r6
- mov eax, r11d
- and eax, 16376
- mov qword ptr [rsi+rax], r14
- ; IROR_R r1, r7
- mov ecx, r15d
- ror r9, cl
- ; IMUL_9C r5, 301671287
- lea r13, [r13+r13*8+301671287]
- ; IXOR_R r7, 266992378
- xor r15, 266992378
- ; FPSQRT_R e3
- sqrtpd xmm7, xmm7
- ; IMUL_M r2, L2[r0]
- mov eax, r8d
- and eax, 262136
- imul r10, qword ptr [rsi+rax]
+ ror r11, cl
+ ; ISMULH_R r0, r6
+ mov rax, r8
+ imul r14
+ mov r8, rdx
+ ; IROR_R r6, r5
+ mov ecx, r13d
+ ror r14, cl
+ ; IMULH_M r6, L2[r0]
+ mov ecx, r8d
+ and ecx, 262136
+ mov rax, r14
+ mul qword ptr [rsi+rcx]
+ mov r14, rdx
+ ; ISUB_R r2, 1512125960
+ sub r10, 1512125960
+ ; IMUL_R r7, r6
+ imul r15, r14
+ ; IMULH_R r6, r7
+ mov rax, r14
+ mul r15
+ mov r14, rdx
+ ; ISUB_R r4, r1
+ sub r12, r9
; FPMUL_R e3, a2
mulpd xmm7, xmm10
- ; IMUL_R r0, r6
- imul r8, r14
- ; ISTORE L1[r0], r7
- mov eax, r8d
- and eax, 16376
- mov qword ptr [rsi+rax], r15
- ; FPNEG_R f0
- xorps xmm0, xmm15
- ; FPADD_M f3, L1[r5]
- mov eax, r13d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm3, xmm12
- ; IROR_R r5, r4
- mov ecx, r12d
- ror r13, cl
- ; ISTORE L2[r7], r2
- mov eax, r15d
- and eax, 262136
- mov qword ptr [rsi+rax], r10
- ; FPADD_R f2, a3
- addpd xmm2, xmm11
- ; FPADD_M f3, L1[r2]
- mov eax, r10d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm3, xmm12
- ; ISDIV_C r5, -2076168315
- mov rax, -4770095103914078469
- imul r13
- xor eax, eax
- sar rdx, 29
- sets al
- add rdx, rax
- add r13, rdx
- ; IADD_RC r0, r4, -1321374359
- lea r8, [r8+r12-1321374359]
- ; CFROUND r6, 28
- mov rax, r14
- rol rax, 49
- and eax, 24576
- or eax, 40896
- mov dword ptr [rsp-8], eax
- ldmxcsr dword ptr [rsp-8]
- ; FPADD_R f2, a2
- addpd xmm2, xmm10
- ; IROL_R r7, r6
- mov ecx, r14d
- rol r15, cl
- ; ISUB_R r2, r4
- sub r10, r12
- ; ISMULH_R r0, -1500893068
- mov rax, -1500893068
- imul r8
- add r8, rdx
- ; IADD_R r2, r3
- add r10, r11
- ; FPSQRT_R e2
- sqrtpd xmm6, xmm6
- ; IROL_R r7, r4
- mov ecx, r12d
- rol r15, cl
- ; IMUL_R r4, r2
- imul r12, r10
- ; ISUB_R r3, r7
- sub r11, r15
- ; IADD_R r2, r7
- add r10, r15
- ; FPDIV_R e3, a0
- divpd xmm7, xmm8
- maxpd xmm7, xmm13
- ; ISUB_R r6, 540663146
- sub r14, 540663146
- ; IROL_R r5, 58
- rol r13, 58
- ; FPADD_R f2, a1
- addpd xmm2, xmm9
- ; FPADD_R f2, a2
- addpd xmm2, xmm10
- ; FPDIV_R e1, a2
- divpd xmm5, xmm10
- maxpd xmm5, xmm13
- ; FPADD_R f1, a2
- addpd xmm1, xmm10
- ; IADD_R r5, r3
- add r13, r11
- ; IADD_R r7, -1780268176
- add r15, -1780268176
- ; ISUB_R r7, r0
- sub r15, r8
- ; ISTORE L2[r0], r7
- mov eax, r8d
- and eax, 262136
- mov qword ptr [rsi+rax], r15
- ; INEG_R r2
- neg r10
- ; FPNEG_R f0
- xorps xmm0, xmm15
- ; INEG_R r2
- neg r10
- ; IADD_R r0, r3
- add r8, r11
- ; IMUL_9C r7, -2124093035
- lea r15, [r15+r15*8-2124093035]
+ ; FPSQRT_R e1
+ sqrtpd xmm5, xmm5
+ ; IXOR_R r5, r2
+ xor r13, r10
; FPADD_M f2, L1[r0]
mov eax, r8d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
addpd xmm2, xmm12
- ; FPMUL_M e0, L1[r6]
+ ; IMULH_R r6, r1
+ mov rax, r14
+ mul r9
+ mov r14, rdx
+ ; ISUB_M r5, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ sub r13, qword ptr [rsi+rax]
+ ; FPMUL_R e2, a3
+ mulpd xmm6, xmm11
+ ; IMUL_R r4, r6
+ imul r12, r14
+ ; FPMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; ISUB_R r3, r2
+ sub r11, r10
+ ; FPMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IROL_R r7, r0
+ mov ecx, r8d
+ rol r15, cl
+ ; FPSUB_R f3, a2
+ subpd xmm3, xmm10
+ ; IROL_R r3, r7
+ mov ecx, r15d
+ rol r11, cl
+ ; ISWAP_R r5, r7
+ xchg r13, r15
+ ; IDIV_C r5, 749951529
+ mov rax, 13205547200481862341
+ mul r13
+ shr rdx, 29
+ add r13, rdx
+ ; FPADD_R f3, a0
+ addpd xmm3, xmm8
+ ; IMUL_M r0, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ imul r8, qword ptr [rsi+rax]
+ ; FPADD_R f1, a1
+ addpd xmm1, xmm9
+ ; IROR_R r2, 60
+ ror r10, 60
+ ; IROR_R r5, r4
+ mov ecx, r12d
+ ror r13, cl
+ ; FPADD_R f2, a0
+ addpd xmm2, xmm8
+ ; IXOR_M r4, L1[r6]
mov eax, r14d
and eax, 16376
+ xor r12, qword ptr [rsi+rax]
+ ; IXOR_R r2, r6
+ xor r10, r14
+ ; FPADD_M f3, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm3, xmm12
+ ; ISUB_R r7, r6
+ sub r15, r14
+ ; IMUL_9C r2, -962375579
+ lea r10, [r10+r10*8-962375579]
+ ; FPSUB_R f3, a2
+ subpd xmm3, xmm10
+ ; FPSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; IMUL_R r1, r5
+ imul r9, r13
+ ; IMUL_R r6, r4
+ imul r14, r12
+ ; ISWAP_R r0, r2
+ xchg r8, r10
+ ; ISUB_R r6, r5
+ sub r14, r13
+ ; FPSUB_R f2, a1
+ subpd xmm2, xmm9
+ ; ISDIV_C r6, 652931802
+ mov rax, -3278972671018643631
+ imul r14
+ xor eax, eax
+ add rdx, r14
+ sar rdx, 29
+ sets al
+ add rdx, rax
+ add r14, rdx
+ ; IMUL_9C r5, -1142924545
+ lea r13, [r13+r13*8-1142924545]
+ ; ISUB_R r7, 1085161834
+ sub r15, 1085161834
+ ; IMUL_R r4, r6
+ imul r12, r14
+ ; FPMUL_M e1, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm5, xmm12
+ maxpd xmm5, xmm13
+ ; FPMUL_M e3, L2[r1]
+ mov eax, r9d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm7, xmm12
+ maxpd xmm7, xmm13
+ ; COND_R r2, lt(r5, 1635027096)
+ xor ecx, ecx
+ cmp r13d, 1635027096
+ setl cl
+ add r10, rcx
+ ; IMUL_R r5, -1219696062
+ imul r13, -1219696062
+ ; IXOR_R r5, r0
+ xor r13, r8
+ ; FPNEG_R f2
+ xorps xmm2, xmm15
+ ; FPADD_R f3, a2
+ addpd xmm3, xmm10
+ ; FPSUB_R f1, a3
+ subpd xmm1, xmm11
+ ; FPADD_R f1, a2
+ addpd xmm1, xmm10
+ ; FPDIV_R e1, a3
+ divpd xmm5, xmm11
+ maxpd xmm5, xmm13
+ ; IXOR_M r6, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ xor r14, qword ptr [rsi+rax]
+ ; ISUB_R r7, r4
+ sub r15, r12
+ ; ISUB_M r6, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ sub r14, qword ptr [rsi+rax]
+ ; ISTORE L1[r5], r3
+ mov eax, r13d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r11
+ ; IMUL_R r5, r1
+ imul r13, r9
+ ; IROR_R r3, r2
+ mov ecx, r10d
+ ror r11, cl
+ ; IMUL_R r4, r7
+ imul r12, r15
+ ; ISDIV_C r6, -54134756
+ mov rax, 7012869325244995177
+ imul r14
+ xor eax, eax
+ sub rdx, r14
+ sar rdx, 25
+ sets al
+ add rdx, rax
+ add r14, rdx
+ ; FPMUL_R e1, a2
+ mulpd xmm5, xmm10
+ ; FPSUB_M f2, L2[r4]
+ mov eax, r12d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm2, xmm12
+ ; IMUL_R r0, r5
+ imul r8, r13
+ ; FPMUL_R e3, a0
+ mulpd xmm7, xmm8
+ ; COND_R r5, be(r4, 1545677311)
+ xor ecx, ecx
+ cmp r12d, 1545677311
+ setbe cl
+ add r13, rcx
+ ; IMUL_R r6, r3
+ imul r14, r11
+ ; IROL_R r6, r2
+ mov ecx, r10d
+ rol r14, cl
+ ; FPDIV_R e3, a1
+ divpd xmm7, xmm9
+ maxpd xmm7, xmm13
+ ; IXOR_M r5, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ xor r13, qword ptr [rsi+rax]
+ ; COND_R r3, ab(r2, 1734636060)
+ xor ecx, ecx
+ cmp r10d, 1734636060
+ seta cl
+ add r11, rcx
+ ; ISTORE L1[r2], r7
+ mov eax, r10d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r15
+ ; IADD_R r5, r6
+ add r13, r14
+ ; FPSUB_R f1, a2
+ subpd xmm1, xmm10
+ ; FPADD_R f2, a1
+ addpd xmm2, xmm9
+ ; FPSWAP_R f1
+ shufpd xmm1, xmm1, 1
+ ; IROL_R r2, r6
+ mov ecx, r14d
+ rol r10, cl
+ ; IMUL_R r0, r4
+ imul r8, r12
+ ; FPSUB_R f0, a2
+ subpd xmm0, xmm10
+ ; ISUB_R r6, r7
+ sub r14, r15
+ ; IROL_R r4, r7
+ mov ecx, r15d
+ rol r12, cl
+ ; FPMUL_R e2, a0
+ mulpd xmm6, xmm8
+ ; ISUB_R r1, r3
+ sub r9, r11
+ ; FPDIV_R e0, a1
+ divpd xmm4, xmm9
+ maxpd xmm4, xmm13
+ ; FPADD_R f0, a1
+ addpd xmm0, xmm9
+ ; FPMUL_R e0, a2
+ mulpd xmm4, xmm10
+ ; FPSUB_R f2, a2
+ subpd xmm2, xmm10
+ ; FPSUB_M f2, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm2, xmm12
+ ; FPMUL_R e0, a0
+ mulpd xmm4, xmm8
+ ; IXOR_M r4, L2[r7]
+ mov eax, r15d
+ and eax, 262136
+ xor r12, qword ptr [rsi+rax]
+ ; FPSUB_R f3, a3
+ subpd xmm3, xmm11
+ ; ISMULH_R r1, r6
+ mov rax, r9
+ imul r14
+ mov r9, rdx
+ ; COND_R r4, be(r7, 224524971)
+ xor ecx, ecx
+ cmp r15d, 224524971
+ setbe cl
+ add r12, rcx
+ ; FPADD_M f2, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm2, xmm12
+ ; IMUL_R r5, r4
+ imul r13, r12
+ ; IADD_RC r1, r5, 370966979
+ lea r9, [r9+r13+370966979]
+ ; IADD_RC r7, r3, -1762209698
+ lea r15, [r15+r11-1762209698]
+ ; FPMUL_M e3, L2[r2]
+ mov eax, r10d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm7, xmm12
+ maxpd xmm7, xmm13
+ ; ISUB_R r2, r7
+ sub r10, r15
+ ; IMUL_9C r3, 171157280
+ lea r11, [r11+r11*8+171157280]
+ ; ISUB_R r3, r5
+ sub r11, r13
+ ; FPNEG_R f3
+ xorps xmm3, xmm15
+ ; FPNEG_R f2
+ xorps xmm2, xmm15
+ ; ISTORE L1[r4], r1
+ mov eax, r12d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r9
+ ; IADD_R r0, r2
+ add r8, r10
+ ; IXOR_R r7, r6
+ xor r15, r14
+ ; IROR_R r0, r4
+ mov ecx, r12d
+ ror r8, cl
+ ; FPMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IXOR_M r4, L1[r7]
+ mov eax, r15d
+ and eax, 16376
+ xor r12, qword ptr [rsi+rax]
+ ; ISTORE L1[r5], r7
+ mov eax, r13d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r15
+ ; IMUL_9C r7, -1206742834
+ lea r15, [r15+r15*8-1206742834]
+ ; ISMULH_R r0, r4
+ mov rax, r8
+ imul r12
+ mov r8, rdx
+ ; FPADD_R f2, a0
+ addpd xmm2, xmm8
+ ; FPSUB_R f1, a0
+ subpd xmm1, xmm8
+ ; INEG_R r7
+ neg r15
+ ; COND_M r0, of(L1[r5], -2056260506)
+ xor ecx, ecx
+ mov eax, r13d
+ and eax, 16376
+ cmp dword ptr [rsi+rax], -2056260506
+ seto cl
+ add r8, rcx
+ ; FPSQRT_R e2
+ sqrtpd xmm6, xmm6
+ ; IMUL_R r3, r4
+ imul r11, r12
+ ; FPNEG_R f1
+ xorps xmm1, xmm15
+ ; FPADD_M f2, L1[r5]
+ mov eax, r13d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm2, xmm12
+ ; FPSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; FPNEG_R f3
+ xorps xmm3, xmm15
+ ; FPMUL_M e3, L2[r5]
+ mov eax, r13d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm7, xmm12
+ maxpd xmm7, xmm13
+ ; ISTORE L1[r2], r2
+ mov eax, r10d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r10
+ ; IMUL_M r3, L2[r4]
+ mov eax, r12d
+ and eax, 262136
+ imul r11, qword ptr [rsi+rax]
+ ; IROL_R r5, r6
+ mov ecx, r14d
+ rol r13, cl
+ ; IADD_RC r4, r3, -904431293
+ lea r12, [r12+r11-904431293]
+ ; FPSUB_R f1, a1
+ subpd xmm1, xmm9
+ ; IROL_R r7, r0
+ mov ecx, r8d
+ rol r15, cl
+ ; ISTORE L2[r1], r7
+ mov eax, r9d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r15
+ ; IROL_R r4, r3
+ mov ecx, r11d
+ rol r12, cl
+ ; IADD_R r5, r2
+ add r13, r10
+ ; COND_R r3, ge(r6, -444806705)
+ xor ecx, ecx
+ cmp r14d, -444806705
+ setge cl
+ add r11, rcx
+ ; FPADD_R f0, a1
+ addpd xmm0, xmm9
+ ; IROL_R r0, 57
+ rol r8, 57
+ ; IADD_R r0, r2
+ add r8, r10
+ ; IADD_R r7, r4
+ add r15, r12
+ ; IROL_R r1, r7
+ mov ecx, r15d
+ rol r9, cl
+ ; IXOR_M r7, L2[r5]
+ mov eax, r13d
+ and eax, 262136
+ xor r15, qword ptr [rsi+rax]
+ ; ISTORE L1[r2], r0
+ mov eax, r10d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r8
+ ; FPADD_R f1, a2
+ addpd xmm1, xmm10
+ ; ISUB_R r1, r4
+ sub r9, r12
+ ; IXOR_R r5, r0
+ xor r13, r8
+ ; IXOR_M r7, L2[r1]
+ mov eax, r9d
+ and eax, 262136
+ xor r15, qword ptr [rsi+rax]
+ ; FPSUB_R f0, a0
+ subpd xmm0, xmm8
+ ; IXOR_M r1, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ xor r9, qword ptr [rsi+rax]
+ ; FPMUL_R e3, a0
+ mulpd xmm7, xmm8
+ ; ISDIV_C r1, 1473744194
+ mov rax, -5006799265644655925
+ imul r9
+ xor eax, eax
+ add rdx, r9
+ sar rdx, 30
+ sets al
+ add rdx, rax
+ add r9, rdx
+ ; IMUL_9C r1, 1626151459
+ lea r9, [r9+r9*8+1626151459]
+ ; IXOR_M r6, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ xor r14, qword ptr [rsi+rax]
+ ; FPADD_R f0, a0
+ addpd xmm0, xmm8
+ ; FPADD_R f3, a2
+ addpd xmm3, xmm10
+ ; ISUB_R r6, r7
+ sub r14, r15
+ ; IADD_RC r1, r5, 2075955307
+ lea r9, [r9+r13+2075955307]
+ ; IROL_R r6, r3
+ mov ecx, r11d
+ rol r14, cl
+ ; IMULH_R r2, -1135671124
+ mov eax, -1135671124
+ mul r10
+ add r10, rdx
+ ; ISUB_R r5, r2
+ sub r13, r10
+ ; IMULH_R r3, r5
+ mov rax, r11
+ mul r13
+ mov r11, rdx
+ ; IADD_M r4, L3[386040]
+ add r12, qword ptr [rsi+386040]
+ ; COND_R r6, ge(r4, 1518758207)
+ xor ecx, ecx
+ cmp r12d, 1518758207
+ setge cl
+ add r14, rcx
+ ; FPDIV_R e3, a1
+ divpd xmm7, xmm9
+ maxpd xmm7, xmm13
+ ; FPNEG_R f2
+ xorps xmm2, xmm15
+ ; FPADD_M f1, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm1, xmm12
+ ; FPMUL_M e0, L1[r4]
+ mov eax, r12d
+ and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
mulpd xmm4, xmm12
maxpd xmm4, xmm13
- ; FPSUB_R f2, a3
- subpd xmm2, xmm11
- ; IMUL_R r1, r2
- imul r9, r10
- ; IDIV_C r7, 3214009572
- mov rax, 12325439725582798855
- mul r15
- shr rdx, 31
- add r15, rdx
- ; IMULH_R r3, r2
- mov rax, r11
- mul r10
- mov r11, rdx
- ; IROR_R r1, r0
+ ; FPSQRT_R e2
+ sqrtpd xmm6, xmm6
+ ; IROL_R r5, r1
+ mov ecx, r9d
+ rol r13, cl
+ ; FPADD_R f3, a0
+ addpd xmm3, xmm8
+ ; IROL_R r3, r0
mov ecx, r8d
- ror r9, cl
- ; FPMUL_R e0, a1
- mulpd xmm4, xmm9
- ; IADD_RC r4, r4, 1456841848
- lea r12, [r12+r12+1456841848]
- ; IROR_R r3, r2
- mov ecx, r10d
- ror r11, cl
- ; COND_M r0, of(L1[r4], 1678513610)
- xor ecx, ecx
- mov eax, r12d
- and eax, 16376
- cmp dword ptr [rsi+rax], 1678513610
- seto cl
- add r8, rcx
- ; INEG_R r4
- neg r12
- ; IMUL_R r4, r1
- imul r12, r9
- ; FPADD_R f1, a2
- addpd xmm1, xmm10
- ; FPSUB_R f2, a0
- subpd xmm2, xmm8
- ; FPMUL_R e1, a2
- mulpd xmm5, xmm10
- ; FPSUB_R f0, a3
- subpd xmm0, xmm11
+ rol r11, cl
+ ; FPMUL_R e3, a1
+ mulpd xmm7, xmm9
; IROR_R r0, r7
mov ecx, r15d
ror r8, cl
- ; ISTORE L2[r1], r4
- mov eax, r9d
+ ; FPADD_R f2, a2
+ addpd xmm2, xmm10
+ ; IXOR_R r7, r0
+ xor r15, r8
+ ; ISTORE L1[r4], r1
+ mov eax, r12d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r9
+ ; ISTORE L2[r0], r4
+ mov eax, r8d
and eax, 262136
mov qword ptr [rsi+rax], r12
- ; IROL_R r7, r6
- mov ecx, r14d
- rol r15, cl
- ; IMUL_9C r2, 266593902
- lea r10, [r10+r10*8+266593902]
- ; IMUL_R r4, r6
- imul r12, r14
- ; FPSUB_R f2, a2
- subpd xmm2, xmm10
- ; FPNEG_R f3
- xorps xmm3, xmm15
- ; IROR_R r7, r2
- mov ecx, r10d
+ ; FPDIV_R e3, a3
+ divpd xmm7, xmm11
+ maxpd xmm7, xmm13
+ ; ISTORE L2[r4], r6
+ mov eax, r12d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r14
+ ; IMUL_R r3, r1
+ imul r11, r9
+ ; IXOR_R r2, r4
+ xor r10, r12
+ ; ISTORE L2[r3], r5
+ mov eax, r11d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r13
+ ; FPMUL_M e2, L2[r4]
+ mov eax, r12d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm6, xmm12
+ maxpd xmm6, xmm13
+ ; FPSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; COND_R r1, ab(r7, -229570354)
+ xor ecx, ecx
+ cmp r15d, -229570354
+ seta cl
+ add r9, rcx
+ ; IROR_R r7, r3
+ mov ecx, r11d
ror r15, cl
- ; IROR_R r0, r5
- mov ecx, r13d
- ror r8, cl
+ ; FPDIV_R e2, a0
+ divpd xmm6, xmm8
+ maxpd xmm6, xmm13
+ ; IADD_R r2, r5
+ add r10, r13
+ ; FPDIV_R e1, a3
+ divpd xmm5, xmm11
+ maxpd xmm5, xmm13
+ ; FPSQRT_R e2
+ sqrtpd xmm6, xmm6
+ ; ISUB_R r3, r7
+ sub r11, r15
+ ; FPADD_R f0, a0
+ addpd xmm0, xmm8
+ ; IMUL_M r0, L3[98136]
+ imul r8, qword ptr [rsi+98136]
+ ; IMUL_9C r5, -895487055
+ lea r13, [r13+r13*8-895487055]
+ ; IMULH_R r2, r7
+ mov rax, r10
+ mul r15
+ mov r10, rdx
+ ; IADD_R r4, r1
+ add r12, r9
+ ; ISDIV_C r0, 494395999
+ mov rax, 5007888582388710937
+ imul r8
+ xor eax, eax
+ sar rdx, 27
+ sets al
+ add rdx, rax
+ add r8, rdx
+ ; FPSWAP_R e0
+ shufpd xmm4, xmm4, 1
+ ; IXOR_R r1, r5
+ xor r9, r13
+ ; COND_R r2, ab(r3, 1932234501)
+ xor ecx, ecx
+ cmp r11d, 1932234501
+ seta cl
+ add r10, rcx
+ ; FPMUL_R e1, a0
+ mulpd xmm5, xmm8
+ ; FPSUB_M f1, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm1, xmm12
+ ; FPSUB_R f0, a0
+ subpd xmm0, xmm8
+ ; IROL_R r1, r7
+ mov ecx, r15d
+ rol r9, cl
+ ; IADD_RC r0, r5, -2051588680
+ lea r8, [r8+r13-2051588680]
+ ; COND_R r6, of(r5, -795593984)
+ xor ecx, ecx
+ cmp r13d, -795593984
+ seto cl
+ add r14, rcx
+ ; FPADD_R f1, a0
+ addpd xmm1, xmm8
+ ; IMULH_R r7, r3
+ mov rax, r15
+ mul r11
+ mov r15, rdx
+ ; ISUB_R r7, r4
+ sub r15, r12
+ ; IROL_R r0, r6
+ mov ecx, r14d
+ rol r8, cl
+ ; ISDIV_C r1, -675825513
+ mov rax, -7326980207007250257
+ imul r9
+ xor eax, eax
+ sar rdx, 28
+ sets al
+ add rdx, rax
+ add r9, rdx
+ ; ISTORE L1[r6], r3
+ mov eax, r14d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r11
+ ; IROR_R r4, r3
+ mov ecx, r11d
+ ror r12, cl
+ ; IDIV_C r4, 3919226376
+ mov rax, r12
+ shr rax, 3
+ mov rcx, 2526906936258851663
+ mul rcx
+ shr rdx, 26
+ add r12, rdx
+ ; FPSUB_R f1, a1
+ subpd xmm1, xmm9
+ ; FPSUB_R f0, a0
+ subpd xmm0, xmm8
+ ; IADD_R r0, r2
+ add r8, r10
+ ; IADD_M r4, L1[r2]
+ mov eax, r10d
+ and eax, 16376
+ add r12, qword ptr [rsi+rax]
+ ; ISTORE L1[r7], r2
+ mov eax, r15d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r10
+ ; FPSQRT_R e1
+ sqrtpd xmm5, xmm5
+ ; IADD_R r5, r4
+ add r13, r12
+ ; IXOR_R r6, r7
+ xor r14, r15
+ ; ISMULH_R r4, r7
+ mov rax, r12
+ imul r15
+ mov r12, rdx
+ ; FPSQRT_R e1
+ sqrtpd xmm5, xmm5