diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
index 11bb3f0..a46fe5d 100644
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -491,6 +491,10 @@ namespace RandomX {
asmCode << "\tmovapd xmmword ptr [rsi+rax], " << regFE[instr.src] << std::endl;
}
+ void AssemblyGeneratorX86::h_NOP(Instruction& instr, int i) { // NOP handler: neither instr nor i is read
+ asmCode << "\tnop" << std::endl; // appends one tab-indented "nop" line to asmCode
+ }
+
#include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x))
@@ -540,5 +544,7 @@ namespace RandomX {
INST_HANDLE(ISTORE)
INST_HANDLE(FSTORE)
+
+ INST_HANDLE(NOP)
};
}
\ No newline at end of file
diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp
index 5c22142..6b0c505 100644
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@@ -79,5 +79,6 @@ namespace RandomX {
void h_CFROUND(Instruction&, int);
void h_ISTORE(Instruction&, int);
void h_FSTORE(Instruction&, int);
+ void h_NOP(Instruction&, int);
};
}
\ No newline at end of file
diff --git a/src/Instruction.cpp b/src/Instruction.cpp
index 13cfc1d..0aa0289 100644
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@@ -327,6 +327,10 @@ namespace RandomX {
os << ", " << reg << srcIndex << std::endl;
}
+ void Instruction::h_NOP(std::ostream& os) const { // text form of NOP: there are no operands to print
+ os << std::endl; // only terminates (and flushes) the current output line
+ }
+
#include "instructionWeights.hpp"
#define INST_NAME(x) REPN(#x, WT(x))
#define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x))
@@ -377,6 +381,8 @@ namespace RandomX {
INST_NAME(ISTORE)
INST_NAME(FSTORE)
+
+ INST_NAME(NOP)
};
InstructionVisualizer Instruction::engine[256] = {
@@ -425,6 +431,8 @@ namespace RandomX {
INST_HANDLE(ISTORE)
INST_HANDLE(FSTORE)
+
+ INST_HANDLE(NOP)
};
}
\ No newline at end of file
diff --git a/src/Instruction.hpp b/src/Instruction.hpp
index 017d92f..ffa3880 100644
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@@ -86,6 +86,7 @@ namespace RandomX {
void h_CFROUND(std::ostream&) const;
void h_ISTORE(std::ostream&) const;
void h_FSTORE(std::ostream&) const;
+ void h_NOP(std::ostream&) const;
};
static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction");
diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp
index e001464..30c6f73 100644
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -181,7 +181,7 @@ namespace RandomX {
static const uint8_t JMP = 0xe9;
size_t JitCompilerX86::getCodeSize() {
- return codePos - prologueSize + readDatasetSize;
+ return codePos - prologueSize; // NOTE(review): readDatasetSize is no longer added here - confirm callers expect the smaller value
}
JitCompilerX86::JitCompilerX86() {
@@ -761,6 +761,10 @@ namespace RandomX {
emitByte(0x06);
}
+ void JitCompilerX86::h_NOP(Instruction& instr) { // NOP handler: instr is not read
+ emitByte(0x90); // 0x90 is the one-byte x86 NOP opcode
+ }
+
#include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x))
@@ -800,6 +804,7 @@ namespace RandomX {
INST_HANDLE(CFROUND)
INST_HANDLE(ISTORE)
INST_HANDLE(FSTORE)
+ INST_HANDLE(NOP)
};
diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp
index fa5aa93..0aef990 100644
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@@ -125,6 +125,7 @@ namespace RandomX {
void h_CFROUND(Instruction&);
void h_ISTORE(Instruction&);
void h_FSTORE(Instruction&);
+ void h_NOP(Instruction&);
};
}
\ No newline at end of file
diff --git a/src/asm/program_load_flt.inc b/src/asm/program_load_flt.inc
index af6f1b7..2c631ce 100644
--- a/src/asm/program_load_flt.inc
+++ b/src/asm/program_load_flt.inc
@@ -1,4 +1,4 @@
- and eax, 262080
+ and eax, 1048512
lea rcx, [rsi+rax]
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
diff --git a/src/asm/program_load_int.inc b/src/asm/program_load_int.inc
index d139549..d9277ed 100644
--- a/src/asm/program_load_int.inc
+++ b/src/asm/program_load_int.inc
@@ -1,4 +1,4 @@
- and eax, 262080
+ and eax, 1048512
lea rcx, [rsi+rax]
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
diff --git a/src/asm/program_store_flt.inc b/src/asm/program_store_flt.inc
index d6ca7f1..4bbab9f 100644
--- a/src/asm/program_store_flt.inc
+++ b/src/asm/program_store_flt.inc
@@ -1,4 +1,4 @@
- and eax, 262080
+ and eax, 1048512
lea rcx, [rsi+rax]
mulpd xmm0, xmm4
mulpd xmm1, xmm5
diff --git a/src/asm/program_store_int.inc b/src/asm/program_store_int.inc
index 75c973f..03dd31a 100644
--- a/src/asm/program_store_int.inc
+++ b/src/asm/program_store_int.inc
@@ -1,4 +1,4 @@
- and eax, 262080
+ and eax, 1048512
lea rcx, [rsi+rax]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
diff --git a/src/common.hpp b/src/common.hpp
index 053f2a1..bbd5a2b 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -72,7 +72,7 @@ namespace RandomX {
convertible_t hi;
};
- constexpr int ProgramLength = 256;
+ constexpr int ProgramLength = 128;
constexpr uint32_t InstructionCount = 1024;
constexpr uint32_t ScratchpadSize = 1024 * 1024;
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm
index be3bc82..e9bc30a 100644
--- a/src/executeProgram-win64.asm
+++ b/src/executeProgram-win64.asm
@@ -119,7 +119,7 @@ signMask:
ALIGN 64
program_begin:
xor eax, r8d ;# read address register 1
- and eax, 262080
+ and eax, 1048512
lea rcx, [rsi+rax]
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
@@ -130,7 +130,7 @@ program_begin:
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
xor eax, r9d ;# read address register 2
- and eax, 262080
+ and eax, 1048512
lea rcx, [rsi+rax]
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
@@ -166,7 +166,7 @@ program_begin:
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
mov eax, r12d ;# write address register 1
- and eax, 262080
+ and eax, 1048512
lea rcx, [rsi+rax]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
@@ -177,7 +177,7 @@ program_begin:
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
xor eax, r13d ;# write address register 2
- and eax, 262080
+ and eax, 1048512
lea rcx, [rsi+rax]
mulpd xmm0, xmm4
mulpd xmm1, xmm5
diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp
index 86285de..55c9b79 100644
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@@ -20,51 +20,51 @@ along with RandomX. If not, see.
#pragma once
//Integer
-#define WT_IADD_R 10
+#define WT_IADD_R 12
#define WT_IADD_M 3
-#define WT_IADD_RC 10
-#define WT_ISUB_R 10
+#define WT_IADD_RC 12
+#define WT_ISUB_R 12
#define WT_ISUB_M 3
#define WT_IMUL_9C 10
-#define WT_IMUL_R 20
-#define WT_IMUL_M 6
-#define WT_IMULH_R 6
-#define WT_IMULH_M 2
-#define WT_ISMULH_R 6
-#define WT_ISMULH_M 2
+#define WT_IMUL_R 16
+#define WT_IMUL_M 4
+#define WT_IMULH_R 4
+#define WT_IMULH_M 1
+#define WT_ISMULH_R 4
+#define WT_ISMULH_M 1
#define WT_IDIV_C 4
#define WT_ISDIV_C 4
#define WT_INEG_R 2
#define WT_IXOR_R 12
-#define WT_IXOR_M 4
-#define WT_IROR_R 10
-#define WT_IROL_R 10
+#define WT_IXOR_M 3
+#define WT_IROR_R 12
+#define WT_IROL_R 12
//Common floating point
-#define WT_FPSWAP_R 6
+#define WT_FPSWAP_R 8
//Floating point group F
-#define WT_FPADD_R 18
-#define WT_FPADD_M 3
-#define WT_FPSUB_R 18
-#define WT_FPSUB_M 3
-#define WT_FPNEG_R 5
+#define WT_FPADD_R 20
+#define WT_FPADD_M 5
+#define WT_FPSUB_R 20
+#define WT_FPSUB_M 5
+#define WT_FPNEG_R 6
//Floating point group E
-#define WT_FPMUL_R 18
-#define WT_FPMUL_M 3
-#define WT_FPDIV_R 6
+#define WT_FPMUL_R 16
+#define WT_FPMUL_M 4
+#define WT_FPDIV_R 7
#define WT_FPDIV_M 1
#define WT_FPSQRT_R 6
//Control
-#define WT_COND_R 12
-#define WT_COND_M 4
+#define WT_COND_R 7
+#define WT_COND_M 1
#define WT_CFROUND 1
//Store
-#define WT_ISTORE 12
-#define WT_FSTORE 6
+#define WT_ISTORE 18
+#define WT_FSTORE 0 // weight 0: presumably FSTORE gets no slots in the handler tables (REPN repeats WT(x) times) - TODO confirm REP0 expands to nothing
#define WT_NOP 0
@@ -115,6 +115,7 @@ static_assert(wtSum == 256,
#define REP33(x) REP32(x) x,
#define REP40(x) REP32(x) REP8(x)
#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
+#define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x) // 128 + 40 + 40 + 24 = 232 repetitions, assuming REP24 (not defined in this hunk) repeats x 24 times
#define REP256(x) REP128(x) REP128(x)
#define REPNX(x,N) REP##N(x)
#define REPN(x,N) REPNX(x,N)
diff --git a/src/main.cpp b/src/main.cpp
index 12e9cdb..4f5a021 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -169,12 +169,10 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash
blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8);
vm->initializeScratchpad(scratchpad, spIndex);
- //vm->initializeProgram(hash);
+ vm->setScratchpad(scratchpad);
//dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt");
for (int chain = 0; chain < 16; ++chain) {
vm->initializeProgram(hash);
- int segment = hash[3] & 3;
- vm->setScratchpad(scratchpad + segment * RandomX::ScratchpadSize / 4);
vm->execute();
vm->getResult(nullptr, 0, hash);
}
diff --git a/src/program.inc b/src/program.inc
index 21f7d0b..d901e9a 100644
--- a/src/program.inc
+++ b/src/program.inc
@@ -10,54 +10,54 @@
mulpd xmm6, xmm10
; IMUL_R r6, r3
imul r14, r11
- ; FPMUL_R e1, a0
- mulpd xmm5, xmm8
- ; IROR_R r5, r3
+ ; FPSUB_M f1, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm1, xmm12
+ ; IROL_R r5, r3
mov ecx, r11d
- ror r13, cl
+ rol r13, cl
; FPMUL_R e2, a0
mulpd xmm6, xmm8
- ; FPNEG_R f3
- xorps xmm3, xmm15
+ ; FPSUB_R f3, a0
+ subpd xmm3, xmm8
; IXOR_R r0, r4
xor r8, r12
- ; ISMULH_R r3, r7
+ ; ISMULH_M r3, L1[r7]
+ mov ecx, r15d
+ and ecx, 16376
mov rax, r11
- imul r15
+ imul qword ptr [rsi+rcx]
mov r11, rdx
; FPSWAP_R f2
shufpd xmm2, xmm2, 1
- ; ISMULH_R r6, r0
- mov rax, r14
- imul r8
- mov r14, rdx
+ ; IDIV_C r6, 1248528248
+ mov rax, 15864311168205210203
+ mul r14
+ shr rdx, 30
+ add r14, rdx
; FPMUL_R e0, a2
mulpd xmm4, xmm10
- ; ISUB_R r3, r4
- sub r11, r12
+ ; IADD_RC r3, r4, -52260428
+ lea r11, [r11+r12-52260428]
; IADD_R r7, -1138617760
add r15, -1138617760
- ; IROR_R r2, r6
+ ; IROL_R r2, r6
mov ecx, r14d
- ror r10, cl
- ; FPMUL_R e2, a1
- mulpd xmm6, xmm9
+ rol r10, cl
+ ; FPNEG_R f2
+ xorps xmm2, xmm15
; IROR_R r7, r1
mov ecx, r9d
ror r15, cl
- ; COND_M r2, lt(L1[r7], -41618808)
+ ; COND_R r2, lt(r7, -41618808)
xor ecx, ecx
- mov eax, r15d
- and eax, 16376
- cmp dword ptr [rsi+rax], -41618808
+ cmp r15d, -41618808
setl cl
add r10, rcx
- ; FPMUL_M e3, L1[r0]
- mov eax, r8d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm7, xmm12
- maxpd xmm7, xmm13
+ ; FPMUL_R e3, a0
+ mulpd xmm7, xmm8
; CFROUND r1, 43
mov rax, r9
rol rax, 34
@@ -67,14 +67,17 @@
ldmxcsr dword ptr [rsp-8]
; FPADD_R f2, a1
addpd xmm2, xmm9
- ; FPNEG_R f0
- xorps xmm0, xmm15
- ; FSTORE L1[r6], f2
+ ; FPSUB_M f0, L1[r7]
+ mov eax, r15d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm0, xmm12
+ ; ISTORE L1[r6], r2
mov eax, r14d
- and eax, 16368
- movapd xmmword ptr [rsi+rax], xmm2
- ; IMUL_9C r6, -45112665
- lea r14, [r14+r14*8-45112665]
+ and eax, 16376
+ mov qword ptr [rsi+rax], r10
+ ; ISUB_R r6, r5
+ sub r14, r13
; IADD_M r0, L1[r4]
mov eax, r12d
and eax, 16376
@@ -87,41 +90,30 @@
mov eax, r14d
and eax, 16376
mov qword ptr [rsi+rax], r14
- ; COND_R r4, sg(r1, -1189096105)
- xor ecx, ecx
- cmp r9d, -1189096105
- sets cl
- add r12, rcx
+ ; FPSQRT_R e0
+ sqrtpd xmm4, xmm4
; IXOR_R r2, r5
xor r10, r13
- ; COND_R r1, be(r5, -965180434)
- xor ecx, ecx
- cmp r13d, -965180434
- setbe cl
- add r9, rcx
- ; FPMUL_M e1, L2[r3]
- mov eax, r11d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm5, xmm12
- maxpd xmm5, xmm13
+ ; FPSQRT_R e1
+ sqrtpd xmm5, xmm5
+ ; FPMUL_R e1, a3
+ mulpd xmm5, xmm11
; IMULH_R r7, r6
mov rax, r15
mul r14
mov r15, rdx
- ; ISMULH_M r0, L1[r4]
- mov ecx, r12d
- and ecx, 16376
- mov rax, r8
- imul qword ptr [rsi+rcx]
- mov r8, rdx
+ ; ISDIV_C r0, -1706892622
+ mov rax, -5802075764249827661
+ imul r8
+ xor eax, eax
+ sar rdx, 29
+ sets al
+ add rdx, rax
+ add r8, rdx
; IMUL_R r5, r3
imul r13, r11
- ; COND_R r2, of(r0, -1045938770)
- xor ecx, ecx
- cmp r8d, -1045938770
- seto cl
- add r10, rcx
+ ; FPSQRT_R e2
+ sqrtpd xmm6, xmm6
; FPADD_M f3, L1[r4]
mov eax, r12d
and eax, 16376
@@ -131,18 +123,19 @@
add r11, r10
; FPADD_R f1, a0
addpd xmm1, xmm8
- ; FPSQRT_R e3
- sqrtpd xmm7, xmm7
+ ; FPDIV_R e3, a2
+ divpd xmm7, xmm10
+ maxpd xmm7, xmm13
; FPSUB_R f0, a1
subpd xmm0, xmm9
; IMUL_M r5, L1[r6]
mov eax, r14d
and eax, 16376
imul r13, qword ptr [rsi+rax]
- ; ISUB_R r1, r2
- sub r9, r10
- ; IMUL_R r4, r6
- imul r12, r14
+ ; IADD_RC r1, r2, -1263285243
+ lea r9, [r9+r10-1263285243]
+ ; IMUL_9C r4, 1994773931
+ lea r12, [r12+r12*8+1994773931]
; FPSWAP_R e3
shufpd xmm7, xmm7, 1
; IMUL_M r0, L1[r7]
@@ -152,69 +145,72 @@
; IROR_R r1, r6
mov ecx, r14d
ror r9, cl
- ; IROR_R r2, r4
+ ; IROL_R r2, r4
mov ecx, r12d
- ror r10, cl
+ rol r10, cl
; FPSUB_R f3, a1
subpd xmm3, xmm9
- ; FSTORE L1[r0], e1
+ ; ISTORE L1[r0], r5
mov eax, r8d
- and eax, 16368
- movapd xmmword ptr [rsi+rax], xmm5
- ; COND_R r2, sg(r3, 1269153133)
- xor ecx, ecx
- cmp r11d, 1269153133
- sets cl
- add r10, rcx
+ and eax, 16376
+ mov qword ptr [rsi+rax], r13
+ ; FPDIV_M e2, L2[r3]
+ mov eax, r11d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ divpd xmm6, xmm12
+ maxpd xmm6, xmm13
; FPSWAP_R f2
shufpd xmm2, xmm2, 1
; IADD_R r7, r5
add r15, r13
- ; COND_R r0, be(r4, -1486502150)
- xor ecx, ecx
- cmp r12d, -1486502150
- setbe cl
- add r8, rcx
- ; FPSUB_R f3, a1
- subpd xmm3, xmm9
+ ; FPDIV_M e0, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ divpd xmm4, xmm12
+ maxpd xmm4, xmm13
+ ; FPADD_M f3, L1[r5]
+ mov eax, r13d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm3, xmm12
; FPADD_R f0, a3
addpd xmm0, xmm11
; IADD_R r2, r0
add r10, r8
- ; FSTORE L1[r3], e2
+ ; ISTORE L1[r3], r6
mov eax, r11d
- and eax, 16368
- movapd xmmword ptr [rsi+rax], xmm6
- ; IXOR_R r1, r7
- xor r9, r15
- ; IMUL_R r5, r7
- imul r13, r15
+ and eax, 16376
+ mov qword ptr [rsi+rax], r14
+ ; IROR_R r1, r7
+ mov ecx, r15d
+ ror r9, cl
+ ; IMUL_9C r5, 301671287
+ lea r13, [r13+r13*8+301671287]
; IXOR_R r7, 266992378
xor r15, 266992378
- ; COND_R r7, no(r4, 1983804692)
- xor ecx, ecx
- cmp r12d, 1983804692
- setno cl
- add r15, rcx
+ ; FPSQRT_R e3
+ sqrtpd xmm7, xmm7
; IMUL_M r2, L2[r0]
mov eax, r8d
and eax, 262136
imul r10, qword ptr [rsi+rax]
- ; FPDIV_R e3, a2
- divpd xmm7, xmm10
- maxpd xmm7, xmm13
- ; IMUL_M r0, L2[r6]
- mov eax, r14d
- and eax, 262136
- imul r8, qword ptr [rsi+rax]
+ ; FPMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IMUL_R r0, r6
+ imul r8, r14
; ISTORE L1[r0], r7
mov eax, r8d
and eax, 16376
mov qword ptr [rsi+rax], r15
- ; FPMUL_R e0, a1
- mulpd xmm4, xmm9
- ; FPSUB_R f3, a1
- subpd xmm3, xmm9
+ ; FPNEG_R f0
+ xorps xmm0, xmm15
+ ; FPADD_M f3, L1[r5]
+ mov eax, r13d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm3, xmm12
; IROR_R r5, r4
mov ecx, r12d
ror r13, cl
@@ -222,17 +218,20 @@
mov eax, r15d
and eax, 262136
mov qword ptr [rsi+rax], r10
- ; FPSWAP_R e2
- shufpd xmm6, xmm6, 1
+ ; FPADD_R f2, a3
+ addpd xmm2, xmm11
; FPADD_M f3, L1[r2]
mov eax, r10d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
addpd xmm3, xmm12
- ; IDIV_C r5, 2218798981
- mov rax, 17853839665672790751
- mul r13
- shr rdx, 31
+ ; ISDIV_C r5, -2076168315
+ mov rax, -4770095103914078469
+ imul r13
+ xor eax, eax
+ sar rdx, 29
+ sets al
+ add rdx, rax
add r13, rdx
; IADD_RC r0, r4, -1321374359
lea r8, [r8+r12-1321374359]
@@ -250,28 +249,26 @@
rol r15, cl
; ISUB_R r2, r4
sub r10, r12
- ; IMULH_M r0, L1[12400]
- mov rax, r8
- mul qword ptr [rsi+12400]
- mov r8, rdx
+ ; ISMULH_R r0, -1500893068
+ mov rax, -1500893068
+ imul r8
+ add r8, rdx
; IADD_R r2, r3
add r10, r11
- ; COND_R r6, lt(r1, -1124202227)
- xor ecx, ecx
- cmp r9d, -1124202227
- setl cl
- add r14, rcx
- ; IROR_R r7, r4
+ ; FPSQRT_R e2
+ sqrtpd xmm6, xmm6
+ ; IROL_R r7, r4
mov ecx, r12d
- ror r15, cl
+ rol r15, cl
; IMUL_R r4, r2
imul r12, r10
; ISUB_R r3, r7
sub r11, r15
; IADD_R r2, r7
add r10, r15
- ; FPSQRT_R e3
- sqrtpd xmm7, xmm7
+ ; FPDIV_R e3, a0
+ divpd xmm7, xmm8
+ maxpd xmm7, xmm13
; ISUB_R r6, 540663146
sub r14, 540663146
; IROL_R r5, 58
@@ -280,67 +277,65 @@
addpd xmm2, xmm9
; FPADD_R f2, a2
addpd xmm2, xmm10
- ; FPSQRT_R e1
- sqrtpd xmm5, xmm5
+ ; FPDIV_R e1, a2
+ divpd xmm5, xmm10
+ maxpd xmm5, xmm13
; FPADD_R f1, a2
addpd xmm1, xmm10
; IADD_R r5, r3
add r13, r11
- ; IADD_M r7, L1[880]
- add r15, qword ptr [rsi+880]
+ ; IADD_R r7, -1780268176
+ add r15, -1780268176
; ISUB_R r7, r0
sub r15, r8
; ISTORE L2[r0], r7
mov eax, r8d
and eax, 262136
mov qword ptr [rsi+rax], r15
- ; IDIV_C r2, 1014940364
- mov rax, r10
- shr rax, 2
- mov rcx, 1219717022984988185
- mul rcx
- shr rdx, 24
- add r10, rdx
- ; FPMUL_R e0, a2
- mulpd xmm4, xmm10
- ; IDIV_C r2, 3059159304
- mov rax, 12949335853590502915
- mul r10
- shr rdx, 31
- add r10, rdx
+ ; INEG_R r2
+ neg r10
+ ; FPNEG_R f0
+ xorps xmm0, xmm15
+ ; INEG_R r2
+ neg r10
; IADD_R r0, r3
add r8, r11
; IMUL_9C r7, -2124093035
lea r15, [r15+r15*8-2124093035]
- ; FPSUB_R f2, a0
- subpd xmm2, xmm8
- ; FPDIV_R e0, a2
- divpd xmm4, xmm10
+ ; FPADD_M f2, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm2, xmm12
+ ; FPMUL_M e0, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm4, xmm12
maxpd xmm4, xmm13
; FPSUB_R f2, a3
subpd xmm2, xmm11
; IMUL_R r1, r2
imul r9, r10
- ; ISMULH_R r7, r5
- mov rax, r15
- imul r13
- mov r15, rdx
+ ; IDIV_C r7, 3214009572
+ mov rax, 12325439725582798855
+ mul r15
+ shr rdx, 31
+ add r15, rdx
; IMULH_R r3, r2
mov rax, r11
mul r10
mov r11, rdx
- ; IXOR_M r1, L2[r0]
- mov eax, r8d
- and eax, 262136
- xor r9, qword ptr [rsi+rax]
+ ; IROR_R r1, r0
+ mov ecx, r8d
+ ror r9, cl
; FPMUL_R e0, a1
mulpd xmm4, xmm9
- ; ISUB_R r4, 1456841848
- sub r12, 1456841848
- ; IXOR_M r3, L2[r2]
- mov eax, r10d
- and eax, 262136
- xor r11, qword ptr [rsi+rax]
+ ; IADD_RC r4, r4, 1456841848
+ lea r12, [r12+r12+1456841848]
+ ; IROR_R r3, r2
+ mov ecx, r10d
+ ror r11, cl
; COND_M r0, of(L1[r4], 1678513610)
xor ecx, ecx
mov eax, r12d
@@ -348,446 +343,39 @@
cmp dword ptr [rsi+rax], 1678513610
seto cl
add r8, rcx
- ; IDIV_C r4, 2674394209
- mov rax, 925772300223658071
- mul r12
- shr rdx, 27
- add r12, rdx
+ ; INEG_R r4
+ neg r12
; IMUL_R r4, r1
imul r12, r9
; FPADD_R f1, a2
addpd xmm1, xmm10
; FPSUB_R f2, a0
subpd xmm2, xmm8
- ; FPMUL_M e1, L2[r6]
- mov eax, r14d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm5, xmm12
- maxpd xmm5, xmm13
- ; FPSUB_M f0, L2[r3]
- mov eax, r11d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm0, xmm12
+ ; FPMUL_R e1, a2
+ mulpd xmm5, xmm10
+ ; FPSUB_R f0, a3
+ subpd xmm0, xmm11
; IROR_R r0, r7
mov ecx, r15d
ror r8, cl
- ; FSTORE L2[r1], e0
+ ; ISTORE L2[r1], r4
mov eax, r9d
- and eax, 262128
- movapd xmmword ptr [rsi+rax], xmm4
- ; IROR_R r7, r6
+ and eax, 262136
+ mov qword ptr [rsi+rax], r12
+ ; IROL_R r7, r6
mov ecx, r14d
- ror r15, cl
+ rol r15, cl
; IMUL_9C r2, 266593902
lea r10, [r10+r10*8+266593902]
; IMUL_R r4, r6
imul r12, r14
; FPSUB_R f2, a2
subpd xmm2, xmm10
- ; FPMUL_R e3, a0
- mulpd xmm7, xmm8
- ; IXOR_M r7, L1[r2]
- mov eax, r10d
- and eax, 16376
- xor r15, qword ptr [rsi+rax]
+ ; FPNEG_R f3
+ xorps xmm3, xmm15
+ ; IROR_R r7, r2
+ mov ecx, r10d
+ ror r15, cl
; IROR_R r0, r5
mov ecx, r13d
ror r8, cl
- ; FPADD_R f1, a2
- addpd xmm1, xmm10
- ; FPSQRT_R e3
- sqrtpd xmm7, xmm7
- ; FPADD_R f3, a1
- addpd xmm3, xmm9
- ; FPADD_R f1, a0
- addpd xmm1, xmm8
- ; COND_M r2, ge(L2[r2], -226330940)
- xor ecx, ecx
- mov eax, r10d
- and eax, 262136
- cmp dword ptr [rsi+rax], -226330940
- setge cl
- add r10, rcx
- ; FPDIV_R e2, a3
- divpd xmm6, xmm11
- maxpd xmm6, xmm13
- ; FPMUL_R e2, a1
- mulpd xmm6, xmm9
- ; FPSUB_R f1, a0
- subpd xmm1, xmm8
- ; IMUL_R r7, r5
- imul r15, r13
- ; IMUL_R r0, r1
- imul r8, r9
- ; FPSUB_R f3, a1
- subpd xmm3, xmm9
- ; IROL_R r3, r5
- mov ecx, r13d
- rol r11, cl
- ; IADD_RC r5, r2, 795784298
- lea r13, [r13+r10+795784298]
- ; ISUB_R r0, r4
- sub r8, r12
- ; IMUL_R r5, r4
- imul r13, r12
- ; FPSUB_R f0, a2
- subpd xmm0, xmm10
- ; FPMUL_R e3, a1
- mulpd xmm7, xmm9
- ; ISDIV_C r3, 1662492575
- mov rax, 2978515652703905219
- imul r11
- xor eax, eax
- sar rdx, 28
- sets al
- add rdx, rax
- add r11, rdx
- ; ISMULH_R r5, r0
- mov rax, r13
- imul r8
- mov r13, rdx
- ; ISDIV_C r4, 1963597892
- mov rax, -8359627607928540073
- imul r12
- xor eax, eax
- add rdx, r12
- sar rdx, 30
- sets al
- add rdx, rax
- add r12, rdx
- ; IMUL_R r7, r0
- imul r15, r8
- ; IMULH_M r0, L1[r3]
- mov ecx, r11d
- and ecx, 16376
- mov rax, r8
- mul qword ptr [rsi+rcx]
- mov r8, rdx
- ; IXOR_R r3, r7
- xor r11, r15
- ; IDIV_C r4, 1146125335
- mov rax, 8640870253760721727
- mul r12
- shr rdx, 29
- add r12, rdx
- ; FPSWAP_R f3
- shufpd xmm3, xmm3, 1
- ; IXOR_M r2, L1[r0]
- mov eax, r8d
- and eax, 16376
- xor r10, qword ptr [rsi+rax]
- ; IROR_R r0, r1
- mov ecx, r9d
- ror r8, cl
- ; IXOR_R r7, r4
- xor r15, r12
- ; ISMULH_R r6, r2
- mov rax, r14
- imul r10
- mov r14, rdx
- ; FPMUL_R e3, a2
- mulpd xmm7, xmm10
- ; IADD_RC r4, r2, 1704868083
- lea r12, [r12+r10+1704868083]
- ; FPSUB_R f2, a0
- subpd xmm2, xmm8
- ; ISTORE L1[r0], r0
- mov eax, r8d
- and eax, 16376
- mov qword ptr [rsi+rax], r8
- ; FPSUB_R f0, a3
- subpd xmm0, xmm11
- ; FPDIV_R e0, a3
- divpd xmm4, xmm11
- maxpd xmm4, xmm13
- ; FPMUL_R e3, a2
- mulpd xmm7, xmm10
- ; ISUB_R r7, 1302457878
- sub r15, 1302457878
- ; IMUL_9C r1, 1330165941
- lea r9, [r9+r9*8+1330165941]
- ; FPMUL_R e1, a3
- mulpd xmm5, xmm11
- ; IROL_R r0, r4
- mov ecx, r12d
- rol r8, cl
- ; FPSUB_M f1, L1[r0]
- mov eax, r8d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm1, xmm12
- ; IROL_R r5, r6
- mov ecx, r14d
- rol r13, cl
- ; COND_M r0, ab(L1[r1], -310933871)
- xor ecx, ecx
- mov eax, r9d
- and eax, 16376
- cmp dword ptr [rsi+rax], -310933871
- seta cl
- add r8, rcx
- ; CFROUND r7, 39
- mov rax, r15
- rol rax, 38
- and eax, 24576
- or eax, 40896
- mov dword ptr [rsp-8], eax
- ldmxcsr dword ptr [rsp-8]
- ; FPDIV_R e0, a1
- divpd xmm4, xmm9
- maxpd xmm4, xmm13
- ; IMUL_M r1, L1[r3]
- mov eax, r11d
- and eax, 16376
- imul r9, qword ptr [rsi+rax]
- ; IMUL_9C r3, 1573236728
- lea r11, [r11+r11*8+1573236728]
- ; FPNEG_R f3
- xorps xmm3, xmm15
- ; COND_R r1, lt(r4, -1805702334)
- xor ecx, ecx
- cmp r12d, -1805702334
- setl cl
- add r9, rcx
- ; FPSWAP_R f1
- shufpd xmm1, xmm1, 1
- ; IADD_R r7, -1421188024
- add r15, -1421188024
- ; FPMUL_R e3, a2
- mulpd xmm7, xmm10
- ; FPSUB_M f2, L2[r7]
- mov eax, r15d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm2, xmm12
- ; FPSUB_R f3, a1
- subpd xmm3, xmm9
- ; FPSQRT_R e1
- sqrtpd xmm5, xmm5
- ; ISUB_R r2, r4
- sub r10, r12
- ; ISMULH_R r4, r5
- mov rax, r12
- imul r13
- mov r12, rdx
- ; COND_R r1, of(r7, 1294727006)
- xor ecx, ecx
- cmp r15d, 1294727006
- seto cl
- add r9, rcx
- ; IADD_M r5, L2[r2]
- mov eax, r10d
- and eax, 262136
- add r13, qword ptr [rsi+rax]
- ; IMUL_9C r4, 401020510
- lea r12, [r12+r12*8+401020510]
- ; IROL_R r3, r0
- mov ecx, r8d
- rol r11, cl
- ; ISTORE L1[r7], r0
- mov eax, r15d
- and eax, 16376
- mov qword ptr [rsi+rax], r8
- ; FPSUB_R f2, a1
- subpd xmm2, xmm9
- ; FPSQRT_R e3
- sqrtpd xmm7, xmm7
- ; IMUL_R r3, 720965215
- imul r11, 720965215
- ; IMUL_R r6, r2
- imul r14, r10
- ; ISTORE L1[r7], r3
- mov eax, r15d
- and eax, 16376
- mov qword ptr [rsi+rax], r11
- ; IROR_R r2, r6
- mov ecx, r14d
- ror r10, cl
- ; FPSQRT_R e3
- sqrtpd xmm7, xmm7
- ; IMUL_9C r4, 788211341
- lea r12, [r12+r12*8+788211341]
- ; IMUL_9C r3, -67993446
- lea r11, [r11+r11*8-67993446]
- ; FPSWAP_R e3
- shufpd xmm7, xmm7, 1
- ; IMUL_M r2, L1[r6]
- mov eax, r14d
- and eax, 16376
- imul r10, qword ptr [rsi+rax]
- ; COND_M r2, ge(L1[r2], -1892157506)
- xor ecx, ecx
- mov eax, r10d
- and eax, 16376
- cmp dword ptr [rsi+rax], -1892157506
- setge cl
- add r10, rcx
- ; FPADD_M f1, L1[r3]
- mov eax, r11d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm1, xmm12
- ; IADD_M r7, L1[r0]
- mov eax, r8d
- and eax, 16376
- add r15, qword ptr [rsi+rax]
- ; ISDIV_C r1, 624867857
- mov rax, 7924491717200811467
- imul r9
- xor eax, eax
- sar rdx, 28
- sets al
- add rdx, rax
- add r9, rdx
- ; FPADD_R f0, a1
- addpd xmm0, xmm9
- ; ISUB_R r5, r7
- sub r13, r15
- ; FPNEG_R f0
- xorps xmm0, xmm15
- ; IMUL_R r6, r2
- imul r14, r10
- ; FPMUL_M e3, L1[r1]
- mov eax, r9d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm7, xmm12
- maxpd xmm7, xmm13
- ; IADD_R r0, r4
- add r8, r12
- ; FPSUB_M f3, L1[r1]
- mov eax, r9d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm3, xmm12
- ; FPMUL_R e2, a0
- mulpd xmm6, xmm8
- ; INEG_R r2
- neg r10
- ; FPMUL_R e2, a2
- mulpd xmm6, xmm10
- ; FPSUB_M f3, L1[r6]
- mov eax, r14d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm3, xmm12
- ; FPADD_R f1, a3
- addpd xmm1, xmm11
- ; IMULH_R r3, r2
- mov rax, r11
- mul r10
- mov r11, rdx
- ; FPSUB_R f0, a3
- subpd xmm0, xmm11
- ; IDIV_C r5, 2887845607
- mov rax, 13717520480010955377
- mul r13
- shr rdx, 31
- add r13, rdx
- ; ISMULH_M r6, L1[r2]
- mov ecx, r10d
- and ecx, 16376
- mov rax, r14
- imul qword ptr [rsi+rcx]
- mov r14, rdx
- ; FPSUB_R f3, a3
- subpd xmm3, xmm11
- ; IMUL_M r6, L1[r7]
- mov eax, r15d
- and eax, 16376
- imul r14, qword ptr [rsi+rax]
- ; FPNEG_R f0
- xorps xmm0, xmm15
- ; FPMUL_R e2, a0
- mulpd xmm6, xmm8
- ; IMUL_9C r6, 295130073
- lea r14, [r14+r14*8+295130073]
- ; FPADD_R f1, a1
- addpd xmm1, xmm9
- ; IXOR_R r0, r5
- xor r8, r13
- ; FPADD_R f2, a1
- addpd xmm2, xmm9
- ; FPSWAP_R e3
- shufpd xmm7, xmm7, 1
- ; FPSQRT_R e3
- sqrtpd xmm7, xmm7
- ; IADD_RC r3, r6, -1317630728
- lea r11, [r11+r14-1317630728]
- ; IMUL_M r2, L1[r3]
- mov eax, r11d
- and eax, 16376
- imul r10, qword ptr [rsi+rax]
- ; IADD_RC r1, r4, 894105694
- lea r9, [r9+r12+894105694]
- ; IMUL_R r7, r0
- imul r15, r8
- ; FPSUB_R f1, a0
- subpd xmm1, xmm8
- ; IMUL_M r7, L1[r1]
- mov eax, r9d
- and eax, 16376
- imul r15, qword ptr [rsi+rax]
- ; IXOR_R r2, r4
- xor r10, r12
- ; ISUB_M r0, L1[r1]
- mov eax, r9d
- and eax, 16376
- sub r8, qword ptr [rsi+rax]
- ; INEG_R r4
- neg r12
- ; IMUL_9C r4, -285272388
- lea r12, [r12+r12*8-285272388]
- ; IMUL_R r7, r4
- imul r15, r12
- ; IMULH_M r5, L1[r7]
- mov ecx, r15d
- and ecx, 16376
- mov rax, r13
- mul qword ptr [rsi+rcx]
- mov r13, rdx
- ; IROL_R r1, r7
- mov ecx, r15d
- rol r9, cl
- ; IXOR_R r4, -757532727
- xor r12, -757532727
- ; IMUL_R r3, 1863959234
- imul r11, 1863959234
- ; IROL_R r4, 59
- rol r12, 59
- ; ISMULH_R r1, 2122681086
- mov rax, 2122681086
- imul r9
- add r9, rdx
- ; ISTORE L2[r6], r7
- mov eax, r14d
- and eax, 262136
- mov qword ptr [rsi+rax], r15
- ; ISTORE L1[r1], r5
- mov eax, r9d
- and eax, 16376
- mov qword ptr [rsi+rax], r13
- ; FPMUL_R e0, a1
- mulpd xmm4, xmm9
- ; COND_R r2, ns(r1, 486049737)
- xor ecx, ecx
- cmp r9d, 486049737
- setns cl
- add r10, rcx
- ; FPMUL_M e0, L2[r7]
- mov eax, r15d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm4, xmm12
- maxpd xmm4, xmm13
- ; FPMUL_R e3, a2
- mulpd xmm7, xmm10
- ; IROL_R r5, r2
- mov ecx, r10d
- rol r13, cl
- ; IADD_M r0, L1[r4]
- mov eax, r12d
- and eax, 16376
- add r8, qword ptr [rsi+rax]