NOP instruction

register load/store from L3
This commit is contained in:
tevador 2019-01-27 18:19:49 +01:00
parent 005c67f64c
commit 8f2abd6c05
15 changed files with 233 additions and 624 deletions

View File

@ -491,6 +491,10 @@ namespace RandomX {
asmCode << "\tmovapd xmmword ptr [rsi+rax], " << regFE[instr.src] << std::endl; asmCode << "\tmovapd xmmword ptr [rsi+rax], " << regFE[instr.src] << std::endl;
} }
// Handler for the NOP instruction: appends one tab-indented "nop" line to
// the generated assembly listing. Neither the instruction operands nor the
// program index are consulted.
void AssemblyGeneratorX86::h_NOP(Instruction& instr, int i) {
	asmCode << "\tnop";
	// Terminate the line (and flush), as the other handlers do.
	asmCode << std::endl;
}
#include "instructionWeights.hpp" #include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x)) #define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x))
@ -540,5 +544,7 @@ namespace RandomX {
INST_HANDLE(ISTORE) INST_HANDLE(ISTORE)
INST_HANDLE(FSTORE) INST_HANDLE(FSTORE)
INST_HANDLE(NOP)
}; };
} }

View File

@ -79,5 +79,6 @@ namespace RandomX {
void h_CFROUND(Instruction&, int); void h_CFROUND(Instruction&, int);
void h_ISTORE(Instruction&, int); void h_ISTORE(Instruction&, int);
void h_FSTORE(Instruction&, int); void h_FSTORE(Instruction&, int);
void h_NOP(Instruction&, int);
}; };
} }

View File

@ -327,6 +327,10 @@ namespace RandomX {
os << ", " << reg << srcIndex << std::endl; os << ", " << reg << srcIndex << std::endl;
} }
// Visualizer for the NOP instruction: there are no operands to print, so
// only the line terminator is written (followed by a flush).
void Instruction::h_NOP(std::ostream& os) const {
	// Invoking the manipulator directly is equivalent to streaming it.
	std::endl(os);
}
#include "instructionWeights.hpp" #include "instructionWeights.hpp"
#define INST_NAME(x) REPN(#x, WT(x)) #define INST_NAME(x) REPN(#x, WT(x))
#define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x)) #define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x))
@ -377,6 +381,8 @@ namespace RandomX {
INST_NAME(ISTORE) INST_NAME(ISTORE)
INST_NAME(FSTORE) INST_NAME(FSTORE)
INST_NAME(NOP)
}; };
InstructionVisualizer Instruction::engine[256] = { InstructionVisualizer Instruction::engine[256] = {
@ -425,6 +431,8 @@ namespace RandomX {
INST_HANDLE(ISTORE) INST_HANDLE(ISTORE)
INST_HANDLE(FSTORE) INST_HANDLE(FSTORE)
INST_HANDLE(NOP)
}; };
} }

View File

@ -86,6 +86,7 @@ namespace RandomX {
void h_CFROUND(std::ostream&) const; void h_CFROUND(std::ostream&) const;
void h_ISTORE(std::ostream&) const; void h_ISTORE(std::ostream&) const;
void h_FSTORE(std::ostream&) const; void h_FSTORE(std::ostream&) const;
void h_NOP(std::ostream&) const;
}; };
static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction"); static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction");

View File

@ -181,7 +181,7 @@ namespace RandomX {
static const uint8_t JMP = 0xe9; static const uint8_t JMP = 0xe9;
size_t JitCompilerX86::getCodeSize() { size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize + readDatasetSize; return codePos - prologueSize;
} }
JitCompilerX86::JitCompilerX86() { JitCompilerX86::JitCompilerX86() {
@ -761,6 +761,10 @@ namespace RandomX {
emitByte(0x06); emitByte(0x06);
} }
// JIT handler for the NOP instruction: emits the single byte 0x90 (the
// one-byte x86 `nop` encoding) into the code buffer. The instruction
// operands are not used.
void JitCompilerX86::h_NOP(Instruction& instr) {
	constexpr int nopOpcode = 0x90;
	emitByte(nopOpcode);
}
#include "instructionWeights.hpp" #include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x))
@ -800,6 +804,7 @@ namespace RandomX {
INST_HANDLE(CFROUND) INST_HANDLE(CFROUND)
INST_HANDLE(ISTORE) INST_HANDLE(ISTORE)
INST_HANDLE(FSTORE) INST_HANDLE(FSTORE)
INST_HANDLE(NOP)
}; };

View File

@ -125,6 +125,7 @@ namespace RandomX {
void h_CFROUND(Instruction&); void h_CFROUND(Instruction&);
void h_ISTORE(Instruction&); void h_ISTORE(Instruction&);
void h_FSTORE(Instruction&); void h_FSTORE(Instruction&);
void h_NOP(Instruction&);
}; };
} }

View File

@ -1,4 +1,4 @@
and eax, 262080 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm1, qword ptr [rcx+8]

View File

@ -1,4 +1,4 @@
and eax, 262080 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
xor r8, qword ptr [rcx+0] xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8] xor r9, qword ptr [rcx+8]

View File

@ -1,4 +1,4 @@
and eax, 262080 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
mulpd xmm0, xmm4 mulpd xmm0, xmm4
mulpd xmm1, xmm5 mulpd xmm1, xmm5

View File

@ -1,4 +1,4 @@
and eax, 262080 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
mov qword ptr [rcx+0], r8 mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9 mov qword ptr [rcx+8], r9

View File

@ -72,7 +72,7 @@ namespace RandomX {
convertible_t hi; convertible_t hi;
}; };
constexpr int ProgramLength = 256; constexpr int ProgramLength = 128;
constexpr uint32_t InstructionCount = 1024; constexpr uint32_t InstructionCount = 1024;
constexpr uint32_t ScratchpadSize = 1024 * 1024; constexpr uint32_t ScratchpadSize = 1024 * 1024;
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);

View File

@ -119,7 +119,7 @@ signMask:
ALIGN 64 ALIGN 64
program_begin: program_begin:
xor eax, r8d ;# read address register 1 xor eax, r8d ;# read address register 1
and eax, 262080 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
xor r8, qword ptr [rcx+0] xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8] xor r9, qword ptr [rcx+8]
@ -130,7 +130,7 @@ program_begin:
xor r14, qword ptr [rcx+48] xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56] xor r15, qword ptr [rcx+56]
xor eax, r9d ;# read address register 2 xor eax, r9d ;# read address register 2
and eax, 262080 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm1, qword ptr [rcx+8]
@ -166,7 +166,7 @@ program_begin:
xor r14, qword ptr [rcx+48] xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56] xor r15, qword ptr [rcx+56]
mov eax, r12d ;# write address register 1 mov eax, r12d ;# write address register 1
and eax, 262080 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
mov qword ptr [rcx+0], r8 mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9 mov qword ptr [rcx+8], r9
@ -177,7 +177,7 @@ program_begin:
mov qword ptr [rcx+48], r14 mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15 mov qword ptr [rcx+56], r15
xor eax, r13d ;# write address register 2 xor eax, r13d ;# write address register 2
and eax, 262080 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
mulpd xmm0, xmm4 mulpd xmm0, xmm4
mulpd xmm1, xmm5 mulpd xmm1, xmm5

View File

@ -20,51 +20,51 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once #pragma once
//Integer //Integer
#define WT_IADD_R 10 #define WT_IADD_R 12
#define WT_IADD_M 3 #define WT_IADD_M 3
#define WT_IADD_RC 10 #define WT_IADD_RC 12
#define WT_ISUB_R 10 #define WT_ISUB_R 12
#define WT_ISUB_M 3 #define WT_ISUB_M 3
#define WT_IMUL_9C 10 #define WT_IMUL_9C 10
#define WT_IMUL_R 20 #define WT_IMUL_R 16
#define WT_IMUL_M 6 #define WT_IMUL_M 4
#define WT_IMULH_R 6 #define WT_IMULH_R 4
#define WT_IMULH_M 2 #define WT_IMULH_M 1
#define WT_ISMULH_R 6 #define WT_ISMULH_R 4
#define WT_ISMULH_M 2 #define WT_ISMULH_M 1
#define WT_IDIV_C 4 #define WT_IDIV_C 4
#define WT_ISDIV_C 4 #define WT_ISDIV_C 4
#define WT_INEG_R 2 #define WT_INEG_R 2
#define WT_IXOR_R 12 #define WT_IXOR_R 12
#define WT_IXOR_M 4 #define WT_IXOR_M 3
#define WT_IROR_R 10 #define WT_IROR_R 12
#define WT_IROL_R 10 #define WT_IROL_R 12
//Common floating point //Common floating point
#define WT_FPSWAP_R 6 #define WT_FPSWAP_R 8
//Floating point group F //Floating point group F
#define WT_FPADD_R 18 #define WT_FPADD_R 20
#define WT_FPADD_M 3 #define WT_FPADD_M 5
#define WT_FPSUB_R 18 #define WT_FPSUB_R 20
#define WT_FPSUB_M 3 #define WT_FPSUB_M 5
#define WT_FPNEG_R 5 #define WT_FPNEG_R 6
//Floating point group E //Floating point group E
#define WT_FPMUL_R 18 #define WT_FPMUL_R 16
#define WT_FPMUL_M 3 #define WT_FPMUL_M 4
#define WT_FPDIV_R 6 #define WT_FPDIV_R 7
#define WT_FPDIV_M 1 #define WT_FPDIV_M 1
#define WT_FPSQRT_R 6 #define WT_FPSQRT_R 6
//Control //Control
#define WT_COND_R 12 #define WT_COND_R 7
#define WT_COND_M 4 #define WT_COND_M 1
#define WT_CFROUND 1 #define WT_CFROUND 1
//Store //Store
#define WT_ISTORE 12 #define WT_ISTORE 18
#define WT_FSTORE 6 #define WT_FSTORE 0
#define WT_NOP 0 #define WT_NOP 0
@ -115,6 +115,7 @@ static_assert(wtSum == 256,
#define REP33(x) REP32(x) x, #define REP33(x) REP32(x) x,
#define REP40(x) REP32(x) REP8(x) #define REP40(x) REP32(x) REP8(x)
#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x) #define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
#define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x)
#define REP256(x) REP128(x) REP128(x) #define REP256(x) REP128(x) REP128(x)
#define REPNX(x,N) REP##N(x) #define REPNX(x,N) REP##N(x)
#define REPN(x,N) REPNX(x,N) #define REPN(x,N) REPNX(x,N)

View File

@ -169,12 +169,10 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8);
vm->initializeScratchpad(scratchpad, spIndex); vm->initializeScratchpad(scratchpad, spIndex);
//vm->initializeProgram(hash); vm->setScratchpad(scratchpad);
//dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt");
for (int chain = 0; chain < 16; ++chain) { for (int chain = 0; chain < 16; ++chain) {
vm->initializeProgram(hash); vm->initializeProgram(hash);
int segment = hash[3] & 3;
vm->setScratchpad(scratchpad + segment * RandomX::ScratchpadSize / 4);
vm->execute(); vm->execute();
vm->getResult(nullptr, 0, hash); vm->getResult(nullptr, 0, hash);
} }

View File

@ -10,54 +10,54 @@
mulpd xmm6, xmm10 mulpd xmm6, xmm10
; IMUL_R r6, r3 ; IMUL_R r6, r3
imul r14, r11 imul r14, r11
; FPMUL_R e1, a0 ; FPSUB_M f1, L1[r4]
mulpd xmm5, xmm8 mov eax, r12d
; IROR_R r5, r3 and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm1, xmm12
; IROL_R r5, r3
mov ecx, r11d mov ecx, r11d
ror r13, cl rol r13, cl
; FPMUL_R e2, a0 ; FPMUL_R e2, a0
mulpd xmm6, xmm8 mulpd xmm6, xmm8
; FPNEG_R f3 ; FPSUB_R f3, a0
xorps xmm3, xmm15 subpd xmm3, xmm8
; IXOR_R r0, r4 ; IXOR_R r0, r4
xor r8, r12 xor r8, r12
; ISMULH_R r3, r7 ; ISMULH_M r3, L1[r7]
mov ecx, r15d
and ecx, 16376
mov rax, r11 mov rax, r11
imul r15 imul qword ptr [rsi+rcx]
mov r11, rdx mov r11, rdx
; FPSWAP_R f2 ; FPSWAP_R f2
shufpd xmm2, xmm2, 1 shufpd xmm2, xmm2, 1
; ISMULH_R r6, r0 ; IDIV_C r6, 1248528248
mov rax, r14 mov rax, 15864311168205210203
imul r8 mul r14
mov r14, rdx shr rdx, 30
add r14, rdx
; FPMUL_R e0, a2 ; FPMUL_R e0, a2
mulpd xmm4, xmm10 mulpd xmm4, xmm10
; ISUB_R r3, r4 ; IADD_RC r3, r4, -52260428
sub r11, r12 lea r11, [r11+r12-52260428]
; IADD_R r7, -1138617760 ; IADD_R r7, -1138617760
add r15, -1138617760 add r15, -1138617760
; IROR_R r2, r6 ; IROL_R r2, r6
mov ecx, r14d mov ecx, r14d
ror r10, cl rol r10, cl
; FPMUL_R e2, a1 ; FPNEG_R f2
mulpd xmm6, xmm9 xorps xmm2, xmm15
; IROR_R r7, r1 ; IROR_R r7, r1
mov ecx, r9d mov ecx, r9d
ror r15, cl ror r15, cl
; COND_M r2, lt(L1[r7], -41618808) ; COND_R r2, lt(r7, -41618808)
xor ecx, ecx xor ecx, ecx
mov eax, r15d cmp r15d, -41618808
and eax, 16376
cmp dword ptr [rsi+rax], -41618808
setl cl setl cl
add r10, rcx add r10, rcx
; FPMUL_M e3, L1[r0] ; FPMUL_R e3, a0
mov eax, r8d mulpd xmm7, xmm8
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
mulpd xmm7, xmm12
maxpd xmm7, xmm13
; CFROUND r1, 43 ; CFROUND r1, 43
mov rax, r9 mov rax, r9
rol rax, 34 rol rax, 34
@ -67,14 +67,17 @@
ldmxcsr dword ptr [rsp-8] ldmxcsr dword ptr [rsp-8]
; FPADD_R f2, a1 ; FPADD_R f2, a1
addpd xmm2, xmm9 addpd xmm2, xmm9
; FPNEG_R f0 ; FPSUB_M f0, L1[r7]
xorps xmm0, xmm15 mov eax, r15d
; FSTORE L1[r6], f2 and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm0, xmm12
; ISTORE L1[r6], r2
mov eax, r14d mov eax, r14d
and eax, 16368 and eax, 16376
movapd xmmword ptr [rsi+rax], xmm2 mov qword ptr [rsi+rax], r10
; IMUL_9C r6, -45112665 ; ISUB_R r6, r5
lea r14, [r14+r14*8-45112665] sub r14, r13
; IADD_M r0, L1[r4] ; IADD_M r0, L1[r4]
mov eax, r12d mov eax, r12d
and eax, 16376 and eax, 16376
@ -87,41 +90,30 @@
mov eax, r14d mov eax, r14d
and eax, 16376 and eax, 16376
mov qword ptr [rsi+rax], r14 mov qword ptr [rsi+rax], r14
; COND_R r4, sg(r1, -1189096105) ; FPSQRT_R e0
xor ecx, ecx sqrtpd xmm4, xmm4
cmp r9d, -1189096105
sets cl
add r12, rcx
; IXOR_R r2, r5 ; IXOR_R r2, r5
xor r10, r13 xor r10, r13
; COND_R r1, be(r5, -965180434) ; FPSQRT_R e1
xor ecx, ecx sqrtpd xmm5, xmm5
cmp r13d, -965180434 ; FPMUL_R e1, a3
setbe cl mulpd xmm5, xmm11
add r9, rcx
; FPMUL_M e1, L2[r3]
mov eax, r11d
and eax, 262136
cvtdq2pd xmm12, qword ptr [rsi+rax]
mulpd xmm5, xmm12
maxpd xmm5, xmm13
; IMULH_R r7, r6 ; IMULH_R r7, r6
mov rax, r15 mov rax, r15
mul r14 mul r14
mov r15, rdx mov r15, rdx
; ISMULH_M r0, L1[r4] ; ISDIV_C r0, -1706892622
mov ecx, r12d mov rax, -5802075764249827661
and ecx, 16376 imul r8
mov rax, r8 xor eax, eax
imul qword ptr [rsi+rcx] sar rdx, 29
mov r8, rdx sets al
add rdx, rax
add r8, rdx
; IMUL_R r5, r3 ; IMUL_R r5, r3
imul r13, r11 imul r13, r11
; COND_R r2, of(r0, -1045938770) ; FPSQRT_R e2
xor ecx, ecx sqrtpd xmm6, xmm6
cmp r8d, -1045938770
seto cl
add r10, rcx
; FPADD_M f3, L1[r4] ; FPADD_M f3, L1[r4]
mov eax, r12d mov eax, r12d
and eax, 16376 and eax, 16376
@ -131,18 +123,19 @@
add r11, r10 add r11, r10
; FPADD_R f1, a0 ; FPADD_R f1, a0
addpd xmm1, xmm8 addpd xmm1, xmm8
; FPSQRT_R e3 ; FPDIV_R e3, a2
sqrtpd xmm7, xmm7 divpd xmm7, xmm10
maxpd xmm7, xmm13
; FPSUB_R f0, a1 ; FPSUB_R f0, a1
subpd xmm0, xmm9 subpd xmm0, xmm9
; IMUL_M r5, L1[r6] ; IMUL_M r5, L1[r6]
mov eax, r14d mov eax, r14d
and eax, 16376 and eax, 16376
imul r13, qword ptr [rsi+rax] imul r13, qword ptr [rsi+rax]
; ISUB_R r1, r2 ; IADD_RC r1, r2, -1263285243
sub r9, r10 lea r9, [r9+r10-1263285243]
; IMUL_R r4, r6 ; IMUL_9C r4, 1994773931
imul r12, r14 lea r12, [r12+r12*8+1994773931]
; FPSWAP_R e3 ; FPSWAP_R e3
shufpd xmm7, xmm7, 1 shufpd xmm7, xmm7, 1
; IMUL_M r0, L1[r7] ; IMUL_M r0, L1[r7]
@ -152,69 +145,72 @@
; IROR_R r1, r6 ; IROR_R r1, r6
mov ecx, r14d mov ecx, r14d
ror r9, cl ror r9, cl
; IROR_R r2, r4 ; IROL_R r2, r4
mov ecx, r12d mov ecx, r12d
ror r10, cl rol r10, cl
; FPSUB_R f3, a1 ; FPSUB_R f3, a1
subpd xmm3, xmm9 subpd xmm3, xmm9
; FSTORE L1[r0], e1 ; ISTORE L1[r0], r5
mov eax, r8d mov eax, r8d
and eax, 16368 and eax, 16376
movapd xmmword ptr [rsi+rax], xmm5 mov qword ptr [rsi+rax], r13
; COND_R r2, sg(r3, 1269153133) ; FPDIV_M e2, L2[r3]
xor ecx, ecx mov eax, r11d
cmp r11d, 1269153133 and eax, 262136
sets cl cvtdq2pd xmm12, qword ptr [rsi+rax]
add r10, rcx divpd xmm6, xmm12
maxpd xmm6, xmm13
; FPSWAP_R f2 ; FPSWAP_R f2
shufpd xmm2, xmm2, 1 shufpd xmm2, xmm2, 1
; IADD_R r7, r5 ; IADD_R r7, r5
add r15, r13 add r15, r13
; COND_R r0, be(r4, -1486502150) ; FPDIV_M e0, L1[r4]
xor ecx, ecx mov eax, r12d
cmp r12d, -1486502150 and eax, 16376
setbe cl cvtdq2pd xmm12, qword ptr [rsi+rax]
add r8, rcx divpd xmm4, xmm12
; FPSUB_R f3, a1 maxpd xmm4, xmm13
subpd xmm3, xmm9 ; FPADD_M f3, L1[r5]
mov eax, r13d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
addpd xmm3, xmm12
; FPADD_R f0, a3 ; FPADD_R f0, a3
addpd xmm0, xmm11 addpd xmm0, xmm11
; IADD_R r2, r0 ; IADD_R r2, r0
add r10, r8 add r10, r8
; FSTORE L1[r3], e2 ; ISTORE L1[r3], r6
mov eax, r11d mov eax, r11d
and eax, 16368 and eax, 16376
movapd xmmword ptr [rsi+rax], xmm6 mov qword ptr [rsi+rax], r14
; IXOR_R r1, r7 ; IROR_R r1, r7
xor r9, r15 mov ecx, r15d
; IMUL_R r5, r7 ror r9, cl
imul r13, r15 ; IMUL_9C r5, 301671287
lea r13, [r13+r13*8+301671287]
; IXOR_R r7, 266992378 ; IXOR_R r7, 266992378
xor r15, 266992378 xor r15, 266992378
; COND_R r7, no(r4, 1983804692) ; FPSQRT_R e3
xor ecx, ecx sqrtpd xmm7, xmm7
cmp r12d, 1983804692
setno cl
add r15, rcx
; IMUL_M r2, L2[r0] ; IMUL_M r2, L2[r0]
mov eax, r8d mov eax, r8d
and eax, 262136 and eax, 262136
imul r10, qword ptr [rsi+rax] imul r10, qword ptr [rsi+rax]
; FPDIV_R e3, a2 ; FPMUL_R e3, a2
divpd xmm7, xmm10 mulpd xmm7, xmm10
maxpd xmm7, xmm13 ; IMUL_R r0, r6
; IMUL_M r0, L2[r6] imul r8, r14
mov eax, r14d
and eax, 262136
imul r8, qword ptr [rsi+rax]
; ISTORE L1[r0], r7 ; ISTORE L1[r0], r7
mov eax, r8d mov eax, r8d
and eax, 16376 and eax, 16376
mov qword ptr [rsi+rax], r15 mov qword ptr [rsi+rax], r15
; FPMUL_R e0, a1 ; FPNEG_R f0
mulpd xmm4, xmm9 xorps xmm0, xmm15
; FPSUB_R f3, a1 ; FPADD_M f3, L1[r5]
subpd xmm3, xmm9 mov eax, r13d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
addpd xmm3, xmm12
; IROR_R r5, r4 ; IROR_R r5, r4
mov ecx, r12d mov ecx, r12d
ror r13, cl ror r13, cl
@ -222,17 +218,20 @@
mov eax, r15d mov eax, r15d
and eax, 262136 and eax, 262136
mov qword ptr [rsi+rax], r10 mov qword ptr [rsi+rax], r10
; FPSWAP_R e2 ; FPADD_R f2, a3
shufpd xmm6, xmm6, 1 addpd xmm2, xmm11
; FPADD_M f3, L1[r2] ; FPADD_M f3, L1[r2]
mov eax, r10d mov eax, r10d
and eax, 16376 and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax] cvtdq2pd xmm12, qword ptr [rsi+rax]
addpd xmm3, xmm12 addpd xmm3, xmm12
; IDIV_C r5, 2218798981 ; ISDIV_C r5, -2076168315
mov rax, 17853839665672790751 mov rax, -4770095103914078469
mul r13 imul r13
shr rdx, 31 xor eax, eax
sar rdx, 29
sets al
add rdx, rax
add r13, rdx add r13, rdx
; IADD_RC r0, r4, -1321374359 ; IADD_RC r0, r4, -1321374359
lea r8, [r8+r12-1321374359] lea r8, [r8+r12-1321374359]
@ -250,28 +249,26 @@
rol r15, cl rol r15, cl
; ISUB_R r2, r4 ; ISUB_R r2, r4
sub r10, r12 sub r10, r12
; IMULH_M r0, L1[12400] ; ISMULH_R r0, -1500893068
mov rax, r8 mov rax, -1500893068
mul qword ptr [rsi+12400] imul r8
mov r8, rdx add r8, rdx
; IADD_R r2, r3 ; IADD_R r2, r3
add r10, r11 add r10, r11
; COND_R r6, lt(r1, -1124202227) ; FPSQRT_R e2
xor ecx, ecx sqrtpd xmm6, xmm6
cmp r9d, -1124202227 ; IROL_R r7, r4
setl cl
add r14, rcx
; IROR_R r7, r4
mov ecx, r12d mov ecx, r12d
ror r15, cl rol r15, cl
; IMUL_R r4, r2 ; IMUL_R r4, r2
imul r12, r10 imul r12, r10
; ISUB_R r3, r7 ; ISUB_R r3, r7
sub r11, r15 sub r11, r15
; IADD_R r2, r7 ; IADD_R r2, r7
add r10, r15 add r10, r15
; FPSQRT_R e3 ; FPDIV_R e3, a0
sqrtpd xmm7, xmm7 divpd xmm7, xmm8
maxpd xmm7, xmm13
; ISUB_R r6, 540663146 ; ISUB_R r6, 540663146
sub r14, 540663146 sub r14, 540663146
; IROL_R r5, 58 ; IROL_R r5, 58
@ -280,67 +277,65 @@
addpd xmm2, xmm9 addpd xmm2, xmm9
; FPADD_R f2, a2 ; FPADD_R f2, a2
addpd xmm2, xmm10 addpd xmm2, xmm10
; FPSQRT_R e1 ; FPDIV_R e1, a2
sqrtpd xmm5, xmm5 divpd xmm5, xmm10
maxpd xmm5, xmm13
; FPADD_R f1, a2 ; FPADD_R f1, a2
addpd xmm1, xmm10 addpd xmm1, xmm10
; IADD_R r5, r3 ; IADD_R r5, r3
add r13, r11 add r13, r11
; IADD_M r7, L1[880] ; IADD_R r7, -1780268176
add r15, qword ptr [rsi+880] add r15, -1780268176
; ISUB_R r7, r0 ; ISUB_R r7, r0
sub r15, r8 sub r15, r8
; ISTORE L2[r0], r7 ; ISTORE L2[r0], r7
mov eax, r8d mov eax, r8d
and eax, 262136 and eax, 262136
mov qword ptr [rsi+rax], r15 mov qword ptr [rsi+rax], r15
; IDIV_C r2, 1014940364 ; INEG_R r2
mov rax, r10 neg r10
shr rax, 2 ; FPNEG_R f0
mov rcx, 1219717022984988185 xorps xmm0, xmm15
mul rcx ; INEG_R r2
shr rdx, 24 neg r10
add r10, rdx
; FPMUL_R e0, a2
mulpd xmm4, xmm10
; IDIV_C r2, 3059159304
mov rax, 12949335853590502915
mul r10
shr rdx, 31
add r10, rdx
; IADD_R r0, r3 ; IADD_R r0, r3
add r8, r11 add r8, r11
; IMUL_9C r7, -2124093035 ; IMUL_9C r7, -2124093035
lea r15, [r15+r15*8-2124093035] lea r15, [r15+r15*8-2124093035]
; FPSUB_R f2, a0 ; FPADD_M f2, L1[r0]
subpd xmm2, xmm8 mov eax, r8d
; FPDIV_R e0, a2 and eax, 16376
divpd xmm4, xmm10 cvtdq2pd xmm12, qword ptr [rsi+rax]
addpd xmm2, xmm12
; FPMUL_M e0, L1[r6]
mov eax, r14d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
mulpd xmm4, xmm12
maxpd xmm4, xmm13 maxpd xmm4, xmm13
; FPSUB_R f2, a3 ; FPSUB_R f2, a3
subpd xmm2, xmm11 subpd xmm2, xmm11
; IMUL_R r1, r2 ; IMUL_R r1, r2
imul r9, r10 imul r9, r10
; ISMULH_R r7, r5 ; IDIV_C r7, 3214009572
mov rax, r15 mov rax, 12325439725582798855
imul r13 mul r15
mov r15, rdx shr rdx, 31
add r15, rdx
; IMULH_R r3, r2 ; IMULH_R r3, r2
mov rax, r11 mov rax, r11
mul r10 mul r10
mov r11, rdx mov r11, rdx
; IXOR_M r1, L2[r0] ; IROR_R r1, r0
mov eax, r8d mov ecx, r8d
and eax, 262136 ror r9, cl
xor r9, qword ptr [rsi+rax]
; FPMUL_R e0, a1 ; FPMUL_R e0, a1
mulpd xmm4, xmm9 mulpd xmm4, xmm9
; ISUB_R r4, 1456841848 ; IADD_RC r4, r4, 1456841848
sub r12, 1456841848 lea r12, [r12+r12+1456841848]
; IXOR_M r3, L2[r2] ; IROR_R r3, r2
mov eax, r10d mov ecx, r10d
and eax, 262136 ror r11, cl
xor r11, qword ptr [rsi+rax]
; COND_M r0, of(L1[r4], 1678513610) ; COND_M r0, of(L1[r4], 1678513610)
xor ecx, ecx xor ecx, ecx
mov eax, r12d mov eax, r12d
@ -348,446 +343,39 @@
cmp dword ptr [rsi+rax], 1678513610 cmp dword ptr [rsi+rax], 1678513610
seto cl seto cl
add r8, rcx add r8, rcx
; IDIV_C r4, 2674394209 ; INEG_R r4
mov rax, 925772300223658071 neg r12
mul r12
shr rdx, 27
add r12, rdx
; IMUL_R r4, r1 ; IMUL_R r4, r1
imul r12, r9 imul r12, r9
; FPADD_R f1, a2 ; FPADD_R f1, a2
addpd xmm1, xmm10 addpd xmm1, xmm10
; FPSUB_R f2, a0 ; FPSUB_R f2, a0
subpd xmm2, xmm8 subpd xmm2, xmm8
; FPMUL_M e1, L2[r6] ; FPMUL_R e1, a2
mov eax, r14d mulpd xmm5, xmm10
and eax, 262136 ; FPSUB_R f0, a3
cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm0, xmm11
mulpd xmm5, xmm12
maxpd xmm5, xmm13
; FPSUB_M f0, L2[r3]
mov eax, r11d
and eax, 262136
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm0, xmm12
; IROR_R r0, r7 ; IROR_R r0, r7
mov ecx, r15d mov ecx, r15d
ror r8, cl ror r8, cl
; FSTORE L2[r1], e0 ; ISTORE L2[r1], r4
mov eax, r9d mov eax, r9d
and eax, 262128 and eax, 262136
movapd xmmword ptr [rsi+rax], xmm4 mov qword ptr [rsi+rax], r12
; IROR_R r7, r6 ; IROL_R r7, r6
mov ecx, r14d mov ecx, r14d
ror r15, cl rol r15, cl
; IMUL_9C r2, 266593902 ; IMUL_9C r2, 266593902
lea r10, [r10+r10*8+266593902] lea r10, [r10+r10*8+266593902]
; IMUL_R r4, r6 ; IMUL_R r4, r6
imul r12, r14 imul r12, r14
; FPSUB_R f2, a2 ; FPSUB_R f2, a2
subpd xmm2, xmm10 subpd xmm2, xmm10
; FPMUL_R e3, a0 ; FPNEG_R f3
mulpd xmm7, xmm8 xorps xmm3, xmm15
; IXOR_M r7, L1[r2] ; IROR_R r7, r2
mov eax, r10d mov ecx, r10d
and eax, 16376 ror r15, cl
xor r15, qword ptr [rsi+rax]
; IROR_R r0, r5 ; IROR_R r0, r5
mov ecx, r13d mov ecx, r13d
ror r8, cl ror r8, cl
; FPADD_R f1, a2
addpd xmm1, xmm10
; FPSQRT_R e3
sqrtpd xmm7, xmm7
; FPADD_R f3, a1
addpd xmm3, xmm9
; FPADD_R f1, a0
addpd xmm1, xmm8
; COND_M r2, ge(L2[r2], -226330940)
xor ecx, ecx
mov eax, r10d
and eax, 262136
cmp dword ptr [rsi+rax], -226330940
setge cl
add r10, rcx
; FPDIV_R e2, a3
divpd xmm6, xmm11
maxpd xmm6, xmm13
; FPMUL_R e2, a1
mulpd xmm6, xmm9
; FPSUB_R f1, a0
subpd xmm1, xmm8
; IMUL_R r7, r5
imul r15, r13
; IMUL_R r0, r1
imul r8, r9
; FPSUB_R f3, a1
subpd xmm3, xmm9
; IROL_R r3, r5
mov ecx, r13d
rol r11, cl
; IADD_RC r5, r2, 795784298
lea r13, [r13+r10+795784298]
; ISUB_R r0, r4
sub r8, r12
; IMUL_R r5, r4
imul r13, r12
; FPSUB_R f0, a2
subpd xmm0, xmm10
; FPMUL_R e3, a1
mulpd xmm7, xmm9
; ISDIV_C r3, 1662492575
mov rax, 2978515652703905219
imul r11
xor eax, eax
sar rdx, 28
sets al
add rdx, rax
add r11, rdx
; ISMULH_R r5, r0
mov rax, r13
imul r8
mov r13, rdx
; ISDIV_C r4, 1963597892
mov rax, -8359627607928540073
imul r12
xor eax, eax
add rdx, r12
sar rdx, 30
sets al
add rdx, rax
add r12, rdx
; IMUL_R r7, r0
imul r15, r8
; IMULH_M r0, L1[r3]
mov ecx, r11d
and ecx, 16376
mov rax, r8
mul qword ptr [rsi+rcx]
mov r8, rdx
; IXOR_R r3, r7
xor r11, r15
; IDIV_C r4, 1146125335
mov rax, 8640870253760721727
mul r12
shr rdx, 29
add r12, rdx
; FPSWAP_R f3
shufpd xmm3, xmm3, 1
; IXOR_M r2, L1[r0]
mov eax, r8d
and eax, 16376
xor r10, qword ptr [rsi+rax]
; IROR_R r0, r1
mov ecx, r9d
ror r8, cl
; IXOR_R r7, r4
xor r15, r12
; ISMULH_R r6, r2
mov rax, r14
imul r10
mov r14, rdx
; FPMUL_R e3, a2
mulpd xmm7, xmm10
; IADD_RC r4, r2, 1704868083
lea r12, [r12+r10+1704868083]
; FPSUB_R f2, a0
subpd xmm2, xmm8
; ISTORE L1[r0], r0
mov eax, r8d
and eax, 16376
mov qword ptr [rsi+rax], r8
; FPSUB_R f0, a3
subpd xmm0, xmm11
; FPDIV_R e0, a3
divpd xmm4, xmm11
maxpd xmm4, xmm13
; FPMUL_R e3, a2
mulpd xmm7, xmm10
; ISUB_R r7, 1302457878
sub r15, 1302457878
; IMUL_9C r1, 1330165941
lea r9, [r9+r9*8+1330165941]
; FPMUL_R e1, a3
mulpd xmm5, xmm11
; IROL_R r0, r4
mov ecx, r12d
rol r8, cl
; FPSUB_M f1, L1[r0]
mov eax, r8d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm1, xmm12
; IROL_R r5, r6
mov ecx, r14d
rol r13, cl
; COND_M r0, ab(L1[r1], -310933871)
xor ecx, ecx
mov eax, r9d
and eax, 16376
cmp dword ptr [rsi+rax], -310933871
seta cl
add r8, rcx
; CFROUND r7, 39
mov rax, r15
rol rax, 38
and eax, 24576
or eax, 40896
mov dword ptr [rsp-8], eax
ldmxcsr dword ptr [rsp-8]
; FPDIV_R e0, a1
divpd xmm4, xmm9
maxpd xmm4, xmm13
; IMUL_M r1, L1[r3]
mov eax, r11d
and eax, 16376
imul r9, qword ptr [rsi+rax]
; IMUL_9C r3, 1573236728
lea r11, [r11+r11*8+1573236728]
; FPNEG_R f3
xorps xmm3, xmm15
; COND_R r1, lt(r4, -1805702334)
xor ecx, ecx
cmp r12d, -1805702334
setl cl
add r9, rcx
; FPSWAP_R f1
shufpd xmm1, xmm1, 1
; IADD_R r7, -1421188024
add r15, -1421188024
; FPMUL_R e3, a2
mulpd xmm7, xmm10
; FPSUB_M f2, L2[r7]
mov eax, r15d
and eax, 262136
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm2, xmm12
; FPSUB_R f3, a1
subpd xmm3, xmm9
; FPSQRT_R e1
sqrtpd xmm5, xmm5
; ISUB_R r2, r4
sub r10, r12
; ISMULH_R r4, r5
mov rax, r12
imul r13
mov r12, rdx
; COND_R r1, of(r7, 1294727006)
xor ecx, ecx
cmp r15d, 1294727006
seto cl
add r9, rcx
; IADD_M r5, L2[r2]
mov eax, r10d
and eax, 262136
add r13, qword ptr [rsi+rax]
; IMUL_9C r4, 401020510
lea r12, [r12+r12*8+401020510]
; IROL_R r3, r0
mov ecx, r8d
rol r11, cl
; ISTORE L1[r7], r0
mov eax, r15d
and eax, 16376
mov qword ptr [rsi+rax], r8
; FPSUB_R f2, a1
subpd xmm2, xmm9
; FPSQRT_R e3
sqrtpd xmm7, xmm7
; IMUL_R r3, 720965215
imul r11, 720965215
; IMUL_R r6, r2
imul r14, r10
; ISTORE L1[r7], r3
mov eax, r15d
and eax, 16376
mov qword ptr [rsi+rax], r11
; IROR_R r2, r6
mov ecx, r14d
ror r10, cl
; FPSQRT_R e3
sqrtpd xmm7, xmm7
; IMUL_9C r4, 788211341
lea r12, [r12+r12*8+788211341]
; IMUL_9C r3, -67993446
lea r11, [r11+r11*8-67993446]
; FPSWAP_R e3
shufpd xmm7, xmm7, 1
; IMUL_M r2, L1[r6]
mov eax, r14d
and eax, 16376
imul r10, qword ptr [rsi+rax]
; COND_M r2, ge(L1[r2], -1892157506)
xor ecx, ecx
mov eax, r10d
and eax, 16376
cmp dword ptr [rsi+rax], -1892157506
setge cl
add r10, rcx
; FPADD_M f1, L1[r3]
mov eax, r11d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
addpd xmm1, xmm12
; IADD_M r7, L1[r0]
mov eax, r8d
and eax, 16376
add r15, qword ptr [rsi+rax]
; ISDIV_C r1, 624867857
mov rax, 7924491717200811467
imul r9
xor eax, eax
sar rdx, 28
sets al
add rdx, rax
add r9, rdx
; FPADD_R f0, a1
addpd xmm0, xmm9
; ISUB_R r5, r7
sub r13, r15
; FPNEG_R f0
xorps xmm0, xmm15
; IMUL_R r6, r2
imul r14, r10
; FPMUL_M e3, L1[r1]
mov eax, r9d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
mulpd xmm7, xmm12
maxpd xmm7, xmm13
; IADD_R r0, r4
add r8, r12
; FPSUB_M f3, L1[r1]
mov eax, r9d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm3, xmm12
; FPMUL_R e2, a0
mulpd xmm6, xmm8
; INEG_R r2
neg r10
; FPMUL_R e2, a2
mulpd xmm6, xmm10
; FPSUB_M f3, L1[r6]
mov eax, r14d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm3, xmm12
; FPADD_R f1, a3
addpd xmm1, xmm11
; IMULH_R r3, r2
mov rax, r11
mul r10
mov r11, rdx
; FPSUB_R f0, a3
subpd xmm0, xmm11
; IDIV_C r5, 2887845607
mov rax, 13717520480010955377
mul r13
shr rdx, 31
add r13, rdx
; ISMULH_M r6, L1[r2]
mov ecx, r10d
and ecx, 16376
mov rax, r14
imul qword ptr [rsi+rcx]
mov r14, rdx
; FPSUB_R f3, a3
subpd xmm3, xmm11
; IMUL_M r6, L1[r7]
mov eax, r15d
and eax, 16376
imul r14, qword ptr [rsi+rax]
; FPNEG_R f0
xorps xmm0, xmm15
; FPMUL_R e2, a0
mulpd xmm6, xmm8
; IMUL_9C r6, 295130073
lea r14, [r14+r14*8+295130073]
; FPADD_R f1, a1
addpd xmm1, xmm9
; IXOR_R r0, r5
xor r8, r13
; FPADD_R f2, a1
addpd xmm2, xmm9
; FPSWAP_R e3
shufpd xmm7, xmm7, 1
; FPSQRT_R e3
sqrtpd xmm7, xmm7
; IADD_RC r3, r6, -1317630728
lea r11, [r11+r14-1317630728]
; IMUL_M r2, L1[r3]
mov eax, r11d
and eax, 16376
imul r10, qword ptr [rsi+rax]
; IADD_RC r1, r4, 894105694
lea r9, [r9+r12+894105694]
; IMUL_R r7, r0
imul r15, r8
; FPSUB_R f1, a0
subpd xmm1, xmm8
; IMUL_M r7, L1[r1]
mov eax, r9d
and eax, 16376
imul r15, qword ptr [rsi+rax]
; IXOR_R r2, r4
xor r10, r12
; ISUB_M r0, L1[r1]
mov eax, r9d
and eax, 16376
sub r8, qword ptr [rsi+rax]
; INEG_R r4
neg r12
; IMUL_9C r4, -285272388
lea r12, [r12+r12*8-285272388]
; IMUL_R r7, r4
imul r15, r12
; IMULH_M r5, L1[r7]
mov ecx, r15d
and ecx, 16376
mov rax, r13
mul qword ptr [rsi+rcx]
mov r13, rdx
; IROL_R r1, r7
mov ecx, r15d
rol r9, cl
; IXOR_R r4, -757532727
xor r12, -757532727
; IMUL_R r3, 1863959234
imul r11, 1863959234
; IROL_R r4, 59
rol r12, 59
; ISMULH_R r1, 2122681086
mov rax, 2122681086
imul r9
add r9, rdx
; ISTORE L2[r6], r7
mov eax, r14d
and eax, 262136
mov qword ptr [rsi+rax], r15
; ISTORE L1[r1], r5
mov eax, r9d
and eax, 16376
mov qword ptr [rsi+rax], r13
; FPMUL_R e0, a1
mulpd xmm4, xmm9
; COND_R r2, ns(r1, 486049737)
xor ecx, ecx
cmp r9d, 486049737
setns cl
add r10, rcx
; FPMUL_M e0, L2[r7]
mov eax, r15d
and eax, 262136
cvtdq2pd xmm12, qword ptr [rsi+rax]
mulpd xmm4, xmm12
maxpd xmm4, xmm13
; FPMUL_R e3, a2
mulpd xmm7, xmm10
; IROL_R r5, r2
mov ecx, r10d
rol r13, cl
; IADD_M r0, L1[r4]
mov eax, r12d
and eax, 16376
add r8, qword ptr [rsi+rax]