From 1426fcbab5a8f1aa6213203f5713ddacbd70abc6 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 12 Jan 2019 16:05:09 +0100 Subject: [PATCH] Print average program code size Fixed assembly for MUL_64 and IMUL_32 Division weight 4 -> 8 --- src/AssemblyGeneratorX86.cpp | 4 +- src/CompiledVirtualMachine.cpp | 3 +- src/CompiledVirtualMachine.hpp | 4 + src/JitCompilerX86.cpp | 18 +- src/JitCompilerX86.hpp | 2 + src/instructionWeights.hpp | 11 +- src/main.cpp | 2 + src/program.inc | 518 +++++++++++++++++++-------------- 8 files changed, 337 insertions(+), 225 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index efa0818..4cb009e 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -222,7 +222,7 @@ namespace RandomX { void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\timul rax, "; - if ((instr.locb & 7) >= 6) { + if ((instr.locb & 3) == 0) { asmCode << "rax, "; } genbia(instr); @@ -250,7 +250,7 @@ namespace RandomX { void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) { genar(instr, i); asmCode << "\tmovsxd rcx, eax" << std::endl; - if ((instr.locb & 7) >= 6) { + if ((instr.locb & 3) == 0) { asmCode << "\tmov rax, " << instr.imm32 << std::endl; } else { diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 7803003..ef78d2f 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -26,7 +26,7 @@ along with RandomX. If not, see. namespace RandomX { CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) { - + totalSize = 0; } void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) { @@ -48,6 +48,7 @@ namespace RandomX { void CompiledVirtualMachine::execute() { //executeProgram(reg, mem, scratchpad, readDataset); + totalSize += compiler.getCodeSize(); compiler.getProgramFunc()(reg, mem, scratchpad); #ifdef TRACEVM for (int32_t i = InstructionCount - 1; i >= 0; --i) { diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index cf131d1..a77bdb8 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -44,10 +44,14 @@ namespace RandomX { void* getProgram() { return compiler.getCode(); } + uint64_t getTotalSize() { + return totalSize; + } private: #ifdef TRACEVM convertible_t tracepad[InstructionCount]; #endif JitCompilerX86 compiler; + uint64_t totalSize; }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 32bad3a..2a101f0 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -116,6 +116,10 @@ namespace RandomX { const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size; const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize; + size_t JitCompilerX86::getCodeSize() { + return codePos - prologueSize + readDatasetL1Size + readDatasetL2Size; + } + JitCompilerX86::JitCompilerX86() { #ifdef _WIN32 code = (uint8_t*)VirtualAlloc(nullptr, CodeSize, MEM_COMMIT, PAGE_EXECUTE_READWRITE); @@ -196,6 +200,7 @@ namespace RandomX { void JitCompilerX86::genar(Instruction& instr) { gena(instr); emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8] + emit(0xdc580f66); } void JitCompilerX86::genaf(Instruction& instr) { @@ -437,7 +442,7 @@ namespace RandomX { void JitCompilerX86::h_DIV_64(Instruction& instr, int i) { genar(instr); - if (instr.locb & 3) { + if (instr.locb & 7) { #ifdef MAGIC_DIVISION if (instr.imm32 != 0) { uint32_t divisor = instr.imm32; @@ -496,7 +501,7 @@ namespace RandomX { void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) { genar(instr); - if (instr.locb & 3) { + if (instr.locb & 7) { #ifdef MAGIC_DIVISION int64_t divisor = instr.imm32; if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { @@ -566,8 +571,8 @@ namespace RandomX { #ifndef MAGIC_DIVISION } #endif - emit(0xc88b480b75fffa83); - emit(0x1274c9ff48c1d148); + emit(0xd8f7480575fffa83); //cmp edx,-1 + emit(uint16_t(0x12eb)); //jmp result emit(0x0fd28500000001b9); emit(0x489948c96348ca45); emit(uint16_t(0xf9f7)); //idiv rcx @@ -766,6 +771,10 @@ namespace RandomX { emitByte(0xc3); //ret } + void JitCompilerX86::h_NOP(Instruction& instr, int i) { + genar(instr); + } + #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) @@ -801,6 +810,7 @@ namespace RandomX { INST_HANDLE(JUMP) INST_HANDLE(CALL) INST_HANDLE(RET) + INST_HANDLE(NOP) }; #endif diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index d95cbad..0c0c48c 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -51,6 +51,7 @@ namespace RandomX { uint8_t* getCode() { return code; } + size_t getCodeSize(); private: static InstructionGeneratorX86 engine[256]; uint8_t* code; @@ -114,6 +115,7 @@ namespace RandomX { void h_JUMP(Instruction&, int); void h_CALL(Instruction&, int); void h_RET(Instruction&, int); + void h_NOP(Instruction&, int); }; } \ No newline at end of file diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 7771a35..de027b7 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -24,12 +24,12 @@ along with RandomX. If not, see. #define WT_SUB_64 12 #define WT_SUB_32 2 #define WT_MUL_64 23 -#define WT_MULH_64 10 +#define WT_MULH_64 5 #define WT_MUL_32 15 #define WT_IMUL_32 15 -#define WT_IMULH_64 6 -#define WT_DIV_64 4 -#define WT_IDIV_64 4 +#define WT_IMULH_64 3 +#define WT_DIV_64 8 +#define WT_IDIV_64 8 #define WT_AND_64 4 #define WT_AND_32 2 #define WT_OR_64 4 @@ -50,6 +50,7 @@ along with RandomX. If not, see. #define WT_JUMP 11 #define WT_CALL 11 #define WT_RET 12 +#define WT_NOP 0 constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \ @@ -57,7 +58,7 @@ WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \ WT_DIV_64 + WT_IDIV_64 + WT_AND_64 + WT_AND_32 + WT_OR_64 + \ WT_OR_32 + WT_XOR_64 + WT_XOR_32 + WT_SHL_64 + WT_SHR_64 + \ WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \ -+ WT_FPDIV + WT_FPSQRT + WT_FPROUND + WT_JUMP + WT_CALL + WT_RET; ++ WT_FPDIV + WT_FPSQRT + WT_FPROUND + WT_JUMP + WT_CALL + WT_RET + WT_NOP; static_assert(wtSum == 256, "Sum of instruction weights must be 256"); diff --git a/src/main.cpp b/src/main.cpp index a0ffc0a..6366821 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -270,6 +270,8 @@ int main(int argc, char** argv) { } else { mine(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0); + if (compiled) + std::cout << "Average program size: " << ((RandomX::CompiledVirtualMachine*)vms[0])->getTotalSize() / programCount << std::endl; } double elapsed = sw.getElapsed(); std::cout << "Calculated result: "; diff --git a/src/program.inc b/src/program.inc index 79a7dda..538f664 100644 --- a/src/program.inc +++ b/src/program.inc @@ -19,7 +19,7 @@ rx_body_0: ja short rx_i_1 call rx_i_30 -rx_i_1: ;DIV_64 +rx_i_1: ;IDIV_64 dec ebx jz rx_finish xor r15, 06afc2fa4h @@ -30,12 +30,19 @@ rx_i_1: ;DIV_64 rx_body_1: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, 1 mov edx, r10d + cmp edx, -1 + jne short body_idiv_1 + neg rax + jmp short result_idiv_1 +body_idiv_1: + mov ecx, 1 test edx, edx cmovne ecx, edx - xor edx, edx - div rcx + movsxd rcx, ecx + cqo + idiv rcx +result_idiv_1: mov r12, rax rx_i_2: ;JUMP @@ -80,7 +87,7 @@ rx_body_3: and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm8 -rx_i_4: ;MULH_64 +rx_i_4: ;MUL_32 dec ebx jz rx_finish xor r14, 077daefb4h @@ -91,16 +98,16 @@ rx_i_4: ;MULH_64 rx_body_4: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r14 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r14d + imul rax, rcx mov rcx, rax mov eax, r9d xor eax, 06ce10c20h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_5: ;MUL_32 +rx_i_5: ;IMUL_32 dec ebx jz rx_finish xor r15, 0379f9ee0h @@ -112,8 +119,8 @@ rx_body_5: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, 1037420699 + movsxd rcx, eax + movsxd rax, r12d imul rax, rcx mov r12, rax @@ -171,7 +178,7 @@ rx_body_8: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_9: ;DIV_64 +rx_i_9: ;IDIV_64 dec ebx jz rx_finish xor r14, 085121c54h @@ -184,10 +191,13 @@ rx_body_9: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 565870810 - mov rcx, 8750690209911200579 - mul rcx + mov rdx, 8750690209911200579 + imul rdx mov rax, rdx - shr rax, 28 + xor edx, edx + sar rax, 28 + sets dl + add rax, rdx mov r10, rax rx_i_10: ;AND_64 @@ -434,10 +444,10 @@ rx_i_23: ;MUL_64 rx_body_23: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 1283724485 + imul rax, rax, 1283724485 mov r8, rax -rx_i_24: ;IMUL_32 +rx_i_24: ;DIV_64 dec ebx jz rx_finish xor r8, 070d3b8c7h @@ -449,9 +459,12 @@ rx_body_24: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r15d - imul rax, rcx + mov ecx, 1 + mov edx, r15d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov rcx, rax mov eax, r15d xor eax, 099b77a68h @@ -480,7 +493,7 @@ rx_body_25: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_26: ;IMUL_32 +rx_i_26: ;IMULH_64 dec ebx jz rx_finish xor r11, 0e311468ch @@ -491,9 +504,9 @@ rx_i_26: ;IMUL_32 rx_body_26: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r13d - imul rax, rcx + mov rcx, 812644844 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 0306ff9ech @@ -933,7 +946,7 @@ rx_body_53: je short rx_i_54 ret -rx_i_54: ;IMULH_64 +rx_i_54: ;DIV_64 dec ebx jz rx_finish xor r11, 060638de0h @@ -944,9 +957,11 @@ rx_i_54: ;IMULH_64 rx_body_54: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - imul rcx + ; magic divide by 282209221 + mov rcx, 1096650948274100047 + mul rcx mov rax, rdx + shr rax, 24 mov rcx, rax mov eax, r12d xor eax, 010d22bc5h @@ -974,7 +989,7 @@ rx_body_55: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm3 -rx_i_56: ;DIV_64 +rx_i_56: ;IDIV_64 dec ebx jz rx_finish xor r14, 0f1456b8eh @@ -985,13 +1000,16 @@ rx_i_56: ;DIV_64 rx_body_56: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - ; magic divide by 4244198545 - add rax, 1 - sbb rax, 0 - mov rcx, 9333701248213440683 - mul rcx + ; magic divide by -50768751 + mov rcx, rax + mov rdx, 6254795139557318139 + imul rdx mov rax, rdx - shr rax, 31 + xor edx, edx + sub rax, rcx + sar rax, 25 + sets dl + add rax, rdx mov rcx, rax mov eax, r8d xor eax, 0fcf95491h @@ -1009,14 +1027,14 @@ rx_i_57: ;MUL_64 rx_body_57: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 172123015 + imul rax, rax, 172123015 mov rcx, rax mov eax, r15d xor eax, 0a426387h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_58: ;IMULH_64 +rx_i_58: ;DIV_64 dec ebx jz rx_finish xor r14, 0bcec0ebah @@ -1027,9 +1045,11 @@ rx_i_58: ;IMULH_64 rx_body_58: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - imul rcx + ; magic divide by 1506547423 + mov rcx, 6573653217342526495 + mul rcx mov rax, rdx + shr rax, 29 mov r8, rax rx_i_59: ;FPSUB @@ -1294,7 +1314,7 @@ rx_body_74: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r13 + imul rax, r13 mov rcx, rax mov eax, r9d xor eax, 0aaaacb32h @@ -1355,7 +1375,7 @@ rx_body_77: je short rx_i_78 ret -rx_i_78: ;MULH_64 +rx_i_78: ;MUL_32 dec ebx jz rx_finish xor r9, 0edeca680h @@ -1366,9 +1386,9 @@ rx_i_78: ;MULH_64 rx_body_78: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r8d + imul rax, rcx mov r15, rax rx_i_79: ;CALL @@ -1443,7 +1463,7 @@ rx_body_82: cmp r12d, -68969733 jo rx_i_145 -rx_i_83: ;DIV_64 +rx_i_83: ;IDIV_64 dec ebx jz rx_finish xor r10, 0d9b6a533h @@ -1455,10 +1475,13 @@ rx_body_83: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 91850728 - mov rcx, 13477737914993774191 - mul rcx + mov rdx, 842358619687110887 + imul rdx mov rax, rdx - shr rax, 26 + xor edx, edx + sar rax, 22 + sets dl + add rax, rdx mov r12, rax rx_i_84: ;SAR_64 @@ -1490,7 +1513,7 @@ rx_i_85: ;MUL_64 rx_body_85: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 20014507 + imul rax, rax, 20014507 mov r10, rax rx_i_86: ;AND_64 @@ -1661,7 +1684,7 @@ rx_body_95: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_96: ;MUL_32 +rx_i_96: ;IMUL_32 dec ebx jz rx_finish xor r11, 04f912ef8h @@ -1673,8 +1696,8 @@ rx_body_96: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r11d + movsxd rcx, eax + mov rax, -1354397081 imul rax, rcx mov r11, rax @@ -1797,7 +1820,7 @@ rx_body_103: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_104: ;IMUL_32 +rx_i_104: ;DIV_64 dec ebx jz rx_finish xor r11, 075deaf71h @@ -1808,9 +1831,11 @@ rx_i_104: ;IMUL_32 rx_body_104: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -1913070089 - imul rax, rcx + ; magic divide by 2381897207 + mov rcx, 16631314374404138087 + mul rcx + mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r15d xor eax, 08df8ddf7h @@ -1992,7 +2017,7 @@ rx_body_113: mov rax, rdx mov r13, rax -rx_i_114: ;IMULH_64 +rx_i_114: ;DIV_64 dec ebx jz rx_finish xor r13, 06e83e2cdh @@ -2003,9 +2028,11 @@ rx_i_114: ;IMULH_64 rx_body_114: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - imul rcx + ; magic divide by 770835683 + mov rcx, 12847770974664443757 + mul rcx mov rax, rdx + shr rax, 29 mov r14, rax rx_i_115: ;IDIV_64 @@ -2029,7 +2056,7 @@ rx_body_115: add rax, rdx mov r14, rax -rx_i_116: ;IMUL_32 +rx_i_116: ;DIV_64 dec ebx jz rx_finish xor r10, 0d122702eh @@ -2040,16 +2067,18 @@ rx_i_116: ;IMUL_32 rx_body_116: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -1850776691 - imul rax, rcx + ; magic divide by 2444190605 + mov rcx, 16207443550472271289 + mul rcx + mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r8d xor eax, 091af638dh and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_117: ;DIV_64 +rx_i_117: ;IDIV_64 dec ebx jz rx_finish xor r11, 015f2012bh @@ -2060,11 +2089,14 @@ rx_i_117: ;DIV_64 rx_body_117: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ; magic divide by 3089140324 - mov rcx, 12823658721283834045 - mul rcx + ; magic divide by -1205826972 + mov rdx, -8213052572424165513 + imul rdx mov rax, rdx - shr rax, 31 + xor edx, edx + sar rax, 29 + sets dl + add rax, rdx mov rcx, rax mov eax, r15d xor eax, 0b8208a64h @@ -2181,7 +2213,7 @@ rx_body_124: cmp r11d, 1719505436 jns rx_i_237 -rx_i_125: ;MUL_32 +rx_i_125: ;IMUL_32 dec ebx jz rx_finish xor r8, 0ebec27cdh @@ -2193,8 +2225,8 @@ rx_body_125: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, 1774711622 + movsxd rcx, eax + movsxd rax, r14d imul rax, rcx mov r14, rax @@ -2511,7 +2543,7 @@ rx_body_143: imul rax, rcx mov r9, rax -rx_i_144: ;IMULH_64 +rx_i_144: ;DIV_64 dec ebx jz rx_finish xor r10, 02e59e00ah @@ -2522,12 +2554,15 @@ rx_i_144: ;IMULH_64 rx_body_144: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -1304483355 - imul rcx - mov rax, rdx + mov ecx, 1 + mov edx, r11d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov r15, rax -rx_i_145: ;IMULH_64 +rx_i_145: ;DIV_64 dec ebx jz rx_finish xor r13, 08d5c798h @@ -2538,16 +2573,18 @@ rx_i_145: ;IMULH_64 rx_body_145: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - imul rcx + ; magic divide by 3712555397 + mov rcx, 10670300378317066981 + mul rcx mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r10d xor eax, 0dd491985h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_146: ;IMUL_32 +rx_i_146: ;IMULH_64 dec ebx jz rx_finish xor r13, 02327e6e2h @@ -2559,9 +2596,9 @@ rx_body_146: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r12d - imul rax, rcx + mov rcx, r12 + imul rcx + mov rax, rdx mov r10, rax rx_i_147: ;MUL_64 @@ -2576,7 +2613,7 @@ rx_body_147: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r11 + imul rax, r11 mov rcx, rax mov eax, r12d xor eax, 06a5bda88h @@ -2621,7 +2658,7 @@ rx_body_149: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_150: ;IMULH_64 +rx_i_150: ;DIV_64 dec ebx jz rx_finish xor r9, 01504ca7ah @@ -2632,9 +2669,12 @@ rx_i_150: ;IMULH_64 rx_body_150: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -933976796 - imul rcx - mov rax, rdx + mov ecx, 1 + mov edx, r8d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov rcx, rax mov eax, r9d xor eax, 0c854a524h @@ -2872,7 +2912,7 @@ rx_body_163: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_164: ;MULH_64 +rx_i_164: ;MUL_32 dec ebx jz rx_finish xor r12, 01f0c2737h @@ -2884,9 +2924,9 @@ rx_body_164: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r9 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r9d + imul rax, rcx mov rcx, rax mov eax, r13d xor eax, 09aa6da19h @@ -3007,7 +3047,7 @@ rx_body_170: and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_171: ;IMULH_64 +rx_i_171: ;DIV_64 dec ebx jz rx_finish xor r15, 09901e05bh @@ -3018,9 +3058,13 @@ rx_i_171: ;IMULH_64 rx_body_171: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - imul rcx + ; magic divide by 2064150457 + add rax, 1 + sbb rax, 0 + mov rcx, 4797867461985617359 + mul rcx mov rax, rdx + shr rax, 29 mov r12, rax rx_i_172: ;SUB_64 @@ -3049,7 +3093,7 @@ rx_body_173: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, -1386172772 + imul rax, rax, -1386172772 mov rcx, rax mov eax, r12d xor eax, 0ad60ae9ch @@ -3371,7 +3415,7 @@ rx_body_192: and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm8 -rx_i_193: ;MULH_64 +rx_i_193: ;MUL_32 dec ebx jz rx_finish xor r12, 0e9939ach @@ -3382,9 +3426,9 @@ rx_i_193: ;MULH_64 rx_body_193: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r12d + imul rax, rcx mov rcx, rax mov eax, r15d xor eax, 074e097dch @@ -3656,7 +3700,7 @@ rx_i_208: ;MUL_64 rx_body_208: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, -486588965 + imul rax, rax, -486588965 mov r10, rax rx_i_209: ;XOR_64 @@ -3878,7 +3922,7 @@ rx_body_220: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_221: ;IMULH_64 +rx_i_221: ;DIV_64 dec ebx jz rx_finish xor r9, 0a3deb512h @@ -3889,9 +3933,12 @@ rx_i_221: ;IMULH_64 rx_body_221: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 2146087761 - imul rcx - mov rax, rdx + mov ecx, 1 + mov edx, r15d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov rcx, rax mov eax, r11d xor eax, 07feab351h @@ -3956,7 +4003,7 @@ rx_body_224: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_225: ;IMULH_64 +rx_i_225: ;DIV_64 dec ebx jz rx_finish xor r13, 0c558367eh @@ -3967,9 +4014,12 @@ rx_i_225: ;IMULH_64 rx_body_225: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - imul rcx + ; magic divide by 4264577610 + shr rax, 1 + mov rcx, 9289098447696480965 + mul rcx mov rax, rdx + shr rax, 30 mov rcx, rax mov eax, r12d xor eax, 0fe304a4ah @@ -4030,7 +4080,7 @@ rx_body_228: andps xmm0, xmm10 sqrtpd xmm7, xmm0 -rx_i_229: ;IMUL_32 +rx_i_229: ;IMULH_64 dec ebx jz rx_finish xor r11, 05c535836h @@ -4041,9 +4091,9 @@ rx_i_229: ;IMUL_32 rx_body_229: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r12d - imul rax, rcx + mov rcx, 334017248 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r13d xor eax, 013e8b2e0h @@ -4142,7 +4192,7 @@ rx_body_234: andps xmm0, xmm1 movaps xmm4, xmm0 -rx_i_235: ;MUL_32 +rx_i_235: ;IMUL_32 dec ebx jz rx_finish xor r13, 0b6cb9ff2h @@ -4153,8 +4203,8 @@ rx_i_235: ;MUL_32 rx_body_235: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, 212286089 + movsxd rcx, eax + movsxd rax, r12d imul rax, rcx mov rcx, rax mov eax, r15d @@ -4224,7 +4274,7 @@ rx_body_239: add rax, r10 mov r10, rax -rx_i_240: ;IMUL_32 +rx_i_240: ;IMULH_64 dec ebx jz rx_finish xor r9, 0d65d29f9h @@ -4236,9 +4286,9 @@ rx_body_240: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -423830277 - imul rax, rcx + mov rcx, r14 + imul rcx + mov rax, rdx mov r8, rax rx_i_241: ;FPADD @@ -4259,7 +4309,7 @@ rx_body_241: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm7 -rx_i_242: ;MULH_64 +rx_i_242: ;MUL_32 dec ebx jz rx_finish xor r12, 01119b0f9h @@ -4270,9 +4320,9 @@ rx_i_242: ;MULH_64 rx_body_242: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r12d + imul rax, rcx mov rcx, rax mov eax, r10d xor eax, 0130882f2h @@ -4331,7 +4381,7 @@ rx_body_245: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_246: ;DIV_64 +rx_i_246: ;IDIV_64 dec ebx jz rx_finish xor r15, 027eeaa2eh @@ -4343,14 +4393,17 @@ rx_body_246: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ; magic divide by 4138158808 - mov rcx, 9572876028959826425 - mul rcx + ; magic divide by -156808488 + mov rdx, -3947299202596036367 + imul rdx mov rax, rdx - shr rax, 31 + xor edx, edx + sar rax, 25 + sets dl + add rax, rdx mov r12, rax -rx_i_247: ;MUL_32 +rx_i_247: ;IMUL_32 dec ebx jz rx_finish xor r10, 0c4de0296h @@ -4361,8 +4414,8 @@ rx_i_247: ;MUL_32 rx_body_247: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r14d + movsxd rcx, eax + movsxd rax, r14d imul rax, rcx mov rcx, rax mov eax, r9d @@ -4391,7 +4444,7 @@ rx_body_248: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_249: ;IMUL_32 +rx_i_249: ;IMULH_64 dec ebx jz rx_finish xor r15, 0499552cch @@ -4403,9 +4456,9 @@ rx_body_249: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r11d - imul rax, rcx + mov rcx, -508571655 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r13d xor eax, 0e1afcff9h @@ -4957,7 +5010,7 @@ rx_body_279: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_280: ;DIV_64 +rx_i_280: ;IDIV_64 dec ebx jz rx_finish xor r12, 066246b43h @@ -4969,10 +5022,13 @@ rx_body_280: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 555412224 - mov rcx, 2228867111296024113 - mul rcx + mov rdx, 2228867111296024113 + imul rdx mov rax, rdx - shr rax, 26 + xor edx, edx + sar rax, 26 + sets dl + add rax, rdx mov rcx, rax mov eax, r13d xor eax, 0211aeb00h @@ -5384,7 +5440,7 @@ rx_i_304: ;MUL_64 rx_body_304: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 2007686513 + imul rax, rax, 2007686513 mov r13, rax rx_i_305: ;MUL_64 @@ -5398,7 +5454,7 @@ rx_i_305: ;MUL_64 rx_body_305: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r15 + imul rax, r15 mov r10, rax rx_i_306: ;ADD_64 @@ -5443,7 +5499,7 @@ rx_body_308: imul rax, r13 mov r15, rax -rx_i_309: ;IMUL_32 +rx_i_309: ;DIV_64 dec ebx jz rx_finish xor r9, 090c42304h @@ -5454,9 +5510,11 @@ rx_i_309: ;IMUL_32 rx_body_309: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -1652850028 - imul rax, rcx + ; magic divide by 2642117268 + mov rcx, 14993309243657753043 + mul rcx + mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r9d xor eax, 09d7b8294h @@ -5776,7 +5834,7 @@ rx_body_326: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_327: ;DIV_64 +rx_i_327: ;IDIV_64 dec ebx jz rx_finish xor r9, 09665f98dh @@ -5789,10 +5847,15 @@ rx_body_327: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 1572662125 - mov rcx, 12594593786994192665 - mul rcx + mov rcx, rax + mov rdx, -5852150286715358951 + imul rdx mov rax, rdx - shr rax, 30 + xor edx, edx + add rax, rcx + sar rax, 30 + sets dl + add rax, rdx mov r12, rax rx_i_328: ;SHR_64 @@ -5825,7 +5888,7 @@ rx_body_329: je short rx_i_330 ret -rx_i_330: ;MUL_32 +rx_i_330: ;IMUL_32 dec ebx jz rx_finish xor r9, 0f6a93f19h @@ -5837,8 +5900,8 @@ rx_body_330: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, -1349816041 + movsxd rcx, eax + movsxd rax, r13d imul rax, rcx mov rcx, rax mov eax, r11d @@ -6008,7 +6071,7 @@ rx_body_340: addpd xmm0, xmm5 movaps xmm5, xmm0 -rx_i_341: ;MULH_64 +rx_i_341: ;MUL_32 dec ebx jz rx_finish xor r12, 019eb9ea5h @@ -6019,9 +6082,9 @@ rx_i_341: ;MULH_64 rx_body_341: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r15d + imul rax, rcx mov rcx, rax mov eax, r8d xor eax, 024736405h @@ -6230,7 +6293,7 @@ rx_body_353: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm7 -rx_i_354: ;MULH_64 +rx_i_354: ;MUL_32 dec ebx jz rx_finish xor r13, 02412fc10h @@ -6241,9 +6304,9 @@ rx_i_354: ;MULH_64 rx_body_354: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r13d + imul rax, rcx mov r13, rax rx_i_355: ;MUL_64 @@ -6293,7 +6356,7 @@ rx_body_357: add rax, r11 mov r11, rax -rx_i_358: ;IMULH_64 +rx_i_358: ;DIV_64 dec ebx jz rx_finish xor r13, 088fa6e5ah @@ -6304,9 +6367,12 @@ rx_i_358: ;IMULH_64 rx_body_358: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - imul rcx + ; magic divide by 3667831238 + shr rax, 1 + mov rcx, 2700102505175032865 + mul rcx mov rax, rdx + shr rax, 28 mov r9, rax rx_i_359: ;FPSUB @@ -6401,7 +6467,7 @@ rx_body_363: andps xmm0, xmm1 movaps xmm3, xmm0 -rx_i_364: ;MULH_64 +rx_i_364: ;MUL_32 dec ebx jz rx_finish xor r11, 0badaf867h @@ -6412,9 +6478,9 @@ rx_i_364: ;MULH_64 rx_body_364: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r8d + imul rax, rcx mov r8, rax rx_i_365: ;IMUL_32 @@ -6486,7 +6552,7 @@ rx_body_368: sub eax, r10d mov r8, rax -rx_i_369: ;DIV_64 +rx_i_369: ;IDIV_64 dec ebx jz rx_finish xor r9, 053fe22e2h @@ -6498,10 +6564,13 @@ rx_body_369: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 470792991 - mov rcx, 1314739240972876203 - mul rcx + mov rdx, 1314739240972876203 + imul rdx mov rax, rdx - shr rax, 25 + xor edx, edx + sar rax, 25 + sets dl + add rax, rdx mov r9, rax rx_i_370: ;FPSUB @@ -6682,7 +6751,7 @@ rx_i_380: ;MUL_64 rx_body_380: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r10 + imul rax, r10 mov rcx, rax mov eax, r13d xor eax, 0a9fd85e0h @@ -6915,7 +6984,7 @@ rx_body_394: addpd xmm0, xmm9 movaps xmm6, xmm0 -rx_i_395: ;IMULH_64 +rx_i_395: ;DIV_64 dec ebx jz rx_finish xor r8, 04ae4fe8ch @@ -6927,9 +6996,11 @@ rx_body_395: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - imul rcx + ; magic divide by 939698704 + mov rcx, 5269518980991934091 + mul rcx mov rax, rdx + shr rax, 28 mov r8, rax rx_i_396: ;ROR_64 @@ -7058,7 +7129,7 @@ rx_body_402: je short rx_i_403 ret -rx_i_403: ;IMULH_64 +rx_i_403: ;DIV_64 dec ebx jz rx_finish xor r9, 0e59500f7h @@ -7069,9 +7140,11 @@ rx_i_403: ;IMULH_64 rx_body_403: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - imul rcx + ; magic divide by 536056992 + mov rcx, 4618688153536407095 + mul rcx mov rax, rdx + shr rax, 27 mov rcx, rax mov eax, r11d xor eax, 01ff394a0h @@ -7161,7 +7234,7 @@ rx_i_408: ;MUL_64 rx_body_408: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, 693109961 + imul rax, rax, 693109961 mov rcx, rax mov eax, r10d xor eax, 0295004c9h @@ -7272,7 +7345,7 @@ rx_body_414: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_415: ;IMULH_64 +rx_i_415: ;DIV_64 dec ebx jz rx_finish xor r8, 08c3e59a1h @@ -7284,9 +7357,13 @@ rx_body_415: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - imul rcx + ; magic divide by 3756873911 + add rax, 1 + sbb rax, 0 + mov rcx, 10544426615208851175 + mul rcx mov rax, rdx + shr rax, 31 mov r9, rax rx_i_416: ;FPADD @@ -7456,7 +7533,7 @@ rx_body_425: imul rax, rcx mov r14, rax -rx_i_426: ;DIV_64 +rx_i_426: ;IDIV_64 dec ebx jz rx_finish xor r12, 09dd55ba0h @@ -7467,18 +7544,21 @@ rx_i_426: ;DIV_64 rx_body_426: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ; magic divide by 3704238575 - mov rcx, 1336782190693946083 - mul rcx + ; magic divide by -590728721 + mov rdx, -4191230239118101979 + imul rdx mov rax, rdx - shr rax, 28 + xor edx, edx + sar rax, 27 + sets dl + add rax, rdx mov rcx, rax mov eax, r14d xor eax, 0dcca31efh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_427: ;MULH_64 +rx_i_427: ;MUL_32 dec ebx jz rx_finish xor r11, 0d6cae9aeh @@ -7490,9 +7570,9 @@ rx_body_427: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -2146332428 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, -2146332428 + imul rax, rcx mov rcx, rax mov eax, r9d xor eax, 0801190f4h @@ -7530,7 +7610,7 @@ rx_i_429: ;MUL_64 rx_body_429: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r9 + imul rax, r9 mov r15, rax rx_i_430: ;FPADD @@ -7632,7 +7712,7 @@ rx_body_435: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 1971717631 + imul rax, rax, 1971717631 mov rcx, rax mov eax, r9d xor eax, 0758605ffh @@ -7816,7 +7896,7 @@ rx_body_445: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_446: ;MULH_64 +rx_i_446: ;MUL_32 dec ebx jz rx_finish xor r12, 01734708eh @@ -7828,9 +7908,9 @@ rx_body_446: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r15d + imul rax, rcx mov rcx, rax mov eax, r13d xor eax, 03166163h @@ -7938,7 +8018,7 @@ rx_body_452: je short rx_i_453 ret -rx_i_453: ;IMULH_64 +rx_i_453: ;DIV_64 dec ebx jz rx_finish xor r11, 0a2096aa4h @@ -7949,9 +8029,12 @@ rx_i_453: ;IMULH_64 rx_body_453: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r14 - imul rcx + ; magic divide by 380157076 + shr rax, 2 + mov rcx, 3256390890604862173 + mul rcx mov rax, rdx + shr rax, 24 mov r8, rax rx_i_454: ;FPADD @@ -8050,7 +8133,7 @@ rx_i_459: ;MUL_64 rx_body_459: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r9 + imul rax, r9 mov rcx, rax mov eax, r13d xor eax, 016bb0164h @@ -8185,7 +8268,7 @@ rx_body_467: addpd xmm0, xmm9 movaps xmm8, xmm0 -rx_i_468: ;IMULH_64 +rx_i_468: ;DIV_64 dec ebx jz rx_finish xor r8, 091044dc3h @@ -8197,16 +8280,20 @@ rx_body_468: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - imul rcx + ; magic divide by 4281572471 + add rax, 1 + sbb rax, 0 + mov rcx, 9252227195836753313 + mul rcx mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r8d xor eax, 0ff339c77h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_469: ;MUL_32 +rx_i_469: ;IMUL_32 dec ebx jz rx_finish xor r9, 0c0186beh @@ -8217,8 +8304,8 @@ rx_i_469: ;MUL_32 rx_body_469: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r9d + movsxd rcx, eax + mov rax, 294019485 imul rax, rcx mov rcx, rax mov eax, r9d @@ -8287,7 +8374,7 @@ rx_i_473: ;MUL_64 rx_body_473: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r11 + imul rax, r11 mov r12, rax rx_i_474: ;JUMP @@ -8398,7 +8485,7 @@ rx_body_480: addpd xmm0, xmm4 movaps xmm6, xmm0 -rx_i_481: ;IMULH_64 +rx_i_481: ;DIV_64 dec ebx jz rx_finish xor r14, 0225ba1f9h @@ -8409,9 +8496,12 @@ rx_i_481: ;IMULH_64 rx_body_481: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - imul rcx + ; magic divide by 2101516912 + shr rax, 4 + mov rcx, 147267437180322377 + mul rcx mov rax, rdx + shr rax, 20 mov r12, rax rx_i_482: ;AND_32 @@ -8509,7 +8599,7 @@ rx_body_487: sub rax, r9 mov r11, rax -rx_i_488: ;IMUL_32 +rx_i_488: ;DIV_64 dec ebx jz rx_finish xor r12, 0d8b1788eh @@ -8520,9 +8610,11 @@ rx_i_488: ;IMUL_32 rx_body_488: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, 297357073 - imul rax, rcx + ; magic divide by 297357073 + mov rcx, 16652572300311555393 + mul rcx + mov rax, rdx + shr rax, 28 mov r12, rax rx_i_489: ;JUMP