diff --git a/makefile b/makefile index 21584cb..55e1abd 100644 --- a/makefile +++ b/makefile @@ -11,7 +11,7 @@ SRCDIR=src OBJDIR=obj LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o) ifeq ($(PLATFORM),x86_64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o endif @@ -60,7 +60,7 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) | $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ -$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_r.inc read_f.inc)) | $(OBJDIR) +$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR) $(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@ $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR) @@ -87,6 +87,9 @@ $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR) $(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp) | $(OBJDIR) $(CXX) 
$(CXXFLAGS) -c $(SRCDIR)/VirtualMachine.cpp -o $@ +$(OBJDIR)/virtualMemory.o: $(addprefix $(SRCDIR)/,virtualMemory.cpp virtualMemory.hpp) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/virtualMemory.cpp -o $@ + $(OBJDIR)/t1ha2.o: $(addprefix $(SRCDIR)/t1ha/,t1ha2.c t1ha.h t1ha_bits.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/t1ha/t1ha2.c -o $@ diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 21b39c8..c2394c9 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -169,11 +169,12 @@ namespace RandomX { asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl; } - void AssemblyGeneratorX86::gencr(Instruction& instr) { + void AssemblyGeneratorX86::gencr(Instruction& instr, bool rax = true) { switch (instr.locc & 7) { case 0: - asmCode << "\tmov rcx, rax" << std::endl; + if(rax) + asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; @@ -186,7 +187,8 @@ namespace RandomX { case 1: case 2: case 3: - asmCode << "\tmov rcx, rax" << std::endl; + if (rax) + asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; @@ -197,9 +199,9 @@ namespace RandomX { return; default: - asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl; + asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", " << (rax ? 
"rax" : "rcx") << std::endl; if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rax" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << (rax ? "rax" : "rcx") << std::endl; } return; } @@ -208,7 +210,7 @@ namespace RandomX { void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) { if(move) asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; - const char* store = (instr.locc & 8) ? "movhpd" : "movlpd"; + const char* store = (instr.locc & 128) ? "movhpd" : "movlpd"; switch (instr.locc & 7) { case 4: @@ -463,14 +465,13 @@ namespace RandomX { void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { genar(instr, i); - //asmCode << "\tmov rcx, rax" << std::endl; + asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tshl eax, 13" << std::endl; - //asmCode << "\tand rcx, -2048" << std::endl; asmCode << "\tand eax, 24576" << std::endl; - //asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; asmCode << "\tor eax, 40896" << std::endl; asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl; asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl; + gencr(instr, false); } static inline const char* jumpCondition(Instruction& instr, bool invert = false) { diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 6ffa2f9..bf5238a 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -44,7 +44,7 @@ namespace RandomX { void genbr1(Instruction&); void genbr132(Instruction&); void genbf(Instruction&, const char*); - void gencr(Instruction&); + void gencr(Instruction&, bool); void gencf(Instruction&, bool); void generateCode(Instruction&, int); diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 8ae2f83..7803003 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -47,8 
+47,8 @@ namespace RandomX { } void CompiledVirtualMachine::execute() { - executeProgram(reg, mem, scratchpad, readDataset); - //compiler.getProgramFunc()(reg, mem, scratchpad); + //executeProgram(reg, mem, scratchpad, readDataset); + compiler.getProgramFunc()(reg, mem, scratchpad); #ifdef TRACEVM for (int32_t i = InstructionCount - 1; i >= 0; --i) { std::cout << std::hex << tracepad[i].u64 << std::endl; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index c436ef7..a6a3a0c 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -197,6 +197,17 @@ namespace RandomX { #define ALU_RETIRE(x) x(a, b, c); \ if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl; +#define CHECK_NOP_FPDIV(b, c) +#ifndef STATS +#define CHECK_NOP_FPADD(b, c) +#define CHECK_NOP_FPSUB(b, c) +#define CHECK_NOP_FPMUL(b, c) +#else +#define CHECK_NOP_FPADD(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPADD_nop += loeq + hieq; if(loeq && hieq) count_FPADD_nop2++; +#define CHECK_NOP_FPSUB(b, c) bool loeq = ((b.lo.u64 & INT64_MAX) == (c.lo.u64 & INT64_MAX)); bool hieq = ((b.hi.u64 & INT64_MAX) == (c.hi.u64 & INT64_MAX)); count_FPSUB_nop += loeq + hieq; if(loeq && hieq) count_FPSUB_nop2++; +#define CHECK_NOP_FPMUL(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPMUL_nop += loeq + hieq; if(loeq && hieq) count_FPMUL_nop2++; +#endif + #define FPU_RETIRE(x) x(a, b, c); \ writecf(inst, c); \ if(trace) { \ @@ -248,8 +259,10 @@ namespace RandomX { INC_COUNT(x) \ convertible_t a = loada(inst); \ fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \ + fpu_reg_t btemp = b; \ fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \ FPU_RETIRE(x) \ + CHECK_NOP_##x(btemp, c) \ } #define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp 
index b8fd98f..8c34936 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -83,6 +83,12 @@ namespace RandomX { int count_retdepth_max = 0; int count_endstack = 0; int count_instructions[ProgramLength] = { 0 }; + int count_FPADD_nop = 0; + int count_FPADD_nop2 = 0; + int count_FPSUB_nop = 0; + int count_FPSUB_nop2 = 0; + int count_FPMUL_nop = 0; + int count_FPMUL_nop2 = 0; #endif convertible_t loada(Instruction&); diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index be156ef..fdc32b1 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -29,9 +29,12 @@ .global DECL(randomx_program_prologue) .global DECL(randomx_program_begin) .global DECL(randomx_program_epilogue) -.global DECL(randomx_program_read_r) -.global DECL(randomx_program_read_f) +.global DECL(randomx_program_read_l1) +.global DECL(randomx_program_read_l2) .global DECL(randomx_program_end) +.global DECL(randomx_program_transform) + +#define db .byte .align 64 DECL(randomx_program_prologue): @@ -45,14 +48,26 @@ DECL(randomx_program_begin): DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" -.align 64 -DECL(randomx_program_read_r): - #include "asm/program_read_r.inc" +#define scratchpad_mask and ecx, 2040 .align 64 -DECL(randomx_program_read_f): - #include "asm/program_read_f.inc" +DECL(randomx_program_read_l1): + #include "asm/program_read.inc" + +#undef scratchpad_mask + +#define scratchpad_mask and ecx, 32760 + +.align 64 +DECL(randomx_program_read_l2): + #include "asm/program_read.inc" + +#undef scratchpad_mask .align 64 DECL(randomx_program_end): - nop \ No newline at end of file + nop + +.align 8 +DECL(randomx_program_transform): + #include "asm/program_transform_address.inc" diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index d7d3d4b..7a2b3c4 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -20,9 +20,11 @@ _RANDOMX_JITX86_STATIC 
SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue PUBLIC randomx_program_begin PUBLIC randomx_program_epilogue -PUBLIC randomx_program_read_r -PUBLIC randomx_program_read_f +PUBLIC randomx_program_read_l1 +PUBLIC randomx_program_read_l2 PUBLIC randomx_program_end +PUBLIC randomx_program_transform + ALIGN 64 randomx_program_prologue PROC @@ -39,21 +41,34 @@ randomx_program_epilogue PROC include asm/program_epilogue_win64.inc randomx_program_epilogue ENDP -ALIGN 64 -randomx_program_read_r PROC - include asm/program_read_r.inc -randomx_program_read_r ENDP +scratchpad_mask MACRO + and ecx, 2040 +ENDM ALIGN 64 -randomx_program_read_f PROC - include asm/program_read_f.inc -randomx_program_read_f ENDP +randomx_program_read_l1 PROC + include asm/program_read.inc +randomx_program_read_l1 ENDP + +scratchpad_mask MACRO + and ecx, 32760 +ENDM + +ALIGN 64 +randomx_program_read_l2 PROC + include asm/program_read.inc +randomx_program_read_l2 ENDP ALIGN 64 randomx_program_end PROC nop randomx_program_end ENDP +ALIGN 8 +randomx_program_transform PROC + include asm/program_transform_address.inc +randomx_program_transform ENDP + _RANDOMX_JITX86_STATIC ENDS END \ No newline at end of file diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index 6052283..f5904ad 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -21,7 +21,8 @@ extern "C" { void randomx_program_prologue(); void randomx_program_begin(); void randomx_program_epilogue(); - void randomx_program_read_r(); - void randomx_program_read_f(); + void randomx_program_transform(); + void randomx_program_read_l1(); + void randomx_program_read_l2(); void randomx_program_end(); } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index b03a330..fda3746 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -48,12 +48,12 @@ namespace RandomX { REGISTER ALLOCATION: rax -> temporary - rbx -> MemoryRegisters& memory + rbx -> 
"ic" rcx -> temporary rdx -> temporary rsi -> convertible_t* scratchpad - rdi -> "ic" (instruction counter) - rbp -> beginning of VM stack + rdi -> beginning of VM stack + rbp -> "ma", "mx" rsp -> end of VM stack r8 -> "r0" r9 -> "r1" @@ -82,7 +82,8 @@ namespace RandomX { | saved registers | v - [rbp] RegisterFile& registerFile + [rdi+8] RegisterFile& registerFile + [rdi] uint8_t* dataset | | | VM stack @@ -97,18 +98,19 @@ namespace RandomX { const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; - const uint8_t* codeReadDatasetR = (uint8_t*)&randomx_program_read_r; - const uint8_t* codeReadDatasetF = (uint8_t*)&randomx_program_read_f; + const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1; + const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; + const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform; const int32_t prologueSize = codeProgramBegin - codePrologue; - const int32_t epilogueSize = codeReadDatasetR - codeEpilogue; - const int32_t readDatasetRSize = codeReadDatasetF - codeReadDatasetR; - const int32_t readDatasetFSize = codeProgramEnd - codeReadDatasetF; + const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue; + const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1; + const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2; - const int32_t readDatasetFOffset = CodeSize - readDatasetFSize; - const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize; - const int32_t epilogueOffset = readDatasetROffset - epilogueSize; + const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size; + const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size; + const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize; 
JitCompilerX86::JitCompilerX86() { #ifdef _WIN32 @@ -121,9 +123,9 @@ namespace RandomX { throw std::runtime_error("mmap failed"); #endif memcpy(code, codePrologue, prologueSize); - memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize); - memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize); - memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize); + memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize); + memcpy(code + CodeSize - readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size); + memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size); } void JitCompilerX86::generateProgram(Pcg32& gen) { @@ -140,12 +142,33 @@ namespace RandomX { emitByte(0xe9); emit(instructionOffsets[0] - (codePos + 4)); fixCallOffsets(); + uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; + uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; + *reinterpret_cast<uint32_t*>(code + readDatasetL1Offset + 1) = transformL1; + *reinterpret_cast<uint32_t*>(code + readDatasetL2Offset + 1) = transformL2; } void JitCompilerX86::generateCode(Instruction& instr, int i) { instructionOffsets.push_back(codePos); - emit(0x840fcfff); //dec edx; jz + emit(0x840fcbff); //dec ebx; jz emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative) + emit(uint16_t(0x8149)); //xor + emitByte(0xf0 + (instr.rega % RegistersCount)); + emit(instr.addra); + emit(uint16_t(0x8b41)); //mov + emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega + emit(0x753fc3f6); //test bl,0x3f; jne + emit(uint16_t(0xe805)); + if (instr.loca & 3) { //A.LOC.W + emit(readDatasetL1Offset - (codePos + 4)); + } + else { + emit(readDatasetL2Offset - (codePos + 4)); + } + if ((instr.loca & 192) == 0) { //A.LOC.X + emit(uint16_t(0x3348)); + emitByte(0xe9); //xor 
rbp, rcx + } auto generator = engine[instr.opcode]; (this->*generator)(instr, i); } @@ -157,73 +180,26 @@ namespace RandomX { } void JitCompilerX86::genar(Instruction& instr) { - emit(uint16_t(0x8149)); //xor - emitByte(0xf0 + (instr.rega % RegistersCount)); - emit(instr.addra); - switch (instr.loca & 7) - { - case 0: - case 1: - case 2: - case 3: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega - emitByte(0xe8); //call - emit(readDatasetROffset - (codePos + 4)); - return; - - case 4: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emitByte(0x25); //and - emit(ScratchpadL2 - 1); //whole scratchpad - emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8] - return; - - default: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emitByte(0x25); //and - emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad - emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8] - return; + emit(uint16_t(0xe181)); //and ecx, + if (instr.loca & 3) { + emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad } + else { + emit(ScratchpadL2 - 1); //whole scratchpad + } + emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8] } void JitCompilerX86::genaf(Instruction& instr) { - emit(uint16_t(0x8149)); //xor - emitByte(0xf0 + (instr.rega % RegistersCount)); - emit(instr.addra); - switch (instr.loca & 7) - { - case 0: - case 1: - case 2: - case 3: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega - emitByte(0xe8); //call - emit(readDatasetFOffset - (codePos + 4)); - return; - - case 4: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emitByte(0x25); //and - emit(ScratchpadL2 - 1); //whole scratchpad - emitByte(0xf3); - emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8] - return; - - default: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, 
rega - emitByte(0x25); //and + emit(uint16_t(0xe181)); //and ecx, + if (instr.loca & 3) { emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad - emitByte(0xf3); - emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8] - return; } + else { + emit(ScratchpadL2 - 1); //whole scratchpad + } + emitByte(0xf3); + emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8] } void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { @@ -274,8 +250,13 @@ namespace RandomX { } - void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) { - emit(0x41c88b48); //mov rcx, rax; REX + void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize, bool rax) { + if (rax) { + emit(0x41c88b48); //mov rcx, rax; REX + } + else { + emitByte(0x41); + } emitByte(0x8b); // mov emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc emitByte(0x35); // xor eax @@ -285,22 +266,27 @@ namespace RandomX { emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx } - void JitCompilerX86::gencr(Instruction& instr) { + void JitCompilerX86::gencr(Instruction& instr, bool rax = true) { switch (instr.locc & 7) { case 0: - scratchpadStoreR(instr, ScratchpadL2); + scratchpadStoreR(instr, ScratchpadL2, rax); break; case 1: case 2: case 3: - scratchpadStoreR(instr, ScratchpadL1); + scratchpadStoreR(instr, ScratchpadL1, rax); break; default: emit(uint16_t(0x8b4c)); //mov - emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax + if (rax) { + emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax + } + else { + emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx + } break; } } @@ -322,29 +308,21 @@ namespace RandomX { emitByte(0xc6); } - void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) { + void JitCompilerX86::gencf(Instruction& instr) { int regc = (instr.regc % RegistersCount); - if (!alwaysLow) { - if (regc <= 1) { - emitByte(0x44); //REX - } - emit(uint16_t(0x280f)); //movaps - 
emitByte(0xc0 + 8 * regc); // regc, xmm0 + if (regc <= 1) { + emitByte(0x44); //REX } - switch (instr.locc & 7) + emit(uint16_t(0x280f)); //movaps + emitByte(0xc0 + 8 * regc); // regc, xmm0 + if (instr.locc & 4) //C.LOC.R { - case 4: - scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8)); - break; - - case 5: - case 6: - case 7: - scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8)); - break; - - default: - break; + if (instr.locc & 3) { //C.LOC.W + scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad + } + else { + scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //whole scratchpad + } } } @@ -596,24 +574,11 @@ namespace RandomX { void JitCompilerX86::h_FPROUND(Instruction& instr, int i) { genar(instr); - emit(0x81480de0c1c88b48); - emit(0x600025fffff800e1); - emit(uint16_t(0x0000)); - emitByte(0xf2); - int regc = (instr.regc % RegistersCount); - if (regc <= 1) { - emitByte(0x4c); //REX - } - else { - emitByte(0x48); //REX - } - emit(uint16_t(0x2a0f)); - emitByte(0xc1 + 8 * regc); - emitByte(0x0d); - emit(0xf824448900009fc0); - emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8] + emit(0x00250de0c1c88b48); //mov rcx,rax; shl eax,0xd + emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0 + emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8] emitByte(0xf8); - gencf(instr, true); + gencr(instr, false); //result in rcx } static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) { @@ -670,7 +635,7 @@ namespace RandomX { if ((instr.locc & 7) <= 3) { crlen = 17; } - emit(0x74e53b48); //cmp rsp, rbp; je + emit(0x74e73b48); //cmp rsp, rdi; je emitByte(11 + crlen); emitByte(0x48); emit(0x08244433); //xor rax,QWORD PTR [rsp+0x8] diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index e2c432c..cea067c 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -64,10 +64,10 @@ namespace RandomX { void genbr1(Instruction&, uint16_t, 
uint16_t); void genbr132(Instruction&, uint16_t, uint8_t); void genbf(Instruction&, uint8_t); - void scratchpadStoreR(Instruction&, uint32_t); + void scratchpadStoreR(Instruction&, uint32_t, bool); void scratchpadStoreF(Instruction&, int, uint32_t, bool); - void gencr(Instruction&); - void gencf(Instruction&, bool); + void gencr(Instruction&, bool); + void gencf(Instruction&); void generateCode(Instruction&, int); void fixCallOffsets(); diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc index b7b779b..90b26ce 100644 --- a/src/asm/program_epilogue_store.inc +++ b/src/asm/program_epilogue_store.inc @@ -1,8 +1,9 @@ ;# unroll VM stack - mov rsp, rbp + mov rsp, rdi ;# save VM register values pop rcx + pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index 8d09d88..6bc3bd2 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -7,9 +7,11 @@ push r15 ;# function arguments - push rdi ;# RegisterFile& registerFile - mov rbx, rsi ;# MemoryRegisters& memory - mov rsi, rdx ;# convertible_t* scratchpad + push rdi ;# RegisterFile& registerFile + mov rbp, qword ptr [rsi] ;# "mx", "ma" + mov rax, qword ptr [rsi+8] ;# uint8_t* dataset + push rax + mov rsi, rdx ;# convertible_t* scratchpad mov rcx, rdi #include "program_prologue_load.inc" diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index df44c08..ef4f96e 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -1,5 +1,5 @@ - mov rbp, rsp ;# beginning of VM stack - mov rdi, 1048577 ;# number of VM instructions to execute + 1 + mov rdi, rsp ;# beginning of VM stack + mov ebx, 1048577 ;# number of VM instructions to execute + 1 xorps xmm10, xmm10 cmpeqpd xmm10, xmm10 diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index 6059904..bbf7851 
100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -15,9 +15,11 @@ movdqu xmmword ptr [rsp+0], xmm10 ;# function arguments - push rcx ;# RegisterFile& registerFile - mov rbx, rdx ;# MemoryRegisters& memory - mov rsi, r8 ;# convertible_t* scratchpad + push rcx ;# RegisterFile& registerFile + mov rbp, qword ptr [rdx] ;# "mx", "ma" + mov rax, qword ptr [rdx+8] ;# uint8_t* dataset + push rax + mov rsi, r8 ;# convertible_t* scratchpad include program_prologue_load.inc diff --git a/src/asm/program_read.inc b/src/asm/program_read.inc new file mode 100644 index 0000000..adf8e92 --- /dev/null +++ b/src/asm/program_read.inc @@ -0,0 +1,32 @@ + push rcx ;# preserve ecx + db 0, 0, 0, 0 ;# TransformAddress placeholder + mov rax, qword ptr [rdi] ;# load the dataset address + xor rbp, rcx ;# modify "mx" + ;# prefetch cacheline "mx" + and rbp, -64 ;# align "mx" to the start of a cache line + mov edx, ebp ;# edx = mx + prefetchnta byte ptr [rax+rdx] + ;# read cacheline "ma" + ror rbp, 32 ;# swap "ma" and "mx" + mov edx, ebp ;# edx = ma + scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8 + lea rcx, [rsi+rcx*8] ;# scratchpad cache line + lea rax, [rax+rdx] ;# dataset cache line + mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) + xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline + mov rdx, qword ptr [rax+8] + xor qword ptr [rcx+8], rdx + mov rdx, qword ptr [rax+16] + xor qword ptr [rcx+16], rdx + mov rdx, qword ptr [rax+24] + xor qword ptr [rcx+24], rdx + mov rdx, qword ptr [rax+32] + xor qword ptr [rcx+32], rdx + mov rdx, qword ptr [rax+40] + xor qword ptr [rcx+40], rdx + mov rdx, qword ptr [rax+48] + xor qword ptr [rcx+48], rdx + mov rdx, qword ptr [rax+56] + xor qword ptr [rcx+56], rdx + pop rcx ;# restore ecx + ret \ No newline at end of file diff --git a/src/asm/program_read_f.inc 
b/src/asm/program_read_f.inc deleted file mode 100644 index 1d70dab..0000000 --- a/src/asm/program_read_f.inc +++ /dev/null @@ -1,13 +0,0 @@ - mov edx, dword ptr [rbx] ;# ma - mov rax, qword ptr [rbx+8] ;# dataset - cvtdq2pd xmm0, qword ptr [rax+rdx] - add dword ptr [rbx], 8 - xor ecx, dword ptr [rbx+4] ;# mx - mov dword ptr [rbx+4], ecx - test ecx, 65528 - jne short rx_read_dataset_f_ret - and ecx, -8 - mov dword ptr [rbx], ecx - prefetcht0 byte ptr [rax+rcx] -rx_read_dataset_f_ret: - ret 0 \ No newline at end of file diff --git a/src/asm/program_read_r.inc b/src/asm/program_read_r.inc deleted file mode 100644 index b3102dc..0000000 --- a/src/asm/program_read_r.inc +++ /dev/null @@ -1,13 +0,0 @@ - mov eax, dword ptr [rbx] ;# ma - mov rdx, qword ptr [rbx+8] ;# dataset - mov rax, qword ptr [rdx+rax] - add dword ptr [rbx], 8 - xor ecx, dword ptr [rbx+4] ;# mx - mov dword ptr [rbx+4], ecx - test ecx, 65528 - jne short rx_read_dataset_r_ret - and ecx, -8 - mov dword ptr [rbx], ecx - prefetcht0 byte ptr [rdx+rcx] -rx_read_dataset_r_ret: - ret 0 \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index 12b74c1..acda52a 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -77,6 +77,7 @@ namespace RandomX { constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t ScratchpadL1 = ScratchpadSize / 16 / sizeof(convertible_t); constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t); + constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; class Cache; diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index ec39c60..841bb16 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -158,10 +158,14 @@ executeProgram PROC pslldq xmm7, 8 cvtsi2sd xmm7, qword ptr [rcx+112] - ; program body + jmp program_begin + ; program body +ALIGN 64 +program_begin: include program.inc +ALIGN 64 rx_finish: ; unroll the stack mov rsp, rdi diff --git 
a/src/main.cpp b/src/main.cpp index 81d49ec..a0ffc0a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -277,10 +277,6 @@ int main(int argc, char** argv) { if(programCount == 1000) std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl; std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; - /*if (threadCount == 1 && !compiled) { - auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0]; - std::cout << ivm->getProgam(); - }*/ } catch (std::exception& e) { std::cout << "ERROR: " << e.what() << std::endl; diff --git a/src/program.inc b/src/program.inc index 5dd1b4e..e078cc3 100644 --- a/src/program.inc +++ b/src/program.inc @@ -76,11 +76,13 @@ rx_body_3: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov r8, rcx rx_i_4: ;MULH_64 dec ebx @@ -153,7 +155,7 @@ rx_body_7: mov eax, r14d xor eax, 057c8c41bh and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_8: ;SHL_64 dec ebx @@ -218,7 +220,7 @@ rx_body_11: mov eax, r12d xor eax, 0852d40d8h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_12: ;CALL dec ebx @@ -355,7 +357,7 @@ rx_body_18: mov eax, r11d xor eax, 0869baa81h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_19: ;FPSUB dec ebx @@ -372,7 +374,7 @@ rx_body_19: subpd xmm0, xmm8 movaps xmm7, xmm0 -rx_i_20: ;FPMUL +rx_i_20: ;FPSUB dec ebx jz rx_finish xor r13, 0ecca967dh @@ -383,15 +385,12 @@ rx_i_20: ;FPMUL rx_body_20: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d xor eax, 0aad81365h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 + movlpd qword ptr [rsi + rax 
* 8], xmm7 rx_i_21: ;FPADD dec ebx @@ -482,7 +481,7 @@ rx_body_25: mov eax, r14d xor eax, 0baf5c2d4h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_26: ;IMUL_32 dec ebx @@ -580,7 +579,7 @@ rx_body_31: mov eax, r14d xor eax, 01e2da792h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_32: ;XOR_64 dec ebx @@ -668,7 +667,7 @@ rx_body_36: andps xmm0, xmm1 movaps xmm7, xmm0 -rx_i_37: ;FPMUL +rx_i_37: ;FPSUB dec ebx jz rx_finish xor r12, 0d0706601h @@ -679,10 +678,7 @@ rx_i_37: ;FPMUL rx_body_37: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d xor eax, 0bca81c78h @@ -764,7 +760,7 @@ taken_call_41: push rax call rx_i_127 -rx_i_42: ;FPSUB +rx_i_42: ;FPADD dec ebx jz rx_finish xor r15, 0bc1de9f6h @@ -776,7 +772,7 @@ rx_body_42: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm6 + addpd xmm0, xmm6 movaps xmm6, xmm0 rx_i_43: ;SUB_64 @@ -887,7 +883,7 @@ rx_body_48: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm9 -rx_i_49: ;FPMUL +rx_i_49: ;FPSUB dec ebx jz rx_finish xor r8, 0f96c6a45h @@ -898,10 +894,7 @@ rx_i_49: ;FPMUL rx_body_49: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm3 movaps xmm5, xmm0 rx_i_50: ;OR_32 @@ -1018,7 +1011,7 @@ rx_body_55: mov eax, r11d xor eax, 07c79cddh and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 + movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_56: ;AND_64 dec ebx @@ -1144,7 +1137,7 @@ taken_call_61: push rax call rx_i_120 -rx_i_62: ;FPMUL +rx_i_62: ;FPSUB dec ebx jz rx_finish xor r15, 0c3089414h @@ -1155,17 +1148,14 @@ rx_i_62: ;FPMUL rx_body_62: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 
+ subpd xmm0, xmm8 movaps xmm2, xmm0 mov eax, r10d xor eax, 05c4789e3h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm2 + movhpd qword ptr [rsi + rax * 8], xmm2 -rx_i_63: ;FPMUL +rx_i_63: ;FPSUB dec ebx jz rx_finish xor r9, 065cf272eh @@ -1176,10 +1166,7 @@ rx_i_63: ;FPMUL rx_body_63: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm7 movaps xmm8, xmm0 rx_i_64: ;SUB_64 @@ -1253,7 +1240,7 @@ taken_call_67: push rax call rx_i_79 -rx_i_68: ;FPSUB +rx_i_68: ;FPADD dec ebx jz rx_finish xor r13, 03aa5c3a4h @@ -1264,7 +1251,7 @@ rx_i_68: ;FPSUB rx_body_68: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm2 + addpd xmm0, xmm2 movaps xmm4, xmm0 mov eax, r12d xor eax, 03c51ef39h @@ -1354,11 +1341,16 @@ rx_i_73: ;FPROUND rx_body_73: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov eax, r10d + xor eax, 040624270h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_74: ;MUL_64 dec ebx @@ -1722,7 +1714,7 @@ rx_body_93: mov eax, r10d xor eax, 07e48a0d8h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm2 + movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_94: ;RET dec ebx @@ -1830,7 +1822,7 @@ rx_body_99: mov eax, r12d xor eax, 04c21df83h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 + movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_100: ;ADD_64 dec ebx @@ -1955,7 +1947,7 @@ rx_body_106: mov eax, r12d xor eax, 03cb2505h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_107: ;CALL dec ebx @@ -1999,7 +1991,7 @@ rx_body_108: mov eax, r9d xor eax, 0678b65beh and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm9 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_109: ;FPADD dec ebx @@ -2207,7 +2199,7 @@ rx_body_120: addpd xmm0, xmm4 movaps xmm8, xmm0 -rx_i_121: ;FPMUL +rx_i_121: ;FPSUB dec ebx 
jz rx_finish xor r9, 03ab8f73h @@ -2218,10 +2210,7 @@ rx_i_121: ;FPMUL rx_body_121: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm5 movaps xmm8, xmm0 rx_i_122: ;RET @@ -2813,7 +2802,7 @@ rx_body_153: mov eax, r8d xor eax, 09111c981h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 + movhpd qword ptr [rsi + rax * 8], xmm8 rx_i_154: ;MUL_32 dec ebx @@ -3196,11 +3185,13 @@ rx_i_174: ;FPROUND rx_body_174: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov r14, rcx rx_i_175: ;SAR_64 dec ebx @@ -3431,7 +3422,7 @@ rx_body_187: andps xmm0, xmm1 movaps xmm5, xmm0 -rx_i_188: ;FPMUL +rx_i_188: ;FPSUB dec ebx jz rx_finish xor r9, 04659becbh @@ -3443,10 +3434,7 @@ rx_body_188: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm3 movaps xmm4, xmm0 rx_i_189: ;FPROUND @@ -3460,11 +3448,16 @@ rx_i_189: ;FPROUND rx_body_189: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov eax, r13d + xor eax, 0e6f1a3b7h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_190: ;RET dec ebx @@ -3761,7 +3754,7 @@ rx_body_205: andps xmm0, xmm1 movaps xmm5, xmm0 -rx_i_206: ;FPMUL +rx_i_206: ;FPSUB dec ebx jz rx_finish xor r11, 0e836a177h @@ -3773,10 +3766,7 @@ rx_body_206: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm7 movaps xmm4, xmm0 rx_i_207: ;AND_32 @@ -4085,7 +4075,7 @@ rx_body_223: mov eax, r10d xor eax, 07fca59eeh and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm2 + movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_224: ;SAR_64 dec ebx @@ 
-4171,7 +4161,7 @@ rx_body_227: mov eax, r11d xor eax, 0aabe2a0ah and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 + movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_228: ;CALL dec ebx @@ -4313,11 +4303,16 @@ rx_i_234: ;FPROUND rx_body_234: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov eax, r12d + xor eax, 04d2e9e7dh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_235: ;IMUL_32 dec ebx @@ -4438,7 +4433,7 @@ rx_body_241: mov eax, r15d xor eax, 0bc2423ebh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 + movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_242: ;MULH_64 dec ebx @@ -4734,7 +4729,7 @@ rx_body_257: mov eax, r11d xor eax, 0373b1b6fh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_258: ;MUL_32 dec ebx @@ -4771,7 +4766,7 @@ rx_body_259: addpd xmm0, xmm9 movaps xmm3, xmm0 -rx_i_260: ;FPMUL +rx_i_260: ;FPSUB dec ebx jz rx_finish xor r13, 0f94e9fa9h @@ -4783,10 +4778,7 @@ rx_body_260: xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm5 movaps xmm9, xmm0 rx_i_261: ;FPSQRT @@ -4806,7 +4798,7 @@ rx_body_261: mov eax, r11d xor eax, 0745a48e9h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 + movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_262: ;OR_32 dec ebx @@ -5044,7 +5036,7 @@ rx_body_274: mov eax, r14d xor eax, 06a2b2b5bh and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 + movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_275: ;OR_64 dec ebx @@ -5121,7 +5113,7 @@ rx_body_278: mov eax, r12d xor eax, 02d00ad10h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 + movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_279: ;FPSUB dec ebx @@ -5139,7 +5131,7 @@ rx_body_279: mov eax, r9d xor eax, 0475ade01h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 + movlpd qword ptr [rsi 
+ rax * 8], xmm9 rx_i_280: ;AND_64 dec ebx @@ -5210,7 +5202,7 @@ rx_body_283: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_284: ;FPSUB +rx_i_284: ;FPADD dec ebx jz rx_finish xor r15, 0e68f36ach @@ -5222,7 +5214,7 @@ rx_body_284: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm6 + addpd xmm0, xmm6 movaps xmm9, xmm0 mov eax, r9d xor eax, 0936f2960h @@ -5313,7 +5305,7 @@ rx_body_289: andps xmm0, xmm1 movaps xmm8, xmm0 -rx_i_290: ;FPMUL +rx_i_290: ;FPSUB dec ebx jz rx_finish xor r15, 060665748h @@ -5324,10 +5316,7 @@ rx_i_290: ;FPMUL rx_body_290: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm8 movaps xmm9, xmm0 rx_i_291: ;RET @@ -5531,7 +5520,7 @@ rx_body_301: mov eax, r15d xor eax, 0433cf2d6h and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm7 + movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_302: ;ADD_64 dec ebx @@ -5937,7 +5926,7 @@ rx_body_324: mov eax, r9d xor eax, 0944856d4h and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm9 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_325: ;SHL_64 dec ebx @@ -6076,7 +6065,7 @@ rx_body_332: mov eax, r11d xor eax, 0116c919eh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_333: ;XOR_64 dec ebx @@ -6222,7 +6211,7 @@ rx_body_341: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_342: ;FPMUL +rx_i_342: ;FPSUB dec ebx jz rx_finish xor r9, 09ccc7abah @@ -6233,10 +6222,7 @@ rx_i_342: ;FPMUL rx_body_342: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm2 movaps xmm3, xmm0 rx_i_343: ;SHR_64 @@ -6258,7 +6244,7 @@ rx_body_343: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_344: ;FPMUL +rx_i_344: ;FPSUB dec ebx jz rx_finish xor r10, 03ef9bcc4h @@ -6269,10 +6255,7 @@ rx_i_344: ;FPMUL rx_body_344: and ecx, 32767 cvtdq2pd xmm0, qword 
ptr [rsi+rcx*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm6 movaps xmm5, xmm0 rx_i_345: ;MULH_64 @@ -6343,7 +6326,7 @@ rx_body_348: mov eax, r9d xor eax, 039c35461h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_349: ;XOR_32 dec ebx @@ -6413,9 +6396,9 @@ rx_body_352: mov eax, r10d xor eax, 03bf686f2h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm2 + movlpd qword ptr [rsi + rax * 8], xmm2 -rx_i_353: ;FPMUL +rx_i_353: ;FPSUB dec ebx jz rx_finish xor r13, 02e65278bh @@ -6426,15 +6409,12 @@ rx_i_353: ;FPMUL rx_body_353: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d xor eax, 0b3c9f7aeh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 + movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_354: ;MULH_64 dec ebx @@ -6535,7 +6515,7 @@ rx_body_359: mov eax, r12d xor eax, 0f16b9be3h and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm4 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_360: ;FPMUL dec ebx @@ -6570,7 +6550,7 @@ rx_body_361: mov eax, r14d xor eax, 0ad0b81f5h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_362: ;SUB_64 dec ebx @@ -6726,7 +6706,7 @@ rx_body_370: mov eax, r14d xor eax, 0a120e0edh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_371: ;FPADD dec ebx @@ -6948,7 +6928,7 @@ rx_body_383: mov eax, r13d xor eax, 0c9f5cc22h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 + movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_384: ;SHR_64 dec ebx @@ -7256,7 +7236,7 @@ rx_body_400: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_401: ;FPMUL +rx_i_401: ;FPSUB dec ebx jz rx_finish xor r13, 032e81f25h @@ -7267,15 +7247,12 @@ rx_i_401: ;FPMUL rx_body_401: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd 
xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm4 movaps xmm6, xmm0 mov eax, r14d xor eax, 03ea60344h and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_402: ;RET dec ebx @@ -7382,13 +7359,15 @@ rx_i_406: ;FPROUND rx_body_406: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov r9, rcx -rx_i_407: ;FPMUL +rx_i_407: ;FPSUB dec ebx jz rx_finish xor r14, 09699566fh @@ -7400,10 +7379,7 @@ rx_body_407: xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm9 movaps xmm8, xmm0 rx_i_408: ;MUL_64 @@ -7493,7 +7469,7 @@ rx_body_412: mov eax, r11d xor eax, 0bbd2640ah and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_413: ;FPDIV dec ebx @@ -7704,7 +7680,7 @@ rx_body_424: mov eax, r9d xor eax, 0565ae8aah and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 + movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_425: ;IMUL_32 dec ebx @@ -7887,7 +7863,7 @@ rx_body_434: mov eax, r9d xor eax, 08c1cfc74h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_435: ;MUL_64 dec ebx @@ -8068,7 +8044,7 @@ not_taken_ret_443: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_444: ;FPMUL +rx_i_444: ;FPSUB dec ebx jz rx_finish xor r8, 042455dd8h @@ -8079,15 +8055,12 @@ rx_i_444: ;FPMUL rx_body_444: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm7 movaps xmm5, xmm0 mov eax, r13d xor eax, 0ce416070h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 + movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_445: ;ADD_64 dec ebx @@ -8128,7 +8101,7 @@ rx_body_446: and eax, 2047 mov qword ptr [rsi + rax * 8], 
rcx -rx_i_447: ;FPSUB +rx_i_447: ;FPADD dec ebx jz rx_finish xor r8, 01596d0e8h @@ -8139,12 +8112,12 @@ rx_i_447: ;FPSUB rx_body_447: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm7 + addpd xmm0, xmm7 movaps xmm5, xmm0 mov eax, r13d xor eax, 0b384d4afh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 + movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_448: ;FPSUB dec ebx @@ -8668,7 +8641,7 @@ rx_body_477: mov eax, r14d xor eax, 0e81fc7a6h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 + movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_478: ;MUL_64 dec ebx @@ -9143,7 +9116,7 @@ rx_body_504: and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm4 -rx_i_505: ;FPMUL +rx_i_505: ;FPSUB dec ebx jz rx_finish xor r12, 032c0a28ah @@ -9154,17 +9127,14 @@ rx_i_505: ;FPMUL rx_body_505: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm4 movaps xmm8, xmm0 mov eax, r8d xor eax, 021b54eaeh and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm8 + movhpd qword ptr [rsi + rax * 8], xmm8 -rx_i_506: ;FPMUL +rx_i_506: ;FPSUB dec ebx jz rx_finish xor r9, 0a973d58ch @@ -9175,10 +9145,7 @@ rx_i_506: ;FPMUL rx_body_506: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm9 movaps xmm3, xmm0 rx_i_507: ;RET @@ -9238,7 +9205,7 @@ taken_call_509: push rax call rx_i_42 -rx_i_510: ;FPSUB +rx_i_510: ;FPADD dec ebx jz rx_finish xor r8, 0db65513ch @@ -9249,7 +9216,7 @@ rx_i_510: ;FPSUB rx_body_510: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm2 + addpd xmm0, xmm2 movaps xmm9, xmm0 rx_i_511: ;ROL_64 diff --git a/src/virtualMemory.cpp b/src/virtualMemory.cpp index 766fda3..e6e44fc 100644 --- a/src/virtualMemory.cpp +++ b/src/virtualMemory.cpp @@ -74,21 +74,21 @@ void setPrivilege(const char* pszPrivilege, BOOL bEnable) { } #endif -void* allocExecutableMemory(size_t 
bytes) { +void* allocExecutableMemory(std::size_t bytes) { void* mem; #ifdef _WIN32 mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_EXECUTE_READWRITE); if (mem == nullptr) throw std::runtime_error(getErrorMessage("allocExecutableMemory - VirtualAlloc")); #else - mem = mmap(nullptr, CodeSize, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (mem == MAP_FAILED) throw std::runtime_error("allocExecutableMemory - mmap failed"); #endif return mem; } -void* allocLargePagesMemory(size_t bytes) { +void* allocLargePagesMemory(std::size_t bytes) { void* mem; #ifdef _WIN32 setPrivilege("SeLockMemoryPrivilege", 1); diff --git a/src/virtualMemory.hpp b/src/virtualMemory.hpp index dd150d3..c80d33e 100644 --- a/src/virtualMemory.hpp +++ b/src/virtualMemory.hpp @@ -19,5 +19,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>. #pragma once -void* allocExecutableMemory(size_t); -void* allocLargePagesMemory(size_t); \ No newline at end of file +#include <cstddef> + +void* allocExecutableMemory(std::size_t); +void* allocLargePagesMemory(std::size_t); \ No newline at end of file