Vector FPU instructions

JitCompilerX86 - static code written in asm
Updated ALU/FPU tests
Updated instruction weights
This commit is contained in:
tevador 2018-12-31 19:06:45 +01:00
parent a09bee8d60
commit 3caecc7646
30 changed files with 3757 additions and 3618 deletions

View File

@ -12,6 +12,9 @@ OBJDIR=obj
LDFLAGS=-lpthread LDFLAGS=-lpthread
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o) ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o)
# The object built from the hand-written assembly file is added to the
# release objects only when PLATFORM is x86_64.
ifeq ($(PLATFORM),x86_64)
ROBJS += $(OBJDIR)/JitCompilerX86-static.o
endif
all: release test all: release test
@ -57,6 +60,9 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) |
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR) $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
# Builds the static JIT object from the assembly source. The asm/program_*.inc
# files are listed as prerequisites so that a change to any of them triggers a
# rebuild. -x assembler-with-cpp makes the compiler driver preprocess the file
# before assembling it.
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_r.inc read_f.inc)) | $(OBJDIR)
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@
$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR) $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@ $(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@

View File

@ -54,7 +54,7 @@ namespace RandomX {
(this->*generator)(instr, i); (this->*generator)(instr, i);
} }
void AssemblyGeneratorX86::gena(Instruction& instr) { void AssemblyGeneratorX86::genar(Instruction& instr) {
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
switch (instr.loca & 7) switch (instr.loca & 7)
{ {
@ -63,7 +63,7 @@ namespace RandomX {
case 2: case 2:
case 3: case 3:
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
asmCode << "\tcall rx_read_dataset" << std::endl; asmCode << "\tcall rx_read_dataset_r" << std::endl;
return; return;
case 4: case 4:
@ -80,6 +80,33 @@ namespace RandomX {
} }
} }
// Emits the assembly that loads operand A for the vector FPU instructions.
// The address register is first XORed with instr.addra. Depending on
// (instr.loca & 7) the emitted code then either calls rx_read_dataset_f
// (values 0-3) or loads from [rsi + rax * 8] with cvtdq2pd into xmm0, after
// masking the index with ScratchpadL2 - 1 (value 4) or ScratchpadL1 - 1
// (values 5-7).
void AssemblyGeneratorX86::genaf(Instruction& instr) {
	const auto srcIndex = instr.rega % RegistersCount;
	asmCode << "\txor " << regR[srcIndex] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;

	const auto location = instr.loca & 7;
	if (location < 4) {
		// The address register is copied into ecx right before the call
		// (presumably the argument of rx_read_dataset_f - confirm).
		asmCode << "\tmov ecx, " << regR32[srcIndex] << std::endl;
		asmCode << "\tcall rx_read_dataset_f" << std::endl;
		return;
	}

	// Remaining locations differ only in the mask applied to the index.
	asmCode << "\tmov eax, " << regR32[srcIndex] << std::endl;
	if (location == 4) {
		asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
	}
	else {
		asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
	}
	asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi + rax * 8]" << std::endl;
}
void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) { void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) {
switch (instr.locb & 7) switch (instr.locb & 7)
{ {
@ -87,8 +114,6 @@ namespace RandomX {
case 1: case 1:
case 2: case 2:
case 3: case 3:
case 4:
case 5:
asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl; asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl;
asmCode << "\t" << instrx86 << " rax, cl" << std::endl; asmCode << "\t" << instrx86 << " rax, cl" << std::endl;
return; return;
@ -133,26 +158,7 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::genbf(Instruction& instr, const char* instrx86) { void AssemblyGeneratorX86::genbf(Instruction& instr, const char* instrx86) {
asmCode << "\tand rax, -2048" << std::endl;
asmCode << "\tcvtsi2sd xmm0, rax" << std::endl;
switch (instr.locb & 7)
{
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl; asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
return;
default:
convertible_t bimm;
bimm.f64 = (double)instr.imm32;
asmCode << "\tmov rax, " << bimm.i64 << std::endl;
asmCode << "\tmovd xmm1, rax" << std::endl;
asmCode << "\t" << instrx86 << " xmm0, xmm1" << std::endl;
return;
}
} }
void AssemblyGeneratorX86::gencr(Instruction& instr) { void AssemblyGeneratorX86::gencr(Instruction& instr) {
@ -165,7 +171,7 @@ namespace RandomX {
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl; asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
if (trace) { if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl; asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl;
} }
return; return;
@ -178,76 +184,75 @@ namespace RandomX {
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl; asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
if (trace) { if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl; asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl;
} }
return; return;
default: default:
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl; asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl;
if (trace) { if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl; asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl;
} }
} }
} }
void AssemblyGeneratorX86::gencf(Instruction& instr) { void AssemblyGeneratorX86::gencf(Instruction& instr, bool alwaysLow = false) {
if(!alwaysLow)
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
const char* store = (!alwaysLow && (instr.locc & 8)) ? "movhpd" : "movlpd";
switch (instr.locc & 7) switch (instr.locc & 7)
{ {
case 0: case 4:
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl; asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl;
break; break;
case 1: case 5:
case 2: case 6:
case 3: case 7:
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl; asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl;
break;
default:
asmCode << "\tmovsd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
break; break;
} }
if (trace) { if (trace) {
asmCode << "\tmovd qword ptr [rsi + rdi * 8 + 262144], xmm0" << std::endl; asmCode << "\t" << store << " qword ptr [rsi + rdi * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl;
} }
} }
void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tadd rax, "; asmCode << "\tadd rax, ";
genbr1(instr); genbr1(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) { void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tadd eax, "; asmCode << "\tadd eax, ";
genbr132(instr); genbr132(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tsub rax, "; asmCode << "\tsub rax, ";
genbr1(instr); genbr1(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) { void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tsub eax, "; asmCode << "\tsub eax, ";
genbr132(instr); genbr132(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\timul rax, "; asmCode << "\timul rax, ";
if ((instr.locb & 7) >= 6) { if ((instr.locb & 7) >= 6) {
asmCode << "rax, "; asmCode << "rax, ";
@ -257,7 +262,7 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tmov rcx, "; asmCode << "\tmov rcx, ";
genbr1(instr); genbr1(instr);
asmCode << "\tmul rcx" << std::endl; asmCode << "\tmul rcx" << std::endl;
@ -266,7 +271,7 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::h_MUL_32(Instruction& instr, int i) { void AssemblyGeneratorX86::h_MUL_32(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tmov ecx, eax" << std::endl; asmCode << "\tmov ecx, eax" << std::endl;
asmCode << "\tmov eax, "; asmCode << "\tmov eax, ";
genbr132(instr); genbr132(instr);
@ -275,7 +280,7 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) { void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tmovsxd rcx, eax" << std::endl; asmCode << "\tmovsxd rcx, eax" << std::endl;
if ((instr.locb & 7) >= 6) { if ((instr.locb & 7) >= 6) {
asmCode << "\tmov rax, " << instr.imm32 << std::endl; asmCode << "\tmov rax, " << instr.imm32 << std::endl;
@ -288,7 +293,7 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tmov rcx, "; asmCode << "\tmov rcx, ";
genbr1(instr); genbr1(instr);
asmCode << "\timul rcx" << std::endl; asmCode << "\timul rcx" << std::endl;
@ -297,7 +302,7 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) {
gena(instr); genar(instr);
if ((instr.locb & 7) >= 6) { if ((instr.locb & 7) >= 6) {
if (instr.imm32 == 0) { if (instr.imm32 == 0) {
asmCode << "\tmov ecx, 1" << std::endl; asmCode << "\tmov ecx, 1" << std::endl;
@ -318,7 +323,7 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tmov edx, "; asmCode << "\tmov edx, ";
genbr132(instr); genbr132(instr);
asmCode << "\tcmp edx, -1" << std::endl; asmCode << "\tcmp edx, -1" << std::endl;
@ -339,123 +344,125 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tand rax, "; asmCode << "\tand rax, ";
genbr1(instr); genbr1(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) { void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tand eax, "; asmCode << "\tand eax, ";
genbr132(instr); genbr132(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tor rax, "; asmCode << "\tor rax, ";
genbr1(instr); genbr1(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) { void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tor eax, "; asmCode << "\tor eax, ";
genbr132(instr); genbr132(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\txor rax, "; asmCode << "\txor rax, ";
genbr1(instr); genbr1(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) { void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\txor eax, "; asmCode << "\txor eax, ";
genbr132(instr); genbr132(instr);
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) {
gena(instr); genar(instr);
genbr0(instr, "shl"); genbr0(instr, "shl");
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) {
gena(instr); genar(instr);
genbr0(instr, "shr"); genbr0(instr, "shr");
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) {
gena(instr); genar(instr);
genbr0(instr, "sar"); genbr0(instr, "sar");
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) {
gena(instr); genar(instr);
genbr0(instr, "rol"); genbr0(instr, "rol");
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) { void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) {
gena(instr); genar(instr);
genbr0(instr, "ror"); genbr0(instr, "ror");
gencr(instr); gencr(instr);
} }
void AssemblyGeneratorX86::h_FPADD(Instruction& instr, int i) { void AssemblyGeneratorX86::h_FPADD(Instruction& instr, int i) {
gena(instr); genaf(instr);
genbf(instr, "addsd"); genbf(instr, "addpd");
gencf(instr); gencf(instr);
} }
void AssemblyGeneratorX86::h_FPSUB(Instruction& instr, int i) { void AssemblyGeneratorX86::h_FPSUB(Instruction& instr, int i) {
gena(instr); genaf(instr);
genbf(instr, "subsd"); genbf(instr, "subpd");
gencf(instr); gencf(instr);
} }
void AssemblyGeneratorX86::h_FPMUL(Instruction& instr, int i) { void AssemblyGeneratorX86::h_FPMUL(Instruction& instr, int i) {
gena(instr); genaf(instr);
asmCode << "\tor rax, 2048" << std::endl; genbf(instr, "mulpd");
genbf(instr, "mulsd"); asmCode << "\tmovaps xmm1, xmm0" << std::endl;
asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl;
asmCode << "\tandps xmm0, xmm1" << std::endl;
gencf(instr); gencf(instr);
} }
void AssemblyGeneratorX86::h_FPDIV(Instruction& instr, int i) { void AssemblyGeneratorX86::h_FPDIV(Instruction& instr, int i) {
gena(instr); genaf(instr);
asmCode << "\tor rax, 2048" << std::endl; genbf(instr, "divpd");
genbf(instr, "divsd"); asmCode << "\tmovaps xmm1, xmm0" << std::endl;
asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl;
asmCode << "\tandps xmm0, xmm1" << std::endl;
gencf(instr); gencf(instr);
} }
void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) { void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) {
gena(instr); genaf(instr);
asmCode << "\tmov rcx, 9223372036854773760" << std::endl; asmCode << "\tandps xmm0, xmm10" << std::endl;
asmCode << "\tand rax, rcx" << std::endl; asmCode << "\tsqrtpd xmm0, xmm0" << std::endl;
asmCode << "\tcvtsi2sd xmm0, rax" << std::endl;
asmCode << "\tsqrtsd xmm0, xmm0" << std::endl;
gencf(instr); gencf(instr);
} }
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tshl eax, 13" << std::endl; asmCode << "\tshl eax, 13" << std::endl;
asmCode << "\tand rcx, -2048" << std::endl; asmCode << "\tand rcx, -2048" << std::endl;
asmCode << "\tand eax, 24576" << std::endl; asmCode << "\tand eax, 24576" << std::endl;
asmCode << "\tcvtsi2sd xmm0, rcx" << std::endl; asmCode << "\tcvtsi2sd " << regF[instr.regc % RegistersCount] << ", rcx" << std::endl;
asmCode << "\tor eax, 40896" << std::endl; asmCode << "\tor eax, 40896" << std::endl;
asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl; asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl;
asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl; asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl;
gencf(instr); gencf(instr, true);
} }
static inline const char* jumpCondition(Instruction& instr, bool invert = false) { static inline const char* jumpCondition(Instruction& instr, bool invert = false) {
@ -481,7 +488,7 @@ namespace RandomX {
} }
void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) { void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl; asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl;
asmCode << "\t" << jumpCondition(instr); asmCode << "\t" << jumpCondition(instr);
asmCode << " short taken_call_" << i << std::endl; asmCode << " short taken_call_" << i << std::endl;
@ -489,14 +496,14 @@ namespace RandomX {
asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl; asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl;
asmCode << "taken_call_" << i << ":" << std::endl; asmCode << "taken_call_" << i << ":" << std::endl;
if (trace) { if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl; asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl;
} }
asmCode << "\tpush rax" << std::endl; asmCode << "\tpush rax" << std::endl;
asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl; asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl;
} }
void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) { void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) {
gena(instr); genar(instr);
asmCode << "\tcmp rsp, rbp" << std::endl; asmCode << "\tcmp rsp, rbp" << std::endl;
asmCode << "\tje short not_taken_ret_" << i << std::endl; asmCode << "\tje short not_taken_ret_" << i << std::endl;
asmCode << "\txor rax, qword ptr [rsp + 8]" << std::endl; asmCode << "\txor rax, qword ptr [rsp + 8]" << std::endl;

View File

@ -38,13 +38,14 @@ namespace RandomX {
static InstructionGenerator engine[256]; static InstructionGenerator engine[256];
std::stringstream asmCode; std::stringstream asmCode;
void gena(Instruction&); void genar(Instruction&);
void genaf(Instruction&);
void genbr0(Instruction&, const char*); void genbr0(Instruction&, const char*);
void genbr1(Instruction&); void genbr1(Instruction&);
void genbr132(Instruction&); void genbr132(Instruction&);
void genbf(Instruction&, const char*); void genbf(Instruction&, const char*);
void gencr(Instruction&); void gencr(Instruction&);
void gencf(Instruction&); void gencf(Instruction&, bool);
void generateCode(Instruction&, int); void generateCode(Instruction&, int);

View File

@ -26,9 +26,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX { namespace RandomX {
CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) { CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) {
#if !defined(_M_X64) && !defined(__x86_64__)
throw std::runtime_error("Compiled VM only supports x86-64 CPUs");
#endif
} }
void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) { void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) {
@ -51,7 +49,7 @@ namespace RandomX {
void CompiledVirtualMachine::execute() { void CompiledVirtualMachine::execute() {
//executeProgram(reg, mem, scratchpad, readDataset); //executeProgram(reg, mem, scratchpad, readDataset);
compiler.getProgramFunc()(reg, mem, scratchpad); compiler.getProgramFunc()(reg, mem, scratchpad);
#ifdef TRACE #ifdef TRACEVM
for (int32_t i = InstructionCount - 1; i >= 0; --i) { for (int32_t i = InstructionCount - 1; i >= 0; --i) {
std::cout << std::hex << tracepad[i].u64 << std::endl; std::cout << std::hex << tracepad[i].u64 << std::endl;
} }

View File

@ -18,7 +18,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/ */
#pragma once #pragma once
//#define TRACE //#define TRACEVM
#include "VirtualMachine.hpp" #include "VirtualMachine.hpp"
#include "JitCompilerX86.hpp" #include "JitCompilerX86.hpp"
@ -34,7 +34,7 @@ namespace RandomX {
return compiler.getCode(); return compiler.getCode();
} }
private: private:
#ifdef TRACE #ifdef TRACEVM
convertible_t tracepad[InstructionCount]; convertible_t tracepad[InstructionCount];
#endif #endif
JitCompilerX86 compiler; JitCompilerX86 compiler;

View File

@ -44,9 +44,11 @@ namespace RandomX {
*(((uint32_t*)&reg) + i) = gen(); *(((uint32_t*)&reg) + i) = gen();
} }
FPINIT(); FPINIT();
for (int i = 0; i < 8; ++i) { for (int i = 0; i < RegistersCount; ++i) {
reg.f[i].f64 = (double)reg.f[i].i64; reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
} }
//std::cout << reg;
p.initialize(gen); p.initialize(gen);
mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7;
mem.mx = *(((uint32_t*)seed) + 5); mem.mx = *(((uint32_t*)seed) + 5);
@ -119,9 +121,9 @@ namespace RandomX {
case 1: case 1:
case 2: case 2:
case 3: case 3:
return reg.r[inst.regb % RegistersCount];
case 4: case 4:
case 5: case 5:
return reg.r[inst.regb % RegistersCount];
case 6: case 6:
case 7: case 7:
convertible_t temp; convertible_t temp;
@ -130,22 +132,6 @@ namespace RandomX {
} }
} }
double InterpretedVirtualMachine::loadbf(Instruction& inst) {
switch (inst.locb & 7)
{
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
return reg.f[inst.regb % RegistersCount].f64;
case 6:
case 7:
return (double)inst.imm32;
}
}
convertible_t& InterpretedVirtualMachine::getcr(Instruction& inst) { convertible_t& InterpretedVirtualMachine::getcr(Instruction& inst) {
addr_t addr; addr_t addr;
switch (inst.locc & 7) switch (inst.locc & 7)
@ -168,25 +154,43 @@ namespace RandomX {
} }
} }
convertible_t& InterpretedVirtualMachine::getcf(Instruction& inst) { void InterpretedVirtualMachine::writecf(Instruction& inst, fpu_reg_t& regc) {
addr_t addr; addr_t addr;
switch (inst.locc & 7) switch (inst.locc & 7)
{ {
case 0:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL2];
case 1:
case 2:
case 3:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL1];
case 4: case 4:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL2] = (inst.locc & 8) ? regc.hi : regc.lo;
break;
case 5: case 5:
case 6: case 6:
case 7: case 7:
return reg.f[inst.regc % RegistersCount]; addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL1] = (inst.locc & 8) ? regc.hi : regc.lo;
default:
break;
}
}
void InterpretedVirtualMachine::writecflo(Instruction& inst, fpu_reg_t& regc) {
addr_t addr;
switch (inst.locc & 7)
{
case 4:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL2] = regc.lo;
break;
case 5:
case 6:
case 7:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL1] = regc.lo;
default:
break;
} }
} }
@ -194,22 +198,18 @@ namespace RandomX {
if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl; if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
#define FPU_RETIRE(x) x(a, b, c); \ #define FPU_RETIRE(x) x(a, b, c); \
writecf(inst, c); \
if(trace) { \ if(trace) { \
convertible_t bc; \ std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl; \
bc.f64 = b; \
std::cout << std::hex << /*a.u64 << " " << bc.u64 << " " <<*/ c.u64 << std::endl; \
} \ } \
if(fpuCheck) { \ if(fpuCheck) { \
convertible_t bc; \ if(c.hi.f64 != c.hi.f64 || c.lo.f64 != c.lo.f64) { \
if(c.f64 != c.f64) { \
std::stringstream ss; \ std::stringstream ss; \
bc.f64 = b; \ ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << bc.u64 << ") = " << c.u64; \
throw std::runtime_error(ss.str()); \ throw std::runtime_error(ss.str()); \
} else if (std::fpclassify(c.f64) == FP_SUBNORMAL) {\ } else if (std::fpclassify(c.hi.f64) == FP_SUBNORMAL || std::fpclassify(c.lo.f64) == FP_SUBNORMAL) {\
std::stringstream ss; \ std::stringstream ss; \
bc.f64 = b; \ ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << bc.u64 << ") = " << c.u64; \
throw std::runtime_error(ss.str()); \ throw std::runtime_error(ss.str()); \
} \ } \
} }
@ -220,8 +220,13 @@ namespace RandomX {
#define INC_COUNT(x) #define INC_COUNT(x)
#endif #endif
#define FPU_RETIRE_NB(x) x(a, b, c); \ #define FPU_RETIRE_FPSQRT(x) FPSQRT(a, b, c); \
if(trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl; writecf(inst, c); \
if(trace) std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl;
#define FPU_RETIRE_FPROUND(x) FPROUND(a, b, c); \
writecflo(inst, c); \
if(trace) std::cout << std::hex << c.lo.u64 << std::endl;
#define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ #define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \ INC_COUNT(x) \
@ -242,17 +247,17 @@ namespace RandomX {
#define FPU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ #define FPU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \ INC_COUNT(x) \
convertible_t a = loada(inst); \ convertible_t a = loada(inst); \
double b = loadbf(inst); \ fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \
convertible_t& c = getcf(inst); \ fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE(x) \ FPU_RETIRE(x) \
} }
#define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ #define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \ INC_COUNT(x) \
convertible_t a = loada(inst); \ convertible_t a = loada(inst); \
convertible_t b; \ fpu_reg_t b; \
convertible_t& c = getcf(inst); \ fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE_NB(x) \ FPU_RETIRE_##x(x) \
} }
ALU_INST(ADD_64) ALU_INST(ADD_64)

View File

@ -18,7 +18,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/ */
#pragma once #pragma once
#define STATS //#define STATS
#include "VirtualMachine.hpp" #include "VirtualMachine.hpp"
#include "Program.hpp" #include "Program.hpp"
#include <vector> #include <vector>
@ -88,9 +88,9 @@ namespace RandomX {
convertible_t loada(Instruction&); convertible_t loada(Instruction&);
convertible_t loadbr0(Instruction&); convertible_t loadbr0(Instruction&);
convertible_t loadbr1(Instruction&); convertible_t loadbr1(Instruction&);
double loadbf(Instruction&);
convertible_t& getcr(Instruction&); convertible_t& getcr(Instruction&);
convertible_t& getcf(Instruction&); void writecf(Instruction&, fpu_reg_t&);
void writecflo(Instruction&, fpu_reg_t&);
void stackPush(convertible_t& c) { void stackPush(convertible_t& c) {
stack.push_back(c); stack.push_back(c);

View File

@ -0,0 +1,58 @@
;# Copyright (c) 2018 tevador
;#
;# This file is part of RandomX.
;#
;# RandomX is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# RandomX is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
;# GNU assembler source that is passed through the C preprocessor before
;# assembly. It places the included program fragments behind exported labels.
;# Intel operand order without register prefixes is used throughout.
.intel_syntax noprefix
#if defined(__APPLE__)
.text
#else
.section .text
#endif
;# Exported names go through a wrapper macro that prepends an underscore in the
;# Windows and Apple case and leaves the name unchanged otherwise.
;# NOTE(review): 64-bit Windows toolchains normally do not decorate C symbols
;# with a leading underscore - confirm the Windows case against the linker.
#if defined(__WIN32__) || defined(__APPLE__)
#define DECL(x) _##x
#else
#define DECL(x) x
#endif
.global DECL(randomx_program_prologue)
.global DECL(randomx_program_begin)
.global DECL(randomx_program_epilogue)
.global DECL(randomx_program_read_r)
.global DECL(randomx_program_read_f)
.global DECL(randomx_program_end)
;# Every label starts on a 64-byte boundary. The byte-count directive .balign
;# is used because plain .align takes a power-of-two exponent on some object
;# formats (Mach-O among them), where an argument of 64 does not mean 64 bytes.
.balign 64
DECL(randomx_program_prologue):
#include "asm/program_prologue_linux.inc"
;# Label followed by a single nop.
;# NOTE(review): presumably marks where generated program code is placed - confirm against the JIT compiler.
.balign 64
DECL(randomx_program_begin):
nop
.balign 64
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc"
.balign 64
DECL(randomx_program_read_r):
#include "asm/program_read_r.inc"
.balign 64
DECL(randomx_program_read_f):
#include "asm/program_read_f.inc"
;# Label followed by a single nop.
;# NOTE(review): presumably marks the end of the static code - confirm against the JIT compiler.
.balign 64
DECL(randomx_program_end):
nop

View File

@ -0,0 +1,59 @@
;# Copyright (c) 2018 tevador
;#
;# This file is part of RandomX.
;#
;# RandomX is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# RandomX is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
;# MASM source for the static JIT code. Everything lives in one segment declared
;# with PAGE alignment and READ EXECUTE attributes.
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
;# Labels made visible to the linker.
PUBLIC randomx_program_prologue
PUBLIC randomx_program_begin
PUBLIC randomx_program_epilogue
PUBLIC randomx_program_read_r
PUBLIC randomx_program_read_f
PUBLIC randomx_program_end
;# Each procedure below is preceded by ALIGN 64; its body comes from an include
;# file, except for the two single-nop procedures.
ALIGN 64
randomx_program_prologue PROC
include asm/program_prologue_win64.inc
randomx_program_prologue ENDP
;# Procedure containing a single nop.
;# NOTE(review): presumably marks where generated program code is placed - confirm against the JIT compiler.
ALIGN 64
randomx_program_begin PROC
nop
randomx_program_begin ENDP
ALIGN 64
randomx_program_epilogue PROC
include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP
ALIGN 64
randomx_program_read_r PROC
include asm/program_read_r.inc
randomx_program_read_r ENDP
ALIGN 64
randomx_program_read_f PROC
include asm/program_read_f.inc
randomx_program_read_f ENDP
;# Procedure containing a single nop.
;# NOTE(review): presumably marks the end of the static code - confirm against the JIT compiler.
ALIGN 64
randomx_program_end PROC
nop
randomx_program_end ENDP
_RANDOMX_JITX86_STATIC ENDS
END

View File

@ -0,0 +1,27 @@
/*
Copyright (c) 2018 tevador
This file is part of RandomX.
RandomX is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
RandomX is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
// Symbols exported by the statically assembled JIT code (JitCompilerX86-static).
// They are declared as C-linkage functions so that their addresses can be taken;
// JitCompilerX86.cpp casts those addresses to uint8_t* to locate and measure the
// machine code between consecutive symbols.
extern "C" void randomx_program_prologue();  // start of the prologue fragment
extern "C" void randomx_program_begin();     // end of the prologue
extern "C" void randomx_program_epilogue();  // start of the epilogue fragment
extern "C" void randomx_program_read_r();    // start of the read_r fragment
extern "C" void randomx_program_read_f();    // start of the read_f fragment
extern "C" void randomx_program_end();       // end of the read_f fragment

View File

@ -34,6 +34,16 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX { namespace RandomX {
#if !defined(_M_X64) && !defined(__x86_64__)
JitCompilerX86::JitCompilerX86() {
throw std::runtime_error("JIT compiler only supports x86-64 CPUs");
}
void JitCompilerX86::generateProgram(Pcg32& gen) {
}
#else
/* /*
REGISTER ALLOCATION: REGISTER ALLOCATION:
@ -41,7 +51,7 @@ namespace RandomX {
rbx -> MemoryRegisters& memory rbx -> MemoryRegisters& memory
rcx -> temporary rcx -> temporary
rdx -> temporary rdx -> temporary
rsi -> convertible_t& scratchpad rsi -> convertible_t* scratchpad
rdi -> "ic" (instruction counter) rdi -> "ic" (instruction counter)
rbp -> beginning of VM stack rbp -> beginning of VM stack
rsp -> end of VM stack rsp -> end of VM stack
@ -63,6 +73,7 @@ namespace RandomX {
xmm7 -> "f7" xmm7 -> "f7"
xmm8 -> "f0" xmm8 -> "f0"
xmm9 -> "f1" xmm9 -> "f1"
xmm10 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff
STACK STRUCTURE: STACK STRUCTURE:
@ -81,127 +92,23 @@ namespace RandomX {
*/ */
constexpr uint8_t ic3 = ((InstructionCount + 1) >> 24); #include "JitCompilerX86-static.hpp"
constexpr uint8_t ic2 = ((InstructionCount + 1) >> 16);
constexpr uint8_t ic1 = ((InstructionCount + 1) >> 8);
constexpr uint8_t ic0 = ((InstructionCount + 1) >> 0);
const uint8_t prologue[] = { const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
0x53, //push rbx const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
0x55, //push rbp const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
#ifdef _WIN32 const uint8_t* codeReadDatasetR = (uint8_t*)&randomx_program_read_r;
0x57, //push rdi const uint8_t* codeReadDatasetF = (uint8_t*)&randomx_program_read_f;
0x56, //push rsi const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
#endif
0x41, 0x54, //push r12
0x41, 0x55, //push r13
0x41, 0x56, //push r14
0x41, 0x57, //push r15
#ifdef _WIN32
0x48, 0x83, 0xec, 0x48, //sub rsp,0x48
0xf3, 0x0f, 0x7f, 0x74, 0x24, 0x30, //movdqu XMMWORD PTR[rsp + 0x30],xmm6
0xf3, 0x0f, 0x7f, 0x7c, 0x24, 0x20, //movdqu XMMWORD PTR[rsp + 0x20],xmm7
0xf3, 0x44, 0x0f, 0x7f, 0x44, 0x24, 0x10, //movdqu XMMWORD PTR[rsp + 0x10],xmm8
0xf3, 0x44, 0x0f, 0x7f, 0x0c, 0x24, //movdqu XMMWORD PTR[rsp],xmm9
0x51, //push rcx
0x48, 0x8b, 0xda, //mov rbx,rdx
0x49, 0x8b, 0xf0, //mov rsi,r8
#else
0x57, //push rdi
0x48, 0x8b, 0xde, //mov rbx, rsi
0x48, 0x8b, 0xf2, //mov rsi, rdx
0x48, 0x8b, 0xcf, //mov rcx, rdi
#endif
0x48, 0x8b, 0xec, //mov rbp,rsp
0x48, 0xc7, 0xc7, ic0, ic1, ic2, ic3, //mov rdi, "InstructionCount"
0x4c, 0x8b, 0x01, //mov r8,QWORD PTR[rcx]
0x4c, 0x8b, 0x49, 0x08, //mov r9,QWORD PTR[rcx+0x8]
0x4c, 0x8b, 0x51, 0x10, //mov r10,QWORD PTR[rcx+0x10]
0x4c, 0x8b, 0x59, 0x18, //mov r11,QWORD PTR[rcx+0x18]
0x4c, 0x8b, 0x61, 0x20, //mov r12,QWORD PTR[rcx+0x20]
0x4c, 0x8b, 0x69, 0x28, //mov r13,QWORD PTR[rcx+0x28]
0x4c, 0x8b, 0x71, 0x30, //mov r14,QWORD PTR[rcx+0x30]
0x4c, 0x8b, 0x79, 0x38, //mov r15,QWORD PTR[rcx+0x38]
0xc7, 0x44, 0x24, 0xf8, 0xc0, 0x9f, 0x00, //mov DWORD PTR[rsp-0x8],0x9fc0
0x00,
0x0f, 0xae, 0x54, 0x24, 0xf8, //ldmxcsr DWORD PTR[rsp-0x8]
0xf2, 0x4c, 0x0f, 0x2a, 0x41, 0x40, //cvtsi2sd xmm8,QWORD PTR[rcx+0x40]
0xf2, 0x4c, 0x0f, 0x2a, 0x49, 0x48, //cvtsi2sd xmm9,QWORD PTR[rcx+0x48]
0xf2, 0x48, 0x0f, 0x2a, 0x51, 0x50, //cvtsi2sd xmm2,QWORD PTR[rcx+0x50]
0xf2, 0x48, 0x0f, 0x2a, 0x59, 0x58, //cvtsi2sd xmm3,QWORD PTR[rcx+0x58]
0xf2, 0x48, 0x0f, 0x2a, 0x61, 0x60, //cvtsi2sd xmm4,QWORD PTR[rcx+0x60]
0xf2, 0x48, 0x0f, 0x2a, 0x69, 0x68, //cvtsi2sd xmm5,QWORD PTR[rcx+0x68]
0xf2, 0x48, 0x0f, 0x2a, 0x71, 0x70, //cvtsi2sd xmm6,QWORD PTR[rcx+0x70]
0xf2, 0x48, 0x0f, 0x2a, 0x79, 0x78, //cvtsi2sd xmm7,QWORD PTR[rcx+0x78]
};
const uint8_t epilogue[] = { const int32_t prologueSize = codeProgramBegin - codePrologue;
0x48, 0x8b, 0xe5, //mov rsp,rbp const int32_t epilogueSize = codeReadDatasetR - codeEpilogue;
0x59, //pop rcx const int32_t readDatasetRSize = codeReadDatasetF - codeReadDatasetR;
0x4c, 0x89, 0x01, //mov QWORD PTR [rcx],r8 const int32_t readDatasetFSize = codeProgramEnd - codeReadDatasetF;
0x4c, 0x89, 0x49, 0x08, //mov QWORD PTR [rcx+0x8],r9
0x4c, 0x89, 0x51, 0x10, //mov QWORD PTR [rcx+0x10],r10
0x4c, 0x89, 0x59, 0x18, //mov QWORD PTR [rcx+0x18],r11
0x4c, 0x89, 0x61, 0x20, //mov QWORD PTR [rcx+0x20],r12
0x4c, 0x89, 0x69, 0x28, //mov QWORD PTR [rcx+0x28],r13
0x4c, 0x89, 0x71, 0x30, //mov QWORD PTR [rcx+0x30],r14
0x4c, 0x89, 0x79, 0x38, //mov QWORD PTR [rcx+0x38],r15
0x66, 0x4c, 0x0f, 0x7e, 0x41, 0x40, //movq QWORD PTR [rcx+0x40],xmm8
0x66, 0x4c, 0x0f, 0x7e, 0x49, 0x48, //movq QWORD PTR [rcx+0x48],xmm9
0x66, 0x48, 0x0f, 0x7e, 0x51, 0x50, //movq QWORD PTR [rcx+0x50],xmm2
0x66, 0x48, 0x0f, 0x7e, 0x59, 0x58, //movq QWORD PTR [rcx+0x58],xmm3
0x66, 0x48, 0x0f, 0x7e, 0x61, 0x60, //movq QWORD PTR [rcx+0x60],xmm4
0x66, 0x48, 0x0f, 0x7e, 0x69, 0x68, //movq QWORD PTR [rcx+0x68],xmm5
0x66, 0x48, 0x0f, 0x7e, 0x71, 0x70, //movq QWORD PTR [rcx+0x70],xmm6
0x66, 0x48, 0x0f, 0x7e, 0x79, 0x78, //movq QWORD PTR [rcx+0x78],xmm7
#ifdef _WIN32
0xf3, 0x44, 0x0f, 0x6f, 0x0c, 0x24, //movdqu xmm9,XMMWORD PTR [rsp]
0xf3, 0x44, 0x0f, 0x6f, 0x44, 0x24, 0x10, //movdqu xmm8,XMMWORD PTR [rsp+0x10]
0xf3, 0x0f, 0x6f, 0x7c, 0x24, 0x20, //movdqu xmm7,XMMWORD PTR [rsp+0x20]
0xf3, 0x0f, 0x6f, 0x74, 0x24, 0x30, //movdqu xmm6,XMMWORD PTR [rsp+0x30]
0x48, 0x83, 0xc4, 0x48, //add rsp,0x48
#endif
0x41, 0x5f, //pop r15
0x41, 0x5e, //pop r14
0x41, 0x5d, //pop r13
0x41, 0x5c, //pop r12
#ifdef _WIN32
0x5e, //pop rsi
0x5f, //pop rdi
#endif
0x5d, //pop rbp
0x5b, //pop rbx
0xc3, //ret
};
//41 bytes -> 1 cache line const int32_t readDatasetFOffset = CodeSize - readDatasetFSize;
const uint8_t readDatasetSub[] = { const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize;
0x8b, 0x13, //mov edx,DWORD PTR [rbx] const int32_t epilogueOffset = readDatasetROffset - epilogueSize;
0x48, 0x8b, 0x43, 0x08, //mov rax,QWORD PTR [rbx+0x8]
0x48, 0x8b, 0x04, 0x10, //mov rax,QWORD PTR [rax+rdx*1]
0x83, 0x03, 0x08, //add DWORD PTR [rbx],0x8
0x33, 0x4b, 0x04, //xor ecx,DWORD PTR [rbx+0x4]
0x89, 0x4b, 0x04, //mov DWORD PTR [rbx+0x4],ecx
0xf7, 0xc1, 0xf8, 0xff, 0x00, 0x00, //test ecx,0xfff8
0x75, 0x0d, //jne
0x83, 0xe1, 0xf8, //and ecx,0xfffffff8
0x89, 0x0b, //mov DWORD PTR [rbx],ecx
0x48, 0x8b, 0x53, 0x08, //mov rdx,QWORD PTR [rbx+0x8]
0x0f, 0x18, 0x0c, 0x0a, //prefetcht0 BYTE PTR [rdx+rcx*1]
0xc3, //ret
};
constexpr int getNumCacheLines(size_t size) {
return (size + (CacheLineSize - 1)) / CacheLineSize;
}
constexpr int32_t align(int32_t pos, int32_t align) {
return ((pos - 1) / align + 1) * align;
}
constexpr int32_t readDatasetSubOffset = CodeSize - CacheLineSize * getNumCacheLines(sizeof(readDatasetSub));
constexpr int32_t epilogueOffset = readDatasetSubOffset - CacheLineSize * getNumCacheLines(sizeof(epilogue));
constexpr int32_t startOffsetAligned = align(sizeof(prologue), CacheLineSize);
JitCompilerX86::JitCompilerX86() { JitCompilerX86::JitCompilerX86() {
#ifdef _WIN32 #ifdef _WIN32
@ -213,24 +120,16 @@ namespace RandomX {
if (code == (uint8_t*)-1) if (code == (uint8_t*)-1)
throw std::runtime_error("mmap failed"); throw std::runtime_error("mmap failed");
#endif #endif
memcpy(code, prologue, sizeof(prologue)); memcpy(code, codePrologue, prologueSize);
codePos = sizeof(prologue); memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize);
if (startOffsetAligned - codePos > 4) { memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize);
emitByte(0xeb); memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize);
emitByte(startOffsetAligned - (codePos + 1));
}
else {
while (codePos < startOffsetAligned)
emitByte(0x90); //nop
}
memcpy(code + readDatasetSubOffset, readDatasetSub, sizeof(readDatasetSub));
memcpy(code + epilogueOffset, epilogue, sizeof(epilogue));
} }
void JitCompilerX86::generateProgram(Pcg32& gen) { void JitCompilerX86::generateProgram(Pcg32& gen) {
instructionOffsets.clear(); instructionOffsets.clear();
callOffsets.clear(); callOffsets.clear();
codePos = startOffsetAligned; codePos = prologueSize;
Instruction instr; Instruction instr;
for (unsigned i = 0; i < ProgramLength; ++i) { for (unsigned i = 0; i < ProgramLength; ++i) {
for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) {
@ -247,7 +146,6 @@ namespace RandomX {
instructionOffsets.push_back(codePos); instructionOffsets.push_back(codePos);
emit(0x840fcfff); //dec edx; jz <epilogue> emit(0x840fcfff); //dec edx; jz <epilogue>
emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative) emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
gena(instr);
auto generator = engine[instr.opcode]; auto generator = engine[instr.opcode];
(this->*generator)(instr, i); (this->*generator)(instr, i);
} }
@ -258,11 +156,10 @@ namespace RandomX {
} }
} }
void JitCompilerX86::gena(Instruction& instr) { void JitCompilerX86::genar(Instruction& instr) {
emit(uint16_t(0x8149)); //xor emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount)); emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra); emit(instr.addra);
int32_t pc;
switch (instr.loca & 7) switch (instr.loca & 7)
{ {
case 0: case 0:
@ -272,7 +169,7 @@ namespace RandomX {
emit(uint16_t(0x8b41)); //mov emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xe8); //call emitByte(0xe8); //call
emit(readDatasetSubOffset - (codePos + 4)); emit(readDatasetROffset - (codePos + 4));
return; return;
case 4: case 4:
@ -293,8 +190,44 @@ namespace RandomX {
} }
} }
// Emits the machine code that fetches operand A for a floating point instruction.
//
// The sequence always starts with `xor rega, addra`. What follows depends on
// the low three bits of instr.loca:
//   0-3  mov ecx, rega; call to the routine located at readDatasetFOffset
//   4    eax = rega & (ScratchpadL2 - 1); cvtdq2pd xmm0, QWORD PTR [rsi+rax*8]
//   5-7  eax = rega & (ScratchpadL1 - 1); cvtdq2pd xmm0, QWORD PTR [rsi+rax*8]
void JitCompilerX86::genaf(Instruction& instr) {
	const auto rega = instr.rega % RegistersCount;
	const auto location = instr.loca & 7;

	emit(uint16_t(0x8149)); //xor
	emitByte(0xf0 + rega);
	emit(instr.addra);

	if (location <= 3) {
		emit(uint16_t(0x8b41)); //mov
		emitByte(0xc8 + rega); //ecx, rega
		emitByte(0xe8); //call
		// rel32 displacement is measured from the end of the 4-byte operand
		emit(readDatasetFOffset - (codePos + 4));
		return;
	}

	// Both scratchpad variants share the same code and differ only in the
	// index mask: the whole scratchpad for location 4, its first 16 KiB otherwise.
	const auto indexMask = (location == 4) ? ScratchpadL2 - 1 : ScratchpadL1 - 1;
	emit(uint16_t(0x8b41)); //mov
	emitByte(0xc0 + rega); //eax, rega
	emitByte(0x25); //and
	emit(indexMask);
	emitByte(0xf3);
	emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
}
void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
if ((instr.locb & 7) <= 5) { if ((instr.locb & 7) <= 3) {
emit(uint16_t(0x8b49)); //mov emit(uint16_t(0x8b49)); //mov
emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb
emitByte(0x48); //REX.W emitByte(0x48); //REX.W
@ -330,12 +263,8 @@ namespace RandomX {
} }
void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) { void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) {
emit(0x48f2fffff8002548); //and rax,0xfffffffffffff800; cvtsi2sd xmm0,rax
emit(uint16_t(0x2a0f));
emitByte(0xc0);
if ((instr.locb & 7) <= 5) {
int regb = (instr.regb % RegistersCount); int regb = (instr.regb % RegistersCount);
emitByte(0xf2); //xxxsd xmm0,regb emitByte(0x66); //xxxpd xmm0,regb
if (regb <= 1) { if (regb <= 1) {
emitByte(0x41); //REX emitByte(0x41); //REX
} }
@ -343,44 +272,30 @@ namespace RandomX {
emitByte(opcode); emitByte(opcode);
emitByte(0xc0 + regb); emitByte(0xc0 + regb);
} }
else {
convertible_t bimm;
bimm.f64 = (double)instr.imm32; void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) {
emit(uint16_t(0xb848)); //movabs rax,imm64 emit(0x41c88b48); //mov rcx, rax; REX
emit(bimm.i64); emitByte(0x8b); // mov
emitByte(0x66); //movq xmm1,rax emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
emit(0xc86e0f48); emitByte(0x35); // xor eax
emit(uint16_t(0x0ff2)); //xxxsd xmm0,xmm1 emit(instr.addrc);
emitByte(opcode); emitByte(0x25); //and
emitByte(0xc1); emit(scratchpadSize - 1);
} emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
} }
void JitCompilerX86::gencr(Instruction& instr) { void JitCompilerX86::gencr(Instruction& instr) {
switch (instr.locc & 7) switch (instr.locc & 7)
{ {
case 0: case 0:
emit(0x41c88b48); //mov rcx, rax; REX scratchpadStoreR(instr, ScratchpadL2);
emitByte(0x8b); // mov
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
emitByte(0x35); // xor eax
emit(instr.addrc);
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
break; break;
case 1: case 1:
case 2: case 2:
case 3: case 3:
emit(0x41c88b48); //mov rcx, rax; REX scratchpadStoreR(instr, ScratchpadL1);
emitByte(0x8b); // mov
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
emitByte(0x35); // xor eax
emit(instr.addrc);
emitByte(0x25); //and
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
break; break;
default: default:
@ -390,66 +305,75 @@ namespace RandomX {
} }
} }
void JitCompilerX86::gencf(Instruction& instr) { void JitCompilerX86::scratchpadStoreF(Instruction& instr, int regc, uint32_t scratchpadSize, bool storeHigh) {
int regc = (instr.regc % RegistersCount);
switch (instr.locc & 7)
{
case 0:
emit(uint16_t(0x8b41)); //mov emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + regc); //eax, regc emitByte(0xc0 + regc); //eax, regc
emitByte(0x35); // xor eax emitByte(0x35); // xor eax
emit(instr.addrc); emit(instr.addrc);
emitByte(0x25); //and emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad emit(scratchpadSize - 1);
emit(uint16_t(0x4866)); //prefix emitByte(0x66); //movhpd/movlpd QWORD PTR [rsi+rax*8], regc
emit(0xc6047e0f); // movq QWORD PTR [rsi+rax*8],xmm0
break;
case 1:
case 2:
case 3:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + regc); //eax, regc
emitByte(0x35); // xor eax
emit(instr.addrc);
emitByte(0x25); //and
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emit(uint16_t(0x4866)); //prefix
emit(0xc6047e0f); // movq QWORD PTR [rsi+rax*8],xmm0
break;
default:
emitByte(0xf2);
if (regc <= 1) { if (regc <= 1) {
emitByte(0x44); //REX emitByte(0x44); //REX
} }
emit(uint16_t(0x100f)); //movsd emitByte(0x0f);
emitByte(storeHigh ? 0x17 : 0x13);
emitByte(4 + 8 * regc);
emitByte(0xc6);
}
void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) {
int regc = (instr.regc % RegistersCount);
if (!alwaysLow) {
if (regc <= 1) {
emitByte(0x44); //REX
}
emit(uint16_t(0x280f)); //movaps
emitByte(0xc0 + 8 * regc); // regc, xmm0 emitByte(0xc0 + 8 * regc); // regc, xmm0
}
switch (instr.locc & 7)
{
case 4:
scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8));
break;
case 5:
case 6:
case 7:
scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8));
break;
default:
break; break;
} }
} }
void JitCompilerX86::h_ADD_64(Instruction& instr, int i) { void JitCompilerX86::h_ADD_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x0349, 0x0548); genbr1(instr, 0x0349, 0x0548);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_ADD_32(Instruction& instr, int i) { void JitCompilerX86::h_ADD_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x0341, 0x05); genbr132(instr, 0x0341, 0x05);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_SUB_64(Instruction& instr, int i) { void JitCompilerX86::h_SUB_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x2b49, 0x2d48); genbr1(instr, 0x2b49, 0x2d48);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_SUB_32(Instruction& instr, int i) { void JitCompilerX86::h_SUB_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x2b41, 0x2d); genbr132(instr, 0x2b41, 0x2d);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_MUL_64(Instruction& instr, int i) { void JitCompilerX86::h_MUL_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) { if ((instr.locb & 7) <= 5) {
emitByte(0x49); //REX emitByte(0x49); //REX
emit(uint16_t(0xaf0f)); // imul rax, r64 emit(uint16_t(0xaf0f)); // imul rax, r64
@ -464,6 +388,7 @@ namespace RandomX {
} }
void JitCompilerX86::h_MULH_64(Instruction& instr, int i) { void JitCompilerX86::h_MULH_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) { if ((instr.locb & 7) <= 5) {
emit(uint16_t(0x8b49)); //mov rcx, r64 emit(uint16_t(0x8b49)); //mov rcx, r64
emitByte(0xc8 + (instr.regb % RegistersCount)); emitByte(0xc8 + (instr.regb % RegistersCount));
@ -481,6 +406,7 @@ namespace RandomX {
} }
void JitCompilerX86::h_MUL_32(Instruction& instr, int i) { void JitCompilerX86::h_MUL_32(Instruction& instr, int i) {
genar(instr);
emit(uint16_t(0xc88b)); //mov ecx, eax emit(uint16_t(0xc88b)); //mov ecx, eax
if ((instr.locb & 7) <= 5) { if ((instr.locb & 7) <= 5) {
emit(uint16_t(0x8b41)); // mov eax, r32 emit(uint16_t(0x8b41)); // mov eax, r32
@ -495,6 +421,7 @@ namespace RandomX {
} }
void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) { void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) {
genar(instr);
emitByte(0x48); emitByte(0x48);
emit(uint16_t(0xc863)); //movsxd rcx,eax emit(uint16_t(0xc863)); //movsxd rcx,eax
if ((instr.locb & 7) <= 5) { if ((instr.locb & 7) <= 5) {
@ -511,6 +438,7 @@ namespace RandomX {
} }
void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) { void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) { if ((instr.locb & 7) <= 5) {
emit(uint16_t(0x8b49)); //mov rcx, r64 emit(uint16_t(0x8b49)); //mov rcx, r64
emitByte(0xc8 + (instr.regb % RegistersCount)); emitByte(0xc8 + (instr.regb % RegistersCount));
@ -528,6 +456,7 @@ namespace RandomX {
} }
void JitCompilerX86::h_DIV_64(Instruction& instr, int i) { void JitCompilerX86::h_DIV_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) { if ((instr.locb & 7) <= 5) {
emitByte(0xb9); //mov ecx, 1 emitByte(0xb9); //mov ecx, 1
emit(1); emit(1);
@ -546,6 +475,7 @@ namespace RandomX {
} }
void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) { void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) { if ((instr.locb & 7) <= 5) {
emit(uint16_t(0x8b41)); //mov edx, r32 emit(uint16_t(0x8b41)); //mov edx, r32
emitByte(0xd0 + (instr.regb % RegistersCount)); emitByte(0xd0 + (instr.regb % RegistersCount));
@ -563,100 +493,127 @@ namespace RandomX {
} }
void JitCompilerX86::h_AND_64(Instruction& instr, int i) { void JitCompilerX86::h_AND_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x2349, 0x2548); genbr1(instr, 0x2349, 0x2548);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_AND_32(Instruction& instr, int i) { void JitCompilerX86::h_AND_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x2341, 0x25); genbr132(instr, 0x2341, 0x25);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_OR_64(Instruction& instr, int i) { void JitCompilerX86::h_OR_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x0b49, 0x0d48); genbr1(instr, 0x0b49, 0x0d48);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_OR_32(Instruction& instr, int i) { void JitCompilerX86::h_OR_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x0b41, 0x0d); genbr132(instr, 0x0b41, 0x0d);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_XOR_64(Instruction& instr, int i) { void JitCompilerX86::h_XOR_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x3349, 0x3548); genbr1(instr, 0x3349, 0x3548);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_XOR_32(Instruction& instr, int i) { void JitCompilerX86::h_XOR_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x3341, 0x35); genbr132(instr, 0x3341, 0x35);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_SHL_64(Instruction& instr, int i) { void JitCompilerX86::h_SHL_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xe0d3, 0xe0c1); genbr0(instr, 0xe0d3, 0xe0c1);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_SHR_64(Instruction& instr, int i) { void JitCompilerX86::h_SHR_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xe8d3, 0xe8c1); genbr0(instr, 0xe8d3, 0xe8c1);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_SAR_64(Instruction& instr, int i) { void JitCompilerX86::h_SAR_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xf8d3, 0xf8c1); genbr0(instr, 0xf8d3, 0xf8c1);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_ROL_64(Instruction& instr, int i) { void JitCompilerX86::h_ROL_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xc0d3, 0xc0c1); genbr0(instr, 0xc0d3, 0xc0c1);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_ROR_64(Instruction& instr, int i) { void JitCompilerX86::h_ROR_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xc8d3, 0xc8c1); genbr0(instr, 0xc8d3, 0xc8c1);
gencr(instr); gencr(instr);
} }
void JitCompilerX86::h_FPADD(Instruction& instr, int i) { void JitCompilerX86::h_FPADD(Instruction& instr, int i) {
genaf(instr);
genbf(instr, 0x58); genbf(instr, 0x58);
gencf(instr); gencf(instr);
} }
void JitCompilerX86::h_FPSUB(Instruction& instr, int i) { void JitCompilerX86::h_FPSUB(Instruction& instr, int i) {
genaf(instr);
genbf(instr, 0x5c); genbf(instr, 0x5c);
gencf(instr); gencf(instr);
} }
void JitCompilerX86::h_FPMUL(Instruction& instr, int i) { void JitCompilerX86::h_FPMUL(Instruction& instr, int i) {
emit(uint16_t(0x0d48)); //or rax,0x800 genaf(instr);
emit(0x00000800);
genbf(instr, 0x59); genbf(instr, 0x59);
emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
emit(uint16_t(0x540f)); //andps xmm0,xmm1
emitByte(0xc1);
gencf(instr); gencf(instr);
} }
void JitCompilerX86::h_FPDIV(Instruction& instr, int i) { void JitCompilerX86::h_FPDIV(Instruction& instr, int i) {
emit(uint16_t(0x0d48)); //or rax,0x800 genaf(instr);
emit(0x00000800);
genbf(instr, 0x5e); genbf(instr, 0x5e);
emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
emit(uint16_t(0x540f)); //andps xmm0,xmm1
emitByte(0xc1);
gencf(instr); gencf(instr);
} }
void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) { void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) {
emit(uint16_t(0xb948)); //or movabs rcx, imm64 genaf(instr);
emit(0x7ffffffffffff800); emit(0xc0510f66c2540f41); //andps xmm0,xmm10; sqrtpd xmm0,xmm0
emit(0xc02a0f48f2c12348); //and rax,rcx; cvtsi2sd xmm0,rax
emit(0xc0510ff2); //sqrtsd xmm0,xmm0
gencf(instr); gencf(instr);
} }
void JitCompilerX86::h_FPROUND(Instruction& instr, int i) { void JitCompilerX86::h_FPROUND(Instruction& instr, int i) {
genar(instr);
emit(0x81480de0c1c88b48); emit(0x81480de0c1c88b48);
emit(0x600025fffff800e1); emit(0x600025fffff800e1);
emit(0x0dc12a0f48f20000); emit(uint16_t(0x0000));
emitByte(0xf2);
int regc = (instr.regc % RegistersCount);
if (regc <= 1) {
emitByte(0x4c); //REX
}
else {
emitByte(0x48); //REX
}
emit(uint16_t(0x2a0f));
emitByte(0xc1 + 8 * regc);
emitByte(0x0d);
emit(0xf824448900009fc0); emit(0xf824448900009fc0);
emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8] emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8]
emitByte(0xf8); emitByte(0xf8);
gencf(instr); gencf(instr, true);
} }
static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) { static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) {
@ -682,6 +639,7 @@ namespace RandomX {
} }
void JitCompilerX86::h_CALL(Instruction& instr, int i) { void JitCompilerX86::h_CALL(Instruction& instr, int i) {
genar(instr);
emit(uint16_t(0x8141)); //cmp regb, imm32 emit(uint16_t(0x8141)); //cmp regb, imm32
emitByte(0xf8 + (instr.regb % RegistersCount)); emitByte(0xf8 + (instr.regb % RegistersCount));
emit(instr.imm32); emit(instr.imm32);
@ -707,6 +665,7 @@ namespace RandomX {
} }
void JitCompilerX86::h_RET(Instruction& instr, int i) { void JitCompilerX86::h_RET(Instruction& instr, int i) {
genar(instr);
int crlen = 0; int crlen = 0;
if ((instr.locc & 7) <= 3) { if ((instr.locc & 7) <= 3) {
crlen = 17; crlen = 17;
@ -756,4 +715,6 @@ namespace RandomX {
INST_HANDLE(CALL) INST_HANDLE(CALL)
INST_HANDLE(RET) INST_HANDLE(RET)
}; };
#endif
} }

View File

@ -58,13 +58,16 @@ namespace RandomX {
std::vector<int32_t> instructionOffsets; std::vector<int32_t> instructionOffsets;
std::vector<CallOffset> callOffsets; std::vector<CallOffset> callOffsets;
void gena(Instruction&); void genar(Instruction&);
void genaf(Instruction&);
void genbr0(Instruction&, uint16_t, uint16_t); void genbr0(Instruction&, uint16_t, uint16_t);
void genbr1(Instruction&, uint16_t, uint16_t); void genbr1(Instruction&, uint16_t, uint16_t);
void genbr132(Instruction&, uint16_t, uint8_t); void genbr132(Instruction&, uint16_t, uint8_t);
void genbf(Instruction&, uint8_t); void genbf(Instruction&, uint8_t);
void scratchpadStoreR(Instruction&, uint32_t);
void scratchpadStoreF(Instruction&, int, uint32_t, bool);
void gencr(Instruction&); void gencr(Instruction&);
void gencf(Instruction&); void gencf(Instruction&, bool);
void generateCode(Instruction&, int); void generateCode(Instruction&, int);
void fixCallOffsets(); void fixCallOffsets();

View File

@ -21,33 +21,36 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
#include "instructions.hpp" #include "instructions.hpp"
#include "Pcg32.hpp"
//#define DEBUG //#define DEBUG
using namespace RandomX; using namespace RandomX;
typedef void(*VmOperation)(convertible_t&, convertible_t&, convertible_t&); typedef void(*FpuOperation)(convertible_t&, fpu_reg_t&, fpu_reg_t&);
uint64_t rxRound(uint32_t mode, int64_t x, int64_t y, VmOperation op) {
convertible_t a, b, c;
a.u64 = mode;
FPROUND(a, b, c);
#ifdef DEBUG
a.f64 = convertToDouble(x);
b.f64 = convertToDouble(y);
std::cout << std::hex << (uint64_t)x << " -> " << a.u64 << std::endl;
std::cout << std::hex << (uint64_t)y << " -> " << b.u64 << std::endl;
std::cout << std::dec;
#endif
a.i64 = x;
b.i64 = y;
op(a, b, c);
return c.u64;
}
#define CATCH_CONFIG_MAIN #define CATCH_CONFIG_MAIN
#include "catch.hpp" #include "catch.hpp"
// Test helper: applies `mode` through FPROUND, runs `op` on the operands built
// from x and y, and returns the raw bits of the low half of the result.
//
//   mode        value stored in operand A before the initial FPROUND call
//               (presumably the rounding mode selector - confirm against FPROUND)
//   x           operand A; stored in both 32-bit halves when hiEqualsLo is true,
//               otherwise stored as one 64-bit integer
//   y           operand B; stored in both the low and the high half
//   op          operation under test
//   hiEqualsLo  when true, additionally CHECKs that both result halves are equal
uint64_t rxRound(uint32_t mode, int64_t x, int64_t y, FpuOperation op, bool hiEqualsLo = true) {
	convertible_t a;
	fpu_reg_t b, c;

	a.u64 = mode;
	FPROUND(a, b, c);

	if (!hiEqualsLo) {
		a.i64 = x;
	}
	else {
		a.i32lo = x;
		a.i32hi = x;
	}
	b.lo.i64 = y;
	b.hi.i64 = y;

	op(a, b, c);

	if (hiEqualsLo) {
		CHECK(c.lo.u64 == c.hi.u64);
	}
	return c.lo.u64;
}
#define RX_EXECUTE_U64(va, vb, INST) do { \ #define RX_EXECUTE_U64(va, vb, INST) do { \
a.u64 = va; \ a.u64 = va; \
b.u64 = vb; \ b.u64 = vb; \
@ -273,118 +276,126 @@ TEST_CASE("Circular right shift (64-bit)", "[ROR_64]") {
TEST_CASE("Denormal results are not produced", "[FTZ]") { TEST_CASE("Denormal results are not produced", "[FTZ]") {
FPINIT(); FPINIT();
convertible_t a, b, c; convertible_t a;
a.i64 = 2048; fpu_reg_t b;
FPDIV(a, DBL_MAX, c); a.i64 = 1;
b.lo.f64 = DBL_MAX;
FPDIV(a, b, b);
#ifdef DEBUG #ifdef DEBUG
std::cout << a.i64 << " / " << DBL_MAX << " = " << std::hex << c.u64 << std::endl; std::cout << a.i64 << " / " << DBL_MAX << " = " << std::hex << b.lo.u64 << std::endl;
#endif #endif
REQUIRE(std::fpclassify(c.f64) != FP_SUBNORMAL); CHECK(std::fpclassify(b.lo.f64) != FP_SUBNORMAL);
b.f64 = c.f64;
a.i64 = 0; a.i64 = 0;
FPSUB_64(a, b, c); FPSUB(a, b, b);
#ifdef DEBUG #ifdef DEBUG
std::cout << a.i64 << " - " << b.f64 << " = " << std::hex << c.u64 << std::endl; std::cout << a.i64 << " - " << b.lo.f64 << " = " << std::hex << b.lo.u64 << std::endl;
#endif #endif
CHECK(std::fpclassify(c.f64) != FP_SUBNORMAL); CHECK(std::fpclassify(b.lo.f64) != FP_SUBNORMAL);
} }
TEST_CASE("NaN results are not produced", "[NAN]") { TEST_CASE("NaN results are not produced", "[NAN]") {
FPINIT(); FPINIT();
convertible_t a, c; convertible_t a;
fpu_reg_t b;
a.i64 = 0; a.i64 = 0;
FPDIV(a, 0, c); b.lo.f64 = 0;
CHECK(std::fpclassify(c.f64) != FP_NAN); FPDIV(a, b, b);
FPMUL(a, std::numeric_limits<double>::infinity(), c); CHECK(std::fpclassify(b.lo.f64) != FP_NAN);
CHECK(std::fpclassify(c.f64) != FP_NAN); b.lo.f64 = std::numeric_limits<double>::infinity();
FPMUL(a, b, b);
CHECK(std::fpclassify(b.lo.f64) != FP_NAN);
} }
volatile int64_t fpAdda = 7379480244170225589; volatile int64_t fpRounda = 7379480244170225589;
volatile int64_t fpAddb = -438072579179686797; volatile int32_t fpAdda = -2110701072;
volatile int64_t fpSuba = 2939258788088626026; volatile int64_t fpAddb = 5822431907862180274;
volatile int64_t fpSubb = 4786131045320678734; volatile int32_t fpSuba = -1651770302;
volatile int64_t fpMula1 = 8399833736388895639; volatile int64_t fpSubb = 4982086006202596504;
volatile int64_t fpMulb1 = 5671608020317594922; volatile int32_t fpMula1 = 122885310;
volatile int64_t fpMula2 = -7094299423744805450; volatile int64_t fpMulb1 = 6036690890763685020;
volatile int64_t fpMulb2 = 4982086006202596504; volatile int32_t fpMula2 = -1952486466;
volatile int64_t fpDiva1 = 8399833736388895639; volatile int64_t fpMulb2 = 5693689137909219638;
volatile int64_t fpDivb1 = 5671608020317594922; volatile int32_t fpDiva1 = -1675630642;
volatile int64_t fpDiva2 = -7434878587645025912; volatile int64_t fpDivb1 = -3959960229647489051;
volatile int64_t fpDivb2 = 5266243837734830806; volatile int32_t fpDiva2 = -1651770302;
volatile int64_t fpSqrta = -7594301562963134542; volatile int64_t fpDivb2 = 4982086006202596504;
volatile int32_t fpSqrta1 = 440505508;
volatile int32_t fpSqrta2 = -2147483648;
TEST_CASE("IEEE-754 compliance", "[FPU]") { TEST_CASE("IEEE-754 compliance", "[FPU]") {
FPINIT(); FPINIT();
convertible_t a, b, c; convertible_t a;
fpu_reg_t b, c;
b.lo.f64 = 0.0;
a.i64 = 2048; a.i64 = 1;
FPDIV(a, 0, c); FPDIV(a, b, c);
CHECK(c.f64 == std::numeric_limits<double>::infinity()); CHECK(c.lo.f64 == std::numeric_limits<double>::infinity());
a.i64 = -2048; a.i64 = -1;
FPDIV(a, 0, c); FPDIV(a, b, c);
CHECK(c.f64 == -std::numeric_limits<double>::infinity()); CHECK(c.lo.f64 == -std::numeric_limits<double>::infinity());
#ifdef DEBUG #ifdef DEBUG
std::cout << "FPROUND" << std::endl; std::cout << "FPROUND" << std::endl;
#endif #endif
CHECK(rxRound(RoundToNearest, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU); CHECK(rxRound(RoundToNearest, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundDown, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU); CHECK(rxRound(RoundDown, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundUp, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU); CHECK(rxRound(RoundUp, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundToZero, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU); CHECK(rxRound(RoundToZero, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundToNearest, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
CHECK(rxRound(RoundDown, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
CHECK(rxRound(RoundUp, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
CHECK(rxRound(RoundToZero, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
#ifdef DEBUG #ifdef DEBUG
std::cout << "FPADD" << std::endl; std::cout << "FPADD" << std::endl;
#endif #endif
CHECK(rxRound(RoundToNearest, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d473U); CHECK(rxRound(RoundToNearest, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b2U);
CHECK(rxRound(RoundDown, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d473U); CHECK(rxRound(RoundDown, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b1U);
CHECK(rxRound(RoundUp, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d472U); CHECK(rxRound(RoundUp, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b2U);
CHECK(rxRound(RoundToZero, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d472U); CHECK(rxRound(RoundToZero, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b1U);
#ifdef DEBUG #ifdef DEBUG
std::cout << "FPSUB" << std::endl; std::cout << "FPSUB" << std::endl;
#endif #endif
CHECK(rxRound(RoundToNearest, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c49U); CHECK(rxRound(RoundToNearest, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c99U);
CHECK(rxRound(RoundDown, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c48U); CHECK(rxRound(RoundDown, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c99U);
CHECK(rxRound(RoundUp, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c49U); CHECK(rxRound(RoundUp, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c98U);
CHECK(rxRound(RoundToZero, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c48U); CHECK(rxRound(RoundToZero, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c98U);
#ifdef DEBUG #ifdef DEBUG
std::cout << "FPMUL" << std::endl; std::cout << "FPMUL" << std::endl;
#endif #endif
CHECK(rxRound(RoundToNearest, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e9U); CHECK(rxRound(RoundToNearest, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24542U);
CHECK(rxRound(RoundDown, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e8U); CHECK(rxRound(RoundDown, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24541U);
CHECK(rxRound(RoundUp, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e9U); CHECK(rxRound(RoundUp, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24542U);
CHECK(rxRound(RoundToZero, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e8U); CHECK(rxRound(RoundToZero, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24541U);
CHECK(rxRound(RoundToNearest, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c583U); CHECK(rxRound(RoundToNearest, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a7470U);
CHECK(rxRound(RoundDown, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c583U); CHECK(rxRound(RoundDown, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a7470U);
CHECK(rxRound(RoundUp, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c582U); CHECK(rxRound(RoundUp, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a746fU);
CHECK(rxRound(RoundToZero, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c582U); CHECK(rxRound(RoundToZero, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a746fU);
#ifdef DEBUG #ifdef DEBUG
std::cout << "FPDIV" << std::endl; std::cout << "FPDIV" << std::endl;
#endif #endif
CHECK(rxRound(RoundToNearest, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81cU); CHECK(rxRound(RoundToNearest, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb0aU);
CHECK(rxRound(RoundDown, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81bU); CHECK(rxRound(RoundDown, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb09U);
CHECK(rxRound(RoundUp, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81cU); CHECK(rxRound(RoundUp, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb0aU);
CHECK(rxRound(RoundToZero, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81bU); CHECK(rxRound(RoundToZero, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb09U);
CHECK(rxRound(RoundToNearest, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fccU); CHECK(rxRound(RoundToNearest, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71cU);
CHECK(rxRound(RoundDown, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fccU); CHECK(rxRound(RoundDown, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71cU);
CHECK(rxRound(RoundUp, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fcbU); CHECK(rxRound(RoundUp, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71bU);
CHECK(rxRound(RoundToZero, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fcbU); CHECK(rxRound(RoundToZero, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71bU);
#ifdef DEBUG #ifdef DEBUG
std::cout << "FPSQRT" << std::endl; std::cout << "FPSQRT" << std::endl;
#endif #endif
CHECK(rxRound(RoundToNearest, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2dU); CHECK(rxRound(RoundToNearest, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19dU);
CHECK(rxRound(RoundDown, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2cU); CHECK(rxRound(RoundDown, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19cU);
CHECK(rxRound(RoundUp, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2dU); CHECK(rxRound(RoundUp, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19dU);
CHECK(rxRound(RoundToZero, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2cU); CHECK(rxRound(RoundToZero, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19cU);
CHECK(rxRound(RoundToNearest, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bcdU);
CHECK(rxRound(RoundDown, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bccU);
CHECK(rxRound(RoundUp, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bcdU);
CHECK(rxRound(RoundToZero, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bccU);
} }

View File

@ -24,8 +24,19 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include "t1ha/t1ha.h" #include "t1ha/t1ha.h"
#include "blake2/blake2.h" #include "blake2/blake2.h"
#include <cstring> #include <cstring>
#include <iomanip>
//Debug dump of a register file.
//Writes one line per integer register ("rN = <hex>") followed by two lines per
//floating point register: the hi lane and then the lo lane, each as the raw
//64-bit pattern in hex with the double value in parentheses.
//The caller's formatting flags are restored before returning (the stream is no
//longer forced back to decimal) and the stream is flushed once at the end
//instead of after every line.
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
	const std::ios_base::fmtflags callerFlags = os.flags();
	//hex applies to the integer insertions only (including the register index);
	//the double values are not affected by it
	os << std::hex;
	for (int i = 0; i < RandomX::RegistersCount; ++i)
		os << "r" << i << " = " << rf.r[i].u64 << '\n';
	for (int i = 0; i < RandomX::RegistersCount; ++i)
		os << "f" << i << " = " << rf.f[i].hi.u64 << " (" << rf.f[i].hi.f64 << ")" << '\n'
			<< " = " << rf.f[i].lo.u64 << " (" << rf.f[i].lo.f64 << ")" << '\n';
	os.flags(callerFlags);
	return os << std::flush;
}
namespace RandomX { namespace RandomX {
VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) { VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) {
mem.ds.dataset = nullptr; mem.ds.dataset = nullptr;
} }
@ -83,9 +94,10 @@ namespace RandomX {
} }
void VirtualMachine::getResult(void* out) { void VirtualMachine::getResult(void* out) {
uint64_t smallState[sizeof(RegisterFile) / sizeof(uint64_t) + 2]; constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 2;
uint64_t smallState[smallStateLength];
memcpy(smallState, &reg, sizeof(RegisterFile)); memcpy(smallState, &reg, sizeof(RegisterFile));
smallState[17] = t1ha2_atonce128(&smallState[16], scratchpad, ScratchpadSize, reg.r[0].u64); smallState[smallStateLength - 1] = t1ha2_atonce128(&smallState[smallStateLength - 2], scratchpad, ScratchpadSize, reg.r[0].u64);
blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0); blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0);
} }
} }

View File

@ -32,11 +32,14 @@ namespace RandomX {
virtual void initializeProgram(const void* seed) = 0; virtual void initializeProgram(const void* seed) = 0;
virtual void execute() = 0; virtual void execute() = 0;
void getResult(void*); void getResult(void*);
//Read-only access to the register file of this virtual machine.
//const-qualified so it can also be called through a const VirtualMachine;
//existing calls on non-const objects keep working unchanged.
const RegisterFile& getRegisterFile() const {
	return reg;
}
protected: protected:
bool softAes, lightClient; bool softAes, lightClient;
RegisterFile reg;
MemoryRegisters mem;
DatasetReadFunc readDataset; DatasetReadFunc readDataset;
alignas(16) RegisterFile reg;
MemoryRegisters mem;
alignas(16) convertible_t scratchpad[ScratchpadLength]; alignas(16) convertible_t scratchpad[ScratchpadLength];
}; };
} }

View File

@ -0,0 +1,12 @@
;# Epilogue of the generated program for the System V AMD64 ABI.
;# The shared register write-back code runs first; afterwards the values
;# pushed by the matching prologue are popped again.
#include "program_epilogue_store.inc"
;# restore callee-saved registers - System V AMD64 ABI
;# (reverse order of the pushes rbx, rbp, r12-r15 made by the prologue)
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
;# program finished
ret 0

View File

@ -0,0 +1,22 @@
;# Shared tail of the generated program: discards the VM stack and writes the
;# VM registers back into the RegisterFile that was passed to the program.
;# unroll VM stack
mov rsp, rbp
;# save VM register values
;# rbp was captured right after the prologue pushed the RegisterFile pointer,
;# so this pop brings that pointer back into rcx
pop rcx
;# integer registers r8-r15 go to byte offsets 0-56
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
;# floating point registers, 16 bytes each, starting at byte offset 64
;# (stored in the order xmm8, xmm9, xmm2, xmm3, then xmm4-xmm7 below).
;# movdqa faults unless the address is 16-byte aligned, so the RegisterFile
;# itself has to be 16-byte aligned.
movdqa xmmword ptr [rcx+64], xmm8
movdqa xmmword ptr [rcx+80], xmm9
movdqa xmmword ptr [rcx+96], xmm2
movdqa xmmword ptr [rcx+112], xmm3
;# move the base forward by 64 bytes; the four stores below therefore land
;# at byte offsets 128-176 of the RegisterFile
;# NOTE(review): presumably done to keep the displacements small - confirm
lea rcx, [rcx+64]
movdqa xmmword ptr [rcx+64], xmm4
movdqa xmmword ptr [rcx+80], xmm5
movdqa xmmword ptr [rcx+96], xmm6
movdqa xmmword ptr [rcx+112], xmm7

View File

@ -0,0 +1,20 @@
;# Epilogue of the generated program for the Microsoft x64 calling convention.
;# The shared register write-back code runs first; afterwards everything the
;# matching prologue saved is restored.
include program_epilogue_store.inc
;# restore callee-saved registers - Microsoft x64 calling convention
;# xmm6-xmm10 are reloaded from the 80-byte area reserved by the prologue,
;# each from the same slot the prologue stored it to
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80
;# general purpose registers, in the reverse order of the prologue pushes
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
;# program finished
ret 0

View File

@ -0,0 +1,17 @@
;# Entry sequence of the generated program for the System V AMD64 ABI.
;# Saves the callee-saved registers, moves the three arguments (rdi, rsi, rdx)
;# into the registers used by the rest of the program and then runs the
;# shared register load code.
;# callee-saved registers - System V AMD64 ABI
push rbx
push rbp
push r12
push r13
push r14
push r15
;# function arguments
;# the RegisterFile pointer stays on the stack; the shared epilogue pops it
;# again when the registers are written back
push rdi ;# RegisterFile& registerFile
mov rbx, rsi ;# MemoryRegisters& memory
mov rsi, rdx ;# convertible_t* scratchpad
;# the shared load code reads the RegisterFile through rcx
mov rcx, rdi
#include "program_prologue_load.inc"
;# randomx_program_begin is a label defined outside this fragment
jmp randomx_program_begin

View File

@ -0,0 +1,63 @@
;# Shared part of the program prologue. Expects rcx to point to the
;# RegisterFile and the top of the stack to hold that same pointer (pushed by
;# the platform specific prologue). Sets up the VM stack base, the instruction
;# counter, the absolute value mask, MXCSR and all VM registers.
mov rbp, rsp ;# beginning of VM stack
;# 1048577 = 2^20 + 1
mov rdi, 1048577 ;# number of VM instructions to execute + 1
;# a zeroed register compares equal to itself, which sets every bit of xmm10;
;# shifting each 64-bit lane right by one then clears the two sign bits
xorps xmm10, xmm10
cmpeqpd xmm10, xmm10
psrlq xmm10, 1 ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
;# reset rounding mode
;# 40896 = 0x9FC0, the same MXCSR value the portable FPINIT passes to
;# _mm_setcsr (documented there as flush to zero, denormals are zero,
;# default rounding mode, all exceptions disabled)
;# NOTE(review): the value is staged 8 bytes below rsp. The Microsoft x64
;# convention does not define a red zone below rsp, so confirm this scratch
;# slot is safe where the fragment is included by the Win64 prologue.
mov dword ptr [rsp-8], 40896
ldmxcsr dword ptr [rsp-8]
;# load integer registers
mov r8, qword ptr [rcx+0]
mov r9, qword ptr [rcx+8]
mov r10, qword ptr [rcx+16]
mov r11, qword ptr [rcx+24]
mov r12, qword ptr [rcx+32]
mov r13, qword ptr [rcx+40]
mov r14, qword ptr [rcx+48]
mov r15, qword ptr [rcx+56]
;# initialize floating point registers
;# Each register occupies 16 bytes of the RegisterFile starting at offset 64.
;# Both qwords are read as signed 64-bit integers and converted to double:
;# the qword at +8 is converted first and moved into the upper lane by the
;# 8-byte shift, then the qword at +0 is converted into the lower lane.
xorps xmm8, xmm8
cvtsi2sd xmm8, qword ptr [rcx+72]
pslldq xmm8, 8
cvtsi2sd xmm8, qword ptr [rcx+64]
xorps xmm9, xmm9
cvtsi2sd xmm9, qword ptr [rcx+88]
pslldq xmm9, 8
cvtsi2sd xmm9, qword ptr [rcx+80]
xorps xmm2, xmm2
cvtsi2sd xmm2, qword ptr [rcx+104]
pslldq xmm2, 8
cvtsi2sd xmm2, qword ptr [rcx+96]
xorps xmm3, xmm3
cvtsi2sd xmm3, qword ptr [rcx+120]
pslldq xmm3, 8
cvtsi2sd xmm3, qword ptr [rcx+112]
;# move the base forward by 64 bytes; the loads below therefore read byte
;# offsets 128-191 of the RegisterFile (registers held in xmm4-xmm7)
lea rcx, [rcx+64]
xorps xmm4, xmm4
cvtsi2sd xmm4, qword ptr [rcx+72]
pslldq xmm4, 8
cvtsi2sd xmm4, qword ptr [rcx+64]
xorps xmm5, xmm5
cvtsi2sd xmm5, qword ptr [rcx+88]
pslldq xmm5, 8
cvtsi2sd xmm5, qword ptr [rcx+80]
xorps xmm6, xmm6
cvtsi2sd xmm6, qword ptr [rcx+104]
pslldq xmm6, 8
cvtsi2sd xmm6, qword ptr [rcx+96]
xorps xmm7, xmm7
cvtsi2sd xmm7, qword ptr [rcx+120]
pslldq xmm7, 8
cvtsi2sd xmm7, qword ptr [rcx+112]

View File

@ -0,0 +1,24 @@
;# Entry sequence of the generated program for the Microsoft x64 calling
;# convention. Saves the callee-saved general purpose registers and
;# xmm6-xmm10, moves the arguments (rcx, rdx, r8) into the registers used by
;# the rest of the program and then runs the shared register load code.
;# callee-saved registers - Microsoft x64 calling convention
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
;# 80 bytes = five 16-byte slots for xmm6-xmm10; the epilogue reloads each
;# register from the same slot
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
;# function arguments
;# the RegisterFile pointer stays on the stack for the shared epilogue; it is
;# also still in rcx, which is where the shared load code expects it
push rcx ;# RegisterFile& registerFile
mov rbx, rdx ;# MemoryRegisters& memory
mov rsi, r8 ;# convertible_t* scratchpad
include program_prologue_load.inc
;# randomx_program_begin is a label defined outside this fragment
jmp randomx_program_begin

View File

@ -0,0 +1,13 @@
;# Dataset read returning a floating point value in xmm0.
;# As used below: rbx points to the memory registers (ma at +0, mx at +4,
;# dataset pointer at +8) and ecx holds the value that is xored into mx.
mov edx, dword ptr [rbx] ;# ma
mov rax, qword ptr [rbx+8] ;# dataset
;# the two signed 32-bit integers at dataset+ma become the two doubles of xmm0
cvtdq2pd xmm0, qword ptr [rax+rdx]
;# ma advances by 8 bytes
add dword ptr [rbx], 8
;# mx = mx xor ecx, kept in ecx for the test below
xor ecx, dword ptr [rbx+4] ;# mx
mov dword ptr [rbx+4], ecx
;# 65528 = 0xFFF8: unless bits 3-15 of the new mx are all zero, return here
test ecx, 65528
jne short rx_read_dataset_f_ret
;# otherwise ma is replaced by mx rounded down to a multiple of 8 and the
;# dataset location it now refers to is prefetched
and ecx, -8
mov dword ptr [rbx], ecx
prefetcht0 byte ptr [rax+rcx]
rx_read_dataset_f_ret:
ret 0

View File

@ -0,0 +1,13 @@
;# Dataset read returning a 64-bit integer value in rax.
;# As used below: rbx points to the memory registers (ma at +0, mx at +4,
;# dataset pointer at +8) and ecx holds the value that is xored into mx.
mov eax, dword ptr [rbx] ;# ma
mov rdx, qword ptr [rbx+8] ;# dataset
;# rax = the 8 bytes at dataset+ma
mov rax, qword ptr [rdx+rax]
;# ma advances by 8 bytes
add dword ptr [rbx], 8
;# mx = mx xor ecx, kept in ecx for the test below
xor ecx, dword ptr [rbx+4] ;# mx
mov dword ptr [rbx+4], ecx
;# 65528 = 0xFFF8: unless bits 3-15 of the new mx are all zero, return here
test ecx, 65528
jne short rx_read_dataset_r_ret
;# otherwise ma is replaced by mx rounded down to a multiple of 8 and the
;# dataset location it now refers to is prefetched
and ecx, -8
mov dword ptr [rbx], ecx
prefetcht0 byte ptr [rdx+rcx]
rx_read_dataset_r_ret:
ret 0

View File

@ -20,6 +20,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once #pragma once
#include <cstdint> #include <cstdint>
#include <iostream>
namespace RandomX { namespace RandomX {
@ -59,6 +60,15 @@ namespace RandomX {
uint64_t u64; uint64_t u64;
int32_t i32; int32_t i32;
uint32_t u32; uint32_t u32;
struct {
int32_t i32lo;
int32_t i32hi;
};
};
//128-bit floating point register made of two 64-bit lanes (lo first, hi second).
//The SSE2 code paths access both lanes together as one 16-byte vector
//(_mm_load_pd/_mm_store_pd on &lo.f64 in the FPU instructions, movdqa in the
//generated program epilogue) and those accesses fault on an address that is
//not 16-byte aligned. The alignment is therefore made part of the type, so
//every instance (including plain locals) satisfies it. The size stays 16 bytes.
struct alignas(16) fpu_reg_t {
	convertible_t lo;
	convertible_t hi;
};
constexpr int ProgramLength = 512; constexpr int ProgramLength = 512;
@ -96,10 +106,10 @@ namespace RandomX {
struct RegisterFile { struct RegisterFile {
convertible_t r[RegistersCount]; convertible_t r[RegistersCount];
convertible_t f[RegistersCount]; fpu_reg_t f[RegistersCount];
}; };
static_assert(sizeof(RegisterFile) == 2 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile"); static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&); typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&);
@ -109,3 +119,5 @@ namespace RandomX {
void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, DatasetReadFunc); void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, DatasetReadFunc);
} }
} }
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf);

View File

@ -1,19 +1,19 @@
; Copyright (c) 2018 tevador ;# Copyright (c) 2018 tevador
; ;#
; This file is part of RandomX. ;# This file is part of RandomX.
; ;#
; RandomX is free software: you can redistribute it and/or modify ;# RandomX is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by ;# it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or ;# the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version. ;# (at your option) any later version.
; ;#
; RandomX is distributed in the hope that it will be useful, ;# RandomX is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of ;# but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details. ;# GNU General Public License for more details.
; ;#
; You should have received a copy of the GNU General Public License ;# You should have received a copy of the GNU General Public License
; along with RandomX. If not, see<http://www.gnu.org/licenses/>. ;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
PUBLIC executeProgram PUBLIC executeProgram
@ -47,6 +47,7 @@ executeProgram PROC
; xmm7 -> "f7" ; xmm7 -> "f7"
; xmm8 -> "f0" ; xmm8 -> "f0"
; xmm9 -> "f1" ; xmm9 -> "f1"
; xmm10 -> absolute value mask
; STACK STRUCTURE: ; STACK STRUCTURE:
; | ; |
@ -71,11 +72,12 @@ executeProgram PROC
push r13 push r13
push r14 push r14
push r15 push r15
sub rsp, 64 sub rsp, 80
movdqu xmmword ptr [rsp+48], xmm6 movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+32], xmm7 movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+16], xmm8 movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+0], xmm9 movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
; function arguments ; function arguments
push rcx ; RegisterFile& registerFile push rcx ; RegisterFile& registerFile
@ -86,7 +88,15 @@ executeProgram PROC
mov rbp, rsp ; beginning of VM stack mov rbp, rsp ; beginning of VM stack
mov rdi, 1048577 ; number of VM instructions to execute + 1 mov rdi, 1048577 ; number of VM instructions to execute + 1
; load VM register values xorps xmm10, xmm10
cmpeqpd xmm10, xmm10
psrlq xmm10, 1 ; mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
; reset rounding mode
mov dword ptr [rsp-8], 40896
ldmxcsr dword ptr [rsp-8]
; load integer registers
mov r8, qword ptr [rcx+0] mov r8, qword ptr [rcx+0]
mov r9, qword ptr [rcx+8] mov r9, qword ptr [rcx+8]
mov r10, qword ptr [rcx+16] mov r10, qword ptr [rcx+16]
@ -95,16 +105,56 @@ executeProgram PROC
mov r13, qword ptr [rcx+40] mov r13, qword ptr [rcx+40]
mov r14, qword ptr [rcx+48] mov r14, qword ptr [rcx+48]
mov r15, qword ptr [rcx+56] mov r15, qword ptr [rcx+56]
mov dword ptr [rsp-8], 40896
ldmxcsr dword ptr [rsp-8] ; load register f0 hi, lo
xorps xmm8, xmm8
cvtsi2sd xmm8, qword ptr [rcx+72]
pslldq xmm8, 8
cvtsi2sd xmm8, qword ptr [rcx+64] cvtsi2sd xmm8, qword ptr [rcx+64]
cvtsi2sd xmm9, qword ptr [rcx+72]
cvtsi2sd xmm2, qword ptr [rcx+80] ; load register f1 hi, lo
cvtsi2sd xmm3, qword ptr [rcx+88] xorps xmm9, xmm9
cvtsi2sd xmm4, qword ptr [rcx+96] cvtsi2sd xmm9, qword ptr [rcx+88]
cvtsi2sd xmm5, qword ptr [rcx+104] pslldq xmm9, 8
cvtsi2sd xmm6, qword ptr [rcx+112] cvtsi2sd xmm9, qword ptr [rcx+80]
; load register f2 hi, lo
xorps xmm2, xmm2
cvtsi2sd xmm2, qword ptr [rcx+104]
pslldq xmm2, 8
cvtsi2sd xmm2, qword ptr [rcx+96]
; load register f3 hi, lo
xorps xmm3, xmm3
cvtsi2sd xmm3, qword ptr [rcx+120]
pslldq xmm3, 8
cvtsi2sd xmm3, qword ptr [rcx+112]
lea rcx, [rcx+64]
; load register f4 hi, lo
xorps xmm4, xmm4
cvtsi2sd xmm4, qword ptr [rcx+72]
pslldq xmm4, 8
cvtsi2sd xmm4, qword ptr [rcx+64]
; load register f5 hi, lo
xorps xmm5, xmm5
cvtsi2sd xmm5, qword ptr [rcx+88]
pslldq xmm5, 8
cvtsi2sd xmm5, qword ptr [rcx+80]
; load register f6 hi, lo
xorps xmm6, xmm6
cvtsi2sd xmm6, qword ptr [rcx+104]
pslldq xmm6, 8
cvtsi2sd xmm6, qword ptr [rcx+96]
; load register f7 hi, lo
xorps xmm7, xmm7
cvtsi2sd xmm7, qword ptr [rcx+120] cvtsi2sd xmm7, qword ptr [rcx+120]
pslldq xmm7, 8
cvtsi2sd xmm7, qword ptr [rcx+112]
; program body ; program body
@ -125,21 +175,23 @@ rx_finish:
mov qword ptr [rcx+40], r13 mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14 mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15 mov qword ptr [rcx+56], r15
movd qword ptr [rcx+64], xmm8 movdqa xmmword ptr [rcx+64], xmm8
movd qword ptr [rcx+72], xmm9 movdqa xmmword ptr [rcx+80], xmm9
movd qword ptr [rcx+80], xmm2 movdqa xmmword ptr [rcx+96], xmm2
movd qword ptr [rcx+88], xmm3 movdqa xmmword ptr [rcx+112], xmm3
movd qword ptr [rcx+96], xmm4 lea rcx, [rcx+64]
movd qword ptr [rcx+104], xmm5 movdqa xmmword ptr [rcx+64], xmm4
movd qword ptr [rcx+112], xmm6 movdqa xmmword ptr [rcx+80], xmm5
movd qword ptr [rcx+120], xmm7 movdqa xmmword ptr [rcx+96], xmm6
movdqa xmmword ptr [rcx+112], xmm7
; load callee-saved registers ; load callee-saved registers
movdqu xmm9, xmmword ptr [rsp] movdqu xmm10, xmmword ptr [rsp]
movdqu xmm8, xmmword ptr [rsp+16] movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm7, xmmword ptr [rsp+32] movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm6, xmmword ptr [rsp+48] movdqu xmm7, xmmword ptr [rsp+48]
add rsp, 64 movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80
pop r15 pop r15
pop r14 pop r14
pop r13 pop r13
@ -171,7 +223,7 @@ rx_read_dataset:
pop r8 pop r8
ret 0 ret 0
rx_read_dataset_full: rx_read_dataset_r:
mov edx, dword ptr [rbx] ; ma mov edx, dword ptr [rbx] ; ma
mov rax, qword ptr [rbx+8] ; dataset mov rax, qword ptr [rbx+8] ; dataset
mov rax, qword ptr [rax+rdx] mov rax, qword ptr [rax+rdx]
@ -179,12 +231,27 @@ rx_read_dataset_full:
xor ecx, dword ptr [rbx+4] ; mx xor ecx, dword ptr [rbx+4] ; mx
mov dword ptr [rbx+4], ecx mov dword ptr [rbx+4], ecx
test ecx, 0FFF8h test ecx, 0FFF8h
jne short rx_read_dataset_full_ret jne short rx_read_dataset_r_ret
and ecx, -8 and ecx, -8
mov dword ptr [rbx], ecx mov dword ptr [rbx], ecx
mov rdx, qword ptr [rbx+8] mov rdx, qword ptr [rbx+8]
prefetcht0 byte ptr [rdx+rcx] prefetcht0 byte ptr [rdx+rcx]
rx_read_dataset_full_ret: rx_read_dataset_r_ret:
ret 0
rx_read_dataset_f:
mov edx, dword ptr [rbx] ; ma
mov rax, qword ptr [rbx+8] ; dataset
cvtdq2pd xmm0, qword ptr [rax+rdx]
add dword ptr [rbx], 8
xor ecx, dword ptr [rbx+4] ; mx
mov dword ptr [rbx+4], ecx
test ecx, 0FFF8h
jne short rx_read_dataset_f_ret
and ecx, -8
mov dword ptr [rbx], ecx
prefetcht0 byte ptr [rax+rcx]
rx_read_dataset_f_ret:
ret 0 ret 0
executeProgram ENDP executeProgram ENDP

View File

@ -19,15 +19,15 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once #pragma once
#define WT_ADD_64 10 #define WT_ADD_64 11
#define WT_ADD_32 2 #define WT_ADD_32 2
#define WT_SUB_64 10 #define WT_SUB_64 11
#define WT_SUB_32 2 #define WT_SUB_32 2
#define WT_MUL_64 21 #define WT_MUL_64 23
#define WT_MULH_64 10 #define WT_MULH_64 10
#define WT_MUL_32 15 #define WT_MUL_32 15
#define WT_IMUL_32 15 #define WT_IMUL_32 15
#define WT_IMULH_64 10 #define WT_IMULH_64 6
#define WT_DIV_64 1 #define WT_DIV_64 1
#define WT_IDIV_64 1 #define WT_IDIV_64 1
#define WT_AND_64 4 #define WT_AND_64 4
@ -47,8 +47,9 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#define WT_FPDIV 8 #define WT_FPDIV 8
#define WT_FPSQRT 6 #define WT_FPSQRT 6
#define WT_FPROUND 2 #define WT_FPROUND 2
#define WT_CALL 24 #define WT_CALL 20
#define WT_RET 18 #define WT_RET 22
constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \ constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \
WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \ WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \
@ -60,6 +61,7 @@ WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \
static_assert(wtSum == 256, static_assert(wtSum == 256,
"Sum of instruction weights must be 256"); "Sum of instruction weights must be 256");
#define REP0(x)
#define REP1(x) x, #define REP1(x) x,
#define REP2(x) REP1(x) x, #define REP2(x) REP1(x) x,
#define REP3(x) REP2(x) x, #define REP3(x) REP2(x) x,
@ -86,6 +88,16 @@ static_assert(wtSum == 256,
#define REP24(x) REP23(x) x, #define REP24(x) REP23(x) x,
#define REP25(x) REP24(x) x, #define REP25(x) REP24(x) x,
#define REP26(x) REP25(x) x, #define REP26(x) REP25(x) x,
//Continuation of the REPn(x) chain: REPn(x) expands to n copies of "x,",
//each level appending one more copy to the previous level.
#define REP27(x) REP26(x) x,
#define REP28(x) REP27(x) x,
#define REP29(x) REP28(x) x,
#define REP30(x) REP29(x) x,
#define REP31(x) REP30(x) x,
#define REP32(x) REP31(x) x,
#define REP33(x) REP32(x) x,
//Larger counts are composed from existing levels instead of being chained one by one.
#define REP40(x) REP32(x) REP8(x)
#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
#define REP256(x) REP128(x) REP128(x)
#define REPNX(x,N) REP##N(x) #define REPNX(x,N) REP##N(x)
#define REPN(x,N) REPNX(x,N) #define REPN(x,N) REPNX(x,N)
#define NUM(x) x #define NUM(x) x

View File

@ -22,16 +22,10 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX { namespace RandomX {
inline double convertToDouble(int64_t x) { //Clears the 11 least-significant bits before conversion. This is done so the number
return (double)(x &-2048L); //fits exactly into the 52-bit mantissa without rounding.
} inline double convertSigned52(int64_t x) {
return (double)(x & -2048L);
inline double convertToDoubleNonZero(int64_t x) {
return (double)((x & -2048L) | 2048);
}
inline double convertToDoubleNonNegative(int64_t x) {
return (double)(x & 9223372036854773760L);
} }
extern "C" { extern "C" {
@ -59,27 +53,11 @@ namespace RandomX {
void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c); void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
bool JMP_COND(uint8_t, convertible_t&, int32_t); bool JMP_COND(uint8_t, convertible_t&, int32_t);
void FPINIT(); void FPINIT();
void FPADD(convertible_t& a, double b, convertible_t& c); void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPSUB(convertible_t& a, double b, convertible_t& c); void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPMUL(convertible_t& a, double b, convertible_t& c); void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPDIV(convertible_t& a, double b, convertible_t& c); void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPSQRT(convertible_t& a, convertible_t& b, convertible_t& c); void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPROUND(convertible_t& a, convertible_t& b, convertible_t& c); void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
inline void FPADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
FPADD(a, b.f64, c);
}
inline void FPSUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
FPSUB(a, b.f64, c);
}
inline void FPMUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
FPMUL(a, b.f64, c);
}
inline void FPDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
FPDIV(a, b.f64, c);
}
} }
} }

View File

@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License
along with RandomX. If not, see<http://www.gnu.org/licenses/>. along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/ */
//#define DEBUG //#define DEBUG
//#define FTZ
#include "instructions.hpp" #include "instructions.hpp"
#include "intrinPortable.h" #include "intrinPortable.h"
#pragma STDC FENV_ACCESS on #pragma STDC FENV_ACCESS on
@ -154,19 +153,17 @@ static inline int32_t safeSub(int32_t a, int32_t b) {
#define subOverflow __subOverflow #define subOverflow __subOverflow
#endif #endif
static double FlushDenormal(double x) { static inline double FlushDenormalNaN(double x) {
if (std::fpclassify(x) == FP_SUBNORMAL) { int fpc = std::fpclassify(x);
return 0; if (fpc == FP_SUBNORMAL || fpc == FP_NAN) {
return 0.0;
} }
return x; return x;
} }
#ifdef FTZ static inline double FlushNaN(double x) {
#undef FTZ return x != x ? 0.0 : x;
#define FTZ(x) FlushDenormal(x) }
#else
#define FTZ(x) x
#endif
namespace RandomX { namespace RandomX {
@ -286,37 +283,95 @@ namespace RandomX {
} }
void FPINIT() { void FPINIT() {
setRoundMode(FE_TONEAREST);
}
void FPADD(convertible_t& a, double b, convertible_t& c) {
c.f64 = FTZ(convertToDouble(a.i64) + b);
}
void FPSUB(convertible_t& a, double b, convertible_t& c) {
c.f64 = FTZ(convertToDouble(a.i64) - b);
}
void FPMUL(convertible_t& a, double b, convertible_t& c) {
c.f64 = FTZ(convertToDoubleNonZero(a.i64) * b);
}
void FPDIV(convertible_t& a, double b, convertible_t& c) {
c.f64 = FTZ(convertToDoubleNonZero(a.i64) / b);
}
void FPSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
#ifdef __SSE2__ #ifdef __SSE2__
double d = convertToDoubleNonNegative(a.i64); _mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
c.f64 = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_pd(&d)));
#else #else
c.f64 = FTZ(sqrt(convertToDoubleNonNegative(a.i64))); setRoundMode(FE_TONEAREST);
#endif #endif
} }
void FPROUND(convertible_t& a, convertible_t& b, convertible_t& c) { void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
c.f64 = convertToDouble(a.i64); #ifdef __SSE2__
__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
__m128d ad = _mm_cvtepi32_pd(ai);
__m128d bd = _mm_load_pd(&b.lo.f64);
__m128d cd = _mm_add_pd(ad, bd);
_mm_store_pd(&c.lo.f64, cd);
#else
double alo = (double)a.i32lo;
double ahi = (double)a.i32hi;
c.lo.f64 = alo + b.lo.f64;
c.hi.f64 = ahi + b.hi.f64;
#endif
}
//FPSUB: c = a - b, computed separately for the two 64-bit lanes.
//a - integer operand; its low and high 32-bit halves are each converted from
//    signed int32 to double (low half pairs with b.lo, high half with b.hi)
//b - floating point source register
//c - floating point destination register
void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
	__m128d ad = _mm_cvtepi32_pd(ai);
	//unaligned load/store: do not depend on the caller having placed the
	//registers on a 16-byte boundary (the aligned forms fault otherwise)
	__m128d bd = _mm_loadu_pd(&b.lo.f64);
	__m128d cd = _mm_sub_pd(ad, bd);
	_mm_storeu_pd(&c.lo.f64, cd);
#else
	double alo = (double)a.i32lo;
	double ahi = (double)a.i32hi;
	c.lo.f64 = alo - b.lo.f64;
	c.hi.f64 = ahi - b.hi.f64;
#endif
}
//FPMUL: c = a * b, computed separately for the two 64-bit lanes.
//A lane whose product is NaN is replaced by +0.0.
//a - integer operand; its low and high 32-bit halves are each converted from
//    signed int32 to double (low half pairs with b.lo, high half with b.hi)
//b - floating point source register
//c - floating point destination register
void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
	__m128d ad = _mm_cvtepi32_pd(ai);
	//unaligned load/store: do not depend on the caller having placed the
	//registers on a 16-byte boundary (the aligned forms fault otherwise)
	__m128d bd = _mm_loadu_pd(&b.lo.f64);
	__m128d cd = _mm_mul_pd(ad, bd);
	//comparing a lane with itself is false only for NaN, so the mask is all
	//ones for ordinary values and all zeros for NaN lanes
	__m128d mask = _mm_cmpeq_pd(cd, cd);
	cd = _mm_and_pd(cd, mask);
	_mm_storeu_pd(&c.lo.f64, cd);
#else
	double alo = (double)a.i32lo;
	double ahi = (double)a.i32hi;
	c.lo.f64 = FlushNaN(alo * b.lo.f64);
	c.hi.f64 = FlushNaN(ahi * b.hi.f64);
#endif
}
//FPDIV: c = a / b, computed separately for the two 64-bit lanes.
//A lane whose quotient is NaN is replaced by +0.0. The portable path also
//flushes subnormal quotients to zero explicitly; the SSE2 path contains no
//such step and leaves that to the MXCSR mode selected by FPINIT.
//a - integer operand; its low and high 32-bit halves are each converted from
//    signed int32 to double (low half pairs with b.lo, high half with b.hi)
//b - floating point source register (divisor)
//c - floating point destination register
void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
	__m128d ad = _mm_cvtepi32_pd(ai);
	//unaligned load/store: do not depend on the caller having placed the
	//registers on a 16-byte boundary (the aligned forms fault otherwise)
	__m128d bd = _mm_loadu_pd(&b.lo.f64);
	__m128d cd = _mm_div_pd(ad, bd);
	//comparing a lane with itself is false only for NaN, so the mask is all
	//ones for ordinary values and all zeros for NaN lanes
	__m128d mask = _mm_cmpeq_pd(cd, cd);
	cd = _mm_and_pd(cd, mask);
	_mm_storeu_pd(&c.lo.f64, cd);
#else
	double alo = (double)a.i32lo;
	double ahi = (double)a.i32hi;
	c.lo.f64 = FlushDenormalNaN(alo / b.lo.f64);
	c.hi.f64 = FlushDenormalNaN(ahi / b.hi.f64);
#endif
}
//FPSQRT: c = sqrt(|a|), computed separately for the two 64-bit lanes.
//a - integer operand; its low and high 32-bit halves are each converted from
//    signed int32 to double, the sign is dropped and the square root is taken
//    (low half goes to c.lo, high half to c.hi)
//b - not used by this instruction
//c - floating point destination register
void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
	__m128d ad = _mm_cvtepi32_pd(ai);
	//every bit except the sign bit of each lane; written as a literal so no
	//shift into the sign bit of a signed type is needed
	const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(0x7fffffffffffffffLL));
	ad = _mm_and_pd(ad, absmask);
	__m128d cd = _mm_sqrt_pd(ad);
	//unaligned store: do not depend on the caller having placed the register
	//on a 16-byte boundary (the aligned form faults otherwise)
	_mm_storeu_pd(&c.lo.f64, cd);
#else
	double alo = (double)a.i32lo;
	double ahi = (double)a.i32hi;
	c.lo.f64 = sqrt(std::abs(alo));
	c.hi.f64 = sqrt(std::abs(ahi));
#endif
}
void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
c.lo.f64 = convertSigned52(a.i64);
switch (a.u64 & 3) { switch (a.u64 & 3) {
case RoundDown: case RoundDown:
#ifdef DEBUG #ifdef DEBUG

View File

@ -79,14 +79,6 @@ void readInt(int argc, char** argv, int& out, int defaultValue) {
out = defaultValue; out = defaultValue;
} }
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
for (int i = 0; i < RandomX::RegistersCount; ++i)
os << std::hex << "r" << i << " = " << rf.r[i].u64 << std::endl << std::dec;
for (int i = 0; i < RandomX::RegistersCount; ++i)
os << std::hex << "f" << i << " = " << rf.f[i].u64 << " (" << rf.f[i].f64 << ")" << std::endl << std::dec;
return os;
}
class AtomicHash { class AtomicHash {
public: public:
AtomicHash() { AtomicHash() {
@ -282,7 +274,7 @@ int main(int argc, char** argv) {
std::cout << "Calculated result: "; std::cout << "Calculated result: ";
result.print(std::cout); result.print(std::cout);
if(programCount == 1000) if(programCount == 1000)
std::cout << "Reference result: f6bf06465d5fa1b1dc919140b9e9f9e210b07ae6d662988458a172e9a267eb3f" << std::endl; std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
/*if (threadCount == 1 && !compiled) { /*if (threadCount == 1 && !compiled) {
auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0]; auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0];

File diff suppressed because it is too large Load Diff