mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-12-22 15:58:53 +00:00
Vector FPU instructions
JitCompilerX86 - static code written in asm. Updated ALU/FPU tests. Updated instruction weights.
This commit is contained in:
parent
a09bee8d60
commit
3caecc7646
6
makefile
6
makefile
@ -12,6 +12,9 @@ OBJDIR=obj
|
||||
LDFLAGS=-lpthread
|
||||
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
|
||||
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o)
|
||||
ifeq ($(PLATFORM),x86_64)
|
||||
ROBJS += $(OBJDIR)/JitCompilerX86-static.o
|
||||
endif
|
||||
|
||||
all: release test
|
||||
|
||||
@ -57,6 +60,9 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) |
|
||||
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR)
|
||||
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
|
||||
|
||||
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_r.inc read_f.inc)) | $(OBJDIR)
|
||||
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@
|
||||
|
||||
$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
|
||||
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@
|
||||
|
||||
|
@ -54,7 +54,7 @@ namespace RandomX {
|
||||
(this->*generator)(instr, i);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::gena(Instruction& instr) {
|
||||
void AssemblyGeneratorX86::genar(Instruction& instr) {
|
||||
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
||||
switch (instr.loca & 7)
|
||||
{
|
||||
@ -63,7 +63,7 @@ namespace RandomX {
|
||||
case 2:
|
||||
case 3:
|
||||
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
|
||||
asmCode << "\tcall rx_read_dataset" << std::endl;
|
||||
asmCode << "\tcall rx_read_dataset_r" << std::endl;
|
||||
return;
|
||||
|
||||
case 4:
|
||||
@ -80,6 +80,33 @@ namespace RandomX {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Emits the assembly that fetches operand A for a floating-point instruction.
//
// The source integer register is first XORed with the instruction's `addra`
// constant. Depending on the low three bits of `loca`, the generated code then
// either calls rx_read_dataset_f (locations 0-3, register value passed in ecx)
// or converts a scratchpad entry into xmm0 with cvtdq2pd, masking the index
// with ScratchpadL2 - 1 (location 4) or ScratchpadL1 - 1 (locations 5-7).
void AssemblyGeneratorX86::genaf(Instruction& instr) {
	const auto srcIndex = instr.rega % RegistersCount;
	const auto location = instr.loca & 7;

	// Hex immediate is written MASM-style: leading 0, trailing 'h'.
	asmCode << "\txor " << regR[srcIndex] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;

	if (location < 4) {
		asmCode << "\tmov ecx, " << regR32[srcIndex] << std::endl;
		asmCode << "\tcall rx_read_dataset_f" << std::endl;
		return;
	}

	// Scratchpad read: only the mask differs between the two scratchpad levels.
	asmCode << "\tmov eax, " << regR32[srcIndex] << std::endl;
	if (location == 4)
		asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
	else
		asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
	asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi + rax * 8]" << std::endl;
}
|
||||
|
||||
void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) {
|
||||
switch (instr.locb & 7)
|
||||
{
|
||||
@ -87,8 +114,6 @@ namespace RandomX {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl;
|
||||
asmCode << "\t" << instrx86 << " rax, cl" << std::endl;
|
||||
return;
|
||||
@ -133,26 +158,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::genbf(Instruction& instr, const char* instrx86) {
|
||||
asmCode << "\tand rax, -2048" << std::endl;
|
||||
asmCode << "\tcvtsi2sd xmm0, rax" << std::endl;
|
||||
switch (instr.locb & 7)
|
||||
{
|
||||
case 0:
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
|
||||
return;
|
||||
default:
|
||||
convertible_t bimm;
|
||||
bimm.f64 = (double)instr.imm32;
|
||||
asmCode << "\tmov rax, " << bimm.i64 << std::endl;
|
||||
asmCode << "\tmovd xmm1, rax" << std::endl;
|
||||
asmCode << "\t" << instrx86 << " xmm0, xmm1" << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::gencr(Instruction& instr) {
|
||||
@ -165,7 +171,7 @@ namespace RandomX {
|
||||
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
|
||||
asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
|
||||
if (trace) {
|
||||
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl;
|
||||
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl;
|
||||
}
|
||||
return;
|
||||
|
||||
@ -178,76 +184,75 @@ namespace RandomX {
|
||||
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
|
||||
asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
|
||||
if (trace) {
|
||||
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl;
|
||||
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl;
|
||||
}
|
||||
return;
|
||||
|
||||
default:
|
||||
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl;
|
||||
if (trace) {
|
||||
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl;
|
||||
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::gencf(Instruction& instr) {
|
||||
void AssemblyGeneratorX86::gencf(Instruction& instr, bool alwaysLow = false) {
|
||||
if(!alwaysLow)
|
||||
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
|
||||
const char* store = (!alwaysLow && (instr.locc & 8)) ? "movhpd" : "movlpd";
|
||||
switch (instr.locc & 7)
|
||||
{
|
||||
case 0:
|
||||
case 4:
|
||||
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
|
||||
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
|
||||
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
|
||||
asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl;
|
||||
asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl;
|
||||
break;
|
||||
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
|
||||
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
|
||||
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
|
||||
asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl;
|
||||
break;
|
||||
|
||||
default:
|
||||
asmCode << "\tmovsd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
|
||||
asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl;
|
||||
break;
|
||||
}
|
||||
if (trace) {
|
||||
asmCode << "\tmovd qword ptr [rsi + rdi * 8 + 262144], xmm0" << std::endl;
|
||||
asmCode << "\t" << store << " qword ptr [rsi + rdi * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tadd rax, ";
|
||||
genbr1(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tadd eax, ";
|
||||
genbr132(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tsub rax, ";
|
||||
genbr1(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tsub eax, ";
|
||||
genbr132(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\timul rax, ";
|
||||
if ((instr.locb & 7) >= 6) {
|
||||
asmCode << "rax, ";
|
||||
@ -257,7 +262,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tmov rcx, ";
|
||||
genbr1(instr);
|
||||
asmCode << "\tmul rcx" << std::endl;
|
||||
@ -266,7 +271,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_MUL_32(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tmov ecx, eax" << std::endl;
|
||||
asmCode << "\tmov eax, ";
|
||||
genbr132(instr);
|
||||
@ -275,7 +280,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tmovsxd rcx, eax" << std::endl;
|
||||
if ((instr.locb & 7) >= 6) {
|
||||
asmCode << "\tmov rax, " << instr.imm32 << std::endl;
|
||||
@ -288,7 +293,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tmov rcx, ";
|
||||
genbr1(instr);
|
||||
asmCode << "\timul rcx" << std::endl;
|
||||
@ -297,7 +302,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
if ((instr.locb & 7) >= 6) {
|
||||
if (instr.imm32 == 0) {
|
||||
asmCode << "\tmov ecx, 1" << std::endl;
|
||||
@ -318,7 +323,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tmov edx, ";
|
||||
genbr132(instr);
|
||||
asmCode << "\tcmp edx, -1" << std::endl;
|
||||
@ -339,123 +344,125 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tand rax, ";
|
||||
genbr1(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tand eax, ";
|
||||
genbr132(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tor rax, ";
|
||||
genbr1(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tor eax, ";
|
||||
genbr132(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\txor rax, ";
|
||||
genbr1(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\txor eax, ";
|
||||
genbr132(instr);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
genbr0(instr, "shl");
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
genbr0(instr, "shr");
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
genbr0(instr, "sar");
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
genbr0(instr, "rol");
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
genbr0(instr, "ror");
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_FPADD(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genbf(instr, "addsd");
|
||||
genaf(instr);
|
||||
genbf(instr, "addpd");
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_FPSUB(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genbf(instr, "subsd");
|
||||
genaf(instr);
|
||||
genbf(instr, "subpd");
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_FPMUL(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
asmCode << "\tor rax, 2048" << std::endl;
|
||||
genbf(instr, "mulsd");
|
||||
genaf(instr);
|
||||
genbf(instr, "mulpd");
|
||||
asmCode << "\tmovaps xmm1, xmm0" << std::endl;
|
||||
asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl;
|
||||
asmCode << "\tandps xmm0, xmm1" << std::endl;
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_FPDIV(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
asmCode << "\tor rax, 2048" << std::endl;
|
||||
genbf(instr, "divsd");
|
||||
genaf(instr);
|
||||
genbf(instr, "divpd");
|
||||
asmCode << "\tmovaps xmm1, xmm0" << std::endl;
|
||||
asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl;
|
||||
asmCode << "\tandps xmm0, xmm1" << std::endl;
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
asmCode << "\tmov rcx, 9223372036854773760" << std::endl;
|
||||
asmCode << "\tand rax, rcx" << std::endl;
|
||||
asmCode << "\tcvtsi2sd xmm0, rax" << std::endl;
|
||||
asmCode << "\tsqrtsd xmm0, xmm0" << std::endl;
|
||||
genaf(instr);
|
||||
asmCode << "\tandps xmm0, xmm10" << std::endl;
|
||||
asmCode << "\tsqrtpd xmm0, xmm0" << std::endl;
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tmov rcx, rax" << std::endl;
|
||||
asmCode << "\tshl eax, 13" << std::endl;
|
||||
asmCode << "\tand rcx, -2048" << std::endl;
|
||||
asmCode << "\tand eax, 24576" << std::endl;
|
||||
asmCode << "\tcvtsi2sd xmm0, rcx" << std::endl;
|
||||
asmCode << "\tcvtsi2sd " << regF[instr.regc % RegistersCount] << ", rcx" << std::endl;
|
||||
asmCode << "\tor eax, 40896" << std::endl;
|
||||
asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl;
|
||||
asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl;
|
||||
gencf(instr);
|
||||
gencf(instr, true);
|
||||
}
|
||||
|
||||
static inline const char* jumpCondition(Instruction& instr, bool invert = false) {
|
||||
@ -481,7 +488,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl;
|
||||
asmCode << "\t" << jumpCondition(instr);
|
||||
asmCode << " short taken_call_" << i << std::endl;
|
||||
@ -489,14 +496,14 @@ namespace RandomX {
|
||||
asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl;
|
||||
asmCode << "taken_call_" << i << ":" << std::endl;
|
||||
if (trace) {
|
||||
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl;
|
||||
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl;
|
||||
}
|
||||
asmCode << "\tpush rax" << std::endl;
|
||||
asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl;
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) {
|
||||
gena(instr);
|
||||
genar(instr);
|
||||
asmCode << "\tcmp rsp, rbp" << std::endl;
|
||||
asmCode << "\tje short not_taken_ret_" << i << std::endl;
|
||||
asmCode << "\txor rax, qword ptr [rsp + 8]" << std::endl;
|
||||
|
@ -38,13 +38,14 @@ namespace RandomX {
|
||||
static InstructionGenerator engine[256];
|
||||
std::stringstream asmCode;
|
||||
|
||||
void gena(Instruction&);
|
||||
void genar(Instruction&);
|
||||
void genaf(Instruction&);
|
||||
void genbr0(Instruction&, const char*);
|
||||
void genbr1(Instruction&);
|
||||
void genbr132(Instruction&);
|
||||
void genbf(Instruction&, const char*);
|
||||
void gencr(Instruction&);
|
||||
void gencf(Instruction&);
|
||||
void gencf(Instruction&, bool);
|
||||
|
||||
void generateCode(Instruction&, int);
|
||||
|
||||
|
@ -26,9 +26,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
namespace RandomX {
|
||||
|
||||
CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) {
|
||||
#if !defined(_M_X64) && !defined(__x86_64__)
|
||||
throw std::runtime_error("Compiled VM only supports x86-64 CPUs");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) {
|
||||
@ -51,7 +49,7 @@ namespace RandomX {
|
||||
void CompiledVirtualMachine::execute() {
|
||||
//executeProgram(reg, mem, scratchpad, readDataset);
|
||||
compiler.getProgramFunc()(reg, mem, scratchpad);
|
||||
#ifdef TRACE
|
||||
#ifdef TRACEVM
|
||||
for (int32_t i = InstructionCount - 1; i >= 0; --i) {
|
||||
std::cout << std::hex << tracepad[i].u64 << std::endl;
|
||||
}
|
||||
|
@ -18,7 +18,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
//#define TRACE
|
||||
//#define TRACEVM
|
||||
#include "VirtualMachine.hpp"
|
||||
#include "JitCompilerX86.hpp"
|
||||
|
||||
@ -34,7 +34,7 @@ namespace RandomX {
|
||||
return compiler.getCode();
|
||||
}
|
||||
private:
|
||||
#ifdef TRACE
|
||||
#ifdef TRACEVM
|
||||
convertible_t tracepad[InstructionCount];
|
||||
#endif
|
||||
JitCompilerX86 compiler;
|
||||
|
@ -44,9 +44,11 @@ namespace RandomX {
|
||||
*(((uint32_t*)®) + i) = gen();
|
||||
}
|
||||
FPINIT();
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg.f[i].f64 = (double)reg.f[i].i64;
|
||||
for (int i = 0; i < RegistersCount; ++i) {
|
||||
reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
|
||||
reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
|
||||
}
|
||||
//std::cout << reg;
|
||||
p.initialize(gen);
|
||||
mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7;
|
||||
mem.mx = *(((uint32_t*)seed) + 5);
|
||||
@ -119,9 +121,9 @@ namespace RandomX {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
return reg.r[inst.regb % RegistersCount];
|
||||
case 4:
|
||||
case 5:
|
||||
return reg.r[inst.regb % RegistersCount];
|
||||
case 6:
|
||||
case 7:
|
||||
convertible_t temp;
|
||||
@ -130,22 +132,6 @@ namespace RandomX {
|
||||
}
|
||||
}
|
||||
|
||||
double InterpretedVirtualMachine::loadbf(Instruction& inst) {
|
||||
switch (inst.locb & 7)
|
||||
{
|
||||
case 0:
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
return reg.f[inst.regb % RegistersCount].f64;
|
||||
case 6:
|
||||
case 7:
|
||||
return (double)inst.imm32;
|
||||
}
|
||||
}
|
||||
|
||||
convertible_t& InterpretedVirtualMachine::getcr(Instruction& inst) {
|
||||
addr_t addr;
|
||||
switch (inst.locc & 7)
|
||||
@ -168,25 +154,43 @@ namespace RandomX {
|
||||
}
|
||||
}
|
||||
|
||||
convertible_t& InterpretedVirtualMachine::getcf(Instruction& inst) {
|
||||
void InterpretedVirtualMachine::writecf(Instruction& inst, fpu_reg_t& regc) {
|
||||
addr_t addr;
|
||||
switch (inst.locc & 7)
|
||||
{
|
||||
case 0:
|
||||
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
|
||||
return scratchpad[addr % ScratchpadL2];
|
||||
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
|
||||
return scratchpad[addr % ScratchpadL1];
|
||||
|
||||
case 4:
|
||||
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
|
||||
scratchpad[addr % ScratchpadL2] = (inst.locc & 8) ? regc.hi : regc.lo;
|
||||
break;
|
||||
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
return reg.f[inst.regc % RegistersCount];
|
||||
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
|
||||
scratchpad[addr % ScratchpadL1] = (inst.locc & 8) ? regc.hi : regc.lo;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Writes the low half (`regc.lo`) of an FPU register to the scratchpad when the
// instruction's C location selects a scratchpad destination.
//
// inst - instruction supplying `locc`, `regc` and `addrc`
// regc - FPU register whose low element is stored
//
// Location (locc & 7) 4 indexes modulo ScratchpadL2, locations 5-7 index modulo
// ScratchpadL1, and locations 0-3 perform no scratchpad write.
void InterpretedVirtualMachine::writecflo(Instruction& inst, fpu_reg_t& regc) {
	const auto target = inst.locc & 7;
	if (target < 4)
		return;

	// Address is the integer register's low 32 bits XORed with the instruction constant.
	const addr_t slot = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
	if (target == 4)
		scratchpad[slot % ScratchpadL2] = regc.lo;
	else
		scratchpad[slot % ScratchpadL1] = regc.lo;
}
|
||||
|
||||
@ -194,22 +198,18 @@ namespace RandomX {
|
||||
if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
|
||||
|
||||
#define FPU_RETIRE(x) x(a, b, c); \
|
||||
writecf(inst, c); \
|
||||
if(trace) { \
|
||||
convertible_t bc; \
|
||||
bc.f64 = b; \
|
||||
std::cout << std::hex << /*a.u64 << " " << bc.u64 << " " <<*/ c.u64 << std::endl; \
|
||||
std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl; \
|
||||
} \
|
||||
if(fpuCheck) { \
|
||||
convertible_t bc; \
|
||||
if(c.f64 != c.f64) { \
|
||||
if(c.hi.f64 != c.hi.f64 || c.lo.f64 != c.lo.f64) { \
|
||||
std::stringstream ss; \
|
||||
bc.f64 = b; \
|
||||
ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << bc.u64 << ") = " << c.u64; \
|
||||
ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
|
||||
throw std::runtime_error(ss.str()); \
|
||||
} else if (std::fpclassify(c.f64) == FP_SUBNORMAL) {\
|
||||
} else if (std::fpclassify(c.hi.f64) == FP_SUBNORMAL || std::fpclassify(c.lo.f64) == FP_SUBNORMAL) {\
|
||||
std::stringstream ss; \
|
||||
bc.f64 = b; \
|
||||
ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << bc.u64 << ") = " << c.u64; \
|
||||
ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
|
||||
throw std::runtime_error(ss.str()); \
|
||||
} \
|
||||
}
|
||||
@ -220,8 +220,13 @@ namespace RandomX {
|
||||
#define INC_COUNT(x)
|
||||
#endif
|
||||
|
||||
#define FPU_RETIRE_NB(x) x(a, b, c); \
|
||||
if(trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl;
|
||||
// Retire sequence selected by FPU_RETIRE_##x when x is FPSQRT.
// The macro argument is not used: FPSQRT(a, b, c) is invoked directly, the result
// register c is passed to writecf, and with tracing enabled the high or low
// element of c (chosen by bit 3 of inst.locc) is printed in hex.
// Expects a, b, c, inst and trace to be in scope at the expansion site.
#define FPU_RETIRE_FPSQRT(x) FPSQRT(a, b, c); \
|
||||
writecf(inst, c); \
|
||||
if(trace) std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl;
|
||||
|
||||
// Retire sequence selected by FPU_RETIRE_##x when x is FPROUND.
// The macro argument is not used: FPROUND(a, b, c) is invoked directly, then only
// the low element of c is stored through writecflo, and with tracing enabled
// c.lo is printed in hex.
// Expects a, b, c, inst and trace to be in scope at the expansion site.
#define FPU_RETIRE_FPROUND(x) FPROUND(a, b, c); \
|
||||
writecflo(inst, c); \
|
||||
if(trace) std::cout << std::hex << c.lo.u64 << std::endl;
|
||||
|
||||
#define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
|
||||
INC_COUNT(x) \
|
||||
@ -242,17 +247,17 @@ namespace RandomX {
|
||||
#define FPU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
|
||||
INC_COUNT(x) \
|
||||
convertible_t a = loada(inst); \
|
||||
double b = loadbf(inst); \
|
||||
convertible_t& c = getcf(inst); \
|
||||
fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \
|
||||
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
|
||||
FPU_RETIRE(x) \
|
||||
}
|
||||
|
||||
#define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
|
||||
INC_COUNT(x) \
|
||||
convertible_t a = loada(inst); \
|
||||
convertible_t b; \
|
||||
convertible_t& c = getcf(inst); \
|
||||
FPU_RETIRE_NB(x) \
|
||||
fpu_reg_t b; \
|
||||
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
|
||||
FPU_RETIRE_##x(x) \
|
||||
}
|
||||
|
||||
ALU_INST(ADD_64)
|
||||
|
@ -18,7 +18,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#define STATS
|
||||
//#define STATS
|
||||
#include "VirtualMachine.hpp"
|
||||
#include "Program.hpp"
|
||||
#include <vector>
|
||||
@ -88,9 +88,9 @@ namespace RandomX {
|
||||
convertible_t loada(Instruction&);
|
||||
convertible_t loadbr0(Instruction&);
|
||||
convertible_t loadbr1(Instruction&);
|
||||
double loadbf(Instruction&);
|
||||
convertible_t& getcr(Instruction&);
|
||||
convertible_t& getcf(Instruction&);
|
||||
void writecf(Instruction&, fpu_reg_t&);
|
||||
void writecflo(Instruction&, fpu_reg_t&);
|
||||
|
||||
void stackPush(convertible_t& c) {
|
||||
stack.push_back(c);
|
||||
|
58
src/JitCompilerX86-static.S
Normal file
58
src/JitCompilerX86-static.S
Normal file
@ -0,0 +1,58 @@
|
||||
;# Copyright (c) 2018 tevador
|
||||
;#
|
||||
;# This file is part of RandomX.
|
||||
;#
|
||||
;# RandomX is free software: you can redistribute it and/or modify
|
||||
;# it under the terms of the GNU General Public License as published by
|
||||
;# the Free Software Foundation, either version 3 of the License, or
|
||||
;# (at your option) any later version.
|
||||
;#
|
||||
;# RandomX is distributed in the hope that it will be useful,
|
||||
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;# GNU General Public License for more details.
|
||||
;#
|
||||
;# You should have received a copy of the GNU General Public License
|
||||
;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
|
||||
.intel_syntax noprefix
|
||||
#if defined(__APPLE__)
|
||||
.text
|
||||
#else
|
||||
.section .text
|
||||
#endif
|
||||
;# DECL(x) maps a C identifier to the label name the linker expects.
;# A leading underscore is added only for Apple (Mach-O) targets.
;# 64-bit Windows toolchains do not decorate C names, so adding the
;# underscore there would leave the extern "C" declarations of these
;# labels unresolved at link time.
#if defined(__APPLE__)
#define DECL(x) _##x
#else
#define DECL(x) x
#endif
|
||||
.global DECL(randomx_program_prologue)
|
||||
.global DECL(randomx_program_begin)
|
||||
.global DECL(randomx_program_epilogue)
|
||||
.global DECL(randomx_program_read_r)
|
||||
.global DECL(randomx_program_read_f)
|
||||
.global DECL(randomx_program_end)
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_prologue):
|
||||
#include "asm/program_prologue_linux.inc"
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_begin):
|
||||
nop
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_epilogue):
|
||||
#include "asm/program_epilogue_linux.inc"
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_read_r):
|
||||
#include "asm/program_read_r.inc"
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_read_f):
|
||||
#include "asm/program_read_f.inc"
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_end):
|
||||
nop
|
59
src/JitCompilerX86-static.asm
Normal file
59
src/JitCompilerX86-static.asm
Normal file
@ -0,0 +1,59 @@
|
||||
;# Copyright (c) 2018 tevador
|
||||
;#
|
||||
;# This file is part of RandomX.
|
||||
;#
|
||||
;# RandomX is free software: you can redistribute it and/or modify
|
||||
;# it under the terms of the GNU General Public License as published by
|
||||
;# the Free Software Foundation, either version 3 of the License, or
|
||||
;# (at your option) any later version.
|
||||
;#
|
||||
;# RandomX is distributed in the hope that it will be useful,
|
||||
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;# GNU General Public License for more details.
|
||||
;#
|
||||
;# You should have received a copy of the GNU General Public License
|
||||
;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
|
||||
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
|
||||
|
||||
PUBLIC randomx_program_prologue
|
||||
PUBLIC randomx_program_begin
|
||||
PUBLIC randomx_program_epilogue
|
||||
PUBLIC randomx_program_read_r
|
||||
PUBLIC randomx_program_read_f
|
||||
PUBLIC randomx_program_end
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_prologue PROC
|
||||
include asm/program_prologue_win64.inc
|
||||
randomx_program_prologue ENDP
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_begin PROC
|
||||
nop
|
||||
randomx_program_begin ENDP
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_epilogue PROC
|
||||
include asm/program_epilogue_win64.inc
|
||||
randomx_program_epilogue ENDP
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_read_r PROC
|
||||
include asm/program_read_r.inc
|
||||
randomx_program_read_r ENDP
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_read_f PROC
|
||||
include asm/program_read_f.inc
|
||||
randomx_program_read_f ENDP
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_end PROC
|
||||
nop
|
||||
randomx_program_end ENDP
|
||||
|
||||
_RANDOMX_JITX86_STATIC ENDS
|
||||
|
||||
END
|
27
src/JitCompilerX86-static.hpp
Normal file
27
src/JitCompilerX86-static.hpp
Normal file
@ -0,0 +1,27 @@
|
||||
/*
|
||||
Copyright (c) 2018 tevador
|
||||
|
||||
This file is part of RandomX.
|
||||
|
||||
RandomX is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
RandomX is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// Symbols exported by the hand-written assembly
// (JitCompilerX86-static.S for GNU-style assemblers, JitCompilerX86-static.asm for MASM).
// C linkage keeps the C++ names identical to the assembler labels.
// NOTE(review): these look like address markers delimiting static code regions
// rather than functions meant to be called directly - confirm in JitCompilerX86.cpp.
extern "C" {
|
||||
// Label preceding the included program_prologue_*.inc code.
void randomx_program_prologue();
|
||||
// 64-byte aligned label followed by a single nop.
void randomx_program_begin();
|
||||
// Label preceding the included program_epilogue_*.inc code.
void randomx_program_epilogue();
|
||||
// Label preceding the included program_read_r.inc code.
void randomx_program_read_r();
|
||||
// Label preceding the included program_read_f.inc code.
void randomx_program_read_f();
|
||||
// 64-byte aligned label followed by a single nop; last exported symbol in the assembly file.
void randomx_program_end();
|
||||
}
|
@ -34,6 +34,16 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
|
||||
namespace RandomX {
|
||||
|
||||
#if !defined(_M_X64) && !defined(__x86_64__)
|
||||
// Fallback constructor compiled when neither _M_X64 nor __x86_64__ is defined.
// Always throws std::runtime_error, so the JIT compiler cannot be instantiated
// on such targets.
JitCompilerX86::JitCompilerX86() {
|
||||
throw std::runtime_error("JIT compiler only supports x86-64 CPUs");
|
||||
}
|
||||
|
||||
// Fallback for targets where neither _M_X64 nor __x86_64__ is defined.
// Intentionally empty: `gen` is ignored and no code is generated.
void JitCompilerX86::generateProgram(Pcg32& gen) {
|
||||
}
|
||||
#else
|
||||
|
||||
/*
|
||||
REGISTER ALLOCATION:
|
||||
|
||||
@ -41,7 +51,7 @@ namespace RandomX {
|
||||
rbx -> MemoryRegisters& memory
|
||||
rcx -> temporary
|
||||
rdx -> temporary
|
||||
rsi -> convertible_t& scratchpad
|
||||
rsi -> convertible_t* scratchpad
|
||||
rdi -> "ic" (instruction counter)
|
||||
rbp -> beginning of VM stack
|
||||
rsp -> end of VM stack
|
||||
@ -63,6 +73,7 @@ namespace RandomX {
|
||||
xmm7 -> "f7"
|
||||
xmm8 -> "f0"
|
||||
xmm9 -> "f1"
|
||||
xmm10 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff
|
||||
|
||||
STACK STRUCTURE:
|
||||
|
||||
@ -81,127 +92,23 @@ namespace RandomX {
|
||||
|
||||
*/
|
||||
|
||||
constexpr uint8_t ic3 = ((InstructionCount + 1) >> 24);
|
||||
constexpr uint8_t ic2 = ((InstructionCount + 1) >> 16);
|
||||
constexpr uint8_t ic1 = ((InstructionCount + 1) >> 8);
|
||||
constexpr uint8_t ic0 = ((InstructionCount + 1) >> 0);
|
||||
#include "JitCompilerX86-static.hpp"
|
||||
|
||||
const uint8_t prologue[] = {
|
||||
0x53, //push rbx
|
||||
0x55, //push rbp
|
||||
#ifdef _WIN32
|
||||
0x57, //push rdi
|
||||
0x56, //push rsi
|
||||
#endif
|
||||
0x41, 0x54, //push r12
|
||||
0x41, 0x55, //push r13
|
||||
0x41, 0x56, //push r14
|
||||
0x41, 0x57, //push r15
|
||||
#ifdef _WIN32
|
||||
0x48, 0x83, 0xec, 0x48, //sub rsp,0x48
|
||||
0xf3, 0x0f, 0x7f, 0x74, 0x24, 0x30, //movdqu XMMWORD PTR[rsp + 0x30],xmm6
|
||||
0xf3, 0x0f, 0x7f, 0x7c, 0x24, 0x20, //movdqu XMMWORD PTR[rsp + 0x20],xmm7
|
||||
0xf3, 0x44, 0x0f, 0x7f, 0x44, 0x24, 0x10, //movdqu XMMWORD PTR[rsp + 0x10],xmm8
|
||||
0xf3, 0x44, 0x0f, 0x7f, 0x0c, 0x24, //movdqu XMMWORD PTR[rsp],xmm9
|
||||
0x51, //push rcx
|
||||
0x48, 0x8b, 0xda, //mov rbx,rdx
|
||||
0x49, 0x8b, 0xf0, //mov rsi,r8
|
||||
#else
|
||||
0x57, //push rdi
|
||||
0x48, 0x8b, 0xde, //mov rbx, rsi
|
||||
0x48, 0x8b, 0xf2, //mov rsi, rdx
|
||||
0x48, 0x8b, 0xcf, //mov rcx, rdi
|
||||
#endif
|
||||
0x48, 0x8b, 0xec, //mov rbp,rsp
|
||||
0x48, 0xc7, 0xc7, ic0, ic1, ic2, ic3, //mov rdi, "InstructionCount"
|
||||
0x4c, 0x8b, 0x01, //mov r8,QWORD PTR[rcx]
|
||||
0x4c, 0x8b, 0x49, 0x08, //mov r9,QWORD PTR[rcx+0x8]
|
||||
0x4c, 0x8b, 0x51, 0x10, //mov r10,QWORD PTR[rcx+0x10]
|
||||
0x4c, 0x8b, 0x59, 0x18, //mov r11,QWORD PTR[rcx+0x18]
|
||||
0x4c, 0x8b, 0x61, 0x20, //mov r12,QWORD PTR[rcx+0x20]
|
||||
0x4c, 0x8b, 0x69, 0x28, //mov r13,QWORD PTR[rcx+0x28]
|
||||
0x4c, 0x8b, 0x71, 0x30, //mov r14,QWORD PTR[rcx+0x30]
|
||||
0x4c, 0x8b, 0x79, 0x38, //mov r15,QWORD PTR[rcx+0x38]
|
||||
0xc7, 0x44, 0x24, 0xf8, 0xc0, 0x9f, 0x00, //mov DWORD PTR[rsp-0x8],0x9fc0
|
||||
0x00,
|
||||
0x0f, 0xae, 0x54, 0x24, 0xf8, //ldmxcsr DWORD PTR[rsp-0x8]
|
||||
0xf2, 0x4c, 0x0f, 0x2a, 0x41, 0x40, //cvtsi2sd xmm8,QWORD PTR[rcx+0x40]
|
||||
0xf2, 0x4c, 0x0f, 0x2a, 0x49, 0x48, //cvtsi2sd xmm9,QWORD PTR[rcx+0x48]
|
||||
0xf2, 0x48, 0x0f, 0x2a, 0x51, 0x50, //cvtsi2sd xmm2,QWORD PTR[rcx+0x50]
|
||||
0xf2, 0x48, 0x0f, 0x2a, 0x59, 0x58, //cvtsi2sd xmm3,QWORD PTR[rcx+0x58]
|
||||
0xf2, 0x48, 0x0f, 0x2a, 0x61, 0x60, //cvtsi2sd xmm4,QWORD PTR[rcx+0x60]
|
||||
0xf2, 0x48, 0x0f, 0x2a, 0x69, 0x68, //cvtsi2sd xmm5,QWORD PTR[rcx+0x68]
|
||||
0xf2, 0x48, 0x0f, 0x2a, 0x71, 0x70, //cvtsi2sd xmm6,QWORD PTR[rcx+0x70]
|
||||
0xf2, 0x48, 0x0f, 0x2a, 0x79, 0x78, //cvtsi2sd xmm7,QWORD PTR[rcx+0x78]
|
||||
};
|
||||
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
|
||||
const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
|
||||
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
|
||||
const uint8_t* codeReadDatasetR = (uint8_t*)&randomx_program_read_r;
|
||||
const uint8_t* codeReadDatasetF = (uint8_t*)&randomx_program_read_f;
|
||||
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
|
||||
|
||||
const uint8_t epilogue[] = {
|
||||
0x48, 0x8b, 0xe5, //mov rsp,rbp
|
||||
0x59, //pop rcx
|
||||
0x4c, 0x89, 0x01, //mov QWORD PTR [rcx],r8
|
||||
0x4c, 0x89, 0x49, 0x08, //mov QWORD PTR [rcx+0x8],r9
|
||||
0x4c, 0x89, 0x51, 0x10, //mov QWORD PTR [rcx+0x10],r10
|
||||
0x4c, 0x89, 0x59, 0x18, //mov QWORD PTR [rcx+0x18],r11
|
||||
0x4c, 0x89, 0x61, 0x20, //mov QWORD PTR [rcx+0x20],r12
|
||||
0x4c, 0x89, 0x69, 0x28, //mov QWORD PTR [rcx+0x28],r13
|
||||
0x4c, 0x89, 0x71, 0x30, //mov QWORD PTR [rcx+0x30],r14
|
||||
0x4c, 0x89, 0x79, 0x38, //mov QWORD PTR [rcx+0x38],r15
|
||||
0x66, 0x4c, 0x0f, 0x7e, 0x41, 0x40, //movq QWORD PTR [rcx+0x40],xmm8
|
||||
0x66, 0x4c, 0x0f, 0x7e, 0x49, 0x48, //movq QWORD PTR [rcx+0x48],xmm9
|
||||
0x66, 0x48, 0x0f, 0x7e, 0x51, 0x50, //movq QWORD PTR [rcx+0x50],xmm2
|
||||
0x66, 0x48, 0x0f, 0x7e, 0x59, 0x58, //movq QWORD PTR [rcx+0x58],xmm3
|
||||
0x66, 0x48, 0x0f, 0x7e, 0x61, 0x60, //movq QWORD PTR [rcx+0x60],xmm4
|
||||
0x66, 0x48, 0x0f, 0x7e, 0x69, 0x68, //movq QWORD PTR [rcx+0x68],xmm5
|
||||
0x66, 0x48, 0x0f, 0x7e, 0x71, 0x70, //movq QWORD PTR [rcx+0x70],xmm6
|
||||
0x66, 0x48, 0x0f, 0x7e, 0x79, 0x78, //movq QWORD PTR [rcx+0x78],xmm7
|
||||
#ifdef _WIN32
|
||||
0xf3, 0x44, 0x0f, 0x6f, 0x0c, 0x24, //movdqu xmm9,XMMWORD PTR [rsp]
|
||||
0xf3, 0x44, 0x0f, 0x6f, 0x44, 0x24, 0x10, //movdqu xmm8,XMMWORD PTR [rsp+0x10]
|
||||
0xf3, 0x0f, 0x6f, 0x7c, 0x24, 0x20, //movdqu xmm7,XMMWORD PTR [rsp+0x20]
|
||||
0xf3, 0x0f, 0x6f, 0x74, 0x24, 0x30, //movdqu xmm6,XMMWORD PTR [rsp+0x30]
|
||||
0x48, 0x83, 0xc4, 0x48, //add rsp,0x48
|
||||
#endif
|
||||
0x41, 0x5f, //pop r15
|
||||
0x41, 0x5e, //pop r14
|
||||
0x41, 0x5d, //pop r13
|
||||
0x41, 0x5c, //pop r12
|
||||
#ifdef _WIN32
|
||||
0x5e, //pop rsi
|
||||
0x5f, //pop rdi
|
||||
#endif
|
||||
0x5d, //pop rbp
|
||||
0x5b, //pop rbx
|
||||
0xc3, //ret
|
||||
};
|
||||
const int32_t prologueSize = codeProgramBegin - codePrologue;
|
||||
const int32_t epilogueSize = codeReadDatasetR - codeEpilogue;
|
||||
const int32_t readDatasetRSize = codeReadDatasetF - codeReadDatasetR;
|
||||
const int32_t readDatasetFSize = codeProgramEnd - codeReadDatasetF;
|
||||
|
||||
//41 bytes -> 1 cache line
|
||||
const uint8_t readDatasetSub[] = {
|
||||
0x8b, 0x13, //mov edx,DWORD PTR [rbx]
|
||||
0x48, 0x8b, 0x43, 0x08, //mov rax,QWORD PTR [rbx+0x8]
|
||||
0x48, 0x8b, 0x04, 0x10, //mov rax,QWORD PTR [rax+rdx*1]
|
||||
0x83, 0x03, 0x08, //add DWORD PTR [rbx],0x8
|
||||
0x33, 0x4b, 0x04, //xor ecx,DWORD PTR [rbx+0x4]
|
||||
0x89, 0x4b, 0x04, //mov DWORD PTR [rbx+0x4],ecx
|
||||
0xf7, 0xc1, 0xf8, 0xff, 0x00, 0x00, //test ecx,0xfff8
|
||||
0x75, 0x0d, //jne
|
||||
0x83, 0xe1, 0xf8, //and ecx,0xfffffff8
|
||||
0x89, 0x0b, //mov DWORD PTR [rbx],ecx
|
||||
0x48, 0x8b, 0x53, 0x08, //mov rdx,QWORD PTR [rbx+0x8]
|
||||
0x0f, 0x18, 0x0c, 0x0a, //prefetcht0 BYTE PTR [rdx+rcx*1]
|
||||
0xc3, //ret
|
||||
};
|
||||
|
||||
constexpr int getNumCacheLines(size_t size) {
|
||||
return (size + (CacheLineSize - 1)) / CacheLineSize;
|
||||
}
|
||||
|
||||
constexpr int32_t align(int32_t pos, int32_t align) {
|
||||
return ((pos - 1) / align + 1) * align;
|
||||
}
|
||||
|
||||
constexpr int32_t readDatasetSubOffset = CodeSize - CacheLineSize * getNumCacheLines(sizeof(readDatasetSub));
|
||||
constexpr int32_t epilogueOffset = readDatasetSubOffset - CacheLineSize * getNumCacheLines(sizeof(epilogue));
|
||||
constexpr int32_t startOffsetAligned = align(sizeof(prologue), CacheLineSize);
|
||||
const int32_t readDatasetFOffset = CodeSize - readDatasetFSize;
|
||||
const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize;
|
||||
const int32_t epilogueOffset = readDatasetROffset - epilogueSize;
|
||||
|
||||
JitCompilerX86::JitCompilerX86() {
|
||||
#ifdef _WIN32
|
||||
@ -213,24 +120,16 @@ namespace RandomX {
|
||||
if (code == (uint8_t*)-1)
|
||||
throw std::runtime_error("mmap failed");
|
||||
#endif
|
||||
memcpy(code, prologue, sizeof(prologue));
|
||||
codePos = sizeof(prologue);
|
||||
if (startOffsetAligned - codePos > 4) {
|
||||
emitByte(0xeb);
|
||||
emitByte(startOffsetAligned - (codePos + 1));
|
||||
}
|
||||
else {
|
||||
while (codePos < startOffsetAligned)
|
||||
emitByte(0x90); //nop
|
||||
}
|
||||
memcpy(code + readDatasetSubOffset, readDatasetSub, sizeof(readDatasetSub));
|
||||
memcpy(code + epilogueOffset, epilogue, sizeof(epilogue));
|
||||
memcpy(code, codePrologue, prologueSize);
|
||||
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize);
|
||||
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize);
|
||||
memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize);
|
||||
}
|
||||
|
||||
void JitCompilerX86::generateProgram(Pcg32& gen) {
|
||||
instructionOffsets.clear();
|
||||
callOffsets.clear();
|
||||
codePos = startOffsetAligned;
|
||||
codePos = prologueSize;
|
||||
Instruction instr;
|
||||
for (unsigned i = 0; i < ProgramLength; ++i) {
|
||||
for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) {
|
||||
@ -247,7 +146,6 @@ namespace RandomX {
|
||||
instructionOffsets.push_back(codePos);
|
||||
emit(0x840fcfff); //dec edx; jz <epilogue>
|
||||
emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
|
||||
gena(instr);
|
||||
auto generator = engine[instr.opcode];
|
||||
(this->*generator)(instr, i);
|
||||
}
|
||||
@ -258,11 +156,10 @@ namespace RandomX {
|
||||
}
|
||||
}
|
||||
|
||||
void JitCompilerX86::gena(Instruction& instr) {
|
||||
void JitCompilerX86::genar(Instruction& instr) {
|
||||
emit(uint16_t(0x8149)); //xor
|
||||
emitByte(0xf0 + (instr.rega % RegistersCount));
|
||||
emit(instr.addra);
|
||||
int32_t pc;
|
||||
switch (instr.loca & 7)
|
||||
{
|
||||
case 0:
|
||||
@ -272,7 +169,7 @@ namespace RandomX {
|
||||
emit(uint16_t(0x8b41)); //mov
|
||||
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
|
||||
emitByte(0xe8); //call
|
||||
emit(readDatasetSubOffset - (codePos + 4));
|
||||
emit(readDatasetROffset - (codePos + 4));
|
||||
return;
|
||||
|
||||
case 4:
|
||||
@ -293,8 +190,44 @@ namespace RandomX {
|
||||
}
|
||||
}
|
||||
|
||||
//Emits the machine code that fetches the floating-point "A" operand of an instruction.
//First emits an xor using instr.rega and the immediate instr.addra, then selects the
//source by (instr.loca & 7):
//  0-3     -> call to the code at readDatasetFOffset, with rega copied to ecx
//  4       -> scratchpad load masked with ScratchpadL2 - 1
//  default -> scratchpad load masked with ScratchpadL1 - 1
//Both scratchpad paths end with cvtdq2pd xmm0, [rsi+rax*8] (per the inline comments).
void JitCompilerX86::genaf(Instruction& instr) {
|
||||
emit(uint16_t(0x8149)); //xor
|
||||
//register byte: selects the register from rega (reduced modulo RegistersCount)
emitByte(0xf0 + (instr.rega % RegistersCount));
|
||||
//32-bit immediate operand of the xor
emit(instr.addra);
|
||||
switch (instr.loca & 7)
|
||||
{
|
||||
case 0:
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
emit(uint16_t(0x8b41)); //mov
|
||||
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
|
||||
emitByte(0xe8); //call
|
||||
//call displacement, measured from the end of the 4-byte operand (codePos + 4)
emit(readDatasetFOffset - (codePos + 4));
|
||||
return;
|
||||
|
||||
case 4:
|
||||
emit(uint16_t(0x8b41)); //mov
|
||||
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
|
||||
emitByte(0x25); //and
|
||||
emit(ScratchpadL2 - 1); //whole scratchpad
|
||||
//0xf3 prefix byte followed by the 4 opcode/operand bytes of cvtdq2pd
emitByte(0xf3);
|
||||
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
|
||||
return;
|
||||
|
||||
default:
|
||||
//same sequence as case 4, only the address mask differs
emit(uint16_t(0x8b41)); //mov
|
||||
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
|
||||
emitByte(0x25); //and
|
||||
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
||||
emitByte(0xf3);
|
||||
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
if ((instr.locb & 7) <= 3) {
|
||||
emit(uint16_t(0x8b49)); //mov
|
||||
emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb
|
||||
emitByte(0x48); //REX.W
|
||||
@ -330,12 +263,8 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) {
|
||||
emit(0x48f2fffff8002548); //and rax,0xfffffffffffff800; cvtsi2sd xmm0,rax
|
||||
emit(uint16_t(0x2a0f));
|
||||
emitByte(0xc0);
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
int regb = (instr.regb % RegistersCount);
|
||||
emitByte(0xf2); //xxxsd xmm0,regb
|
||||
emitByte(0x66); //xxxpd xmm0,regb
|
||||
if (regb <= 1) {
|
||||
emitByte(0x41); //REX
|
||||
}
|
||||
@ -343,44 +272,30 @@ namespace RandomX {
|
||||
emitByte(opcode);
|
||||
emitByte(0xc0 + regb);
|
||||
}
|
||||
else {
|
||||
convertible_t bimm;
|
||||
bimm.f64 = (double)instr.imm32;
|
||||
emit(uint16_t(0xb848)); //movabs rax,imm64
|
||||
emit(bimm.i64);
|
||||
emitByte(0x66); //movq xmm1,rax
|
||||
emit(0xc86e0f48);
|
||||
emit(uint16_t(0x0ff2)); //xxxsd xmm0,xmm1
|
||||
emitByte(opcode);
|
||||
emitByte(0xc1);
|
||||
}
|
||||
|
||||
|
||||
//Emits the code that stores the integer result (held in rax) into the scratchpad.
//The slot index is computed in eax as (regc ^ instr.addrc) & (scratchpadSize - 1),
//and the 64-bit value is written to [rsi + rax*8] (per the inline comments).
//scratchpadSize: size used to build the index mask; callers in view pass
//ScratchpadL1 or ScratchpadL2.
//NOTE(review): the mask only works as a wrap-around if scratchpadSize is a power of two - confirm.
void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) {
|
||||
//the result is saved in rcx because eax is reused for the address computation
emit(0x41c88b48); //mov rcx, rax; REX
|
||||
emitByte(0x8b); // mov
|
||||
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
|
||||
emitByte(0x35); // xor eax
|
||||
emit(instr.addrc);
|
||||
emitByte(0x25); //and
|
||||
emit(scratchpadSize - 1);
|
||||
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
|
||||
}
|
||||
|
||||
void JitCompilerX86::gencr(Instruction& instr) {
|
||||
switch (instr.locc & 7)
|
||||
{
|
||||
case 0:
|
||||
emit(0x41c88b48); //mov rcx, rax; REX
|
||||
emitByte(0x8b); // mov
|
||||
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
|
||||
emitByte(0x35); // xor eax
|
||||
emit(instr.addrc);
|
||||
emitByte(0x25); //and
|
||||
emit(ScratchpadL2 - 1); //whole scratchpad
|
||||
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
|
||||
scratchpadStoreR(instr, ScratchpadL2);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
emit(0x41c88b48); //mov rcx, rax; REX
|
||||
emitByte(0x8b); // mov
|
||||
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
|
||||
emitByte(0x35); // xor eax
|
||||
emit(instr.addrc);
|
||||
emitByte(0x25); //and
|
||||
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
||||
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
|
||||
scratchpadStoreR(instr, ScratchpadL1);
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -390,66 +305,75 @@ namespace RandomX {
|
||||
}
|
||||
}
|
||||
|
||||
void JitCompilerX86::gencf(Instruction& instr) {
|
||||
int regc = (instr.regc % RegistersCount);
|
||||
switch (instr.locc & 7)
|
||||
{
|
||||
case 0:
|
||||
void JitCompilerX86::scratchpadStoreF(Instruction& instr, int regc, uint32_t scratchpadSize, bool storeHigh) {
|
||||
emit(uint16_t(0x8b41)); //mov
|
||||
emitByte(0xc0 + regc); //eax, regc
|
||||
emitByte(0x35); // xor eax
|
||||
emit(instr.addrc);
|
||||
emitByte(0x25); //and
|
||||
emit(ScratchpadL2 - 1); //whole scratchpad
|
||||
emit(uint16_t(0x4866)); //prefix
|
||||
emit(0xc6047e0f); // movq QWORD PTR [rsi+rax*8],xmm0
|
||||
break;
|
||||
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
emit(uint16_t(0x8b41)); //mov
|
||||
emitByte(0xc0 + regc); //eax, regc
|
||||
emitByte(0x35); // xor eax
|
||||
emit(instr.addrc);
|
||||
emitByte(0x25); //and
|
||||
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
||||
emit(uint16_t(0x4866)); //prefix
|
||||
emit(0xc6047e0f); // movq QWORD PTR [rsi+rax*8],xmm0
|
||||
break;
|
||||
|
||||
default:
|
||||
emitByte(0xf2);
|
||||
emit(scratchpadSize - 1);
|
||||
emitByte(0x66); //movhpd/movlpd QWORD PTR [rsi+rax*8], regc
|
||||
if (regc <= 1) {
|
||||
emitByte(0x44); //REX
|
||||
}
|
||||
emit(uint16_t(0x100f)); //movsd
|
||||
emitByte(0x0f);
|
||||
emitByte(storeHigh ? 0x17 : 0x13);
|
||||
emitByte(4 + 8 * regc);
|
||||
emitByte(0xc6);
|
||||
}
|
||||
|
||||
//Emits the code that writes back a floating-point result.
//alwaysLow == false: first emits "movaps regc, xmm0" to copy the result into the
//  destination register, and lets bit 3 of instr.locc choose the high or low half
//  for the scratchpad store.
//alwaysLow == true: no register copy is emitted and storeHigh is always false.
//The scratchpad store itself depends on (instr.locc & 7):
//  4 -> scratchpadStoreF with ScratchpadL2, 5-7 -> with ScratchpadL1, 0-3 -> no store.
void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) {
|
||||
int regc = (instr.regc % RegistersCount);
|
||||
if (!alwaysLow) {
|
||||
//registers 0 and 1 need a REX prefix (f0/f1 live in xmm8/xmm9 per the register allocation notes)
if (regc <= 1) {
|
||||
emitByte(0x44); //REX
|
||||
}
|
||||
emit(uint16_t(0x280f)); //movaps
|
||||
emitByte(0xc0 + 8 * regc); // regc, xmm0
|
||||
}
|
||||
switch (instr.locc & 7)
|
||||
{
|
||||
case 4:
|
||||
scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8));
|
||||
break;
|
||||
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8));
|
||||
break;
|
||||
|
||||
default:
|
||||
//values 0-3: nothing is written to the scratchpad
break;
|
||||
}
|
||||
}
|
||||
|
||||
//Handler for ADD_64: emits code by chaining genar, genbr1 and gencr. Parameter i is unused.
//NOTE(review): 0x0349 / 0x0548 are presumably the register-form and immediate-form
//opcode words (cf. genbr0's opcodeReg/opcodeImm) - confirm against genbr1.
void JitCompilerX86::h_ADD_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr1(instr, 0x0349, 0x0548);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for ADD_32: emits code by chaining genar, genbr132 and gencr. Parameter i is unused.
//NOTE(review): 0x0341 / 0x05 are presumably the register-form and immediate-form
//opcodes of the 32-bit operation - confirm against genbr132.
void JitCompilerX86::h_ADD_32(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr132(instr, 0x0341, 0x05);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for SUB_64: emits code by chaining genar, genbr1 and gencr. Parameter i is unused.
//NOTE(review): 0x2b49 / 0x2d48 are presumably the register-form and immediate-form
//opcode words - confirm against genbr1.
void JitCompilerX86::h_SUB_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr1(instr, 0x2b49, 0x2d48);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for SUB_32: emits code by chaining genar, genbr132 and gencr. Parameter i is unused.
//NOTE(review): 0x2b41 / 0x2d are presumably the register-form and immediate-form
//opcodes of the 32-bit operation - confirm against genbr132.
void JitCompilerX86::h_SUB_32(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr132(instr, 0x2b41, 0x2d);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_MUL_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
emitByte(0x49); //REX
|
||||
emit(uint16_t(0xaf0f)); // imul rax, r64
|
||||
@ -464,6 +388,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_MULH_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
emit(uint16_t(0x8b49)); //mov rcx, r64
|
||||
emitByte(0xc8 + (instr.regb % RegistersCount));
|
||||
@ -481,6 +406,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_MUL_32(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
emit(uint16_t(0xc88b)); //mov ecx, eax
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
emit(uint16_t(0x8b41)); // mov eax, r32
|
||||
@ -495,6 +421,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
emitByte(0x48);
|
||||
emit(uint16_t(0xc863)); //movsxd rcx,eax
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
@ -511,6 +438,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
emit(uint16_t(0x8b49)); //mov rcx, r64
|
||||
emitByte(0xc8 + (instr.regb % RegistersCount));
|
||||
@ -528,6 +456,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_DIV_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
emitByte(0xb9); //mov ecx, 1
|
||||
emit(1);
|
||||
@ -546,6 +475,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
if ((instr.locb & 7) <= 5) {
|
||||
emit(uint16_t(0x8b41)); //mov edx, r32
|
||||
emitByte(0xd0 + (instr.regb % RegistersCount));
|
||||
@ -563,100 +493,127 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
//Handler for AND_64: emits code by chaining genar, genbr1 and gencr. Parameter i is unused.
//NOTE(review): 0x2349 / 0x2548 are presumably the register-form and immediate-form
//opcode words - confirm against genbr1.
void JitCompilerX86::h_AND_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr1(instr, 0x2349, 0x2548);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for AND_32: emits code by chaining genar, genbr132 and gencr. Parameter i is unused.
//NOTE(review): 0x2341 / 0x25 are presumably the register-form and immediate-form
//opcodes of the 32-bit operation - confirm against genbr132.
void JitCompilerX86::h_AND_32(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr132(instr, 0x2341, 0x25);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for OR_64: emits code by chaining genar, genbr1 and gencr. Parameter i is unused.
//NOTE(review): 0x0b49 / 0x0d48 are presumably the register-form and immediate-form
//opcode words - confirm against genbr1.
void JitCompilerX86::h_OR_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr1(instr, 0x0b49, 0x0d48);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for OR_32: emits code by chaining genar, genbr132 and gencr. Parameter i is unused.
//NOTE(review): 0x0b41 / 0x0d are presumably the register-form and immediate-form
//opcodes of the 32-bit operation - confirm against genbr132.
void JitCompilerX86::h_OR_32(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr132(instr, 0x0b41, 0x0d);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for XOR_64: emits code by chaining genar, genbr1 and gencr. Parameter i is unused.
//NOTE(review): 0x3349 / 0x3548 are presumably the register-form and immediate-form
//opcode words - confirm against genbr1.
void JitCompilerX86::h_XOR_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr1(instr, 0x3349, 0x3548);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for XOR_32: emits code by chaining genar, genbr132 and gencr. Parameter i is unused.
//NOTE(review): 0x3341 / 0x35 are presumably the register-form and immediate-form
//opcodes of the 32-bit operation - confirm against genbr132.
void JitCompilerX86::h_XOR_32(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr132(instr, 0x3341, 0x35);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for SHL_64: emits code by chaining genar, genbr0 and gencr. Parameter i is unused.
//0xe0d3 is passed as genbr0's opcodeReg and 0xe0c1 as its opcodeImm.
void JitCompilerX86::h_SHL_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr0(instr, 0xe0d3, 0xe0c1);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for SHR_64: emits code by chaining genar, genbr0 and gencr. Parameter i is unused.
//0xe8d3 is passed as genbr0's opcodeReg and 0xe8c1 as its opcodeImm.
void JitCompilerX86::h_SHR_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr0(instr, 0xe8d3, 0xe8c1);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for SAR_64: emits code by chaining genar, genbr0 and gencr. Parameter i is unused.
//0xf8d3 is passed as genbr0's opcodeReg and 0xf8c1 as its opcodeImm.
void JitCompilerX86::h_SAR_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr0(instr, 0xf8d3, 0xf8c1);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for ROL_64: emits code by chaining genar, genbr0 and gencr. Parameter i is unused.
//0xc0d3 is passed as genbr0's opcodeReg and 0xc0c1 as its opcodeImm.
void JitCompilerX86::h_ROL_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr0(instr, 0xc0d3, 0xc0c1);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for ROR_64: emits code by chaining genar, genbr0 and gencr. Parameter i is unused.
//0xc8d3 is passed as genbr0's opcodeReg and 0xc8c1 as its opcodeImm.
void JitCompilerX86::h_ROR_64(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
genbr0(instr, 0xc8d3, 0xc8c1);
|
||||
gencr(instr);
|
||||
}
|
||||
|
||||
//Handler for FPADD: emits code by chaining genaf, genbf and gencf. Parameter i is unused.
//0x58 is passed as genbf's opcode byte; gencf is called with alwaysLow left at its default.
void JitCompilerX86::h_FPADD(Instruction& instr, int i) {
|
||||
genaf(instr);
|
||||
genbf(instr, 0x58);
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
//Handler for FPSUB: emits code by chaining genaf, genbf and gencf. Parameter i is unused.
//0x5c is passed as genbf's opcode byte; gencf is called with alwaysLow left at its default.
void JitCompilerX86::h_FPSUB(Instruction& instr, int i) {
|
||||
genaf(instr);
|
||||
genbf(instr, 0x5c);
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_FPMUL(Instruction& instr, int i) {
|
||||
emit(uint16_t(0x0d48)); //or rax,0x800
|
||||
emit(0x00000800);
|
||||
genaf(instr);
|
||||
genbf(instr, 0x59);
|
||||
emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
|
||||
emit(uint16_t(0x540f)); //andps xmm0,xmm1
|
||||
emitByte(0xc1);
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_FPDIV(Instruction& instr, int i) {
|
||||
emit(uint16_t(0x0d48)); //or rax,0x800
|
||||
emit(0x00000800);
|
||||
genaf(instr);
|
||||
genbf(instr, 0x5e);
|
||||
emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
|
||||
emit(uint16_t(0x540f)); //andps xmm0,xmm1
|
||||
emitByte(0xc1);
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) {
|
||||
emit(uint16_t(0xb948)); //or movabs rcx, imm64
|
||||
emit(0x7ffffffffffff800);
|
||||
emit(0xc02a0f48f2c12348); //and rax,rcx; cvtsi2sd xmm0,rax
|
||||
emit(0xc0510ff2); //sqrtsd xmm0,xmm0
|
||||
genaf(instr);
|
||||
emit(0xc0510f66c2540f41); //andps xmm0,xmm10; sqrtpd xmm0,xmm0
|
||||
gencf(instr);
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_FPROUND(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
emit(0x81480de0c1c88b48);
|
||||
emit(0x600025fffff800e1);
|
||||
emit(0x0dc12a0f48f20000);
|
||||
emit(uint16_t(0x0000));
|
||||
emitByte(0xf2);
|
||||
int regc = (instr.regc % RegistersCount);
|
||||
if (regc <= 1) {
|
||||
emitByte(0x4c); //REX
|
||||
}
|
||||
else {
|
||||
emitByte(0x48); //REX
|
||||
}
|
||||
emit(uint16_t(0x2a0f));
|
||||
emitByte(0xc1 + 8 * regc);
|
||||
emitByte(0x0d);
|
||||
emit(0xf824448900009fc0);
|
||||
emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8]
|
||||
emitByte(0xf8);
|
||||
gencf(instr);
|
||||
gencf(instr, true);
|
||||
}
|
||||
|
||||
static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) {
|
||||
@ -682,6 +639,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_CALL(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
emit(uint16_t(0x8141)); //cmp regb, imm32
|
||||
emitByte(0xf8 + (instr.regb % RegistersCount));
|
||||
emit(instr.imm32);
|
||||
@ -707,6 +665,7 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_RET(Instruction& instr, int i) {
|
||||
genar(instr);
|
||||
int crlen = 0;
|
||||
if ((instr.locc & 7) <= 3) {
|
||||
crlen = 17;
|
||||
@ -756,4 +715,6 @@ namespace RandomX {
|
||||
INST_HANDLE(CALL)
|
||||
INST_HANDLE(RET)
|
||||
};
|
||||
|
||||
#endif
|
||||
}
|
@ -58,13 +58,16 @@ namespace RandomX {
|
||||
std::vector<int32_t> instructionOffsets;
|
||||
std::vector<CallOffset> callOffsets;
|
||||
|
||||
void gena(Instruction&);
|
||||
void genar(Instruction&);
|
||||
void genaf(Instruction&);
|
||||
void genbr0(Instruction&, uint16_t, uint16_t);
|
||||
void genbr1(Instruction&, uint16_t, uint16_t);
|
||||
void genbr132(Instruction&, uint16_t, uint8_t);
|
||||
void genbf(Instruction&, uint8_t);
|
||||
void scratchpadStoreR(Instruction&, uint32_t);
|
||||
void scratchpadStoreF(Instruction&, int, uint32_t, bool);
|
||||
void gencr(Instruction&);
|
||||
void gencf(Instruction&);
|
||||
void gencf(Instruction&, bool);
|
||||
void generateCode(Instruction&, int);
|
||||
void fixCallOffsets();
|
||||
|
||||
|
@ -21,33 +21,36 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
#include <iomanip>
|
||||
#include <limits>
|
||||
#include "instructions.hpp"
|
||||
#include "Pcg32.hpp"
|
||||
//#define DEBUG
|
||||
|
||||
using namespace RandomX;
|
||||
|
||||
typedef void(*VmOperation)(convertible_t&, convertible_t&, convertible_t&);
|
||||
|
||||
uint64_t rxRound(uint32_t mode, int64_t x, int64_t y, VmOperation op) {
|
||||
convertible_t a, b, c;
|
||||
a.u64 = mode;
|
||||
FPROUND(a, b, c);
|
||||
#ifdef DEBUG
|
||||
a.f64 = convertToDouble(x);
|
||||
b.f64 = convertToDouble(y);
|
||||
std::cout << std::hex << (uint64_t)x << " -> " << a.u64 << std::endl;
|
||||
std::cout << std::hex << (uint64_t)y << " -> " << b.u64 << std::endl;
|
||||
std::cout << std::dec;
|
||||
#endif
|
||||
a.i64 = x;
|
||||
b.i64 = y;
|
||||
op(a, b, c);
|
||||
return c.u64;
|
||||
}
|
||||
typedef void(*FpuOperation)(convertible_t&, fpu_reg_t&, fpu_reg_t&);
|
||||
|
||||
#define CATCH_CONFIG_MAIN
|
||||
#include "catch.hpp"
|
||||
|
||||
//Test helper: runs one FPU operation after calling FPROUND with the given mode and
//returns the raw bits of the low half of the result.
//  mode       - value placed in a.u64 for the FPROUND call
//               NOTE(review): presumably selects the rounding mode - confirm in FPROUND
//  x          - integer input for operand a
//  y          - raw 64-bit pattern copied into both halves of operand b
//  op         - operation under test, called as op(a, b, c)
//  hiEqualsLo - when true, x is written to both 32-bit halves of a and the helper
//               CHECKs that both halves of the result are bit-identical;
//               when false, x is written to a as one 64-bit integer and no check is made
uint64_t rxRound(uint32_t mode, int64_t x, int64_t y, FpuOperation op, bool hiEqualsLo = true) {
|
||||
convertible_t a;
|
||||
fpu_reg_t b, c;
|
||||
a.u64 = mode;
|
||||
//b and c are still uninitialized here; this call is made before the operands are filled in
FPROUND(a, b, c);
|
||||
if (hiEqualsLo) {
|
||||
//x is narrowed from int64_t on assignment to the 32-bit halves
a.i32lo = x;
|
||||
a.i32hi = x;
|
||||
}
|
||||
else {
|
||||
a.i64 = x;
|
||||
}
|
||||
b.lo.i64 = y;
|
||||
b.hi.i64 = y;
|
||||
op(a, b, c);
|
||||
if (hiEqualsLo) {
|
||||
//identical inputs in both lanes must give identical outputs
CHECK(c.lo.u64 == c.hi.u64);
|
||||
}
|
||||
return c.lo.u64;
|
||||
}
|
||||
|
||||
#define RX_EXECUTE_U64(va, vb, INST) do { \
|
||||
a.u64 = va; \
|
||||
b.u64 = vb; \
|
||||
@ -273,118 +276,126 @@ TEST_CASE("Circular right shift (64-bit)", "[ROR_64]") {
|
||||
|
||||
TEST_CASE("Denormal results are not produced", "[FTZ]") {
|
||||
FPINIT();
|
||||
convertible_t a, b, c;
|
||||
a.i64 = 2048;
|
||||
FPDIV(a, DBL_MAX, c);
|
||||
convertible_t a;
|
||||
fpu_reg_t b;
|
||||
a.i64 = 1;
|
||||
b.lo.f64 = DBL_MAX;
|
||||
FPDIV(a, b, b);
|
||||
#ifdef DEBUG
|
||||
std::cout << a.i64 << " / " << DBL_MAX << " = " << std::hex << c.u64 << std::endl;
|
||||
std::cout << a.i64 << " / " << DBL_MAX << " = " << std::hex << b.lo.u64 << std::endl;
|
||||
#endif
|
||||
REQUIRE(std::fpclassify(c.f64) != FP_SUBNORMAL);
|
||||
b.f64 = c.f64;
|
||||
CHECK(std::fpclassify(b.lo.f64) != FP_SUBNORMAL);
|
||||
a.i64 = 0;
|
||||
FPSUB_64(a, b, c);
|
||||
FPSUB(a, b, b);
|
||||
#ifdef DEBUG
|
||||
std::cout << a.i64 << " - " << b.f64 << " = " << std::hex << c.u64 << std::endl;
|
||||
std::cout << a.i64 << " - " << b.lo.f64 << " = " << std::hex << b.lo.u64 << std::endl;
|
||||
#endif
|
||||
CHECK(std::fpclassify(c.f64) != FP_SUBNORMAL);
|
||||
CHECK(std::fpclassify(b.lo.f64) != FP_SUBNORMAL);
|
||||
}
|
||||
|
||||
TEST_CASE("NaN results are not produced", "[NAN]") {
|
||||
FPINIT();
|
||||
convertible_t a, c;
|
||||
convertible_t a;
|
||||
fpu_reg_t b;
|
||||
a.i64 = 0;
|
||||
FPDIV(a, 0, c);
|
||||
CHECK(std::fpclassify(c.f64) != FP_NAN);
|
||||
FPMUL(a, std::numeric_limits<double>::infinity(), c);
|
||||
CHECK(std::fpclassify(c.f64) != FP_NAN);
|
||||
b.lo.f64 = 0;
|
||||
FPDIV(a, b, b);
|
||||
CHECK(std::fpclassify(b.lo.f64) != FP_NAN);
|
||||
b.lo.f64 = std::numeric_limits<double>::infinity();
|
||||
FPMUL(a, b, b);
|
||||
CHECK(std::fpclassify(b.lo.f64) != FP_NAN);
|
||||
}
|
||||
|
||||
volatile int64_t fpAdda = 7379480244170225589;
|
||||
volatile int64_t fpAddb = -438072579179686797;
|
||||
volatile int64_t fpSuba = 2939258788088626026;
|
||||
volatile int64_t fpSubb = 4786131045320678734;
|
||||
volatile int64_t fpMula1 = 8399833736388895639;
|
||||
volatile int64_t fpMulb1 = 5671608020317594922;
|
||||
volatile int64_t fpMula2 = -7094299423744805450;
|
||||
volatile int64_t fpMulb2 = 4982086006202596504;
|
||||
volatile int64_t fpDiva1 = 8399833736388895639;
|
||||
volatile int64_t fpDivb1 = 5671608020317594922;
|
||||
volatile int64_t fpDiva2 = -7434878587645025912;
|
||||
volatile int64_t fpDivb2 = 5266243837734830806;
|
||||
volatile int64_t fpSqrta = -7594301562963134542;
|
||||
volatile int64_t fpRounda = 7379480244170225589;
|
||||
volatile int32_t fpAdda = -2110701072;
|
||||
volatile int64_t fpAddb = 5822431907862180274;
|
||||
volatile int32_t fpSuba = -1651770302;
|
||||
volatile int64_t fpSubb = 4982086006202596504;
|
||||
volatile int32_t fpMula1 = 122885310;
|
||||
volatile int64_t fpMulb1 = 6036690890763685020;
|
||||
volatile int32_t fpMula2 = -1952486466;
|
||||
volatile int64_t fpMulb2 = 5693689137909219638;
|
||||
volatile int32_t fpDiva1 = -1675630642;
|
||||
volatile int64_t fpDivb1 = -3959960229647489051;
|
||||
volatile int32_t fpDiva2 = -1651770302;
|
||||
volatile int64_t fpDivb2 = 4982086006202596504;
|
||||
volatile int32_t fpSqrta1 = 440505508;
|
||||
volatile int32_t fpSqrta2 = -2147483648;
|
||||
|
||||
TEST_CASE("IEEE-754 compliance", "[FPU]") {
|
||||
FPINIT();
|
||||
convertible_t a, b, c;
|
||||
convertible_t a;
|
||||
fpu_reg_t b, c;
|
||||
b.lo.f64 = 0.0;
|
||||
|
||||
a.i64 = 2048;
|
||||
FPDIV(a, 0, c);
|
||||
CHECK(c.f64 == std::numeric_limits<double>::infinity());
|
||||
a.i64 = 1;
|
||||
FPDIV(a, b, c);
|
||||
CHECK(c.lo.f64 == std::numeric_limits<double>::infinity());
|
||||
|
||||
a.i64 = -2048;
|
||||
FPDIV(a, 0, c);
|
||||
CHECK(c.f64 == -std::numeric_limits<double>::infinity());
|
||||
a.i64 = -1;
|
||||
FPDIV(a, b, c);
|
||||
CHECK(c.lo.f64 == -std::numeric_limits<double>::infinity());
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "FPROUND" << std::endl;
|
||||
#endif
|
||||
CHECK(rxRound(RoundToNearest, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
|
||||
CHECK(rxRound(RoundDown, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
|
||||
CHECK(rxRound(RoundUp, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
|
||||
CHECK(rxRound(RoundToZero, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
|
||||
|
||||
CHECK(rxRound(RoundToNearest, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
|
||||
CHECK(rxRound(RoundDown, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
|
||||
CHECK(rxRound(RoundUp, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
|
||||
CHECK(rxRound(RoundToZero, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
|
||||
CHECK(rxRound(RoundToNearest, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
|
||||
CHECK(rxRound(RoundDown, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
|
||||
CHECK(rxRound(RoundUp, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
|
||||
CHECK(rxRound(RoundToZero, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "FPADD" << std::endl;
|
||||
#endif
|
||||
CHECK(rxRound(RoundToNearest, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d473U);
|
||||
CHECK(rxRound(RoundDown, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d473U);
|
||||
CHECK(rxRound(RoundUp, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d472U);
|
||||
CHECK(rxRound(RoundToZero, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d472U);
|
||||
CHECK(rxRound(RoundToNearest, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b2U);
|
||||
CHECK(rxRound(RoundDown, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b1U);
|
||||
CHECK(rxRound(RoundUp, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b2U);
|
||||
CHECK(rxRound(RoundToZero, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b1U);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "FPSUB" << std::endl;
|
||||
#endif
|
||||
CHECK(rxRound(RoundToNearest, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c49U);
|
||||
CHECK(rxRound(RoundDown, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c48U);
|
||||
CHECK(rxRound(RoundUp, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c49U);
|
||||
CHECK(rxRound(RoundToZero, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c48U);
|
||||
CHECK(rxRound(RoundToNearest, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c99U);
|
||||
CHECK(rxRound(RoundDown, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c99U);
|
||||
CHECK(rxRound(RoundUp, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c98U);
|
||||
CHECK(rxRound(RoundToZero, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c98U);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "FPMUL" << std::endl;
|
||||
#endif
|
||||
CHECK(rxRound(RoundToNearest, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e9U);
|
||||
CHECK(rxRound(RoundDown, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e8U);
|
||||
CHECK(rxRound(RoundUp, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e9U);
|
||||
CHECK(rxRound(RoundToZero, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e8U);
|
||||
CHECK(rxRound(RoundToNearest, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24542U);
|
||||
CHECK(rxRound(RoundDown, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24541U);
|
||||
CHECK(rxRound(RoundUp, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24542U);
|
||||
CHECK(rxRound(RoundToZero, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24541U);
|
||||
|
||||
CHECK(rxRound(RoundToNearest, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c583U);
|
||||
CHECK(rxRound(RoundDown, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c583U);
|
||||
CHECK(rxRound(RoundUp, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c582U);
|
||||
CHECK(rxRound(RoundToZero, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c582U);
|
||||
CHECK(rxRound(RoundToNearest, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a7470U);
|
||||
CHECK(rxRound(RoundDown, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a7470U);
|
||||
CHECK(rxRound(RoundUp, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a746fU);
|
||||
CHECK(rxRound(RoundToZero, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a746fU);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "FPDIV" << std::endl;
|
||||
#endif
|
||||
CHECK(rxRound(RoundToNearest, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81cU);
|
||||
CHECK(rxRound(RoundDown, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81bU);
|
||||
CHECK(rxRound(RoundUp, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81cU);
|
||||
CHECK(rxRound(RoundToZero, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81bU);
|
||||
CHECK(rxRound(RoundToNearest, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb0aU);
|
||||
CHECK(rxRound(RoundDown, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb09U);
|
||||
CHECK(rxRound(RoundUp, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb0aU);
|
||||
CHECK(rxRound(RoundToZero, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb09U);
|
||||
|
||||
CHECK(rxRound(RoundToNearest, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fccU);
|
||||
CHECK(rxRound(RoundDown, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fccU);
|
||||
CHECK(rxRound(RoundUp, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fcbU);
|
||||
CHECK(rxRound(RoundToZero, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fcbU);
|
||||
CHECK(rxRound(RoundToNearest, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71cU);
|
||||
CHECK(rxRound(RoundDown, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71cU);
|
||||
CHECK(rxRound(RoundUp, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71bU);
|
||||
CHECK(rxRound(RoundToZero, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71bU);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "FPSQRT" << std::endl;
|
||||
#endif
|
||||
CHECK(rxRound(RoundToNearest, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2dU);
|
||||
CHECK(rxRound(RoundDown, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2cU);
|
||||
CHECK(rxRound(RoundUp, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2dU);
|
||||
CHECK(rxRound(RoundToZero, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2cU);
|
||||
CHECK(rxRound(RoundToNearest, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19dU);
|
||||
CHECK(rxRound(RoundDown, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19cU);
|
||||
CHECK(rxRound(RoundUp, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19dU);
|
||||
CHECK(rxRound(RoundToZero, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19cU);
|
||||
|
||||
CHECK(rxRound(RoundToNearest, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bcdU);
|
||||
CHECK(rxRound(RoundDown, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bccU);
|
||||
CHECK(rxRound(RoundUp, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bcdU);
|
||||
CHECK(rxRound(RoundToZero, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bccU);
|
||||
}
|
||||
|
@ -24,8 +24,19 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
#include "t1ha/t1ha.h"
|
||||
#include "blake2/blake2.h"
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
|
||||
// Writes a human-readable dump of a register file to `os`: first every integer register
// ("rN"), then both halves of every floating point register ("fN", hi half first, lo half on
// the following line). Integers are printed in hexadecimal; the stream is switched back to
// decimal after every register. Returns `os` to allow chaining.
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
	for (int idx = 0; idx < RandomX::RegistersCount; ++idx) {
		os << std::hex;
		os << "r" << idx << " = " << rf.r[idx].u64 << std::endl;
		os << std::dec;
	}
	for (int idx = 0; idx < RandomX::RegistersCount; ++idx) {
		const auto& fpReg = rf.f[idx];
		os << std::hex;
		// Each half is shown as its raw 64-bit pattern followed by its value as a double.
		os << "f" << idx << " = " << fpReg.hi.u64 << " (" << fpReg.hi.f64 << ")" << std::endl;
		os << " = " << fpReg.lo.u64 << " (" << fpReg.lo.f64 << ")" << std::endl;
		os << std::dec;
	}
	return os;
}
|
||||
|
||||
namespace RandomX {
|
||||
|
||||
// Constructs a VM: stores the softAes flag, sets lightClient to false and
// sets the dataset pointer (mem.ds.dataset) to nullptr.
VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) {
|
||||
mem.ds.dataset = nullptr;
|
||||
}
|
||||
@ -83,9 +94,10 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void VirtualMachine::getResult(void* out) {
|
||||
uint64_t smallState[sizeof(RegisterFile) / sizeof(uint64_t) + 2];
|
||||
constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 2;
|
||||
uint64_t smallState[smallStateLength];
|
||||
memcpy(smallState, ®, sizeof(RegisterFile));
|
||||
smallState[17] = t1ha2_atonce128(&smallState[16], scratchpad, ScratchpadSize, reg.r[0].u64);
|
||||
smallState[smallStateLength - 1] = t1ha2_atonce128(&smallState[smallStateLength - 2], scratchpad, ScratchpadSize, reg.r[0].u64);
|
||||
blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0);
|
||||
}
|
||||
}
|
@ -32,11 +32,14 @@ namespace RandomX {
|
||||
virtual void initializeProgram(const void* seed) = 0;
|
||||
virtual void execute() = 0;
|
||||
void getResult(void*);
|
||||
// Returns a read-only reference to this VM's register file (the `reg` member).
const RegisterFile& getRegisterFile() {
|
||||
return reg;
|
||||
}
|
||||
protected:
|
||||
bool softAes, lightClient;
|
||||
RegisterFile reg;
|
||||
MemoryRegisters mem;
|
||||
DatasetReadFunc readDataset;
|
||||
alignas(16) RegisterFile reg;
|
||||
MemoryRegisters mem;
|
||||
alignas(16) convertible_t scratchpad[ScratchpadLength];
|
||||
};
|
||||
}
|
12
src/asm/program_epilogue_linux.inc
Normal file
12
src/asm/program_epilogue_linux.inc
Normal file
@ -0,0 +1,12 @@
|
||||
;# Linux epilogue: writes the VM registers back (program_epilogue_store.inc), then pops the
;# registers pushed by program_prologue_linux.inc in the reverse order and returns.
#include "program_epilogue_store.inc"
|
||||
|
||||
;# restore callee-saved registers - System V AMD64 ABI
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbp
|
||||
pop rbx
|
||||
|
||||
;# program finished
|
||||
ret 0
|
22
src/asm/program_epilogue_store.inc
Normal file
22
src/asm/program_epilogue_store.inc
Normal file
@ -0,0 +1,22 @@
|
||||
;# Stores the VM state back into the RegisterFile.
;# rbp was set by program_prologue_load.inc right after the prologue pushed the RegisterFile
;# pointer, so restoring rsp from rbp makes that pointer the top of the stack.
;# unroll VM stack
|
||||
mov rsp, rbp
|
||||
|
||||
;# save VM register values
|
||||
;# rcx = RegisterFile pointer pushed by the prologue
pop rcx
|
||||
;# integer registers r8-r15 -> byte offsets 0..63
mov qword ptr [rcx+0], r8
|
||||
mov qword ptr [rcx+8], r9
|
||||
mov qword ptr [rcx+16], r10
|
||||
mov qword ptr [rcx+24], r11
|
||||
mov qword ptr [rcx+32], r12
|
||||
mov qword ptr [rcx+40], r13
|
||||
mov qword ptr [rcx+48], r14
|
||||
mov qword ptr [rcx+56], r15
|
||||
;# floating point registers, 16 bytes each, starting at byte offset 64
;# (movdqa is the aligned form, so the RegisterFile has to be 16-byte aligned)
movdqa xmmword ptr [rcx+64], xmm8
|
||||
movdqa xmmword ptr [rcx+80], xmm9
|
||||
movdqa xmmword ptr [rcx+96], xmm2
|
||||
movdqa xmmword ptr [rcx+112], xmm3
|
||||
;# re-base rcx by 64 so the last four registers reuse the same displacements
;# (presumably to keep them within the signed 8-bit displacement range - TODO confirm)
lea rcx, [rcx+64]
|
||||
movdqa xmmword ptr [rcx+64], xmm4
|
||||
movdqa xmmword ptr [rcx+80], xmm5
|
||||
movdqa xmmword ptr [rcx+96], xmm6
|
||||
movdqa xmmword ptr [rcx+112], xmm7
|
20
src/asm/program_epilogue_win64.inc
Normal file
20
src/asm/program_epilogue_win64.inc
Normal file
@ -0,0 +1,20 @@
|
||||
;# Windows epilogue: writes the VM registers back (program_epilogue_store.inc), then undoes
;# program_prologue_win64.inc: reloads xmm6-xmm10 from the 80-byte save area, releases it,
;# pops the general purpose registers in the reverse order of the pushes and returns.
include program_epilogue_store.inc
|
||||
|
||||
;# restore callee-saved registers - Microsoft x64 calling convention
|
||||
movdqu xmm10, xmmword ptr [rsp]
|
||||
movdqu xmm9, xmmword ptr [rsp+16]
|
||||
movdqu xmm8, xmmword ptr [rsp+32]
|
||||
movdqu xmm7, xmmword ptr [rsp+48]
|
||||
movdqu xmm6, xmmword ptr [rsp+64]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rbp
|
||||
pop rbx
|
||||
|
||||
;# program finished
|
||||
ret 0
|
17
src/asm/program_prologue_linux.inc
Normal file
17
src/asm/program_prologue_linux.inc
Normal file
@ -0,0 +1,17 @@
|
||||
;# Linux prologue: saves the callee-saved registers, moves the three function arguments
;# (rdi, rsi, rdx) into the registers the VM code uses, runs the common register load
;# (program_prologue_load.inc) and jumps to the generated program.
;# callee-saved registers - System V AMD64 ABI
|
||||
push rbx
|
||||
push rbp
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
;# function arguments
|
||||
;# the RegisterFile pointer is pushed so the epilogue can pop it back when storing registers
push rdi ;# RegisterFile& registerFile
|
||||
mov rbx, rsi ;# MemoryRegisters& memory
|
||||
mov rsi, rdx ;# convertible_t* scratchpad
|
||||
;# program_prologue_load.inc addresses the register file through rcx
mov rcx, rdi
|
||||
|
||||
#include "program_prologue_load.inc"
|
||||
|
||||
jmp randomx_program_begin
|
63
src/asm/program_prologue_load.inc
Normal file
63
src/asm/program_prologue_load.inc
Normal file
@ -0,0 +1,63 @@
|
||||
;# Common prologue tail: records the VM stack base, sets the instruction counter, builds the
;# absolute-value mask in xmm10, loads MXCSR and then loads the register file addressed by rcx
;# into r8-r15 (integer registers) and xmm8, xmm9, xmm2-xmm7 (floating point registers).
mov rbp, rsp ;# beginning of VM stack
|
||||
mov rdi, 1048577 ;# number of VM instructions to execute + 1
|
||||
|
||||
;# comparing the zeroed register with itself sets every bit; the shift then clears the top
;# bit of each 64-bit lane
xorps xmm10, xmm10
|
||||
cmpeqpd xmm10, xmm10
|
||||
psrlq xmm10, 1 ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
|
||||
|
||||
;# reset rounding mode
|
||||
;# 40896 = 0x9FC0; the value is staged in memory just below rsp because ldmxcsr takes a
;# memory operand
mov dword ptr [rsp-8], 40896
|
||||
ldmxcsr dword ptr [rsp-8]
|
||||
|
||||
;# load integer registers
|
||||
mov r8, qword ptr [rcx+0]
|
||||
mov r9, qword ptr [rcx+8]
|
||||
mov r10, qword ptr [rcx+16]
|
||||
mov r11, qword ptr [rcx+24]
|
||||
mov r12, qword ptr [rcx+32]
|
||||
mov r13, qword ptr [rcx+40]
|
||||
mov r14, qword ptr [rcx+48]
|
||||
mov r15, qword ptr [rcx+56]
|
||||
|
||||
;# initialize floating point registers
|
||||
;# Pattern for each register: convert the signed qword at +8 to a double, shift it into the
;# upper lane, then convert the qword at +0 into the lower lane. The stored values are
;# converted from integers, not loaded as raw double bit patterns.
xorps xmm8, xmm8
|
||||
cvtsi2sd xmm8, qword ptr [rcx+72]
|
||||
pslldq xmm8, 8
|
||||
cvtsi2sd xmm8, qword ptr [rcx+64]
|
||||
|
||||
xorps xmm9, xmm9
|
||||
cvtsi2sd xmm9, qword ptr [rcx+88]
|
||||
pslldq xmm9, 8
|
||||
cvtsi2sd xmm9, qword ptr [rcx+80]
|
||||
|
||||
xorps xmm2, xmm2
|
||||
cvtsi2sd xmm2, qword ptr [rcx+104]
|
||||
pslldq xmm2, 8
|
||||
cvtsi2sd xmm2, qword ptr [rcx+96]
|
||||
|
||||
xorps xmm3, xmm3
|
||||
cvtsi2sd xmm3, qword ptr [rcx+120]
|
||||
pslldq xmm3, 8
|
||||
cvtsi2sd xmm3, qword ptr [rcx+112]
|
||||
|
||||
;# re-base rcx by 64 so the remaining four registers reuse the displacements used above
lea rcx, [rcx+64]
|
||||
|
||||
xorps xmm4, xmm4
|
||||
cvtsi2sd xmm4, qword ptr [rcx+72]
|
||||
pslldq xmm4, 8
|
||||
cvtsi2sd xmm4, qword ptr [rcx+64]
|
||||
|
||||
xorps xmm5, xmm5
|
||||
cvtsi2sd xmm5, qword ptr [rcx+88]
|
||||
pslldq xmm5, 8
|
||||
cvtsi2sd xmm5, qword ptr [rcx+80]
|
||||
|
||||
xorps xmm6, xmm6
|
||||
cvtsi2sd xmm6, qword ptr [rcx+104]
|
||||
pslldq xmm6, 8
|
||||
cvtsi2sd xmm6, qword ptr [rcx+96]
|
||||
|
||||
xorps xmm7, xmm7
|
||||
cvtsi2sd xmm7, qword ptr [rcx+120]
|
||||
pslldq xmm7, 8
|
||||
cvtsi2sd xmm7, qword ptr [rcx+112]
|
24
src/asm/program_prologue_win64.inc
Normal file
24
src/asm/program_prologue_win64.inc
Normal file
@ -0,0 +1,24 @@
|
||||
;# Windows prologue: saves the callee-saved general purpose registers and xmm6-xmm10 (in an
;# 80-byte stack area), moves the function arguments (rcx, rdx, r8) into the registers the VM
;# code uses, runs the common register load (program_prologue_load.inc) and jumps to the
;# generated program.
;# callee-saved registers - Microsoft x64 calling convention
|
||||
push rbx
|
||||
push rbp
|
||||
push rdi
|
||||
push rsi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
movdqu xmmword ptr [rsp+64], xmm6
|
||||
movdqu xmmword ptr [rsp+48], xmm7
|
||||
movdqu xmmword ptr [rsp+32], xmm8
|
||||
movdqu xmmword ptr [rsp+16], xmm9
|
||||
movdqu xmmword ptr [rsp+0], xmm10
|
||||
|
||||
;# function arguments
|
||||
;# rcx already holds the RegisterFile pointer that program_prologue_load.inc reads through;
;# it is also pushed so the epilogue can pop it back when storing registers
push rcx ;# RegisterFile& registerFile
|
||||
mov rbx, rdx ;# MemoryRegisters& memory
|
||||
mov rsi, r8 ;# convertible_t* scratchpad
|
||||
|
||||
include program_prologue_load.inc
|
||||
|
||||
jmp randomx_program_begin
|
13
src/asm/program_read_f.inc
Normal file
13
src/asm/program_read_f.inc
Normal file
@ -0,0 +1,13 @@
|
||||
;# Dataset read for floating point use: loads the two 32-bit integers at dataset+ma, converts
;# them to two doubles in xmm0, advances ma by 8 and XORs ecx into mx.
;# rbx points at the memory registers: [rbx] = ma, [rbx+4] = mx, [rbx+8] = dataset pointer.
;# NOTE(review): ecx is an input supplied by the calling code; its meaning is not visible here.
mov edx, dword ptr [rbx] ;# ma
|
||||
mov rax, qword ptr [rbx+8] ;# dataset
|
||||
cvtdq2pd xmm0, qword ptr [rax+rdx]
|
||||
add dword ptr [rbx], 8
|
||||
xor ecx, dword ptr [rbx+4] ;# mx
|
||||
mov dword ptr [rbx+4], ecx
|
||||
;# 65528 = 0xFFF8: ma is only replaced when these bits of the new mx are all zero
test ecx, 65528
|
||||
jne short rx_read_dataset_f_ret
|
||||
;# ma = mx with the low 3 bits cleared; prefetch the dataset location it now points to
and ecx, -8
|
||||
mov dword ptr [rbx], ecx
|
||||
prefetcht0 byte ptr [rax+rcx]
|
||||
rx_read_dataset_f_ret:
|
||||
ret 0
|
13
src/asm/program_read_r.inc
Normal file
13
src/asm/program_read_r.inc
Normal file
@ -0,0 +1,13 @@
|
||||
;# Dataset read for integer use: loads the 64-bit word at dataset+ma into rax, advances ma by 8
;# and XORs ecx into mx.
;# rbx points at the memory registers: [rbx] = ma, [rbx+4] = mx, [rbx+8] = dataset pointer.
;# NOTE(review): ecx is an input supplied by the calling code; its meaning is not visible here.
mov eax, dword ptr [rbx] ;# ma
|
||||
mov rdx, qword ptr [rbx+8] ;# dataset
|
||||
mov rax, qword ptr [rdx+rax]
|
||||
add dword ptr [rbx], 8
|
||||
xor ecx, dword ptr [rbx+4] ;# mx
|
||||
mov dword ptr [rbx+4], ecx
|
||||
;# 65528 = 0xFFF8: ma is only replaced when these bits of the new mx are all zero
test ecx, 65528
|
||||
jne short rx_read_dataset_r_ret
|
||||
;# ma = mx with the low 3 bits cleared; prefetch the dataset location it now points to
and ecx, -8
|
||||
mov dword ptr [rbx], ecx
|
||||
prefetcht0 byte ptr [rdx+rcx]
|
||||
rx_read_dataset_r_ret:
|
||||
ret 0
|
@ -20,6 +20,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
|
||||
namespace RandomX {
|
||||
|
||||
@ -59,6 +60,15 @@ namespace RandomX {
|
||||
uint64_t u64;
|
||||
int32_t i32;
|
||||
uint32_t u32;
|
||||
struct {
|
||||
int32_t i32lo;
|
||||
int32_t i32hi;
|
||||
};
|
||||
};
|
||||
|
||||
// A floating point register made of two 64-bit halves; `lo` is stored first (lower address),
// `hi` second.
struct fpu_reg_t {
|
||||
convertible_t lo;
|
||||
convertible_t hi;
|
||||
};
|
||||
|
||||
constexpr int ProgramLength = 512;
|
||||
@ -96,10 +106,10 @@ namespace RandomX {
|
||||
|
||||
struct RegisterFile {
|
||||
convertible_t r[RegistersCount];
|
||||
convertible_t f[RegistersCount];
|
||||
fpu_reg_t f[RegistersCount];
|
||||
};
|
||||
|
||||
static_assert(sizeof(RegisterFile) == 2 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
|
||||
static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
|
||||
|
||||
typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&);
|
||||
|
||||
@ -109,3 +119,5 @@ namespace RandomX {
|
||||
void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, DatasetReadFunc);
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf);
|
||||
|
@ -1,19 +1,19 @@
|
||||
; Copyright (c) 2018 tevador
|
||||
;
|
||||
; This file is part of RandomX.
|
||||
;
|
||||
; RandomX is free software: you can redistribute it and/or modify
|
||||
; it under the terms of the GNU General Public License as published by
|
||||
; the Free Software Foundation, either version 3 of the License, or
|
||||
; (at your option) any later version.
|
||||
;
|
||||
; RandomX is distributed in the hope that it will be useful,
|
||||
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
; GNU General Public License for more details.
|
||||
;
|
||||
; You should have received a copy of the GNU General Public License
|
||||
; along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
;# Copyright (c) 2018 tevador
|
||||
;#
|
||||
;# This file is part of RandomX.
|
||||
;#
|
||||
;# RandomX is free software: you can redistribute it and/or modify
|
||||
;# it under the terms of the GNU General Public License as published by
|
||||
;# the Free Software Foundation, either version 3 of the License, or
|
||||
;# (at your option) any later version.
|
||||
;#
|
||||
;# RandomX is distributed in the hope that it will be useful,
|
||||
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;# GNU General Public License for more details.
|
||||
;#
|
||||
;# You should have received a copy of the GNU General Public License
|
||||
;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
|
||||
PUBLIC executeProgram
|
||||
|
||||
@ -47,6 +47,7 @@ executeProgram PROC
|
||||
; xmm7 -> "f7"
|
||||
; xmm8 -> "f0"
|
||||
; xmm9 -> "f1"
|
||||
; xmm10 -> absolute value mask
|
||||
|
||||
; STACK STRUCTURE:
|
||||
; |
|
||||
@ -71,11 +72,12 @@ executeProgram PROC
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
movdqu xmmword ptr [rsp+48], xmm6
|
||||
movdqu xmmword ptr [rsp+32], xmm7
|
||||
movdqu xmmword ptr [rsp+16], xmm8
|
||||
movdqu xmmword ptr [rsp+0], xmm9
|
||||
sub rsp, 80
|
||||
movdqu xmmword ptr [rsp+64], xmm6
|
||||
movdqu xmmword ptr [rsp+48], xmm7
|
||||
movdqu xmmword ptr [rsp+32], xmm8
|
||||
movdqu xmmword ptr [rsp+16], xmm9
|
||||
movdqu xmmword ptr [rsp+0], xmm10
|
||||
|
||||
; function arguments
|
||||
push rcx ; RegisterFile& registerFile
|
||||
@ -86,7 +88,15 @@ executeProgram PROC
|
||||
mov rbp, rsp ; beginning of VM stack
|
||||
mov rdi, 1048577 ; number of VM instructions to execute + 1
|
||||
|
||||
; load VM register values
|
||||
xorps xmm10, xmm10
|
||||
cmpeqpd xmm10, xmm10
|
||||
psrlq xmm10, 1 ; mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
|
||||
|
||||
; reset rounding mode
|
||||
mov dword ptr [rsp-8], 40896
|
||||
ldmxcsr dword ptr [rsp-8]
|
||||
|
||||
; load integer registers
|
||||
mov r8, qword ptr [rcx+0]
|
||||
mov r9, qword ptr [rcx+8]
|
||||
mov r10, qword ptr [rcx+16]
|
||||
@ -95,16 +105,56 @@ executeProgram PROC
|
||||
mov r13, qword ptr [rcx+40]
|
||||
mov r14, qword ptr [rcx+48]
|
||||
mov r15, qword ptr [rcx+56]
|
||||
mov dword ptr [rsp-8], 40896
|
||||
ldmxcsr dword ptr [rsp-8]
|
||||
|
||||
; load register f0 hi, lo
|
||||
xorps xmm8, xmm8
|
||||
cvtsi2sd xmm8, qword ptr [rcx+72]
|
||||
pslldq xmm8, 8
|
||||
cvtsi2sd xmm8, qword ptr [rcx+64]
|
||||
cvtsi2sd xmm9, qword ptr [rcx+72]
|
||||
cvtsi2sd xmm2, qword ptr [rcx+80]
|
||||
cvtsi2sd xmm3, qword ptr [rcx+88]
|
||||
cvtsi2sd xmm4, qword ptr [rcx+96]
|
||||
cvtsi2sd xmm5, qword ptr [rcx+104]
|
||||
cvtsi2sd xmm6, qword ptr [rcx+112]
|
||||
|
||||
; load register f1 hi, lo
|
||||
xorps xmm9, xmm9
|
||||
cvtsi2sd xmm9, qword ptr [rcx+88]
|
||||
pslldq xmm9, 8
|
||||
cvtsi2sd xmm9, qword ptr [rcx+80]
|
||||
|
||||
; load register f2 hi, lo
|
||||
xorps xmm2, xmm2
|
||||
cvtsi2sd xmm2, qword ptr [rcx+104]
|
||||
pslldq xmm2, 8
|
||||
cvtsi2sd xmm2, qword ptr [rcx+96]
|
||||
|
||||
; load register f3 hi, lo
|
||||
xorps xmm3, xmm3
|
||||
cvtsi2sd xmm3, qword ptr [rcx+120]
|
||||
pslldq xmm3, 8
|
||||
cvtsi2sd xmm3, qword ptr [rcx+112]
|
||||
|
||||
lea rcx, [rcx+64]
|
||||
|
||||
; load register f4 hi, lo
|
||||
xorps xmm4, xmm4
|
||||
cvtsi2sd xmm4, qword ptr [rcx+72]
|
||||
pslldq xmm4, 8
|
||||
cvtsi2sd xmm4, qword ptr [rcx+64]
|
||||
|
||||
; load register f5 hi, lo
|
||||
xorps xmm5, xmm5
|
||||
cvtsi2sd xmm5, qword ptr [rcx+88]
|
||||
pslldq xmm5, 8
|
||||
cvtsi2sd xmm5, qword ptr [rcx+80]
|
||||
|
||||
; load register f6 hi, lo
|
||||
xorps xmm6, xmm6
|
||||
cvtsi2sd xmm6, qword ptr [rcx+104]
|
||||
pslldq xmm6, 8
|
||||
cvtsi2sd xmm6, qword ptr [rcx+96]
|
||||
|
||||
; load register f7 hi, lo
|
||||
xorps xmm7, xmm7
|
||||
cvtsi2sd xmm7, qword ptr [rcx+120]
|
||||
pslldq xmm7, 8
|
||||
cvtsi2sd xmm7, qword ptr [rcx+112]
|
||||
|
||||
; program body
|
||||
|
||||
@ -125,21 +175,23 @@ rx_finish:
|
||||
mov qword ptr [rcx+40], r13
|
||||
mov qword ptr [rcx+48], r14
|
||||
mov qword ptr [rcx+56], r15
|
||||
movd qword ptr [rcx+64], xmm8
|
||||
movd qword ptr [rcx+72], xmm9
|
||||
movd qword ptr [rcx+80], xmm2
|
||||
movd qword ptr [rcx+88], xmm3
|
||||
movd qword ptr [rcx+96], xmm4
|
||||
movd qword ptr [rcx+104], xmm5
|
||||
movd qword ptr [rcx+112], xmm6
|
||||
movd qword ptr [rcx+120], xmm7
|
||||
movdqa xmmword ptr [rcx+64], xmm8
|
||||
movdqa xmmword ptr [rcx+80], xmm9
|
||||
movdqa xmmword ptr [rcx+96], xmm2
|
||||
movdqa xmmword ptr [rcx+112], xmm3
|
||||
lea rcx, [rcx+64]
|
||||
movdqa xmmword ptr [rcx+64], xmm4
|
||||
movdqa xmmword ptr [rcx+80], xmm5
|
||||
movdqa xmmword ptr [rcx+96], xmm6
|
||||
movdqa xmmword ptr [rcx+112], xmm7
|
||||
|
||||
; load callee-saved registers
|
||||
movdqu xmm9, xmmword ptr [rsp]
|
||||
movdqu xmm8, xmmword ptr [rsp+16]
|
||||
movdqu xmm7, xmmword ptr [rsp+32]
|
||||
movdqu xmm6, xmmword ptr [rsp+48]
|
||||
add rsp, 64
|
||||
movdqu xmm10, xmmword ptr [rsp]
|
||||
movdqu xmm9, xmmword ptr [rsp+16]
|
||||
movdqu xmm8, xmmword ptr [rsp+32]
|
||||
movdqu xmm7, xmmword ptr [rsp+48]
|
||||
movdqu xmm6, xmmword ptr [rsp+64]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
@ -171,7 +223,7 @@ rx_read_dataset:
|
||||
pop r8
|
||||
ret 0
|
||||
|
||||
rx_read_dataset_full:
|
||||
rx_read_dataset_r:
|
||||
mov edx, dword ptr [rbx] ; ma
|
||||
mov rax, qword ptr [rbx+8] ; dataset
|
||||
mov rax, qword ptr [rax+rdx]
|
||||
@ -179,12 +231,27 @@ rx_read_dataset_full:
|
||||
xor ecx, dword ptr [rbx+4] ; mx
|
||||
mov dword ptr [rbx+4], ecx
|
||||
test ecx, 0FFF8h
|
||||
jne short rx_read_dataset_full_ret
|
||||
jne short rx_read_dataset_r_ret
|
||||
and ecx, -8
|
||||
mov dword ptr [rbx], ecx
|
||||
mov rdx, qword ptr [rbx+8]
|
||||
prefetcht0 byte ptr [rdx+rcx]
|
||||
rx_read_dataset_full_ret:
|
||||
rx_read_dataset_r_ret:
|
||||
ret 0
|
||||
|
||||
rx_read_dataset_f:
|
||||
mov edx, dword ptr [rbx] ; ma
|
||||
mov rax, qword ptr [rbx+8] ; dataset
|
||||
cvtdq2pd xmm0, qword ptr [rax+rdx]
|
||||
add dword ptr [rbx], 8
|
||||
xor ecx, dword ptr [rbx+4] ; mx
|
||||
mov dword ptr [rbx+4], ecx
|
||||
test ecx, 0FFF8h
|
||||
jne short rx_read_dataset_f_ret
|
||||
and ecx, -8
|
||||
mov dword ptr [rbx], ecx
|
||||
prefetcht0 byte ptr [rax+rcx]
|
||||
rx_read_dataset_f_ret:
|
||||
ret 0
|
||||
executeProgram ENDP
|
||||
|
||||
|
@ -19,15 +19,15 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
|
||||
#pragma once
|
||||
|
||||
#define WT_ADD_64 10
|
||||
#define WT_ADD_64 11
|
||||
#define WT_ADD_32 2
|
||||
#define WT_SUB_64 10
|
||||
#define WT_SUB_64 11
|
||||
#define WT_SUB_32 2
|
||||
#define WT_MUL_64 21
|
||||
#define WT_MUL_64 23
|
||||
#define WT_MULH_64 10
|
||||
#define WT_MUL_32 15
|
||||
#define WT_IMUL_32 15
|
||||
#define WT_IMULH_64 10
|
||||
#define WT_IMULH_64 6
|
||||
#define WT_DIV_64 1
|
||||
#define WT_IDIV_64 1
|
||||
#define WT_AND_64 4
|
||||
@ -47,8 +47,9 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
#define WT_FPDIV 8
|
||||
#define WT_FPSQRT 6
|
||||
#define WT_FPROUND 2
|
||||
#define WT_CALL 24
|
||||
#define WT_RET 18
|
||||
#define WT_CALL 20
|
||||
#define WT_RET 22
|
||||
|
||||
|
||||
constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \
|
||||
WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \
|
||||
@ -60,6 +61,7 @@ WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \
|
||||
static_assert(wtSum == 256,
|
||||
"Sum of instruction weights must be 256");
|
||||
|
||||
#define REP0(x)
|
||||
#define REP1(x) x,
|
||||
#define REP2(x) REP1(x) x,
|
||||
#define REP3(x) REP2(x) x,
|
||||
@ -86,6 +88,16 @@ static_assert(wtSum == 256,
|
||||
#define REP24(x) REP23(x) x,
|
||||
#define REP25(x) REP24(x) x,
|
||||
#define REP26(x) REP25(x) x,
|
||||
#define REP27(x) REP26(x) x,
|
||||
#define REP28(x) REP27(x) x,
|
||||
#define REP29(x) REP28(x) x,
|
||||
#define REP30(x) REP29(x) x,
|
||||
#define REP31(x) REP30(x) x,
|
||||
#define REP32(x) REP31(x) x,
|
||||
#define REP33(x) REP32(x) x,
|
||||
#define REP40(x) REP32(x) REP8(x)
|
||||
#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
|
||||
#define REP256(x) REP128(x) REP128(x)
|
||||
#define REPNX(x,N) REP##N(x)
|
||||
#define REPN(x,N) REPNX(x,N)
|
||||
#define NUM(x) x
|
||||
|
@ -22,16 +22,10 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
|
||||
namespace RandomX {
|
||||
|
||||
inline double convertToDouble(int64_t x) {
|
||||
return (double)(x &-2048L);
|
||||
}
|
||||
|
||||
inline double convertToDoubleNonZero(int64_t x) {
|
||||
return (double)((x & -2048L) | 2048);
|
||||
}
|
||||
|
||||
inline double convertToDoubleNonNegative(int64_t x) {
|
||||
return (double)(x & 9223372036854773760L);
|
||||
//Clears the 11 least-significant bits before conversion. This is done so the number
//fits exactly into the 52-bit mantissa without rounding.
//Example: 4095 -> 2048.0, -1 -> -2048.0.
inline double convertSigned52(int64_t x) {
	//Fixed-width mask: does not depend on the width of `long` on the target platform.
	return static_cast<double>(x & ~INT64_C(0x7FF));
}
|
||||
|
||||
extern "C" {
|
||||
@ -59,27 +53,11 @@ namespace RandomX {
|
||||
void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
|
||||
bool JMP_COND(uint8_t, convertible_t&, int32_t);
|
||||
void FPINIT();
|
||||
void FPADD(convertible_t& a, double b, convertible_t& c);
|
||||
void FPSUB(convertible_t& a, double b, convertible_t& c);
|
||||
void FPMUL(convertible_t& a, double b, convertible_t& c);
|
||||
void FPDIV(convertible_t& a, double b, convertible_t& c);
|
||||
void FPSQRT(convertible_t& a, convertible_t& b, convertible_t& c);
|
||||
void FPROUND(convertible_t& a, convertible_t& b, convertible_t& c);
|
||||
|
||||
inline void FPADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
|
||||
FPADD(a, b.f64, c);
|
||||
}
|
||||
|
||||
inline void FPSUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
|
||||
FPSUB(a, b.f64, c);
|
||||
}
|
||||
|
||||
inline void FPMUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
|
||||
FPMUL(a, b.f64, c);
|
||||
}
|
||||
|
||||
inline void FPDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
|
||||
FPDIV(a, b.f64, c);
|
||||
}
|
||||
void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
|
||||
void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
|
||||
void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
|
||||
void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
|
||||
void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
|
||||
void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
|
||||
}
|
||||
}
|
@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License
|
||||
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
//#define DEBUG
|
||||
//#define FTZ
|
||||
#include "instructions.hpp"
|
||||
#include "intrinPortable.h"
|
||||
#pragma STDC FENV_ACCESS on
|
||||
@ -154,19 +153,17 @@ static inline int32_t safeSub(int32_t a, int32_t b) {
|
||||
#define subOverflow __subOverflow
|
||||
#endif
|
||||
|
||||
static double FlushDenormal(double x) {
|
||||
if (std::fpclassify(x) == FP_SUBNORMAL) {
|
||||
return 0;
|
||||
// Replaces subnormal and NaN inputs with +0.0; every other value (zeros, normal numbers,
// infinities) is returned unchanged.
static inline double FlushDenormalNaN(double x) {
	switch (std::fpclassify(x)) {
	case FP_SUBNORMAL:
	case FP_NAN:
		return 0.0;
	default:
		return x;
	}
}
|
||||
|
||||
#ifdef FTZ
|
||||
#undef FTZ
|
||||
#define FTZ(x) FlushDenormal(x)
|
||||
#else
|
||||
#define FTZ(x) x
|
||||
#endif
|
||||
// Replaces NaN with +0.0; every other value (including infinities and subnormals) is
// returned unchanged.
static inline double FlushNaN(double x) {
	return std::isnan(x) ? 0.0 : x;
}
|
||||
|
||||
namespace RandomX {
|
||||
|
||||
@ -286,37 +283,95 @@ namespace RandomX {
|
||||
}
|
||||
|
||||
void FPINIT() {
|
||||
setRoundMode(FE_TONEAREST);
|
||||
}
|
||||
|
||||
void FPADD(convertible_t& a, double b, convertible_t& c) {
|
||||
c.f64 = FTZ(convertToDouble(a.i64) + b);
|
||||
}
|
||||
|
||||
void FPSUB(convertible_t& a, double b, convertible_t& c) {
|
||||
c.f64 = FTZ(convertToDouble(a.i64) - b);
|
||||
}
|
||||
|
||||
void FPMUL(convertible_t& a, double b, convertible_t& c) {
|
||||
c.f64 = FTZ(convertToDoubleNonZero(a.i64) * b);
|
||||
}
|
||||
|
||||
void FPDIV(convertible_t& a, double b, convertible_t& c) {
|
||||
c.f64 = FTZ(convertToDoubleNonZero(a.i64) / b);
|
||||
}
|
||||
|
||||
void FPSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
|
||||
#ifdef __SSE2__
|
||||
double d = convertToDoubleNonNegative(a.i64);
|
||||
c.f64 = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_pd(&d)));
|
||||
_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
|
||||
#else
|
||||
c.f64 = FTZ(sqrt(convertToDoubleNonNegative(a.i64)));
|
||||
setRoundMode(FE_TONEAREST);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void FPROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
|
||||
c.f64 = convertToDouble(a.i64);
|
||||
// Vector add: c = convert(a) + b, computed per half.
// The low and high 32-bit signed halves of `a` are converted to doubles and added to
// b.lo and b.hi respectively; the sums are written to c.lo and c.hi.
void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	// Lane 0 <- (double)low int32 of a, lane 1 <- (double)high int32 of a.
	const __m128d lhs = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
	// Aligned load/store of the lo/hi pair starting at `lo`.
	const __m128d rhs = _mm_load_pd(&b.lo.f64);
	_mm_store_pd(&c.lo.f64, _mm_add_pd(lhs, rhs));
#else
	// Both halves of `a` are read before anything is written to `c`.
	const double lhsLo = (double)a.i32lo;
	const double lhsHi = (double)a.i32hi;
	c.lo.f64 = lhsLo + b.lo.f64;
	c.hi.f64 = lhsHi + b.hi.f64;
#endif
}
|
||||
|
||||
// Vector subtract: c = convert(a) - b, computed per half.
// The low and high 32-bit signed halves of `a` are converted to doubles; b.lo and b.hi are
// subtracted from them and the differences are written to c.lo and c.hi.
void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	// Lane 0 <- (double)low int32 of a, lane 1 <- (double)high int32 of a.
	const __m128d lhs = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
	const __m128d rhs = _mm_load_pd(&b.lo.f64);
	_mm_store_pd(&c.lo.f64, _mm_sub_pd(lhs, rhs));
#else
	// Both halves of `a` are read before anything is written to `c`.
	const double lhsLo = (double)a.i32lo;
	const double lhsHi = (double)a.i32hi;
	c.lo.f64 = lhsLo - b.lo.f64;
	c.hi.f64 = lhsHi - b.hi.f64;
#endif
}
|
||||
|
||||
// Vector multiply: c = convert(a) * b, computed per half, with NaN results replaced by 0.
// The low and high 32-bit signed halves of `a` are converted to doubles and multiplied by
// b.lo and b.hi respectively.
void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	// Lane 0 <- (double)low int32 of a, lane 1 <- (double)high int32 of a.
	const __m128d lhs = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
	const __m128d rhs = _mm_load_pd(&b.lo.f64);
	const __m128d product = _mm_mul_pd(lhs, rhs);
	// A NaN never compares equal to itself, so the mask is all-zero in NaN lanes and
	// all-ones elsewhere; ANDing with it turns NaN lanes into +0.0.
	const __m128d notNaN = _mm_cmpeq_pd(product, product);
	_mm_store_pd(&c.lo.f64, _mm_and_pd(product, notNaN));
#else
	// Both halves of `a` are read before anything is written to `c`.
	const double lhsLo = (double)a.i32lo;
	const double lhsHi = (double)a.i32hi;
	c.lo.f64 = FlushNaN(lhsLo * b.lo.f64);
	c.hi.f64 = FlushNaN(lhsHi * b.hi.f64);
#endif
}
|
||||
|
||||
// Vector divide: c = convert(a) / b, computed per half, with NaN results replaced by 0.
// The low and high 32-bit signed halves of `a` are converted to doubles and divided by
// b.lo and b.hi respectively.
// NOTE(review): the portable path also flushes subnormal results (FlushDenormalNaN) while
// the SSE2 path only masks NaN explicitly - presumably it relies on the MXCSR flush-to-zero
// setting; confirm both paths agree.
void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	// Lane 0 <- (double)low int32 of a, lane 1 <- (double)high int32 of a.
	const __m128d dividend = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
	const __m128d divisor = _mm_load_pd(&b.lo.f64);
	const __m128d quotient = _mm_div_pd(dividend, divisor);
	// A NaN never compares equal to itself, so the mask is all-zero in NaN lanes and
	// all-ones elsewhere; ANDing with it turns NaN lanes into +0.0.
	const __m128d notNaN = _mm_cmpeq_pd(quotient, quotient);
	_mm_store_pd(&c.lo.f64, _mm_and_pd(quotient, notNaN));
#else
	// Both halves of `a` are read before anything is written to `c`.
	const double dividendLo = (double)a.i32lo;
	const double dividendHi = (double)a.i32hi;
	c.lo.f64 = FlushDenormalNaN(dividendLo / b.lo.f64);
	c.hi.f64 = FlushDenormalNaN(dividendHi / b.hi.f64);
#endif
}
|
||||
|
||||
// Vector square root: c = sqrt(|convert(a)|), computed per half.
// The low and high 32-bit signed halves of `a` are converted to doubles, their sign is
// dropped and the square roots are written to c.lo and c.hi. `b` is not used.
void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	// Lane 0 <- (double)low int32 of a, lane 1 <- (double)high int32 of a.
	const __m128d converted = _mm_cvtepi32_pd(_mm_loadl_epi64((const __m128i*)&a));
	// All bits set except the sign bit of each 64-bit lane.
	const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63)));
	const __m128d magnitude = _mm_and_pd(converted, absmask);
	_mm_store_pd(&c.lo.f64, _mm_sqrt_pd(magnitude));
#else
	// Both halves of `a` are read before anything is written to `c`.
	const double valueLo = (double)a.i32lo;
	const double valueHi = (double)a.i32hi;
	c.lo.f64 = sqrt(std::abs(valueLo));
	c.hi.f64 = sqrt(std::abs(valueHi));
#endif
}
|
||||
|
||||
void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
|
||||
c.lo.f64 = convertSigned52(a.i64);
|
||||
switch (a.u64 & 3) {
|
||||
case RoundDown:
|
||||
#ifdef DEBUG
|
||||
|
10
src/main.cpp
10
src/main.cpp
@ -79,14 +79,6 @@ void readInt(int argc, char** argv, int& out, int defaultValue) {
|
||||
out = defaultValue;
|
||||
}
|
||||
|
||||
// Writes a register file dump to `os`: every integer register ("rN") in hexadecimal, then
// every floating point register ("fN") as its 64-bit pattern in hexadecimal followed by its
// value as a double in parentheses. The stream is switched back to decimal after each line.
// This version reads each floating point register as a single value (rf.f[i].u64 / .f64).
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
|
||||
for (int i = 0; i < RandomX::RegistersCount; ++i)
|
||||
os << std::hex << "r" << i << " = " << rf.r[i].u64 << std::endl << std::dec;
|
||||
for (int i = 0; i < RandomX::RegistersCount; ++i)
|
||||
os << std::hex << "f" << i << " = " << rf.f[i].u64 << " (" << rf.f[i].f64 << ")" << std::endl << std::dec;
|
||||
return os;
|
||||
}
|
||||
|
||||
class AtomicHash {
|
||||
public:
|
||||
AtomicHash() {
|
||||
@ -282,7 +274,7 @@ int main(int argc, char** argv) {
|
||||
std::cout << "Calculated result: ";
|
||||
result.print(std::cout);
|
||||
if(programCount == 1000)
|
||||
std::cout << "Reference result: f6bf06465d5fa1b1dc919140b9e9f9e210b07ae6d662988458a172e9a267eb3f" << std::endl;
|
||||
std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
|
||||
std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
|
||||
/*if (threadCount == 1 && !compiled) {
|
||||
auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0];
|
||||
|
5640
src/program.inc
5640
src/program.inc
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user