ASM code generator for "small" programs that fit into the uOP cache

This commit is contained in:
tevador 2019-01-24 19:29:59 +01:00
parent bd0dba88a8
commit d2cb086221
13 changed files with 1796 additions and 9915 deletions

View File

@ -30,12 +30,20 @@ namespace RandomX {
static const char* regR[8] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" };
static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" };
static const char* regF[8] = { "xmm8", "xmm9", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" };
static const char* regFE[8] = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" };
static const char* regF[4] = { "xmm0", "xmm1", "xmm2", "xmm3" };
static const char* regE[4] = { "xmm4", "xmm5", "xmm6", "xmm7" };
static const char* regA[4] = { "xmm8", "xmm9", "xmm10", "xmm11" };
static const char* regA4 = "xmm12";
static const char* dblMin = "xmm13";
static const char* absMask = "xmm14";
static const char* signMask = "xmm15";
static const char* regMx = "rbp";
static const char* regIc = "ebx";
static const char* regIc = "rbx";
static const char* regIc32 = "ebx";
static const char* regIc8 = "bl";
static const char* regStackBeginAddr = "rdi";
static const char* regDatasetAddr = "rdi";
static const char* regScratchpadAddr = "rsi";
void AssemblyGeneratorX86::generateProgram(const void* seed) {
@ -49,226 +57,217 @@ namespace RandomX {
for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) {
*(((uint32_t*)&instr) + j) = gen();
}
instr.src %= RegistersCount;
instr.dst %= RegistersCount;
generateCode(instr, i);
asmCode << std::endl;
//asmCode << std::endl;
}
if(ProgramLength > 0)
asmCode << "\tjmp rx_i_0" << std::endl;
}
void AssemblyGeneratorX86::generateCode(Instruction& instr, int i) {
asmCode << "rx_i_" << i << ": ;" << instr.getName() << std::endl;
asmCode << "\tdec " << regIc << std::endl;
asmCode << "\tjz rx_finish" << std::endl;
asmCode << "\t; " << instr;
auto generator = engine[instr.opcode];
(this->*generator)(instr, i);
}
void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
asmCode << "\tjnz short rx_body_" << i << std::endl;
asmCode << "\tcall rx_read" << std::endl;
asmCode << "rx_body_" << i << ":" << std::endl;
if ((instr.loca & 192) == 0)
asmCode << "\txor " << regMx << ", rax" << std::endl;
if (instr.loca & 15) {
if (instr.loca & 3) {
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") {
asmCode << "\tmov " << reg << ", " << regR32[instr.src] << std::endl;
asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
}
int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) {
return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
}
//1 uOP
void AssemblyGeneratorX86::h_IADD_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
}
else {
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", " << instr.imm32 << std::endl;
}
}
//2.75 uOP
void AssemblyGeneratorX86::h_IADD_M(Instruction& instr, int i) {
if (instr.src != instr.dst) {
genAddressReg(instr);
asmCode << "\tadd " << regR[instr.dst] << ", qword ptr [rsi+rax]" << std::endl;
}
else {
asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl;
}
}
void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
gena(instr, i);
asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
//1 uOP
void AssemblyGeneratorX86::h_IADD_RC(Instruction& instr, int i) {
asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << std::showpos << instr.imm32 << std::noshowpos << "]" << std::endl;
}
void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
gena(instr, i);
asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
}
void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) {
if (instr.locb & 1) {
asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl;
asmCode << "\t" << instrx86 << " rax, cl" << std::endl;
} else {
asmCode << "\t" << instrx86 << " rax, " << (instr.imm8 & 63) << std::endl;;
}
}
void AssemblyGeneratorX86::genbia(Instruction& instr) {
if (instr.locb & 3) {
asmCode << regR[instr.regb % RegistersCount] << std::endl;
} else {
asmCode << instr.imm32 << std::endl;;
}
}
void AssemblyGeneratorX86::genbia32(Instruction& instr) {
if (instr.locb & 3) {
asmCode << regR32[instr.regb % RegistersCount] << std::endl;
//1 uOP
void AssemblyGeneratorX86::h_ISUB_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
asmCode << "\tsub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
}
else {
asmCode << instr.imm32 << std::endl;;
asmCode << "\tsub " << regR[instr.dst] << ", " << instr.imm32 << std::endl;
}
}
void AssemblyGeneratorX86::genbf(Instruction& instr, const char* instrx86) {
asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
}
void AssemblyGeneratorX86::gencr(Instruction& instr, bool rax = true) {
if (instr.locc & 16) { //write to register
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", " << (rax ? "rax" : "rcx") << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << (rax ? "rax" : "rcx") << std::endl;
}
}
else { //write to scratchpad
if (rax)
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
if (instr.locc & 15) {
if (instr.locc & 3) {
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
//2.75 uOP
void AssemblyGeneratorX86::h_ISUB_M(Instruction& instr, int i) {
if (instr.src != instr.dst) {
genAddressReg(instr);
asmCode << "\tsub " << regR[instr.dst] << ", qword ptr [rsi+rax]" << std::endl;
}
else {
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\tsub " << regR[instr.dst] << ", qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl;
}
}
//1 uOP
void AssemblyGeneratorX86::h_IMUL_9C(Instruction& instr, int i) {
asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.dst] << "*8" << std::showpos << instr.imm32 << std::noshowpos << "]" << std::endl;
}
//1 uOP
void AssemblyGeneratorX86::h_IMUL_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
asmCode << "\timul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
}
else {
asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl;
}
asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rcx" << std::endl;
}
asmCode << "\timul " << regR[instr.dst] << ", " << instr.imm32 << std::endl;
}
}
void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) {
if(move)
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
const char* store = (instr.locc & 128) ? "movhpd" : "movlpd";
if (instr.locc & 16) { //write to scratchpad
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
if (instr.locc & 15) {
if (instr.locc & 3) {
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
//2.75 uOP
void AssemblyGeneratorX86::h_IMUL_M(Instruction& instr, int i) {
if (instr.src != instr.dst) {
genAddressReg(instr);
asmCode << "\timul " << regR[instr.dst] << ", qword ptr [rsi+rax]" << std::endl;
}
else {
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\timul " << regR[instr.dst] << ", qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl;
}
}
//4 uOPs
void AssemblyGeneratorX86::h_IMULH_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\tmul " << regR[instr.src] << std::endl;
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl;
}
else {
asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl;
}
asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl;
}
if (trace) {
asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl;
asmCode << "\tmov eax, " << instr.imm32 << std::endl;
asmCode << "\tmul " << regR[instr.dst] << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
}
}
void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tadd rax, ";
genbia(instr);
gencr(instr);
//5.75 uOPs
void AssemblyGeneratorX86::h_IMULH_M(Instruction& instr, int i) {
if (instr.src != instr.dst) {
genAddressReg(instr, "ecx");
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\tmul qword ptr [rsi+rcx]" << std::endl;
}
else {
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\tmul qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl;
}
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl;
}
void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tadd eax, ";
genbia32(instr);
gencr(instr);
//4 uOPs
void AssemblyGeneratorX86::h_ISMULH_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\timul " << regR[instr.src] << std::endl;
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl;
}
void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tsub rax, ";
genbia(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tsub eax, ";
genbia32(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\timul rax, ";
if ((instr.locb & 3) == 0) {
asmCode << "rax, ";
}
genbia(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tmov rcx, ";
genbia(instr);
asmCode << "\tmul rcx" << std::endl;
asmCode << "\tmov rax, rdx" << std::endl;
gencr(instr);
}
void AssemblyGeneratorX86::h_MUL_32(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tmov ecx, eax" << std::endl;
asmCode << "\tmov eax, ";
genbia32(instr);
asmCode << "\timul rax, rcx" << std::endl;
gencr(instr);
}
void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tmovsxd rcx, eax" << std::endl;
if ((instr.locb & 3) == 0) {
else {
asmCode << "\tmov rax, " << instr.imm32 << std::endl;
asmCode << "\timul " << regR[instr.dst] << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
}
}
//5.75 uOPs
void AssemblyGeneratorX86::h_ISMULH_M(Instruction& instr, int i) {
if (instr.src != instr.dst) {
genAddressReg(instr, "ecx");
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\timul qword ptr [rsi+rcx]" << std::endl;
}
else {
asmCode << "\tmovsxd rax, " << regR32[instr.regb % RegistersCount] << std::endl;
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\timul qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl;
}
asmCode << "\timul rax, rcx" << std::endl;
gencr(instr);
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl;
}
void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tmov rcx, ";
genbia(instr);
asmCode << "\timul rcx" << std::endl;
asmCode << "\tmov rax, rdx" << std::endl;
gencr(instr);
//1 uOP
void AssemblyGeneratorX86::h_INEG_R(Instruction& instr, int i) {
asmCode << "\tneg " << regR[instr.dst] << std::endl;
}
void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) {
genar(instr, i);
if (instr.locb & 3) {
#ifdef MAGIC_DIVISION
//1 uOP
void AssemblyGeneratorX86::h_IXOR_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
asmCode << "\txor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
}
else {
asmCode << "\txor " << regR[instr.dst] << ", " << instr.imm32 << std::endl;
}
}
//2.75 uOP
void AssemblyGeneratorX86::h_IXOR_M(Instruction& instr, int i) {
if (instr.src != instr.dst) {
genAddressReg(instr);
asmCode << "\txor " << regR[instr.dst] << ", qword ptr [rsi+rax]" << std::endl;
}
else {
asmCode << "\txor " << regR[instr.dst] << ", qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl;
}
}
//1.75 uOPs
void AssemblyGeneratorX86::h_IROR_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
asmCode << "\tmov ecx, " << regR32[instr.src] << std::endl;
asmCode << "\tror " << regR[instr.dst] << ", cl" << std::endl;
}
else {
asmCode << "\tror " << regR[instr.dst] << ", " << (instr.imm32 & 63) << std::endl;
}
}
//1.75 uOPs
void AssemblyGeneratorX86::h_IROL_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
asmCode << "\tmov ecx, " << regR32[instr.src] << std::endl;
asmCode << "\trol " << regR[instr.dst] << ", cl" << std::endl;
}
else {
asmCode << "\trol " << regR[instr.dst] << ", " << (instr.imm32 & 63) << std::endl;
}
}
//~6 uOPs
void AssemblyGeneratorX86::h_IDIV_C(Instruction& instr, int i) {
if (instr.imm32 != 0) {
uint32_t divisor = instr.imm32;
asmCode << "\t; magic divide by " << divisor << std::endl;
if (divisor & (divisor - 1)) {
magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
if (mi.pre_shift == 0 && !mi.increment) {
asmCode << "\tmov rax, " << mi.multiplier << std::endl;
asmCode << "\tmul " << regR[instr.dst] << std::endl;
}
else {
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
if (mi.pre_shift > 0)
asmCode << "\tshr rax, " << mi.pre_shift << std::endl;
if (mi.increment) {
@ -277,51 +276,26 @@ namespace RandomX {
}
asmCode << "\tmov rcx, " << mi.multiplier << std::endl;
asmCode << "\tmul rcx" << std::endl;
asmCode << "\tmov rax, rdx" << std::endl;
}
if (mi.post_shift > 0)
asmCode << "\tshr rax, " << mi.post_shift << std::endl;
asmCode << "\tshr rdx, " << mi.post_shift << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
}
else { //divisor is a power of two
int shift = 0;
while (divisor >>= 1)
++shift;
if(shift > 0)
asmCode << "\tshr rax, " << shift << std::endl;
asmCode << "\tshr " << regR[instr.dst] << ", " << shift << std::endl;
}
}
#else
if (instr.imm32 == 0) {
asmCode << "\tmov ecx, 1" << std::endl;
}
else {
asmCode << "\tmov ecx, " << instr.imm32 << std::endl;
}
#endif
}
else {
asmCode << "\tmov ecx, 1" << std::endl;
asmCode << "\tmov edx, " << regR32[instr.regb % RegistersCount] << std::endl;
asmCode << "\ttest edx, edx" << std::endl;
asmCode << "\tcmovne ecx, edx" << std::endl;
#ifdef MAGIC_DIVISION
asmCode << "\txor edx, edx" << std::endl;
asmCode << "\tdiv rcx" << std::endl;
#endif
}
#ifndef MAGIC_DIVISION
asmCode << "\txor edx, edx" << std::endl;
asmCode << "\tdiv rcx" << std::endl;
#endif
gencr(instr);
}
void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) {
genar(instr, i);
if (instr.locb & 3) {
#ifdef MAGIC_DIVISION
//~8.5 uOPs
void AssemblyGeneratorX86::h_ISDIV_C(Instruction& instr, int i) {
int64_t divisor = instr.imm32;
asmCode << "\t; magic divide by " << divisor << std::endl;
if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
// +/- power of two
bool negative = divisor < 0;
if (negative)
@ -340,263 +314,211 @@ namespace RandomX {
}
if (negative)
asmCode << "\tneg rax" << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", rax" << std::endl;
}
else if (divisor != 0) {
magics_info mi = compute_signed_magic_info(divisor);
if ((divisor >= 0) != (mi.multiplier >= 0))
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov rdx, " << mi.multiplier << std::endl;
asmCode << "\timul rdx" << std::endl;
asmCode << "\tmov rax, rdx" << std::endl;
asmCode << "\txor edx, edx" << std::endl;
asmCode << "\tmov rax, " << mi.multiplier << std::endl;
asmCode << "\timul " << regR[instr.dst] << std::endl;
//asmCode << "\tmov rax, rdx" << std::endl;
asmCode << "\txor eax, eax" << std::endl;
bool haveSF = false;
if (divisor > 0 && mi.multiplier < 0) {
asmCode << "\tadd rax, rcx" << std::endl;
asmCode << "\tadd rdx, " << regR[instr.dst] << std::endl;
haveSF = true;
}
if (divisor < 0 && mi.multiplier > 0) {
asmCode << "\tsub rax, rcx" << std::endl;
asmCode << "\tsub rdx, " << regR[instr.dst] << std::endl;
haveSF = true;
}
if (mi.shift > 0) {
asmCode << "\tsar rax, " << mi.shift << std::endl;
asmCode << "\tsar rdx, " << mi.shift << std::endl;
haveSF = true;
}
if (!haveSF)
asmCode << "\ttest rax, rax" << std::endl;
asmCode << "\tsets dl" << std::endl;
asmCode << "\tadd rax, rdx" << std::endl;
asmCode << "\ttest rdx, rdx" << std::endl;
asmCode << "\tsets al" << std::endl;
asmCode << "\tadd rdx, rax" << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
}
#else
asmCode << "\tmov edx, " << instr.imm32 << std::endl;
#endif
}
else {
asmCode << "\tmov edx, " << regR32[instr.regb % RegistersCount] << std::endl;
#ifndef MAGIC_DIVISION
}
#endif
asmCode << "\tcmp edx, -1" << std::endl;
asmCode << "\tjne short body_idiv_" << i << std::endl;
asmCode << "\tneg rax" << std::endl;
asmCode << "\tjmp short result_idiv_" << i << std::endl;
asmCode << "body_idiv_" << i << ":" << std::endl;
asmCode << "\tmov ecx, 1" << std::endl;
asmCode << "\ttest edx, edx" << std::endl;
asmCode << "\tcmovne ecx, edx" << std::endl;
asmCode << "\tmovsxd rcx, ecx" << std::endl;
asmCode << "\tcqo" << std::endl;
asmCode << "\tidiv rcx" << std::endl;
asmCode << "result_idiv_" << i << ":" << std::endl;
#ifdef MAGIC_DIVISION
}
#endif
gencr(instr);
}
void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tand rax, ";
genbia(instr);
gencr(instr);
//1 uOPs
void AssemblyGeneratorX86::h_FPSWAP_R(Instruction& instr, int i) {
asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl;
}
void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tand eax, ";
genbia32(instr);
gencr(instr);
//1 uOP
void AssemblyGeneratorX86::h_FPADD_R(Instruction& instr, int i) {
instr.dst %= 4;
instr.src %= 4;
asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl;
}
void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tor rax, ";
genbia(instr);
gencr(instr);
//5 uOPs
void AssemblyGeneratorX86::h_FPADD_M(Instruction& instr, int i) {
instr.dst %= 4;
genAddressReg(instr);
asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl;
asmCode << "\taddpd " << regF[instr.dst] << ", xmm12" << std::endl;
}
void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tor eax, ";
genbia32(instr);
gencr(instr);
//1 uOP
void AssemblyGeneratorX86::h_FPSUB_R(Instruction& instr, int i) {
instr.dst %= 4;
instr.src %= 4;
asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl;
}
void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\txor rax, ";
genbia(instr);
gencr(instr);
//5 uOPs
void AssemblyGeneratorX86::h_FPSUB_M(Instruction& instr, int i) {
instr.dst %= 4;
genAddressReg(instr);
asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl;
asmCode << "\tsubpd " << regF[instr.dst] << ", xmm12" << std::endl;
}
void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\txor eax, ";
genbia32(instr);
gencr(instr);
//1 uOP
void AssemblyGeneratorX86::h_FPNEG_R(Instruction& instr, int i) {
instr.dst %= 4;
asmCode << "\txorps " << regF[instr.dst] << ", " << signMask << std::endl;
}
void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) {
genar(instr, i);
genbiashift(instr, "shl");
gencr(instr);
//1 uOPs
void AssemblyGeneratorX86::h_FPMUL_R(Instruction& instr, int i) {
instr.dst %= 4;
instr.src %= 4;
asmCode << "\tmulpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl;
}
void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) {
genar(instr, i);
genbiashift(instr, "shr");
gencr(instr);
//6 uOPs
void AssemblyGeneratorX86::h_FPMUL_M(Instruction& instr, int i) {
instr.dst %= 4;
genAddressReg(instr);
asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl;
asmCode << "\tmulpd " << regE[instr.dst] << ", xmm12" << std::endl;
asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl;
}
void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) {
genar(instr, i);
genbiashift(instr, "sar");
gencr(instr);
//2 uOPs
void AssemblyGeneratorX86::h_FPDIV_R(Instruction& instr, int i) {
instr.dst %= 4;
instr.src %= 4;
asmCode << "\tdivpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl;
asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl;
}
void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) {
genar(instr, i);
genbiashift(instr, "rol");
gencr(instr);
//6 uOPs
void AssemblyGeneratorX86::h_FPDIV_M(Instruction& instr, int i) {
instr.dst %= 4;
genAddressReg(instr);
asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl;
asmCode << "\tdivpd " << regE[instr.dst] << ", xmm12" << std::endl;
asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl;
}
void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) {
genar(instr, i);
genbiashift(instr, "ror");
gencr(instr);
//1 uOP
void AssemblyGeneratorX86::h_FPSQRT_R(Instruction& instr, int i) {
instr.dst %= 4;
asmCode << "\tsqrtpd " << regE[instr.dst] << ", " << regE[instr.dst] << std::endl;
}
void AssemblyGeneratorX86::h_FPADD(Instruction& instr, int i) {
genaf(instr, i);
genbf(instr, "addpd");
gencf(instr);
}
void AssemblyGeneratorX86::h_FPSUB(Instruction& instr, int i) {
genaf(instr, i);
genbf(instr, "subpd");
gencf(instr);
}
void AssemblyGeneratorX86::h_FPMUL(Instruction& instr, int i) {
genaf(instr, i);
genbf(instr, "mulpd");
asmCode << "\tmovaps xmm1, xmm0" << std::endl;
asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl;
asmCode << "\tandps xmm0, xmm1" << std::endl;
gencf(instr);
}
void AssemblyGeneratorX86::h_FPDIV(Instruction& instr, int i) {
genaf(instr, i);
genbf(instr, "divpd");
asmCode << "\tmovaps xmm1, xmm0" << std::endl;
asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl;
asmCode << "\tandps xmm0, xmm1" << std::endl;
gencf(instr);
}
void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) {
genaf(instr, i);
asmCode << "\tandps xmm0, xmm10" << std::endl;
asmCode << "\tsqrtpd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
gencf(instr, false);
}
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
genar(instr, i);
asmCode << "\tmov rcx, rax" << std::endl;
int rotate = (13 - (instr.imm8 & 63)) & 63;
//6 uOPs
void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) {
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
int rotate = (13 - (instr.alt & 63)) & 63;
if (rotate != 0)
asmCode << "\trol rax, " << rotate << std::endl;
asmCode << "\tand eax, 24576" << std::endl;
asmCode << "\tor eax, 40896" << std::endl;
asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl;
asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl;
gencr(instr, false);
asmCode << "\tmov dword ptr [rsp-8], eax" << std::endl;
asmCode << "\tldmxcsr dword ptr [rsp-8]" << std::endl;
}
static inline const char* jumpCondition(Instruction& instr, bool invert = false) {
switch ((instr.locb & 7) ^ invert)
static inline const char* condition(Instruction& instr, bool invert = false) {
switch (((instr.alt >> 2) & 7) ^ invert)
{
case 0:
return "jbe";
return "be";
case 1:
return "ja";
return "a";
case 2:
return "js";
return "s";
case 3:
return "jns";
return "ns";
case 4:
return "jo";
return "o";
case 5:
return "jno";
return "no";
case 6:
return "jl";
return "l";
case 7:
return "jge";
return "ge";
}
}
void AssemblyGeneratorX86::h_JUMP(Instruction& instr, int i) {
genar(instr, i);
gencr(instr);
asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl;
asmCode << "\t" << jumpCondition(instr);
asmCode << " rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl;
//4 uOPs
void AssemblyGeneratorX86::h_COND_R(Instruction& instr, int i) {
asmCode << "\txor ecx, ecx" << std::endl;
asmCode << "\tcmp " << regR32[instr.src] << ", " << instr.imm32 << std::endl;
asmCode << "\tset" << condition(instr) << " cl" << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl;
}
void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) {
genar(instr, i);
gencr(instr);
asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl;
asmCode << "\t" << jumpCondition(instr, true);
asmCode << " short rx_i_" << wrapInstr(i + 1) << std::endl;
asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl;
}
void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) {
genar(instr, i);
gencr(instr);
asmCode << "\tcmp rsp, " << regStackBeginAddr << std::endl;
asmCode << "\tje short rx_i_" << wrapInstr(i + 1) << std::endl;
asmCode << "\tret" << std::endl;
//6 uOPs
void AssemblyGeneratorX86::h_COND_M(Instruction& instr, int i) {
asmCode << "\txor ecx, ecx" << std::endl;
genAddressReg(instr);
asmCode << "\tcmp dword ptr [rsi+rax], " << instr.imm32 << std::endl;
asmCode << "\tset" << condition(instr) << " cl" << std::endl;
asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl;
}
#include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x))
InstructionGenerator AssemblyGeneratorX86::engine[256] = {
INST_HANDLE(ADD_64)
INST_HANDLE(ADD_32)
INST_HANDLE(SUB_64)
INST_HANDLE(SUB_32)
INST_HANDLE(MUL_64)
INST_HANDLE(MULH_64)
INST_HANDLE(MUL_32)
INST_HANDLE(IMUL_32)
INST_HANDLE(IMULH_64)
INST_HANDLE(DIV_64)
INST_HANDLE(IDIV_64)
INST_HANDLE(AND_64)
INST_HANDLE(AND_32)
INST_HANDLE(OR_64)
INST_HANDLE(OR_32)
INST_HANDLE(XOR_64)
INST_HANDLE(XOR_32)
INST_HANDLE(SHL_64)
INST_HANDLE(SHR_64)
INST_HANDLE(SAR_64)
INST_HANDLE(ROL_64)
INST_HANDLE(ROR_64)
INST_HANDLE(FPADD)
INST_HANDLE(FPSUB)
INST_HANDLE(FPMUL)
INST_HANDLE(FPDIV)
INST_HANDLE(FPSQRT)
INST_HANDLE(FPROUND)
INST_HANDLE(JUMP)
INST_HANDLE(CALL)
INST_HANDLE(RET)
//Integer
INST_HANDLE(IADD_R)
INST_HANDLE(IADD_M)
INST_HANDLE(IADD_RC)
INST_HANDLE(ISUB_R)
INST_HANDLE(ISUB_M)
INST_HANDLE(IMUL_9C)
INST_HANDLE(IMUL_R)
INST_HANDLE(IMUL_M)
INST_HANDLE(IMULH_R)
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
INST_HANDLE(IDIV_C)
INST_HANDLE(ISDIV_C)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
//Common floating point
INST_HANDLE(FPSWAP_R)
//Floating point group F
INST_HANDLE(FPADD_R)
INST_HANDLE(FPADD_M)
INST_HANDLE(FPSUB_R)
INST_HANDLE(FPSUB_M)
INST_HANDLE(FPNEG_R)
//Floating point group E
INST_HANDLE(FPMUL_R)
INST_HANDLE(FPMUL_M)
INST_HANDLE(FPDIV_R)
INST_HANDLE(FPDIV_M)
INST_HANDLE(FPSQRT_R)
//Control
INST_HANDLE(COND_R)
INST_HANDLE(COND_M)
INST_HANDLE(CFROUND)
};
}

View File

@ -47,39 +47,43 @@ namespace RandomX {
void genbf(Instruction&, const char*);
void gencr(Instruction&, bool);
void gencf(Instruction&, bool);
void genAddressReg(Instruction&, const char*);
int32_t genAddressImm(Instruction&);
void generateCode(Instruction&, int);
void h_ADD_64(Instruction&, int);
void h_ADD_32(Instruction&, int);
void h_SUB_64(Instruction&, int);
void h_SUB_32(Instruction&, int);
void h_MUL_64(Instruction&, int);
void h_MULH_64(Instruction&, int);
void h_MUL_32(Instruction&, int);
void h_IMUL_32(Instruction&, int);
void h_IMULH_64(Instruction&, int);
void h_DIV_64(Instruction&, int);
void h_IDIV_64(Instruction&, int);
void h_AND_64(Instruction&, int);
void h_AND_32(Instruction&, int);
void h_OR_64(Instruction&, int);
void h_OR_32(Instruction&, int);
void h_XOR_64(Instruction&, int);
void h_XOR_32(Instruction&, int);
void h_SHL_64(Instruction&, int);
void h_SHR_64(Instruction&, int);
void h_SAR_64(Instruction&, int);
void h_ROL_64(Instruction&, int);
void h_ROR_64(Instruction&, int);
void h_FPADD(Instruction&, int);
void h_FPSUB(Instruction&, int);
void h_FPMUL(Instruction&, int);
void h_FPDIV(Instruction&, int);
void h_FPSQRT(Instruction&, int);
void h_FPROUND(Instruction&, int);
void h_JUMP(Instruction&, int);
void h_CALL(Instruction&, int);
void h_RET(Instruction&, int);
void h_IADD_R(Instruction&, int);
void h_IADD_M(Instruction&, int);
void h_IADD_RC(Instruction&, int);
void h_ISUB_R(Instruction&, int);
void h_ISUB_M(Instruction&, int);
void h_IMUL_9C(Instruction&, int);
void h_IMUL_R(Instruction&, int);
void h_IMUL_M(Instruction&, int);
void h_IMULH_R(Instruction&, int);
void h_IMULH_M(Instruction&, int);
void h_ISMULH_R(Instruction&, int);
void h_ISMULH_M(Instruction&, int);
void h_IDIV_C(Instruction&, int);
void h_ISDIV_C(Instruction&, int);
void h_INEG_R(Instruction&, int);
void h_IXOR_R(Instruction&, int);
void h_IXOR_M(Instruction&, int);
void h_IROR_R(Instruction&, int);
void h_IROL_R(Instruction&, int);
void h_FPSWAP_R(Instruction&, int);
void h_FPADD_R(Instruction&, int);
void h_FPADD_M(Instruction&, int);
void h_FPSUB_R(Instruction&, int);
void h_FPSUB_M(Instruction&, int);
void h_FPNEG_R(Instruction&, int);
void h_FPMUL_R(Instruction&, int);
void h_FPMUL_M(Instruction&, int);
void h_FPDIV_R(Instruction&, int);
void h_FPDIV_M(Instruction&, int);
void h_FPSQRT_R(Instruction&, int);
void h_COND_R(Instruction&, int);
void h_COND_M(Instruction&, int);
void h_CFROUND(Instruction&, int);
};
}

View File

@ -25,6 +25,12 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
constexpr int mantissaSize = 52;
constexpr int exponentSize = 11;
constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1;
constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1;
constexpr int exponentBias = 1023;
CompiledVirtualMachine::CompiledVirtualMachine() {
totalSize = 0;
}
@ -37,25 +43,42 @@ namespace RandomX {
memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize);
}
static uint64_t getSmallPositiveFloatBits(uint64_t entropy) {
auto exponent = entropy >> 60; //0..15
auto mantissa = entropy & mantissaMask;
exponent += exponentBias;
exponent &= exponentMask;
exponent <<= mantissaSize;
return exponent | mantissa;
}
void CompiledVirtualMachine::initializeProgram(const void* seed) {
Pcg32 gen(seed);
for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) {
*(((uint32_t*)&reg) + i) = gen();
}
FPINIT();
for (int i = 0; i < RegistersCount; ++i) {
/*for (int i = 0; i < RegistersCount / 2; ++i) {
reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
}
for (int i = 0; i < RegistersCount / 2; ++i) {
reg.g[i].lo.f64 = std::abs((double)reg.g[i].lo.i64);
reg.g[i].hi.f64 = std::abs((double)reg.g[i].hi.i64);
}*/
for (int i = 0; i < RegistersCount / 2; ++i) {
reg.a[i].lo.u64 = getSmallPositiveFloatBits(reg.f[i].lo.u64);
reg.a[i].hi.u64 = getSmallPositiveFloatBits(reg.f[i].hi.u64);
}
compiler.generateProgram(gen);
mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7;
mem.mx = *(((uint32_t*)seed) + 5);
}
void CompiledVirtualMachine::execute() {
//executeProgram(reg, mem, scratchpad, readDataset);
executeProgram(reg, mem, scratchpad, InstructionCount);
totalSize += compiler.getCodeSize();
compiler.getProgramFunc()(reg, mem, scratchpad);
//compiler.getProgramFunc()(reg, mem, scratchpad);
#ifdef TRACEVM
for (int32_t i = InstructionCount - 1; i >= 0; --i) {
std::cout << std::hex << tracepad[i].u64 << std::endl;

View File

@ -18,54 +18,391 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
#include "Instruction.hpp"
#include "common.hpp"
namespace RandomX {
void Instruction::print(std::ostream& os) const {
os << " A: loc = " << std::dec << (loca & 7) << ", reg: " << (rega & 7) << std::endl;
os << " B: loc = " << (locb & 7) << ", reg: " << (regb & 7) << std::endl;
os << " C: loc = " << (locc & 7) << ", reg: " << (regc & 7) << std::endl;
os << " addra = " << std::hex << addra << std::endl;
os << " addrc = " << addrc << std::endl;
os << " imm8 = " << std::dec << (int)imm8 << std::endl;
os << " imm32 = " << imm32 << std::endl;
os << names[opcode] << " ";
auto handler = engine[opcode];
(this->*handler)(os);
}
void Instruction::genAddressReg(std::ostream& os) const {
os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
}
void Instruction::genAddressImm(std::ostream& os) const {
os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
}
void Instruction::h_IADD_R(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
}
else {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
}
void Instruction::h_IADD_M(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", ";
genAddressReg(os);
os << std::endl;
}
else {
os << "r" << (int)dst << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_IADD_RC(std::ostream& os) const {
os << "r" << (int)dst << ", r" << (int)src << ", " << imm32 << std::endl;
}
//1 uOP
void Instruction::h_ISUB_R(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
}
else {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
}
void Instruction::h_ISUB_M(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", ";
genAddressReg(os);
os << std::endl;
}
else {
os << "r" << (int)dst << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_IMUL_9C(std::ostream& os) const {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
void Instruction::h_IMUL_R(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
}
else {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
}
void Instruction::h_IMUL_M(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", ";
genAddressReg(os);
os << std::endl;
}
else {
os << "r" << (int)dst << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_IMULH_R(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
}
else {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
}
void Instruction::h_IMULH_M(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", ";
genAddressReg(os);
os << std::endl;
}
else {
os << "r" << (int)dst << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_ISMULH_R(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
}
else {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
}
void Instruction::h_ISMULH_M(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", ";
genAddressReg(os);
os << std::endl;
}
else {
os << "r" << (int)dst << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_INEG_R(std::ostream& os) const {
os << "r" << (int)dst << std::endl;
}
void Instruction::h_IXOR_R(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
}
else {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
}
void Instruction::h_IXOR_M(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", ";
genAddressReg(os);
os << std::endl;
}
else {
os << "r" << (int)dst << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_IROR_R(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
}
else {
os << "r" << (int)dst << ", " << (imm32 & 63) << std::endl;
}
}
void Instruction::h_IROL_R(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
}
else {
os << "r" << (int)dst << ", " << (imm32 & 63) << std::endl;
}
}
void Instruction::h_IDIV_C(std::ostream& os) const {
os << "r" << (int)dst << ", " << (uint32_t)imm32 << std::endl;
}
void Instruction::h_ISDIV_C(std::ostream& os) const {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
void Instruction::h_FPSWAP_R(std::ostream& os) const {
const char reg = (dst >= 4) ? 'e' : 'f';
auto dstIndex = dst % 4;
os << reg << dstIndex << std::endl;
}
void Instruction::h_FPADD_R(std::ostream& os) const {
auto dstIndex = dst % 4;
auto srcIndex = src % 4;
os << "f" << dstIndex << ", a" << srcIndex << std::endl;
}
void Instruction::h_FPADD_M(std::ostream& os) const {
auto dstIndex = dst % 4;
os << "f" << dstIndex << ", ";
genAddressReg(os);
os << std::endl;
}
void Instruction::h_FPSUB_R(std::ostream& os) const {
auto dstIndex = dst % 4;
auto srcIndex = src % 4;
os << "f" << dstIndex << ", a" << srcIndex << std::endl;
}
void Instruction::h_FPSUB_M(std::ostream& os) const {
auto dstIndex = dst % 4;
os << "f" << dstIndex << ", ";
genAddressReg(os);
os << std::endl;
}
void Instruction::h_FPNEG_R(std::ostream& os) const {
auto dstIndex = dst % 4;
os << "f" << dstIndex << std::endl;
}
void Instruction::h_FPMUL_R(std::ostream& os) const {
auto dstIndex = dst % 4;
auto srcIndex = src % 4;
os << "e" << dstIndex << ", a" << srcIndex << std::endl;
}
void Instruction::h_FPMUL_M(std::ostream& os) const {
auto dstIndex = dst % 4;
os << "e" << dstIndex << ", ";
genAddressReg(os);
os << std::endl;
}
void Instruction::h_FPDIV_R(std::ostream& os) const {
auto dstIndex = dst % 4;
auto srcIndex = src % 4;
os << "e" << dstIndex << ", a" << srcIndex << std::endl;
}
void Instruction::h_FPDIV_M(std::ostream& os) const {
auto dstIndex = dst % 4;
os << "e" << dstIndex << ", ";
genAddressReg(os);
os << std::endl;
}
void Instruction::h_FPSQRT_R(std::ostream& os) const {
auto dstIndex = dst % 4;
os << "e" << dstIndex << std::endl;
}
void Instruction::h_CFROUND(std::ostream& os) const {
os << "r" << (int)dst << ", " << (alt & 63) << std::endl;
}
static inline const char* condition(int index) {
switch (index)
{
case 0:
return "be";
case 1:
return "ab";
case 2:
return "sg";
case 3:
return "ns";
case 4:
return "of";
case 5:
return "no";
case 6:
return "lt";
case 7:
return "ge";
}
}
void Instruction::h_COND_R(std::ostream& os) const {
os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl;
}
void Instruction::h_COND_M(std::ostream& os) const {
os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(";
genAddressReg(os);
os << ", " << imm32 << ")" << std::endl;
}
#include "instructionWeights.hpp"
#define INST_NAME(x) REPN(#x, WT(x))
#define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x))
const char* Instruction::names[256] = {
INST_NAME(ADD_64)
INST_NAME(ADD_32)
INST_NAME(SUB_64)
INST_NAME(SUB_32)
INST_NAME(MUL_64)
INST_NAME(MULH_64)
INST_NAME(MUL_32)
INST_NAME(IMUL_32)
INST_NAME(IMULH_64)
INST_NAME(DIV_64)
INST_NAME(IDIV_64)
INST_NAME(AND_64)
INST_NAME(AND_32)
INST_NAME(OR_64)
INST_NAME(OR_32)
INST_NAME(XOR_64)
INST_NAME(XOR_32)
INST_NAME(SHL_64)
INST_NAME(SHR_64)
INST_NAME(SAR_64)
INST_NAME(ROL_64)
INST_NAME(ROR_64)
INST_NAME(FPADD)
INST_NAME(FPSUB)
INST_NAME(FPMUL)
INST_NAME(FPDIV)
INST_NAME(FPSQRT)
INST_NAME(FPROUND)
INST_NAME(JUMP)
INST_NAME(CALL)
INST_NAME(RET)
//Integer
INST_NAME(IADD_R)
INST_NAME(IADD_M)
INST_NAME(IADD_RC)
INST_NAME(ISUB_R)
INST_NAME(ISUB_M)
INST_NAME(IMUL_9C)
INST_NAME(IMUL_R)
INST_NAME(IMUL_M)
INST_NAME(IMULH_R)
INST_NAME(IMULH_M)
INST_NAME(ISMULH_R)
INST_NAME(ISMULH_M)
INST_NAME(IDIV_C)
INST_NAME(ISDIV_C)
INST_NAME(INEG_R)
INST_NAME(IXOR_R)
INST_NAME(IXOR_M)
INST_NAME(IROR_R)
INST_NAME(IROL_R)
//Common floating point
INST_NAME(FPSWAP_R)
//Floating point group F
INST_NAME(FPADD_R)
INST_NAME(FPADD_M)
INST_NAME(FPSUB_R)
INST_NAME(FPSUB_M)
INST_NAME(FPNEG_R)
//Floating point group E
INST_NAME(FPMUL_R)
INST_NAME(FPMUL_M)
INST_NAME(FPDIV_R)
INST_NAME(FPDIV_M)
INST_NAME(FPSQRT_R)
//Control
INST_NAME(COND_R)
INST_NAME(COND_M)
INST_NAME(CFROUND)
};
InstructionVisualizer Instruction::engine[256] = {
//Integer
INST_HANDLE(IADD_R)
INST_HANDLE(IADD_M)
INST_HANDLE(IADD_RC)
INST_HANDLE(ISUB_R)
INST_HANDLE(ISUB_M)
INST_HANDLE(IMUL_9C)
INST_HANDLE(IMUL_R)
INST_HANDLE(IMUL_M)
INST_HANDLE(IMULH_R)
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
INST_HANDLE(IDIV_C)
INST_HANDLE(ISDIV_C)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
//Common floating point
INST_HANDLE(FPSWAP_R)
//Floating point group F
INST_HANDLE(FPADD_R)
INST_HANDLE(FPADD_M)
INST_HANDLE(FPSUB_R)
INST_HANDLE(FPSUB_M)
INST_HANDLE(FPNEG_R)
//Floating point group E
INST_HANDLE(FPMUL_R)
INST_HANDLE(FPMUL_M)
INST_HANDLE(FPDIV_R)
INST_HANDLE(FPDIV_M)
INST_HANDLE(FPSQRT_R)
//Control
INST_HANDLE(COND_R)
INST_HANDLE(COND_M)
INST_HANDLE(CFROUND)
};
}

View File

@ -24,21 +24,17 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
class Instruction;
typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const;
class Instruction {
public:
uint8_t opcode;
uint8_t loca;
uint8_t rega;
uint8_t locb;
uint8_t regb;
uint8_t locc;
uint8_t regc;
uint8_t imm8;
int32_t addra;
union {
uint32_t addrc;
uint8_t dst;
uint8_t src;
uint8_t alt;
int32_t imm32;
};
const char* getName() const {
return names[opcode];
}
@ -49,8 +45,46 @@ namespace RandomX {
private:
void print(std::ostream&) const;
static const char* names[256];
static InstructionVisualizer engine[256];
void genAddressReg(std::ostream& os) const;
void genAddressImm(std::ostream& os) const;
void h_IADD_R(std::ostream&) const;
void h_IADD_M(std::ostream&) const;
void h_IADD_RC(std::ostream&) const;
void h_ISUB_R(std::ostream&) const;
void h_ISUB_M(std::ostream&) const;
void h_IMUL_9C(std::ostream&) const;
void h_IMUL_R(std::ostream&) const;
void h_IMUL_M(std::ostream&) const;
void h_IMULH_R(std::ostream&) const;
void h_IMULH_M(std::ostream&) const;
void h_ISMULH_R(std::ostream&) const;
void h_ISMULH_M(std::ostream&) const;
void h_IDIV_C(std::ostream&) const;
void h_ISDIV_C(std::ostream&) const;
void h_INEG_R(std::ostream&) const;
void h_IXOR_R(std::ostream&) const;
void h_IXOR_M(std::ostream&) const;
void h_IROR_R(std::ostream&) const;
void h_IROL_R(std::ostream&) const;
void h_FPSWAP_R(std::ostream&) const;
void h_FPADD_R(std::ostream&) const;
void h_FPADD_M(std::ostream&) const;
void h_FPSUB_R(std::ostream&) const;
void h_FPSUB_M(std::ostream&) const;
void h_FPNEG_R(std::ostream&) const;
void h_FPMUL_R(std::ostream&) const;
void h_FPMUL_M(std::ostream&) const;
void h_FPDIV_R(std::ostream&) const;
void h_FPDIV_M(std::ostream&) const;
void h_FPSQRT_R(std::ostream&) const;
void h_COND_R(std::ostream&) const;
void h_COND_M(std::ostream&) const;
void h_CFROUND(std::ostream&) const;
};
static_assert(sizeof(Instruction) == 16, "Invalid alignment of struct Instruction");
static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction");
}

View File

@ -130,333 +130,10 @@ namespace RandomX {
#endif
}
convertible_t InterpretedVirtualMachine::loada(Instruction& instr) {
convertible_t& rega = reg.r[instr.rega % RegistersCount];
rega.i64 ^= instr.addra; //sign-extend addra
addr_t addr = rega.u32;
if ((ic % 64) == 0) {
addr = currentTransform->apply(addr);
#ifdef STATS
datasetAccess[mem.ma / (DatasetBlockCount / 256) / CacheLineSize]++;
#endif
readDataset(addr, mem, reg);
}
if ((instr.loca & 192) == 0) {
mem.mx ^= addr;
}
if (instr.loca & 3) {
return scratchpad[addr % ScratchpadL1];
}
else {
return scratchpad[addr % ScratchpadL2];
}
}
convertible_t InterpretedVirtualMachine::loadbia(Instruction& instr) {
if (instr.locb & 3) {
return reg.r[instr.regb % RegistersCount];
}
else {
convertible_t temp;
temp.i64 = instr.imm32; //sign-extend imm32
return temp;
}
}
convertible_t InterpretedVirtualMachine::loadbiashift(Instruction& instr) {
if (instr.locb & 1) {
return reg.r[instr.regb % RegistersCount];
}
else {
convertible_t temp;
temp.u64 = instr.imm8;
return temp;
}
}
convertible_t InterpretedVirtualMachine::loadbiadiv(Instruction& instr) {
if (instr.locb & 3) {
convertible_t temp;
temp.u64 = instr.imm32;
return temp;
}
else {
return reg.r[instr.regb % RegistersCount];
}
}
convertible_t& InterpretedVirtualMachine::getcr(Instruction& inst) {
addr_t addr;
switch (inst.locc & 7)
{
case 0:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL2];
case 1:
case 2:
case 3:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL1];
case 4:
case 5:
case 6:
case 7:
return reg.r[inst.regc % RegistersCount];
}
}
void InterpretedVirtualMachine::writecf(Instruction& inst, fpu_reg_t& regc) {
addr_t addr;
switch (inst.locc & 7)
{
case 4:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL2] = (inst.locc & 8) ? regc.hi : regc.lo;
break;
case 5:
case 6:
case 7:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL1] = (inst.locc & 8) ? regc.hi : regc.lo;
default:
break;
}
}
#define ALU_RETIRE(x) x(a, b, c); \
if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
#define CHECK_NOP_FPDIV(b, c)
#ifndef STATS
#define CHECK_NOP_FPADD(b, c)
#define CHECK_NOP_FPSUB(b, c)
#define CHECK_NOP_FPMUL(b, c)
#else
#define CHECK_NOP_FPADD(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPADD_nop += loeq + hieq; if(loeq && hieq) count_FPADD_nop2++;
#define CHECK_NOP_FPSUB(b, c) bool loeq = ((b.lo.u64 & INT64_MAX) == (c.lo.u64 & INT64_MAX)); bool hieq = ((b.hi.u64 & INT64_MAX) == (c.hi.u64 & INT64_MAX)); count_FPSUB_nop += loeq + hieq; if(loeq && hieq) count_FPSUB_nop2++;
#define CHECK_NOP_FPMUL(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPMUL_nop += loeq + hieq; if(loeq && hieq) count_FPMUL_nop2++;
#endif
#define FPU_RETIRE(x) x(a, b, c); \
writecf(inst, c); \
if(trace) { \
std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl; \
} \
if(fpuCheck) { \
if(c.hi.f64 != c.hi.f64 || c.lo.f64 != c.lo.f64) { \
std::stringstream ss; \
ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
throw std::runtime_error(ss.str()); \
} else if (std::fpclassify(c.hi.f64) == FP_SUBNORMAL || std::fpclassify(c.lo.f64) == FP_SUBNORMAL) {\
std::stringstream ss; \
ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
throw std::runtime_error(ss.str()); \
} \
}
#ifdef STATS
#define INC_COUNT(x) count_##x++;
#else
#define INC_COUNT(x)
#endif
#define FPU_RETIRE_FPSQRT(x) FPSQRT(a, b, c); \
writecf(inst, c); \
if(trace) std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl;
#define FPU_RETIRE_FPROUND(x) FPROUND(a, b, c); \
writecflo(inst, c); \
if(trace) std::cout << std::hex << c.lo.u64 << std::endl;
#define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \
convertible_t a = loada(inst); \
convertible_t b = loadbia(inst); \
convertible_t& c = getcr(inst); \
ALU_RETIRE(x) \
}
#define ALU_INST_SR(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \
convertible_t a = loada(inst); \
convertible_t b = loadbiashift(inst); \
convertible_t& c = getcr(inst); \
ALU_RETIRE(x) \
}
#define ALU_INST_DIV(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \
convertible_t a = loada(inst); \
convertible_t b = loadbiadiv(inst); \
convertible_t& c = getcr(inst); \
ALU_RETIRE(x) \
}
#define FPU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \
convertible_t a = loada(inst); \
fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \
fpu_reg_t btemp = b; \
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE(x) \
CHECK_NOP_##x(btemp, c) \
}
#define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \
convertible_t a = loada(inst); \
fpu_reg_t b; \
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE_##x(x) \
}
ALU_INST(ADD_64)
ALU_INST(ADD_32)
ALU_INST(SUB_64)
ALU_INST(SUB_32)
ALU_INST(MUL_64)
ALU_INST(MULH_64)
ALU_INST(MUL_32)
ALU_INST(IMUL_32)
ALU_INST(IMULH_64)
ALU_INST_DIV(DIV_64)
ALU_INST_DIV(IDIV_64)
ALU_INST(AND_64)
ALU_INST(AND_32)
ALU_INST(OR_64)
ALU_INST(OR_32)
ALU_INST(XOR_64)
ALU_INST(XOR_32)
ALU_INST_SR(SHL_64)
ALU_INST_SR(SHR_64)
ALU_INST_SR(SAR_64)
ALU_INST_SR(ROL_64)
ALU_INST_SR(ROR_64)
FPU_INST(FPADD)
FPU_INST(FPSUB)
FPU_INST(FPMUL)
FPU_INST(FPDIV)
FPU_INST_NB(FPSQRT)
void InterpretedVirtualMachine::h_FPROUND(Instruction& inst) {
convertible_t a = loada(inst);
convertible_t& c = getcr(inst);
c.u64 = a.u64;
if (trace) std::cout << std::hex << a.u64 << std::endl;
FPROUND(a, inst.imm8);
}
void InterpretedVirtualMachine::h_JUMP(Instruction& inst) {
convertible_t a = loada(inst);
convertible_t& c = getcr(inst);
c.u64 = a.u64;
if (trace) std::cout << std::hex << a.u64 << std::endl;
if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) {
#ifdef STATS
count_JUMP_taken++;
count_jump_taken[inst.locb & 7]++;
#endif
pc += (inst.imm8 & 127) + 1;
pc = pc % ProgramLength;
}
#ifdef STATS
else {
count_JUMP_not_taken++;
count_jump_not_taken[inst.locb & 7]++;
}
#endif
}
void InterpretedVirtualMachine::h_CALL(Instruction& inst) {
convertible_t a = loada(inst);
convertible_t& c = getcr(inst);
c.u64 = a.u64;
if (trace) std::cout << std::hex << a.u64 << std::endl;
if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) {
#ifdef STATS
count_CALL_taken++;
count_jump_taken[inst.locb & 7]++;
count_retdepth = std::max(0, count_retdepth - 1);
#endif
stackPush(pc);
#ifdef STATS
count_max_stack = std::max(count_max_stack, (int)stack.size());
#endif
pc += (inst.imm8 & 127) + 1;
pc = pc % ProgramLength;
}
#ifdef STATS
else {
count_CALL_not_taken++;
count_jump_not_taken[inst.locb & 7]++;
}
#endif
}
void InterpretedVirtualMachine::h_RET(Instruction& inst) {
convertible_t a = loada(inst);
convertible_t& c = getcr(inst);
c.u64 = a.u64;
if (trace) std::cout << std::hex << a.u64 << std::endl;
if (stack.size() > 0) {
#ifdef STATS
count_RET_taken++;
count_retdepth++;
count_retdepth_max = std::max(count_retdepth_max, count_retdepth);
#endif
auto raddr = stackPopAddress();
pc = raddr;
}
#ifdef STATS
else {
count_RET_stack_empty++;
}
#endif
}
#include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&InterpretedVirtualMachine::h_##x, WT(x))
InstructionHandler InterpretedVirtualMachine::engine[256] = {
INST_HANDLE(ADD_64)
INST_HANDLE(ADD_32)
INST_HANDLE(SUB_64)
INST_HANDLE(SUB_32)
INST_HANDLE(MUL_64)
INST_HANDLE(MULH_64)
INST_HANDLE(MUL_32)
INST_HANDLE(IMUL_32)
INST_HANDLE(IMULH_64)
INST_HANDLE(DIV_64)
INST_HANDLE(IDIV_64)
INST_HANDLE(AND_64)
INST_HANDLE(AND_32)
INST_HANDLE(OR_64)
INST_HANDLE(OR_32)
INST_HANDLE(XOR_64)
INST_HANDLE(XOR_32)
INST_HANDLE(SHL_64)
INST_HANDLE(SHR_64)
INST_HANDLE(SAR_64)
INST_HANDLE(ROL_64)
INST_HANDLE(ROR_64)
INST_HANDLE(FPADD)
INST_HANDLE(FPSUB)
INST_HANDLE(FPMUL)
INST_HANDLE(FPDIV)
INST_HANDLE(FPSQRT)
INST_HANDLE(FPROUND)
INST_HANDLE(JUMP)
INST_HANDLE(CALL)
INST_HANDLE(RET)
};
}

View File

@ -38,9 +38,9 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
#if !defined(_M_X64) && !defined(__x86_64__)
#if true || !defined(_M_X64) && !defined(__x86_64__)
JitCompilerX86::JitCompilerX86() {
throw std::runtime_error("JIT compiler only supports x86-64 CPUs");
//throw std::runtime_error("JIT compiler only supports x86-64 CPUs");
}
void JitCompilerX86::generateProgram(Pcg32& gen) {

View File

@ -30,7 +30,6 @@ namespace RandomX {
void Program::print(std::ostream& os) const {
for (int i = 0; i < RandomX::ProgramLength; ++i) {
auto instr = programBuffer[i];
os << std::dec << instr.getName() << " (" << i << "):" << std::endl;
os << instr;
}
}

View File

@ -46,7 +46,7 @@ namespace RandomX {
constexpr int CacheBlockCount = CacheSize / CacheLineSize;
constexpr int BlockExpansionRatio = DatasetSize / CacheSize;
constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount;
constexpr int DatasetIterations = 10;
constexpr int DatasetIterations = 3;
#ifdef TRACE
@ -72,13 +72,15 @@ namespace RandomX {
convertible_t hi;
};
constexpr int ProgramLength = 512;
constexpr uint32_t InstructionCount = 1024 * 1024;
constexpr int ProgramLength = 256;
constexpr uint32_t InstructionCount = 1024;
constexpr uint32_t ScratchpadSize = 1024 * 1024;
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t ScratchpadL1 = ScratchpadSize / 64 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL2 = ScratchpadSize / 4 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t);
constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8;
constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8;
constexpr uint32_t TransformationCount = 90;
constexpr int RegistersCount = 8;
@ -118,17 +120,19 @@ namespace RandomX {
struct RegisterFile {
convertible_t r[RegistersCount];
fpu_reg_t f[RegistersCount];
fpu_reg_t f[RegistersCount / 2];
fpu_reg_t g[RegistersCount / 2];
fpu_reg_t a[RegistersCount / 2];
};
static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct RandomX::RegisterFile");
typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&);
typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*);
extern "C" {
void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, DatasetReadFunc);
void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
}
}

View File

@ -21,16 +21,24 @@ _RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE
PUBLIC executeProgram
ALIGN 16
minDbl:
db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
absMask:
db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
signMask:
db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128
executeProgram PROC
; REGISTER ALLOCATION:
; rax -> temporary
; rbx -> "ic"
; rcx -> temporary
; rdx -> temporary
; rsi -> convertible_t& scratchpad
; rdi -> beginning of VM stack
; rsi -> scratchpad pointer
; rdi -> dataset pointer
; rbp -> "ma", "mx"
; rsp -> end of VM stack
; rsp -> stack pointer
; r8 -> "r0"
; r9 -> "r1"
; r10 -> "r2"
@ -39,32 +47,22 @@ executeProgram PROC
; r13 -> "r5"
; r14 -> "r6"
; r15 -> "r7"
; xmm0 -> temporary
; xmm1 -> temporary
; xmm0 -> "f0"
; xmm1 -> "f1"
; xmm2 -> "f2"
; xmm3 -> "f3"
; xmm4 -> "f4"
; xmm5 -> "f5"
; xmm6 -> "f6"
; xmm7 -> "f7"
; xmm8 -> "f0"
; xmm9 -> "f1"
; xmm10 -> absolute value mask
; STACK STRUCTURE:
; |
; |
; | saved registers
; |
; v
; [rbx+8] RegisterFile& registerFile
; [rbx+0] uint8_t* dataset
; |
; |
; | VM stack
; |
; v
; [rsp] last element of VM stack
; xmm4 -> "e0"
; xmm5 -> "e1"
; xmm6 -> "e2"
; xmm7 -> "e3"
; xmm8 -> "a0"
; xmm9 -> "a1"
; xmm10 -> "a2"
; xmm11 -> "a3"
; xmm12 -> temporary
; xmm13 -> DBL_MIN
; xmm14 -> absolute value mask
; xmm15 -> sign mask
; store callee-saved registers
push rbx
@ -81,100 +79,117 @@ executeProgram PROC
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm11
movdqu xmmword ptr [rsp+48], xmm12
movdqu xmmword ptr [rsp+32], xmm13
movdqu xmmword ptr [rsp+16], xmm14
movdqu xmmword ptr [rsp+0], xmm15
; function arguments
push rcx ; RegisterFile& registerFile
mov rbp, qword ptr [rdx] ; "mx", "ma"
mov rax, qword ptr [rdx+8] ; uint8_t* dataset
push rax
mov eax, ebp ; "mx"
mov rdi, qword ptr [rdx+8] ; uint8_t* dataset
mov rsi, r8 ; convertible_t* scratchpad
mov rbx, r9 ; loop counter
mov rdi, rsp ; beginning of VM stack
mov ebx, 1048577 ; number of VM instructions to execute + 1
;# zero integer registers
xor r8, r8
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
xorps xmm10, xmm10
cmpeqpd xmm10, xmm10
psrlq xmm10, 1 ; mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
;# load constant registers
lea rcx, [rcx+120]
movapd xmm8, xmmword ptr [rcx+72]
movapd xmm9, xmmword ptr [rcx+88]
movapd xmm10, xmmword ptr [rcx+104]
movapd xmm11, xmmword ptr [rcx+120]
movapd xmm13, xmmword ptr [minDbl]
movapd xmm14, xmmword ptr [absMask]
movapd xmm15, xmmword ptr [signMask]
; reset rounding mode
mov dword ptr [rsp-8], 40896
ldmxcsr dword ptr [rsp-8]
; load integer registers
mov r8, qword ptr [rcx+0]
mov r9, qword ptr [rcx+8]
mov r10, qword ptr [rcx+16]
mov r11, qword ptr [rcx+24]
mov r12, qword ptr [rcx+32]
mov r13, qword ptr [rcx+40]
mov r14, qword ptr [rcx+48]
mov r15, qword ptr [rcx+56]
; load register f0 hi, lo
xorps xmm8, xmm8
cvtsi2sd xmm8, qword ptr [rcx+72]
pslldq xmm8, 8
cvtsi2sd xmm8, qword ptr [rcx+64]
; load register f1 hi, lo
xorps xmm9, xmm9
cvtsi2sd xmm9, qword ptr [rcx+88]
pslldq xmm9, 8
cvtsi2sd xmm9, qword ptr [rcx+80]
; load register f2 hi, lo
xorps xmm2, xmm2
cvtsi2sd xmm2, qword ptr [rcx+104]
pslldq xmm2, 8
cvtsi2sd xmm2, qword ptr [rcx+96]
; load register f3 hi, lo
xorps xmm3, xmm3
cvtsi2sd xmm3, qword ptr [rcx+120]
pslldq xmm3, 8
cvtsi2sd xmm3, qword ptr [rcx+112]
lea rcx, [rcx+64]
; load register f4 hi, lo
xorps xmm4, xmm4
cvtsi2sd xmm4, qword ptr [rcx+72]
pslldq xmm4, 8
cvtsi2sd xmm4, qword ptr [rcx+64]
; load register f5 hi, lo
xorps xmm5, xmm5
cvtsi2sd xmm5, qword ptr [rcx+88]
pslldq xmm5, 8
cvtsi2sd xmm5, qword ptr [rcx+80]
; load register f6 hi, lo
xorps xmm6, xmm6
cvtsi2sd xmm6, qword ptr [rcx+104]
pslldq xmm6, 8
cvtsi2sd xmm6, qword ptr [rcx+96]
; load register f7 hi, lo
xorps xmm7, xmm7
cvtsi2sd xmm7, qword ptr [rcx+120]
pslldq xmm7, 8
cvtsi2sd xmm7, qword ptr [rcx+112]
jmp program_begin
; program body
ALIGN 64
program_begin:
xor eax, r8d ;# read address register 1
and eax, 262080
lea rcx, [rsi+rax]
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
xor eax, r9d ;# read address register 2
and eax, 262080
lea rcx, [rsi+rax]
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16]
cvtdq2pd xmm3, qword ptr [rcx+24]
cvtdq2pd xmm4, qword ptr [rcx+32]
cvtdq2pd xmm5, qword ptr [rcx+40]
cvtdq2pd xmm6, qword ptr [rcx+48]
cvtdq2pd xmm7, qword ptr [rcx+56]
andps xmm4, xmm14
andps xmm5, xmm14
andps xmm6, xmm14
andps xmm7, xmm14
;# 256 instructions
include program.inc
ALIGN 64
rx_finish:
; unroll the stack
mov rsp, rdi
mov eax, r8d ;# read address register 1
xor eax, r9d ;# read address register 2
xor rbp, rax ;# modify "mx"
and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma
lea rcx, [rdi+rdx] ;# dataset cache line
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
mov eax, r12d ;# write address register 1
and eax, 262080
lea rcx, [rsi+rax]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
xor eax, r13d ;# write address register 2
and eax, 262080
lea rcx, [rsi+rax]
mulpd xmm0, xmm4
mulpd xmm1, xmm5
mulpd xmm2, xmm6
mulpd xmm3, xmm7
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3
dec ebx
jnz program_begin
rx_finish:
; save VM register values
pop rcx
pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
@ -183,8 +198,8 @@ rx_finish:
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
movdqa xmmword ptr [rcx+64], xmm8
movdqa xmmword ptr [rcx+80], xmm9
movdqa xmmword ptr [rcx+64], xmm0
movdqa xmmword ptr [rcx+80], xmm1
movdqa xmmword ptr [rcx+96], xmm2
movdqa xmmword ptr [rcx+112], xmm3
lea rcx, [rcx+64]
@ -194,6 +209,12 @@ rx_finish:
movdqa xmmword ptr [rcx+112], xmm7
; load callee-saved registers
movdqu xmm15, xmmword ptr [rsp]
movdqu xmm14, xmmword ptr [rsp+16]
movdqu xmm13, xmmword ptr [rsp+32]
movdqu xmm12, xmmword ptr [rsp+48]
movdqu xmm11, xmmword ptr [rsp+64]
add rsp, 80
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]

View File

@ -19,46 +19,58 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once
#define WT_ADD_64 12
#define WT_ADD_32 2
#define WT_SUB_64 12
#define WT_SUB_32 2
#define WT_MUL_64 23
#define WT_MULH_64 5
#define WT_MUL_32 15
#define WT_IMUL_32 15
#define WT_IMULH_64 3
#define WT_DIV_64 8
#define WT_IDIV_64 8
#define WT_AND_64 4
#define WT_AND_32 2
#define WT_OR_64 4
#define WT_OR_32 2
#define WT_XOR_64 4
#define WT_XOR_32 2
#define WT_SHL_64 3
#define WT_SHR_64 3
#define WT_SAR_64 3
#define WT_ROL_64 6
#define WT_ROR_64 6
#define WT_FPADD 20
#define WT_FPSUB 20
#define WT_FPMUL 22
#define WT_FPDIV 8
#define WT_FPSQRT 6
#define WT_FPROUND 2
#define WT_JUMP 11
#define WT_CALL 11
#define WT_RET 12
//Integer
#define WT_IADD_R 10
#define WT_IADD_M 3
#define WT_IADD_RC 12
#define WT_ISUB_R 10
#define WT_ISUB_M 3
#define WT_IMUL_9C 12
#define WT_IMUL_R 24
#define WT_IMUL_M 8
#define WT_IMULH_R 6
#define WT_IMULH_M 2
#define WT_ISMULH_R 6
#define WT_ISMULH_M 2
#define WT_IDIV_C 4
#define WT_ISDIV_C 2
#define WT_INEG_R 4
#define WT_IXOR_R 15
#define WT_IXOR_M 5
#define WT_IROR_R 10
#define WT_IROL_R 10
//Common floating point
#define WT_FPSWAP_R 6
//Floating point group F
#define WT_FPADD_R 18
#define WT_FPADD_M 3
#define WT_FPSUB_R 18
#define WT_FPSUB_M 3
#define WT_FPNEG_R 5
//Floating point group E
#define WT_FPMUL_R 18
#define WT_FPMUL_M 3
#define WT_FPDIV_R 6
#define WT_FPDIV_M 1
#define WT_FPSQRT_R 6
//Control
#define WT_COND_R 15
#define WT_COND_M 5
#define WT_CFROUND 1
#define WT_NOP 0
constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \
WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \
WT_DIV_64 + WT_IDIV_64 + WT_AND_64 + WT_AND_32 + WT_OR_64 + \
WT_OR_32 + WT_XOR_64 + WT_XOR_32 + WT_SHL_64 + WT_SHR_64 + \
WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \
+ WT_FPDIV + WT_FPSQRT + WT_FPROUND + WT_JUMP + WT_CALL + WT_RET + WT_NOP;
constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \
WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \
WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_NOP;
static_assert(wtSum == 256,
"Sum of instruction weights must be 256");

View File

@ -34,6 +34,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include <atomic>
#include "dataset.hpp"
#include "Cache.hpp"
#include "Pcg32.hpp"
const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };
@ -130,6 +131,27 @@ void generateAsm(int nonce) {
asmX86.printCode(std::cout);
}
void generateNative(int nonce) {
uint64_t hash[4];
unsigned char blockTemplate[] = {
0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14,
0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e,
0xea, 0x00, 0x00, 0x00, 0x00, 0x77, 0xb2, 0x06, 0xa0, 0x2c, 0xa5, 0xb1, 0xd4, 0xce, 0x6b, 0xbf, 0xdf, 0x0a, 0xca,
0xc3, 0x8b, 0xde, 0xd3, 0x4d, 0x2d, 0xcd, 0xee, 0xf9, 0x5c, 0xd2, 0x0c, 0xef, 0xc1, 0x2f, 0x61, 0xd5, 0x61, 0x09
};
int* noncePtr = (int*)(blockTemplate + 39);
*noncePtr = nonce;
blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
RandomX::Program prog;
Pcg32 gen(hash);
prog.initialize(gen);
for (int i = 0; i < RandomX::ProgramLength; ++i) {
prog(i).dst %= 8;
prog(i).src %= 8;
}
std::cout << prog << std::endl;
}
void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) {
uint64_t hash[4];
unsigned char blockTemplate[] = {
@ -147,18 +169,16 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8);
vm->initializeScratchpad(scratchpad, spIndex);
vm->initializeProgram(hash);
//vm->initializeProgram(hash);
//dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt");
vm->setScratchpad(scratchpad + 3 * RandomX::ScratchpadSize / 4);
vm->execute();
vm->setScratchpad(scratchpad + 2 * RandomX::ScratchpadSize / 4);
for (int chain = 0; chain < 16; ++chain) {
vm->initializeProgram(hash);
int segment = hash[3] & 3;
vm->setScratchpad(scratchpad);// +segment * RandomX::ScratchpadSize / 4);
vm->execute();
vm->getResult(nullptr, 0, hash);
vm->initializeProgram(hash);
vm->setScratchpad(scratchpad + 1 * RandomX::ScratchpadSize / 4);
vm->execute();
vm->setScratchpad(scratchpad + 0 * RandomX::ScratchpadSize / 4);
vm->execute();
}
//vm->initializeProgram(hash);
vm->getResult(scratchpad, RandomX::ScratchpadSize, hash);
result.xorWith(hash);
if (RandomX::trace) {
@ -171,7 +191,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
}
int main(int argc, char** argv) {
bool softAes, lightClient, genAsm, compiled, help, largePages, async, aesBench;
bool softAes, lightClient, genAsm, compiled, help, largePages, async, aesBench, genNative;
int programCount, threadCount;
readOption("--help", argc, argv, help);
@ -189,12 +209,18 @@ int main(int argc, char** argv) {
readOption("--largePages", argc, argv, largePages);
readOption("--async", argc, argv, async);
readOption("--aesBench", argc, argv, aesBench);
readOption("--genNative", argc, argv, genNative);
if (genAsm) {
generateAsm(programCount);
return 0;
}
if (genNative) {
generateNative(programCount);
return 0;
}
if (softAes)
std::cout << "Using software AES." << std::endl;

File diff suppressed because it is too large Load Diff