From 8b1102ee05200c0b22b8c89b6b204142725e1958 Mon Sep 17 00:00:00 2001 From: tevador Date: Tue, 15 Jan 2019 00:01:11 +0100 Subject: [PATCH] Interpreter + async mode --- src/AddressTransform.cpp | 292 ++++++++++++++++++++++++++++++ src/AssemblyGeneratorX86.cpp | 4 +- src/Cache.hpp | 2 +- src/CompiledVirtualMachine.cpp | 13 +- src/CompiledVirtualMachine.hpp | 5 +- src/InterpretedVirtualMachine.cpp | 246 +++++++++++++++---------- src/InterpretedVirtualMachine.hpp | 31 +++- src/JitCompilerX86.cpp | 6 +- src/LightClientAsyncWorker.cpp | 94 ++++++++++ src/LightClientAsyncWorker.hpp | 52 ++++++ src/VirtualMachine.cpp | 56 +----- src/VirtualMachine.hpp | 9 +- src/common.hpp | 23 ++- src/dataset.cpp | 48 +++-- src/dataset.hpp | 6 +- src/instructions.hpp | 8 +- src/instructionsPortable.cpp | 6 +- src/main.cpp | 16 +- 18 files changed, 702 insertions(+), 215 deletions(-) create mode 100644 src/AddressTransform.cpp create mode 100644 src/LightClientAsyncWorker.cpp create mode 100644 src/LightClientAsyncWorker.hpp diff --git a/src/AddressTransform.cpp b/src/AddressTransform.cpp new file mode 100644 index 0000000..b8070a0 --- /dev/null +++ b/src/AddressTransform.cpp @@ -0,0 +1,292 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include "common.hpp" +#include "InterpretedVirtualMachine.hpp" + +#include +#include +#include + +namespace RandomX { + + class Mul9Transform : public ITransform { + public: + Mul9Transform(int32_t cc) : c(cc) { + std::ostringstream oss; + oss << "mul9_" << std::hex << (cc & 255); + name = oss.str(); + } + int32_t apply(int32_t x) const override { + return 9 * x + c; + } + const char* getName() const override { + return name.c_str(); + } + std::ostream& printAsm(std::ostream& os) const override { + os << "lea ecx, [rcx+rcx*8" << std::showpos << c << "]" << std::noshowpos << std::endl; + return os; + } + std::ostream& printCxx(std::ostream& os) const override { + os << "static const Mul9Transform " << name << "(" << c << ");" << std::endl; + return os; + } + private: + int32_t c; + std::string name; + }; + + class AddTransform : public ITransform { + public: + AddTransform(int32_t cc) : c(cc) { + std::ostringstream oss; + oss << "add_" << std::hex << (cc & 255); + name = oss.str(); + } + int32_t apply(int32_t x) const override { + return x + c; + } + const char* getName() const override { + return name.c_str(); + } + std::ostream& printAsm(std::ostream& os) const override { + os << "db 64" << std::endl; + os << "add ecx, " << c << std::endl; + return os; + } + std::ostream& printCxx(std::ostream& os) const override { + os << "static const AddTransform " << name << "(" << c << ");" << std::endl; + return os; + } + private: + int32_t c; + std::string name; + }; + + class XorTransform : public ITransform { + public: + XorTransform(int32_t cc) : c(cc) { + std::ostringstream oss; + oss << "xor_" << std::hex << (cc & 255); + name = oss.str(); + } + int32_t apply(int32_t x) const override { + return x ^ c; + } + const char* getName() const override { + return name.c_str(); + } + std::ostream& printAsm(std::ostream& os) const override { + os << "db 64" << std::endl; + os << "xor ecx, " << c << std::endl; + return os; + } + std::ostream& printCxx(std::ostream& os) 
const override { + os << "static const XorTransform " << name << "(" << c << ");" << std::endl; + return os; + } + private: + int32_t c; + std::string name; + }; + + static const Mul9Transform mul9_6d(109); + static const XorTransform xor_60(96); + static const Mul9Transform mul9_ed(-19); + static const AddTransform add_9e(-98); + static const AddTransform add_eb(-21); + static const XorTransform xor_b0(-80); + static const Mul9Transform mul9_a4(-92); + static const AddTransform add_71(113); + static const Mul9Transform mul9_64(100); + static const AddTransform add_d9(-39); + static const XorTransform xor_78(120); + static const Mul9Transform mul9_89(-119); + static const AddTransform add_8f(-113); + static const AddTransform add_6f(111); + static const XorTransform xor_68(104); + static const Mul9Transform mul9_ad(-83); + static const Mul9Transform mul9_7f(127); + static const XorTransform xor_90(-112); + static const AddTransform add_59(89); + static const AddTransform add_e0(-32); + static const AddTransform add_68(104); + static const XorTransform xor_88(-120); + static const XorTransform xor_18(24); + static const Mul9Transform mul9_9(9); + static const AddTransform add_e1(-31); + static const XorTransform xor_f0(-16); + static const AddTransform add_44(68); + static const Mul9Transform mul9_92(-110); + static const XorTransform xor_40(64); + static const XorTransform xor_d8(-40); + static const XorTransform xor_f8(-8); + static const AddTransform add_f6(-10); + static const XorTransform xor_e0(-32); + static const AddTransform add_e(14); + static const Mul9Transform mul9_d2(-46); + static const XorTransform xor_98(-104); + static const Mul9Transform mul9_24(36); + static const AddTransform add_64(100); + static const Mul9Transform mul9_bf(-65); + static const Mul9Transform mul9_1b(27); + static const Mul9Transform mul9_5b(91); + static const AddTransform add_9b(-101); + static const AddTransform add_a2(-94); + static const Mul9Transform mul9_f6(-10); + static 
const XorTransform xor_50(80); + static const AddTransform add_94(-108); + static const AddTransform add_c6(-58); + static const XorTransform xor_30(48); + static const Mul9Transform mul9_49(73); + static const XorTransform xor_d0(-48); + static const XorTransform xor_20(32); + static const XorTransform xor_a0(-96); + static const AddTransform add_76(118); + static const AddTransform add_5b(91); + static const Mul9Transform mul9_12(18); + static const AddTransform add_f5(-11); + static const Mul9Transform mul9_3f(63); + static const AddTransform add_72(114); + static const Mul9Transform mul9_2d(45); + static const AddTransform add_bd(-67); + static const AddTransform add_35(53); + static const Mul9Transform mul9_9b(-101); + static const Mul9Transform mul9_ff(-1); + static const XorTransform xor_10(16); + static const Mul9Transform mul9_db(-37); + static const Mul9Transform mul9_e4(-28); + static const Mul9Transform mul9_c9(-55); + static const XorTransform xor_a8(-88); + static const XorTransform xor_b8(-72); + static const AddTransform add_24(36); + static const XorTransform xor_c8(-56); + static const AddTransform add_74(116); + static const XorTransform xor_58(88); + static const XorTransform xor_80(-128); + static const AddTransform add_32(50); + static const AddTransform add_69(105); + static const AddTransform add_db(-37); + static const XorTransform xor_70(112); + static const XorTransform xor_8(8); + static const XorTransform xor_e8(-24); + static const Mul9Transform mul9_76(118); + static const XorTransform xor_48(72); + static const XorTransform xor_c0(-64); + static const AddTransform add_28(40); + static const Mul9Transform mul9_b6(-74); + static const Mul9Transform mul9_52(82); + static const Mul9Transform mul9_36(54); + static const XorTransform xor_38(56); + static const XorTransform xor_28(40); + static const AddTransform add_57(87); + + const ITransform* InterpretedVirtualMachine::addressTransformations[TransformationCount] = { + 
(ITransform*)&mul9_6d, + (ITransform*)&xor_60, + (ITransform*)&mul9_ed, + (ITransform*)&add_9e, + (ITransform*)&add_eb, + (ITransform*)&xor_b0, + (ITransform*)&mul9_a4, + (ITransform*)&add_71, + (ITransform*)&mul9_64, + (ITransform*)&add_d9, + (ITransform*)&xor_78, + (ITransform*)&mul9_89, + (ITransform*)&add_8f, + (ITransform*)&add_6f, + (ITransform*)&xor_68, + (ITransform*)&mul9_ad, + (ITransform*)&mul9_7f, + (ITransform*)&xor_90, + (ITransform*)&add_59, + (ITransform*)&add_e0, + (ITransform*)&add_68, + (ITransform*)&xor_88, + (ITransform*)&xor_18, + (ITransform*)&mul9_9, + (ITransform*)&add_e1, + (ITransform*)&xor_f0, + (ITransform*)&add_44, + (ITransform*)&mul9_92, + (ITransform*)&xor_40, + (ITransform*)&xor_d8, + (ITransform*)&xor_f8, + (ITransform*)&add_f6, + (ITransform*)&xor_e0, + (ITransform*)&add_e, + (ITransform*)&mul9_d2, + (ITransform*)&xor_98, + (ITransform*)&mul9_24, + (ITransform*)&add_64, + (ITransform*)&mul9_bf, + (ITransform*)&mul9_1b, + (ITransform*)&mul9_5b, + (ITransform*)&add_9b, + (ITransform*)&add_a2, + (ITransform*)&mul9_f6, + (ITransform*)&xor_50, + (ITransform*)&add_94, + (ITransform*)&add_c6, + (ITransform*)&xor_30, + (ITransform*)&mul9_49, + (ITransform*)&xor_d0, + (ITransform*)&xor_20, + (ITransform*)&xor_a0, + (ITransform*)&add_76, + (ITransform*)&add_5b, + (ITransform*)&mul9_12, + (ITransform*)&add_f5, + (ITransform*)&mul9_3f, + (ITransform*)&add_72, + (ITransform*)&mul9_2d, + (ITransform*)&add_bd, + (ITransform*)&add_35, + (ITransform*)&mul9_9b, + (ITransform*)&mul9_ff, + (ITransform*)&xor_10, + (ITransform*)&mul9_db, + (ITransform*)&mul9_e4, + (ITransform*)&mul9_c9, + (ITransform*)&xor_a8, + (ITransform*)&xor_b8, + (ITransform*)&add_24, + (ITransform*)&xor_c8, + (ITransform*)&add_74, + (ITransform*)&xor_58, + (ITransform*)&xor_80, + (ITransform*)&add_32, + (ITransform*)&add_69, + (ITransform*)&add_db, + (ITransform*)&xor_70, + (ITransform*)&xor_8, + (ITransform*)&xor_e8, + (ITransform*)&mul9_76, + (ITransform*)&xor_48, + 
(ITransform*)&xor_c0, + (ITransform*)&add_28, + (ITransform*)&mul9_b6, + (ITransform*)&mul9_52, + (ITransform*)&mul9_36, + (ITransform*)&xor_38, + (ITransform*)&xor_28, + (ITransform*)&add_57, + }; +} \ No newline at end of file diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 8a4a0a1..25ae7ef 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -67,12 +67,12 @@ namespace RandomX { void AssemblyGeneratorX86::gena(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; - if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rax" << std::endl; asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; asmCode << "\tcall rx_read" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; + if ((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rax" << std::endl; if (instr.loca & 3) { asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; } diff --git a/src/Cache.hpp b/src/Cache.hpp index 4137b97..8a2b93a 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -46,7 +46,7 @@ namespace RandomX { return keys; } - const uint8_t* getCache() { + const uint8_t* getCache() const { return memory; } private: diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index ef78d2f..28a3cca 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -25,15 +25,16 @@ along with RandomX. If not, see. 
namespace RandomX { - CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) { + CompiledVirtualMachine::CompiledVirtualMachine() { totalSize = 0; } - void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) { - if (lightClient) { - throw std::runtime_error("Compiled VM does not support light-client mode"); - } - VirtualMachine::setDataset(ds, lightClient); + void CompiledVirtualMachine::setDataset(dataset_t ds) { + mem.ds = ds; + } + + void CompiledVirtualMachine::initializeScratchpad(uint32_t index) { + memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); } void CompiledVirtualMachine::initializeProgram(const void* seed) { diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index a77bdb8..98b0b78 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -37,8 +37,9 @@ namespace RandomX { void operator delete(void* ptr) { _mm_free(ptr); } - CompiledVirtualMachine(bool softAes); - void setDataset(dataset_t ds, bool light = false) override; + CompiledVirtualMachine(); + void setDataset(dataset_t ds) override; + void initializeScratchpad(uint32_t index) override; void initializeProgram(const void* seed) override; virtual void execute() override; void* getProgram() { diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index a6a3a0c..54d2279 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -21,11 +21,15 @@ along with RandomX. If not, see. 
#include "InterpretedVirtualMachine.hpp" #include "Pcg32.hpp" #include "instructions.hpp" +#include "dataset.hpp" +#include "Cache.hpp" +#include "LightClientAsyncWorker.hpp" #include #include #include #include #include +#include #ifdef STATS #include #endif @@ -38,6 +42,57 @@ constexpr bool fpuCheck = false; namespace RandomX { + InterpretedVirtualMachine::~InterpretedVirtualMachine() { + if (asyncWorker) { + delete mem.ds.asyncWorker; + } + } + + void InterpretedVirtualMachine::setDataset(dataset_t ds) { + if (asyncWorker) { + if (softAes) { + mem.ds.asyncWorker = new LightClientAsyncWorker(ds.cache); + } + else { + mem.ds.asyncWorker = new LightClientAsyncWorker(ds.cache); + } + readDataset = &datasetReadLightAsync; + } + else { + mem.ds = ds; + if (softAes) { + readDataset = &datasetReadLight; + } + else { + readDataset = &datasetReadLight; + } + } + } + + void InterpretedVirtualMachine::initializeScratchpad(uint32_t index) { + uint32_t startingBlock = (ScratchpadSize / CacheLineSize) * index; + if (asyncWorker) { + ILightClientAsyncWorker* worker = mem.ds.asyncWorker; + const uint32_t blocksPerThread = (ScratchpadSize / CacheLineSize) / 2; + worker->prepareBlocks(scratchpad, startingBlock, blocksPerThread); //async first half + worker->getBlocks(scratchpad + ScratchpadLength / 2, startingBlock + blocksPerThread, blocksPerThread); //sync second half + worker->sync(); + } + else { + auto cache = mem.ds.cache; + if (softAes) { + for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); + } + } + else { + for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); + } + } + } + } + void InterpretedVirtualMachine::initializeProgram(const void* seed) { Pcg32 gen(seed); for (unsigned i = 0; i 
< sizeof(reg) / sizeof(Pcg32::result_type); ++i) { @@ -50,6 +105,7 @@ namespace RandomX { } //std::cout << reg; p.initialize(gen); + currentTransform = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; mem.mx = *(((uint32_t*)seed) + 5); pc = 0; @@ -74,61 +130,61 @@ namespace RandomX { #endif } - convertible_t InterpretedVirtualMachine::loada(Instruction& inst) { - convertible_t& rega = reg.r[inst.rega % RegistersCount]; - rega.i64 ^= inst.addra; //sign-extend addra + convertible_t InterpretedVirtualMachine::loada(Instruction& instr) { + convertible_t& rega = reg.r[instr.rega % RegistersCount]; + rega.i64 ^= instr.addra; //sign-extend addra addr_t addr = rega.u32; - switch (inst.loca & 7) - { - case 0: - case 1: - case 2: - case 3: - return readDataset(addr, mem); - case 4: - return scratchpad[addr % ScratchpadL2]; + if ((ic % 64) == 0) { + addr = currentTransform->apply(addr); +#ifdef STATS + datasetAccess[mem.ma / (DatasetBlockCount / 256) / CacheLineSize]++; +#endif + readDataset(addr, mem, reg); + } - case 5: - case 6: - case 7: - return scratchpad[addr % ScratchpadL1]; + if ((instr.loca & 192) == 0) { + mem.mx ^= addr; + } + + if (instr.loca & 3) { + return scratchpad[addr % ScratchpadL1]; + } + else { + return scratchpad[addr % ScratchpadL2]; } } - convertible_t InterpretedVirtualMachine::loadbr1(Instruction& inst) { - switch (inst.locb & 7) - { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - return reg.r[inst.regb % RegistersCount]; - case 6: - case 7: - convertible_t temp; - temp.i64 = inst.imm32; //sign-extend imm32 - return temp; + convertible_t InterpretedVirtualMachine::loadbia(Instruction& instr) { + if (instr.locb & 3) { + return reg.r[instr.regb % RegistersCount]; + } + else { + convertible_t temp; + temp.i64 = instr.imm32; //sign-extend imm32 + return temp; } } - convertible_t InterpretedVirtualMachine::loadbr0(Instruction& inst) { - switch (inst.locb & 7) - { - case 0: 
- case 1: - case 2: - case 3: - return reg.r[inst.regb % RegistersCount]; - case 4: - case 5: - case 6: - case 7: - convertible_t temp; - temp.u64 = inst.imm8; - return temp; + convertible_t InterpretedVirtualMachine::loadbiashift(Instruction& instr) { + if (instr.locb & 1) { + return reg.r[instr.regb % RegistersCount]; + } + else { + convertible_t temp; + temp.u64 = instr.imm8; + return temp; + } + } + + convertible_t InterpretedVirtualMachine::loadbiadiv(Instruction& instr) { + if (instr.locb & 3) { + convertible_t temp; + temp.u64 = instr.imm32; + return temp; + } + else { + return reg.r[instr.regb % RegistersCount]; } } @@ -174,26 +230,6 @@ namespace RandomX { } } - void InterpretedVirtualMachine::writecflo(Instruction& inst, fpu_reg_t& regc) { - addr_t addr; - switch (inst.locc & 7) - { - case 4: - addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; - scratchpad[addr % ScratchpadL2] = regc.lo; - break; - - case 5: - case 6: - case 7: - addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; - scratchpad[addr % ScratchpadL1] = regc.lo; - - default: - break; - } - } - #define ALU_RETIRE(x) x(a, b, c); \ if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl; @@ -242,7 +278,7 @@ namespace RandomX { #define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ INC_COUNT(x) \ convertible_t a = loada(inst); \ - convertible_t b = loadbr1(inst); \ + convertible_t b = loadbia(inst); \ convertible_t& c = getcr(inst); \ ALU_RETIRE(x) \ } @@ -250,7 +286,15 @@ namespace RandomX { #define ALU_INST_SR(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ INC_COUNT(x) \ convertible_t a = loada(inst); \ - convertible_t b = loadbr0(inst); \ + convertible_t b = loadbiashift(inst); \ + convertible_t& c = getcr(inst); \ + ALU_RETIRE(x) \ + } + +#define ALU_INST_DIV(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ + INC_COUNT(x) \ + convertible_t a = loada(inst); \ + convertible_t b = 
loadbiadiv(inst); \ convertible_t& c = getcr(inst); \ ALU_RETIRE(x) \ } @@ -282,8 +326,8 @@ namespace RandomX { ALU_INST(MUL_32) ALU_INST(IMUL_32) ALU_INST(IMULH_64) - ALU_INST(DIV_64) - ALU_INST(IDIV_64) + ALU_INST_DIV(DIV_64) + ALU_INST_DIV(IDIV_64) ALU_INST(AND_64) ALU_INST(AND_32) ALU_INST(OR_64) @@ -301,42 +345,68 @@ namespace RandomX { FPU_INST(FPSUB) FPU_INST(FPMUL) FPU_INST(FPDIV) - FPU_INST_NB(FPSQRT) - FPU_INST_NB(FPROUND) + + void InterpretedVirtualMachine::h_FPROUND(Instruction& inst) { + convertible_t a = loada(inst); + convertible_t& c = getcr(inst); + c.u64 = a.u64; + if (trace) std::cout << std::hex << a.u64 << std::endl; + FPROUND(a, inst.imm8); + } + + void InterpretedVirtualMachine::h_JUMP(Instruction& inst) { + convertible_t a = loada(inst); + convertible_t& c = getcr(inst); + c.u64 = a.u64; + if (trace) std::cout << std::hex << a.u64 << std::endl; + if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) { +#ifdef STATS + count_JUMP_taken++; + count_jump_taken[inst.locb & 7]++; +#endif + pc += (inst.imm8 & 127) + 1; + pc = pc % ProgramLength; + } +#ifdef STATS + else { + count_JUMP_not_taken++; + count_jump_not_taken[inst.locb & 7]++; + } +#endif + } void InterpretedVirtualMachine::h_CALL(Instruction& inst) { convertible_t a = loada(inst); + convertible_t& c = getcr(inst); + c.u64 = a.u64; + if (trace) std::cout << std::hex << a.u64 << std::endl; if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) { #ifdef STATS count_CALL_taken++; count_jump_taken[inst.locb & 7]++; count_retdepth = std::max(0, count_retdepth - 1); #endif - stackPush(a); stackPush(pc); #ifdef STATS count_max_stack = std::max(count_max_stack, (int)stack.size()); #endif pc += (inst.imm8 & 127) + 1; pc = pc % ProgramLength; - if (trace) std::cout << std::hex << a.u64 << std::endl; } - else { - convertible_t& c = getcr(inst); #ifdef STATS + else { count_CALL_not_taken++; count_jump_not_taken[inst.locb & 7]++; -#endif - c.u64 = a.u64; - if (trace) 
std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl; } +#endif } void InterpretedVirtualMachine::h_RET(Instruction& inst) { convertible_t a = loada(inst); - convertible_t b = loadbr1(inst); convertible_t& c = getcr(inst); + c.u64 = a.u64; + if (trace) std::cout << std::hex << a.u64 << std::endl; if (stack.size() > 0) { #ifdef STATS count_RET_taken++; @@ -344,22 +414,13 @@ namespace RandomX { count_retdepth_max = std::max(count_retdepth_max, count_retdepth); #endif auto raddr = stackPopAddress(); - auto retval = stackPopValue(); - c.u64 = a.u64 ^ retval.u64; pc = raddr; } - else { #ifdef STATS - if (stack.size() == 0) - count_RET_stack_empty++; - else { - count_RET_not_taken++; - count_jump_not_taken[inst.locb & 7]++; - } -#endif - c.u64 = a.u64; + else { + count_RET_stack_empty++; } - if (trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl; +#endif } #include "instructionWeights.hpp" @@ -394,6 +455,7 @@ namespace RandomX { INST_HANDLE(FPDIV) INST_HANDLE(FPSQRT) INST_HANDLE(FPROUND) + INST_HANDLE(JUMP) INST_HANDLE(CALL) INST_HANDLE(RET) }; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 8c34936..7745cad 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -25,23 +25,37 @@ along with RandomX. If not, see. 
namespace RandomX { + class ITransform { + public: + virtual int32_t apply(int32_t) const = 0; + virtual const char* getName() const = 0; + virtual std::ostream& printAsm(std::ostream&) const = 0; + virtual std::ostream& printCxx(std::ostream&) const = 0; + }; + class InterpretedVirtualMachine; typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); class InterpretedVirtualMachine : public VirtualMachine { public: - InterpretedVirtualMachine(bool softAes) : VirtualMachine(softAes) {} - virtual void initializeProgram(const void* seed) override; - virtual void execute() override; + InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} + ~InterpretedVirtualMachine(); + void setDataset(dataset_t ds) override; + void initializeScratchpad(uint32_t index) override; + void initializeProgram(const void* seed) override; + void execute() override; const Program& getProgam() { return p; } private: static InstructionHandler engine[256]; + static const ITransform* addressTransformations[TransformationCount]; + bool softAes, asyncWorker; Program p; std::vector stack; uint64_t pc, ic; + const ITransform* currentTransform; #ifdef STATS int count_ADD_64 = 0; int count_ADD_32 = 0; @@ -71,11 +85,12 @@ namespace RandomX { int count_FPDIV = 0; int count_FPSQRT = 0; int count_FPROUND = 0; + int count_JUMP_taken = 0; + int count_JUMP_not_taken = 0; int count_CALL_taken = 0; int count_CALL_not_taken = 0; int count_RET_stack_empty = 0; int count_RET_taken = 0; - int count_RET_not_taken = 0; int count_jump_taken[8] = { 0 }; int count_jump_not_taken[8] = { 0 }; int count_max_stack = 0; @@ -89,14 +104,15 @@ namespace RandomX { int count_FPSUB_nop2 = 0; int count_FPMUL_nop = 0; int count_FPMUL_nop2 = 0; + int datasetAccess[256] = { 0 }; #endif convertible_t loada(Instruction&); - convertible_t loadbr0(Instruction&); - convertible_t loadbr1(Instruction&); + convertible_t loadbiashift(Instruction&); + convertible_t loadbiadiv(Instruction&); + 
convertible_t loadbia(Instruction&); convertible_t& getcr(Instruction&); void writecf(Instruction&, fpu_reg_t&); - void writecflo(Instruction&, fpu_reg_t&); void stackPush(convertible_t& c) { stack.push_back(c); @@ -148,6 +164,7 @@ namespace RandomX { void h_FPDIV(Instruction&); void h_FPSQRT(Instruction&); void h_FPROUND(Instruction&); + void h_JUMP(Instruction&); void h_CALL(Instruction&); void h_RET(Instruction&); }; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index f76ab74..b41f7b5 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -170,13 +170,13 @@ namespace RandomX { emit(instr.addra); emit(uint16_t(0x8b41)); //mov emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega + emit(0x753fc3f6); //test bl,0x3f; jne + emit(uint16_t(0xe805)); + emit(readDatasetOffset - (codePos + 4)); if ((instr.loca & 192) == 0) { //A.LOC.X emit(uint16_t(0x3348)); emitByte(0xe8); //xor rbp, rax } - emit(0x753fc3f6); //test bl,0x3f; jne - emit(uint16_t(0xe805)); - emit(readDatasetOffset - (codePos + 4)); emitByte(0x25); //and eax, if (instr.loca & 3) { emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp new file mode 100644 index 0000000..c069f3f --- /dev/null +++ b/src/LightClientAsyncWorker.cpp @@ -0,0 +1,94 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/
+
+#include "LightClientAsyncWorker.hpp"
+#include "dataset.hpp"
+#include "Cache.hpp"
+
+namespace RandomX {
+	// Construction launches workerThread, which waits in runWorker() until a request is queued.
+	template<bool softAes>
+	LightClientAsyncWorker<softAes>::LightClientAsyncWorker(const Cache* c) : ILightClientAsyncWorker(c), output(nullptr), startBlock(0), blockCount(0), hasWork(false), workerThread(&LightClientAsyncWorker::runWorker, this) {
+		// Nothing else to do: all state is set up by the member initializers above.
+	}
+	// Queues an asynchronous request for the single block containing addr; the result lands in currentLine.
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::prepareBlock(addr_t addr) {
+		{
+			std::lock_guard<std::mutex> lk(mutex);
+			startBlock = addr / CacheLineSize;
+			blockCount = 1;
+			output = currentLine.data();
+			hasWork = true;
+		}
+		notifier.notify_all();
+	}
+	// Returns the block for addr: waits for the worker if prepareBlock() requested it, else computes it on the calling thread.
+	template<bool softAes>
+	const uint64_t* LightClientAsyncWorker<softAes>::getBlock(addr_t addr) {
+		uint32_t currentBlock = addr / CacheLineSize;
+		if (currentBlock != startBlock || output != currentLine.data()) {
+			initBlock<softAes>(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock, cache->getKeys());
+		}
+		else {
+			sync();
+		}
+		return currentLine.data();
+	}
+	// Queues an asynchronous request for blockCount blocks starting at startBlock, written to out; wait for it with sync().
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) {
+		{
+			std::lock_guard<std::mutex> lk(mutex);
+			this->startBlock = startBlock; // the parameters shadow the members, so qualify with this->
+			this->blockCount = blockCount;
+			output = out;
+			hasWork = true;
+		}
+		notifier.notify_all();
+	}
+	// Computes blockCount consecutive blocks starting at startBlock into out, synchronously on the calling thread.
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) {
+		for (uint32_t i = 0; i < blockCount; ++i) {
+			initBlock<softAes>(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i, cache->getKeys());
+		}
+	}
+	// Blocks the caller until the worker has cleared hasWork, i.e. the queued request is finished.
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::sync() {
+		std::unique_lock<std::mutex> lk(mutex);
+		notifier.wait(lk, [this] { return !hasWork; });
+	}
+	// Worker thread body: wait for a request, compute it with the mutex held, then wake any thread blocked in sync().
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::runWorker() {
+		for (;;) { // NOTE(review): no exit condition, so this thread can never be joined - confirm intended lifetime
+			std::unique_lock<std::mutex> lk(mutex);
+			notifier.wait(lk, [this] { return hasWork; });
+			getBlocks(output, startBlock, blockCount);
+			hasWork = false;
+			lk.unlock();
+			notifier.notify_all();
+		}
+	}
+	// Explicit instantiations for both values of softAes; the member definitions live only in this file.
+	template class LightClientAsyncWorker<true>;
+	template class LightClientAsyncWorker<false>;
+}
\ No newline at end of file
diff --git a/src/LightClientAsyncWorker.hpp 
b/src/LightClientAsyncWorker.hpp
new file mode 100644
index 0000000..7596fd5
--- /dev/null
+++ b/src/LightClientAsyncWorker.hpp
@@ -0,0 +1,52 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX. If not, see <http://www.gnu.org/licenses/>.
+*/
+#pragma once
+#include "common.hpp"
+
+#include <array>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+namespace RandomX {
+
+	class Cache;
+	// Storage for one dataset block (CacheLineSize bytes) viewed as 64-bit words.
+	using DatasetLine = std::array<uint64_t, CacheLineSize / sizeof(uint64_t)>;
+	// Computes dataset blocks from the cache on a background thread; explicitly instantiated in LightClientAsyncWorker.cpp.
+	template<bool softAes>
+	class LightClientAsyncWorker : public ILightClientAsyncWorker {
+	public:
+		LightClientAsyncWorker(const Cache*); // starts the worker thread
+		void prepareBlock(addr_t) final; // async: request the single block containing the address
+		void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; // async: request a run of blocks into out
+		const uint64_t* getBlock(addr_t) final; // returns the block, waiting for or bypassing the worker as needed
+		void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; // sync: compute a run of blocks on the caller
+		void sync() final; // wait until the queued request has completed
+	private:
+		void runWorker(); // worker thread entry point
+		std::condition_variable notifier; // signals both "work queued" and "work done"
+		std::mutex mutex; // guards output, startBlock, blockCount and hasWork
+		DatasetLine currentLine; // destination buffer for prepareBlock()/getBlock()
+		void* output; // where the queued request writes its blocks
+		uint32_t startBlock, blockCount; // block range of the queued request
+		bool hasWork; // true from queueing a request until the worker finishes it
+		std::thread workerThread; // NOTE(review): no destructor joins or detaches this thread - confirm intended lifetime
+	};
+}
\ No newline at end of file
diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp
index 6e8cfad..7a2be48 100644
--- a/src/VirtualMachine.cpp
+++ b/src/VirtualMachine.cpp
@@ -19,8 +19,6 @@ along with RandomX. If not, see. 
#include "VirtualMachine.hpp" #include "common.hpp" -#include "dataset.hpp" -#include "Cache.hpp" #include "t1ha/t1ha.h" #include "blake2/blake2.h" #include @@ -37,62 +35,10 @@ std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) { namespace RandomX { - VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) { + VirtualMachine::VirtualMachine() { mem.ds.dataset = nullptr; } - VirtualMachine::~VirtualMachine() { - if (lightClient) { - delete mem.ds.lightDataset->block; - delete mem.ds.lightDataset; - } - } - - void VirtualMachine::setDataset(dataset_t ds, bool light) { - if (mem.ds.dataset != nullptr) { - throw std::runtime_error("Dataset is already initialized"); - } - lightClient = light; - if (light) { - auto lds = mem.ds.lightDataset = new LightClientDataset(); - lds->cache = ds.cache; - //lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); - lds->blockNumber = -1; - if (lds->block == nullptr) { - throw std::bad_alloc(); - } - if (softAes) { - readDataset = &datasetReadLight; - } - else { - readDataset = &datasetReadLight; - } - } - else { - mem.ds = ds; - readDataset = &datasetRead; - } - } - - void VirtualMachine::initializeScratchpad(uint32_t index) { - if (lightClient) { - auto cache = mem.ds.lightDataset->cache; - if (softAes) { - for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); - } - } - else { - for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); - } - } - } - else { - memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); - } - } - void VirtualMachine::getResult(void* out) { constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 2; uint64_t 
smallState[smallStateLength];
diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp
index bbcfec3..78f7cf6 100644
--- a/src/VirtualMachine.hpp
+++ b/src/VirtualMachine.hpp
@@ -25,10 +25,10 @@ namespace RandomX {
 	class VirtualMachine {
 	public:
-		VirtualMachine(bool softAes);
-		virtual ~VirtualMachine();
-		virtual void setDataset(dataset_t ds, bool light = false);
-		void initializeScratchpad(uint32_t index);
+		VirtualMachine();
+		virtual ~VirtualMachine() {}
+		virtual void setDataset(dataset_t ds) = 0;
+		virtual void initializeScratchpad(uint32_t index) = 0;
 		virtual void initializeProgram(const void* seed) = 0;
 		virtual void execute() = 0;
 		void getResult(void*);
@@ -36,7 +36,6 @@ namespace RandomX {
 			return reg;
 		}
 	protected:
-		bool softAes, lightClient;
 		DatasetReadFunc readDataset;
 		alignas(16) RegisterFile reg;
 		MemoryRegisters mem;
diff --git a/src/common.hpp b/src/common.hpp
index 3831175..62fae70 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -38,7 +38,7 @@ namespace RandomX {
 	constexpr int CacheLineSize = 64;
 	constexpr int BlockExpansionRatio = 64;
 	constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount;
-	constexpr int DatasetIterations = 64;
+	constexpr int DatasetIterations = 16;
 	constexpr uint32_t CacheSize = CacheBlockCount * CacheLineSize;
 	constexpr uint64_t DatasetSize = (uint64_t)CacheSize * BlockExpansionRatio;
@@ -86,16 +86,28 @@ namespace RandomX {
 		return i % RandomX::ProgramLength;
 	}
-	struct LightClientDataset {
-		Cache* cache;
-		uint8_t* block;
-		uint32_t blockNumber;
+	// Interface the light-client dataset read uses to request blocks (prepare* queue work, get* deliver it, sync waits).
+	class ILightClientAsyncWorker {
+	public:
+		// Virtual so an implementation can be destroyed safely through a pointer to this interface.
+		virtual ~ILightClientAsyncWorker() {}
+		virtual void prepareBlock(addr_t) = 0;
+		virtual void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0;
+		virtual const uint64_t* getBlock(addr_t) = 0;
+		virtual void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0;
+		virtual void sync() = 0;
+		const Cache* getCache() const {
+			return cache;
+		}
+	protected:
+		ILightClientAsyncWorker(const Cache* c) : cache(c) {}
+		const Cache* cache;
 	};
 	union dataset_t {
 		uint8_t* dataset;
 		Cache* cache;
-		LightClientDataset* lightDataset;
+		ILightClientAsyncWorker* asyncWorker;
 	};
 	struct MemoryRegisters {
@@ -112,7 +124,7 @@ namespace RandomX {
 	static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
-	typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&);
+	typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&);
 	typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*);
diff --git a/src/dataset.cpp b/src/dataset.cpp
index d9c7b3f..ae31963 100644
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@@ -30,7 +30,7 @@ along with RandomX. If not, see.
 #if defined(__SSE2__)
 #include
-#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_T0)
+#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA)
 #else
 #define PREFETCH(memory)
 #endif
@@ -106,32 +106,44 @@ namespace RandomX {
 	template
 	void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&);
-	convertible_t datasetRead(addr_t addr, MemoryRegisters& memory) {
-		convertible_t data;
-		data.u64 = *(uint64_t*)(memory.ds.dataset + memory.ma);
-		memory.ma += 8;
+	void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) {
+		uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset + memory.ma);
 		memory.mx ^= addr;
-		if ((memory.mx & 0xFFF8) == 0) {
-			memory.ma = memory.mx & ~7;
-			PREFETCH(memory);
-		}
-		return data;
+		memory.mx &= -64; //align to cache line
+		std::swap(memory.mx, memory.ma);
+		PREFETCH(memory);
+		for (int i = 0; i < RegistersCount; ++i)
+			reg.r[i].u64 ^= datasetLine[i];
 	}
 	template
-	convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory) {
-		convertible_t data;
-		LightClientDataset* lds = memory.ds.lightDataset;
-		auto blockNumber = memory.ma / CacheLineSize;
-
-		return data;
+	void datasetReadLight(addr_t
addr, MemoryRegisters& memory, RegisterFile& reg) {
+		Cache* cache = memory.ds.cache;
+		uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; // scratch buffer for one dataset line
+		initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); // presumably fills datasetLine with block ma / CacheLineSize derived from the cache - confirm against initBlock
+		for (int i = 0; i < RegistersCount; ++i)
+			reg.r[i].u64 ^= datasetLine[i];
+		memory.mx ^= addr;
+		memory.mx &= -64; //align to cache line
+		std::swap(memory.mx, memory.ma);
 	}
 	template
-	convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory);
+	void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg);
 	template
-	convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory);
+	void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg);
+	// Light-client read through the async worker: XORs the line for memory.ma into the registers, derives the next address and asks the worker to prepare it.
+	void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) {
+		ILightClientAsyncWorker* aw = memory.ds.asyncWorker;
+		const uint64_t* datasetLine = aw->getBlock(memory.ma); // line for the current read address
+		for (int i = 0; i < RegistersCount; ++i)
+			reg.r[i].u64 ^= datasetLine[i];
+		memory.mx ^= addr;
+		memory.mx &= -64; //align to cache line
+		std::swap(memory.mx, memory.ma); // the updated mx becomes the next read address
+		aw->prepareBlock(memory.ma); // queue the next line on the worker ahead of the next read
+	}
 	void datasetAlloc(dataset_t& ds, bool largePages) {
 		if (sizeof(size_t) <= 4)
diff --git a/src/dataset.hpp b/src/dataset.hpp
index 5f9836c..0103271 100644
--- a/src/dataset.hpp
+++ b/src/dataset.hpp
@@ -40,12 +40,14 @@ namespace RandomX {
 	template
 	void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount);
-	convertible_t datasetRead(addr_t addr, MemoryRegisters& memory);
+	void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile&);
 	template
 	void datasetInitCache(const void* seed, dataset_t& dataset);
 	template
-	convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory);
+	void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile&);
+
+	void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg);
 }
diff --git a/src/instructions.hpp
b/src/instructions.hpp index 2321be6..dc5d4ee 100644 --- a/src/instructions.hpp +++ b/src/instructions.hpp @@ -22,12 +22,6 @@ along with RandomX. If not, see. namespace RandomX { - //Clears the 11 least-significant bits before conversion. This is done so the number - //fits exactly into the 52-bit mantissa without rounding. - inline double convertSigned52(int64_t x) { - return (double)(x & -2048L); - } - extern "C" { void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c); void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c); @@ -53,11 +47,11 @@ namespace RandomX { void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c); bool JMP_COND(uint8_t, convertible_t&, int32_t); void FPINIT(); + void FPROUND(convertible_t, uint8_t); void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); } } \ No newline at end of file diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp index 790506b..78bdb6f 100644 --- a/src/instructionsPortable.cpp +++ b/src/instructionsPortable.cpp @@ -370,9 +370,9 @@ namespace RandomX { #endif } - void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { - c.lo.f64 = convertSigned52(a.i64); - switch (a.u64 & 3) { + void FPROUND(convertible_t a, uint8_t rot) { + uint64_t flag = ror64(a.u64, rot); + switch (flag & 3) { case RoundDown: #ifdef DEBUG std::cout << "Round FE_DOWNWARD (" << FE_DOWNWARD << ") = " << diff --git a/src/main.cpp b/src/main.cpp index 6366821..3295500 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -162,7 +162,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash } int main(int argc, char** argv) { - bool softAes, lightClient, genAsm, compiled, help, largePages; 
+ bool softAes, lightClient, genAsm, compiled, help, largePages, async; int programCount, threadCount; readOption("--help", argc, argv, help); @@ -178,6 +178,7 @@ int main(int argc, char** argv) { readIntOption("--threads", argc, argv, threadCount, 1); readIntOption("--nonces", argc, argv, programCount, 1000); readOption("--largePages", argc, argv, largePages); + readOption("--async", argc, argv, async); if (genAsm) { generateAsm(programCount); @@ -250,12 +251,12 @@ int main(int argc, char** argv) { for (int i = 0; i < threadCount; ++i) { RandomX::VirtualMachine* vm; if (compiled) { - vm = new RandomX::CompiledVirtualMachine(softAes); + vm = new RandomX::CompiledVirtualMachine(); } else { - vm = new RandomX::InterpretedVirtualMachine(softAes); + vm = new RandomX::InterpretedVirtualMachine(softAes, async); } - vm->setDataset(dataset, lightClient); + vm->setDataset(dataset); vms.push_back(vm); } std::cout << "Running benchmark (" << programCount << " programs) ..." << std::endl; @@ -278,7 +279,12 @@ int main(int argc, char** argv) { result.print(std::cout); if(programCount == 1000) std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl; - std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; + if (lightClient) { + std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per program" << std::endl; + } + else { + std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; + } } catch (std::exception& e) { std::cout << "ERROR: " << e.what() << std::endl;