diff --git a/src/CompiledLightVirtualMachine.cpp b/src/CompiledLightVirtualMachine.cpp index 11bedf8..0d0fa67 100644 --- a/src/CompiledLightVirtualMachine.cpp +++ b/src/CompiledLightVirtualMachine.cpp @@ -38,7 +38,7 @@ namespace RandomX { template void CompiledLightVirtualMachine::initialize() { VirtualMachine::initialize(); - compiler.generateProgramLight(program); + compiler.generateProgramLight(program, config); //mem.ds.dataset.memory = datasetBasePtr + (datasetBase * CacheLineSize); } diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 3e44476..14de68e 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -37,7 +37,7 @@ namespace RandomX { void CompiledVirtualMachine::initialize() { VirtualMachine::initialize(); - compiler.generateProgram(program); + compiler.generateProgram(program, config); mem.ds.dataset.memory = datasetBasePtr + (datasetBase * CacheLineSize); } diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 827f2e6..2fa1018 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -116,6 +116,16 @@ namespace RandomX { return scratchpad + addr; } + template + FORCE_INLINE __m128d InterpretedVirtualMachine::maskRegisterExponentMantissa(__m128d x) { + constexpr uint64_t mantissaMask64 = (1ULL << 52) - 1; + const __m128d mantissaMask = _mm_castsi128_pd(_mm_set_epi64x(mantissaMask64, mantissaMask64)); + const __m128d exponentMask = _mm_load_pd((const double*)&config.eMask); + x = _mm_and_pd(x, mantissaMask); + x = _mm_or_pd(x, exponentMask); + return x; + } + template FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { auto& ibc = byteCode[ic]; @@ -229,7 +239,7 @@ namespace RandomX { } break; case InstructionType::FDIV_M: { - __m128d fsrc = ieee_set_exponent<-240>(load_cvt_i32x2(getScratchpadAddress(ibc))); + __m128d fsrc = maskRegisterExponentMantissa(load_cvt_i32x2(getScratchpadAddress(ibc))); *ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc); } break; @@ -326,7 +336,7 @@ namespace RandomX { uint32_t spAddr1 = mem.ma; if (trace) { - std::cout << "execute (reg: r" << readReg0 << ", r" << readReg1 << ", r" << readReg2 << ", r" << readReg3 << ")" << std::endl; + std::cout << "execute (reg: r" << config.readReg0 << ", r" << config.readReg1 << ", r" << config.readReg2 << ", r" << config.readReg3 << ")" << std::endl; std::cout << "spAddr " << std::hex << std::setw(8) << std::setfill('0') << spAddr1 << " / " << std::setw(8) << std::setfill('0') << spAddr0 << std::endl; std::cout << "ma/mx " << std::hex << std::setw(8) << std::setfill('0') << mem.ma << std::setw(8) << std::setfill('0') << mem.mx << std::endl; printState(r, f, e, a); @@ -334,7 +344,7 @@ namespace RandomX { for(unsigned ic = 0; ic < RANDOMX_PROGRAM_ITERATIONS; ++ic) { //std::cout << "Iteration " << iter << std::endl; - uint64_t spMix = r[readReg0] ^ r[readReg1]; + uint64_t spMix = r[config.readReg0] ^ r[config.readReg1]; spAddr0 ^= spMix; spAddr0 &= ScratchpadL3Mask64; spAddr1 ^= spMix >> 32; @@ -353,10 +363,10 @@ namespace RandomX { f[1] = load_cvt_i32x2(scratchpad + spAddr1 + 8); f[2] = load_cvt_i32x2(scratchpad + spAddr1 + 16); f[3] = load_cvt_i32x2(scratchpad + spAddr1 + 24); - e[0] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 32)); - e[1] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 40)); - e[2] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 48)); - e[3] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 56)); + e[0] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 32)); + e[1] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 40)); + e[2] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 48)); + e[3] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 56)); if (trace) { std::cout << "iteration " << std::dec << ic << std::endl; @@ -368,7 +378,7 @@ namespace RandomX { executeBytecode(r, f, e, a); - mem.mx ^= r[readReg2] ^ r[readReg3]; + mem.mx ^= r[config.readReg2] ^ r[config.readReg3]; mem.mx &= CacheLineAlignMask; if (superscalar) { executeSuperscalar(datasetBase + mem.ma / CacheLineSize, r); diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 1dcc441..d7cb340 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -133,5 +133,6 @@ namespace RandomX { void executeBytecode(int& i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); void executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]); void* getScratchpadAddress(InstructionByteCode& ibc); + __m128d maskRegisterExponentMantissa(__m128d); }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 0ad7350..8c70041 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -24,6 +24,7 @@ along with RandomX. If not, see. #include "Program.hpp" #include "reciprocal.h" #include "virtualMemory.hpp" +#include "intrinPortable.h" #define RANDOMX_JUMP @@ -230,20 +231,20 @@ namespace RandomX { freePagedMemory(code, CodeSize); } - void JitCompilerX86::generateProgram(Program& prog) { - generateProgramPrologue(prog); + void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { + generateProgramPrologue(prog, pcfg); memcpy(code + codePos, codeReadDataset, readDatasetSize); codePos += readDatasetSize; generateProgramEpilogue(prog); } template - void JitCompilerX86::generateProgramLight(Program& prog) { + void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg) { if (RANDOMX_CACHE_ACCESSES != 8) throw std::runtime_error("JIT compiler: Unsupported value of RANDOMX_CACHE_ACCESSES"); if (RANDOMX_ARGON_GROWTH != 0) throw std::runtime_error("JIT compiler: Unsupported value of RANDOMX_ARGON_GROWTH"); - generateProgramPrologue(prog); + generateProgramPrologue(prog, pcfg); if (superscalar) { emit(codeReadDatasetLightSshInit, readDatasetLightInitSize); emitByte(CALL); @@ -259,8 +260,8 @@ namespace RandomX { generateProgramEpilogue(prog); } - template void JitCompilerX86::generateProgramLight(Program& prog); - template void JitCompilerX86::generateProgramLight(Program& prog); + template void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg); + template void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg); template void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[N]) { @@ -298,33 +299,26 @@ namespace RandomX { memcpy(code, codeDatasetInit, datasetInitSize); } - void JitCompilerX86::generateProgramPrologue(Program& prog) { + void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { #ifdef RANDOMX_JUMP instructionOffsets.clear(); for (unsigned i = 0; i < 8; ++i) { registerUsage[i] = -1; } #endif - auto addressRegisters = prog.getEntropy(12); - uint32_t readReg0 = 0 + (addressRegisters & 1); - addressRegisters >>= 1; - uint32_t readReg1 = 2 + (addressRegisters & 1); - addressRegisters >>= 1; - uint32_t readReg2 = 4 + (addressRegisters & 1); - addressRegisters >>= 1; - uint32_t readReg3 = 6 + (addressRegisters & 1); codePos = prologueSize; + memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask)); emit(REX_XOR_RAX_R64); - emitByte(0xc0 + readReg0); + emitByte(0xc0 + pcfg.readReg0); emit(REX_XOR_RAX_R64); - emitByte(0xc0 + readReg1); + emitByte(0xc0 + pcfg.readReg1); memcpy(code + codePos, codeLoopLoad, loopLoadSize); codePos += loopLoadSize; generateCode(prog); emit(REX_MOV_RR); - emitByte(0xc0 + readReg2); + emitByte(0xc0 + pcfg.readReg2); emit(REX_XOR_EAX); - emitByte(0xc0 + readReg3); + emitByte(0xc0 + pcfg.readReg3); } void JitCompilerX86::generateProgramEpilogue(Program& prog) { diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 2908b04..9c15ac7 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -27,6 +27,7 @@ along with RandomX. If not, see. namespace RandomX { class Program; + class ProgramConfiguration; class SuperscalarProgram; class JitCompilerX86; @@ -38,9 +39,9 @@ namespace RandomX { public: JitCompilerX86(); ~JitCompilerX86(); - void generateProgram(Program&); + void generateProgram(Program&, ProgramConfiguration&); template - void generateProgramLight(Program&); + void generateProgramLight(Program&, ProgramConfiguration&); template void generateSuperScalarHash(SuperscalarProgram (&programs)[N]); ProgramFunc getProgramFunc() { @@ -73,7 +74,7 @@ namespace RandomX { void generateDatasetInitCode(); - void generateProgramPrologue(Program&); + void generateProgramPrologue(Program&, ProgramConfiguration&); void generateProgramEpilogue(Program&); int getConditionRegister(); void genAddressReg(Instruction&, bool); diff --git a/src/Program.hpp b/src/Program.hpp index 2f2a402..854a557 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -27,6 +27,11 @@ along with RandomX. If not, see. namespace RandomX { + struct ProgramConfiguration { + uint64_t eMask[2]; + uint32_t readReg0, readReg1, readReg2, readReg3; + }; + class Program { public: Instruction& operator()(int pc) { diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index d15bb4d..4af0374 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -77,14 +77,18 @@ namespace RandomX { mem.ma = program.getEntropy(8) & CacheLineAlignMask; mem.mx = program.getEntropy(10); auto addressRegisters = program.getEntropy(12); - readReg0 = 0 + (addressRegisters & 1); + config.readReg0 = 0 + (addressRegisters & 1); addressRegisters >>= 1; - readReg1 = 2 + (addressRegisters & 1); + config.readReg1 = 2 + (addressRegisters & 1); addressRegisters >>= 1; - readReg2 = 4 + (addressRegisters & 1); + config.readReg2 = 4 + (addressRegisters & 1); addressRegisters >>= 1; - readReg3 = 6 + (addressRegisters & 1); - datasetBase = program.getEntropy(14) % datasetRange; + config.readReg3 = 6 + (addressRegisters & 1); + datasetBase = program.getEntropy(13) % datasetRange; + constexpr uint64_t mask22bit = (1ULL << 22) - 1; + constexpr uint64_t maskExp240 = ieee_get_exponent_mask<-240>(); + store64(&config.eMask[0], (program.getEntropy(14) & mask22bit) | maskExp240); + store64(&config.eMask[1], (program.getEntropy(15) & mask22bit) | maskExp240); } template diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index 7352933..b8382f6 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -46,9 +46,9 @@ namespace RandomX { protected: alignas(64) Program program; alignas(64) RegisterFile reg; + alignas(16) ProgramConfiguration config; MemoryRegisters mem; uint8_t* scratchpad; - uint32_t readReg0, readReg1, readReg2, readReg3; uint32_t datasetRange; uint32_t datasetBase; }; diff --git a/src/intrinPortable.h b/src/intrinPortable.h index 3bee98f..972716f 100644 --- a/src/intrinPortable.h +++ b/src/intrinPortable.h @@ -311,6 +311,12 @@ inline __m128d load_cvt_i32x2(const void* addr) { return _mm_cvtepi32_pd(ix); } +template +constexpr uint64_t ieee_get_exponent_mask() { + static_assert(E > -1023, "Invalid exponent value"); + return (uint64_t)(E + 1023U) << 52; +} + template __m128d ieee_set_exponent(__m128d x) { static_assert(E > -1023, "Invalid exponent value"); diff --git a/src/main.cpp b/src/main.cpp index b7dbc3f..2b653ae 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -396,7 +396,7 @@ int main(int argc, char** argv) { std::cout << "Calculated result: "; result.print(std::cout); if(!legacy && programCount == 1000) - std::cout << "Reference result: af72d8069bd95ef04b414d3a83772c7bd2df454940bad15ae0b48543aeef8ab2" << std::endl; + std::cout << "Reference result: 630ad3bc7f44fe8386462d7b671fa2a1167d3e062bfb9a2967f64832760cfedb" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl; }