diff --git a/doc/isa.md b/doc/isa.md index 6e67ab8..dc4cbf9 100644 --- a/doc/isa.md +++ b/doc/isa.md @@ -33,7 +33,7 @@ The first operand is read from memory. The location is determined by the `loc(a) Flag `reg(a)` encodes an integer register `r0`-`r7`. The read address is calculated as: ``` -reg(a)[31:0] = reg(a)[31:0] XOR addr0 +reg(a) = reg(a) XOR signExtend(addr0) addr(a) = reg(a)[W-1:0] ``` `W` is the address width from the above table. For reading from the scratchpad, `addr(a)` is multiplied by 8 for 8-byte aligned access. @@ -54,7 +54,7 @@ The second operand is loaded either from a register or from an immediate value e `imm0` is an 8-bit immediate value, which is used for shift and rotate ALU operations. -`imm1` is a 32-bit immediate value which is used for most operations. For operands larger than 32 bits, the value is zero-extended for unsigned instructions and sign-extended for signed instructions. For FPU instructions, the value is considered a signed 32-bit integer and then converted to a double precision floating point format. +`imm1` is a 32-bit immediate value which is used for most operations. For operands larger than 32 bits, the value is sign-extended. For FPU instructions, the value is considered a signed 32-bit integer and then converted to a double precision floating point format. #### Operand C The third operand is the location where the result is stored. @@ -80,7 +80,7 @@ addr(c) = 8 * (addr1 XOR reg(c)[31:0])[W-1:0] An 8-bit immediate value that is used as the shift/rotate count by some ALU instructions and as the jump offset of the CALL instruction. #### addr0 -A 32-bit address mask that is used to calculate the read address for the A operand. +A 32-bit address mask that is used to calculate the read address for the A operand. It's sign-extended to 64 bits. #### addr1 A 32-bit address mask that is used to calculate the write address for the C operand. `addr1` is equal to `imm1`. diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 092cd9f..a854bdb 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -16,7 +16,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ - +//#define TRACE #include "AssemblyGeneratorX86.hpp" #include "Pcg32.hpp" #include "common.hpp" @@ -164,6 +164,9 @@ namespace RandomX { asmCode << "\txor eax, 0" << std::hex << instr.addr1 << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl; + if (trace) { + asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl; + } return; case 1: @@ -174,10 +177,16 @@ namespace RandomX { asmCode << "\txor eax, 0" << std::hex << instr.addr1 << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl; + if (trace) { + asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl; + } return; default: asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl; + if (trace) { + asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl; + } } } @@ -189,7 +198,7 @@ namespace RandomX { asmCode << "\txor eax, 0" << std::hex << instr.addr1 << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl; - return; + break; case 1: case 2: @@ -198,10 +207,14 @@ namespace RandomX { asmCode << "\txor eax, 0" << std::hex << instr.addr1 << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl; - return; + break; default: asmCode << "\tmovsd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; + break; + } + if (trace) { + asmCode << "\tmovd qword ptr [rsi + rdi * 8 + 262144], xmm0" << std::endl; } } @@ -466,8 +479,11 @@ namespace RandomX { asmCode << "\tjmp rx_i_" << wrapi(i + 1) << std::endl; asmCode << "taken_call_" << i << ":" << std::endl; } + if (trace) { + asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl; + } asmCode << "\tpush rax" << std::endl; - asmCode << "\tcall rx_i_" << wrapi(i + (instr.imm0 & 127) + 1) << std::endl; + asmCode << "\tcall rx_i_" << wrapi(i + (instr.imm0 & 127) + 2) << std::endl; } void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) { diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index af9af7f..d7477a3 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -43,5 +43,11 @@ namespace RandomX { void CompiledVirtualMachine::execute() { FPINIT(); executeProgram(reg, mem, readDataset, scratchpad); +#ifdef TRACE + for (int32_t i = InstructionCount - 1; i >= 0; --i) { + std::cout << std::hex << tracepad[i].u64 << std::endl; + } +#endif + } } \ No newline at end of file diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index e2acc50..b2b7a1c 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -18,7 +18,7 @@ along with RandomX. If not, see. */ #pragma once - +//#define TRACE #include "VirtualMachine.hpp" #include "Program.hpp" #include @@ -30,5 +30,9 @@ namespace RandomX { CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) {} virtual void initializeProgram(const void* seed) override; virtual void execute() override; + private: +#ifdef TRACE + convertible_t tracepad[InstructionCount]; +#endif }; } \ No newline at end of file diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 36983cf..f2b75d0 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -34,7 +34,7 @@ namespace RandomX { uint8_t locc; uint8_t regc; uint8_t imm0; - uint32_t addr0; + int32_t addr0; union { uint32_t addr1; int32_t imm1; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 15a75a9..4ad0e3d 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -65,7 +65,7 @@ namespace RandomX { convertible_t InterpretedVirtualMachine::loada(Instruction& inst) { convertible_t& rega = reg.r[inst.rega % RegistersCount]; - rega.u64 ^= inst.addr0; + rega.i64 ^= inst.addr0; //sign-extend addr0 addr_t addr = rega.u32; switch (inst.loca & 7) { @@ -86,7 +86,7 @@ namespace RandomX { } convertible_t InterpretedVirtualMachine::loadbr1(Instruction& inst) { - switch (inst.loca & 7) + switch (inst.locb & 7) { case 0: case 1: @@ -98,7 +98,7 @@ namespace RandomX { case 6: case 7: convertible_t temp; - temp.i64 = inst.imm1; + temp.i64 = inst.imm1; //sign-extend imm1 return temp; } } @@ -182,13 +182,13 @@ namespace RandomX { } #define ALU_RETIRE(x) x(a, b, c); \ - if(trace) std::cout << std::hex << a.u64 << " " << b.u64 << " " << c.u64 << std::endl; + if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl; #define FPU_RETIRE(x) x(a, b, c); \ if(trace) { \ convertible_t bc; \ bc.f64 = b; \ - std::cout << std::hex << a.u64 << " " << bc.u64 << " " << c.u64 << std::endl; \ + std::cout << std::hex << /*a.u64 << " " << bc.u64 << " " <<*/ c.u64 << std::endl; \ } \ if(fpuCheck) { \ convertible_t bc; \ @@ -206,7 +206,7 @@ namespace RandomX { } #define FPU_RETIRE_NB(x) x(a, b, c); \ - if(trace) std::cout << std::hex << a.u64 << " " << c.u64 << std::endl; + if(trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl; #define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ convertible_t a = loada(inst); \ @@ -277,9 +277,11 @@ namespace RandomX { stackPush(pc); pc += (inst.imm0 & 127) + 1; pc = pc % ProgramLength; + if (trace) std::cout << std::hex << a.u64 << std::endl; } else { c.u64 = a.u64; + if (trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl; } } @@ -296,6 +298,7 @@ namespace RandomX { else { c.u64 = a.u64; } + if (trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl; } #include "instructionWeights.hpp" diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 05b22ac..3951a86 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -58,16 +58,14 @@ namespace RandomX { void VirtualMachine::initializeScratchpad(uint32_t index) { if (lightClient) { if (softAes) { - initBlock(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 0, 4 * index + 0, mem.lcm->keys); - initBlock(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 1, 4 * index + 1, mem.lcm->keys); - initBlock(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 2, 4 * index + 2, mem.lcm->keys); - initBlock(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 3, 4 * index + 3, mem.lcm->keys); + for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) { + initBlock(mem.lcm->cache + CacheShift, ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, mem.lcm->keys); + } } else { - initBlock(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 0, 4 * index + 0, mem.lcm->keys); - initBlock(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 1, 4 * index + 1, mem.lcm->keys); - initBlock(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 2, 4 * index + 2, mem.lcm->keys); - initBlock(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 3, 4 * index + 3, mem.lcm->keys); + for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) { + initBlock(mem.lcm->cache + CacheShift, ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, mem.lcm->keys); + } } } else { diff --git a/src/common.hpp b/src/common.hpp index c980219..765a1fc 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -35,6 +35,7 @@ namespace RandomX { constexpr int SeedSize = 32; constexpr int CacheBlockSize = 1024; + constexpr int CacheShift = CacheBlockSize / 2; constexpr int BlockExpansionRatio = 64; constexpr uint32_t DatasetBlockSize = BlockExpansionRatio * CacheBlockSize; constexpr uint32_t DatasetBlockCount = 65536; diff --git a/src/dataset.cpp b/src/dataset.cpp index 52a997c..0738b4f 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -26,6 +26,7 @@ along with RandomX. If not, see. #include #include #include +#include #if defined(_MSC_VER) #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) @@ -237,7 +238,7 @@ namespace RandomX { convertible_t data; auto blockNumber = memory.ma / DatasetBlockSize; if (memory.lcm->blockNumber != blockNumber) { - initBlock(memory.lcm->cache, (uint8_t*)memory.lcm->block, blockNumber, memory.lcm->keys); + initBlock(memory.lcm->cache + CacheShift, (uint8_t*)memory.lcm->block, blockNumber, memory.lcm->keys); memory.lcm->blockNumber = blockNumber; } data.u64 = *(uint64_t*)(memory.lcm->block + (memory.ma % DatasetBlockSize)); @@ -263,15 +264,16 @@ namespace RandomX { if (dataset == nullptr) { throw std::runtime_error("Dataset memory allocation failed. >4 GiB of virtual memory is needed."); } - uint8_t* cache = (uint8_t*)_mm_malloc(CacheSize, sizeof(__m128i)); - if (dataset == nullptr) { + uint8_t* cache = (uint8_t*)_mm_malloc(CacheSize + CacheShift, sizeof(__m128i)); + if (cache == nullptr) { throw std::bad_alloc(); } initializeCache(seed, SeedSize, cache); + memcpy(cache + CacheSize, cache, CacheShift); alignas(16) __m128i keys[10]; expandAesKeys((const __m128i*)seed, keys); for (uint32_t i = 0; i < DatasetBlockCount; ++i) { - initBlock(cache, dataset + i * DatasetBlockSize, i, keys); + initBlock(cache + CacheShift, dataset + i * DatasetBlockSize, i, keys); } _mm_free(cache); } @@ -285,11 +287,12 @@ namespace RandomX { template void datasetInitLight(const void* seed, LightClientMemory*& lcm) { lcm = new LightClientMemory(); - lcm->cache = (uint8_t*)_mm_malloc(CacheSize, sizeof(__m128i)); + lcm->cache = (uint8_t*)_mm_malloc(CacheSize + CacheShift, sizeof(__m128i)); if (lcm->cache == nullptr) { throw std::bad_alloc(); } initializeCache(seed, SeedSize, lcm->cache); + memcpy(lcm->cache + CacheSize, lcm->cache, CacheShift); expandAesKeys((__m128i*)seed, lcm->keys); lcm->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); if (lcm->block == nullptr) { @@ -303,4 +306,4 @@ namespace RandomX { template void datasetInitLight(const void*, LightClientMemory*&); -} \ No newline at end of file +} diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 9828f5c..cee6fb7 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -151,19 +151,40 @@ rx_finish: ; return ret 0 -rx_read_dataset: +rx_read_dataset_light: push rdx push r9 push r10 push r11 - sub rsp, 32 + movd qword ptr [rsp - 8], xmm1 + movd qword ptr [rsp - 16], xmm2 + sub rsp, 48 call qword ptr [rbp] - add rsp, 32 + add rsp, 48 + movd xmm2, qword ptr [rsp - 16] + movd xmm1, qword ptr [rsp - 8] pop r11 pop r10 pop r9 pop rdx ret 0 + +rx_read_dataset: + mov r8d, dword ptr [rdx] ; ma + mov rax, qword ptr [rdx+8] ; dataset + mov rax, qword ptr [rax+r8] + add dword ptr [rdx], 8 + mov r8d, dword ptr [rdx+4] ; mx + xor ecx, r8d + mov dword ptr [rdx+4], ecx + test ecx, 0FFF8h + jne short rx_read_dataset_full_ret + and ecx, -8 + mov dword ptr [rdx], ecx + mov r8, qword ptr [rdx+8] + prefetcht0 byte ptr [r8+rcx] +rx_read_dataset_full_ret: + ret 0 executeProgram ENDP END diff --git a/src/program.inc b/src/program.inc index 63c99df..0b5856b 100644 --- a/src/program.inc +++ b/src/program.inc @@ -123,7 +123,7 @@ rx_i_6: ;CALL jmp rx_i_7 taken_call_6: push rax - call rx_i_50 + call rx_i_51 rx_i_7: ;FPDIV dec edi @@ -538,7 +538,7 @@ rx_i_38: ;CALL jmp rx_i_39 taken_call_38: push rax - call rx_i_111 + call rx_i_112 rx_i_39: ;CALL dec edi @@ -553,7 +553,7 @@ rx_i_39: ;CALL jmp rx_i_40 taken_call_39: push rax - call rx_i_61 + call rx_i_62 rx_i_40: ;FPMUL dec edi @@ -621,7 +621,7 @@ rx_i_44: ;CALL jmp rx_i_45 taken_call_44: push rax - call rx_i_93 + call rx_i_94 rx_i_45: ;FPROUND dec edi @@ -726,7 +726,7 @@ rx_i_51: ;CALL jmp rx_i_52 taken_call_51: push rax - call rx_i_134 + call rx_i_135 rx_i_52: ;FPDIV dec edi @@ -943,7 +943,7 @@ rx_i_65: ;CALL jmp rx_i_66 taken_call_65: push rax - call rx_i_123 + call rx_i_124 rx_i_66: ;FPSUB dec edi @@ -996,7 +996,7 @@ rx_i_69: ;CALL jmp rx_i_70 taken_call_69: push rax - call rx_i_132 + call rx_i_133 rx_i_70: ;FPDIV dec edi @@ -1022,7 +1022,7 @@ rx_i_71: ;CALL jmp rx_i_72 taken_call_71: push rax - call rx_i_82 + call rx_i_83 rx_i_72: ;FPADD dec edi @@ -1093,7 +1093,7 @@ rx_i_76: ;CALL jmp rx_i_77 taken_call_76: push rax - call rx_i_194 + call rx_i_195 rx_i_77: ;FPDIV dec edi @@ -1138,7 +1138,7 @@ rx_i_79: ;CALL jmp rx_i_80 taken_call_79: push rax - call rx_i_205 + call rx_i_206 rx_i_80: ;FPADD dec edi @@ -1208,7 +1208,7 @@ rx_i_83: ;CALL jmp rx_i_84 taken_call_83: push rax - call rx_i_96 + call rx_i_97 rx_i_84: ;ROR_64 dec edi @@ -1249,7 +1249,7 @@ rx_i_86: ;CALL jmp rx_i_87 taken_call_86: push rax - call rx_i_148 + call rx_i_149 rx_i_87: ;DIV_64 dec edi @@ -1376,7 +1376,7 @@ rx_i_96: ;CALL mov ecx, ebx call rx_read_dataset push rax - call rx_i_173 + call rx_i_174 rx_i_97: ;ROR_64 dec edi @@ -1402,7 +1402,7 @@ rx_i_98: ;CALL jmp rx_i_99 taken_call_98: push rax - call rx_i_160 + call rx_i_161 rx_i_99: ;MUL_64 dec edi @@ -1567,7 +1567,7 @@ rx_i_111: ;CALL and eax, 2047 mov rax, qword ptr [rsi + rax * 8] push rax - call rx_i_146 + call rx_i_147 rx_i_112: ;FPMUL dec edi @@ -1617,7 +1617,7 @@ rx_i_115: ;CALL mov ecx, ebx call rx_read_dataset push rax - call rx_i_215 + call rx_i_216 rx_i_116: ;ADD_32 dec edi @@ -1778,7 +1778,7 @@ rx_i_126: ;CALL jmp rx_i_127 taken_call_126: push rax - call rx_i_195 + call rx_i_196 rx_i_127: ;ADD_64 dec edi @@ -1806,7 +1806,7 @@ rx_i_128: ;CALL jmp rx_i_129 taken_call_128: push rax - call rx_i_240 + call rx_i_241 rx_i_129: ;MUL_32 dec edi @@ -1863,7 +1863,7 @@ rx_i_133: ;CALL and eax, 2047 mov rax, qword ptr [rsi + rax * 8] push rax - call rx_i_157 + call rx_i_158 rx_i_134: ;AND_64 dec edi @@ -2049,7 +2049,7 @@ rx_i_146: ;CALL jmp rx_i_147 taken_call_146: push rax - call rx_i_260 + call rx_i_261 rx_i_147: ;IMUL_32 dec edi @@ -2277,7 +2277,7 @@ rx_i_163: ;CALL jmp rx_i_164 taken_call_163: push rax - call rx_i_184 + call rx_i_185 rx_i_164: ;ADD_32 dec edi @@ -2430,7 +2430,7 @@ rx_i_173: ;CALL jmp rx_i_174 taken_call_173: push rax - call rx_i_200 + call rx_i_201 rx_i_174: ;FPSQRT dec edi @@ -2593,7 +2593,7 @@ rx_i_185: ;CALL jmp rx_i_186 taken_call_185: push rax - call rx_i_214 + call rx_i_215 rx_i_186: ;FPADD dec edi @@ -2647,7 +2647,7 @@ rx_i_189: ;CALL jmp rx_i_190 taken_call_189: push rax - call rx_i_249 + call rx_i_250 rx_i_190: ;XOR_64 dec edi @@ -3209,7 +3209,7 @@ rx_i_230: ;CALL jmp rx_i_231 taken_call_230: push rax - call rx_i_331 + call rx_i_332 rx_i_231: ;FPMUL dec edi @@ -3323,7 +3323,7 @@ rx_i_237: ;CALL jmp rx_i_238 taken_call_237: push rax - call rx_i_271 + call rx_i_272 rx_i_238: ;FPDIV dec edi @@ -3379,7 +3379,7 @@ rx_i_241: ;CALL mov ecx, r15d call rx_read_dataset push rax - call rx_i_298 + call rx_i_299 rx_i_242: ;ROR_64 dec edi @@ -3597,7 +3597,7 @@ rx_i_257: ;CALL jmp rx_i_258 taken_call_257: push rax - call rx_i_370 + call rx_i_371 rx_i_258: ;FPADD dec edi @@ -3776,7 +3776,7 @@ rx_i_270: ;CALL jmp rx_i_271 taken_call_270: push rax - call rx_i_298 + call rx_i_299 rx_i_271: ;ROL_64 dec edi @@ -3868,7 +3868,7 @@ rx_i_277: ;CALL and eax, 2047 mov rax, qword ptr [rsi + rax * 8] push rax - call rx_i_375 + call rx_i_376 rx_i_278: ;FPADD dec edi @@ -4548,7 +4548,7 @@ rx_i_326: ;CALL jmp rx_i_327 taken_call_326: push rax - call rx_i_346 + call rx_i_347 rx_i_327: ;MUL_64 dec edi @@ -4922,7 +4922,7 @@ rx_i_354: ;CALL jmp rx_i_355 taken_call_354: push rax - call rx_i_355 + call rx_i_356 rx_i_355: ;MUL_64 dec edi @@ -5659,7 +5659,7 @@ rx_i_409: ;CALL jmp rx_i_410 taken_call_409: push rax - call rx_i_497 + call rx_i_498 rx_i_410: ;FPDIV dec edi @@ -5866,7 +5866,7 @@ rx_i_425: ;CALL jmp rx_i_426 taken_call_425: push rax - call rx_i_34 + call rx_i_35 rx_i_426: ;IMUL_32 dec edi @@ -6556,7 +6556,7 @@ rx_i_476: ;CALL and eax, 2047 mov rax, qword ptr [rsi + rax * 8] push rax - call rx_i_11 + call rx_i_12 rx_i_477: ;MUL_64 dec edi @@ -6580,7 +6580,7 @@ rx_i_478: ;CALL jmp rx_i_479 taken_call_478: push rax - call rx_i_72 + call rx_i_73 rx_i_479: ;FPSUB dec edi @@ -6721,7 +6721,7 @@ rx_i_489: ;CALL jmp rx_i_490 taken_call_489: push rax - call rx_i_61 + call rx_i_62 rx_i_490: ;ADD_64 dec edi