Implemented cache shift

Fixed assembly code generator
Fixed an error in the interpreter
Updated specification: sign-extended immediates
This commit is contained in:
tevador 2018-12-15 23:13:17 +01:00
parent 4fc4b840f5
commit 6332831ec1
11 changed files with 121 additions and 69 deletions

View File

@ -33,7 +33,7 @@ The first operand is read from memory. The location is determined by the `loc(a)
Flag `reg(a)` encodes an integer register `r0`-`r7`. The read address is calculated as:
```
reg(a)[31:0] = reg(a)[31:0] XOR addr0
reg(a) = reg(a) XOR signExtend(addr0)
addr(a) = reg(a)[W-1:0]
```
`W` is the address width from the above table. For reading from the scratchpad, `addr(a)` is multiplied by 8 for 8-byte aligned access.
@ -54,7 +54,7 @@ The second operand is loaded either from a register or from an immediate value e
`imm0` is an 8-bit immediate value, which is used for shift and rotate ALU operations.
`imm1` is a 32-bit immediate value which is used for most operations. For operands larger than 32 bits, the value is zero-extended for unsigned instructions and sign-extended for signed instructions. For FPU instructions, the value is considered a signed 32-bit integer and then converted to a double precision floating point format.
`imm1` is a 32-bit immediate value which is used for most operations. For operands larger than 32 bits, the value is sign-extended. For FPU instructions, the value is considered a signed 32-bit integer and then converted to a double precision floating point format.
#### Operand C
The third operand is the location where the result is stored.
@ -80,7 +80,7 @@ addr(c) = 8 * (addr1 XOR reg(c)[31:0])[W-1:0]
An 8-bit immediate value that is used as the shift/rotate count by some ALU instructions and as the jump offset of the CALL instruction.
#### addr0
A 32-bit address mask that is used to calculate the read address for the A operand.
A 32-bit address mask that is used to calculate the read address for the A operand. It's sign-extended to 64 bits.
#### addr1
A 32-bit address mask that is used to calculate the write address for the C operand. `addr1` is equal to `imm1`.

View File

@ -16,7 +16,7 @@ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with RandomX. If not, see <http://www.gnu.org/licenses/>.
*/
//#define TRACE
#include "AssemblyGeneratorX86.hpp"
#include "Pcg32.hpp"
#include "common.hpp"
@ -164,6 +164,9 @@ namespace RandomX {
asmCode << "\txor eax, 0" << std::hex << instr.addr1 << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl;
}
return;
case 1:
@ -174,10 +177,16 @@ namespace RandomX {
asmCode << "\txor eax, 0" << std::hex << instr.addr1 << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl;
}
return;
default:
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl;
}
}
}
@ -189,7 +198,7 @@ namespace RandomX {
asmCode << "\txor eax, 0" << std::hex << instr.addr1 << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl;
return;
break;
case 1:
case 2:
@ -198,10 +207,14 @@ namespace RandomX {
asmCode << "\txor eax, 0" << std::hex << instr.addr1 << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl;
return;
break;
default:
asmCode << "\tmovsd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
break;
}
if (trace) {
asmCode << "\tmovd qword ptr [rsi + rdi * 8 + 262144], xmm0" << std::endl;
}
}
@ -466,8 +479,11 @@ namespace RandomX {
asmCode << "\tjmp rx_i_" << wrapi(i + 1) << std::endl;
asmCode << "taken_call_" << i << ":" << std::endl;
}
if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl;
}
asmCode << "\tpush rax" << std::endl;
asmCode << "\tcall rx_i_" << wrapi(i + (instr.imm0 & 127) + 1) << std::endl;
asmCode << "\tcall rx_i_" << wrapi(i + (instr.imm0 & 127) + 2) << std::endl;
}
void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) {

View File

@ -43,5 +43,11 @@ namespace RandomX {
// Runs the compiled program once: initializes the FPU state via FPINIT(), then
// calls executeProgram with the register file, memory state, dataset reader and
// scratchpad. When TRACE is defined, the tracepad contents are printed
// afterwards as hexadecimal 64-bit values, one per line.
void CompiledVirtualMachine::execute() {
FPINIT();
executeProgram(reg, mem, readDataset, scratchpad);
#ifdef TRACE
// Walk the tracepad from its last slot (InstructionCount - 1) down to slot 0.
int32_t slot = InstructionCount;
while (slot-- > 0) {
std::cout << std::hex << tracepad[slot].u64 << std::endl;
}
#endif
}
}

View File

@ -18,7 +18,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
#pragma once
//#define TRACE
#include "VirtualMachine.hpp"
#include "Program.hpp"
#include <sstream>
@ -30,5 +30,9 @@ namespace RandomX {
CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) {}
virtual void initializeProgram(const void* seed) override;
virtual void execute() override;
private:
#ifdef TRACE
convertible_t tracepad[InstructionCount];
#endif
};
}

View File

@ -34,7 +34,7 @@ namespace RandomX {
uint8_t locc;
uint8_t regc;
uint8_t imm0;
uint32_t addr0;
int32_t addr0;
union {
uint32_t addr1;
int32_t imm1;

View File

@ -65,7 +65,7 @@ namespace RandomX {
convertible_t InterpretedVirtualMachine::loada(Instruction& inst) {
convertible_t& rega = reg.r[inst.rega % RegistersCount];
rega.u64 ^= inst.addr0;
rega.i64 ^= inst.addr0; //sign-extend addr0
addr_t addr = rega.u32;
switch (inst.loca & 7)
{
@ -86,7 +86,7 @@ namespace RandomX {
}
convertible_t InterpretedVirtualMachine::loadbr1(Instruction& inst) {
switch (inst.loca & 7)
switch (inst.locb & 7)
{
case 0:
case 1:
@ -98,7 +98,7 @@ namespace RandomX {
case 6:
case 7:
convertible_t temp;
temp.i64 = inst.imm1;
temp.i64 = inst.imm1; //sign-extend imm1
return temp;
}
}
@ -182,13 +182,13 @@ namespace RandomX {
}
#define ALU_RETIRE(x) x(a, b, c); \
if(trace) std::cout << std::hex << a.u64 << " " << b.u64 << " " << c.u64 << std::endl;
if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
#define FPU_RETIRE(x) x(a, b, c); \
if(trace) { \
convertible_t bc; \
bc.f64 = b; \
std::cout << std::hex << a.u64 << " " << bc.u64 << " " << c.u64 << std::endl; \
std::cout << std::hex << /*a.u64 << " " << bc.u64 << " " <<*/ c.u64 << std::endl; \
} \
if(fpuCheck) { \
convertible_t bc; \
@ -206,7 +206,7 @@ namespace RandomX {
}
#define FPU_RETIRE_NB(x) x(a, b, c); \
if(trace) std::cout << std::hex << a.u64 << " " << c.u64 << std::endl;
if(trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl;
#define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
convertible_t a = loada(inst); \
@ -277,9 +277,11 @@ namespace RandomX {
stackPush(pc);
pc += (inst.imm0 & 127) + 1;
pc = pc % ProgramLength;
if (trace) std::cout << std::hex << a.u64 << std::endl;
}
else {
c.u64 = a.u64;
if (trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl;
}
}
@ -296,6 +298,7 @@ namespace RandomX {
else {
c.u64 = a.u64;
}
if (trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl;
}
#include "instructionWeights.hpp"

View File

@ -58,16 +58,14 @@ namespace RandomX {
void VirtualMachine::initializeScratchpad(uint32_t index) {
if (lightClient) {
if (softAes) {
initBlock<true>(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 0, 4 * index + 0, mem.lcm->keys);
initBlock<true>(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 1, 4 * index + 1, mem.lcm->keys);
initBlock<true>(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 2, 4 * index + 2, mem.lcm->keys);
initBlock<true>(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 3, 4 * index + 3, mem.lcm->keys);
for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) {
initBlock<true>(mem.lcm->cache + CacheShift, ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, mem.lcm->keys);
}
}
else {
initBlock<false>(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 0, 4 * index + 0, mem.lcm->keys);
initBlock<false>(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 1, 4 * index + 1, mem.lcm->keys);
initBlock<false>(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 2, 4 * index + 2, mem.lcm->keys);
initBlock<false>(mem.lcm->cache, ((uint8_t*)scratchpad) + DatasetBlockSize * 3, 4 * index + 3, mem.lcm->keys);
for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) {
initBlock<false>(mem.lcm->cache + CacheShift, ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, mem.lcm->keys);
}
}
}
else {

View File

@ -35,6 +35,7 @@ namespace RandomX {
// Length passed to initializeCache together with the seed pointer.
constexpr int SeedSize = 32;
constexpr int CacheBlockSize = 1024;
// Half of one cache block, in bytes. Cache buffers are allocated with
// CacheSize + CacheShift bytes, the first CacheShift bytes of the cache are
// copied to its end, and initBlock is given `cache + CacheShift` as its base.
constexpr int CacheShift = CacheBlockSize / 2;
constexpr int BlockExpansionRatio = 64;
// 64 * 1024 = 65536 bytes per dataset block.
constexpr uint32_t DatasetBlockSize = BlockExpansionRatio * CacheBlockSize;
// Number of blocks generated when the full dataset is initialized.
constexpr uint32_t DatasetBlockCount = 65536;

View File

@ -26,6 +26,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include <new>
#include <algorithm>
#include <stdexcept>
#include <cstring>
#if defined(_MSC_VER)
#if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
@ -237,7 +238,7 @@ namespace RandomX {
convertible_t data;
auto blockNumber = memory.ma / DatasetBlockSize;
if (memory.lcm->blockNumber != blockNumber) {
initBlock<softAes>(memory.lcm->cache, (uint8_t*)memory.lcm->block, blockNumber, memory.lcm->keys);
initBlock<softAes>(memory.lcm->cache + CacheShift, (uint8_t*)memory.lcm->block, blockNumber, memory.lcm->keys);
memory.lcm->blockNumber = blockNumber;
}
data.u64 = *(uint64_t*)(memory.lcm->block + (memory.ma % DatasetBlockSize));
@ -263,15 +264,16 @@ namespace RandomX {
if (dataset == nullptr) {
throw std::runtime_error("Dataset memory allocation failed. >4 GiB of virtual memory is needed.");
}
uint8_t* cache = (uint8_t*)_mm_malloc(CacheSize, sizeof(__m128i));
if (dataset == nullptr) {
uint8_t* cache = (uint8_t*)_mm_malloc(CacheSize + CacheShift, sizeof(__m128i));
if (cache == nullptr) {
throw std::bad_alloc();
}
initializeCache(seed, SeedSize, cache);
memcpy(cache + CacheSize, cache, CacheShift);
alignas(16) __m128i keys[10];
expandAesKeys<softAes>((const __m128i*)seed, keys);
for (uint32_t i = 0; i < DatasetBlockCount; ++i) {
initBlock<softAes>(cache, dataset + i * DatasetBlockSize, i, keys);
initBlock<softAes>(cache + CacheShift, dataset + i * DatasetBlockSize, i, keys);
}
_mm_free(cache);
}
@ -285,11 +287,12 @@ namespace RandomX {
template<bool softAes>
void datasetInitLight(const void* seed, LightClientMemory*& lcm) {
lcm = new LightClientMemory();
lcm->cache = (uint8_t*)_mm_malloc(CacheSize, sizeof(__m128i));
lcm->cache = (uint8_t*)_mm_malloc(CacheSize + CacheShift, sizeof(__m128i));
if (lcm->cache == nullptr) {
throw std::bad_alloc();
}
initializeCache(seed, SeedSize, lcm->cache);
memcpy(lcm->cache + CacheSize, lcm->cache, CacheShift);
expandAesKeys<softAes>((__m128i*)seed, lcm->keys);
lcm->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i));
if (lcm->block == nullptr) {
@ -303,4 +306,4 @@ namespace RandomX {
template
void datasetInitLight<true>(const void*, LightClientMemory*&);
}
}

View File

@ -151,19 +151,40 @@ rx_finish:
; return
ret 0
rx_read_dataset:
rx_read_dataset_light:
push rdx
push r9
push r10
push r11
sub rsp, 32
movd qword ptr [rsp - 8], xmm1
movd qword ptr [rsp - 16], xmm2
sub rsp, 48
call qword ptr [rbp]
add rsp, 32
add rsp, 48
movd xmm2, qword ptr [rsp - 16]
movd xmm1, qword ptr [rsp - 8]
pop r11
pop r10
pop r9
pop rdx
ret 0
; rx_read_dataset: loads one qword from the dataset and updates ma/mx.
; In:  rdx = pointer to a structure holding dword `ma` at +0, dword `mx` at +4
;            and the qword dataset pointer at +8
;      ecx = 32-bit value that is XORed into mx
; Out: rax = qword read from dataset + ma (ma as it was on entry)
; Modifies r8, rcx and the flags.
rx_read_dataset:
mov r8d, dword ptr [rdx] ; ma
mov rax, qword ptr [rdx+8] ; dataset
mov rax, qword ptr [rax+r8] ; rax = qword at dataset + ma
add dword ptr [rdx], 8 ; ma += 8
mov r8d, dword ptr [rdx+4] ; mx
xor ecx, r8d ; ecx = ecx XOR mx
mov dword ptr [rdx+4], ecx ; store the result back as the new mx
test ecx, 0FFF8h ; are bits 3..15 of the new mx all zero?
jne short rx_read_dataset_full_ret ; no -> ma keeps its incremented value, no prefetch
and ecx, -8 ; clear the low 3 bits
mov dword ptr [rdx], ecx ; ma = new mx with the low 3 bits cleared
mov r8, qword ptr [rdx+8] ; reload the dataset pointer
prefetcht0 byte ptr [r8+rcx] ; prefetch the location the new ma points at
rx_read_dataset_full_ret:
ret 0
executeProgram ENDP
END

View File

@ -123,7 +123,7 @@ rx_i_6: ;CALL
jmp rx_i_7
taken_call_6:
push rax
call rx_i_50
call rx_i_51
rx_i_7: ;FPDIV
dec edi
@ -538,7 +538,7 @@ rx_i_38: ;CALL
jmp rx_i_39
taken_call_38:
push rax
call rx_i_111
call rx_i_112
rx_i_39: ;CALL
dec edi
@ -553,7 +553,7 @@ rx_i_39: ;CALL
jmp rx_i_40
taken_call_39:
push rax
call rx_i_61
call rx_i_62
rx_i_40: ;FPMUL
dec edi
@ -621,7 +621,7 @@ rx_i_44: ;CALL
jmp rx_i_45
taken_call_44:
push rax
call rx_i_93
call rx_i_94
rx_i_45: ;FPROUND
dec edi
@ -726,7 +726,7 @@ rx_i_51: ;CALL
jmp rx_i_52
taken_call_51:
push rax
call rx_i_134
call rx_i_135
rx_i_52: ;FPDIV
dec edi
@ -943,7 +943,7 @@ rx_i_65: ;CALL
jmp rx_i_66
taken_call_65:
push rax
call rx_i_123
call rx_i_124
rx_i_66: ;FPSUB
dec edi
@ -996,7 +996,7 @@ rx_i_69: ;CALL
jmp rx_i_70
taken_call_69:
push rax
call rx_i_132
call rx_i_133
rx_i_70: ;FPDIV
dec edi
@ -1022,7 +1022,7 @@ rx_i_71: ;CALL
jmp rx_i_72
taken_call_71:
push rax
call rx_i_82
call rx_i_83
rx_i_72: ;FPADD
dec edi
@ -1093,7 +1093,7 @@ rx_i_76: ;CALL
jmp rx_i_77
taken_call_76:
push rax
call rx_i_194
call rx_i_195
rx_i_77: ;FPDIV
dec edi
@ -1138,7 +1138,7 @@ rx_i_79: ;CALL
jmp rx_i_80
taken_call_79:
push rax
call rx_i_205
call rx_i_206
rx_i_80: ;FPADD
dec edi
@ -1208,7 +1208,7 @@ rx_i_83: ;CALL
jmp rx_i_84
taken_call_83:
push rax
call rx_i_96
call rx_i_97
rx_i_84: ;ROR_64
dec edi
@ -1249,7 +1249,7 @@ rx_i_86: ;CALL
jmp rx_i_87
taken_call_86:
push rax
call rx_i_148
call rx_i_149
rx_i_87: ;DIV_64
dec edi
@ -1376,7 +1376,7 @@ rx_i_96: ;CALL
mov ecx, ebx
call rx_read_dataset
push rax
call rx_i_173
call rx_i_174
rx_i_97: ;ROR_64
dec edi
@ -1402,7 +1402,7 @@ rx_i_98: ;CALL
jmp rx_i_99
taken_call_98:
push rax
call rx_i_160
call rx_i_161
rx_i_99: ;MUL_64
dec edi
@ -1567,7 +1567,7 @@ rx_i_111: ;CALL
and eax, 2047
mov rax, qword ptr [rsi + rax * 8]
push rax
call rx_i_146
call rx_i_147
rx_i_112: ;FPMUL
dec edi
@ -1617,7 +1617,7 @@ rx_i_115: ;CALL
mov ecx, ebx
call rx_read_dataset
push rax
call rx_i_215
call rx_i_216
rx_i_116: ;ADD_32
dec edi
@ -1778,7 +1778,7 @@ rx_i_126: ;CALL
jmp rx_i_127
taken_call_126:
push rax
call rx_i_195
call rx_i_196
rx_i_127: ;ADD_64
dec edi
@ -1806,7 +1806,7 @@ rx_i_128: ;CALL
jmp rx_i_129
taken_call_128:
push rax
call rx_i_240
call rx_i_241
rx_i_129: ;MUL_32
dec edi
@ -1863,7 +1863,7 @@ rx_i_133: ;CALL
and eax, 2047
mov rax, qword ptr [rsi + rax * 8]
push rax
call rx_i_157
call rx_i_158
rx_i_134: ;AND_64
dec edi
@ -2049,7 +2049,7 @@ rx_i_146: ;CALL
jmp rx_i_147
taken_call_146:
push rax
call rx_i_260
call rx_i_261
rx_i_147: ;IMUL_32
dec edi
@ -2277,7 +2277,7 @@ rx_i_163: ;CALL
jmp rx_i_164
taken_call_163:
push rax
call rx_i_184
call rx_i_185
rx_i_164: ;ADD_32
dec edi
@ -2430,7 +2430,7 @@ rx_i_173: ;CALL
jmp rx_i_174
taken_call_173:
push rax
call rx_i_200
call rx_i_201
rx_i_174: ;FPSQRT
dec edi
@ -2593,7 +2593,7 @@ rx_i_185: ;CALL
jmp rx_i_186
taken_call_185:
push rax
call rx_i_214
call rx_i_215
rx_i_186: ;FPADD
dec edi
@ -2647,7 +2647,7 @@ rx_i_189: ;CALL
jmp rx_i_190
taken_call_189:
push rax
call rx_i_249
call rx_i_250
rx_i_190: ;XOR_64
dec edi
@ -3209,7 +3209,7 @@ rx_i_230: ;CALL
jmp rx_i_231
taken_call_230:
push rax
call rx_i_331
call rx_i_332
rx_i_231: ;FPMUL
dec edi
@ -3323,7 +3323,7 @@ rx_i_237: ;CALL
jmp rx_i_238
taken_call_237:
push rax
call rx_i_271
call rx_i_272
rx_i_238: ;FPDIV
dec edi
@ -3379,7 +3379,7 @@ rx_i_241: ;CALL
mov ecx, r15d
call rx_read_dataset
push rax
call rx_i_298
call rx_i_299
rx_i_242: ;ROR_64
dec edi
@ -3597,7 +3597,7 @@ rx_i_257: ;CALL
jmp rx_i_258
taken_call_257:
push rax
call rx_i_370
call rx_i_371
rx_i_258: ;FPADD
dec edi
@ -3776,7 +3776,7 @@ rx_i_270: ;CALL
jmp rx_i_271
taken_call_270:
push rax
call rx_i_298
call rx_i_299
rx_i_271: ;ROL_64
dec edi
@ -3868,7 +3868,7 @@ rx_i_277: ;CALL
and eax, 2047
mov rax, qword ptr [rsi + rax * 8]
push rax
call rx_i_375
call rx_i_376
rx_i_278: ;FPADD
dec edi
@ -4548,7 +4548,7 @@ rx_i_326: ;CALL
jmp rx_i_327
taken_call_326:
push rax
call rx_i_346
call rx_i_347
rx_i_327: ;MUL_64
dec edi
@ -4922,7 +4922,7 @@ rx_i_354: ;CALL
jmp rx_i_355
taken_call_354:
push rax
call rx_i_355
call rx_i_356
rx_i_355: ;MUL_64
dec edi
@ -5659,7 +5659,7 @@ rx_i_409: ;CALL
jmp rx_i_410
taken_call_409:
push rax
call rx_i_497
call rx_i_498
rx_i_410: ;FPDIV
dec edi
@ -5866,7 +5866,7 @@ rx_i_425: ;CALL
jmp rx_i_426
taken_call_425:
push rax
call rx_i_34
call rx_i_35
rx_i_426: ;IMUL_32
dec edi
@ -6556,7 +6556,7 @@ rx_i_476: ;CALL
and eax, 2047
mov rax, qword ptr [rsi + rax * 8]
push rax
call rx_i_11
call rx_i_12
rx_i_477: ;MUL_64
dec edi
@ -6580,7 +6580,7 @@ rx_i_478: ;CALL
jmp rx_i_479
taken_call_478:
push rax
call rx_i_72
call rx_i_73
rx_i_479: ;FPSUB
dec edi
@ -6721,7 +6721,7 @@ rx_i_489: ;CALL
jmp rx_i_490
taken_call_489:
push rax
call rx_i_61
call rx_i_62
rx_i_490: ;ADD_64
dec edi