mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2025-01-20 14:08:36 +00:00
Optimizations to reduce code size under 32K
This commit is contained in:
parent
b6d654291f
commit
b71e0eec65
@ -28,9 +28,10 @@ namespace RandomX {
|
|||||||
static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" };
|
static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" };
|
||||||
static const char* regF[8] = { "xmm8", "xmm9", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" };
|
static const char* regF[8] = { "xmm8", "xmm9", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" };
|
||||||
|
|
||||||
static const char* regMx = "rdi";
|
static const char* regMx = "rbp";
|
||||||
static const char* regIc = "ebp";
|
static const char* regIc = "ebx";
|
||||||
static const char* regStackBeginAddr = "rbx";
|
static const char* regIc8 = "bl";
|
||||||
|
static const char* regStackBeginAddr = "rdi";
|
||||||
static const char* regScratchpadAddr = "rsi";
|
static const char* regScratchpadAddr = "rsi";
|
||||||
|
|
||||||
void AssemblyGeneratorX86::generateProgram(const void* seed) {
|
void AssemblyGeneratorX86::generateProgram(const void* seed) {
|
||||||
@ -62,7 +63,7 @@ namespace RandomX {
|
|||||||
void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
|
||||||
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
||||||
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
|
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
|
||||||
asmCode << "\ttest " << regIc << ", 63" << std::endl;
|
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
|
||||||
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
||||||
switch (instr.loca & 3)
|
switch (instr.loca & 3)
|
||||||
{
|
{
|
||||||
@ -71,12 +72,14 @@ namespace RandomX {
|
|||||||
case 2:
|
case 2:
|
||||||
asmCode << "\tcall rx_read_l1" << std::endl;
|
asmCode << "\tcall rx_read_l1" << std::endl;
|
||||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||||
|
if ((instr.loca & 192) == 0)
|
||||||
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
||||||
asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
|
asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
|
||||||
break;
|
break;
|
||||||
default: //3
|
default: //3
|
||||||
asmCode << "\tcall rx_read_l2" << std::endl;
|
asmCode << "\tcall rx_read_l2" << std::endl;
|
||||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||||
|
if ((instr.loca & 192) == 0)
|
||||||
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
||||||
asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
|
asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
|
||||||
break;
|
break;
|
||||||
@ -88,7 +91,7 @@ namespace RandomX {
|
|||||||
void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
|
||||||
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
||||||
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
|
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
|
||||||
asmCode << "\ttest " << regIc << ", 63" << std::endl;
|
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
|
||||||
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
||||||
switch (instr.loca & 3)
|
switch (instr.loca & 3)
|
||||||
{
|
{
|
||||||
@ -97,12 +100,14 @@ namespace RandomX {
|
|||||||
case 2:
|
case 2:
|
||||||
asmCode << "\tcall rx_read_l1" << std::endl;
|
asmCode << "\tcall rx_read_l1" << std::endl;
|
||||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||||
|
if((instr.loca & 192) == 0)
|
||||||
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
||||||
asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
|
asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
|
||||||
break;
|
break;
|
||||||
default: //3
|
default: //3
|
||||||
asmCode << "\tcall rx_read_l2" << std::endl;
|
asmCode << "\tcall rx_read_l2" << std::endl;
|
||||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||||
|
if ((instr.loca & 192) == 0)
|
||||||
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
||||||
asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
|
asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
|
||||||
break;
|
break;
|
||||||
@ -200,7 +205,8 @@ namespace RandomX {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void AssemblyGeneratorX86::gencf(Instruction& instr) {
|
void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) {
|
||||||
|
if(move)
|
||||||
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
|
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
|
||||||
const char* store = (instr.locc & 8) ? "movhpd" : "movlpd";
|
const char* store = (instr.locc & 8) ? "movhpd" : "movlpd";
|
||||||
switch (instr.locc & 7)
|
switch (instr.locc & 7)
|
||||||
@ -451,8 +457,8 @@ namespace RandomX {
|
|||||||
void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) {
|
||||||
genaf(instr, i);
|
genaf(instr, i);
|
||||||
asmCode << "\tandps xmm0, xmm10" << std::endl;
|
asmCode << "\tandps xmm0, xmm10" << std::endl;
|
||||||
asmCode << "\tsqrtpd xmm0, xmm0" << std::endl;
|
asmCode << "\tsqrtpd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
|
||||||
gencf(instr);
|
gencf(instr, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
|
||||||
|
@ -45,7 +45,7 @@ namespace RandomX {
|
|||||||
void genbr132(Instruction&);
|
void genbr132(Instruction&);
|
||||||
void genbf(Instruction&, const char*);
|
void genbf(Instruction&, const char*);
|
||||||
void gencr(Instruction&);
|
void gencr(Instruction&);
|
||||||
void gencf(Instruction&);
|
void gencf(Instruction&, bool);
|
||||||
|
|
||||||
void generateCode(Instruction&, int);
|
void generateCode(Instruction&, int);
|
||||||
|
|
||||||
|
@ -22,12 +22,12 @@ PUBLIC executeProgram
|
|||||||
executeProgram PROC
|
executeProgram PROC
|
||||||
; REGISTER ALLOCATION:
|
; REGISTER ALLOCATION:
|
||||||
; rax -> temporary
|
; rax -> temporary
|
||||||
; rbx -> beginning of VM stack
|
; rbx -> "ic"
|
||||||
; rcx -> temporary
|
; rcx -> temporary
|
||||||
; rdx -> temporary
|
; rdx -> temporary
|
||||||
; rsi -> convertible_t& scratchpad
|
; rsi -> convertible_t& scratchpad
|
||||||
; rdi -> "mx"
|
; rdi -> beginning of VM stack
|
||||||
; rbp -> "ic"
|
; rbp -> "ma", "mx"
|
||||||
; rsp -> end of VM stack
|
; rsp -> end of VM stack
|
||||||
; r8 -> "r0"
|
; r8 -> "r0"
|
||||||
; r9 -> "r1"
|
; r9 -> "r1"
|
||||||
@ -82,13 +82,13 @@ executeProgram PROC
|
|||||||
|
|
||||||
; function arguments
|
; function arguments
|
||||||
push rcx ; RegisterFile& registerFile
|
push rcx ; RegisterFile& registerFile
|
||||||
mov rdi, qword ptr [rdx] ; "mx", "ma"
|
mov rbp, qword ptr [rdx] ; "mx", "ma"
|
||||||
mov rax, qword ptr [rdx+8] ; uint8_t* dataset
|
mov rax, qword ptr [rdx+8] ; uint8_t* dataset
|
||||||
push rax
|
push rax
|
||||||
mov rsi, r8 ; convertible_t* scratchpad
|
mov rsi, r8 ; convertible_t* scratchpad
|
||||||
|
|
||||||
mov rbx, rsp ; beginning of VM stack
|
mov rdi, rsp ; beginning of VM stack
|
||||||
mov ebp, 1048577 ; number of VM instructions to execute + 1
|
mov ebx, 1048577 ; number of VM instructions to execute + 1
|
||||||
|
|
||||||
xorps xmm10, xmm10
|
xorps xmm10, xmm10
|
||||||
cmpeqpd xmm10, xmm10
|
cmpeqpd xmm10, xmm10
|
||||||
@ -164,7 +164,7 @@ executeProgram PROC
|
|||||||
|
|
||||||
rx_finish:
|
rx_finish:
|
||||||
; unroll the stack
|
; unroll the stack
|
||||||
mov rsp, rbx
|
mov rsp, rdi
|
||||||
|
|
||||||
; save VM register values
|
; save VM register values
|
||||||
pop rcx
|
pop rcx
|
||||||
@ -211,30 +211,29 @@ TransformAddress MACRO reg32, reg64
|
|||||||
;# lies in a different cache line than the original address (mod 2^N).
|
;# lies in a different cache line than the original address (mod 2^N).
|
||||||
;# This is done to prevent a load-store dependency.
|
;# This is done to prevent a load-store dependency.
|
||||||
;# There are 3 different transformations that can be used: x -> 9*x+C, x -> x+C, x -> x^C
|
;# There are 3 different transformations that can be used: x -> 9*x+C, x -> x+C, x -> x^C
|
||||||
lea reg32, [reg64+reg64*8+127] ;# C = -119 -110 -101 -92 -83 -74 -65 -55 -46 -37 -28 -19 -10 -1 9 18 27 36 45 54 63 73 82 91 100 109 118 127
|
;lea reg32, [reg64+reg64*8+127] ;# C = -119 -110 -101 -92 -83 -74 -65 -55 -46 -37 -28 -19 -10 -1 9 18 27 36 45 54 63 73 82 91 100 109 118 127
|
||||||
;lea reg32, [reg64-128] ;# C = all except -7 to +7
|
db 64
|
||||||
|
add reg32, -39 ;# C = all except -7 to +7
|
||||||
;xor reg32, -8 ;# C = all except 0 to 7
|
;xor reg32, -8 ;# C = all except 0 to 7
|
||||||
ENDM
|
ENDM
|
||||||
|
|
||||||
ReadMemoryRandom MACRO spmask
|
ReadMemoryRandom MACRO spmask
|
||||||
;# IN ecx = random 32-bit address
|
;# IN ecx = random 32-bit address
|
||||||
;# OUT rax = 64-bit integer return value
|
;# GLOBAL rdi = beginning of VM stack; the qword at [rdi] holds the dataset address
|
||||||
;# OUT xmm0 = 128-bit floating point return value
|
|
||||||
;# GLOBAL rbp = "ic" number of instructions until the end of the program
|
|
||||||
;# GLOBAL rbx = address of the dataset address
|
|
||||||
;# GLOBAL rsi = address of the scratchpad
|
;# GLOBAL rsi = address of the scratchpad
|
||||||
;# GLOBAL rdi = low 32 bits = "mx", high 32 bits = "ma"
|
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
|
||||||
;# MODIFY rcx, rdx
|
;# MODIFY rcx, rdx
|
||||||
|
push rcx ;# preserve ecx
|
||||||
|
TransformAddress ecx, rcx ;# TransformAddress function
|
||||||
|
mov rax, qword ptr [rdi] ;# load the dataset address
|
||||||
|
xor rbp, rcx ;# modify "mx"
|
||||||
; prefetch cacheline "mx"
|
; prefetch cacheline "mx"
|
||||||
mov rax, qword ptr [rbx] ;# load the dataset address
|
and rbp, -64 ;# align "mx" to the start of a cache line
|
||||||
and rdi, -64 ;# align "mx" to the start of a cache line
|
mov edx, ebp ;# edx = mx
|
||||||
mov edx, edi ;# edx = mx
|
|
||||||
prefetchnta byte ptr [rax+rdx]
|
prefetchnta byte ptr [rax+rdx]
|
||||||
; read cacheline "ma"
|
; read cacheline "ma"
|
||||||
ror rdi, 32 ;# swap "ma" and "mx"
|
ror rbp, 32 ;# swap "ma" and "mx"
|
||||||
mov edx, edi ;# edx = ma
|
mov edx, ebp ;# edx = ma
|
||||||
push rcx
|
|
||||||
TransformAddress ecx, rcx ;# TransformAddress function
|
|
||||||
and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
|
and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
|
||||||
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
|
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
|
||||||
lea rax, [rax+rdx] ;# dataset cache line
|
lea rax, [rax+rdx] ;# dataset cache line
|
||||||
@ -254,7 +253,7 @@ ReadMemoryRandom MACRO spmask
|
|||||||
xor qword ptr [rcx+48], rdx
|
xor qword ptr [rcx+48], rdx
|
||||||
mov rdx, qword ptr [rax+56]
|
mov rdx, qword ptr [rax+56]
|
||||||
xor qword ptr [rcx+56], rdx
|
xor qword ptr [rcx+56], rdx
|
||||||
pop rcx
|
pop rcx ;# restore ecx
|
||||||
ret
|
ret
|
||||||
ENDM
|
ENDM
|
||||||
|
|
||||||
|
2806
src/program.inc
2806
src/program.inc
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user