Reduced x86 code size by 512 bytes (and ecx -> and eax)

This commit is contained in:
tevador 2019-01-12 20:27:35 +01:00
parent 1426fcbab5
commit 67e741ff22
8 changed files with 1841 additions and 1842 deletions

View File

@ -66,34 +66,34 @@ namespace RandomX {
void AssemblyGeneratorX86::gena(Instruction& instr, int i) { void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
asmCode << "\tjnz short rx_body_" << i << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl;
if (instr.loca & 3) { if (instr.loca & 3) {
asmCode << "\tcall rx_read_l1" << std::endl; asmCode << "\tcall rx_read_l1" << std::endl;
asmCode << "rx_body_" << i << ":" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl;
if ((instr.loca & 192) == 0) if ((instr.loca & 192) == 0)
asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\txor " << regMx << ", rax" << std::endl;
asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
} }
else { else {
asmCode << "\tcall rx_read_l2" << std::endl; asmCode << "\tcall rx_read_l2" << std::endl;
asmCode << "rx_body_" << i << ":" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl;
if ((instr.loca & 192) == 0) if ((instr.loca & 192) == 0)
asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\txor " << regMx << ", rax" << std::endl;
asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
} }
} }
void AssemblyGeneratorX86::genar(Instruction& instr, int i) { void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
gena(instr, i); gena(instr, i);
asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl; asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
} }
void AssemblyGeneratorX86::genaf(Instruction& instr, int i) { void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
gena(instr, i); gena(instr, i);
asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl; asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
} }
void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) { void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) {

View File

@ -48,7 +48,7 @@ DECL(randomx_program_begin):
DECL(randomx_program_epilogue): DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc" #include "asm/program_epilogue_linux.inc"
#define scratchpad_mask and ecx, 2040 #define scratchpad_mask and eax, 2040
.align 64 .align 64
DECL(randomx_program_read_l1): DECL(randomx_program_read_l1):
@ -56,7 +56,7 @@ DECL(randomx_program_read_l1):
#undef scratchpad_mask #undef scratchpad_mask
#define scratchpad_mask and ecx, 32760 #define scratchpad_mask and eax, 32760
.align 64 .align 64
DECL(randomx_program_read_l2): DECL(randomx_program_read_l2):

View File

@ -42,7 +42,7 @@ randomx_program_epilogue PROC
randomx_program_epilogue ENDP randomx_program_epilogue ENDP
scratchpad_mask MACRO scratchpad_mask MACRO
and ecx, 2040 and eax, 2040
ENDM ENDM
ALIGN 64 ALIGN 64
@ -51,7 +51,7 @@ randomx_program_read_l1 PROC
randomx_program_read_l1 ENDP randomx_program_read_l1 ENDP
scratchpad_mask MACRO scratchpad_mask MACRO
and ecx, 32760 and eax, 32760
ENDM ENDM
ALIGN 64 ALIGN 64

View File

@ -175,7 +175,7 @@ namespace RandomX {
emitByte(0xf0 + (instr.rega % RegistersCount)); emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra); emit(instr.addra);
emit(uint16_t(0x8b41)); //mov emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emit(0x753fc3f6); //test bl,0x3f; jne emit(0x753fc3f6); //test bl,0x3f; jne
emit(uint16_t(0xe805)); emit(uint16_t(0xe805));
if (instr.loca & 3) { //A.LOC.W if (instr.loca & 3) { //A.LOC.W
@ -186,9 +186,9 @@ namespace RandomX {
} }
if ((instr.loca & 192) == 0) { //A.LOC.X if ((instr.loca & 192) == 0) { //A.LOC.X
emit(uint16_t(0x3348)); emit(uint16_t(0x3348));
emitByte(0xe9); //xor rbp, rcx emitByte(0xe8); //xor rbp, rax
} }
emit(uint16_t(0xe181)); //and ecx, emitByte(0x25); //and eax,
if (instr.loca & 3) { if (instr.loca & 3) {
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
} }
@ -199,14 +199,13 @@ namespace RandomX {
void JitCompilerX86::genar(Instruction& instr) { void JitCompilerX86::genar(Instruction& instr) {
gena(instr); gena(instr);
emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8] emit(0xc6048b48); //mov rax,QWORD PTR [rsi+rax*8]
emit(0xdc580f66);
} }
void JitCompilerX86::genaf(Instruction& instr) { void JitCompilerX86::genaf(Instruction& instr) {
gena(instr); gena(instr);
emitByte(0xf3); emitByte(0xf3);
emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8] emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
} }
void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {

View File

@ -1,32 +1,32 @@
push rcx ;# preserve ecx push rax ;# preserve eax
db 0, 0, 0, 0 ;# TransformAddress placeholder db 0, 0, 0, 0 ;# TransformAddress placeholder
mov rax, qword ptr [rdi] ;# load the dataset address mov rcx, qword ptr [rdi] ;# load the dataset address
xor rbp, rcx ;# modify "mx" xor rbp, rax ;# modify "mx"
;# prefetch cacheline "mx" ;# prefetch cacheline "mx"
and rbp, -64 ;# align "mx" to the start of a cache line and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rax+rdx] prefetchnta byte ptr [rcx+rdx]
;# read cacheline "ma" ;# read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx" ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma mov edx, ebp ;# edx = ma
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8 scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rcx, [rsi+rcx*8] ;# scratchpad cache line lea rax, [rsi+rax*8] ;# scratchpad cache line
lea rax, [rax+rdx] ;# dataset cache line lea rcx, [rcx+rdx] ;# dataset cache line
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8] mov rdx, qword ptr [rcx+8]
xor qword ptr [rcx+8], rdx xor qword ptr [rax+8], rdx
mov rdx, qword ptr [rax+16] mov rdx, qword ptr [rcx+16]
xor qword ptr [rcx+16], rdx xor qword ptr [rax+16], rdx
mov rdx, qword ptr [rax+24] mov rdx, qword ptr [rcx+24]
xor qword ptr [rcx+24], rdx xor qword ptr [rax+24], rdx
mov rdx, qword ptr [rax+32] mov rdx, qword ptr [rcx+32]
xor qword ptr [rcx+32], rdx xor qword ptr [rax+32], rdx
mov rdx, qword ptr [rax+40] mov rdx, qword ptr [rcx+40]
xor qword ptr [rcx+40], rdx xor qword ptr [rax+40], rdx
mov rdx, qword ptr [rax+48] mov rdx, qword ptr [rcx+48]
xor qword ptr [rcx+48], rdx xor qword ptr [rax+48], rdx
mov rdx, qword ptr [rax+56] mov rdx, qword ptr [rcx+56]
xor qword ptr [rcx+56], rdx xor qword ptr [rax+56], rdx
pop rcx ;# restore ecx pop rax ;# restore eax
ret ret

View File

@ -1,154 +1,154 @@
;# 90 address transformations ;# 90 address transformations
;# forced REX prefix is used to make all transformations 4 bytes long ;# forced REX prefix is used to make all transformations 4 bytes long
lea ecx, [rcx+rcx*8+109] lea eax, [rax+rax*8+109]
db 64 db 64
xor ecx, 96 xor eax, 96
lea ecx, [rcx+rcx*8-19] lea eax, [rax+rax*8-19]
db 64 db 64
add ecx, -98 add eax, -98
db 64 db 64
add ecx, -21 add eax, -21
db 64 db 64
xor ecx, -80 xor eax, -80
lea ecx, [rcx+rcx*8-92] lea eax, [rax+rax*8-92]
db 64 db 64
add ecx, 113 add eax, 113
lea ecx, [rcx+rcx*8+100] lea eax, [rax+rax*8+100]
db 64 db 64
add ecx, -39 add eax, -39
db 64 db 64
xor ecx, 120 xor eax, 120
lea ecx, [rcx+rcx*8-119] lea eax, [rax+rax*8-119]
db 64 db 64
add ecx, -113 add eax, -113
db 64 db 64
add ecx, 111 add eax, 111
db 64 db 64
xor ecx, 104 xor eax, 104
lea ecx, [rcx+rcx*8-83] lea eax, [rax+rax*8-83]
lea ecx, [rcx+rcx*8+127] lea eax, [rax+rax*8+127]
db 64 db 64
xor ecx, -112 xor eax, -112
db 64 db 64
add ecx, 89 add eax, 89
db 64 db 64
add ecx, -32 add eax, -32
db 64 db 64
add ecx, 104 add eax, 104
db 64 db 64
xor ecx, -120 xor eax, -120
db 64 db 64
xor ecx, 24 xor eax, 24
lea ecx, [rcx+rcx*8+9] lea eax, [rax+rax*8+9]
db 64 db 64
add ecx, -31 add eax, -31
db 64 db 64
xor ecx, -16 xor eax, -16
db 64 db 64
add ecx, 68 add eax, 68
lea ecx, [rcx+rcx*8-110] lea eax, [rax+rax*8-110]
db 64 db 64
xor ecx, 64 xor eax, 64
db 64 db 64
xor ecx, -40 xor eax, -40
db 64 db 64
xor ecx, -8 xor eax, -8
db 64 db 64
add ecx, -10 add eax, -10
db 64 db 64
xor ecx, -32 xor eax, -32
db 64 db 64
add ecx, 14 add eax, 14
lea ecx, [rcx+rcx*8-46] lea eax, [rax+rax*8-46]
db 64 db 64
xor ecx, -104 xor eax, -104
lea ecx, [rcx+rcx*8+36] lea eax, [rax+rax*8+36]
db 64 db 64
add ecx, 100 add eax, 100
lea ecx, [rcx+rcx*8-65] lea eax, [rax+rax*8-65]
lea ecx, [rcx+rcx*8+27] lea eax, [rax+rax*8+27]
lea ecx, [rcx+rcx*8+91] lea eax, [rax+rax*8+91]
db 64 db 64
add ecx, -101 add eax, -101
db 64 db 64
add ecx, -94 add eax, -94
lea ecx, [rcx+rcx*8-10] lea eax, [rax+rax*8-10]
db 64 db 64
xor ecx, 80 xor eax, 80
db 64 db 64
add ecx, -108 add eax, -108
db 64 db 64
add ecx, -58 add eax, -58
db 64 db 64
xor ecx, 48 xor eax, 48
lea ecx, [rcx+rcx*8+73] lea eax, [rax+rax*8+73]
db 64 db 64
xor ecx, -48 xor eax, -48
db 64 db 64
xor ecx, 32 xor eax, 32
db 64 db 64
xor ecx, -96 xor eax, -96
db 64 db 64
add ecx, 118 add eax, 118
db 64 db 64
add ecx, 91 add eax, 91
lea ecx, [rcx+rcx*8+18] lea eax, [rax+rax*8+18]
db 64 db 64
add ecx, -11 add eax, -11
lea ecx, [rcx+rcx*8+63] lea eax, [rax+rax*8+63]
db 64 db 64
add ecx, 114 add eax, 114
lea ecx, [rcx+rcx*8+45] lea eax, [rax+rax*8+45]
db 64 db 64
add ecx, -67 add eax, -67
db 64 db 64
add ecx, 53 add eax, 53
lea ecx, [rcx+rcx*8-101] lea eax, [rax+rax*8-101]
lea ecx, [rcx+rcx*8-1] lea eax, [rax+rax*8-1]
db 64 db 64
xor ecx, 16 xor eax, 16
lea ecx, [rcx+rcx*8-37] lea eax, [rax+rax*8-37]
lea ecx, [rcx+rcx*8-28] lea eax, [rax+rax*8-28]
lea ecx, [rcx+rcx*8-55] lea eax, [rax+rax*8-55]
db 64 db 64
xor ecx, -88 xor eax, -88
db 64 db 64
xor ecx, -72 xor eax, -72
db 64 db 64
add ecx, 36 add eax, 36
db 64 db 64
xor ecx, -56 xor eax, -56
db 64 db 64
add ecx, 116 add eax, 116
db 64 db 64
xor ecx, 88 xor eax, 88
db 64 db 64
xor ecx, -128 xor eax, -128
db 64 db 64
add ecx, 50 add eax, 50
db 64 db 64
add ecx, 105 add eax, 105
db 64 db 64
add ecx, -37 add eax, -37
db 64 db 64
xor ecx, 112 xor eax, 112
db 64 db 64
xor ecx, 8 xor eax, 8
db 64 db 64
xor ecx, -24 xor eax, -24
lea ecx, [rcx+rcx*8+118] lea eax, [rax+rax*8+118]
db 64 db 64
xor ecx, 72 xor eax, 72
db 64 db 64
xor ecx, -64 xor eax, -64
db 64 db 64
add ecx, 40 add eax, 40
lea ecx, [rcx+rcx*8-74] lea eax, [rax+rax*8-74]
lea ecx, [rcx+rcx*8+82] lea eax, [rax+rax*8+82]
lea ecx, [rcx+rcx*8+54] lea eax, [rax+rax*8+54]
db 64 db 64
xor ecx, 56 xor eax, 56
db 64 db 64
xor ecx, 40 xor eax, 40
db 64 db 64
add ecx, 87 add eax, 87

View File

@ -222,42 +222,42 @@ TransformAddress MACRO reg32, reg64
ENDM ENDM
ReadMemoryRandom MACRO spmask ReadMemoryRandom MACRO spmask
;# IN ecx = random 32-bit address ;# IN eax = random 32-bit address
;# GLOBAL rdi = address of the dataset address ;# GLOBAL rdi = address of the dataset address
;# GLOBAL rsi = address of the scratchpad ;# GLOBAL rsi = address of the scratchpad
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma" ;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
;# MODIFY rcx, rdx ;# MODIFY rcx, rdx
push rcx ;# preserve ecx push rax ;# preserve eax
TransformAddress ecx, rcx ;# TransformAddress function TransformAddress eax, rax ;# TransformAddress function
mov rax, qword ptr [rdi] ;# load the dataset address mov rcx, qword ptr [rdi] ;# load the dataset address
xor rbp, rcx ;# modify "mx" xor rbp, rax ;# modify "mx"
; prefetch cacheline "mx" ; prefetch cacheline "mx"
and rbp, -64 ;# align "mx" to the start of a cache line and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rax+rdx] prefetchnta byte ptr [rcx+rdx]
; read cacheline "ma" ; read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx" ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma mov edx, ebp ;# edx = ma
and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8 and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rcx, [rsi+rcx*8] ;# scratchpad cache line lea rax, [rsi+rax*8] ;# scratchpad cache line
lea rax, [rax+rdx] ;# dataset cache line lea rcx, [rcx+rdx] ;# dataset cache line
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8] mov rdx, qword ptr [rcx+8]
xor qword ptr [rcx+8], rdx xor qword ptr [rax+8], rdx
mov rdx, qword ptr [rax+16] mov rdx, qword ptr [rcx+16]
xor qword ptr [rcx+16], rdx xor qword ptr [rax+16], rdx
mov rdx, qword ptr [rax+24] mov rdx, qword ptr [rcx+24]
xor qword ptr [rcx+24], rdx xor qword ptr [rax+24], rdx
mov rdx, qword ptr [rax+32] mov rdx, qword ptr [rcx+32]
xor qword ptr [rcx+32], rdx xor qword ptr [rax+32], rdx
mov rdx, qword ptr [rax+40] mov rdx, qword ptr [rcx+40]
xor qword ptr [rcx+40], rdx xor qword ptr [rax+40], rdx
mov rdx, qword ptr [rax+48] mov rdx, qword ptr [rcx+48]
xor qword ptr [rcx+48], rdx xor qword ptr [rax+48], rdx
mov rdx, qword ptr [rax+56] mov rdx, qword ptr [rcx+56]
xor qword ptr [rcx+56], rdx xor qword ptr [rax+56], rdx
pop rcx ;# restore ecx pop rax ;# restore eax
ret ret
ENDM ENDM

File diff suppressed because it is too large Load Diff