mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-12-22 15:58:53 +00:00
Reduced x86 code size by 512 bytes (and ecx -> and eax)
This commit is contained in:
parent
1426fcbab5
commit
67e741ff22
@ -66,34 +66,34 @@ namespace RandomX {
|
|||||||
|
|
||||||
void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
|
||||||
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
||||||
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
|
asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
|
||||||
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
|
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
|
||||||
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
||||||
if (instr.loca & 3) {
|
if (instr.loca & 3) {
|
||||||
asmCode << "\tcall rx_read_l1" << std::endl;
|
asmCode << "\tcall rx_read_l1" << std::endl;
|
||||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||||
if ((instr.loca & 192) == 0)
|
if ((instr.loca & 192) == 0)
|
||||||
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
||||||
asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
|
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
asmCode << "\tcall rx_read_l2" << std::endl;
|
asmCode << "\tcall rx_read_l2" << std::endl;
|
||||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||||
if ((instr.loca & 192) == 0)
|
if ((instr.loca & 192) == 0)
|
||||||
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
||||||
asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
|
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
|
||||||
gena(instr, i);
|
gena(instr, i);
|
||||||
asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl;
|
asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
|
||||||
gena(instr, i);
|
gena(instr, i);
|
||||||
asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl;
|
asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) {
|
void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) {
|
||||||
|
@ -48,7 +48,7 @@ DECL(randomx_program_begin):
|
|||||||
DECL(randomx_program_epilogue):
|
DECL(randomx_program_epilogue):
|
||||||
#include "asm/program_epilogue_linux.inc"
|
#include "asm/program_epilogue_linux.inc"
|
||||||
|
|
||||||
#define scratchpad_mask and ecx, 2040
|
#define scratchpad_mask and eax, 2040
|
||||||
|
|
||||||
.align 64
|
.align 64
|
||||||
DECL(randomx_program_read_l1):
|
DECL(randomx_program_read_l1):
|
||||||
@ -56,7 +56,7 @@ DECL(randomx_program_read_l1):
|
|||||||
|
|
||||||
#undef scratchpad_mask
|
#undef scratchpad_mask
|
||||||
|
|
||||||
#define scratchpad_mask and ecx, 32760
|
#define scratchpad_mask and eax, 32760
|
||||||
|
|
||||||
.align 64
|
.align 64
|
||||||
DECL(randomx_program_read_l2):
|
DECL(randomx_program_read_l2):
|
||||||
|
@ -42,7 +42,7 @@ randomx_program_epilogue PROC
|
|||||||
randomx_program_epilogue ENDP
|
randomx_program_epilogue ENDP
|
||||||
|
|
||||||
scratchpad_mask MACRO
|
scratchpad_mask MACRO
|
||||||
and ecx, 2040
|
and eax, 2040
|
||||||
ENDM
|
ENDM
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
@ -51,7 +51,7 @@ randomx_program_read_l1 PROC
|
|||||||
randomx_program_read_l1 ENDP
|
randomx_program_read_l1 ENDP
|
||||||
|
|
||||||
scratchpad_mask MACRO
|
scratchpad_mask MACRO
|
||||||
and ecx, 32760
|
and eax, 32760
|
||||||
ENDM
|
ENDM
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
|
@ -175,7 +175,7 @@ namespace RandomX {
|
|||||||
emitByte(0xf0 + (instr.rega % RegistersCount));
|
emitByte(0xf0 + (instr.rega % RegistersCount));
|
||||||
emit(instr.addra);
|
emit(instr.addra);
|
||||||
emit(uint16_t(0x8b41)); //mov
|
emit(uint16_t(0x8b41)); //mov
|
||||||
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
|
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
|
||||||
emit(0x753fc3f6); //test bl,0x3f; jne
|
emit(0x753fc3f6); //test bl,0x3f; jne
|
||||||
emit(uint16_t(0xe805));
|
emit(uint16_t(0xe805));
|
||||||
if (instr.loca & 3) { //A.LOC.W
|
if (instr.loca & 3) { //A.LOC.W
|
||||||
@ -186,9 +186,9 @@ namespace RandomX {
|
|||||||
}
|
}
|
||||||
if ((instr.loca & 192) == 0) { //A.LOC.X
|
if ((instr.loca & 192) == 0) { //A.LOC.X
|
||||||
emit(uint16_t(0x3348));
|
emit(uint16_t(0x3348));
|
||||||
emitByte(0xe9); //xor rbp, rcx
|
emitByte(0xe8); //xor rbp, rax
|
||||||
}
|
}
|
||||||
emit(uint16_t(0xe181)); //and ecx,
|
emitByte(0x25); //and eax,
|
||||||
if (instr.loca & 3) {
|
if (instr.loca & 3) {
|
||||||
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
||||||
}
|
}
|
||||||
@ -199,14 +199,13 @@ namespace RandomX {
|
|||||||
|
|
||||||
void JitCompilerX86::genar(Instruction& instr) {
|
void JitCompilerX86::genar(Instruction& instr) {
|
||||||
gena(instr);
|
gena(instr);
|
||||||
emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8]
|
emit(0xc6048b48); //mov rax,QWORD PTR [rsi+rax*8]
|
||||||
emit(0xdc580f66);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitCompilerX86::genaf(Instruction& instr) {
|
void JitCompilerX86::genaf(Instruction& instr) {
|
||||||
gena(instr);
|
gena(instr);
|
||||||
emitByte(0xf3);
|
emitByte(0xf3);
|
||||||
emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8]
|
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
|
void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
|
||||||
|
@ -1,32 +1,32 @@
|
|||||||
push rcx ;# preserve ecx
|
push rax ;# preserve eax
|
||||||
db 0, 0, 0, 0 ;# TransformAddress placeholder
|
db 0, 0, 0, 0 ;# TransformAddress placeholder
|
||||||
mov rax, qword ptr [rdi] ;# load the dataset address
|
mov rcx, qword ptr [rdi] ;# load the dataset address
|
||||||
xor rbp, rcx ;# modify "mx"
|
xor rbp, rax ;# modify "mx"
|
||||||
;# prefetch cacheline "mx"
|
;# prefetch cacheline "mx"
|
||||||
and rbp, -64 ;# align "mx" to the start of a cache line
|
and rbp, -64 ;# align "mx" to the start of a cache line
|
||||||
mov edx, ebp ;# edx = mx
|
mov edx, ebp ;# edx = mx
|
||||||
prefetchnta byte ptr [rax+rdx]
|
prefetchnta byte ptr [rcx+rdx]
|
||||||
;# read cacheline "ma"
|
;# read cacheline "ma"
|
||||||
ror rbp, 32 ;# swap "ma" and "mx"
|
ror rbp, 32 ;# swap "ma" and "mx"
|
||||||
mov edx, ebp ;# edx = ma
|
mov edx, ebp ;# edx = ma
|
||||||
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
|
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
|
||||||
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
|
lea rax, [rsi+rax*8] ;# scratchpad cache line
|
||||||
lea rax, [rax+rdx] ;# dataset cache line
|
lea rcx, [rcx+rdx] ;# dataset cache line
|
||||||
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
|
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
|
||||||
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
||||||
mov rdx, qword ptr [rax+8]
|
mov rdx, qword ptr [rcx+8]
|
||||||
xor qword ptr [rcx+8], rdx
|
xor qword ptr [rax+8], rdx
|
||||||
mov rdx, qword ptr [rax+16]
|
mov rdx, qword ptr [rcx+16]
|
||||||
xor qword ptr [rcx+16], rdx
|
xor qword ptr [rax+16], rdx
|
||||||
mov rdx, qword ptr [rax+24]
|
mov rdx, qword ptr [rcx+24]
|
||||||
xor qword ptr [rcx+24], rdx
|
xor qword ptr [rax+24], rdx
|
||||||
mov rdx, qword ptr [rax+32]
|
mov rdx, qword ptr [rcx+32]
|
||||||
xor qword ptr [rcx+32], rdx
|
xor qword ptr [rax+32], rdx
|
||||||
mov rdx, qword ptr [rax+40]
|
mov rdx, qword ptr [rcx+40]
|
||||||
xor qword ptr [rcx+40], rdx
|
xor qword ptr [rax+40], rdx
|
||||||
mov rdx, qword ptr [rax+48]
|
mov rdx, qword ptr [rcx+48]
|
||||||
xor qword ptr [rcx+48], rdx
|
xor qword ptr [rax+48], rdx
|
||||||
mov rdx, qword ptr [rax+56]
|
mov rdx, qword ptr [rcx+56]
|
||||||
xor qword ptr [rcx+56], rdx
|
xor qword ptr [rax+56], rdx
|
||||||
pop rcx ;# restore ecx
|
pop rax ;# restore eax
|
||||||
ret
|
ret
|
@ -1,154 +1,154 @@
|
|||||||
;# 90 address transformations
|
;# 90 address transformations
|
||||||
;# forced REX prefix is used to make all transformations 4 bytes long
|
;# forced REX prefix is used to make all transformations 4 bytes long
|
||||||
lea ecx, [rcx+rcx*8+109]
|
lea eax, [rax+rax*8+109]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 96
|
xor eax, 96
|
||||||
lea ecx, [rcx+rcx*8-19]
|
lea eax, [rax+rax*8-19]
|
||||||
db 64
|
db 64
|
||||||
add ecx, -98
|
add eax, -98
|
||||||
db 64
|
db 64
|
||||||
add ecx, -21
|
add eax, -21
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -80
|
xor eax, -80
|
||||||
lea ecx, [rcx+rcx*8-92]
|
lea eax, [rax+rax*8-92]
|
||||||
db 64
|
db 64
|
||||||
add ecx, 113
|
add eax, 113
|
||||||
lea ecx, [rcx+rcx*8+100]
|
lea eax, [rax+rax*8+100]
|
||||||
db 64
|
db 64
|
||||||
add ecx, -39
|
add eax, -39
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 120
|
xor eax, 120
|
||||||
lea ecx, [rcx+rcx*8-119]
|
lea eax, [rax+rax*8-119]
|
||||||
db 64
|
db 64
|
||||||
add ecx, -113
|
add eax, -113
|
||||||
db 64
|
db 64
|
||||||
add ecx, 111
|
add eax, 111
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 104
|
xor eax, 104
|
||||||
lea ecx, [rcx+rcx*8-83]
|
lea eax, [rax+rax*8-83]
|
||||||
lea ecx, [rcx+rcx*8+127]
|
lea eax, [rax+rax*8+127]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -112
|
xor eax, -112
|
||||||
db 64
|
db 64
|
||||||
add ecx, 89
|
add eax, 89
|
||||||
db 64
|
db 64
|
||||||
add ecx, -32
|
add eax, -32
|
||||||
db 64
|
db 64
|
||||||
add ecx, 104
|
add eax, 104
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -120
|
xor eax, -120
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 24
|
xor eax, 24
|
||||||
lea ecx, [rcx+rcx*8+9]
|
lea eax, [rax+rax*8+9]
|
||||||
db 64
|
db 64
|
||||||
add ecx, -31
|
add eax, -31
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -16
|
xor eax, -16
|
||||||
db 64
|
db 64
|
||||||
add ecx, 68
|
add eax, 68
|
||||||
lea ecx, [rcx+rcx*8-110]
|
lea eax, [rax+rax*8-110]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 64
|
xor eax, 64
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -40
|
xor eax, -40
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -8
|
xor eax, -8
|
||||||
db 64
|
db 64
|
||||||
add ecx, -10
|
add eax, -10
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -32
|
xor eax, -32
|
||||||
db 64
|
db 64
|
||||||
add ecx, 14
|
add eax, 14
|
||||||
lea ecx, [rcx+rcx*8-46]
|
lea eax, [rax+rax*8-46]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -104
|
xor eax, -104
|
||||||
lea ecx, [rcx+rcx*8+36]
|
lea eax, [rax+rax*8+36]
|
||||||
db 64
|
db 64
|
||||||
add ecx, 100
|
add eax, 100
|
||||||
lea ecx, [rcx+rcx*8-65]
|
lea eax, [rax+rax*8-65]
|
||||||
lea ecx, [rcx+rcx*8+27]
|
lea eax, [rax+rax*8+27]
|
||||||
lea ecx, [rcx+rcx*8+91]
|
lea eax, [rax+rax*8+91]
|
||||||
db 64
|
db 64
|
||||||
add ecx, -101
|
add eax, -101
|
||||||
db 64
|
db 64
|
||||||
add ecx, -94
|
add eax, -94
|
||||||
lea ecx, [rcx+rcx*8-10]
|
lea eax, [rax+rax*8-10]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 80
|
xor eax, 80
|
||||||
db 64
|
db 64
|
||||||
add ecx, -108
|
add eax, -108
|
||||||
db 64
|
db 64
|
||||||
add ecx, -58
|
add eax, -58
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 48
|
xor eax, 48
|
||||||
lea ecx, [rcx+rcx*8+73]
|
lea eax, [rax+rax*8+73]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -48
|
xor eax, -48
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 32
|
xor eax, 32
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -96
|
xor eax, -96
|
||||||
db 64
|
db 64
|
||||||
add ecx, 118
|
add eax, 118
|
||||||
db 64
|
db 64
|
||||||
add ecx, 91
|
add eax, 91
|
||||||
lea ecx, [rcx+rcx*8+18]
|
lea eax, [rax+rax*8+18]
|
||||||
db 64
|
db 64
|
||||||
add ecx, -11
|
add eax, -11
|
||||||
lea ecx, [rcx+rcx*8+63]
|
lea eax, [rax+rax*8+63]
|
||||||
db 64
|
db 64
|
||||||
add ecx, 114
|
add eax, 114
|
||||||
lea ecx, [rcx+rcx*8+45]
|
lea eax, [rax+rax*8+45]
|
||||||
db 64
|
db 64
|
||||||
add ecx, -67
|
add eax, -67
|
||||||
db 64
|
db 64
|
||||||
add ecx, 53
|
add eax, 53
|
||||||
lea ecx, [rcx+rcx*8-101]
|
lea eax, [rax+rax*8-101]
|
||||||
lea ecx, [rcx+rcx*8-1]
|
lea eax, [rax+rax*8-1]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 16
|
xor eax, 16
|
||||||
lea ecx, [rcx+rcx*8-37]
|
lea eax, [rax+rax*8-37]
|
||||||
lea ecx, [rcx+rcx*8-28]
|
lea eax, [rax+rax*8-28]
|
||||||
lea ecx, [rcx+rcx*8-55]
|
lea eax, [rax+rax*8-55]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -88
|
xor eax, -88
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -72
|
xor eax, -72
|
||||||
db 64
|
db 64
|
||||||
add ecx, 36
|
add eax, 36
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -56
|
xor eax, -56
|
||||||
db 64
|
db 64
|
||||||
add ecx, 116
|
add eax, 116
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 88
|
xor eax, 88
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -128
|
xor eax, -128
|
||||||
db 64
|
db 64
|
||||||
add ecx, 50
|
add eax, 50
|
||||||
db 64
|
db 64
|
||||||
add ecx, 105
|
add eax, 105
|
||||||
db 64
|
db 64
|
||||||
add ecx, -37
|
add eax, -37
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 112
|
xor eax, 112
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 8
|
xor eax, 8
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -24
|
xor eax, -24
|
||||||
lea ecx, [rcx+rcx*8+118]
|
lea eax, [rax+rax*8+118]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 72
|
xor eax, 72
|
||||||
db 64
|
db 64
|
||||||
xor ecx, -64
|
xor eax, -64
|
||||||
db 64
|
db 64
|
||||||
add ecx, 40
|
add eax, 40
|
||||||
lea ecx, [rcx+rcx*8-74]
|
lea eax, [rax+rax*8-74]
|
||||||
lea ecx, [rcx+rcx*8+82]
|
lea eax, [rax+rax*8+82]
|
||||||
lea ecx, [rcx+rcx*8+54]
|
lea eax, [rax+rax*8+54]
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 56
|
xor eax, 56
|
||||||
db 64
|
db 64
|
||||||
xor ecx, 40
|
xor eax, 40
|
||||||
db 64
|
db 64
|
||||||
add ecx, 87
|
add eax, 87
|
@ -222,42 +222,42 @@ TransformAddress MACRO reg32, reg64
|
|||||||
ENDM
|
ENDM
|
||||||
|
|
||||||
ReadMemoryRandom MACRO spmask
|
ReadMemoryRandom MACRO spmask
|
||||||
;# IN ecx = random 32-bit address
|
;# IN eax = random 32-bit address
|
||||||
;# GLOBAL rdi = address of the dataset address
|
;# GLOBAL rdi = address of the dataset address
|
||||||
;# GLOBAL rsi = address of the scratchpad
|
;# GLOBAL rsi = address of the scratchpad
|
||||||
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
|
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
|
||||||
;# MODIFY rcx, rdx
|
;# MODIFY rcx, rdx
|
||||||
push rcx ;# preserve ecx
|
push rax ;# preserve eax
|
||||||
TransformAddress ecx, rcx ;# TransformAddress function
|
TransformAddress eax, rax ;# TransformAddress function
|
||||||
mov rax, qword ptr [rdi] ;# load the dataset address
|
mov rcx, qword ptr [rdi] ;# load the dataset address
|
||||||
xor rbp, rcx ;# modify "mx"
|
xor rbp, rax ;# modify "mx"
|
||||||
; prefetch cacheline "mx"
|
; prefetch cacheline "mx"
|
||||||
and rbp, -64 ;# align "mx" to the start of a cache line
|
and rbp, -64 ;# align "mx" to the start of a cache line
|
||||||
mov edx, ebp ;# edx = mx
|
mov edx, ebp ;# edx = mx
|
||||||
prefetchnta byte ptr [rax+rdx]
|
prefetchnta byte ptr [rcx+rdx]
|
||||||
; read cacheline "ma"
|
; read cacheline "ma"
|
||||||
ror rbp, 32 ;# swap "ma" and "mx"
|
ror rbp, 32 ;# swap "ma" and "mx"
|
||||||
mov edx, ebp ;# edx = ma
|
mov edx, ebp ;# edx = ma
|
||||||
and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
|
and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
|
||||||
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
|
lea rax, [rsi+rax*8] ;# scratchpad cache line
|
||||||
lea rax, [rax+rdx] ;# dataset cache line
|
lea rcx, [rcx+rdx] ;# dataset cache line
|
||||||
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
|
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
|
||||||
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
||||||
mov rdx, qword ptr [rax+8]
|
mov rdx, qword ptr [rcx+8]
|
||||||
xor qword ptr [rcx+8], rdx
|
xor qword ptr [rax+8], rdx
|
||||||
mov rdx, qword ptr [rax+16]
|
mov rdx, qword ptr [rcx+16]
|
||||||
xor qword ptr [rcx+16], rdx
|
xor qword ptr [rax+16], rdx
|
||||||
mov rdx, qword ptr [rax+24]
|
mov rdx, qword ptr [rcx+24]
|
||||||
xor qword ptr [rcx+24], rdx
|
xor qword ptr [rax+24], rdx
|
||||||
mov rdx, qword ptr [rax+32]
|
mov rdx, qword ptr [rcx+32]
|
||||||
xor qword ptr [rcx+32], rdx
|
xor qword ptr [rax+32], rdx
|
||||||
mov rdx, qword ptr [rax+40]
|
mov rdx, qword ptr [rcx+40]
|
||||||
xor qword ptr [rcx+40], rdx
|
xor qword ptr [rax+40], rdx
|
||||||
mov rdx, qword ptr [rax+48]
|
mov rdx, qword ptr [rcx+48]
|
||||||
xor qword ptr [rcx+48], rdx
|
xor qword ptr [rax+48], rdx
|
||||||
mov rdx, qword ptr [rax+56]
|
mov rdx, qword ptr [rcx+56]
|
||||||
xor qword ptr [rcx+56], rdx
|
xor qword ptr [rax+56], rdx
|
||||||
pop rcx ;# restore ecx
|
pop rax ;# restore eax
|
||||||
ret
|
ret
|
||||||
ENDM
|
ENDM
|
||||||
|
|
||||||
|
3372
src/program.inc
3372
src/program.inc
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user