mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2025-01-03 05:38:54 +00:00
Optimized dataset read (#211)
* Optimized dataset read
  There was a false dependency on readReg2 and readReg3 (caused by the `xor rbp, rax` instruction) when reading a dataset item (see design.md - 4.6.2 Loop execution, steps 5 and 7). This change uses the `ma` register to read the dataset item before the whole `rbp` (`ma` and `mx`) is changed, so superscalar and out-of-order CPUs can start executing the read earlier.
  Results: https://i.imgur.com/Bpeq9mx.png (~1% speedup on modern Intel/AMD CPUs).
* ARMv8: optimized dataset read
  Break the dependency on readReg2 and readReg3.
* Fixed light mode hashing
This commit is contained in:
parent
c12097400b
commit
3c8c7ee097
@ -15,6 +15,7 @@
|
|||||||
mov rsi, rdx ;# uint8_t* scratchpad
|
mov rsi, rdx ;# uint8_t* scratchpad
|
||||||
|
|
||||||
mov rax, rbp
|
mov rax, rbp
|
||||||
|
ror rbp, 32
|
||||||
|
|
||||||
;# zero integer registers
|
;# zero integer registers
|
||||||
xor r8, r8
|
xor r8, r8
|
||||||
|
@ -28,6 +28,7 @@
|
|||||||
mov rbx, r9 ;# loop counter
|
mov rbx, r9 ;# loop counter
|
||||||
|
|
||||||
mov rax, rbp
|
mov rax, rbp
|
||||||
|
ror rbp, 32
|
||||||
|
|
||||||
;# zero integer registers
|
;# zero integer registers
|
||||||
xor r8, r8
|
xor r8, r8
|
||||||
|
@ -1,17 +1,16 @@
|
|||||||
|
mov ecx, ebp ;# ecx = ma
|
||||||
|
and ecx, RANDOMX_DATASET_BASE_MASK
|
||||||
|
xor r8, qword ptr [rdi+rcx]
|
||||||
|
ror rbp, 32 ;# swap "ma" and "mx"
|
||||||
xor rbp, rax ;# modify "mx"
|
xor rbp, rax ;# modify "mx"
|
||||||
mov edx, ebp ;# edx = mx
|
mov edx, ebp ;# edx = mx
|
||||||
and edx, RANDOMX_DATASET_BASE_MASK
|
and edx, RANDOMX_DATASET_BASE_MASK
|
||||||
prefetchnta byte ptr [rdi+rdx]
|
prefetchnta byte ptr [rdi+rdx]
|
||||||
ror rbp, 32 ;# swap "ma" and "mx"
|
xor r9, qword ptr [rdi+rcx+8]
|
||||||
mov edx, ebp ;# edx = ma
|
xor r10, qword ptr [rdi+rcx+16]
|
||||||
and edx, RANDOMX_DATASET_BASE_MASK
|
xor r11, qword ptr [rdi+rcx+24]
|
||||||
lea rcx, [rdi+rdx] ;# dataset cache line
|
xor r12, qword ptr [rdi+rcx+32]
|
||||||
xor r8, qword ptr [rcx+0]
|
xor r13, qword ptr [rdi+rcx+40]
|
||||||
xor r9, qword ptr [rcx+8]
|
xor r14, qword ptr [rdi+rcx+48]
|
||||||
xor r10, qword ptr [rcx+16]
|
xor r15, qword ptr [rdi+rcx+56]
|
||||||
xor r11, qword ptr [rcx+24]
|
|
||||||
xor r12, qword ptr [rcx+32]
|
|
||||||
xor r13, qword ptr [rcx+40]
|
|
||||||
xor r14, qword ptr [rcx+48]
|
|
||||||
xor r15, qword ptr [rcx+56]
|
|
||||||
|
|
@ -8,10 +8,10 @@
|
|||||||
mov qword ptr [rsp+16], r13
|
mov qword ptr [rsp+16], r13
|
||||||
mov qword ptr [rsp+8], r14
|
mov qword ptr [rsp+8], r14
|
||||||
mov qword ptr [rsp+0], r15
|
mov qword ptr [rsp+0], r15
|
||||||
xor rbp, rax ;# modify "mx"
|
|
||||||
ror rbp, 32 ;# swap "ma" and "mx"
|
ror rbp, 32 ;# swap "ma" and "mx"
|
||||||
mov ebx, ebp ;# ecx = ma
|
xor rbp, rax ;# modify "mx"
|
||||||
and ebx, RANDOMX_DATASET_BASE_MASK
|
mov rbx, rbp ;# ebx = ma
|
||||||
shr ebx, 6 ;# ebx = Dataset block number
|
shr rbx, 38
|
||||||
|
and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
|
||||||
;# add ebx, datasetOffset / 64
|
;# add ebx, datasetOffset / 64
|
||||||
;# call 32768
|
;# call 32768
|
@ -307,6 +307,9 @@ literal_v14: .fill 2,8,0
|
|||||||
literal_v15: .fill 2,8,0
|
literal_v15: .fill 2,8,0
|
||||||
|
|
||||||
DECL(randomx_program_aarch64_vm_instructions_end):
|
DECL(randomx_program_aarch64_vm_instructions_end):
|
||||||
|
# Calculate dataset pointer for dataset read
|
||||||
|
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
|
||||||
|
lsr x10, x9, 32
|
||||||
|
|
||||||
# mx ^= r[readReg2] ^ r[readReg3];
|
# mx ^= r[readReg2] ^ r[readReg3];
|
||||||
eor x9, x9, x18
|
eor x9, x9, x18
|
||||||
@ -324,8 +327,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
|
|||||||
# mx <-> ma
|
# mx <-> ma
|
||||||
ror x9, x9, 32
|
ror x9, x9, 32
|
||||||
|
|
||||||
# Calculate dataset pointer for dataset read
|
|
||||||
mov w10, w9
|
|
||||||
DECL(randomx_program_aarch64_cacheline_align_mask2):
|
DECL(randomx_program_aarch64_cacheline_align_mask2):
|
||||||
# Actual mask will be inserted by JIT compiler
|
# Actual mask will be inserted by JIT compiler
|
||||||
and x10, x10, 1
|
and x10, x10, 1
|
||||||
|
Loading…
Reference in New Issue
Block a user