Combined prefetch + read into a single step

This commit is contained in:
tevador 2019-01-06 21:26:53 +01:00
parent 4189e4ebc6
commit 6519fed4d1
2 changed files with 28 additions and 20 deletions

View File

@ -19,13 +19,24 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once #pragma once
//#define TRACEVM //#define TRACEVM
#include <new>
#include "VirtualMachine.hpp" #include "VirtualMachine.hpp"
#include "JitCompilerX86.hpp" #include "JitCompilerX86.hpp"
#include "intrinPortable.h"
namespace RandomX { namespace RandomX {
class CompiledVirtualMachine : public VirtualMachine { class CompiledVirtualMachine : public VirtualMachine {
public: public:
// Class-specific allocation function: obtains the object's storage from
// _mm_malloc with a requested alignment of 64 bytes instead of the default
// global allocator.
// NOTE(review): 64 presumably matches the cache-line size the compiled VM
// relies on - confirm against the class's aligned members.
// Throws std::bad_alloc when _mm_malloc returns a null pointer.
void* operator new(size_t size) {
	if (void* storage = _mm_malloc(size, 64))
		return storage;
	throw std::bad_alloc();
}
// Class-specific deallocation function paired with the operator new above:
// storage obtained from _mm_malloc must be released with _mm_free rather
// than the default global deallocator.
void operator delete(void* ptr) {
_mm_free(ptr);
}
CompiledVirtualMachine(bool softAes); CompiledVirtualMachine(bool softAes);
void setDataset(dataset_t ds, bool light = false) override; void setDataset(dataset_t ds, bool light = false) override;
void initializeProgram(const void* seed) override; void initializeProgram(const void* seed) override;

View File

@ -223,17 +223,12 @@ ReadMemoryRandom MACRO spmask, float
;# GLOBAL rbp = "ic" number of instructions until the end of the program ;# GLOBAL rbp = "ic" number of instructions until the end of the program
;# GLOBAL rbx = address of the dataset address ;# GLOBAL rbx = address of the dataset address
;# GLOBAL rsi = address of the scratchpad ;# GLOBAL rsi = address of the scratchpad
;# GLOBAL rdi = "mx" random 32-bit dataset address ;# GLOBAL rdi = low 32 bits = "mx", high 32 bits = "ma"
;# MODIFY rcx, rdx ;# MODIFY rcx, rdx
LOCAL L_prefetch, L_read, L_return LOCAL L_prefetch_read, L_return
mov eax, ebp test ebp, 63
and al, 63 jz short L_prefetch_read ;# "ic" divisible by 64 -> prefetch + read
jz short L_prefetch ;# "ic" divisible by 64 -> prefetch xor rdi, rcx ;# randomize "mx"
xor edx, edx
cmp al, 14
je short L_read ;# "ic" = 14 (mod 64) -> random read
cmovb edx, ecx ;# "ic" < 14 (mod 64) -> modify random read address
xor edi, edx
L_return: L_return:
and ecx, spmask ;# limit address to the specified scratchpad size and ecx, spmask ;# limit address to the specified scratchpad size
IF float IF float
@ -242,12 +237,15 @@ ELSE
mov rax, qword ptr [rsi+rcx*8] mov rax, qword ptr [rsi+rcx*8]
ENDIF ENDIF
ret ret
L_prefetch: L_prefetch_read:
; prefetch cacheline "mx"
mov rax, qword ptr [rbx] ;# load the dataset address mov rax, qword ptr [rbx] ;# load the dataset address
and edi, -64 ;# align "mx" to the start of a cache line and rdi, -64 ;# align "mx" to the start of a cache line
prefetchnta byte ptr [rax+rdi] mov edx, edi ;# edx = mx
jmp short L_return prefetchnta byte ptr [rax+rdx]
L_read: ; read cacheline "ma"
ror rdi, 32 ;# swap "ma" and "mx"
mov edx, edi ;# edx = ma
push rcx push rcx
TransformAddress ecx, rcx ;# TransformAddress function TransformAddress ecx, rcx ;# TransformAddress function
and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8 and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
@ -274,14 +272,13 @@ ReadMemoryRandom 32767, 1
ALIGN 64 ALIGN 64
rx_read_dataset: rx_read_dataset:
;# IN rcx = scratchpad index - must be divisible by 8 ;# IN rax = dataset address
;# GLOBAL rbx = address of the dataset address ;# IN ecx = scratchpad index - must be divisible by 8
;# IN edx = dataset index - must be divisible by 64
;# GLOBAL rsi = address of the scratchpad ;# GLOBAL rsi = address of the scratchpad
;# GLOBAL rdi = "mx" random 32-bit dataset address
;# MODIFY rax, rcx, rdx ;# MODIFY rax, rcx, rdx
mov rax, qword ptr [rbx] ;# load the dataset address
lea rcx, [rsi+rcx*8] ;# scratchpad cache line lea rcx, [rsi+rcx*8] ;# scratchpad cache line
lea rax, [rax+rdi] ;# dataset cache line lea rax, [rax+rdx] ;# dataset cache line
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8] mov rdx, qword ptr [rax+8]