diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp
index 0932cfe..cf131d1 100644
--- a/src/CompiledVirtualMachine.hpp
+++ b/src/CompiledVirtualMachine.hpp
@@ -19,13 +19,24 @@ along with RandomX. If not, see.
#pragma once
//#define TRACEVM
+#include
#include "VirtualMachine.hpp"
#include "JitCompilerX86.hpp"
+#include "intrinPortable.h"
namespace RandomX {
class CompiledVirtualMachine : public VirtualMachine {
public:
+ void* operator new(size_t size) {
+ void* ptr = _mm_malloc(size, 64);
+ if (ptr == nullptr)
+ throw std::bad_alloc();
+ return ptr;
+ }
+ void operator delete(void* ptr) {
+ _mm_free(ptr);
+ }
CompiledVirtualMachine(bool softAes);
void setDataset(dataset_t ds, bool light = false) override;
void initializeProgram(const void* seed) override;
diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm
index 99941b1..3bc161e 100644
--- a/src/executeProgram-win64.asm
+++ b/src/executeProgram-win64.asm
@@ -223,17 +223,12 @@ ReadMemoryRandom MACRO spmask, float
;# GLOBAL rbp = "ic" number of instructions until the end of the program
;# GLOBAL rbx = address of the dataset address
;# GLOBAL rsi = address of the scratchpad
-;# GLOBAL rdi = "mx" random 32-bit dataset address
+;# GLOBAL rdi = low 32 bits = "mx", high 32 bits = "ma"
;# MODIFY rcx, rdx
-LOCAL L_prefetch, L_read, L_return
- mov eax, ebp
- and al, 63
- jz short L_prefetch ;# "ic" divisible by 64 -> prefetch
- xor edx, edx
- cmp al, 14
- je short L_read ;# "ic" = 14 (mod 64) -> random read
- cmovb edx, ecx ;# "ic" < 14 (mod 64) -> modify random read address
- xor edi, edx
+LOCAL L_prefetch_read, L_return
+ test ebp, 63
+ jz short L_prefetch_read ;# "ic" divisible by 64 -> prefetch + read
+ xor rdi, rcx ;# randomize "mx"
L_return:
and ecx, spmask ;# limit address to the specified scratchpad size
IF float
@@ -242,12 +237,15 @@ ELSE
mov rax, qword ptr [rsi+rcx*8]
ENDIF
ret
-L_prefetch:
+L_prefetch_read:
+ ; prefetch cacheline "mx"
mov rax, qword ptr [rbx] ;# load the dataset address
- and edi, -64 ;# align "mx" to the start of a cache line
- prefetchnta byte ptr [rax+rdi]
- jmp short L_return
-L_read:
+ and rdi, -64 ;# align "mx" to the start of a cache line
+ mov edx, edi ;# edx = mx
+ prefetchnta byte ptr [rax+rdx]
+ ; read cacheline "ma"
+ ror rdi, 32 ;# swap "ma" and "mx"
+ mov edx, edi ;# edx = ma
push rcx
TransformAddress ecx, rcx ;# TransformAddress function
and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
@@ -274,14 +272,13 @@ ReadMemoryRandom 32767, 1
ALIGN 64
rx_read_dataset:
-;# IN rcx = scratchpad index - must be divisible by 8
-;# GLOBAL rbx = address of the dataset address
+;# IN rax = dataset address
+;# IN ecx = scratchpad index - must be divisible by 8
+;# IN edx = dataset index - must be divisible by 64
;# GLOBAL rsi = address of the scratchpad
-;# GLOBAL rdi = "mx" random 32-bit dataset address
;# MODIFY rax, rcx, rdx
- mov rax, qword ptr [rbx] ;# load the dataset address
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
- lea rax, [rax+rdi] ;# dataset cache line
+ lea rax, [rax+rdx] ;# dataset cache line
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8]