From 6b344b81fd6878993064c833624e3afccef99201 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 17 Mar 2019 00:57:48 +0100 Subject: [PATCH] initBlock asm version (disabled) --- makefile | 22 +++--- src/asm/initBlock.inc | 155 +++++++++++++++++++++++++++++++++++++++++ src/dataset.cpp | 4 +- src/dataset.hpp | 2 +- src/squareHash.S | 23 ++++++ src/squareHash.asm | 158 +----------------------------------------- 6 files changed, 194 insertions(+), 170 deletions(-) create mode 100644 src/asm/initBlock.inc diff --git a/makefile b/makefile index 159eb2a..de8c830 100644 --- a/makefile +++ b/makefile @@ -53,16 +53,16 @@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blak $(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h blake2/endian.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_ref.c -o $@ -$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h reciprocal.h Program.hpp) | $(OBJDIR) +$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h reciprocal.h Program.hpp configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@ $(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h endian.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/blake2/blake2b.c -o $@ -$(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp common.hpp) | $(OBJDIR) +$(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp common.hpp configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/CompiledVirtualMachine.cpp -o $@ -$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp 
common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp) | $(OBJDIR) +$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@ $(OBJDIR)/reciprocal.o: $(addprefix $(SRCDIR)/,reciprocal.c reciprocal.h) | $(OBJDIR) @@ -71,40 +71,40 @@ $(OBJDIR)/reciprocal.o: $(addprefix $(SRCDIR)/,reciprocal.c reciprocal.h) | $(OB $(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h intrinPortable.h blake2/endian.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/hashAes1Rx4.cpp -o $@ -$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp reciprocal.h virtualMemory.hpp) | $(OBJDIR) +$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp reciprocal.h virtualMemory.hpp configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ $(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR) $(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@ -$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc)) | $(OBJDIR) +$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc initBlock.inc) configuration.h) | $(OBJDIR) $(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@ $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp intrinPortable.h blake2/endian.h common.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o 
$@ -$(OBJDIR)/Instruction.o: $(addprefix $(SRCDIR)/,Instruction.cpp Instruction.hpp instructionWeights.hpp blake2/endian.h common.hpp) | $(OBJDIR) +$(OBJDIR)/Instruction.o: $(addprefix $(SRCDIR)/,Instruction.cpp Instruction.hpp instructionWeights.hpp blake2/endian.h common.hpp configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/Instruction.cpp -o $@ -$(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp instructionWeights.hpp VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h dataset.hpp Cache.hpp virtualMemory.hpp LightClientAsyncWorker.hpp) | $(OBJDIR) +$(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp instructionWeights.hpp VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h dataset.hpp Cache.hpp virtualMemory.hpp LightClientAsyncWorker.hpp configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/InterpretedVirtualMachine.cpp -o $@ $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightClientAsyncWorker.cpp -o $@ -$(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h CompiledVirtualMachine.hpp JitCompilerX86.hpp AssemblyGeneratorX86.hpp dataset.hpp Cache.hpp virtualMemory.hpp hashAes1Rx4.hpp softAes.h) | $(OBJDIR) +$(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h CompiledVirtualMachine.hpp JitCompilerX86.hpp AssemblyGeneratorX86.hpp dataset.hpp Cache.hpp virtualMemory.hpp hashAes1Rx4.hpp softAes.h configuration.h) | 
$(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@ -$(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp) | $(OBJDIR) +$(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/Program.cpp -o $@ -$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp argon2_core.h) | $(OBJDIR) +$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp argon2_core.h configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/Cache.cpp -o $@ $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/softAes.cpp -o $@ -$(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp blake2/endian.h Program.hpp Instruction.hpp hashAes1Rx4.hpp softAes.h intrinPortable.h blake2/blake2.h) | $(OBJDIR) +$(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp blake2/endian.h Program.hpp Instruction.hpp hashAes1Rx4.hpp softAes.h intrinPortable.h blake2/blake2.h configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/VirtualMachine.cpp -o $@ $(OBJDIR)/virtualMemory.o: $(addprefix $(SRCDIR)/,virtualMemory.cpp virtualMemory.hpp) | $(OBJDIR) diff --git a/src/asm/initBlock.inc b/src/asm/initBlock.inc new file mode 100644 index 0000000..61b06b6 --- /dev/null +++ b/src/asm/initBlock.inc @@ -0,0 +1,155 @@ + prefetcht0 byte ptr [rbp] + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +initBlock_loop: + ;# c0 + mov rbx, r8 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r8+r9] + call squareHash + mov r9, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ;# c1 + mov 
rbx, r9 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r9+r10] + call squareHash + mov r10, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ;# c2 + mov rbx, r10 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r10+r11] + call squareHash + mov r11, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ;# c3 + mov rbx, r11 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r11+r12] + call squareHash + mov r12, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ;# c4 + mov rbx, r12 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r12+r13] + call squareHash + mov r13, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ;# c5 + mov rbx, r13 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r13+r14] + call squareHash + mov r14, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ;# c6 + mov rbx, r14 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi 
+ prefetchnta byte ptr [rbx] + lea rcx, [r14+r15] + call squareHash + mov r15, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + ;# c7 + mov rbx, r15 + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rcx, [r15+r8] + call squareHash + mov r8, rax + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] + sub rsi, 1 + jnz initBlock_loop + mov qword ptr [rbp+0], r8 + mov qword ptr [rbp+8], r9 + mov qword ptr [rbp+16], r10 + mov qword ptr [rbp+24], r11 + mov qword ptr [rbp+32], r12 + mov qword ptr [rbp+40], r13 + mov qword ptr [rbp+48], r14 + mov qword ptr [rbp+56], r15 \ No newline at end of file diff --git a/src/dataset.cpp b/src/dataset.cpp index 0a96d86..40e72b1 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -40,7 +40,7 @@ along with RandomX. If not, see. namespace RandomX { -#if !defined(_M_X64) +#if true //RANDOMX_ARGON_GROWTH != 0 || (!defined(_M_X64) && !defined(__x86_64__)) static FORCE_INLINE uint8_t* selectMixBlock(const Cache& cache, uint64_t& currentIndex, uint64_t& nextIndex) { uint8_t* mixBlock; if (RANDOMX_ARGON_GROWTH == 0) { @@ -75,7 +75,7 @@ namespace RandomX { uint8_t* mixBlock; - for (auto i = 0; i < RANDOMX_CACHE_ACCESSES / 8; ++i) { + for (auto i = 0; i < iterations; ++i) { mixBlock = selectMixBlock(cache, c0, c1); mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); diff --git a/src/dataset.hpp b/src/dataset.hpp index de6ac0b..fcc863b 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -25,7 +25,7 @@ along with RandomX. If not, see. 
namespace RandomX { -#if defined(_M_X64) +#if false //RANDOMX_ARGON_GROWTH == 0 && (defined(_M_X64) || defined(__x86_64__)) extern "C" #endif void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations); diff --git a/src/squareHash.S b/src/squareHash.S index 4527cc0..963cc03 100644 --- a/src/squareHash.S +++ b/src/squareHash.S @@ -10,8 +10,31 @@ #define DECL(x) x #endif +#include "configuration.h" + .global DECL(squareHash) +.global DECL(initBlock) DECL(squareHash): mov rcx, rdi #include "asm/squareHash.inc" + +DECL(initBlock): + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov rdi, qword ptr [rdi] + mov rbp, rsi + mov r8, rdx + mov rsi, rcx + #include "asm/initBlock.inc" + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret \ No newline at end of file diff --git a/src/squareHash.asm b/src/squareHash.asm index 8917428..8f591d0 100644 --- a/src/squareHash.asm +++ b/src/squareHash.asm @@ -22,165 +22,11 @@ initBlock PROC push r13 push r14 push r15 - mov rsi, r9 mov rdi, qword ptr [rcx] mov rbp, rdx - prefetcht0 byte ptr [rbp] ; r8 = blockNumber - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 -initBlock_loop: - ; c0 - mov rbx, r8 - and rbx, 4194303 - shl rbx, 6 - add rbx, rdi - prefetchnta byte ptr [rbx] - lea rcx, [r8+r9] - call squareHash - mov r9, rax - xor r8, qword ptr [rbx+0] - xor r9, qword ptr [rbx+8] - xor r10, qword ptr [rbx+16] - xor r11, qword ptr [rbx+24] - xor r12, qword ptr [rbx+32] - xor r13, qword ptr [rbx+40] - xor r14, qword ptr [rbx+48] - xor r15, qword ptr [rbx+56] - ; c1 - mov rbx, r9 - and rbx, 4194303 - shl rbx, 6 - add rbx, rdi - prefetchnta byte ptr [rbx] - lea rcx, [r9+r10] - call squareHash - mov r10, rax - xor r8, qword ptr [rbx+0] - xor r9, qword ptr [rbx+8] - xor r10, qword ptr [rbx+16] - xor r11, qword ptr [rbx+24] - xor r12, qword ptr [rbx+32] - xor r13, qword ptr [rbx+40] - xor r14, qword ptr [rbx+48] - xor r15, qword 
ptr [rbx+56] - ; c2 - mov rbx, r10 - and rbx, 4194303 - shl rbx, 6 - add rbx, rdi - prefetchnta byte ptr [rbx] - lea rcx, [r10+r11] - call squareHash - mov r11, rax - xor r8, qword ptr [rbx+0] - xor r9, qword ptr [rbx+8] - xor r10, qword ptr [rbx+16] - xor r11, qword ptr [rbx+24] - xor r12, qword ptr [rbx+32] - xor r13, qword ptr [rbx+40] - xor r14, qword ptr [rbx+48] - xor r15, qword ptr [rbx+56] - ; c3 - mov rbx, r11 - and rbx, 4194303 - shl rbx, 6 - add rbx, rdi - prefetchnta byte ptr [rbx] - lea rcx, [r11+r12] - call squareHash - mov r12, rax - xor r8, qword ptr [rbx+0] - xor r9, qword ptr [rbx+8] - xor r10, qword ptr [rbx+16] - xor r11, qword ptr [rbx+24] - xor r12, qword ptr [rbx+32] - xor r13, qword ptr [rbx+40] - xor r14, qword ptr [rbx+48] - xor r15, qword ptr [rbx+56] - ; c4 - mov rbx, r12 - and rbx, 4194303 - shl rbx, 6 - add rbx, rdi - prefetchnta byte ptr [rbx] - lea rcx, [r12+r13] - call squareHash - mov r13, rax - xor r8, qword ptr [rbx+0] - xor r9, qword ptr [rbx+8] - xor r10, qword ptr [rbx+16] - xor r11, qword ptr [rbx+24] - xor r12, qword ptr [rbx+32] - xor r13, qword ptr [rbx+40] - xor r14, qword ptr [rbx+48] - xor r15, qword ptr [rbx+56] - ; c5 - mov rbx, r13 - and rbx, 4194303 - shl rbx, 6 - add rbx, rdi - prefetchnta byte ptr [rbx] - lea rcx, [r13+r14] - call squareHash - mov r14, rax - xor r8, qword ptr [rbx+0] - xor r9, qword ptr [rbx+8] - xor r10, qword ptr [rbx+16] - xor r11, qword ptr [rbx+24] - xor r12, qword ptr [rbx+32] - xor r13, qword ptr [rbx+40] - xor r14, qword ptr [rbx+48] - xor r15, qword ptr [rbx+56] - ; c6 - mov rbx, r14 - and rbx, 4194303 - shl rbx, 6 - add rbx, rdi - prefetchnta byte ptr [rbx] - lea rcx, [r14+r15] - call squareHash - mov r15, rax - xor r8, qword ptr [rbx+0] - xor r9, qword ptr [rbx+8] - xor r10, qword ptr [rbx+16] - xor r11, qword ptr [rbx+24] - xor r12, qword ptr [rbx+32] - xor r13, qword ptr [rbx+40] - xor r14, qword ptr [rbx+48] - xor r15, qword ptr [rbx+56] - ; c7 - mov rbx, r15 - and rbx, 4194303 - shl 
rbx, 6 - add rbx, rdi - prefetchnta byte ptr [rbx] - lea rcx, [r15+r8] - call squareHash - mov r8, rax - xor r8, qword ptr [rbx+0] - xor r9, qword ptr [rbx+8] - xor r10, qword ptr [rbx+16] - xor r11, qword ptr [rbx+24] - xor r12, qword ptr [rbx+32] - xor r13, qword ptr [rbx+40] - xor r14, qword ptr [rbx+48] - xor r15, qword ptr [rbx+56] - sub rsi, 1 - jnz initBlock_loop - mov qword ptr [rbp+0], r8 - mov qword ptr [rbp+8], r9 - mov qword ptr [rbp+16], r10 - mov qword ptr [rbp+24], r11 - mov qword ptr [rbp+32], r12 - mov qword ptr [rbp+40], r13 - mov qword ptr [rbp+48], r14 - mov qword ptr [rbp+56], r15 + mov rsi, r9 + include asm/initBlock.inc pop r15 pop r14 pop r13