From 4fb168e249d22e565ea9dadd02c2c6b7dda93736 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 18 Jan 2019 17:57:47 +0100 Subject: [PATCH] Large page support for cache Bug fixes --- makefile | 13 ++++++++++++- src/Cache.hpp | 24 ++++++++++++++++++++++-- src/JitCompilerX86-static.asm | 4 ++++ src/JitCompilerX86.cpp | 4 ++++ src/LightClientAsyncWorker.cpp | 22 +++++++++++++++++++--- src/LightClientAsyncWorker.hpp | 10 +++++++++- src/Stopwatch.hpp | 4 ++-- src/common.hpp | 21 +++++++++++---------- src/dataset.cpp | 22 +++++++++++++++++----- src/dataset.hpp | 5 ++++- src/executeProgram-win64.asm | 4 ++++ src/main.cpp | 29 ++++++++++++++++++++++------- src/virtualMemory.cpp | 6 +++++- 13 files changed, 135 insertions(+), 33 deletions(-) diff --git a/makefile b/makefile index d0a969c..0dcd7de 100644 --- a/makefile +++ b/makefile @@ -11,7 +11,7 @@ SRCDIR=src OBJDIR=obj LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o) ifeq ($(PLATFORM),x86_64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o endif @@ -27,6 +27,11 @@ debug: CCFLAGS += -g debug: LDFLAGS += -g debug: $(BINDIR)/randomx +profile: CXXFLAGS += -pg +profile: CCFLAGS += -pg +profile: LDFLAGS += -pg +profile: $(BINDIR)/randomx + test: CXXFLAGS += -O0 test: $(BINDIR)/AluFpuTest @@ -38,6 +43,9 @@ $(BINDIR)/AluFpuTest: $(TOBJS) | $(BINDIR) $(OBJDIR)/TestAluFpu.o: $(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp Pcg32.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/TestAluFpu.cpp -o $@ + +$(OBJDIR)/AddressTransform.o: $(addprefix $(SRCDIR)/,AddressTransform.cpp InterpretedVirtualMachine.hpp common.hpp) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/AddressTransform.cpp -o $@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blake2/blake2.h blake2/blake2-impl.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_core.c -o $@ @@ -74,6 +82,9 @@ $(OBJDIR)/Instruction.o: $(addprefix $(SRCDIR)/,Instruction.cpp Instruction.hpp $(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp Pcg32.hpp instructions.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/InterpretedVirtualMachine.cpp -o $@ + +$(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightClientAsyncWorker.cpp -o $@ $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@ diff --git a/src/Cache.hpp b/src/Cache.hpp index 8a2b93a..bc3d6ed 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -23,12 +23,32 @@ along with RandomX. If not, see. #include #include "common.hpp" #include "dataset.hpp" +#include "virtualMemory.hpp" namespace RandomX { class Cache { public: - void* operator new(size_t size) { + static void* alloc(bool largePages) { + if (largePages) { + return allocLargePagesMemory(sizeof(Cache)); + } + else { + void* ptr = _mm_malloc(sizeof(Cache), sizeof(__m128i)); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + } + static void dealloc(Cache* cache, bool largePages) { + if (largePages) { + //allocLargePagesMemory(sizeof(Cache)); + } + else { + _mm_free(cache); + } + } + /*void* operator new(size_t size) { void* ptr = _mm_malloc(size, sizeof(__m128i)); if (ptr == nullptr) throw std::bad_alloc(); @@ -37,7 +57,7 @@ namespace RandomX { void operator delete(void* ptr) { _mm_free(ptr); - } + }*/ template void initialize(const void* seed, size_t seedSize); diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index cbbf658..031c2e4 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -15,6 +15,8 @@ ;# You should have received a copy of the GNU General Public License ;# along with RandomX. If not, see. +IFDEF RAX + _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue @@ -57,4 +59,6 @@ randomx_program_transform ENDP _RANDOMX_JITX86_STATIC ENDS +ENDIF + END \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index b41f7b5..f828d0a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -46,6 +46,10 @@ namespace RandomX { void JitCompilerX86::generateProgram(Pcg32& gen) { } + + size_t JitCompilerX86::getCodeSize() { + return 0; + } #else /* diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp index c069f3f..32aa508 100644 --- a/src/LightClientAsyncWorker.cpp +++ b/src/LightClientAsyncWorker.cpp @@ -24,12 +24,19 @@ along with RandomX. If not, see. namespace RandomX { template - LightClientAsyncWorker::LightClientAsyncWorker(const Cache* c) : ILightClientAsyncWorker(c), output(nullptr), hasWork(false), workerThread(&LightClientAsyncWorker::runWorker, this) { + LightClientAsyncWorker::LightClientAsyncWorker(const Cache* c) : ILightClientAsyncWorker(c), output(nullptr), hasWork(false), +#ifdef TRACE + sw(true), +#endif + workerThread(&LightClientAsyncWorker::runWorker, this) { } template void LightClientAsyncWorker::prepareBlock(addr_t addr) { +#ifdef TRACE + std::cout << sw.getElapsed() << ": prepareBlock-enter " << addr << std::endl; +#endif { std::lock_guard lk(mutex); startBlock = addr / CacheLineSize; @@ -37,6 +44,9 @@ namespace RandomX { output = currentLine.data(); hasWork = true; } +#ifdef TRACE + std::cout << sw.getElapsed() << ": prepareBlock-notify " << startBlock << "/" << blockCount << std::endl; +#endif notifier.notify_all(); } @@ -54,10 +64,13 @@ namespace RandomX { template void LightClientAsyncWorker::prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { +#ifdef TRACE + std::cout << sw.getElapsed() << ": prepareBlocks-enter " << startBlock << "/" << blockCount << std::endl; +#endif { std::lock_guard lk(mutex); - startBlock = startBlock; - blockCount = blockCount; + this->startBlock = startBlock; + this->blockCount = blockCount; output = out; hasWork = true; } @@ -79,6 +92,9 @@ namespace RandomX { template void LightClientAsyncWorker::runWorker() { +#ifdef TRACE + std::cout << sw.getElapsed() << ": runWorker-enter " << std::endl; +#endif for (;;) { std::unique_lock lk(mutex); notifier.wait(lk, [this] { return hasWork; }); diff --git a/src/LightClientAsyncWorker.hpp b/src/LightClientAsyncWorker.hpp index 7596fd5..29571e5 100644 --- a/src/LightClientAsyncWorker.hpp +++ b/src/LightClientAsyncWorker.hpp @@ -17,12 +17,17 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ +//#define TRACE #include "common.hpp" #include #include #include #include +#ifdef TRACE +#include "Stopwatch.hpp" +#include +#endif namespace RandomX { @@ -43,10 +48,13 @@ namespace RandomX { void runWorker(); std::condition_variable notifier; std::mutex mutex; - DatasetLine currentLine; + alignas(16) DatasetLine currentLine; void* output; uint32_t startBlock, blockCount; bool hasWork; +#ifdef TRACE + Stopwatch sw; +#endif std::thread workerThread; }; } \ No newline at end of file diff --git a/src/Stopwatch.hpp b/src/Stopwatch.hpp index 4f3a5a1..931bc02 100644 --- a/src/Stopwatch.hpp +++ b/src/Stopwatch.hpp @@ -53,7 +53,7 @@ public: isRunning = false; } } - double getElapsed() { + double getElapsed() const { return getElapsedNanosec() / 1e+9; } private: @@ -63,7 +63,7 @@ private: uint64_t elapsed; bool isRunning; - uint64_t getElapsedNanosec() { + uint64_t getElapsedNanosec() const { uint64_t elns = elapsed; if (isRunning) { chrono_t endMark = std::chrono::high_resolution_clock::now(); diff --git a/src/common.hpp b/src/common.hpp index 62fae70..fea337f 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -34,20 +34,21 @@ namespace RandomX { constexpr int SeedSize = 32; constexpr int ResultSize = 32; - constexpr int CacheBlockCount = 1024 * 1024; - constexpr int CacheLineSize = 64; - constexpr int BlockExpansionRatio = 64; - constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 16; - constexpr uint32_t CacheSize = CacheBlockCount * CacheLineSize; - constexpr uint64_t DatasetSize = (uint64_t)CacheSize * BlockExpansionRatio; - - constexpr int ArgonIterations = 12; - constexpr uint32_t ArgonMemorySize = 65536; //KiB + constexpr int ArgonIterations = 6; + constexpr uint32_t ArgonMemorySize = 131072; //KiB constexpr int ArgonLanes = 1; const char ArgonSalt[] = "Monero\x1A$"; constexpr int ArgonSaltSize = sizeof(ArgonSalt) - 1; + constexpr int CacheLineSize = 64; + constexpr uint64_t DatasetSize = 4ULL * 1024 * 1024 * 1024; //4 GiB + constexpr uint32_t CacheSize = ArgonMemorySize * 1024; + constexpr int CacheBlockCount = CacheSize / CacheLineSize; + constexpr int BlockExpansionRatio = DatasetSize / CacheSize; + constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; + constexpr int DatasetIterations = 32; + + #ifdef TRACE constexpr bool trace = true; #else diff --git a/src/dataset.cpp b/src/dataset.cpp index ae31963..2ef6e7f 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -67,7 +67,7 @@ namespace RandomX { //block number 0..67108863 //Initialization vector = block number extended to 128 bits iv = _mm_cvtsi32_si128(blockNumber); - uint32_t cacheBlockNumber = blockNumber / BlockExpansionRatio; //0..1048575 + uint32_t cacheBlockNumber = blockNumber / BlockExpansionRatio; //0..2097151 __m128i* cacheCacheLine = (__m128i*)(in + cacheBlockNumber * CacheLineSize); __m128i* datasetCacheLine = (__m128i*)out; @@ -173,14 +173,26 @@ namespace RandomX { void datasetInit(Cache*, dataset_t, uint32_t, uint32_t); template - void datasetInitCache(const void* seed, dataset_t& ds) { - ds.cache = new Cache(); + void datasetInitCache(const void* seed, dataset_t& ds, bool largePages) { + ds.cache = new(Cache::alloc(largePages)) Cache(); ds.cache->initialize(seed, SeedSize); } template - void datasetInitCache(const void*, dataset_t&); + void datasetInitCache(const void*, dataset_t&, bool); template - void datasetInitCache(const void*, dataset_t&); + void datasetInitCache(const void*, dataset_t&, bool); + + template + void aesBench(uint32_t blockCount) { + alignas(16) KeysContainer keys; + alignas(16) uint8_t buffer[CacheLineSize]; + for (uint32_t block = 0; block < blockCount; ++block) { + initBlock(buffer, buffer, 0, keys); + } + } + + template void aesBench(uint32_t blockCount); + template void aesBench(uint32_t blockCount); } diff --git a/src/dataset.hpp b/src/dataset.hpp index 0103271..bdd34d3 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -43,11 +43,14 @@ namespace RandomX { void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile&); template - void datasetInitCache(const void* seed, dataset_t& dataset); + void datasetInitCache(const void* seed, dataset_t& dataset, bool largePages); template void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile&); void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); + + template + void aesBench(uint32_t blockCount); } diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 2cc98fb..2da88b5 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -15,6 +15,8 @@ ;# You should have received a copy of the GNU General Public License ;# along with RandomX. If not, see. +IFDEF RAX + _RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE PUBLIC executeProgram @@ -252,4 +254,6 @@ executeProgram ENDP _RANDOMX_EXECUTE_PROGRAM ENDS +ENDIF + END diff --git a/src/main.cpp b/src/main.cpp index 3295500..db3850e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -162,7 +162,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash } int main(int argc, char** argv) { - bool softAes, lightClient, genAsm, compiled, help, largePages, async; + bool softAes, lightClient, genAsm, compiled, help, largePages, async, aesBench; int programCount, threadCount; readOption("--help", argc, argv, help); @@ -179,29 +179,44 @@ int main(int argc, char** argv) { readIntOption("--nonces", argc, argv, programCount, 1000); readOption("--largePages", argc, argv, largePages); readOption("--async", argc, argv, async); + readOption("--aesBench", argc, argv, aesBench); if (genAsm) { generateAsm(programCount); return 0; } + if (softAes) + std::cout << "Using software AES." << std::endl; + + if(aesBench) { + programCount *= 10; + Stopwatch sw(true); + if (softAes) { + RandomX::aesBench(programCount); + } + else { + RandomX::aesBench(programCount); + } + sw.stop(); + std::cout << "AES performance: " << programCount / sw.getElapsed() << " blocks/s" << std::endl; + return 0; + } + std::atomic atomicNonce(0); AtomicHash result; std::vector vms; std::vector threads; RandomX::dataset_t dataset; - if (softAes) - std::cout << "Using software AES." << std::endl; std::cout << "Initializing..." << std::endl; - try { Stopwatch sw(true); if (softAes) { - RandomX::datasetInitCache(seed, dataset); + RandomX::datasetInitCache(seed, dataset, largePages); } else { - RandomX::datasetInitCache(seed, dataset); + RandomX::datasetInitCache(seed, dataset, largePages); } if (RandomX::trace) { std::cout << "Keys: " << std::endl; @@ -243,7 +258,7 @@ int main(int argc, char** argv) { RandomX::datasetInit(cache, dataset, 0, RandomX::DatasetBlockCount); } } - delete cache; + RandomX::Cache::dealloc(cache, largePages); threads.clear(); std::cout << "Dataset (4 GiB) initialized in " << sw.getElapsed() << " s" << std::endl; } diff --git a/src/virtualMemory.cpp b/src/virtualMemory.cpp index e6e44fc..f324e95 100644 --- a/src/virtualMemory.cpp +++ b/src/virtualMemory.cpp @@ -88,11 +88,15 @@ void* allocExecutableMemory(std::size_t bytes) { return mem; } +constexpr std::size_t align(std::size_t pos, uint32_t align) { + return ((pos - 1) / align + 1) * align; +} + void* allocLargePagesMemory(std::size_t bytes) { void* mem; #ifdef _WIN32 setPrivilege("SeLockMemoryPrivilege", 1); - mem = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); + mem = VirtualAlloc(NULL, align(bytes, 2 * 1024 * 1024), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); if (mem == nullptr) throw std::runtime_error(getErrorMessage("allocLargePagesMemory - VirtualAlloc")); #else