From 48d85643de0666731037015da02d5fea89d6fc3d Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 13 Jan 2019 13:47:25 +0100 Subject: [PATCH] Dataset initialization algorithm (AES) --- src/Cache.cpp | 5 -- src/Cache.hpp | 4 +- src/JitCompilerX86.cpp | 2 +- src/JitCompilerX86.hpp | 1 - src/VirtualMachine.cpp | 10 ++-- src/common.hpp | 12 ++--- src/dataset.cpp | 117 ++++++++++++++++------------------------- 7 files changed, 58 insertions(+), 93 deletions(-) diff --git a/src/Cache.cpp b/src/Cache.cpp index eb03f9d..bb1758f 100644 --- a/src/Cache.cpp +++ b/src/Cache.cpp @@ -134,11 +134,6 @@ namespace RandomX { //Argon2d memory fill argonFill(seed, seedSize); - //Circular shift of the cache buffer by 512 bytes - //realized by copying the first 512 bytes to the back - //of the buffer and shifting the start by 512 bytes - memcpy(memory + CacheSize, memory, CacheShift); - //AES keys expandAesKeys((__m128i*)seed, keys.data()); } diff --git a/src/Cache.hpp b/src/Cache.hpp index 7a34ee8..4137b97 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -47,11 +47,11 @@ namespace RandomX { } const uint8_t* getCache() { - return memory + CacheShift; + return memory; } private: alignas(16) KeysContainer keys; - uint8_t memory[CacheSize + CacheShift]; + uint8_t memory[CacheSize]; void argonFill(const void* seed, size_t seedSize); }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 8175485..955d8ba 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -17,7 +17,7 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ -//#define MAGIC_DIVISION +#define MAGIC_DIVISION #include "JitCompilerX86.hpp" #include "Pcg32.hpp" #include diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 0c0c48c..e6a7e6d 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -33,7 +33,6 @@ namespace RandomX { typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); constexpr uint32_t CodeSize = 64 * 1024; - constexpr uint32_t CacheLineSize = 64; struct CallOffset { CallOffset(int32_t p, int32_t i) : pos(p), index(i) {} diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 103d245..6e8cfad 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -56,7 +56,7 @@ namespace RandomX { if (light) { auto lds = mem.ds.lightDataset = new LightClientDataset(); lds->cache = ds.cache; - lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); + //lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); lds->blockNumber = -1; if (lds->block == nullptr) { throw std::bad_alloc(); @@ -78,13 +78,13 @@ namespace RandomX { if (lightClient) { auto cache = mem.ds.lightDataset->cache; if (softAes) { - for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, cache->getKeys()); + for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); } } else { - for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, cache->getKeys()); + for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); } } } diff 
--git a/src/common.hpp b/src/common.hpp index acda52a..3831175 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -34,13 +34,13 @@ namespace RandomX { constexpr int SeedSize = 32; constexpr int ResultSize = 32; - constexpr int CacheBlockSize = 1024; - constexpr int CacheShift = CacheBlockSize / 2; + constexpr int CacheBlockCount = 1024 * 1024; + constexpr int CacheLineSize = 64; constexpr int BlockExpansionRatio = 64; - constexpr uint32_t DatasetBlockSize = BlockExpansionRatio * CacheBlockSize; - constexpr uint32_t DatasetBlockCount = 65536; - constexpr uint32_t CacheSize = DatasetBlockCount * CacheBlockSize; - constexpr uint64_t DatasetSize = (uint64_t)DatasetBlockCount * DatasetBlockSize; + constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; + constexpr int DatasetIterations = 64; + constexpr uint32_t CacheSize = CacheBlockCount * CacheLineSize; + constexpr uint64_t DatasetSize = (uint64_t)CacheSize * BlockExpansionRatio; constexpr int ArgonIterations = 12; constexpr uint32_t ArgonMemorySize = 65536; //KiB diff --git a/src/dataset.cpp b/src/dataset.cpp index 70561c1..d9c7b3f 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -56,59 +56,55 @@ namespace RandomX { return soft ? 
soft_aesdec(in, key) : _mm_aesdec_si128(in, key); } - template +#define AES_ROUND(i) x0 = aesdec(x0, keys[i]); \ + x1 = aesenc(x1, keys[i]); \ + x2 = aesdec(x2, keys[i]); \ + x3 = aesenc(x3, keys[i]) + + template void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { - __m128i xin, xout; + __m128i x0, x1, x2, x3, iv; + //block number 0..67108863 //Initialization vector = block number extended to 128 bits - xout = _mm_cvtsi32_si128(blockNumber); - //Expand + AES - for (uint32_t i = 0; i < DatasetBlockSize / sizeof(__m128i); ++i) { - if ((i % 32) == 0) { - xin = _mm_set_epi64x(*(uint64_t*)(in + i / 4), 0); - xout = _mm_xor_si128(xin, xout); - } - if (enc) { - xout = aesenc(xout, keys[0]); - xout = aesenc(xout, keys[1]); - xout = aesenc(xout, keys[2]); - xout = aesenc(xout, keys[3]); - xout = aesenc(xout, keys[4]); - xout = aesenc(xout, keys[5]); - xout = aesenc(xout, keys[6]); - xout = aesenc(xout, keys[7]); - xout = aesenc(xout, keys[8]); - xout = aesenc(xout, keys[9]); - } - else { - xout = aesdec(xout, keys[0]); - xout = aesdec(xout, keys[1]); - xout = aesdec(xout, keys[2]); - xout = aesdec(xout, keys[3]); - xout = aesdec(xout, keys[4]); - xout = aesdec(xout, keys[5]); - xout = aesdec(xout, keys[6]); - xout = aesdec(xout, keys[7]); - xout = aesdec(xout, keys[8]); - xout = aesdec(xout, keys[9]); - } - _mm_store_si128((__m128i*)(out + i * sizeof(__m128i)), xout); + iv = _mm_cvtsi32_si128(blockNumber); + uint32_t cacheBlockNumber = blockNumber / BlockExpansionRatio; //0..1048575 + __m128i* cacheCacheLine = (__m128i*)(in + cacheBlockNumber * CacheLineSize); + __m128i* datasetCacheLine = (__m128i*)out; + + x0 = _mm_load_si128(cacheCacheLine + 0); + x1 = _mm_load_si128(cacheCacheLine + 1); + x2 = _mm_load_si128(cacheCacheLine + 2); + x3 = _mm_load_si128(cacheCacheLine + 3); + + x0 = _mm_xor_si128(x0, iv); + x1 = _mm_xor_si128(x1, iv); + x2 = _mm_xor_si128(x2, iv); + x3 = _mm_xor_si128(x3, iv); + + for (auto i = 0; i < 
DatasetIterations; ++i) { + AES_ROUND(0); + AES_ROUND(1); + AES_ROUND(2); + AES_ROUND(3); + AES_ROUND(4); + AES_ROUND(5); + AES_ROUND(6); + AES_ROUND(7); + AES_ROUND(8); + AES_ROUND(9); } - //Shuffle - Pcg32 gen(&xout); - shuffle((uint32_t*)out, DatasetBlockSize, gen); + + _mm_store_si128(datasetCacheLine + 0, x0); + _mm_store_si128(datasetCacheLine + 1, x1); + _mm_store_si128(datasetCacheLine + 2, x2); + _mm_store_si128(datasetCacheLine + 3, x3); } template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); + void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); + void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); convertible_t datasetRead(addr_t addr, MemoryRegisters& memory) { convertible_t data; @@ -122,37 +118,12 @@ namespace RandomX { return data; } - template - void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys) { - if (blockNumber % 2 == 1) { - initBlock(cache + blockNumber * CacheBlockSize, block, blockNumber, keys); - } - else { - initBlock(cache + blockNumber * CacheBlockSize, block, blockNumber, keys); - } - } - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - template convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory) { convertible_t data; LightClientDataset* lds = memory.ds.lightDataset; - auto blockNumber = memory.ma / DatasetBlockSize; - if (lds->blockNumber != blockNumber) { - initBlock(lds->cache->getCache(), (uint8_t*)lds->block, blockNumber, lds->cache->getKeys()); - lds->blockNumber = blockNumber; - } - data.u64 = 
*(uint64_t*)(lds->block + (memory.ma % DatasetBlockSize)); - memory.ma += 8; - memory.mx ^= addr; - if ((memory.mx & 0xFFF8) == 0) { - memory.ma = memory.mx & ~7; - } + auto blockNumber = memory.ma / CacheLineSize; + return data; } @@ -179,7 +150,7 @@ namespace RandomX { template void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = startBlock; i < startBlock + blockCount; ++i) { - initBlock(cache->getCache(), ds.dataset + i * DatasetBlockSize, i, cache->getKeys()); + initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i, cache->getKeys()); } }