diff --git a/src/Cache.cpp b/src/Cache.cpp index 85d481e..60b7755 100644 --- a/src/Cache.cpp +++ b/src/Cache.cpp @@ -17,11 +17,8 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ -// Parts of this file are originally copyright (c) xmr-stak - #include #include "Cache.hpp" -#include "softAes.h" #include "argon2.h" #include "argon2_core.h" @@ -29,52 +26,6 @@ namespace RandomX { static_assert(ArgonMemorySize % (ArgonLanes * ARGON2_SYNC_POINTS) == 0, "ArgonMemorySize - invalid value"); - // This will shift and xor tmp1 into itself as 4 32-bit vals such as - // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1) - static inline __m128i sl_xor(__m128i tmp1) { - __m128i tmp4; - tmp4 = _mm_slli_si128(tmp1, 0x04); - tmp1 = _mm_xor_si128(tmp1, tmp4); - tmp4 = _mm_slli_si128(tmp4, 0x04); - tmp1 = _mm_xor_si128(tmp1, tmp4); - tmp4 = _mm_slli_si128(tmp4, 0x04); - tmp1 = _mm_xor_si128(tmp1, tmp4); - return tmp1; - } - - template - static inline void aesGenKeys(__m128i* xout0, __m128i* xout2) { - __m128i xout1 = soft ? soft_aeskeygenassist(*xout2, rcon) : _mm_aeskeygenassist_si128(*xout2, rcon); - xout1 = _mm_shuffle_epi32(xout1, 0xFF); - *xout0 = sl_xor(*xout0); - *xout0 = _mm_xor_si128(*xout0, xout1); - xout1 = soft ? 
soft_aeskeygenassist(*xout0, 0x00) : _mm_aeskeygenassist_si128(*xout0, 0x00); - xout1 = _mm_shuffle_epi32(xout1, 0xAA); - *xout2 = sl_xor(*xout2); - *xout2 = _mm_xor_si128(*xout2, xout1); - } - - template - static inline void expandAesKeys(const __m128i* seed, __m128i* keys) { - __m128i xout0, xout2; - xout0 = _mm_load_si128(seed); - xout2 = _mm_load_si128(seed + 1); - *keys++ = xout0; - *keys++ = xout2; - aesGenKeys<0x01, soft>(&xout0, &xout2); - *keys++ = xout0; - *keys++ = xout2; - aesGenKeys<0x02, soft>(&xout0, &xout2); - *keys++ = xout0; - *keys++ = xout2; - aesGenKeys<0x04, soft>(&xout0, &xout2); - *keys++ = xout0; - *keys++ = xout2; - aesGenKeys<0x08, soft>(&xout0, &xout2); - *keys++ = xout0; - *keys++ = xout2; - } - void Cache::argonFill(const void* seed, size_t seedSize) { uint32_t memory_blocks, segment_length; argon2_instance_t instance; @@ -128,16 +79,8 @@ namespace RandomX { fill_memory_blocks(&instance); } - template void Cache::initialize(const void* seed, size_t seedSize) { //Argon2d memory fill argonFill(seed, seedSize); - - //AES keys - expandAesKeys((__m128i*)seed, keys.data()); } - - template void Cache::initialize(const void*, size_t); - - template void Cache::initialize(const void*, size_t); } \ No newline at end of file diff --git a/src/Cache.hpp b/src/Cache.hpp index bc3d6ed..927c5e4 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -42,7 +42,7 @@ namespace RandomX { } static void dealloc(Cache* cache, bool largePages) { if (largePages) { - //allocLargePagesMemory(sizeof(Cache)); + freePagedMemory(cache, sizeof(Cache)); } else { _mm_free(cache); @@ -59,18 +59,12 @@ namespace RandomX { _mm_free(ptr); }*/ - template void initialize(const void* seed, size_t seedSize); - const KeysContainer& getKeys() const { - return keys; - } - const uint8_t* getCache() const { return memory; } private: - alignas(16) KeysContainer keys; uint8_t memory[CacheSize]; void argonFill(const void* seed, size_t seedSize); }; diff --git 
a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index f44a391..08d4536 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -350,7 +350,7 @@ namespace RandomX { mem.mx &= CacheLineAlignMask; Cache* cache = mem.ds.cache; uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache->getCache(), (uint8_t*)datasetLine, mem.ma / CacheLineSize, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)datasetLine, mem.ma / CacheLineSize); for (int i = 0; i < RegistersCount; ++i) r[i] ^= datasetLine[i]; std::swap(mem.mx, mem.ma); diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp index f79d03d..d9f62a2 100644 --- a/src/LightClientAsyncWorker.cpp +++ b/src/LightClientAsyncWorker.cpp @@ -57,7 +57,7 @@ namespace RandomX { #endif uint32_t currentBlock = addr / CacheLineSize; if (currentBlock != startBlock || output != currentLine.data()) { - initBlock(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock); } else { sync(); @@ -86,7 +86,7 @@ namespace RandomX { template void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = 0; i < blockCount; ++i) { - initBlock(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i); } } @@ -108,7 +108,7 @@ namespace RandomX { std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl; #endif //getBlocks(output, startBlock, blockCount); - initBlock(cache->getCache(), (uint8_t*)output, startBlock, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)output, startBlock); hasWork = false; #ifdef TRACE std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl; diff --git 
a/src/dataset.cpp b/src/dataset.cpp index 4a6a5e6..a5132fd 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -39,36 +39,36 @@ along with RandomX. If not, see. namespace RandomX { - void initBlock(const uint8_t* cache, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { - uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + void initBlock(const uint8_t* cache, uint8_t* out, uint32_t blockNumber) { + uint64_t c0, c1, c2, c3, c4, c5, c6, c7; - r0 = 4ULL * blockNumber; - r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0; + c0 = 4ULL * blockNumber; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0; constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask; for (auto i = 0; i < DatasetIterations; ++i) { - const uint8_t* mixBlock = cache + (r0 & mask); + const uint8_t* mixBlock = cache + (c0 & mask); PREFETCHNTA(mixBlock); - r0 = squareHash(r0); - r0 ^= load64(mixBlock + 0); - r1 ^= load64(mixBlock + 8); - r2 ^= load64(mixBlock + 16); - r3 ^= load64(mixBlock + 24); - r4 ^= load64(mixBlock + 32); - r5 ^= load64(mixBlock + 40); - r6 ^= load64(mixBlock + 48); - r7 ^= load64(mixBlock + 56); + c0 = squareHash(c0); + c0 ^= load64(mixBlock + 0); + c1 ^= load64(mixBlock + 8); + c2 ^= load64(mixBlock + 16); + c3 ^= load64(mixBlock + 24); + c4 ^= load64(mixBlock + 32); + c5 ^= load64(mixBlock + 40); + c6 ^= load64(mixBlock + 48); + c7 ^= load64(mixBlock + 56); } - store64(out + 0, r0); - store64(out + 8, r1); - store64(out + 16, r2); - store64(out + 24, r3); - store64(out + 32, r4); - store64(out + 40, r5); - store64(out + 48, r6); - store64(out + 56, r7); + store64(out + 0, c0); + store64(out + 8, c1); + store64(out + 16, c2); + store64(out + 24, c3); + store64(out + 32, c4); + store64(out + 40, c5); + store64(out + 48, c6); + store64(out + 56, c7); } void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { @@ -86,7 +86,7 @@ namespace RandomX { memory.mx &= CacheLineAlignMask; //align to cache line Cache* cache = memory.ds.cache; uint64_t datasetLine[CacheLineSize / 
sizeof(uint64_t)]; - initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize); for (int i = 0; i < RegistersCount; ++i) reg[i] ^= datasetLine[i]; std::swap(memory.mx, memory.ma); @@ -119,31 +119,12 @@ namespace RandomX { void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = startBlock; i < startBlock + blockCount; ++i) { - initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i, cache->getKeys()); + initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i); } } - template void datasetInitCache(const void* seed, dataset_t& ds, bool largePages) { ds.cache = new(Cache::alloc(largePages)) Cache(); - ds.cache->initialize(seed, SeedSize); + ds.cache->initialize(seed, SeedSize); } - - template - void datasetInitCache(const void*, dataset_t&, bool); - - template - void datasetInitCache(const void*, dataset_t&, bool); - - template - void aesBench(uint32_t blockCount) { - alignas(16) KeysContainer keys; - alignas(16) uint8_t buffer[CacheLineSize]; - for (uint32_t block = 0; block < blockCount; ++block) { - initBlock(buffer, buffer, 0, keys); - } - } - - template void aesBench(uint32_t blockCount); - template void aesBench(uint32_t blockCount); } diff --git a/src/dataset.hpp b/src/dataset.hpp index 9438173..c01835a 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -20,18 +20,15 @@ along with RandomX. If not, see. 
#pragma once #include -#include #include "intrinPortable.h" #include "common.hpp" namespace RandomX { - using KeysContainer = std::array<__m128i, 10>; - template - void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys); + void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber); - void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys); + void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber); void datasetAlloc(dataset_t& ds, bool largePages); @@ -39,14 +36,10 @@ namespace RandomX { void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile&); - template void datasetInitCache(const void* seed, dataset_t& dataset, bool largePages); void datasetReadLight(addr_t addr, MemoryRegisters& memory, int_reg_t(&reg)[RegistersCount]); void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, int_reg_t(&reg)[RegistersCount]); - - template - void aesBench(uint32_t blockCount); } diff --git a/src/main.cpp b/src/main.cpp index b6efceb..ad6e856 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -256,18 +256,8 @@ int main(int argc, char** argv) { try { Stopwatch sw(true); - if (softAes) { - RandomX::datasetInitCache(seed, dataset, largePages); - } - else { - RandomX::datasetInitCache(seed, dataset, largePages); - } + RandomX::datasetInitCache(seed, dataset, largePages); if (RandomX::trace) { - std::cout << "Keys: " << std::endl; - for (unsigned i = 0; i < dataset.cache->getKeys().size(); ++i) { - outputHex(std::cout, (char*)&dataset.cache->getKeys()[i], sizeof(__m128i)); - } - std::cout << std::endl; std::cout << "Cache: " << std::endl; outputHex(std::cout, (char*)dataset.cache->getCache(), sizeof(__m128i)); std::cout << std::endl; diff --git a/src/virtualMemory.cpp b/src/virtualMemory.cpp index f324e95..17b91da 100644 --- a/src/virtualMemory.cpp +++ b/src/virtualMemory.cpp @@ -109,4 +109,12 @@ void* allocLargePagesMemory(std::size_t bytes) { throw 
std::runtime_error("allocLargePagesMemory - mmap failed"); #endif return mem; -} \ No newline at end of file +} + +void freePagedMemory(void* ptr, std::size_t bytes) { +#ifdef _WIN32 + VirtualFree(ptr, 0, MEM_RELEASE); +#else + munmap(ptr, bytes); +#endif +} diff --git a/src/virtualMemory.hpp b/src/virtualMemory.hpp index c80d33e..239f24c 100644 --- a/src/virtualMemory.hpp +++ b/src/virtualMemory.hpp @@ -22,4 +22,5 @@ along with RandomX. If not, see. #include void* allocExecutableMemory(std::size_t); -void* allocLargePagesMemory(std::size_t); \ No newline at end of file +void* allocLargePagesMemory(std::size_t); +void freePagedMemory(void*, std::size_t);