From 870b4138a6bef44cdfce3162799a4f9c501abbec Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 4 Feb 2019 17:49:19 +0100 Subject: [PATCH 1/3] Cryptonight variant 4 aka CryptonightR It introduces random integer math into the main loop. --- src/crypto/chacha.h | 8 +- src/crypto/hash-ops.h | 2 +- src/crypto/hash.h | 8 +- src/crypto/slow-hash.c | 50 ++- src/crypto/variant4_random_math.h | 405 ++++++++++++++++++ .../cryptonote_format_utils.cpp | 4 +- tests/hash/CMakeLists.txt | 2 +- tests/hash/main.cpp | 29 +- tests/hash/tests-slow-4.txt | 10 + 9 files changed, 494 insertions(+), 24 deletions(-) create mode 100644 src/crypto/variant4_random_math.h create mode 100644 tests/hash/tests-slow-4.txt diff --git a/src/crypto/chacha.h b/src/crypto/chacha.h index 1cc1ed88a..a39823e5a 100644 --- a/src/crypto/chacha.h +++ b/src/crypto/chacha.h @@ -73,18 +73,18 @@ namespace crypto { inline void generate_chacha_key(const void *data, size_t size, chacha_key& key, uint64_t kdf_rounds) { static_assert(sizeof(chacha_key) <= sizeof(hash), "Size of hash must be at least that of chacha_key"); epee::mlocked> pwd_hash; - crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 0/*prehashed*/); + crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/); for (uint64_t n = 1; n < kdf_rounds; ++n) - crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/); + crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/); memcpy(&unwrap(unwrap(key)), pwd_hash.data(), sizeof(key)); } inline void generate_chacha_key_prehashed(const void *data, size_t size, chacha_key& key, uint64_t kdf_rounds) { static_assert(sizeof(chacha_key) <= sizeof(hash), "Size of hash must be at least that of chacha_key"); epee::mlocked> pwd_hash; - crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 1/*prehashed*/); + crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 1/*prehashed*/, 0/*height*/); for (uint64_t n = 1; n < kdf_rounds; ++n) - crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/); + crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/); memcpy(&unwrap(unwrap(key)), pwd_hash.data(), sizeof(key)); } diff --git a/src/crypto/hash-ops.h b/src/crypto/hash-ops.h index 0f7cd0c2a..859c810bd 100644 --- a/src/crypto/hash-ops.h +++ b/src/crypto/hash-ops.h @@ -79,7 +79,7 @@ enum { }; void cn_fast_hash(const void *data, size_t length, char *hash); -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed); +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height); void hash_extra_blake(const void *data, size_t length, char *hash); void hash_extra_groestl(const void *data, size_t length, char *hash); diff --git a/src/crypto/hash.h b/src/crypto/hash.h index 450953584..17071923d 100644 --- a/src/crypto/hash.h +++ b/src/crypto/hash.h @@ -71,12 +71,12 @@ namespace crypto { return h; } - inline void cn_slow_hash(const void *data, std::size_t length, hash &hash, int variant = 0) { - cn_slow_hash(data, length, reinterpret_cast(&hash), variant, 0/*prehashed*/); + inline void cn_slow_hash(const void *data, std::size_t length, hash &hash, int variant = 0, uint64_t height = 0) { + cn_slow_hash(data, length, reinterpret_cast(&hash), variant, 0/*prehashed*/, height); } - inline void cn_slow_hash_prehashed(const void *data, std::size_t length, hash &hash, int variant = 0) { - cn_slow_hash(data, length, reinterpret_cast(&hash), variant, 1/*prehashed*/); + inline void cn_slow_hash_prehashed(const void *data, std::size_t length, hash &hash, int variant = 0, uint64_t height = 0) { + cn_slow_hash(data, length, reinterpret_cast(&hash), variant, 1/*prehashed*/, height); } inline void tree_hash(const hash *hashes, std::size_t count, hash &root_hash) { diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c index ceb3a567b..8da79dee0 100644 --- a/src/crypto/slow-hash.c +++ b/src/crypto/slow-hash.c @@ -39,6 +39,7 @@ #include "hash-ops.h" #include "oaes_lib.h" #include "variant2_int_sqrt.h" +#include "variant4_random_math.h" #define MEMORY (1 << 21) // 2MB scratchpad #define ITER (1 << 20) @@ -172,7 +173,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex const uint64_t sqrt_input = SWAP64LE(((uint64_t*)(ptr))[0]) + division_result #define VARIANT2_INTEGER_MATH_SSE2(b, ptr) \ - do if (variant >= 2) \ + do if ((variant == 2) || (variant == 3)) \ { \ VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \ VARIANT2_INTEGER_MATH_SQRT_STEP_SSE2(); \ @@ -182,7 +183,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex #if defined DBL_MANT_DIG && (DBL_MANT_DIG >= 50) // double precision floating point type has enough bits of precision on current platform #define VARIANT2_PORTABLE_INTEGER_MATH(b, ptr) \ - do if (variant >= 2) \ + do if ((variant == 2) || (variant == 3)) \ { \ VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \ VARIANT2_INTEGER_MATH_SQRT_STEP_FP64(); \ @@ -192,7 +193,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex // double precision floating point type is not good enough on current platform // fall back to the reference code (integer only) #define VARIANT2_PORTABLE_INTEGER_MATH(b, ptr) \ - do if (variant >= 2) \ + do if ((variant == 2) || (variant == 3)) \ { \ VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \ VARIANT2_INTEGER_MATH_SQRT_STEP_REF(); \ @@ -214,6 +215,33 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \ } while (0) +#define VARIANT4_RANDOM_MATH_INIT() \ + v4_reg r[8]; \ + struct V4_Instruction code[TOTAL_LATENCY * ALU_COUNT + 1]; \ + do if (variant >= 4) \ + { \ + v4_reg* data = (v4_reg*)(state.hs.w + 12); \ + r[0] = data[0]; \ + r[1] = data[1]; \ + r[2] = data[2]; \ + r[3] = data[3]; \ + v4_random_math_init(code, height); \ + } while (0) + +#define VARIANT4_RANDOM_MATH(a, b, r, _b, _b1) \ + do if (variant >= 4) \ + { \ + if (sizeof(v4_reg) == sizeof(uint32_t)) \ + U64(b)[0] ^= (r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32); \ + else \ + U64(b)[0] ^= (r[0] + r[1]) ^ (r[2] + r[3]); \ + r[4] = ((v4_reg*)(a))[0]; \ + r[5] = ((v4_reg*)(a))[sizeof(uint64_t) / sizeof(v4_reg)]; \ + r[6] = ((v4_reg*)(_b))[0]; \ + r[7] = ((v4_reg*)(_b1))[0]; \ + v4_random_math(code, r); \ + } while (0) + #if !defined NO_AES && (defined(__x86_64__) || (defined(_MSC_VER) && defined(_WIN64))) // Optimised code below, uses x86-specific intrinsics, SSE2, AES-NI @@ -298,6 +326,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex p = U64(&hp_state[j]); \ b[0] = p[0]; b[1] = p[1]; \ VARIANT2_INTEGER_MATH_SSE2(b, c); \ + VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \ __mul(); \ VARIANT2_2(); \ VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \ @@ -694,7 +723,7 @@ void slow_hash_free_state(void) * @param length the length in bytes of the data * @param hash a pointer to a buffer in which the final 256 bit hash will be stored */ -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { RDATA_ALIGN16 uint8_t expandedKey[240]; /* These buffers are aligned to use later with SSE functions */ @@ -730,6 +759,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int VARIANT1_INIT64(); VARIANT2_INIT64(); + VARIANT4_RANDOM_MATH_INIT(); /* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill * the 2MB large random access buffer. @@ -901,6 +931,7 @@ union cn_slow_hash_state p = U64(&hp_state[j]); \ b[0] = p[0]; b[1] = p[1]; \ VARIANT2_PORTABLE_INTEGER_MATH(b, c); \ + VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \ __mul(); \ VARIANT2_2(); \ VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \ @@ -1063,7 +1094,7 @@ STATIC INLINE void aligned_free(void *ptr) } #endif /* FORCE_USE_HEAP */ -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { RDATA_ALIGN16 uint8_t expandedKey[240]; @@ -1100,6 +1131,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int VARIANT1_INIT64(); VARIANT2_INIT64(); + VARIANT4_RANDOM_MATH_INIT(); /* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill * the 2MB large random access buffer. @@ -1278,7 +1310,7 @@ STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b) U64(a)[1] ^= U64(b)[1]; } -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { uint8_t text[INIT_SIZE_BYTE]; uint8_t a[AES_BLOCK_SIZE]; @@ -1317,6 +1349,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int VARIANT1_INIT64(); VARIANT2_INIT64(); + VARIANT4_RANDOM_MATH_INIT(); // use aligned data memcpy(expandedKey, aes_ctx->key->exp_data, aes_ctx->key->exp_data_len); @@ -1353,6 +1386,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int copy_block(c, p); VARIANT2_PORTABLE_INTEGER_MATH(c, c1); + VARIANT4_RANDOM_MATH(a, c, r, b, b + AES_BLOCK_SIZE); mul(c1, c, d); VARIANT2_2_PORTABLE(); VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j); @@ -1476,7 +1510,7 @@ union cn_slow_hash_state { }; #pragma pack(pop) -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) { +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { #ifndef FORCE_USE_HEAP uint8_t long_state[MEMORY]; #else @@ -1505,6 +1539,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int VARIANT1_PORTABLE_INIT(); VARIANT2_PORTABLE_INIT(); + VARIANT4_RANDOM_MATH_INIT(); oaes_key_import_data(aes_ctx, aes_key, AES_KEY_SIZE); for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) { @@ -1537,6 +1572,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int j = e2i(c1, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; copy_block(c2, &long_state[j]); VARIANT2_PORTABLE_INTEGER_MATH(c2, c1); + VARIANT4_RANDOM_MATH(a, c2, r, b, b + AES_BLOCK_SIZE); mul(c1, c2, d); VARIANT2_2_PORTABLE(); VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j); diff --git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h new file mode 100644 index 000000000..def8c4aa5 --- /dev/null +++ b/src/crypto/variant4_random_math.h @@ -0,0 +1,405 @@ +#ifndef VARIANT4_RANDOM_MATH_H +#define VARIANT4_RANDOM_MATH_H + +// Register size can be configured to either 32 bit (uint32_t) or 64 bit (uint64_t) +typedef uint32_t v4_reg; + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS = 60, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, -128 <= C <= 127 + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionCompact is used to generate code from random data +// Every random sequence of bytes is a valid code +// +// There are 8 registers in total: +// - 4 variable registers +// - 4 constant registers initialized from loop variables +// +// This is why dst_index is 2 bits +struct V4_InstructionCompact +{ + uint8_t opcode : 3; + uint8_t dst_index : 2; + uint8_t src_index : 3; +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#ifdef __GNUC__ +#define FORCEINLINE __attribute__((always_inline)) inline +#elif _MSC_VER +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#ifdef __GNUC__ +#define UNREACHABLE_CODE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +static FORCEINLINE void v4_random_math(const struct V4_Instruction* code, v4_reg* r) +{ + enum + { + REG_BITS = sizeof(v4_reg) * 8, + }; + +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << (REG_BITS - shift)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> (REG_BITS - shift)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 28495 + // 61 106077 + // 62 2455855 + // 63 5114930 + // 64 1020868 + // 65 1109026 + // 66 151756 + // 67 8429 + // 68 4477 + // 69 87 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, char* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + hash_extra_blake(data, sizeof(data), data); + *data_index = 0; + } +} + +// Generates as many random math operations as possible with given latency and ALU restrictions +static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + char data[32]; + memset(data, 0, sizeof(data)); + *((uint64_t*)data) = height; + + size_t data_index = sizeof(data); + + int code_size; + do { + int latency[8]; + int asic_latency[8]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R7 are constant and are threatened as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + int inst_data[8] = { 0, 1, 2, 3, -1, -1, -1, -1 }; + + bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + check_data(&data_index, 1, data, sizeof(data)); + + struct V4_InstructionCompact op = ((struct V4_InstructionCompact*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = (op.opcode <= 2) ? MUL : (op.opcode - 2); + if (op.opcode == 5) + { + check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (op.opcode >= 6) + { + opcode = XOR; + } + + const int a = op.dst_index; + int b = op.src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // a is always < 4, so we don't need to check bounds here + b = a + 4; + op.src_index = b; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = op.dst_index; + code[code_size].src_index = op.src_index; + code[code_size].C = 0; + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + code[code_size].C = *((uint32_t*)&data[data_index]); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + code[code_size].opcode = opcode; + code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~99.8% chance that code_size >= NUM_INSTRUCTIONS here, so second iteration is required rarely + } while (code_size < NUM_INSTRUCTIONS); + + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} + +#endif diff --git a/src/cryptonote_basic/cryptonote_format_utils.cpp b/src/cryptonote_basic/cryptonote_format_utils.cpp index 46c5e4085..44bf08d60 100644 --- a/src/cryptonote_basic/cryptonote_format_utils.cpp +++ b/src/cryptonote_basic/cryptonote_format_utils.cpp @@ -1149,8 +1149,8 @@ namespace cryptonote bool get_block_longhash(const block& b, crypto::hash& res, uint64_t height) { blobdata bd = get_block_hashing_blob(b); - const int cn_variant = b.major_version >= 9 ? 2 : 1; - crypto::cn_slow_hash(bd.data(), bd.size(), res, cn_variant); + const int cn_variant = b.major_version >= 11 ? 4 : b.major_version >= 9 && b.major_version <= 10 ? 2 : 1; + crypto::cn_slow_hash(bd.data(), bd.size(), res, cn_variant, height); return true; } //--------------------------------------------------------------- diff --git a/tests/hash/CMakeLists.txt b/tests/hash/CMakeLists.txt index 0624d5d02..a0c78bfdc 100644 --- a/tests/hash/CMakeLists.txt +++ b/tests/hash/CMakeLists.txt @@ -43,7 +43,7 @@ set_property(TARGET hash-tests PROPERTY FOLDER "tests") -foreach (hash IN ITEMS fast slow slow-1 slow-2 tree extra-blake extra-groestl extra-jh extra-skein) +foreach (hash IN ITEMS fast slow slow-1 slow-2 slow-4 tree extra-blake extra-groestl extra-jh extra-skein) add_test( NAME "hash-${hash}" COMMAND hash-tests "${hash}" "${CMAKE_CURRENT_SOURCE_DIR}/tests-${hash}.txt") diff --git a/tests/hash/main.cpp b/tests/hash/main.cpp index 5a141d41c..adf1bd9c4 100644 --- a/tests/hash/main.cpp +++ b/tests/hash/main.cpp @@ -44,6 +44,13 @@ using namespace std; using namespace crypto; typedef crypto::hash chash; +struct V4_Data +{ + const void* data; + size_t length; + uint64_t height; +}; + PUSH_WARNINGS DISABLE_VS_WARNINGS(4297) extern "C" { @@ -54,13 +61,17 @@ extern "C" { tree_hash((const char (*)[crypto::HASH_SIZE]) data, length >> 5, hash); } static void cn_slow_hash_0(const void *data, size_t length, char *hash) { - return cn_slow_hash(data, length, hash, 0/*variant*/, 0/*prehashed*/); + return cn_slow_hash(data, length, hash, 0/*variant*/, 0/*prehashed*/, 0/*height*/); } static void cn_slow_hash_1(const void *data, size_t length, char *hash) { - return cn_slow_hash(data, length, hash, 1/*variant*/, 0/*prehashed*/); + return cn_slow_hash(data, length, hash, 1/*variant*/, 0/*prehashed*/, 0/*height*/); } static void cn_slow_hash_2(const void *data, size_t length, char *hash) { - return cn_slow_hash(data, length, hash, 2/*variant*/, 0/*prehashed*/); + return cn_slow_hash(data, length, hash, 2/*variant*/, 0/*prehashed*/, 0/*height*/); + } + static void cn_slow_hash_4(const void *data, size_t, char *hash) { + const V4_Data* p = reinterpret_cast(data); + return cn_slow_hash(p->data, p->length, hash, 4/*variant*/, 0/*prehashed*/, p->height); } } POP_WARNINGS @@ -72,7 +83,7 @@ struct hash_func { } hashes[] = {{"fast", cn_fast_hash}, {"slow", cn_slow_hash_0}, {"tree", hash_tree}, {"extra-blake", hash_extra_blake}, {"extra-groestl", hash_extra_groestl}, {"extra-jh", hash_extra_jh}, {"extra-skein", hash_extra_skein}, - {"slow-1", cn_slow_hash_1}, {"slow-2", cn_slow_hash_2}}; + {"slow-1", cn_slow_hash_1}, {"slow-2", cn_slow_hash_2}, {"slow-4", cn_slow_hash_4}}; int test_variant2_int_sqrt(); int test_variant2_int_sqrt_ref(); @@ -140,7 +151,15 @@ int main(int argc, char *argv[]) { input.exceptions(ios_base::badbit | ios_base::failbit | ios_base::eofbit); input.clear(input.rdstate()); get(input, data); - f(data.data(), data.size(), (char *) &actual); + if (f == cn_slow_hash_4) { + V4_Data d; + d.data = data.data(); + d.length = data.size(); + get(input, d.height); + f(&d, 0, (char *) &actual); + } else { + f(data.data(), data.size(), (char *) &actual); + } if (expected != actual) { size_t i; cerr << "Hash mismatch on test " << test << endl << "Input: "; diff --git a/tests/hash/tests-slow-4.txt b/tests/hash/tests-slow-4.txt new file mode 100644 index 000000000..4a8d8025d --- /dev/null +++ b/tests/hash/tests-slow-4.txt @@ -0,0 +1,10 @@ +da8b319c5305ee30c4427f6d92fc022eadf64d6ddc190a98f76f340e36501947 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260 +5651baabf26ce6a54e1fd151b6569cba720c353f9fc58af365aaebc55269afe0 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261 +2962bf37adbfea15929b937a208e13975b3cfaa4d45b6c0c9e3cb83382587bf6 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262 +31be4ac4701e3f2687d13623b77842b1f3a7f63ffa8eff1f8779b051fcff4aa7 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263 +842dd9bc384f46700ff5b451b8fd6d123c24b9bd559813e65b40cbc5b6fd4e24 71756973206e6f737472756420657865726369746174696f6e20756c6c616d636f206c61626f726973206e697369 1806264 +470684872a046ac5934f8fa6d14ce16b6dab3068cf5f7c1db4c878deac3319e0 757420616c697175697020657820656120636f6d6d6f646f20636f6e7365717561742e20447569732061757465 1806265 +b3e98b59ed7e114356ab1b437b607bc2420cb650d6993d75b5027cfb341d7e65 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266 +492cd553721b23337b30ef163336dc4411f6331d929be113a465dbabbe3794a6 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267 +9e5406153d8419dbecf083a460e161ad88c3d51bc6dd28df7303ef1b81cf76c9 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268 +f28575078964f99e4f8177373faf6a0b62d056e161289df048c83a4dd017b9fd 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269 From 26ff4205318afaae6727c7f820530d9e5687fb55 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 4 Feb 2019 20:49:37 +0100 Subject: [PATCH 2/3] Variant 4 random code generator: fixed data size for Blake hash --- src/crypto/variant4_random_math.h | 2 +- tests/hash/tests-slow-4.txt | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h index def8c4aa5..40b68bbd1 100644 --- a/src/crypto/variant4_random_math.h +++ b/src/crypto/variant4_random_math.h @@ -170,7 +170,7 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed { if (*data_index + bytes_needed > data_size) { - hash_extra_blake(data, sizeof(data), data); + hash_extra_blake(data, data_size, data); *data_index = 0; } } diff --git a/tests/hash/tests-slow-4.txt b/tests/hash/tests-slow-4.txt index 4a8d8025d..78e72b744 100644 --- a/tests/hash/tests-slow-4.txt +++ b/tests/hash/tests-slow-4.txt @@ -1,10 +1,10 @@ -da8b319c5305ee30c4427f6d92fc022eadf64d6ddc190a98f76f340e36501947 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260 -5651baabf26ce6a54e1fd151b6569cba720c353f9fc58af365aaebc55269afe0 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261 -2962bf37adbfea15929b937a208e13975b3cfaa4d45b6c0c9e3cb83382587bf6 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262 -31be4ac4701e3f2687d13623b77842b1f3a7f63ffa8eff1f8779b051fcff4aa7 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263 -842dd9bc384f46700ff5b451b8fd6d123c24b9bd559813e65b40cbc5b6fd4e24 71756973206e6f737472756420657865726369746174696f6e20756c6c616d636f206c61626f726973206e697369 1806264 -470684872a046ac5934f8fa6d14ce16b6dab3068cf5f7c1db4c878deac3319e0 757420616c697175697020657820656120636f6d6d6f646f20636f6e7365717561742e20447569732061757465 1806265 -b3e98b59ed7e114356ab1b437b607bc2420cb650d6993d75b5027cfb341d7e65 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266 -492cd553721b23337b30ef163336dc4411f6331d929be113a465dbabbe3794a6 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267 -9e5406153d8419dbecf083a460e161ad88c3d51bc6dd28df7303ef1b81cf76c9 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268 -f28575078964f99e4f8177373faf6a0b62d056e161289df048c83a4dd017b9fd 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269 +9d47bf4c41b7e8e727e681715acb47fa1677cdba9ca7bcb05ad8cc8abd5daa66 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260 +0d4a495cb844a3ca8ba4edb8e6bcf829ef1c06d9cdea2b62ca46c2a21b8b0a79 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261 +a1d6d848b5c5915fccd2f64cf216c6b1a02cf7c77bc80d8d4e51b419e88ff0dd 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262 +af3a8544a0221a148c2ac90484b19861e3afca33fe17021efb8ad6496b567915 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263 +313399e0963ae8a99dab8af66d343e097dae0c0feb08dbc43ccdafef5515f413 71756973206e6f737472756420657865726369746174696f6e20756c6c616d636f206c61626f726973206e697369 1806264 +6021c6ef90bff9ae94a7506d623d3a7a86c1756d655f50dd558f716d64622a34 757420616c697175697020657820656120636f6d6d6f646f20636f6e7365717561742e20447569732061757465 1806265 +2b13000535f3db5f9b9b84a65c4351f386cd2cdedebb8c3ad2eab086e6a3fee5 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266 +fc0e1dad8e895749dc90eb690bc1ba059a1cd772afaaf65a106bf9e5e6b80503 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267 +b60b0afe144deff7d903ed2d5545e77ebe66a3c51fee7016eeb8fee9eb630c0f 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268 +64774b27e7d5fec862fc4c0c13ac6bf09123b6f05bb0e4b75c97f379a2b3a679 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269 From 0009b32f08abe347f13a5c1b77190324c1a398ca Mon Sep 17 00:00:00 2001 From: SChernykh Date: Tue, 5 Feb 2019 20:25:29 +0100 Subject: [PATCH 3/3] Rewrote opcode parsing without bitfield struct --- src/crypto/variant4_random_math.h | 35 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h index 40b68bbd1..fc16ef4d8 100644 --- a/src/crypto/variant4_random_math.h +++ b/src/crypto/variant4_random_math.h @@ -33,7 +33,7 @@ enum V4_InstructionList V4_INSTRUCTION_COUNT = RET, }; -// V4_InstructionCompact is used to generate code from random data +// V4_InstructionDefinition is used to generate code from random data // Every random sequence of bytes is a valid code // // There are 8 registers in total: @@ -41,11 +41,11 @@ enum V4_InstructionList // - 4 constant registers initialized from loop variables // // This is why dst_index is 2 bits -struct V4_InstructionCompact +enum V4_InstructionDefinition { - uint8_t opcode : 3; - uint8_t dst_index : 2; - uint8_t src_index : 3; + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS = 2, + V4_SRC_INDEX_BITS = 3, }; struct V4_Instruction @@ -235,33 +235,40 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ { check_data(&data_index, 1, data, sizeof(data)); - struct V4_InstructionCompact op = ((struct V4_InstructionCompact*)data)[data_index++]; + const uint8_t c = ((uint8_t*)data)[data_index++]; // MUL = opcodes 0-2 // ADD = opcode 3 // SUB = opcode 4 // ROR/ROL = opcode 5, shift direction is selected randomly // XOR = opcodes 6-7 - uint8_t opcode = (op.opcode <= 2) ? MUL : (op.opcode - 2); - if (op.opcode == 5) + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) { check_data(&data_index, 1, data, sizeof(data)); opcode = (data[data_index++] >= 0) ? ROR : ROL; } - else if (op.opcode >= 6) + else if (opcode >= 6) { opcode = XOR; } + else + { + opcode = (opcode <= 2) ? MUL : (opcode - 2); + } - const int a = op.dst_index; - int b = op.src_index; + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; // Don't do ADD/SUB/XOR with the same register if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) { // a is always < 4, so we don't need to check bounds here b = a + 4; - op.src_index = b; + src_index = b; } // Don't do rotation with the same destination twice because it's equal to a single rotation @@ -337,8 +344,8 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); code[code_size].opcode = opcode; - code[code_size].dst_index = op.dst_index; - code[code_size].src_index = op.src_index; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; code[code_size].C = 0; if (opcode == ADD)