diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index dc8fa4e..eaf5efe 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -22,43 +22,29 @@ along with RandomX. If not, see. #include "Program.hpp" #include "blake2/endian.h"; #include +#include namespace RandomX { - - namespace LightInstruction { - constexpr int IADD_R = 0; - constexpr int IADD_RC = 1; - constexpr int ISUB_R = 2; - constexpr int IMUL_9C = 3; - constexpr int IMUL_R = 4; - constexpr int IMULH_R = 5; - constexpr int ISMULH_R = 6; - constexpr int IMUL_RCP = 7; - constexpr int IXOR_R = 8; - constexpr int IROR_R = 9; - constexpr int COND_R = 10; - constexpr int COUNT = 11; + // Intel Ivy Bridge reference + namespace LightInstructionType { //uOPs (decode) execution ports latency code size + constexpr int IADD_R = 0; //1 p015 1 3 + constexpr int IADD_C = 1; //1 p015 1 7 + constexpr int IADD_RC = 2; //1 p1 3 8 + constexpr int ISUB_R = 3; //1 p015 1 3 + constexpr int IMUL_9C = 4; //1 p1 3 8 + constexpr int IMUL_R = 5; //1 p1 3 4 + constexpr int IMUL_C = 6; //1 p1 3 7 + constexpr int IMULH_R = 7; //1+2+1 0+(p1,p5)+0 3 3+3+3 + constexpr int ISMULH_R = 8; //1+2+1 0+(p1,p5)+0 3 3+3+3 + constexpr int IMUL_RCP = 9; //1+1 p015+p1 4 10+4 + constexpr int IXOR_R = 10; //1 p015 1 3 + constexpr int IXOR_C = 11; //1 p015 1 7 + constexpr int IROR_R = 12; //1+2 0+(p0,p5) 1 3+3 + constexpr int IROR_C = 13; //1 p05 1 4 + constexpr int COND_R = 14; //1+1+1+1+1+1 p015+p5+0+p015+p05+p015 3 7+13+3+7+3+3 + constexpr int COUNT = 15; } - const int lightInstruction[] = { - LightInstruction::IADD_RC, - LightInstruction::IADD_RC, - LightInstruction::ISUB_R, - LightInstruction::ISUB_R, - LightInstruction::IMUL_9C, - LightInstruction::IMUL_R, - LightInstruction::IMUL_R, - LightInstruction::IMUL_R, - LightInstruction::IMULH_R, - LightInstruction::ISMULH_R, - LightInstruction::IMUL_RCP, - LightInstruction::IXOR_R, - LightInstruction::IXOR_R, - LightInstruction::IROR_R, - LightInstruction::IROR_R, - LightInstruction::COND_R - }; - namespace LightInstructionOpcode { constexpr int IADD_R = 0; constexpr int IADD_RC = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M; @@ -67,26 +53,605 @@ namespace RandomX { constexpr int IMUL_R = IMUL_9C + RANDOMX_FREQ_IMUL_9C; constexpr int IMULH_R = IMUL_R + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M; constexpr int ISMULH_R = IMULH_R + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M; - constexpr int IMUL_RCP = ISMULH_R + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M;; + constexpr int IMUL_RCP = ISMULH_R + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M; constexpr int IXOR_R = IMUL_RCP + RANDOMX_FREQ_IMUL_RCP + RANDOMX_FREQ_INEG_R; constexpr int IROR_R = IXOR_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M; constexpr int COND_R = IROR_R + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R; } const int lightInstructionOpcode[] = { + LightInstructionOpcode::IADD_R, LightInstructionOpcode::IADD_R, LightInstructionOpcode::IADD_RC, LightInstructionOpcode::ISUB_R, LightInstructionOpcode::IMUL_9C, LightInstructionOpcode::IMUL_R, + LightInstructionOpcode::IMUL_R, LightInstructionOpcode::IMULH_R, LightInstructionOpcode::ISMULH_R, LightInstructionOpcode::IMUL_RCP, LightInstructionOpcode::IXOR_R, + LightInstructionOpcode::IXOR_R, + LightInstructionOpcode::IROR_R, LightInstructionOpcode::IROR_R, LightInstructionOpcode::COND_R }; + const int lightInstruction[] = { + LightInstructionType::IADD_R, + LightInstructionType::IADD_C, + LightInstructionType::IADD_RC, + LightInstructionType::ISUB_R, + LightInstructionType::IMUL_9C, + LightInstructionType::IMUL_R, + LightInstructionType::IMUL_R, + LightInstructionType::IMUL_C, + LightInstructionType::IMULH_R, + LightInstructionType::ISMULH_R, + LightInstructionType::IMUL_RCP, + LightInstructionType::IXOR_R, + LightInstructionType::IXOR_C, + LightInstructionType::IROR_R, + LightInstructionType::IROR_C, + LightInstructionType::COND_R + }; + + namespace ExecutionPort { + using type = int; + constexpr type Null = 0; + constexpr type P0 = 1; + constexpr type P1 = 2; + constexpr type P5 = 4; + constexpr type P05 = 6; + constexpr type P015 = 7; + } + + class Blake2Generator { + public: + Blake2Generator(const void* seed) : dataIndex(sizeof(data)) { + memset(data, 0, sizeof(data)); + memcpy(data, seed, SeedSize); + data[60] = 39; + } + + uint8_t getByte() { + checkData(1); + return data[dataIndex++]; + } + + uint32_t getInt32() { + checkData(4); + auto ret = load32(&data[dataIndex]); + dataIndex += 4; + return ret; + } + + private: + uint8_t data[64]; + size_t dataIndex; + + void checkData(const size_t bytesNeeded) { + if (dataIndex + bytesNeeded > sizeof(data)) { + blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + dataIndex = 0; + } + } + }; + + class MacroOp { + public: + MacroOp(const char* name, int size) + : name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {} + MacroOp(const char* name, int size, int latency, ExecutionPort::type uop) + : name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {} + MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2) + : name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {} + const char* getName() const { + return name_; + } + int getSize() const { + return size_; + } + int getLatency() const { + return latency_; + } + ExecutionPort::type getUop1() const { + return uop1_; + } + ExecutionPort::type getUop2() const { + return uop2_; + } + bool isSimple() const { + return uop2_ == ExecutionPort::Null; + } + bool isEliminated() const { + return uop1_ == ExecutionPort::Null; + } + static const MacroOp Add_rr; + static const MacroOp Add_ri; + static const MacroOp Lea_sib; + static const MacroOp Sub_rr; + static const MacroOp Imul_rr; + static const MacroOp Imul_rri; + static const MacroOp Imul_r; + static const MacroOp Mul_r; + static const MacroOp Mov_rr; + static const MacroOp Mov_ri64; + static const MacroOp Xor_rr; + static const MacroOp Xor_ri; + static const MacroOp Ror_rcl; + static const MacroOp Ror_ri; + static const MacroOp TestJmp_fused; + static const MacroOp Xor_self; + static const MacroOp Cmp_ri; + static const MacroOp Setcc_r; + private: + const char* name_; + int size_; + int latency_; + ExecutionPort::type uop1_; + ExecutionPort::type uop2_; + }; + + const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Lea_sib = MacroOp("lea r,m", 8, 3, ExecutionPort::P1); + const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); + const MacroOp MacroOp::Imul_rri = MacroOp("imul r,r,i", 7, 3, ExecutionPort::P1); + const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3); + const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5); + const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05); + const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); + const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); + const MacroOp MacroOp::TestJmp_fused = MacroOp("testjmp r,i", 13, 0, ExecutionPort::P5); + + template + T* begin(T(&arr)[N]) { return &arr[0]; } + template + T* end(T(&arr)[N]) { return &arr[0] + N; } + + const MacroOp* IMULH_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Mul_r, &MacroOp::Mov_rr }; + const MacroOp* ISMULH_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Imul_r, &MacroOp::Mov_rr }; + const MacroOp* IMUL_RCP_ops_array[] = { &MacroOp::Mov_ri64, &MacroOp::Imul_rr }; + const MacroOp* IROR_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Ror_rcl }; + const MacroOp* COND_R_ops_array[] = { &MacroOp::Add_ri, &MacroOp::TestJmp_fused, &MacroOp::Xor_self, &MacroOp::Cmp_ri, &MacroOp::Setcc_r, &MacroOp::Add_rr }; + + + class LightInstructionInfo { + public: + LightInstructionInfo(const char* name, const MacroOp* op) + : name_(name), op_(op), opsCount_(1), latency_(op->getLatency()) {} + template + LightInstructionInfo(const char* name, const MacroOp*(&arr)[N]) + : name_(name), ops_(arr), opsCount_(N), latency_(0) { + for (unsigned i = 0; i < N; ++i) { + latency_ += arr[i]->getLatency(); + } + static_assert(N > 1, "Invalid array size"); + } + template + LightInstructionInfo(const char* name, const MacroOp*(&arr)[N], int latency) + : name_(name), ops_(arr), opsCount_(N), latency_(latency) { + static_assert(N > 1, "Invalid array size"); + } + const char* getName() const { + return name_; + } + int getSize() const { + return opsCount_; + } + bool isSimple() const { + return opsCount_ == 1; + } + int getLatency() const { + return latency_; + } + const MacroOp* getOp(int index) const { + return opsCount_ > 1 ? ops_[index] : op_; + } + static const LightInstructionInfo IADD_R; + static const LightInstructionInfo IADD_C; + static const LightInstructionInfo IADD_RC; + static const LightInstructionInfo ISUB_R; + static const LightInstructionInfo IMUL_9C; + static const LightInstructionInfo IMUL_R; + static const LightInstructionInfo IMUL_C; + static const LightInstructionInfo IMULH_R; + static const LightInstructionInfo ISMULH_R; + static const LightInstructionInfo IMUL_RCP; + static const LightInstructionInfo IXOR_R; + static const LightInstructionInfo IXOR_C; + static const LightInstructionInfo IROR_R; + static const LightInstructionInfo IROR_C; + static const LightInstructionInfo COND_R; + static const LightInstructionInfo NOP; + private: + const char* name_; + union { + const MacroOp** ops_; + const MacroOp* op_; + }; + int opsCount_; + int latency_; + + LightInstructionInfo(const char* name) + : name_(name), opsCount_(0), latency_(0) {} + }; + + const LightInstructionInfo LightInstructionInfo::IADD_R = LightInstructionInfo("IADD_R", &MacroOp::Add_rr); + const LightInstructionInfo LightInstructionInfo::IADD_C = LightInstructionInfo("IADD_C", &MacroOp::Add_ri); + const LightInstructionInfo LightInstructionInfo::IADD_RC = LightInstructionInfo("IADD_RC", &MacroOp::Lea_sib); + const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", &MacroOp::Sub_rr); + const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", &MacroOp::Lea_sib); + const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", &MacroOp::Imul_rr); + const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", &MacroOp::Imul_rri); + const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", IMULH_R_ops_array); + const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", ISMULH_R_ops_array); + const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", IMUL_RCP_ops_array); + const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", &MacroOp::Xor_rr); + const LightInstructionInfo LightInstructionInfo::IXOR_C = LightInstructionInfo("IXOR_C", &MacroOp::Xor_ri); + const LightInstructionInfo LightInstructionInfo::IROR_R = LightInstructionInfo("IROR_R", IROR_R_ops_array); + const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", &MacroOp::Ror_ri); + const LightInstructionInfo LightInstructionInfo::COND_R = LightInstructionInfo("COND_R", COND_R_ops_array); + const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP"); + + const int buffer0[] = { 3, 3, 10 }; + const int buffer1[] = { 7, 3, 3, 3 }; + const int buffer2[] = { 3, 3, 3, 7 }; + const int buffer3[] = { 4, 8, 4 }; + const int buffer4[] = { 4, 4, 4, 4 }; + const int buffer5[] = { 3, 7, 3, 3 }; + const int buffer6[] = { 3, 3, 7, 3 }; + const int buffer7[] = { 13, 3 }; + + class DecoderBuffer { + public: + static DecoderBuffer Default; + template + DecoderBuffer(const char* name, int index, const int(&arr)[N]) + : name_(name), index_(index), counts_(arr), opsCount_(N) {} + const int* getCounts() const { + return counts_; + } + int getSize() const { + return opsCount_; + } + int getIndex() const { + return index_; + } + const char* getName() const { + return name_; + } + const DecoderBuffer& fetchNext(int prevType, Blake2Generator& gen) { + if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R) + return decodeBuffers[0]; + if (index_ == 0) { + if ((gen.getByte() % 2) == 0) + return decodeBuffers[3]; + else + return decodeBuffers[4]; + } + if (index_ == 2) { + return decodeBuffers[7]; + } + if (index_ == 7) { + return decodeBuffers[1]; + } + return fetchNextDefault(gen); + } + private: + const char* name_; + int index_; + const int* counts_; + int opsCount_; + DecoderBuffer() : index_(-1) {} + static const DecoderBuffer decodeBuffers[8]; + const DecoderBuffer& fetchNextDefault(Blake2Generator& gen) { + int select; + do { + select = gen.getByte() & 7; + } while (select == 7); + return decodeBuffers[select]; + } + }; + + const DecoderBuffer DecoderBuffer::decodeBuffers[8] = { + DecoderBuffer("3,3,10", 0, buffer0), + DecoderBuffer("7,3,3,3", 1, buffer1), + DecoderBuffer("3,3,3,7", 2, buffer2), + DecoderBuffer("4,8,4", 3, buffer3), + DecoderBuffer("4,4,4,4", 4, buffer4), + DecoderBuffer("3,7,3,3", 5, buffer5), + DecoderBuffer("3,3,7,3", 6, buffer6), + DecoderBuffer("13,3", 7, buffer7), + }; + + DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); + + const int slot_3[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IADD_R }; + const int slot_3L[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IMULH_R, LightInstructionType::ISMULH_R, LightInstructionType::IXOR_R, LightInstructionType::IMULH_R, LightInstructionType::ISMULH_R }; + const int slot_3F[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IROR_R }; + const int slot_4[] = { LightInstructionType::IMUL_R, LightInstructionType::IROR_C }; + const int slot_7[] = { LightInstructionType::IADD_C, LightInstructionType::IMUL_C, LightInstructionType::IXOR_C, LightInstructionType::IXOR_C }; + const int slot_7L = LightInstructionType::COND_R; + const int slot_8[] = { LightInstructionType::IADD_RC, LightInstructionType::IMUL_9C }; + const int slot_10 = LightInstructionType::IMUL_RCP; + + class LightInstruction { + public: + Instruction toInstr() { + Instruction instr; + instr.opcode = lightInstructionOpcode[type_]; + instr.dst = dst_; + instr.src = src_ >= 0 ? src_ : dst_; + instr.mod = mod_; + instr.setImm32(imm32_); + return instr; + } + + static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, bool isLast = false, bool isFirst = false) { + switch (slotSize) + { + case 3: + if (isLast) { + return create(slot_3L[gen.getByte() & 7], gen); + } + else if (isFirst) { + return create(slot_3F[gen.getByte() & 3], gen); + } + else { + return create(slot_3[gen.getByte() & 3], gen); + } + case 4: + return create(slot_4[gen.getByte() & 1], gen); + case 7: + if (isLast) { + return create(slot_7L, gen); + } + else { + return create(slot_7[gen.getByte() & 3], gen); + } + case 8: + return create(slot_8[gen.getByte() & 1], gen); + case 10: + return create(slot_10, gen); + default: + break; + } + } + + static LightInstruction create(int type, Blake2Generator& gen) { + LightInstruction li; + li.type_ = type; + li.opGroup_ = type; + switch (type) + { + case LightInstructionType::IADD_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IADD_R; + li.opGroup_ = LightInstructionType::IADD_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IADD_C: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IADD_C; + li.opGroup_ = LightInstructionType::IADD_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IADD_RC: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IADD_RC; + li.opGroup_ = LightInstructionType::IADD_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::ISUB_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::ISUB_R; + li.opGroup_ = LightInstructionType::IADD_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IMUL_9C: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IMUL_9C; + li.opGroup_ = LightInstructionType::IMUL_C; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::IMUL_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IMUL_R; + li.opGroup_ = LightInstructionType::IMUL_R; + li.opGroupPar_ = gen.getInt32(); + } break; + + case LightInstructionType::IMUL_C: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IMUL_C; + li.opGroup_ = LightInstructionType::IMUL_C; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IMULH_R: { + li.dst_ = gen.getByte() & 7; + li.src_ = gen.getByte() & 7; + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IMULH_R; + li.opGroup_ = LightInstructionType::IMULH_R; + li.opGroupPar_ = gen.getInt32(); + } break; + + case LightInstructionType::ISMULH_R: { + li.dst_ = gen.getByte() & 7; + li.src_ = gen.getByte() & 7; + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::ISMULH_R; + li.opGroup_ = LightInstructionType::ISMULH_R; + li.opGroupPar_ = gen.getInt32(); + } break; + + case LightInstructionType::IMUL_RCP: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IMUL_RCP; + li.opGroup_ = LightInstructionType::IMUL_C; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::IXOR_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IXOR_R; + li.opGroup_ = LightInstructionType::IXOR_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IXOR_C: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IXOR_C; + li.opGroup_ = LightInstructionType::IXOR_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IROR_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IROR_R; + li.opGroup_ = LightInstructionType::IROR_R; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::IROR_C: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getByte(); + li.info_ = &LightInstructionInfo::IROR_C; + li.opGroup_ = LightInstructionType::IROR_R; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::COND_R: { + li.dst_ = gen.getByte() & 7; + li.src_ = gen.getByte() & 7; + li.mod_ = gen.getByte(); + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::COND_R; + li.opGroup_ = LightInstructionType::COND_R; + li.opGroupPar_ = li.imm32_; + } break; + + default: + break; + } + + return li; + } + + int getType() { + return type_; + } + int getSource() { + return src_; + } + int getDestination() { + return dst_; + } + int getGroup() { + return opGroup_; + } + int getGroupPar() { + return opGroupPar_; + } + + const LightInstructionInfo* getInfo() { + return info_; + } + + static const LightInstruction Null; + + private: + int type_; + int src_; + int dst_; + int mod_; + uint32_t imm32_; + + const LightInstructionInfo* info_; + int opGroup_; + int opGroupPar_; + + LightInstruction() {} + LightInstruction(int type, const LightInstructionInfo* info) : type_(type), info_(info) {} + }; + + class RegisterInfo { + public: + RegisterInfo() : lastOpGroup(-1), source(-1), value(0), latency(0) {} + int lastOpGroup; + int source; + int value; + int latency; + }; + + const LightInstruction LightInstruction::Null = LightInstruction(-1, &LightInstructionInfo::NOP); + constexpr int ALU_COUNT_MUL = 1; constexpr int ALU_COUNT = 4; constexpr int LIGHT_OPCODE_BITS = 4; @@ -106,16 +671,61 @@ namespace RandomX { } } + void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister) { + + bool portBusy[RANDOMX_LPROG_LATENCY][3]; + RegisterInfo registers[8]; + bool decoderBusy[RANDOMX_LPROG_LATENCY][4]; + Blake2Generator gen(seed); + std::vector instructions; + + DecoderBuffer& fetchLine = DecoderBuffer::Default; + LightInstruction currentInstruction = LightInstruction::Null; + int instrIndex = 0; + int codeSize = 0; + int macroOpCount = 0; + int rxOpCount = 0; + + for (int cycle = 0; cycle < 170; ++cycle) { + fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); + std::cout << "; cycle " << cycle << " buffer " << fetchLine.getName() << std::endl; + + int mopIndex = 0; + + while (mopIndex < fetchLine.getSize()) { + if (instrIndex >= currentInstruction.getInfo()->getSize()) { + currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); + instrIndex = 0; + std::cout << "; " << currentInstruction.getInfo()->getName() << std::endl; + rxOpCount++; + } + if (fetchLine.getCounts()[mopIndex] != currentInstruction.getInfo()->getOp(instrIndex)->getSize()) { + std::cout << "ERROR instruction " << currentInstruction.getInfo()->getOp(instrIndex)->getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; + return; + } + std::cout << currentInstruction.getInfo()->getOp(instrIndex)->getName() << std::endl; + codeSize += currentInstruction.getInfo()->getOp(instrIndex)->getSize(); + mopIndex++; + instrIndex++; + macroOpCount++; + } + } + + std::cout << "; code size " << codeSize << std::endl; + std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; + std::cout << "; RandomX instructions: " << rxOpCount << std::endl; + } + void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister) { // Source: https://www.agner.org/optimize/instruction_tables.pdf - const int op_latency[LightInstruction::COUNT] = { 1, 2, 1, 2, 3, 5, 5, 4, 1, 2, 5 }; + const int op_latency[LightInstructionType::COUNT] = { 1, 2, 1, 2, 3, 5, 5, 4, 1, 2, 5 }; // Instruction latencies for theoretical ASIC implementation - const int asic_op_latency[LightInstruction::COUNT] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + const int asic_op_latency[LightInstructionType::COUNT] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; // Available ALUs for each instruction - const int op_ALUs[LightInstruction::COUNT] = { ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + const int op_ALUs[LightInstructionType::COUNT] = { ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT }; uint8_t data[64]; memset(data, 0, sizeof(data)); @@ -147,7 +757,7 @@ namespace RandomX { uint64_t inst_data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; bool alu_busy[RANDOMX_LPROG_LATENCY + 1][ALU_COUNT]; - bool is_rotation[LightInstruction::COUNT]; + bool is_rotation[LightInstructionType::COUNT]; bool rotated[8]; int rotate_count = 0; @@ -156,7 +766,7 @@ namespace RandomX { memset(alu_busy, 0, sizeof(alu_busy)); memset(is_rotation, 0, sizeof(is_rotation)); memset(rotated, 0, sizeof(rotated)); - is_rotation[LightInstruction::IROR_R] = true; + is_rotation[LightInstructionType::IROR_R] = true; int num_retries = 0; code_size = 0; @@ -201,12 +811,12 @@ namespace RandomX { // 2x IMUL_RCP(a, C) = a * (C * C) // 2x IXOR_R = NOP // 2x IROR_R(a, b) = IROR_R(a, 2*b) - if (instrType != LightInstruction::IMULH_R && instrType != LightInstruction::ISMULH_R && ((inst_data[a] & 0xFFFF00) == (instrType << 8) + ((inst_data[b] & 255) << 16))) + if (instrType != LightInstructionType::IMULH_R && instrType != LightInstructionType::ISMULH_R && ((inst_data[a] & 0xFFFF00) == (instrType << 8) + ((inst_data[b] & 255) << 16))) { continue; } - if ((instrType == LightInstruction::IADD_RC) || (instrType == LightInstruction::IMUL_9C) || (instrType == LightInstruction::IMUL_RCP) || (instrType == LightInstruction::COND_R) || ((instrType != LightInstruction::IMULH_R) && (instrType != LightInstruction::ISMULH_R) && (a == b))) + if ((instrType == LightInstructionType::IADD_RC) || (instrType == LightInstructionType::IMUL_9C) || (instrType == LightInstructionType::IMUL_RCP) || (instrType == LightInstructionType::COND_R) || ((instrType != LightInstructionType::IMULH_R) && (instrType != LightInstructionType::ISMULH_R) && (a == b))) { check_data(data_index, 4, data, sizeof(data)); imm32 = load32(&data[data_index++]); @@ -222,7 +832,7 @@ namespace RandomX { if (!alu_busy[next_latency][i]) { // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check - if ((instrType == LightInstruction::IADD_RC || instrType == LightInstruction::IMUL_9C || instrType == LightInstruction::IMULH_R || instrType == LightInstruction::ISMULH_R) && alu_busy[next_latency + 1][i]) + if ((instrType == LightInstructionType::IADD_RC || instrType == LightInstructionType::IMUL_9C || instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) && alu_busy[next_latency + 1][i]) { continue; } @@ -275,7 +885,7 @@ namespace RandomX { prog(code_size).src = src_index; prog(code_size).setImm32(imm32); - if (instrType == LightInstruction::IADD_RC || instrType == LightInstruction::IMUL_9C || instrType == LightInstruction::IMULH_R || instrType == LightInstruction::ISMULH_R) + if (instrType == LightInstructionType::IADD_RC || instrType == LightInstructionType::IMUL_9C || instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) { // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too alu_busy[next_latency - op_latency[instrType] + 1][alu_index] = true; @@ -308,7 +918,7 @@ namespace RandomX { if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; } - const int pattern[3] = { LightInstruction::IMUL_R, LightInstruction::IROR_R, LightInstruction::IMUL_R }; + const int pattern[3] = { LightInstructionType::IMUL_R, LightInstructionType::IROR_R, LightInstructionType::IMUL_R }; const int instrType = pattern[(code_size - prev_code_size) % 3]; latency[min_idx] = latency[max_idx] + op_latency[instrType]; asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[instrType]; diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index 71c4a7c..a7762b1 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -21,4 +21,5 @@ along with RandomX. If not, see. namespace RandomX { void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister); + void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister); } \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 61bb2ff..8c1f64a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -223,8 +223,8 @@ int main(int argc, char** argv) { if (genLight) { RandomX::LightProgram p; - RandomX::generateLightProgram(p, seed, 0); - std::cout << p << std::endl; + RandomX::generateLightProg2(p, seed, 0); + //std::cout << p << std::endl; return 0; } diff --git a/src/program.inc b/src/program.inc index 46d8093..97a8122 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,3 +1,5 @@ + mov ebx, 111 ; Start marker bytes + db 064h, 067h, 090h ; Start marker bytes randomx_isn_0: ; IROR_R r3, 30 ror r11, 30 @@ -1001,3 +1003,5 @@ randomx_isn_255: ; IROR_R r7, r3 mov ecx, r11d ror r15, cl + mov ebx, 222 ; End marker bytes + db 064h, 067h, 090h ; End marker bytes \ No newline at end of file