/* Copyright (c) 2019 tevador This file is part of RandomX. RandomX is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. RandomX is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ #include "configuration.h" #include "Program.hpp" #include "blake2/endian.h" #include #include #include #include #include #include "superscalarGenerator.hpp" namespace RandomX { static bool isMultiplication(int type) { return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP; } //uOPs (micro-ops) are represented only by the execution port they can go to namespace ExecutionPort { using type = int; constexpr type Null = 0; constexpr type P0 = 1; constexpr type P1 = 2; constexpr type P5 = 4; constexpr type P01 = P0 | P1; constexpr type P05 = P0 | P5; constexpr type P015 = P0 | P1 | P5; } //Macro-operation as output of the x86 decoder //Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op //Macro-op can consist of 1 or 2 uOPs. class MacroOp { public: MacroOp(const char* name, int size) : name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {} MacroOp(const char* name, int size, int latency, ExecutionPort::type uop) : name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {} MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2) : name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {} MacroOp(const MacroOp& parent, bool dependent) : name_(parent.name_), size_(parent.size_), latency_(parent.latency_), uop1_(parent.uop1_), uop2_(parent.uop2_), dependent_(dependent) {} const char* getName() const { return name_; } int getSize() const { return size_; } int getLatency() const { return latency_; } ExecutionPort::type getUop1() const { return uop1_; } ExecutionPort::type getUop2() const { return uop2_; } bool isSimple() const { return uop2_ == ExecutionPort::Null; } bool isEliminated() const { return uop1_ == ExecutionPort::Null; } bool isDependent() const { return dependent_; } static const MacroOp Add_rr; static const MacroOp Add_ri; static const MacroOp Lea_sib; static const MacroOp Sub_rr; static const MacroOp Imul_rr; static const MacroOp Imul_r; static const MacroOp Mul_r; static const MacroOp Mov_rr; static const MacroOp Mov_ri64; static const MacroOp Xor_rr; static const MacroOp Xor_ri; static const MacroOp Ror_rcl; static const MacroOp Ror_ri; static const MacroOp TestJz_fused; static const MacroOp Xor_self; static const MacroOp Cmp_ri; static const MacroOp Setcc_r; private: const char* name_; int size_; int latency_; ExecutionPort::type uop1_; ExecutionPort::type uop2_; bool dependent_ = false; }; //Size: 3 bytes const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015); const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015); const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015); const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3); //Size: 4 bytes const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01); const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05); //Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes) const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015); //Size: 10 bytes const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); //Unused: const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5); const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; class SuperscalarInstructionInfo { public: const char* getName() const { return name_; } int getSize() const { return ops_.size(); } bool isSimple() const { return getSize() == 1; } int getLatency() const { return latency_; } const MacroOp& getOp(int index) const { return ops_[index]; } int getType() const { return type_; } int getResultOp() const { return resultOp_; } int getDstOp() const { return dstOp_; } int getSrcOp() const { return srcOp_; } static const SuperscalarInstructionInfo ISUB_R; static const SuperscalarInstructionInfo IXOR_R; static const SuperscalarInstructionInfo IADD_RS; static const SuperscalarInstructionInfo IMUL_R; static const SuperscalarInstructionInfo IROR_C; static const SuperscalarInstructionInfo IADD_C7; static const SuperscalarInstructionInfo IXOR_C7; static const SuperscalarInstructionInfo IADD_C8; static const SuperscalarInstructionInfo IXOR_C8; static const SuperscalarInstructionInfo IADD_C9; static const SuperscalarInstructionInfo IXOR_C9; static const SuperscalarInstructionInfo IMULH_R; static const SuperscalarInstructionInfo ISMULH_R; static const SuperscalarInstructionInfo IMUL_RCP; static const SuperscalarInstructionInfo NOP; private: const char* name_; int type_; std::vector ops_; int latency_; int resultOp_ = 0; int dstOp_ = 0; int srcOp_; SuperscalarInstructionInfo(const char* name) : name_(name), type_(-1), latency_(0) {} SuperscalarInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp) : name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) { ops_.push_back(MacroOp(op)); } template SuperscalarInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) : name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { for (unsigned i = 0; i < N; ++i) { ops_.push_back(MacroOp(arr[i])); latency_ += ops_.back().getLatency(); } static_assert(N > 1, "Invalid array size"); } }; const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP"); //these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions. //RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate). //Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction. const int buffer0[] = { 4, 8, 4 }; const int buffer1[] = { 7, 3, 3, 3 }; const int buffer2[] = { 3, 7, 3, 3 }; const int buffer3[] = { 4, 9, 3 }; const int buffer4[] = { 4, 4, 4, 4 }; const int buffer5[] = { 3, 3, 10 }; class DecoderBuffer { public: static const DecoderBuffer Default; template DecoderBuffer(const char* name, int index, const int(&arr)[N]) : name_(name), index_(index), counts_(arr), opsCount_(N) {} const int* getCounts() const { return counts_; } int getSize() const { return opsCount_; } int getIndex() const { return index_; } const char* getName() const { return name_; } const DecoderBuffer* fetchNext(int instrType, int cycle, int mulCount, Blake2Generator& gen) const { //If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10 //because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs. //Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops. if (instrType == SuperscalarInstructionType::IMULH_R || instrType == SuperscalarInstructionType::ISMULH_R) return &decodeBuffer3310; //To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications //is lower than the number of cycles. if (mulCount < cycle + 1) return &decodeBuffer4444; //If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication. if(instrType == SuperscalarInstructionType::IMUL_RCP) return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493; //Default: select a random fetch configuration. return fetchNextDefault(gen); } private: const char* name_; int index_; const int* counts_; int opsCount_; DecoderBuffer() : index_(-1) {} static const DecoderBuffer decodeBuffer484; static const DecoderBuffer decodeBuffer7333; static const DecoderBuffer decodeBuffer3733; static const DecoderBuffer decodeBuffer493; static const DecoderBuffer decodeBuffer4444; static const DecoderBuffer decodeBuffer3310; static const DecoderBuffer* decodeBuffers[4]; const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { return decodeBuffers[gen.getByte() & 3]; } }; const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0); const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2); const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer("4,9,3", 3, buffer3); const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4); const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 5, buffer5); const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = { &DecoderBuffer::decodeBuffer484, &DecoderBuffer::decodeBuffer7333, &DecoderBuffer::decodeBuffer3733, &DecoderBuffer::decodeBuffer493, }; const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); const SuperscalarInstructionInfo* slot_3[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R }; const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R }; const SuperscalarInstructionInfo* slot_4[] = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS }; const SuperscalarInstructionInfo* slot_7[] = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 }; const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 }; const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 }; const SuperscalarInstructionInfo* slot_10 = &SuperscalarInstructionInfo::IMUL_RCP; static bool selectRegister(std::vector& availableRegisters, Blake2Generator& gen, int& reg) { int index; if (availableRegisters.size() == 0) return false; if (availableRegisters.size() > 1) { index = gen.getInt32() % availableRegisters.size(); } else { index = 0; } reg = availableRegisters[index]; return true; } class RegisterInfo { public: RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {} int latency; int lastOpGroup; int lastOpPar; int value; }; //"SuperscalarInstruction" consists of one or more macro-ops class SuperscalarInstruction { public: void toInstr(Instruction& instr) { //translate to a RandomX instruction format instr.opcode = getType(); instr.dst = dst_; instr.src = src_ >= 0 ? src_ : dst_; instr.mod = mod_; instr.setImm32(imm32_); } void createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) { switch (slotSize) { case 3: //if this is the last slot, we can also select "IMULH" instructions if (isLast) { create(slot_3L[gen.getByte() & 3], gen); } else { create(slot_3[gen.getByte() & 1], gen); } break; case 4: //if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions if (fetchType == 4 && !isLast) { create(&SuperscalarInstructionInfo::IMUL_R, gen); } else { create(slot_4[gen.getByte() & 1], gen); } break; case 7: create(slot_7[gen.getByte() & 1], gen); break; case 8: create(slot_8[gen.getByte() & 1], gen); break; case 9: create(slot_9[gen.getByte() & 1], gen); break; case 10: create(slot_10, gen); break; default: UNREACHABLE; } } void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) { info_ = info; reset(); switch (info->getType()) { case SuperscalarInstructionType::ISUB_R: { mod_ = 0; imm32_ = 0; opGroup_ = SuperscalarInstructionType::IADD_RS; groupParIsSource_ = true; } break; case SuperscalarInstructionType::IXOR_R: { mod_ = 0; imm32_ = 0; opGroup_ = SuperscalarInstructionType::IXOR_R; groupParIsSource_ = true; } break; case SuperscalarInstructionType::IADD_RS: { mod_ = gen.getByte(); imm32_ = 0; opGroup_ = SuperscalarInstructionType::IADD_RS; groupParIsSource_ = true; } break; case SuperscalarInstructionType::IMUL_R: { mod_ = 0; imm32_ = 0; opGroup_ = SuperscalarInstructionType::IMUL_R; groupParIsSource_ = true; } break; case SuperscalarInstructionType::IROR_C: { mod_ = 0; do { imm32_ = gen.getByte() & 63; } while (imm32_ == 0); opGroup_ = SuperscalarInstructionType::IROR_C; opGroupPar_ = -1; } break; case SuperscalarInstructionType::IADD_C7: case SuperscalarInstructionType::IADD_C8: case SuperscalarInstructionType::IADD_C9: { mod_ = 0; imm32_ = gen.getInt32(); opGroup_ = SuperscalarInstructionType::IADD_C7; opGroupPar_ = -1; } break; case SuperscalarInstructionType::IXOR_C7: case SuperscalarInstructionType::IXOR_C8: case SuperscalarInstructionType::IXOR_C9: { mod_ = 0; imm32_ = gen.getInt32(); opGroup_ = SuperscalarInstructionType::IXOR_C7; opGroupPar_ = -1; } break; case SuperscalarInstructionType::IMULH_R: { canReuse_ = true; mod_ = 0; imm32_ = 0; opGroup_ = SuperscalarInstructionType::IMULH_R; opGroupPar_ = gen.getInt32(); } break; case SuperscalarInstructionType::ISMULH_R: { canReuse_ = true; mod_ = 0; imm32_ = 0; opGroup_ = SuperscalarInstructionType::ISMULH_R; opGroupPar_ = gen.getInt32(); } break; case SuperscalarInstructionType::IMUL_RCP: { mod_ = 0; do { imm32_ = gen.getInt32(); } while ((imm32_ & (imm32_ - 1)) == 0); opGroup_ = SuperscalarInstructionType::IMUL_RCP; opGroupPar_ = -1; } break; default: break; } } bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (®isters)[8], Blake2Generator& gen) { /*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R) std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/ std::vector availableRegisters; //Conditions for the destination register: // * value must be ready at the required cycle // * cannot be the same as the source register unless the instruction allows it // - this avoids optimizable instructions such as "xor r, r" or "sub r, r" // * register cannot be multiplied twice in a row unless allowChainedMul is true // - this avoids accumulation of trailing zeroes in registers due to excessive multiplication // - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator) // * either the last instruction applied to the register or its source must be different than this instruction // - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2" // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) for (unsigned i = 0; i < 8; ++i) { if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != RegisterNeedsDisplacement)) availableRegisters.push_back(i); } return selectRegister(availableRegisters, gen, dst_); } bool selectSource(int cycle, RegisterInfo(®isters)[8], Blake2Generator& gen) { std::vector availableRegisters; //all registers that are ready at the cycle for (unsigned i = 0; i < 8; ++i) { if (registers[i].latency <= cycle) availableRegisters.push_back(i); } //if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) { if (availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement) { opGroupPar_ = src_ = RegisterNeedsDisplacement; return true; } } if (selectRegister(availableRegisters, gen, src_)) { if (groupParIsSource_) opGroupPar_ = src_; return true; } return false; } int getType() { return info_->getType(); } int getSource() { return src_; } int getDestination() { return dst_; } int getGroup() { return opGroup_; } int getGroupPar() { return opGroupPar_; } const SuperscalarInstructionInfo& getInfo() const { return *info_; } static const SuperscalarInstruction Null; private: const SuperscalarInstructionInfo* info_; int src_ = -1; int dst_ = -1; int mod_; uint32_t imm32_; int opGroup_; int opGroupPar_; bool canReuse_ = false; bool groupParIsSource_ = false; void reset() { src_ = dst_ = -1; canReuse_ = groupParIsSource_ = false; } SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) { } }; const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP); constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4; constexpr int LOOK_FORWARD_CYCLES = 4; constexpr int MAX_THROWAWAY_COUNT = 256; #ifndef _DEBUG constexpr bool TRACE = false; constexpr bool INFO = false; #else constexpr bool TRACE = true; constexpr bool INFO = true; #endif template static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) { //The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload //port P1 (multiplication) by instructions that can go to any port. for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) { if (commit) { if (TRACE) std::cout << "; P5 at cycle " << cycle << std::endl; portBusy[cycle][2] = uop; } return cycle; } if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) { if (commit) { if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; portBusy[cycle][0] = uop; } return cycle; } if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) { if (commit) { if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; portBusy[cycle][1] = uop; } return cycle; } } return -1; } template static int scheduleMop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) { //if this macro-op depends on the previous one, increase the starting cycle if needed //this handles an explicit dependency chain in IMUL_RCP if (mop.isDependent()) { cycle = std::max(cycle, depCycle); } //move instructions are eliminated and don't need an execution unit if (mop.isEliminated()) { if (commit) if (TRACE) std::cout << "; (eliminated)" << std::endl; return cycle; } else if (mop.isSimple()) { //this macro-op has only one uOP return scheduleUop(mop.getUop1(), portBusy, cycle); } else { //macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle for (; cycle < CYCLE_MAP_SIZE; ++cycle) { int cycle1 = scheduleUop(mop.getUop1(), portBusy, cycle); int cycle2 = scheduleUop(mop.getUop2(), portBusy, cycle); if (cycle1 == cycle2) { if (commit) { scheduleUop(mop.getUop1(), portBusy, cycle1); scheduleUop(mop.getUop2(), portBusy, cycle2); } return cycle1; } } } return -1; } void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) { ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; memset(portBusy, 0, sizeof(portBusy)); RegisterInfo registers[8]; const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default; SuperscalarInstruction currentInstruction = SuperscalarInstruction::Null; int macroOpIndex = 0; int codeSize = 0; int macroOpCount = 0; int cycle = 0; int depCycle = 0; int retireCycle = 0; bool portsSaturated = false; int programSize = 0; int mulCount = 0; int decodeCycle; int throwAwayCount = 0; //decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated. //Each decode cycle decodes 16 bytes of x86 code. //Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always //saturated first. The cycle limit is present only to guarantee loop termination. //Program size is limited to RANDOMX_SUPERSCALAR_MAX_SIZE instructions. for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && programSize < RANDOMX_SUPERSCALAR_MAX_SIZE; ++decodeCycle) { //select a decode configuration decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen); if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl; int bufferIndex = 0; //fill all instruction slots in the current decode buffer while (bufferIndex < decodeBuffer->getSize()) { int topCycle = cycle; //if we have issued all macro-ops for the current RandomX instruction, create a new instruction if (macroOpIndex >= currentInstruction.getInfo().getSize()) { if (portsSaturated) break; //select an instruction so that the first macro-op fits into the current slot currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0); macroOpIndex = 0; if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; } const MacroOp& mop = currentInstruction.getInfo().getOp(macroOpIndex); if (TRACE) std::cout << mop.getName() << " "; //calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution int scheduleCycle = scheduleMop(mop, portBusy, cycle, depCycle); if (scheduleCycle < 0) { /*if (TRACE)*/ std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; //__debugbreak(); portsSaturated = true; break; } //find a source register (if applicable) that will be ready when this instruction executes if (macroOpIndex == currentInstruction.getInfo().getSrcOp()) { int forward; //if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) { if (TRACE) std::cout << "; src STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } //if no register was found, throw the instruction away and try another one if (forward == LOOK_FORWARD_CYCLES) { if (throwAwayCount < MAX_THROWAWAY_COUNT) { throwAwayCount++; macroOpIndex = currentInstruction.getInfo().getSize(); if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; //cycle = topCycle; continue; } //abort this decode buffer /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl; currentInstruction = SuperscalarInstruction::Null; break; } if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; } //find a destination register that will be ready when this instruction executes if (macroOpIndex == currentInstruction.getInfo().getDstOp()) { int forward; for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) { if (TRACE) std::cout << "; dst STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away if (throwAwayCount < MAX_THROWAWAY_COUNT) { throwAwayCount++; macroOpIndex = currentInstruction.getInfo().getSize(); if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; //cycle = topCycle; continue; } //abort this decode buffer /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl; currentInstruction = SuperscalarInstruction::Null; break; } if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } throwAwayCount = 0; //recalculate when the instruction can be scheduled for execution based on operand availability scheduleCycle = scheduleMop(mop, portBusy, scheduleCycle, scheduleCycle); //calculate when the result will be ready depCycle = scheduleCycle + mop.getLatency(); //if this instruction writes the result, modify register information // RegisterInfo.latency - which cycle the register will be ready // RegisterInfo.lastOpGroup - the last operation that was applied to the register // RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register) if (macroOpIndex == currentInstruction.getInfo().getResultOp()) { int dst = currentInstruction.getDestination(); RegisterInfo& ri = registers[dst]; retireCycle = depCycle; ri.latency = retireCycle; ri.lastOpGroup = currentInstruction.getGroup(); ri.lastOpPar = currentInstruction.getGroupPar(); if (TRACE) std::cout << "; RETIRED at cycle " << retireCycle << std::endl; } codeSize += mop.getSize(); bufferIndex++; macroOpIndex++; macroOpCount++; //terminating condition if (scheduleCycle >= RANDOMX_SUPERSCALAR_LATENCY) { portsSaturated = true; } cycle = topCycle; //when all macro-ops of the current instruction have been issued, add the instruction into the program if (macroOpIndex >= currentInstruction.getInfo().getSize()) { currentInstruction.toInstr(prog(programSize++)); mulCount += isMultiplication(currentInstruction.getType()); } } ++cycle; } double ipc = (macroOpCount / (double)retireCycle); memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies)); //Calculate ASIC latency: //Assumes 1 cycle latency for all operations and unlimited parallelization. for (int i = 0; i < programSize; ++i) { Instruction& instr = prog(i); int latDst = prog.asicLatencies[instr.dst] + 1; int latSrc = instr.dst != instr.src ? prog.asicLatencies[instr.src] + 1 : 0; prog.asicLatencies[instr.dst] = std::max(latDst, latSrc); } //address register is the register with the highest ASIC latency int asicLatencyMax = 0; int addressReg = 0; for (int i = 0; i < 8; ++i) { if (prog.asicLatencies[i] > asicLatencyMax) { asicLatencyMax = prog.asicLatencies[i]; addressReg = i; } prog.cpuLatencies[i] = registers[i].latency; } prog.setSize(programSize); prog.setAddressRegister(addressReg); prog.cpuLatency = retireCycle; prog.asicLatency = asicLatencyMax; prog.codeSize = codeSize; prog.macroOps = macroOpCount; prog.decodeCycles = decodeCycle; prog.ipc = ipc; prog.mulCount = mulCount; /*if(INFO) std::cout << "; ALU port utilization:" << std::endl; if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl; int portCycles = 0; for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { std::cout << "; " << std::setw(3) << i << " "; for (int j = 0; j < 3; ++j) { std::cout << (portBusy[i][j] ? '*' : '_'); portCycles += !!portBusy[i][j]; } std::cout << std::endl; }*/ } }