RandomWOW/src/superscalar.cpp

/*
Copyright (c) 2019 tevador

This file is part of RandomX.

RandomX is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
*/

#include "configuration.h"
#include "program.hpp"
#include "blake2/endian.h"
#include <iostream>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <iomanip>
#include "superscalar.hpp"
#include "intrin_portable.h"
#include "reciprocal.h"

namespace randomx {

	static bool isMultiplication(int type) {
		return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP;
	}

	//uOPs (micro-ops) are represented only by the execution port they can go to
	namespace ExecutionPort {
		using type = int;
		constexpr type Null = 0;
		constexpr type P0 = 1;
		constexpr type P1 = 2;
		constexpr type P5 = 4;
		constexpr type P01 = P0 | P1;
		constexpr type P05 = P0 | P5;
		constexpr type P015 = P0 | P1 | P5;
	}

	//Macro-operation as output of the x86 decoder
	//Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op
	//Macro-op can consist of 1 or 2 uOPs.
	class MacroOp {
	public:
		MacroOp(const char* name, int size)
			: name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {}
		MacroOp(const char* name, int size, int latency, ExecutionPort::type uop)
			: name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {}
		MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2)
			: name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {}
		MacroOp(const MacroOp& parent, bool dependent)
			: name_(parent.name_), size_(parent.size_), latency_(parent.latency_), uop1_(parent.uop1_), uop2_(parent.uop2_), dependent_(dependent) {}
		const char* getName() const {
			return name_;
		}
		int getSize() const {
			return size_;
		}
		int getLatency() const {
			return latency_;
		}
		ExecutionPort::type getUop1() const {
			return uop1_;
		}
		ExecutionPort::type getUop2() const {
			return uop2_;
		}
		bool isSimple() const {
			return uop2_ == ExecutionPort::Null;
		}
		bool isEliminated() const {
			return uop1_ == ExecutionPort::Null;
		}
		bool isDependent() const {
			return dependent_;
		}
		static const MacroOp Add_rr;
		static const MacroOp Add_ri;
		static const MacroOp Lea_sib;
		static const MacroOp Sub_rr;
		static const MacroOp Imul_rr;
		static const MacroOp Imul_r;
		static const MacroOp Mul_r;
		static const MacroOp Mov_rr;
		static const MacroOp Mov_ri64;
		static const MacroOp Xor_rr;
		static const MacroOp Xor_ri;
		static const MacroOp Ror_rcl;
		static const MacroOp Ror_ri;
		static const MacroOp TestJz_fused;
		static const MacroOp Xor_self;
		static const MacroOp Cmp_ri;
		static const MacroOp Setcc_r;
	private:
		const char* name_;
		int size_;
		int latency_;
		ExecutionPort::type uop1_;
		ExecutionPort::type uop2_;
		bool dependent_ = false;
	};

	//Size: 3 bytes
	const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
	const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5);
	const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3);

	//Size: 4 bytes
	const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01);
	const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1);
	const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05);

	//Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes)
	const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015);

	//Size: 10 bytes
	const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015);

	//Unused:
	const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5);
	const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3);
	const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05);
	const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5);

	const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr };
	const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr };
	const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) };

	class SuperscalarInstructionInfo {
	public:
		const char* getName() const {
			return name_;
		}
		int getSize() const {
			return ops_.size();
		}
		bool isSimple() const {
			return getSize() == 1;
		}
		int getLatency() const {
			return latency_;
		}
		const MacroOp& getOp(int index) const {
			return ops_[index];
		}
		int getType() const {
			return type_;
		}
		int getResultOp() const {
			return resultOp_;
		}
		int getDstOp() const {
			return dstOp_;
		}
		int getSrcOp() const {
			return srcOp_;
		}
		static const SuperscalarInstructionInfo ISUB_R;
		static const SuperscalarInstructionInfo IXOR_R;
		static const SuperscalarInstructionInfo IADD_RS;
		static const SuperscalarInstructionInfo IMUL_R;
		static const SuperscalarInstructionInfo IROR_C;
		static const SuperscalarInstructionInfo IADD_C7;
		static const SuperscalarInstructionInfo IXOR_C7;
		static const SuperscalarInstructionInfo IADD_C8;
		static const SuperscalarInstructionInfo IXOR_C8;
		static const SuperscalarInstructionInfo IADD_C9;
		static const SuperscalarInstructionInfo IXOR_C9;
		static const SuperscalarInstructionInfo IMULH_R;
		static const SuperscalarInstructionInfo ISMULH_R;
		static const SuperscalarInstructionInfo IMUL_RCP;
		static const SuperscalarInstructionInfo NOP;
	private:
		const char* name_;
		int type_;
		std::vector<MacroOp> ops_;
		int latency_;
		int resultOp_ = 0;
		int dstOp_ = 0;
		int srcOp_;

		SuperscalarInstructionInfo(const char* name)
			: name_(name), type_(-1), latency_(0) {}
		SuperscalarInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp)
			: name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) {
			ops_.push_back(MacroOp(op));
		}
		template <size_t N>
		SuperscalarInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp)
			: name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) {
			for (unsigned i = 0; i < N; ++i) {
				ops_.push_back(MacroOp(arr[i]));
				latency_ += ops_.back().getLatency();
			}
			static_assert(N > 1, "Invalid array size");
		}
	};

	const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1);

	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1);

	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1);

	const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP");

	//these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions.
	//RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate).
	//Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction.
	const int buffer0[] = { 4, 8, 4 };
	const int buffer1[] = { 7, 3, 3, 3 };
	const int buffer2[] = { 3, 7, 3, 3 };
	const int buffer3[] = { 4, 9, 3 };
	const int buffer4[] = { 4, 4, 4, 4 };
	const int buffer5[] = { 3, 3, 10 };

	class DecoderBuffer {
	public:
		static const DecoderBuffer Default;
		template <size_t N>
		DecoderBuffer(const char* name, int index, const int(&arr)[N])
			: name_(name), index_(index), counts_(arr), opsCount_(N) {}
		const int* getCounts() const {
			return counts_;
		}
		int getSize() const {
			return opsCount_;
		}
		int getIndex() const {
			return index_;
		}
		const char* getName() const {
			return name_;
		}
		const DecoderBuffer* fetchNext(int instrType, int cycle, int mulCount, Blake2Generator& gen) const {
			//If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10
			//because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs.
			//Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops.
			if (instrType == SuperscalarInstructionType::IMULH_R || instrType == SuperscalarInstructionType::ISMULH_R)
				return &decodeBuffer3310;

			//To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications
			//is lower than the number of cycles.
			if (mulCount < cycle + 1)
				return &decodeBuffer4444;

			//If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication.
			if(instrType == SuperscalarInstructionType::IMUL_RCP)
				return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493;

			//Default: select a random fetch configuration.
			return fetchNextDefault(gen);
		}
	private:
		const char* name_;
		int index_;
		const int* counts_;
		int opsCount_;
		DecoderBuffer() : index_(-1) {}
		static const DecoderBuffer decodeBuffer484;
		static const DecoderBuffer decodeBuffer7333;
		static const DecoderBuffer decodeBuffer3733;
		static const DecoderBuffer decodeBuffer493;
		static const DecoderBuffer decodeBuffer4444;
		static const DecoderBuffer decodeBuffer3310;
		static const DecoderBuffer* decodeBuffers[4];
		const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const {
			return decodeBuffers[gen.getByte() & 3];
		}
	};

	const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0);
	const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1);
	const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2);
	const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer("4,9,3", 3, buffer3);
	const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4);
	const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 5, buffer5);

	const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = {
			&DecoderBuffer::decodeBuffer484,
			&DecoderBuffer::decodeBuffer7333,
			&DecoderBuffer::decodeBuffer3733,
			&DecoderBuffer::decodeBuffer493,
	};

	const DecoderBuffer DecoderBuffer::Default = DecoderBuffer();

	const SuperscalarInstructionInfo* slot_3[]  = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R };
	const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R };
	const SuperscalarInstructionInfo* slot_4[]  = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS };
	const SuperscalarInstructionInfo* slot_7[]  = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 };
	const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 };
	const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 };
	const SuperscalarInstructionInfo* slot_10   = &SuperscalarInstructionInfo::IMUL_RCP;

	static bool selectRegister(std::vector<int>& availableRegisters, Blake2Generator& gen, int& reg) {
		int index;
		if (availableRegisters.size() == 0)
			return false;

		if (availableRegisters.size() > 1) {
			index = gen.getInt32() % availableRegisters.size();
		}
		else {
			index = 0;
		}
		reg = availableRegisters[index];
		return true;
	}

	class RegisterInfo {
	public:
		RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {}
		int latency;
		int lastOpGroup;
		int lastOpPar;
		int value;
	};

	//"SuperscalarInstruction" consists of one or more macro-ops
	class SuperscalarInstruction {
	public:
		void toInstr(Instruction& instr) { //translate to a RandomX instruction format
			instr.opcode = getType();
			instr.dst = dst_;
			instr.src = src_ >= 0 ? src_ : dst_;
			instr.setMod(mod_);
			instr.setImm32(imm32_);
		}

		void createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) {
			switch (slotSize)
			{
			case 3:
				//if this is the last slot, we can also select "IMULH" instructions
				if (isLast) {
					create(slot_3L[gen.getByte() & 3], gen);
				}
				else {
					create(slot_3[gen.getByte() & 1], gen);
				}
				break;
			case 4:
				//if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions
				if (fetchType == 4 && !isLast) {
					create(&SuperscalarInstructionInfo::IMUL_R, gen);
				}
				else {
					create(slot_4[gen.getByte() & 1], gen);
				}
				break;
			case 7:
				create(slot_7[gen.getByte() & 1], gen);
				break;
			case 8:
				create(slot_8[gen.getByte() & 1], gen);
				break;
			case 9:
				create(slot_9[gen.getByte() & 1], gen);
				break;
			case 10:
				create(slot_10, gen);
				break;
			default:
				UNREACHABLE;
			}
		}

		void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) {
			info_ = info;
			reset();
			switch (info->getType())
			{
			case SuperscalarInstructionType::ISUB_R: {
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IADD_RS;
				groupParIsSource_ = true;
			} break;

			case SuperscalarInstructionType::IXOR_R: {
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IXOR_R;
				groupParIsSource_ = true;
			} break;

			case SuperscalarInstructionType::IADD_RS: {
				mod_ = gen.getByte();
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IADD_RS;
				groupParIsSource_ = true;
			} break;

			case SuperscalarInstructionType::IMUL_R: {
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IMUL_R;
				groupParIsSource_ = true;
			} break;

			case SuperscalarInstructionType::IROR_C: {
				mod_ = 0;
				do {
					imm32_ = gen.getByte() & 63;
				} while (imm32_ == 0);
				opGroup_ = SuperscalarInstructionType::IROR_C;
				opGroupPar_ = -1;
			} break;

			case SuperscalarInstructionType::IADD_C7:
			case SuperscalarInstructionType::IADD_C8:
			case SuperscalarInstructionType::IADD_C9: {
				mod_ = 0;
				imm32_ = gen.getInt32();
				opGroup_ = SuperscalarInstructionType::IADD_C7;
				opGroupPar_ = -1;
			} break;

			case SuperscalarInstructionType::IXOR_C7:
			case SuperscalarInstructionType::IXOR_C8:
			case SuperscalarInstructionType::IXOR_C9: {
				mod_ = 0;
				imm32_ = gen.getInt32();
				opGroup_ = SuperscalarInstructionType::IXOR_C7;
				opGroupPar_ = -1;
			} break;

			case SuperscalarInstructionType::IMULH_R: {
				canReuse_ = true;
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IMULH_R;
				opGroupPar_ = gen.getInt32();
			} break;

			case SuperscalarInstructionType::ISMULH_R: {
				canReuse_ = true;
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::ISMULH_R;
				opGroupPar_ = gen.getInt32();
			} break;

			case SuperscalarInstructionType::IMUL_RCP: {
				mod_ = 0;
				do {
					imm32_ = gen.getInt32();
				} while ((imm32_ & (imm32_ - 1)) == 0);
				opGroup_ = SuperscalarInstructionType::IMUL_RCP;
				opGroupPar_ = -1;
			} break;

			default:
				break;
			}
		}

		bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (&registers)[8], Blake2Generator& gen) {
			/*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R)
				std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/
			std::vector<int> availableRegisters;
			//Conditions for the destination register:
			// * value must be ready at the required cycle
			// * cannot be the same as the source register unless the instruction allows it
			//   - this avoids optimizable instructions such as "xor r, r" or "sub r, r"
			// * register cannot be multiplied twice in a row unless allowChainedMul is true
			//   - this avoids accumulation of trailing zeroes in registers due to excessive multiplication
			//   - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator)
			// * either the last instruction applied to the register or its source must be different than this instruction
			//   - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2"
			// * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction)
			for (unsigned i = 0; i < 8; ++i) {
				if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != RegisterNeedsDisplacement))
					availableRegisters.push_back(i);
			}
			return selectRegister(availableRegisters, gen, dst_);
		}

		bool selectSource(int cycle, RegisterInfo(&registers)[8], Blake2Generator& gen) {
			std::vector<int> availableRegisters;
			//all registers that are ready at the cycle
			for (unsigned i = 0; i < 8; ++i) {
				if (registers[i].latency <= cycle)
					availableRegisters.push_back(i);
			}
			//if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination
			if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) {
				if (availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement) {
					opGroupPar_ = src_ = RegisterNeedsDisplacement;
					return true;
				}
			}
			if (selectRegister(availableRegisters, gen, src_)) {
				if (groupParIsSource_)
					opGroupPar_ = src_;
				return true;
			}
			return false;
		}

		int getType() {
			return info_->getType();
		}
		int getSource() {
			return src_;
		}
		int getDestination() {
			return dst_;
		}
		int getGroup() {
			return opGroup_;
		}
		int getGroupPar() {
			return opGroupPar_;
		}

		const SuperscalarInstructionInfo& getInfo() const {
			return *info_;
		}

		static const SuperscalarInstruction Null;

	private:
		const SuperscalarInstructionInfo* info_;
		int src_ = -1;
		int dst_ = -1;
		int mod_;
		uint32_t imm32_;
		int opGroup_;
		int opGroupPar_;
		bool canReuse_ = false;
		bool groupParIsSource_ = false;

		void reset() {
			src_ = dst_ = -1;
			canReuse_ = groupParIsSource_ = false;
		}

		SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) {
		}
	};

	const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP);

	constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4;
	constexpr int LOOK_FORWARD_CYCLES = 4;
	constexpr int MAX_THROWAWAY_COUNT = 256;

	template<bool commit>
	static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) {
		//The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload
		//port P1 (multiplication) by instructions that can go to any port.
		for (; cycle < CYCLE_MAP_SIZE; ++cycle) {
			if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) {
				if (commit) {
					if (trace) std::cout << "; P5 at cycle " << cycle << std::endl;
					portBusy[cycle][2] = uop;
				}
				return cycle;
			}
			if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) {
				if (commit) {
					if (trace) std::cout << "; P0 at cycle " << cycle << std::endl;
					portBusy[cycle][0] = uop;
				}
				return cycle;
			}
			if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) {
				if (commit) {
					if (trace) std::cout << "; P1 at cycle " << cycle << std::endl;
					portBusy[cycle][1] = uop;
				}
				return cycle;
			}
		}
		return -1;
	}

	template<bool commit>
	static int scheduleMop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) {
		//if this macro-op depends on the previous one, increase the starting cycle if needed
		//this handles an explicit dependency chain in IMUL_RCP
		if (mop.isDependent()) {
			cycle = std::max(cycle, depCycle);
		}
		//move instructions are eliminated and don't need an execution unit
		if (mop.isEliminated()) {
			if (commit)
				if (trace) std::cout << "; (eliminated)" << std::endl;
			return cycle;
		}
		else if (mop.isSimple()) {
			//this macro-op has only one uOP
			return scheduleUop<commit>(mop.getUop1(), portBusy, cycle);
		}
		else {
			//macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle
			for (; cycle < CYCLE_MAP_SIZE; ++cycle) {

				int cycle1 = scheduleUop<false>(mop.getUop1(), portBusy, cycle);
				int cycle2 = scheduleUop<false>(mop.getUop2(), portBusy, cycle);

				if (cycle1 == cycle2) {
					if (commit) {
						scheduleUop<true>(mop.getUop1(), portBusy, cycle1);
						scheduleUop<true>(mop.getUop2(), portBusy, cycle2);
					}
					return cycle1;
				}
			}
		}

		return -1;
	}

	void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) {

		ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3];
		memset(portBusy, 0, sizeof(portBusy));
		RegisterInfo registers[8];

		const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default;
		SuperscalarInstruction currentInstruction = SuperscalarInstruction::Null;
		int macroOpIndex = 0;
		int codeSize = 0;
		int macroOpCount = 0;
		int cycle = 0;
		int depCycle = 0;
		int retireCycle = 0;
		bool portsSaturated = false;
		int programSize = 0;
		int mulCount = 0;
		int decodeCycle;
		int throwAwayCount = 0;

		//decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated.
		//Each decode cycle decodes 16 bytes of x86 code.
		//Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always
		//saturated first. The cycle limit is present only to guarantee loop termination.
		//Program size is limited to RANDOMX_SUPERSCALAR_MAX_SIZE instructions.
		for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && programSize < RANDOMX_SUPERSCALAR_MAX_SIZE; ++decodeCycle) {

			//select a decode configuration
			decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen);
			if (trace) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl;

			int bufferIndex = 0;

			//fill all instruction slots in the current decode buffer
			while (bufferIndex < decodeBuffer->getSize()) {
				int topCycle = cycle;

				//if we have issued all macro-ops for the current RandomX instruction, create a new instruction
				if (macroOpIndex >= currentInstruction.getInfo().getSize()) {
					if (portsSaturated)
						break;
					//select an instruction so that the first macro-op fits into the current slot
					currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0);
					macroOpIndex = 0;
					if (trace) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl;
				}
				const MacroOp& mop = currentInstruction.getInfo().getOp(macroOpIndex);
				if (trace) std::cout << mop.getName() << " ";

				//calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution
				int scheduleCycle = scheduleMop<false>(mop, portBusy, cycle, depCycle);
				if (scheduleCycle < 0) {
					if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl;
					//__debugbreak();
					portsSaturated = true;
					break;
				}

				//find a source register (if applicable) that will be ready when this instruction executes
				if (macroOpIndex == currentInstruction.getInfo().getSrcOp()) {
					int forward;
					//if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward
					for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) {
						if (trace) std::cout << "; src STALL at cycle " << cycle << std::endl;
						++scheduleCycle;
						++cycle;
					}
					//if no register was found, throw the instruction away and try another one
					if (forward == LOOK_FORWARD_CYCLES) {
						if (throwAwayCount < MAX_THROWAWAY_COUNT) {
							throwAwayCount++;
							macroOpIndex = currentInstruction.getInfo().getSize();
							if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl;
							//cycle = topCycle;
							continue;
						}
						//abort this decode buffer
						if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl;
						currentInstruction = SuperscalarInstruction::Null;
						break;
					}
					if (trace) std::cout << "; src = r" << currentInstruction.getSource() << std::endl;
				}
				//find a destination register that will be ready when this instruction executes
				if (macroOpIndex == currentInstruction.getInfo().getDstOp()) {
					int forward;
					for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) {
						if (trace) std::cout << "; dst STALL at cycle " << cycle << std::endl;
						++scheduleCycle;
						++cycle;
					}
					if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away
						if (throwAwayCount < MAX_THROWAWAY_COUNT) {
							throwAwayCount++;
							macroOpIndex = currentInstruction.getInfo().getSize();
							if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl;
							//cycle = topCycle;
							continue;
						}
						//abort this decode buffer
						if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl;
						currentInstruction = SuperscalarInstruction::Null;
						break;
					}
					if (trace) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl;
				}
				throwAwayCount = 0;

				//recalculate when the instruction can be scheduled for execution based on operand availability
				scheduleCycle = scheduleMop<true>(mop, portBusy, scheduleCycle, scheduleCycle);

				//calculate when the result will be ready
				depCycle = scheduleCycle + mop.getLatency();

				//if this instruction writes the result, modify register information
				//  RegisterInfo.latency - which cycle the register will be ready
				//  RegisterInfo.lastOpGroup - the last operation that was applied to the register
				//  RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register)
				if (macroOpIndex == currentInstruction.getInfo().getResultOp()) {
					int dst = currentInstruction.getDestination();
					RegisterInfo& ri = registers[dst];
					retireCycle = depCycle;
					ri.latency = retireCycle;
					ri.lastOpGroup = currentInstruction.getGroup();
					ri.lastOpPar = currentInstruction.getGroupPar();
					if (trace) std::cout << "; RETIRED at cycle " << retireCycle << std::endl;
				}
				codeSize += mop.getSize();
				bufferIndex++;
				macroOpIndex++;
				macroOpCount++;

				//terminating condition
				if (scheduleCycle >= RANDOMX_SUPERSCALAR_LATENCY) {
					portsSaturated = true;
				}
				cycle = topCycle;

				//when all macro-ops of the current instruction have been issued, add the instruction into the program
				if (macroOpIndex >= currentInstruction.getInfo().getSize()) {
					currentInstruction.toInstr(prog(programSize++));
					mulCount += isMultiplication(currentInstruction.getType());
				}
			}
			++cycle;
		}

		double ipc = (macroOpCount / (double)retireCycle);

		memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies));

		//Calculate ASIC latency:
		//Assumes 1 cycle latency for all operations and unlimited parallelization.
		for (int i = 0; i < programSize; ++i) {
			Instruction& instr = prog(i);
			int latDst = prog.asicLatencies[instr.dst] + 1;
			int latSrc = instr.dst != instr.src ? prog.asicLatencies[instr.src] + 1 : 0;
			prog.asicLatencies[instr.dst] = std::max(latDst, latSrc);
		}

		//address register is the register with the highest ASIC latency
		int asicLatencyMax = 0;
		int addressReg = 0;
		for (int i = 0; i < 8; ++i) {
			if (prog.asicLatencies[i] > asicLatencyMax) {
				asicLatencyMax = prog.asicLatencies[i];
				addressReg = i;
			}
			prog.cpuLatencies[i] = registers[i].latency;
		}

		prog.setSize(programSize);
		prog.setAddressRegister(addressReg);

		prog.cpuLatency = retireCycle;
		prog.asicLatency = asicLatencyMax;
		prog.codeSize = codeSize;
		prog.macroOps = macroOpCount;
		prog.decodeCycles = decodeCycle;
		prog.ipc = ipc;
		prog.mulCount = mulCount;


		/*if(INFO) std::cout << "; ALU port utilization:" << std::endl;
		if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl;

		int portCycles = 0;
		for (int i = 0; i < CYCLE_MAP_SIZE; ++i) {
			std::cout << "; " << std::setw(3) << i << " ";
			for (int j = 0; j < 3; ++j) {
				std::cout << (portBusy[i][j] ? '*' : '_');
				portCycles += !!portBusy[i][j];
			}
			std::cout << std::endl;
		}*/
	}

	void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t> *reciprocals) {
		for (unsigned j = 0; j < prog.getSize(); ++j) {
			Instruction& instr = prog(j);
			switch (instr.opcode)
			{
			case randomx::SuperscalarInstructionType::ISUB_R:
				r[instr.dst] -= r[instr.src];
				break;
			case randomx::SuperscalarInstructionType::IXOR_R:
				r[instr.dst] ^= r[instr.src];
				break;
			case randomx::SuperscalarInstructionType::IADD_RS:
				r[instr.dst] += r[instr.src] << instr.getModMem();
				break;
			case randomx::SuperscalarInstructionType::IMUL_R:
				r[instr.dst] *= r[instr.src];
				break;
			case randomx::SuperscalarInstructionType::IROR_C:
				r[instr.dst] = rotr(r[instr.dst], instr.getImm32());
				break;
			case randomx::SuperscalarInstructionType::IADD_C7:
			case randomx::SuperscalarInstructionType::IADD_C8:
			case randomx::SuperscalarInstructionType::IADD_C9:
				r[instr.dst] += signExtend2sCompl(instr.getImm32());
				break;
			case randomx::SuperscalarInstructionType::IXOR_C7:
			case randomx::SuperscalarInstructionType::IXOR_C8:
			case randomx::SuperscalarInstructionType::IXOR_C9:
				r[instr.dst] ^= signExtend2sCompl(instr.getImm32());
				break;
			case randomx::SuperscalarInstructionType::IMULH_R:
				r[instr.dst] = mulh(r[instr.dst], r[instr.src]);
				break;
			case randomx::SuperscalarInstructionType::ISMULH_R:
				r[instr.dst] = smulh(r[instr.dst], r[instr.src]);
				break;
			case randomx::SuperscalarInstructionType::IMUL_RCP:
				if (reciprocals != nullptr)
					r[instr.dst] *= (*reciprocals)[instr.getImm32()];
				else
					r[instr.dst] *= randomx_reciprocal(instr.getImm32());
				break;
			default:
				UNREACHABLE;
			}
		}
	}
}