2019-03-28 14:27:10 +00:00
/*
2019-05-18 12:21:47 +00:00
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
2019-03-28 14:27:10 +00:00
2019-05-18 12:21:47 +00:00
All rights reserved.
2019-03-28 14:27:10 +00:00
2019-05-18 12:21:47 +00:00
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
	* Redistributions of source code must retain the above copyright
	  notice, this list of conditions and the following disclaimer.
	* Redistributions in binary form must reproduce the above copyright
	  notice, this list of conditions and the following disclaimer in the
	  documentation and/or other materials provided with the distribution.
	* Neither the name of the copyright holder nor the
	  names of its contributors may be used to endorse or promote products
	  derived from this software without specific prior written permission.
2019-03-28 14:27:10 +00:00
2019-05-18 12:21:47 +00:00
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2019-03-28 14:27:10 +00:00
*/
# include "configuration.h"
2019-04-21 12:07:32 +00:00
# include "program.hpp"
2019-04-10 22:01:22 +00:00
# include "blake2/endian.h"
2019-03-28 14:27:10 +00:00
# include <iostream>
2019-03-31 11:32:16 +00:00
# include <vector>
2019-03-31 19:22:36 +00:00
# include <algorithm>
# include <stdexcept>
2019-03-31 22:38:17 +00:00
# include <iomanip>
2019-04-20 14:53:06 +00:00
# include "superscalar.hpp"
# include "intrin_portable.h"
2019-04-20 09:08:01 +00:00
# include "reciprocal.h"
2019-03-28 14:27:10 +00:00
2019-04-20 09:08:01 +00:00
namespace randomx {
2019-03-28 14:27:10 +00:00
2019-04-12 11:32:22 +00:00
static bool isMultiplication ( int type ) {
return type = = SuperscalarInstructionType : : IMUL_R | | type = = SuperscalarInstructionType : : IMULH_R | | type = = SuperscalarInstructionType : : ISMULH_R | | type = = SuperscalarInstructionType : : IMUL_RCP ;
2019-04-03 07:53:25 +00:00
}
2019-04-12 17:36:08 +00:00
//uOPs (micro-ops) are represented only by the execution port they can go to.
//Every port is a distinct bit, so a uOP that may be issued to several ports is
//described by the bitwise OR of those ports. 'Null' means "no port at all".
namespace ExecutionPort {
	using type = int;
	constexpr type Null = 0;
	constexpr type P0 = 1;
	constexpr type P1 = 2;
	constexpr type P5 = 4;
	constexpr type P01 = P0 | P1;
	constexpr type P05 = P0 | P5;
	constexpr type P015 = P0 | P1 | P5;
}
2019-04-12 17:36:08 +00:00
//Macro-operation as output of the x86 decoder
//Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op
//Macro-op can consist of 1 or 2 uOPs.
2019-03-31 11:32:16 +00:00
class MacroOp {
public :
MacroOp ( const char * name , int size )
: name_ ( name ) , size_ ( size ) , latency_ ( 0 ) , uop1_ ( ExecutionPort : : Null ) , uop2_ ( ExecutionPort : : Null ) { }
MacroOp ( const char * name , int size , int latency , ExecutionPort : : type uop )
: name_ ( name ) , size_ ( size ) , latency_ ( latency ) , uop1_ ( uop ) , uop2_ ( ExecutionPort : : Null ) { }
MacroOp ( const char * name , int size , int latency , ExecutionPort : : type uop1 , ExecutionPort : : type uop2 )
: name_ ( name ) , size_ ( size ) , latency_ ( latency ) , uop1_ ( uop1 ) , uop2_ ( uop2 ) { }
2019-03-31 19:22:36 +00:00
MacroOp ( const MacroOp & parent , bool dependent )
: name_ ( parent . name_ ) , size_ ( parent . size_ ) , latency_ ( parent . latency_ ) , uop1_ ( parent . uop1_ ) , uop2_ ( parent . uop2_ ) , dependent_ ( dependent ) { }
2019-03-31 11:32:16 +00:00
const char * getName ( ) const {
return name_ ;
}
int getSize ( ) const {
return size_ ;
}
int getLatency ( ) const {
return latency_ ;
}
ExecutionPort : : type getUop1 ( ) const {
return uop1_ ;
}
ExecutionPort : : type getUop2 ( ) const {
return uop2_ ;
}
bool isSimple ( ) const {
return uop2_ = = ExecutionPort : : Null ;
}
bool isEliminated ( ) const {
return uop1_ = = ExecutionPort : : Null ;
}
2019-03-31 19:22:36 +00:00
bool isDependent ( ) const {
return dependent_ ;
}
2019-03-31 11:32:16 +00:00
static const MacroOp Add_rr ;
static const MacroOp Add_ri ;
static const MacroOp Lea_sib ;
static const MacroOp Sub_rr ;
static const MacroOp Imul_rr ;
static const MacroOp Imul_r ;
static const MacroOp Mul_r ;
static const MacroOp Mov_rr ;
static const MacroOp Mov_ri64 ;
static const MacroOp Xor_rr ;
static const MacroOp Xor_ri ;
static const MacroOp Ror_rcl ;
static const MacroOp Ror_ri ;
2019-04-06 10:00:56 +00:00
static const MacroOp TestJz_fused ;
2019-03-31 11:32:16 +00:00
static const MacroOp Xor_self ;
static const MacroOp Cmp_ri ;
static const MacroOp Setcc_r ;
private :
const char * name_ ;
int size_ ;
int latency_ ;
ExecutionPort : : type uop1_ ;
ExecutionPort : : type uop2_ ;
2019-03-31 19:22:36 +00:00
bool dependent_ = false ;
2019-03-31 11:32:16 +00:00
} ;
2019-04-11 22:02:22 +00:00
//Definitions of the predefined macro-ops.
//Arguments are: name, size in bytes, latency, uOP 1 [, uOP 2].
//The two-argument form (name, size) creates a macro-op without uOPs (eliminated).

//Size: 3 bytes
const MacroOp MacroOp::Add_rr = MacroOp(" add r,r ", 3, 1, ExecutionPort::P015);
const MacroOp MacroOp::Sub_rr = MacroOp(" sub r,r ", 3, 1, ExecutionPort::P015);
const MacroOp MacroOp::Xor_rr = MacroOp(" xor r,r ", 3, 1, ExecutionPort::P015);
const MacroOp MacroOp::Imul_r = MacroOp(" imul r ", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
const MacroOp MacroOp::Mul_r = MacroOp(" mul r ", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
const MacroOp MacroOp::Mov_rr = MacroOp(" mov r,r ", 3);

//Size: 4 bytes
const MacroOp MacroOp::Lea_sib = MacroOp(" lea r,r+r*s ", 4, 1, ExecutionPort::P01);
const MacroOp MacroOp::Imul_rr = MacroOp(" imul r,r ", 4, 3, ExecutionPort::P1);
const MacroOp MacroOp::Ror_ri = MacroOp(" ror r,i ", 4, 1, ExecutionPort::P05);

//Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes)
const MacroOp MacroOp::Add_ri = MacroOp(" add r,i ", 7, 1, ExecutionPort::P015);
const MacroOp MacroOp::Xor_ri = MacroOp(" xor r,i ", 7, 1, ExecutionPort::P015);

//Size: 10 bytes
const MacroOp MacroOp::Mov_ri64 = MacroOp(" mov rax,i64 ", 10, 1, ExecutionPort::P015);

//Unused:
const MacroOp MacroOp::Ror_rcl = MacroOp(" ror r,cl ", 3, 1, ExecutionPort::P0, ExecutionPort::P5);
const MacroOp MacroOp::Xor_self = MacroOp(" xor rcx,rcx ", 3);
const MacroOp MacroOp::Cmp_ri = MacroOp(" cmp r,i ", 7, 1, ExecutionPort::P015);
const MacroOp MacroOp::Setcc_r = MacroOp(" setcc cl ", 3, 1, ExecutionPort::P05);
const MacroOp MacroOp::TestJz_fused = MacroOp(" testjz r,i ", 13, 0, ExecutionPort::P5);
2019-03-31 11:32:16 +00:00
2019-03-31 19:22:36 +00:00
//Macro-op sequences of the instructions that consist of more than one macro-op.
//These arrays copy the MacroOp constants, so they must stay below the MacroOp definitions
//(objects in one translation unit are initialized in order of definition).
const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr };
const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr };
//The multiplication is flagged as dependent on the preceding 64-bit immediate load.
const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) };
2019-03-31 11:32:16 +00:00
2019-04-12 17:36:08 +00:00
class SuperscalarInstructionInfo {
2019-03-31 11:32:16 +00:00
public :
const char * getName ( ) const {
return name_ ;
}
int getSize ( ) const {
2019-03-31 19:22:36 +00:00
return ops_ . size ( ) ;
2019-03-31 11:32:16 +00:00
}
bool isSimple ( ) const {
2019-03-31 19:22:36 +00:00
return getSize ( ) = = 1 ;
2019-03-31 11:32:16 +00:00
}
int getLatency ( ) const {
return latency_ ;
}
2019-04-11 22:02:22 +00:00
const MacroOp & getOp ( int index ) const {
2019-03-31 19:22:36 +00:00
return ops_ [ index ] ;
}
int getType ( ) const {
return type_ ;
2019-03-31 11:32:16 +00:00
}
2019-03-31 22:38:17 +00:00
int getResultOp ( ) const {
return resultOp_ ;
}
int getDstOp ( ) const {
return dstOp_ ;
}
int getSrcOp ( ) const {
return srcOp_ ;
}
2019-04-12 17:36:08 +00:00
static const SuperscalarInstructionInfo ISUB_R ;
static const SuperscalarInstructionInfo IXOR_R ;
static const SuperscalarInstructionInfo IADD_RS ;
static const SuperscalarInstructionInfo IMUL_R ;
static const SuperscalarInstructionInfo IROR_C ;
static const SuperscalarInstructionInfo IADD_C7 ;
static const SuperscalarInstructionInfo IXOR_C7 ;
static const SuperscalarInstructionInfo IADD_C8 ;
static const SuperscalarInstructionInfo IXOR_C8 ;
static const SuperscalarInstructionInfo IADD_C9 ;
static const SuperscalarInstructionInfo IXOR_C9 ;
static const SuperscalarInstructionInfo IMULH_R ;
static const SuperscalarInstructionInfo ISMULH_R ;
static const SuperscalarInstructionInfo IMUL_RCP ;
static const SuperscalarInstructionInfo NOP ;
2019-03-31 11:32:16 +00:00
private :
const char * name_ ;
2019-03-31 19:22:36 +00:00
int type_ ;
std : : vector < MacroOp > ops_ ;
2019-03-31 11:32:16 +00:00
int latency_ ;
2019-03-31 22:38:17 +00:00
int resultOp_ = 0 ;
int dstOp_ = 0 ;
2019-04-01 16:31:02 +00:00
int srcOp_ ;
2019-03-31 11:32:16 +00:00
2019-04-12 17:36:08 +00:00
SuperscalarInstructionInfo ( const char * name )
2019-03-31 19:22:36 +00:00
: name_ ( name ) , type_ ( - 1 ) , latency_ ( 0 ) { }
2019-04-12 17:36:08 +00:00
SuperscalarInstructionInfo ( const char * name , int type , const MacroOp & op , int srcOp )
2019-04-11 22:02:22 +00:00
: name_ ( name ) , type_ ( type ) , latency_ ( op . getLatency ( ) ) , srcOp_ ( srcOp ) {
ops_ . push_back ( MacroOp ( op ) ) ;
}
template < size_t N >
2019-04-12 17:36:08 +00:00
SuperscalarInstructionInfo ( const char * name , int type , const MacroOp ( & arr ) [ N ] , int resultOp , int dstOp , int srcOp )
2019-04-11 22:02:22 +00:00
: name_ ( name ) , type_ ( type ) , latency_ ( 0 ) , resultOp_ ( resultOp ) , dstOp_ ( dstOp ) , srcOp_ ( srcOp ) {
for ( unsigned i = 0 ; i < N ; + + i ) {
ops_ . push_back ( MacroOp ( arr [ i ] ) ) ;
latency_ + = ops_ . back ( ) . getLatency ( ) ;
}
static_assert ( N > 1 , " Invalid array size " ) ;
}
2019-03-31 11:32:16 +00:00
} ;
2019-04-12 17:36:08 +00:00
//Definitions of the predefined instruction descriptions.
//Single macro-op form: (name, type, macro-op, srcOp); srcOp is -1 for the instructions
//that take a constant instead of a source register.
const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo(" ISUB_R ", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo(" IXOR_R ", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo(" IADD_RS ", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo(" IMUL_R ", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo(" IROR_C ", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1);
//The C7/C8/C9 variants share the same 7-byte macro-op and differ only in their type.
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo(" IADD_C7 ", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo(" IXOR_C7 ", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo(" IADD_C8 ", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo(" IXOR_C8 ", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo(" IADD_C9 ", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo(" IXOR_C9 ", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1);
//Multi macro-op form: (name, type, macro-op array, resultOp, dstOp, srcOp).
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo(" IMULH_R ", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo(" ISMULH_R ", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo(" IMUL_RCP ", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1);

//Placeholder without macro-ops.
const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo(" NOP ");
//A selection of ways to split one 16-byte window into 3 or 4 x86 instructions.
//Every table lists the slot sizes in bytes, in order.
//Native instruction sizes used by RandomX are 3 bytes (sub, xor, mul, mov), 4 bytes (lea, mul),
//7 bytes (xor/add with an immediate) and 10 bytes (mov with a 64-bit immediate).
//A slot of 8 or 9 bytes has to be padded with a nop instruction.
const int buffer0[3] = { 4, 8, 4 };
const int buffer1[4] = { 7, 3, 3, 3 };
const int buffer2[4] = { 3, 7, 3, 3 };
const int buffer3[3] = { 4, 9, 3 };
const int buffer4[4] = { 4, 4, 4, 4 };
const int buffer5[3] = { 3, 3, 10 };
2019-03-31 11:32:16 +00:00
class DecoderBuffer {
public :
2019-04-06 10:00:56 +00:00
static const DecoderBuffer Default ;
2019-03-31 11:32:16 +00:00
template < size_t N >
DecoderBuffer ( const char * name , int index , const int ( & arr ) [ N ] )
: name_ ( name ) , index_ ( index ) , counts_ ( arr ) , opsCount_ ( N ) { }
const int * getCounts ( ) const {
return counts_ ;
}
int getSize ( ) const {
return opsCount_ ;
}
int getIndex ( ) const {
return index_ ;
}
const char * getName ( ) const {
return name_ ;
}
2019-04-06 15:07:40 +00:00
const DecoderBuffer * fetchNext ( int instrType , int cycle , int mulCount , Blake2Generator & gen ) const {
2019-04-11 22:02:22 +00:00
//If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10
//because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs.
//Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops.
2019-04-12 11:32:22 +00:00
if ( instrType = = SuperscalarInstructionType : : IMULH_R | | instrType = = SuperscalarInstructionType : : ISMULH_R )
2019-04-11 22:02:22 +00:00
return & decodeBuffer3310 ;
//To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications
//is lower than the number of cycles.
2019-04-07 13:38:51 +00:00
if ( mulCount < cycle + 1 )
return & decodeBuffer4444 ;
2019-04-11 22:02:22 +00:00
//If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication.
2019-04-12 11:32:22 +00:00
if ( instrType = = SuperscalarInstructionType : : IMUL_RCP )
2019-04-07 13:38:51 +00:00
return ( gen . getByte ( ) & 1 ) ? & decodeBuffer484 : & decodeBuffer493 ;
2019-04-11 22:02:22 +00:00
//Default: select a random fetch configuration.
2019-03-31 11:32:16 +00:00
return fetchNextDefault ( gen ) ;
}
private :
const char * name_ ;
int index_ ;
const int * counts_ ;
int opsCount_ ;
DecoderBuffer ( ) : index_ ( - 1 ) { }
2019-04-07 13:38:51 +00:00
static const DecoderBuffer decodeBuffer484 ;
2019-04-03 12:06:59 +00:00
static const DecoderBuffer decodeBuffer7333 ;
static const DecoderBuffer decodeBuffer3733 ;
2019-04-07 13:38:51 +00:00
static const DecoderBuffer decodeBuffer493 ;
static const DecoderBuffer decodeBuffer4444 ;
static const DecoderBuffer decodeBuffer3310 ;
static const DecoderBuffer * decodeBuffers [ 4 ] ;
2019-04-06 10:00:56 +00:00
const DecoderBuffer * fetchNextDefault ( Blake2Generator & gen ) const {
2019-04-07 13:38:51 +00:00
return decodeBuffers [ gen . getByte ( ) & 3 ] ;
2019-03-31 11:32:16 +00:00
}
} ;
2019-04-07 13:38:51 +00:00
//Definitions of the decode window configurations: (name, index, slot size table).
//Index 4 (the 4-4-4-4 configuration) is what SuperscalarInstruction::createForSlot
//compares its 'fetchType' argument against.
const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer(" 4,8,4 ", 0, buffer0);
const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer(" 7,3,3,3 ", 1, buffer1);
const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer(" 3,7,3,3 ", 2, buffer2);
const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer(" 4,9,3 ", 3, buffer3);
const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer(" 4,4,4,4 ", 4, buffer4);
const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer(" 3,3,10 ", 5, buffer5);

//Configurations eligible for the random default selection (indexed with 2 random bits).
//The 4-4-4-4 and 3-3-10 configurations are not listed here; fetchNext returns them directly.
const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = {
	&DecoderBuffer::decodeBuffer484,
	&DecoderBuffer::decodeBuffer7333,
	&DecoderBuffer::decodeBuffer3733,
	&DecoderBuffer::decodeBuffer493,
};

const DecoderBuffer DecoderBuffer::Default = DecoderBuffer();
2019-03-31 11:32:16 +00:00
2019-04-12 17:36:08 +00:00
//Instructions that can be placed into a slot of the given size (in bytes).
//The 2-entry tables are indexed with 1 random bit, slot_3L with 2 random bits,
//so the number of entries must stay a power of two matching the mask used by the caller.
const SuperscalarInstructionInfo* slot_3[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R };
//3-byte slot that is the last one in the buffer: the "IMULH" instructions are allowed as well
const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R };
const SuperscalarInstructionInfo* slot_4[] = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS };
const SuperscalarInstructionInfo* slot_7[] = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 };
const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 };
const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 };
//a 10-byte slot can only hold IMUL_RCP
const SuperscalarInstructionInfo* slot_10 = &SuperscalarInstructionInfo::IMUL_RCP;
2019-03-31 19:22:36 +00:00
2019-03-31 22:38:17 +00:00
static bool selectRegister ( std : : vector < int > & availableRegisters , Blake2Generator & gen , int & reg ) {
2019-03-31 19:22:36 +00:00
int index ;
2019-03-31 22:38:17 +00:00
if ( availableRegisters . size ( ) = = 0 )
return false ;
2019-03-31 19:22:36 +00:00
if ( availableRegisters . size ( ) > 1 ) {
index = gen . getInt32 ( ) % availableRegisters . size ( ) ;
}
else {
index = 0 ;
}
2019-03-31 22:38:17 +00:00
reg = availableRegisters [ index ] ;
return true ;
2019-03-31 19:22:36 +00:00
}
2019-03-31 11:32:16 +00:00
2019-04-12 17:36:08 +00:00
//Bookkeeping kept for each register while a program is being generated.
class RegisterInfo {
public:
	RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {}
	//compared against the current cycle to decide whether the register is ready
	int latency;
	//group of the last instruction applied to the register (-1 = none yet)
	int lastOpGroup;
	//group parameter of the last instruction applied to the register (-1 = none yet)
	int lastOpPar;
	int value;
};
//"SuperscalarInstruction" consists of one or more macro-ops
class SuperscalarInstruction {
public :
void toInstr ( Instruction & instr ) { //translate to a RandomX instruction format
2019-04-07 13:38:51 +00:00
instr . opcode = getType ( ) ;
2019-03-31 11:32:16 +00:00
instr . dst = dst_ ;
instr . src = src_ > = 0 ? src_ : dst_ ;
2019-04-16 16:58:44 +00:00
instr . setMod ( mod_ ) ;
2019-03-31 11:32:16 +00:00
instr . setImm32 ( imm32_ ) ;
}
2019-04-11 22:02:22 +00:00
void createForSlot ( Blake2Generator & gen , int slotSize , int fetchType , bool isLast , bool isFirst ) {
2019-03-31 11:32:16 +00:00
switch ( slotSize )
{
case 3 :
2019-04-12 11:32:22 +00:00
//if this is the last slot, we can also select "IMULH" instructions
2019-03-31 11:32:16 +00:00
if ( isLast ) {
2019-04-11 22:02:22 +00:00
create ( slot_3L [ gen . getByte ( ) & 3 ] , gen ) ;
2019-03-31 11:32:16 +00:00
}
else {
2019-04-11 22:02:22 +00:00
create ( slot_3 [ gen . getByte ( ) & 1 ] , gen ) ;
2019-03-31 11:32:16 +00:00
}
2019-04-11 22:02:22 +00:00
break ;
2019-03-31 11:32:16 +00:00
case 4 :
2019-04-12 11:32:22 +00:00
//if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions
2019-04-07 13:38:51 +00:00
if ( fetchType = = 4 & & ! isLast ) {
2019-04-12 17:36:08 +00:00
create ( & SuperscalarInstructionInfo : : IMUL_R , gen ) ;
2019-04-06 15:07:40 +00:00
}
else {
2019-04-11 22:02:22 +00:00
create ( slot_4 [ gen . getByte ( ) & 1 ] , gen ) ;
2019-04-06 15:07:40 +00:00
}
2019-04-11 22:02:22 +00:00
break ;
2019-03-31 11:32:16 +00:00
case 7 :
2019-04-11 22:02:22 +00:00
create ( slot_7 [ gen . getByte ( ) & 1 ] , gen ) ;
break ;
2019-04-07 13:38:51 +00:00
case 8 :
2019-04-11 22:02:22 +00:00
create ( slot_8 [ gen . getByte ( ) & 1 ] , gen ) ;
break ;
2019-04-07 13:38:51 +00:00
case 9 :
2019-04-11 22:02:22 +00:00
create ( slot_9 [ gen . getByte ( ) & 1 ] , gen ) ;
break ;
2019-03-31 11:32:16 +00:00
case 10 :
2019-04-11 22:02:22 +00:00
create ( slot_10 , gen ) ;
break ;
2019-03-31 11:32:16 +00:00
default :
2019-04-11 22:02:22 +00:00
UNREACHABLE ;
2019-03-31 11:32:16 +00:00
}
}
2019-04-12 17:36:08 +00:00
void create ( const SuperscalarInstructionInfo * info , Blake2Generator & gen ) {
2019-04-11 22:02:22 +00:00
info_ = info ;
reset ( ) ;
2019-03-31 19:22:36 +00:00
switch ( info - > getType ( ) )
2019-03-31 11:32:16 +00:00
{
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : ISUB_R : {
2019-04-11 22:02:22 +00:00
mod_ = 0 ;
imm32_ = 0 ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IADD_RS ;
2019-04-11 22:02:22 +00:00
groupParIsSource_ = true ;
2019-03-31 11:32:16 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : IXOR_R : {
2019-04-11 22:02:22 +00:00
mod_ = 0 ;
imm32_ = 0 ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IXOR_R ;
2019-04-11 22:02:22 +00:00
groupParIsSource_ = true ;
2019-03-31 11:32:16 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : IADD_RS : {
2019-04-11 22:02:22 +00:00
mod_ = gen . getByte ( ) ;
imm32_ = 0 ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IADD_RS ;
2019-04-11 22:02:22 +00:00
groupParIsSource_ = true ;
2019-03-31 11:32:16 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : IMUL_R : {
2019-04-11 22:02:22 +00:00
mod_ = 0 ;
imm32_ = 0 ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IMUL_R ;
2019-04-12 17:36:08 +00:00
groupParIsSource_ = true ;
2019-04-07 13:38:51 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : IROR_C : {
2019-04-11 22:02:22 +00:00
mod_ = 0 ;
2019-04-07 13:38:51 +00:00
do {
2019-04-11 22:02:22 +00:00
imm32_ = gen . getByte ( ) & 63 ;
} while ( imm32_ = = 0 ) ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IROR_C ;
2019-04-11 22:02:22 +00:00
opGroupPar_ = - 1 ;
2019-03-31 11:32:16 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : IADD_C7 :
case SuperscalarInstructionType : : IADD_C8 :
case SuperscalarInstructionType : : IADD_C9 : {
2019-04-11 22:02:22 +00:00
mod_ = 0 ;
imm32_ = gen . getInt32 ( ) ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IADD_C7 ;
2019-04-11 22:02:22 +00:00
opGroupPar_ = - 1 ;
2019-04-07 13:38:51 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : IXOR_C7 :
case SuperscalarInstructionType : : IXOR_C8 :
case SuperscalarInstructionType : : IXOR_C9 : {
2019-04-11 22:02:22 +00:00
mod_ = 0 ;
imm32_ = gen . getInt32 ( ) ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IXOR_C7 ;
2019-04-11 22:02:22 +00:00
opGroupPar_ = - 1 ;
2019-03-31 11:32:16 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : IMULH_R : {
2019-04-11 22:02:22 +00:00
canReuse_ = true ;
mod_ = 0 ;
imm32_ = 0 ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IMULH_R ;
2019-04-11 22:02:22 +00:00
opGroupPar_ = gen . getInt32 ( ) ;
2019-03-31 11:32:16 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : ISMULH_R : {
2019-04-11 22:02:22 +00:00
canReuse_ = true ;
mod_ = 0 ;
imm32_ = 0 ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : ISMULH_R ;
2019-04-11 22:02:22 +00:00
opGroupPar_ = gen . getInt32 ( ) ;
2019-03-31 11:32:16 +00:00
} break ;
2019-04-12 11:32:22 +00:00
case SuperscalarInstructionType : : IMUL_RCP : {
2019-04-11 22:02:22 +00:00
mod_ = 0 ;
2019-04-01 16:31:02 +00:00
do {
2019-04-11 22:02:22 +00:00
imm32_ = gen . getInt32 ( ) ;
} while ( ( imm32_ & ( imm32_ - 1 ) ) = = 0 ) ;
2019-04-12 11:32:22 +00:00
opGroup_ = SuperscalarInstructionType : : IMUL_RCP ;
2019-04-11 22:02:22 +00:00
opGroupPar_ = - 1 ;
2019-03-31 11:32:16 +00:00
} break ;
default :
break ;
}
}
2019-04-12 17:36:08 +00:00
bool selectDestination ( int cycle , bool allowChainedMul , RegisterInfo ( & registers ) [ 8 ] , Blake2Generator & gen ) {
/*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R)
std : : cout < < " Selecting destination with chained MUL enabled " < < std : : endl ; */
2019-03-31 22:38:17 +00:00
std : : vector < int > availableRegisters ;
2019-04-12 12:56:20 +00:00
//Conditions for the destination register:
// * value must be ready at the required cycle
// * cannot be the same as the source register unless the instruction allows it
// - this avoids optimizable instructions such as "xor r, r" or "sub r, r"
2019-04-12 17:36:08 +00:00
// * register cannot be multiplied twice in a row unless allowChainedMul is true
// - this avoids accumulation of trailing zeroes in registers due to excessive multiplication
// - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator)
2019-04-12 12:56:20 +00:00
// * either the last instruction applied to the register or its source must be different than this instruction
// - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2"
// * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction)
2019-03-31 22:38:17 +00:00
for ( unsigned i = 0 ; i < 8 ; + + i ) {
2019-04-14 15:21:26 +00:00
if ( registers [ i ] . latency < = cycle & & ( canReuse_ | | i ! = src_ ) & & ( allowChainedMul | | opGroup_ ! = SuperscalarInstructionType : : IMUL_R | | registers [ i ] . lastOpGroup ! = SuperscalarInstructionType : : IMUL_R ) & & ( registers [ i ] . lastOpGroup ! = opGroup_ | | registers [ i ] . lastOpPar ! = opGroupPar_ ) & & ( info_ - > getType ( ) ! = SuperscalarInstructionType : : IADD_RS | | i ! = RegisterNeedsDisplacement ) )
2019-03-31 22:38:17 +00:00
availableRegisters . push_back ( i ) ;
}
return selectRegister ( availableRegisters , gen , dst_ ) ;
}
bool selectSource ( int cycle , RegisterInfo ( & registers ) [ 8 ] , Blake2Generator & gen ) {
std : : vector < int > availableRegisters ;
2019-04-12 12:56:20 +00:00
//all registers that are ready at the cycle
2019-03-31 22:38:17 +00:00
for ( unsigned i = 0 ; i < 8 ; + + i ) {
2019-04-01 16:31:02 +00:00
if ( registers [ i ] . latency < = cycle )
2019-03-31 22:38:17 +00:00
availableRegisters . push_back ( i ) ;
}
2019-04-12 12:56:20 +00:00
//if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination
2019-04-12 11:32:22 +00:00
if ( availableRegisters . size ( ) = = 2 & & info_ - > getType ( ) = = SuperscalarInstructionType : : IADD_RS ) {
2019-04-14 15:21:26 +00:00
if ( availableRegisters [ 0 ] = = RegisterNeedsDisplacement | | availableRegisters [ 1 ] = = RegisterNeedsDisplacement ) {
opGroupPar_ = src_ = RegisterNeedsDisplacement ;
2019-04-06 10:00:56 +00:00
return true ;
}
}
2019-04-01 16:31:02 +00:00
if ( selectRegister ( availableRegisters , gen , src_ ) ) {
if ( groupParIsSource_ )
opGroupPar_ = src_ ;
return true ;
}
return false ;
2019-03-31 22:38:17 +00:00
}
2019-03-31 11:32:16 +00:00
int getType ( ) {
2019-04-11 22:02:22 +00:00
return info_ - > getType ( ) ;
2019-03-31 11:32:16 +00:00
}
int getSource ( ) {
return src_ ;
}
int getDestination ( ) {
return dst_ ;
}
int getGroup ( ) {
return opGroup_ ;
}
int getGroupPar ( ) {
return opGroupPar_ ;
}
2019-04-12 17:36:08 +00:00
const SuperscalarInstructionInfo & getInfo ( ) const {
2019-04-11 22:02:22 +00:00
return * info_ ;
2019-03-31 11:32:16 +00:00
}
2019-04-12 17:36:08 +00:00
static const SuperscalarInstruction Null ;
2019-03-31 11:32:16 +00:00
private :
2019-04-12 17:36:08 +00:00
const SuperscalarInstructionInfo * info_ ;
2019-03-31 22:38:17 +00:00
int src_ = - 1 ;
int dst_ = - 1 ;
2019-03-31 11:32:16 +00:00
int mod_ ;
uint32_t imm32_ ;
int opGroup_ ;
int opGroupPar_ ;
2019-03-31 22:38:17 +00:00
bool canReuse_ = false ;
2019-04-01 16:31:02 +00:00
bool groupParIsSource_ = false ;
2019-03-31 11:32:16 +00:00
2019-04-11 22:02:22 +00:00
void reset ( ) {
src_ = dst_ = - 1 ;
canReuse_ = groupParIsSource_ = false ;
}
2019-04-12 17:36:08 +00:00
SuperscalarInstruction ( const SuperscalarInstructionInfo * info ) : info_ ( info ) {
2019-03-31 19:22:36 +00:00
}
2019-03-31 11:32:16 +00:00
} ;
2019-04-12 17:36:08 +00:00
//Placeholder instruction that refers to the NOP description (no macro-ops).
const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP);
2019-03-31 11:32:16 +00:00
2019-04-12 17:36:08 +00:00
//Number of cycles tracked by the port occupancy map (first dimension of 'portBusy').
constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4;
//NOTE(review): the two limits below are consumed by the program generator, which is
//not fully visible here - confirm their exact meaning there.
constexpr int LOOK_FORWARD_CYCLES = 4;
constexpr int MAX_THROWAWAY_COUNT = 256;
2019-04-12 17:36:08 +00:00
2019-04-12 11:32:22 +00:00
template < bool commit >
static int scheduleUop ( ExecutionPort : : type uop , ExecutionPort : : type ( & portBusy ) [ CYCLE_MAP_SIZE ] [ 3 ] , int cycle ) {
//The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload
2019-04-12 17:36:08 +00:00
//port P1 (multiplication) by instructions that can go to any port.
2019-04-12 11:32:22 +00:00
for ( ; cycle < CYCLE_MAP_SIZE ; + + cycle ) {
if ( ( uop & ExecutionPort : : P5 ) ! = 0 & & ! portBusy [ cycle ] [ 2 ] ) {
if ( commit ) {
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; P5 at cycle " < < cycle < < std : : endl ;
2019-04-12 11:32:22 +00:00
portBusy [ cycle ] [ 2 ] = uop ;
}
return cycle ;
}
if ( ( uop & ExecutionPort : : P0 ) ! = 0 & & ! portBusy [ cycle ] [ 0 ] ) {
if ( commit ) {
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; P0 at cycle " < < cycle < < std : : endl ;
2019-04-12 11:32:22 +00:00
portBusy [ cycle ] [ 0 ] = uop ;
}
return cycle ;
}
if ( ( uop & ExecutionPort : : P1 ) ! = 0 & & ! portBusy [ cycle ] [ 1 ] ) {
if ( commit ) {
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; P1 at cycle " < < cycle < < std : : endl ;
2019-04-12 11:32:22 +00:00
portBusy [ cycle ] [ 1 ] = uop ;
}
return cycle ;
}
}
return - 1 ;
}
2019-03-28 14:27:10 +00:00
2019-04-03 07:53:25 +00:00
template < bool commit >
2019-04-12 11:32:22 +00:00
static int scheduleMop ( const MacroOp & mop , ExecutionPort : : type ( & portBusy ) [ CYCLE_MAP_SIZE ] [ 3 ] , int cycle , int depCycle ) {
//if this macro-op depends on the previous one, increase the starting cycle if needed
//this handles an explicit dependency chain in IMUL_RCP
2019-03-31 19:22:36 +00:00
if ( mop . isDependent ( ) ) {
cycle = std : : max ( cycle , depCycle ) ;
}
2019-04-12 11:32:22 +00:00
//move instructions are eliminated and don't need an execution unit
2019-03-31 19:22:36 +00:00
if ( mop . isEliminated ( ) ) {
2019-04-03 07:53:25 +00:00
if ( commit )
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; (eliminated) " < < std : : endl ;
2019-03-31 19:22:36 +00:00
return cycle ;
}
else if ( mop . isSimple ( ) ) {
2019-04-12 11:32:22 +00:00
//this macro-op has only one uOP
return scheduleUop < commit > ( mop . getUop1 ( ) , portBusy , cycle ) ;
2019-03-31 19:22:36 +00:00
}
else {
2019-04-12 11:32:22 +00:00
//macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle
2019-04-03 07:53:25 +00:00
for ( ; cycle < CYCLE_MAP_SIZE ; + + cycle ) {
2019-04-12 11:32:22 +00:00
int cycle1 = scheduleUop < false > ( mop . getUop1 ( ) , portBusy , cycle ) ;
int cycle2 = scheduleUop < false > ( mop . getUop2 ( ) , portBusy , cycle ) ;
if ( cycle1 = = cycle2 ) {
2019-04-03 07:53:25 +00:00
if ( commit ) {
2019-04-12 11:32:22 +00:00
scheduleUop < true > ( mop . getUop1 ( ) , portBusy , cycle1 ) ;
scheduleUop < true > ( mop . getUop2 ( ) , portBusy , cycle2 ) ;
2019-04-03 07:53:25 +00:00
}
2019-04-12 11:32:22 +00:00
return cycle1 ;
2019-03-31 19:22:36 +00:00
}
}
}
return - 1 ;
}
2019-04-12 17:36:08 +00:00
//Generates a superscalar program into 'prog'.
//  prog - receives the generated instructions (via prog(index)) and the statistics assigned at the end of this function
//  gen  - generator object that is passed on to fetchNext, createForSlot, selectSource and selectDestination
//The function simulates decoding and port scheduling: each iteration of the outer loop is one decode cycle,
//each iteration of the inner loop issues one macro-op into a slot of the current decode buffer.
//'portBusy' records which execution port is occupied in which cycle, 'registers' records when each of the
//8 registers will be ready and which operation was last applied to it.
void generateSuperscalar ( SuperscalarProgram & prog , Blake2Generator & gen ) {
2019-03-31 11:32:16 +00:00
2019-04-03 07:53:25 +00:00
ExecutionPort : : type portBusy [ CYCLE_MAP_SIZE ] [ 3 ] ;
2019-03-31 19:22:36 +00:00
memset ( portBusy , 0 , sizeof ( portBusy ) ) ;
2019-03-31 11:32:16 +00:00
RegisterInfo registers [ 8 ] ;
2019-04-11 22:02:22 +00:00
const DecoderBuffer * decodeBuffer = & DecoderBuffer : : Default ;
2019-04-12 17:36:08 +00:00
SuperscalarInstruction currentInstruction = SuperscalarInstruction : : Null ;
2019-04-12 11:32:22 +00:00
int macroOpIndex = 0 ;
2019-03-31 11:32:16 +00:00
int codeSize = 0 ;
int macroOpCount = 0 ;
2019-03-31 19:22:36 +00:00
int cycle = 0 ;
int depCycle = 0 ;
2019-04-03 07:53:25 +00:00
int retireCycle = 0 ;
2019-03-31 19:22:36 +00:00
bool portsSaturated = false ;
2019-04-12 11:32:22 +00:00
int programSize = 0 ;
2019-04-03 12:06:59 +00:00
int mulCount = 0 ;
2019-04-11 22:02:22 +00:00
int decodeCycle ;
2019-04-12 12:56:20 +00:00
int throwAwayCount = 0 ;
2019-03-31 11:32:16 +00:00
2019-04-11 22:02:22 +00:00
//decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated.
//Each decode cycle decodes 16 bytes of x86 code.
//Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always
//saturated first. The cycle limit is present only to guarantee loop termination.
2019-06-10 13:57:36 +00:00
//Program size is limited to SuperscalarMaxSize instructions.
for ( decodeCycle = 0 ; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY & & ! portsSaturated & & programSize < SuperscalarMaxSize ; + + decodeCycle ) {
2019-03-31 19:22:36 +00:00
2019-04-12 11:32:22 +00:00
//select a decode configuration
2019-04-11 22:02:22 +00:00
decodeBuffer = decodeBuffer - > fetchNext ( currentInstruction . getType ( ) , decodeCycle , mulCount , gen ) ;
2019-04-24 16:37:58 +00:00
//NOTE(review): the trace prints 'cycle', not 'decodeCycle'; the two can differ after a thrown-away instruction (see below) - confirm this is intended
if ( trace ) std : : cout < < " ; ------------- fetch cycle " < < cycle < < " ( " < < decodeBuffer - > getName ( ) < < " ) " < < std : : endl ;
2019-04-11 22:02:22 +00:00
int bufferIndex = 0 ;
2019-03-31 11:32:16 +00:00
2019-04-12 11:32:22 +00:00
//fill all instruction slots in the current decode buffer
2019-04-11 22:02:22 +00:00
while ( bufferIndex < decodeBuffer - > getSize ( ) ) {
2019-04-01 17:04:08 +00:00
//remember the current cycle; operand stalls below advance 'cycle' and it is restored from 'topCycle' once the macro-op has been issued
int topCycle = cycle ;
2019-04-11 22:02:22 +00:00
2019-04-12 11:32:22 +00:00
//if we have issued all macro-ops for the current RandomX instruction, create a new instruction
if ( macroOpIndex > = currentInstruction . getInfo ( ) . getSize ( ) ) {
2019-06-10 13:57:36 +00:00
if ( portsSaturated | | programSize > = SuperscalarMaxSize )
2019-04-03 07:53:25 +00:00
break ;
2019-04-12 11:32:22 +00:00
//select an instruction so that the first macro-op fits into the current slot
2019-04-11 22:02:22 +00:00
currentInstruction . createForSlot ( gen , decodeBuffer - > getCounts ( ) [ bufferIndex ] , decodeBuffer - > getIndex ( ) , decodeBuffer - > getSize ( ) = = bufferIndex + 1 , bufferIndex = = 0 ) ;
2019-04-12 11:32:22 +00:00
macroOpIndex = 0 ;
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; " < < currentInstruction . getInfo ( ) . getName ( ) < < std : : endl ;
2019-03-31 11:32:16 +00:00
}
2019-04-12 11:32:22 +00:00
const MacroOp & mop = currentInstruction . getInfo ( ) . getOp ( macroOpIndex ) ;
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < mop . getName ( ) < < " " ;
2019-04-11 22:02:22 +00:00
//calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution
2019-04-12 11:32:22 +00:00
int scheduleCycle = scheduleMop < false > ( mop , portBusy , cycle , depCycle ) ;
2019-04-03 07:53:25 +00:00
//a negative result means that no execution port is free within the cycle map; stop generating
if ( scheduleCycle < 0 ) {
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " Unable to map operation ' " < < mop . getName ( ) < < " ' to execution port (cycle " < < cycle < < " ) " < < std : : endl ;
2019-04-12 17:36:08 +00:00
//__debugbreak();
portsSaturated = true ;
break ;
2019-04-03 07:53:25 +00:00
}
2019-03-31 22:38:17 +00:00
2019-04-11 22:02:22 +00:00
//find a source register (if applicable) that will be ready when this instruction executes
2019-04-12 11:32:22 +00:00
if ( macroOpIndex = = currentInstruction . getInfo ( ) . getSrcOp ( ) ) {
2019-04-11 22:02:22 +00:00
int forward ;
//if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward
for ( forward = 0 ; forward < LOOK_FORWARD_CYCLES & & ! currentInstruction . selectSource ( scheduleCycle , registers , gen ) ; + + forward ) {
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; src STALL at cycle " < < cycle < < std : : endl ;
2019-03-31 22:38:17 +00:00
+ + scheduleCycle ;
+ + cycle ;
}
2019-04-11 22:02:22 +00:00
//if no register was found, throw the instruction away and try another one
if ( forward = = LOOK_FORWARD_CYCLES ) {
2019-04-12 12:56:20 +00:00
if ( throwAwayCount < MAX_THROWAWAY_COUNT ) {
throwAwayCount + + ;
//marking all macro-ops as issued makes the next loop iteration create a new instruction for the same slot
macroOpIndex = currentInstruction . getInfo ( ) . getSize ( ) ;
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; THROW away " < < currentInstruction . getInfo ( ) . getName ( ) < < std : : endl ;
2019-04-12 17:36:08 +00:00
//'cycle' is deliberately left advanced here (the restore is commented out), so the retry starts at the later cycle
//cycle = topCycle;
2019-04-12 12:56:20 +00:00
continue ;
}
//abort this decode buffer
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " Aborting at cycle " < < cycle < < " with decode buffer " < < decodeBuffer - > getName ( ) < < " - source registers not available for operation " < < currentInstruction . getInfo ( ) . getName ( ) < < std : : endl ;
2019-04-12 17:36:08 +00:00
currentInstruction = SuperscalarInstruction : : Null ;
2019-04-12 12:56:20 +00:00
break ;
2019-04-01 17:04:08 +00:00
}
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; src = r " < < currentInstruction . getSource ( ) < < std : : endl ;
2019-03-31 22:38:17 +00:00
}
2019-04-11 22:02:22 +00:00
//find a destination register that will be ready when this instruction executes
2019-04-12 11:32:22 +00:00
if ( macroOpIndex = = currentInstruction . getInfo ( ) . getDstOp ( ) ) {
2019-04-11 22:02:22 +00:00
int forward ;
2019-04-12 17:36:08 +00:00
//same look-forward scheme as for the source register; the second argument tells selectDestination whether instructions have already been thrown away
for ( forward = 0 ; forward < LOOK_FORWARD_CYCLES & & ! currentInstruction . selectDestination ( scheduleCycle , throwAwayCount > 0 , registers , gen ) ; + + forward ) {
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; dst STALL at cycle " < < cycle < < std : : endl ;
2019-03-31 22:38:17 +00:00
+ + scheduleCycle ;
+ + cycle ;
}
2019-04-11 22:02:22 +00:00
if ( forward = = LOOK_FORWARD_CYCLES ) { //throw instruction away
2019-04-12 12:56:20 +00:00
if ( throwAwayCount < MAX_THROWAWAY_COUNT ) {
throwAwayCount + + ;
macroOpIndex = currentInstruction . getInfo ( ) . getSize ( ) ;
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; THROW away " < < currentInstruction . getInfo ( ) . getName ( ) < < std : : endl ;
2019-04-12 17:36:08 +00:00
//cycle = topCycle;
2019-04-12 12:56:20 +00:00
continue ;
}
//abort this decode buffer
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " Aborting at cycle " < < cycle < < " with decode buffer " < < decodeBuffer - > getName ( ) < < " - destination registers not available " < < std : : endl ;
2019-04-12 17:36:08 +00:00
currentInstruction = SuperscalarInstruction : : Null ;
2019-04-12 12:56:20 +00:00
break ;
2019-04-01 17:04:08 +00:00
}
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; dst = r " < < currentInstruction . getDestination ( ) < < std : : endl ;
2019-03-31 22:38:17 +00:00
}
2019-04-12 12:56:20 +00:00
//operand selection succeeded (or was not needed for this macro-op), so the throw-away streak ends
throwAwayCount = 0 ;
2019-04-12 17:36:08 +00:00
2019-04-11 22:02:22 +00:00
//recalculate when the instruction can be scheduled for execution based on operand availability
2019-04-12 11:32:22 +00:00
//scheduleCycle is passed as both the start cycle and depCycle, so the dependency adjustment in scheduleMop has no effect here
scheduleCycle = scheduleMop < true > ( mop , portBusy , scheduleCycle , scheduleCycle ) ;
2019-04-11 22:02:22 +00:00
//calculate when the result will be ready
2019-04-01 16:31:02 +00:00
depCycle = scheduleCycle + mop . getLatency ( ) ;
2019-04-11 22:02:22 +00:00
//if this instruction writes the result, modify register information
// RegisterInfo.latency - which cycle the register will be ready
// RegisterInfo.lastOpGroup - the last operation that was applied to the register
2019-04-12 11:32:22 +00:00
// RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register)
if ( macroOpIndex = = currentInstruction . getInfo ( ) . getResultOp ( ) ) {
2019-04-01 16:31:02 +00:00
int dst = currentInstruction . getDestination ( ) ;
RegisterInfo & ri = registers [ dst ] ;
2019-04-03 07:53:25 +00:00
retireCycle = depCycle ;
ri . latency = retireCycle ;
2019-04-01 16:31:02 +00:00
ri . lastOpGroup = currentInstruction . getGroup ( ) ;
ri . lastOpPar = currentInstruction . getGroupPar ( ) ;
2019-04-24 16:37:58 +00:00
if ( trace ) std : : cout < < " ; RETIRED at cycle " < < retireCycle < < std : : endl ;
2019-03-31 22:38:17 +00:00
}
2019-03-31 19:22:36 +00:00
codeSize + = mop . getSize ( ) ;
2019-04-11 22:02:22 +00:00
bufferIndex + + ;
2019-04-12 11:32:22 +00:00
macroOpIndex + + ;
2019-03-31 11:32:16 +00:00
macroOpCount + + ;
2019-04-11 22:02:22 +00:00
//terminating condition
if ( scheduleCycle > = RANDOMX_SUPERSCALAR_LATENCY ) {
2019-03-31 19:22:36 +00:00
portsSaturated = true ;
}
2019-04-01 17:04:08 +00:00
//undo the cycle advances caused by operand stalls of this macro-op
cycle = topCycle ;
2019-04-11 22:02:22 +00:00
//when all macro-ops of the current instruction have been issued, add the instruction into the program
2019-04-12 11:32:22 +00:00
if ( macroOpIndex > = currentInstruction . getInfo ( ) . getSize ( ) ) {
currentInstruction . toInstr ( prog ( programSize + + ) ) ;
mulCount + = isMultiplication ( currentInstruction . getType ( ) ) ;
2019-04-03 07:53:25 +00:00
}
2019-03-31 19:22:36 +00:00
}
+ + cycle ;
}
2019-04-06 10:00:56 +00:00
//NOTE(review): retireCycle stays 0 if no macro-op wrote a result, which makes this a division by zero - confirm that cannot happen
double ipc = ( macroOpCount / ( double ) retireCycle ) ;
2019-03-31 22:38:17 +00:00
2019-04-12 17:36:08 +00:00
memset ( prog . asicLatencies , 0 , sizeof ( prog . asicLatencies ) ) ;
2019-04-03 07:53:25 +00:00
2019-04-11 22:02:22 +00:00
//Calculate ASIC latency:
//Assumes 1 cycle latency for all operations and unlimited parallelization.
2019-04-12 11:32:22 +00:00
for ( int i = 0 ; i < programSize ; + + i ) {
2019-04-03 07:53:25 +00:00
Instruction & instr = prog ( i ) ;
2019-04-12 17:36:08 +00:00
int latDst = prog . asicLatencies [ instr . dst ] + 1 ;
//the source latency only counts when the source is a different register than the destination
int latSrc = instr . dst ! = instr . src ? prog . asicLatencies [ instr . src ] + 1 : 0 ;
prog . asicLatencies [ instr . dst ] = std : : max ( latDst , latSrc ) ;
2019-04-03 07:53:25 +00:00
}
2019-04-11 22:02:22 +00:00
//address register is the register with the highest ASIC latency
//(the strict comparison keeps the lowest-numbered register on ties; register 0 is used if all latencies are 0)
int asicLatencyMax = 0 ;
2019-04-06 10:00:56 +00:00
int addressReg = 0 ;
for ( int i = 0 ; i < 8 ; + + i ) {
2019-04-12 17:36:08 +00:00
if ( prog . asicLatencies [ i ] > asicLatencyMax ) {
asicLatencyMax = prog . asicLatencies [ i ] ;
2019-04-06 10:00:56 +00:00
addressReg = i ;
}
2019-04-12 17:36:08 +00:00
prog . cpuLatencies [ i ] = registers [ i ] . latency ;
2019-04-03 07:53:25 +00:00
}
2019-04-12 11:32:22 +00:00
//publish the program size, the address register and the collected statistics
prog . setSize ( programSize ) ;
2019-04-06 10:00:56 +00:00
prog . setAddressRegister ( addressReg ) ;
2019-04-12 17:36:08 +00:00
prog . cpuLatency = retireCycle ;
prog . asicLatency = asicLatencyMax ;
prog . codeSize = codeSize ;
prog . macroOps = macroOpCount ;
prog . decodeCycles = decodeCycle ;
prog . ipc = ipc ;
prog . mulCount = mulCount ;
/*if(INFO) std::cout << "; ALU port utilization:" << std::endl;
if ( INFO ) std : : cout < < " ; (* = in use, _ = idle) " < < std : : endl ;
int portCycles = 0 ;
for ( int i = 0 ; i < CYCLE_MAP_SIZE ; + + i ) {
std : : cout < < " ; " < < std : : setw ( 3 ) < < i < < " " ;
for ( int j = 0 ; j < 3 ; + + j ) {
std : : cout < < ( portBusy [ i ] [ j ] ? ' * ' : ' _ ' ) ;
portCycles + = ! ! portBusy [ i ] [ j ] ;
}
std : : cout < < std : : endl ;
} */
2019-03-31 11:32:16 +00:00
}
2019-04-20 09:08:01 +00:00
void executeSuperscalar ( int_reg_t ( & r ) [ 8 ] , SuperscalarProgram & prog , std : : vector < uint64_t > * reciprocals ) {
for ( unsigned j = 0 ; j < prog . getSize ( ) ; + + j ) {
Instruction & instr = prog ( j ) ;
switch ( instr . opcode )
{
case randomx : : SuperscalarInstructionType : : ISUB_R :
r [ instr . dst ] - = r [ instr . src ] ;
break ;
case randomx : : SuperscalarInstructionType : : IXOR_R :
r [ instr . dst ] ^ = r [ instr . src ] ;
break ;
case randomx : : SuperscalarInstructionType : : IADD_RS :
2019-05-01 12:46:51 +00:00
r [ instr . dst ] + = r [ instr . src ] < < instr . getModShift ( ) ;
2019-04-20 09:08:01 +00:00
break ;
case randomx : : SuperscalarInstructionType : : IMUL_R :
r [ instr . dst ] * = r [ instr . src ] ;
break ;
case randomx : : SuperscalarInstructionType : : IROR_C :
r [ instr . dst ] = rotr ( r [ instr . dst ] , instr . getImm32 ( ) ) ;
break ;
case randomx : : SuperscalarInstructionType : : IADD_C7 :
case randomx : : SuperscalarInstructionType : : IADD_C8 :
case randomx : : SuperscalarInstructionType : : IADD_C9 :
r [ instr . dst ] + = signExtend2sCompl ( instr . getImm32 ( ) ) ;
break ;
case randomx : : SuperscalarInstructionType : : IXOR_C7 :
case randomx : : SuperscalarInstructionType : : IXOR_C8 :
case randomx : : SuperscalarInstructionType : : IXOR_C9 :
r [ instr . dst ] ^ = signExtend2sCompl ( instr . getImm32 ( ) ) ;
break ;
case randomx : : SuperscalarInstructionType : : IMULH_R :
r [ instr . dst ] = mulh ( r [ instr . dst ] , r [ instr . src ] ) ;
break ;
case randomx : : SuperscalarInstructionType : : ISMULH_R :
r [ instr . dst ] = smulh ( r [ instr . dst ] , r [ instr . src ] ) ;
break ;
case randomx : : SuperscalarInstructionType : : IMUL_RCP :
if ( reciprocals ! = nullptr )
r [ instr . dst ] * = ( * reciprocals ) [ instr . getImm32 ( ) ] ;
else
2019-04-20 14:53:06 +00:00
r [ instr . dst ] * = randomx_reciprocal ( instr . getImm32 ( ) ) ;
2019-04-20 09:08:01 +00:00
break ;
default :
UNREACHABLE ;
}
}
}
2019-04-20 14:53:06 +00:00
}