diff --git a/makefile b/makefile
index f805724..87fef86 100644
--- a/makefile
+++ b/makefile
@@ -13,7 +13,7 @@ LDFLAGS=-lpthread
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o hashAes1Rx4.o)
ifeq ($(PLATFORM),x86_64)
- ROBJS += $(OBJDIR)/JitCompilerX86-static.o
+ ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
endif
all: release test
@@ -77,6 +77,9 @@ $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompile
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR)
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@
+$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc)) | $(OBJDIR) # asm/squareHash.inc is a prerequisite so edits to it trigger a rebuild; $(OBJDIR) is order-only
+ $(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@
+
$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@
diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
index a46fe5d..3092e4d 100644
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -72,16 +72,16 @@ namespace RandomX {
void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") {
asmCode << "\tmov " << reg << ", " << regR32[instr.src] << std::endl;
- asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
+ asmCode << "\tand " << reg << ", " << ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
}
void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) {
asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl;
- asmCode << "\tand eax" << ", " << ((instr.alt % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl;
+ asmCode << "\tand eax" << ", " << ((instr.mod % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl;
}
int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) {
- return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
+ return instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
}
//1 uOP
@@ -348,6 +348,13 @@ namespace RandomX {
}
}
+ //2 uOPs
+ void AssemblyGeneratorX86::h_ISWAP_R(Instruction& instr, int i) {
+ if (instr.src == instr.dst)
+ return; //exchanging a register with itself is a no-op, so nothing is printed
+ asmCode << "\txchg " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
+ }
+
//1 uOPs
void AssemblyGeneratorX86::h_FPSWAP_R(Instruction& instr, int i) {
asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl;
@@ -431,7 +438,7 @@ namespace RandomX {
//6 uOPs
void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) {
asmCode << "\tmov rax, " << regR[instr.src] << std::endl;
- int rotate = (13 - (instr.alt & 63)) & 63;
+ int rotate = (13 - (instr.imm32 & 63)) & 63;
if (rotate != 0)
asmCode << "\trol rax, " << rotate << std::endl;
asmCode << "\tand eax, 24576" << std::endl;
@@ -441,7 +448,7 @@ namespace RandomX {
}
static inline const char* condition(Instruction& instr, bool invert = false) {
- switch (((instr.alt >> 2) & 7) ^ invert)
+ switch (((instr.mod >> 2) & 7) ^ invert)
{
case 0:
return "be";
@@ -519,6 +526,7 @@ namespace RandomX {
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
+ INST_HANDLE(ISWAP_R)
//Common floating point
INST_HANDLE(FPSWAP_R)
diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp
index 6b0c505..a8e062c 100644
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@@ -63,6 +63,7 @@ namespace RandomX {
void h_IXOR_M(Instruction&, int);
void h_IROR_R(Instruction&, int);
void h_IROL_R(Instruction&, int);
+ void h_ISWAP_R(Instruction&, int);
void h_FPSWAP_R(Instruction&, int);
void h_FPADD_R(Instruction&, int);
void h_FPADD_M(Instruction&, int);
diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp
index f5d33d0..ebacf42 100644
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@@ -57,7 +57,7 @@ namespace RandomX {
for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) {
*(((uint32_t*)&reg) + i) = gen();
}
- FPINIT();
+ initFpu();
/*for (int i = 0; i < RegistersCount / 2; ++i) {
reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
diff --git a/src/Instruction.cpp b/src/Instruction.cpp
index 0aa0289..ce75f43 100644
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@@ -29,15 +29,15 @@ namespace RandomX {
}
void Instruction::genAddressReg(std::ostream& os) const {
- os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
+ os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
}
void Instruction::genAddressRegDst(std::ostream& os) const {
- os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)dst << "]";
+ os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)dst << "]";
}
void Instruction::genAddressImm(std::ostream& os) const {
- os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
+ os << ((mod % 4) ? "L1" : "L2") << "[" << (imm32 & ((mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
}
void Instruction::h_IADD_R(std::ostream& os) const {
@@ -211,6 +211,10 @@ namespace RandomX {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
+ void Instruction::h_ISWAP_R(std::ostream& os) const {
+ os << 'r' << static_cast<int>(dst) << ", r" << static_cast<int>(src) << std::endl; //prints the operand list as "r<dst>, r<src>"
+ }
+
void Instruction::h_FPSWAP_R(std::ostream& os) const {
const char reg = (dst >= 4) ? 'e' : 'f';
auto dstIndex = dst % 4;
@@ -280,7 +284,7 @@ namespace RandomX {
}
void Instruction::h_CFROUND(std::ostream& os) const {
- os << "r" << (int)src << ", " << (alt & 63) << std::endl;
+ os << "r" << (int)src << ", " << (imm32 & 63) << std::endl;
}
static inline const char* condition(int index) {
@@ -306,11 +310,11 @@ namespace RandomX {
}
void Instruction::h_COND_R(std::ostream& os) const {
- os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl;
+ os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl;
}
void Instruction::h_COND_M(std::ostream& os) const {
- os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(";
+ os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(";
genAddressReg(os);
os << ", " << imm32 << ")" << std::endl;
}
@@ -356,6 +360,7 @@ namespace RandomX {
INST_NAME(IXOR_M)
INST_NAME(IROR_R)
INST_NAME(IROL_R)
+ INST_NAME(ISWAP_R)
//Common floating point
INST_NAME(FPSWAP_R)
@@ -406,6 +411,7 @@ namespace RandomX {
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
+ INST_HANDLE(ISWAP_R)
//Common floating point
INST_HANDLE(FPSWAP_R)
diff --git a/src/Instruction.hpp b/src/Instruction.hpp
index ffa3880..987f326 100644
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@@ -28,12 +28,52 @@ namespace RandomX {
typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const;
+ namespace InstructionType { //numeric tags, one per instruction kind; used as case labels when switching on InstructionByteCode::type
+ constexpr int IADD_R = 0;
+ constexpr int IADD_M = 1;
+ constexpr int IADD_RC = 2;
+ constexpr int ISUB_R = 3;
+ constexpr int ISUB_M = 4;
+ constexpr int IMUL_9C = 5;
+ constexpr int IMUL_R = 6;
+ constexpr int IMUL_M = 7;
+ constexpr int IMULH_R = 8;
+ constexpr int IMULH_M = 9;
+ constexpr int ISMULH_R = 10;
+ constexpr int ISMULH_M = 11;
+ constexpr int IDIV_C = 12;
+ constexpr int ISDIV_C = 13;
+ constexpr int INEG_R = 14;
+ constexpr int IXOR_R = 15;
+ constexpr int IXOR_M = 16;
+ constexpr int IROR_R = 17;
+ constexpr int IROL_R = 18;
+ constexpr int ISWAP_R = 19; //NOTE(review): sits between IROL_R and FPSWAP_R, the same position it takes in the INST_NAME/INST_HANDLE lists - confirm the full order matches
+ constexpr int FPSWAP_R = 20;
+ constexpr int FPADD_R = 21;
+ constexpr int FPADD_M = 22;
+ constexpr int FPSUB_R = 23;
+ constexpr int FPSUB_M = 24;
+ constexpr int FPNEG_R = 25;
+ constexpr int FPMUL_R = 26;
+ constexpr int FPMUL_M = 27;
+ constexpr int FPDIV_R = 28;
+ constexpr int FPDIV_M = 29;
+ constexpr int FPSQRT_R = 30;
+ constexpr int COND_R = 31;
+ constexpr int COND_M = 32;
+ constexpr int CFROUND = 33; //the only tag the interpreter's execute() switch acts on so far
+ constexpr int ISTORE = 34;
+ constexpr int FSTORE = 35;
+ constexpr int NOP = 36;
+ }
+
class Instruction {
public:
uint8_t opcode;
uint8_t dst;
uint8_t src;
- uint8_t alt;
+ uint8_t mod;
int32_t imm32;
const char* getName() const {
return names[opcode];
@@ -70,6 +110,7 @@ namespace RandomX {
void h_IXOR_M(std::ostream&) const;
void h_IROR_R(std::ostream&) const;
void h_IROL_R(std::ostream&) const;
+ void h_ISWAP_R(std::ostream&) const;
void h_FPSWAP_R(std::ostream&) const;
void h_FPADD_R(std::ostream&) const;
void h_FPADD_M(std::ostream&) const;
diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp
index d145e78..af01183 100644
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@@ -30,6 +30,7 @@ along with RandomX. If not, see.
#include
#include
#include
+#include "intrinPortable.h"
#ifdef STATS
#include
#endif
@@ -98,7 +99,7 @@ namespace RandomX {
for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) {
*(((uint32_t*)&reg) + i) = gen();
}
- FPINIT();
+ initFpu();
for (int i = 0; i < RegistersCount; ++i) {
reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
@@ -114,24 +115,32 @@ namespace RandomX {
}
void InterpretedVirtualMachine::execute() {
- while (ic > 0) {
-#ifdef STATS
- count_instructions[pc]++;
-#endif
- auto& inst = p(pc);
- if(trace) std::cout << inst.getName() << " (" << std::dec << pc << ")" << std::endl;
- pc = (pc + 1) % ProgramLength;
- auto handler = engine[inst.opcode];
- (this->*handler)(inst);
- ic--;
+ for(int i = 0; i < InstructionCount; ++i) { //outer loop: the whole byteCode array is replayed InstructionCount times
+ for (int j = 0; j < ProgramLength; ++j) { //inner loop: visit the byteCode entries in order
+ auto& ibc = byteCode[j];
+ switch (ibc.type) //only CFROUND has a case; every other type currently passes through without effect
+ {
+ case InstructionType::CFROUND: {
+ uint64_t rcFlag = rotr(ibc.isrc->u64, ibc.imm.i32); //rotate the source value right by the immediate
+ setRoundMode(rcFlag); //NOTE(review): narrowed to uint32_t by the call; setRoundMode looks only at the low 2 bits
+ }
+ break;
+ }
+ }
}
-#ifdef STATS
- count_endstack += stack.size();
-#endif
+
}
#include "instructionWeights.hpp"
-#define INST_HANDLE(x) REPN(&InterpretedVirtualMachine::h_##x, WT(x))
+
+ void InterpretedVirtualMachine::executeInstruction(Instruction& instr) { //dispatches on the raw opcode byte of instr
+ switch (instr.opcode)
+ {
+ CASE_REP(IADD_R) //NOTE(review): CASE_REP is not defined in the visible code - presumably it expands to the case labels for IADD_R; no handler body follows yet
+
+ break;
+ }
+ }
InstructionHandler InterpretedVirtualMachine::engine[256] = {
diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp
index fba081a..2eee73d 100644
--- a/src/InterpretedVirtualMachine.hpp
+++ b/src/InterpretedVirtualMachine.hpp
@@ -33,10 +33,24 @@ namespace RandomX {
virtual std::ostream& printCxx(std::ostream&) const = 0;
};
+ struct InstructionByteCode;
class InterpretedVirtualMachine;
typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&);
+ struct alignas(64) InstructionByteCode { //pre-decoded form of one instruction; alignas(64) rounds the size up to a multiple of 64 bytes
+ convertible_t* idst; //presumably the integer destination operand - TODO confirm
+ convertible_t* isrc; //integer source operand (read as isrc->u64 by the CFROUND case in execute())
+ convertible_t imm; //immediate operand (read as imm.i32 by the CFROUND case in execute())
+ fpu_reg_t* fdst; //presumably the floating point destination operand - TODO confirm
+ fpu_reg_t* fsrc; //presumably the floating point source operand - TODO confirm
+ uint32_t condition;
+ uint32_t memMask;
+ uint32_t type; //one of the InstructionType constants; execute() switches on it
+ };
+
+ static_assert(sizeof(InstructionByteCode) == 64, "InstructionByteCode is expected to occupy exactly one 64-byte slot"); //replaces a leftover debugging constant that only existed to inspect this size
+
class InterpretedVirtualMachine : public VirtualMachine {
public:
InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {}
@@ -53,6 +67,7 @@ namespace RandomX {
static const ITransform* addressTransformations[TransformationCount];
bool softAes, asyncWorker;
Program p;
+ InstructionByteCode byteCode[ProgramLength];
std::vector stack;
uint64_t pc, ic;
const ITransform* currentTransform;
@@ -106,7 +121,7 @@ namespace RandomX {
int count_FPMUL_nop2 = 0;
int datasetAccess[256] = { 0 };
#endif
-
+ void executeInstruction(Instruction&);
convertible_t loada(Instruction&);
convertible_t loadbiashift(Instruction&);
convertible_t loadbiadiv(Instruction&);
diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp
index cf50582..d8e7a42 100644
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -176,6 +176,7 @@ namespace RandomX {
static const uint8_t JNZ[] = { 0x0f, 0x85 };
static const uint8_t JMP = 0xe9;
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
+ static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize;
@@ -248,7 +249,7 @@ namespace RandomX {
emitByte(AND_EAX_I);
else
emit(AND_ECX_I);
- emit32((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
+ emit32((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
}
void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) {
@@ -257,11 +258,11 @@ namespace RandomX {
emitByte(AND_EAX_I);
int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask;
int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask;
- emit32((instr.alt % 4) ? maskL1 : maskL2);
+ emit32((instr.mod % 4) ? maskL1 : maskL2);
}
void JitCompilerX86::genAddressImm(Instruction& instr) {
- emit32(instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask));
+ emit32(instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask));
}
void JitCompilerX86::h_IADD_R(Instruction& instr) {
@@ -595,6 +596,13 @@ namespace RandomX {
}
}
+ void JitCompilerX86::h_ISWAP_R(Instruction& instr) {
+ if (instr.src == instr.dst)
+ return; //exchanging a register with itself changes nothing, so no bytes are emitted
+ emit(REX_XCHG); //the two-byte REX_XCHG sequence (0x4d, 0x87)
+ emitByte(0xc0 + instr.dst + 8 * instr.src); //operand byte: dst in the low 3 bits, src in the next 3, top two bits set
+ }
+
void JitCompilerX86::h_FPSWAP_R(Instruction& instr) {
emit(SHUFPD);
emitByte(0xc0 + 9 * instr.dst);
@@ -682,7 +690,7 @@ namespace RandomX {
void JitCompilerX86::h_CFROUND(Instruction& instr) {
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.src);
- int rotate = (13 - (instr.alt & 63)) & 63;
+ int rotate = (13 - (instr.imm32 & 63)) & 63;
if (rotate != 0) {
emit(ROL_RAX);
emitByte(rotate);
@@ -691,7 +699,7 @@ namespace RandomX {
}
static inline uint8_t condition(Instruction& instr, bool invert = false) {
- switch ((instr.alt & 7) ^ invert)
+ switch ((instr.mod & 7) ^ invert)
{
case 0:
return 0x96; //setbe
@@ -777,6 +785,7 @@ namespace RandomX {
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
+ INST_HANDLE(ISWAP_R)
INST_HANDLE(FPSWAP_R)
INST_HANDLE(FPADD_R)
INST_HANDLE(FPADD_M)
diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp
index 0aef990..9c85667 100644
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@@ -109,6 +109,7 @@ namespace RandomX {
void h_IXOR_M(Instruction&);
void h_IROR_R(Instruction&);
void h_IROL_R(Instruction&);
+ void h_ISWAP_R(Instruction&);
void h_FPSWAP_R(Instruction&);
void h_FPADD_R(Instruction&);
void h_FPADD_M(Instruction&);
diff --git a/src/asm/program_loop_load.inc b/src/asm/program_loop_load.inc
index c4c1fed..76b8f3d 100644
--- a/src/asm/program_loop_load.inc
+++ b/src/asm/program_loop_load.inc
@@ -1,5 +1,5 @@
mov rdx, rax
- and eax, 1048512
+ and eax, 2097088
lea rcx, [rsi+rax]
push rcx
xor r8, qword ptr [rcx+0]
@@ -11,7 +11,7 @@
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
ror rdx, 32
- and edx, 1048512
+ and edx, 2097088
lea rcx, [rsi+rdx]
push rcx
cvtdq2pd xmm0, qword ptr [rcx+0]
diff --git a/src/asm/squareHash.inc b/src/asm/squareHash.inc
new file mode 100644
index 0000000..b62dc9e
--- /dev/null
+++ b/src/asm/squareHash.inc
@@ -0,0 +1,87 @@
+ mov rax, 1613783669344650115 ;# fixed 64-bit additive constant
+ add rax, rcx ;# rax = constant + rcx (rcx carries the input value)
+ mul rax ;# rdx:rax = rax * rax (128-bit square)
+ sub rax, rdx ;# 1: rax = low half minus high half of the square
+ mul rax
+ sub rax, rdx ;# 2
+ mul rax
+ sub rax, rdx ;# 3
+ mul rax
+ sub rax, rdx ;# 4
+ mul rax
+ sub rax, rdx ;# 5
+ mul rax
+ sub rax, rdx ;# 6
+ mul rax
+ sub rax, rdx ;# 7
+ mul rax
+ sub rax, rdx ;# 8
+ mul rax
+ sub rax, rdx ;# 9
+ mul rax
+ sub rax, rdx ;# 10
+ mul rax
+ sub rax, rdx ;# 11
+ mul rax
+ sub rax, rdx ;# 12
+ mul rax
+ sub rax, rdx ;# 13
+ mul rax
+ sub rax, rdx ;# 14
+ mul rax
+ sub rax, rdx ;# 15
+ mul rax
+ sub rax, rdx ;# 16
+ mul rax
+ sub rax, rdx ;# 17
+ mul rax
+ sub rax, rdx ;# 18
+ mul rax
+ sub rax, rdx ;# 19
+ mul rax
+ sub rax, rdx ;# 20
+ mul rax
+ sub rax, rdx ;# 21
+ mul rax
+ sub rax, rdx ;# 22
+ mul rax
+ sub rax, rdx ;# 23
+ mul rax
+ sub rax, rdx ;# 24
+ mul rax
+ sub rax, rdx ;# 25
+ mul rax
+ sub rax, rdx ;# 26
+ mul rax
+ sub rax, rdx ;# 27
+ mul rax
+ sub rax, rdx ;# 28
+ mul rax
+ sub rax, rdx ;# 29
+ mul rax
+ sub rax, rdx ;# 30
+ mul rax
+ sub rax, rdx ;# 31
+ mul rax
+ sub rax, rdx ;# 32
+ mul rax
+ sub rax, rdx ;# 33
+ mul rax
+ sub rax, rdx ;# 34
+ mul rax
+ sub rax, rdx ;# 35
+ mul rax
+ sub rax, rdx ;# 36
+ mul rax
+ sub rax, rdx ;# 37
+ mul rax
+ sub rax, rdx ;# 38
+ mul rax
+ sub rax, rdx ;# 39
+ mul rax
+ sub rax, rdx ;# 40
+ mul rax
+ sub rax, rdx ;# 41
+ mul rax
+ sub rax, rdx ;# 42
+ ret ;# rax holds the value after the 42 rounds; rdx is clobbered
\ No newline at end of file
diff --git a/src/common.hpp b/src/common.hpp
index bbd5a2b..e52dbc2 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -26,11 +26,6 @@ namespace RandomX {
using addr_t = uint32_t;
- constexpr int RoundToNearest = 0;
- constexpr int RoundDown = 1;
- constexpr int RoundUp = 2;
- constexpr int RoundToZero = 3;
-
constexpr int SeedSize = 32;
constexpr int ResultSize = 32;
@@ -46,7 +41,7 @@ namespace RandomX {
constexpr int CacheBlockCount = CacheSize / CacheLineSize;
constexpr int BlockExpansionRatio = DatasetSize / CacheSize;
constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount;
- constexpr int DatasetIterations = 3;
+ constexpr int DatasetIterations = 10;
#ifdef TRACE
@@ -72,12 +67,12 @@ namespace RandomX {
convertible_t hi;
};
- constexpr int ProgramLength = 128;
+ constexpr int ProgramLength = 256;
constexpr uint32_t InstructionCount = 1024;
- constexpr uint32_t ScratchpadSize = 1024 * 1024;
+ constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024;
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
- constexpr uint32_t ScratchpadL1 = ScratchpadSize / 64 / sizeof(convertible_t);
- constexpr uint32_t ScratchpadL2 = ScratchpadSize / 4 / sizeof(convertible_t);
+ constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t);
+ constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t);
constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8;
constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8;
@@ -133,6 +128,8 @@ namespace RandomX {
typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
+ typedef bool(*Condition)(convertible_t&, convertible_t&);
+
extern "C" {
void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
}
diff --git a/src/dataset.cpp b/src/dataset.cpp
index 6029611..b941a75 100644
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@@ -28,10 +28,11 @@ along with RandomX. If not, see.
#include "Cache.hpp"
#include "virtualMemory.hpp"
#include "softAes.h"
+#include "squareHash.h"
#if defined(__SSE2__)
#include
-#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA)
+#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
#else
-#define PREFETCH(memory)
+#define PREFETCHNTA(x) //no-op fallback; must carry the same name as the SSE2 variant or non-SSE2 builds fail at every PREFETCHNTA call
#endif
@@ -49,42 +50,37 @@ namespace RandomX {
template
void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) {
- __m128i x0, x1, x2, x3;
+ uint64_t r0, r1, r2, r3, r4, r5, r6, r7; //eight 64-bit lanes = the 64 bytes finally written to out
- __m128i* xit = (__m128i*)intermediate;
- __m128i* xout = (__m128i*)out;
+ r0 = 4ULL * blockNumber; //r0 is seeded from the block number and also drives the address selection below
+ r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0; //NOTE(review): the 'keys' parameter is no longer referenced by the new body
- x0 = _mm_cvtsi32_si128(blockNumber);
- constexpr int mask = (CacheSize / CacheLineSize) - 1;
+ constexpr int mask = (CacheSize - 1) & -64; //byte-offset mask with the low 6 bits cleared, i.e. 64-byte aligned; assumes CacheSize is a power of two - TODO confirm
for (auto i = 0; i < DatasetIterations; ++i) {
- x0 = aesenc(x0, keys[0]);
- //x0 = aesenc(x0, keys[1]);
- x1 = aesenc(x0, keys[2]);
- //x1 = aesenc(x1, keys[3]);
- x2 = aesenc(x1, keys[4]);
- //x2 = aesenc(x2, keys[5]);
- x3 = aesenc(x2, keys[6]);
- //x3 = aesenc(x3, keys[7]);
-
- int index = _mm_cvtsi128_si32(x3);
- index &= mask;
-
- __m128i t0 = _mm_load_si128(xit + 4 * index + 0);
- __m128i t1 = _mm_load_si128(xit + 4 * index + 1);
- __m128i t2 = _mm_load_si128(xit + 4 * index + 2);
- __m128i t3 = _mm_load_si128(xit + 4 * index + 3);
-
- x0 = _mm_xor_si128(x0, t0);
- x1 = _mm_xor_si128(x1, t1);
- x2 = _mm_xor_si128(x2, t2);
- x3 = _mm_xor_si128(x3, t3);
+ uint64_t* mix = (uint64_t*)(intermediate + (r0 & mask)); //64-byte chunk of 'intermediate' selected by the current r0
+ PREFETCHNTA(mix); //request the chunk before squareHash runs; it is read right after
+ r0 = squareHash(r0); //advance r0; the chunk address above was taken from the previous value
+ r0 ^= mix[0];
+ r1 ^= mix[1];
+ r2 ^= mix[2];
+ r3 ^= mix[3];
+ r4 ^= mix[4];
+ r5 ^= mix[5];
+ r6 ^= mix[6];
+ r7 ^= mix[7];
}
- _mm_store_si128(xout + 0, x0);
- _mm_store_si128(xout + 1, x1);
- _mm_store_si128(xout + 2, x2);
- _mm_store_si128(xout + 3, x3);
+ uint64_t* out64 = (uint64_t*)out; //the result is written as eight consecutive 64-bit words
+
+ out64[0] = r0;
+ out64[1] = r1;
+ out64[2] = r2;
+ out64[3] = r3;
+ out64[4] = r4;
+ out64[5] = r5;
+ out64[6] = r6;
+ out64[7] = r7;
}
template
@@ -98,7 +94,7 @@ namespace RandomX {
memory.mx ^= addr;
memory.mx &= -64; //align to cache line
std::swap(memory.mx, memory.ma);
- PREFETCH(memory);
+ PREFETCHNTA(memory.ds.dataset + memory.ma);
for (int i = 0; i < RegistersCount; ++i)
reg.r[i].u64 ^= datasetLine[i];
}
diff --git a/src/hashAes1Rx4.cpp b/src/hashAes1Rx4.cpp
index 1f25335..623d4b6 100644
--- a/src/hashAes1Rx4.cpp
+++ b/src/hashAes1Rx4.cpp
@@ -71,3 +71,44 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
template void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
template void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
+
+template
+void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { //fills buffer with outputSize bytes derived from the 64-byte state, then writes the advanced state back
+ uint8_t* outptr = (uint8_t*)buffer; //write cursor; deliberately not pointer-to-const because the loop stores through it
+ const uint8_t* outputEnd = outptr + outputSize;
+
+ __m128i state0, state1, state2, state3;
+ __m128i key0, key1, key2, key3;
+
+ key0 = _mm_set_epi32(0x9274f206, 0x79498d2f, 0x7d2de6ab, 0x67a04d26);
+ key1 = _mm_set_epi32(0xe1f7af05, 0x2a3a6f1d, 0x86658a15, 0x4f719812);
+ key2 = _mm_set_epi32(0xd1b1f791, 0x9e2ec914, 0x14c77bce, 0xba90750e);
+ key3 = _mm_set_epi32(0x179d0fd9, 0x6e57883c, 0xa53bbe4f, 0xaa07621f);
+
+ state0 = _mm_load_si128((__m128i*)state + 0); //aligned loads: state must be 16-byte aligned
+ state1 = _mm_load_si128((__m128i*)state + 1);
+ state2 = _mm_load_si128((__m128i*)state + 2);
+ state3 = _mm_load_si128((__m128i*)state + 3);
+
+ while (outptr < outputEnd) { //64 bytes per pass; NOTE(review): assumes outputSize is a multiple of 64, otherwise the final pass writes past the end of buffer
+ state0 = aesdec(state0, key0); //lanes 0 and 2 use aesdec, lanes 1 and 3 use aesenc
+ state1 = aesenc(state1, key1);
+ state2 = aesdec(state2, key2);
+ state3 = aesenc(state3, key3);
+
+ _mm_store_si128((__m128i*)outptr + 0, state0); //aligned stores: buffer must be 16-byte aligned
+ _mm_store_si128((__m128i*)outptr + 1, state1);
+ _mm_store_si128((__m128i*)outptr + 2, state2);
+ _mm_store_si128((__m128i*)outptr + 3, state3);
+
+ outptr += 64;
+ }
+
+ _mm_store_si128((__m128i*)state + 0, state0); //persist the advanced state so a later call continues the sequence
+ _mm_store_si128((__m128i*)state + 1, state1);
+ _mm_store_si128((__m128i*)state + 2, state2);
+ _mm_store_si128((__m128i*)state + 3, state3);
+}
+
+template void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
+template void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
diff --git a/src/hashAes1Rx4.hpp b/src/hashAes1Rx4.hpp
index a9af1fc..8c0c156 100644
--- a/src/hashAes1Rx4.hpp
+++ b/src/hashAes1Rx4.hpp
@@ -20,4 +20,7 @@ along with RandomX. If not, see.
#include "softAes.h"
template
-void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
\ No newline at end of file
+void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
+
+template
+void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp
index 55c9b79..d24800e 100644
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@@ -37,8 +37,9 @@ along with RandomX. If not, see.
#define WT_INEG_R 2
#define WT_IXOR_R 12
#define WT_IXOR_M 3
-#define WT_IROR_R 12
-#define WT_IROL_R 12
+#define WT_IROR_R 10
+#define WT_IROL_R 10
+#define WT_ISWAP_R 4
//Common floating point
#define WT_FPSWAP_R 8
@@ -72,7 +73,7 @@ constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \
WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
-WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
+WT_ISWAP_R + WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \
WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP;
diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp
index 78bdb6f..9e1eff1 100644
--- a/src/instructionsPortable.cpp
+++ b/src/instructionsPortable.cpp
@@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
//#define DEBUG
-#include "instructions.hpp"
#include "intrinPortable.h"
#pragma STDC FENV_ACCESS on
#include
@@ -29,14 +28,14 @@ along with RandomX. If not, see.
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
typedef __int128 int128_t;
- static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
+ uint64_t mulh(uint64_t a, uint64_t b) { //upper 64 bits of the unsigned 128-bit product a * b
return ((uint128_t)a * b) >> 64;
}
- static inline uint64_t __imulhi64(int64_t a, int64_t b) {
+ int64_t smulh(int64_t a, int64_t b) { //upper 64 bits of the signed 128-bit product a * b
return ((int128_t)a * b) >> 64;
}
- #define umulhi64 __umulhi64
- #define imulhi64 __imulhi64
+ #define HAVE_MULH //marks mulh as provided so the '#ifndef HAVE_MULH' fallback is skipped
+ #define HAVE_SMULH //marks smulh as provided so the '#ifndef HAVE_SMULH' fallback is skipped
#endif
#if defined(_MSC_VER)
@@ -44,62 +43,62 @@ along with RandomX. If not, see.
#define EVAL_DEFINE(X) HAS_VALUE(X)
#include
#include
- #define ror64 _rotr64
- #define rol64 _rotl64
+
+ uint64_t rotl(uint64_t x, int c) {
+ return _rotl64(x, c);
+ }
+ uint64_t rotr(uint64_t x , int c) {
+ return _rotr64(x, c);
+ }
+ #define HAVE_ROTL
+ #define HAVE_ROTR
+
#if EVAL_DEFINE(__MACHINEARM64_X64(1))
- #define umulhi64 __umulh
+ uint64_t mulh(uint64_t a, uint64_t b) {
+ return __umulh(a, b);
+ }
+ #define HAVE_MULH
#endif
+
#if EVAL_DEFINE(__MACHINEX64(1))
- static inline uint64_t __imulhi64(int64_t a, int64_t b) {
+ int64_t smulh(int64_t a, int64_t b) {
int64_t hi;
_mul128(a, b, &hi);
return hi;
}
- #define imulhi64 __imulhi64
+ #define HAVE_SMULH
#endif
- static inline uint32_t _setRoundMode(uint32_t mode) {
- return _controlfp(mode, _MCW_RC);
+
+ static void setRoundMode__(uint32_t mode) {
+ _controlfp(mode, _MCW_RC);
}
- #define setRoundMode _setRoundMode
+ #define HAVE_SETROUNDMODE_IMPL
#endif
-#ifndef setRoundMode
- #define setRoundMode fesetround
+#ifndef HAVE_SETROUNDMODE_IMPL
+ static void setRoundMode__(uint32_t mode) {
+ fesetround(mode);
+ }
#endif
-#ifndef ror64
- static inline uint64_t __ror64(uint64_t a, int b) {
+#ifndef HAVE_ROTR
+ uint64_t rotr(uint64_t a, int b) { //portable 64-bit rotate right, used when no compiler intrinsic was selected above
- return (a >> b) | (a << (64 - b));
+ return (a >> (b & 63)) | (a << ((64 - (b & 63)) & 63)); //both counts stay in 0..63; the old form shifted by 64 (undefined) when b == 0
}
- #define ror64 __ror64
+ #define HAVE_ROTR
#endif
-#ifndef rol64
- static inline uint64_t __rol64(uint64_t a, int b) {
+#ifndef HAVE_ROTL
+ uint64_t rotl(uint64_t a, int b) { //portable 64-bit rotate left, used when no compiler intrinsic was selected above
- return (a << b) | (a >> (64 - b));
+ return (a << (b & 63)) | (a >> ((64 - (b & 63)) & 63)); //both counts stay in 0..63; the old form shifted by 64 (undefined) when b == 0
}
- #define rol64 __rol64
+ #define HAVE_ROTL
#endif
-#ifndef sar64
- #include
- constexpr int64_t builtintShr64(int64_t value, int shift) noexcept {
- return value >> shift;
- }
-
- struct UsesArithmeticShift : std::integral_constant {
- };
-
- static inline int64_t __sar64(int64_t a, int b) {
- return UsesArithmeticShift::value ? builtintShr64(a, b) : (a < 0 ? ~(~a >> b) : a >> b);
- }
- #define sar64 __sar64
-#endif
-
-#ifndef umulhi64
+#ifndef HAVE_MULH
#define LO(x) ((x)&0xffffffff)
#define HI(x) ((x)>>32)
- static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
+ uint64_t mulh(uint64_t a, uint64_t b) {
uint64_t ah = HI(a), al = LO(a);
uint64_t bh = HI(b), bl = LO(b);
uint64_t x00 = al * bl;
@@ -112,17 +111,17 @@ along with RandomX. If not, see.
return (m3 << 32) + LO(m2);
}
- #define umulhi64 __umulhi64
+ #define HAVE_MULH
#endif
-#ifndef imulhi64
- static inline int64_t __imulhi64(int64_t a, int64_t b) {
- int64_t hi = umulhi64(a, b);
+#ifndef HAVE_SMULH
+ int64_t smulh(int64_t a, int64_t b) {
+ int64_t hi = mulh(a, b);
if (a < 0LL) hi -= b;
if (b < 0LL) hi -= a;
return hi;
}
- #define imulhi64 __imulhi64
+ #define HAVE_SMULH
#endif
// avoid undefined behavior of signed overflow
@@ -137,20 +136,20 @@ static inline int32_t safeSub(int32_t a, int32_t b) {
#if defined(__has_builtin)
#if __has_builtin(__builtin_sub_overflow)
- static inline bool __subOverflow(int32_t a, int32_t b) {
+ static inline bool subOverflow__(int32_t a, int32_t b) {
int32_t temp;
return __builtin_sub_overflow(a, b, &temp);
}
- #define subOverflow __subOverflow
+ #define HAVE_SUB_OVERFLOW
#endif
#endif
-#ifndef subOverflow
- static inline bool __subOverflow(int32_t a, int32_t b) {
+#ifndef HAVE_SUB_OVERFLOW
+ static inline bool subOverflow__(int32_t a, int32_t b) {
auto c = safeSub(a, b);
return (c < a) != (b > 0);
}
- #define subOverflow __subOverflow
+ #define HAVE_SUB_OVERFLOW
#endif
static inline double FlushDenormalNaN(double x) {
@@ -165,47 +164,57 @@ static inline double FlushNaN(double x) {
return x != x ? 0.0 : x;
}
+void setRoundMode(uint32_t rcflag) { //maps the low 2 bits of rcflag (a Round* flag) to the matching FE_* mode and applies it
+ switch (rcflag & 3) { //higher bits of rcflag are ignored
+ case RoundDown:
+ setRoundMode__(FE_DOWNWARD);
+ break;
+ case RoundUp:
+ setRoundMode__(FE_UPWARD);
+ break;
+ case RoundToZero:
+ setRoundMode__(FE_TOWARDZERO);
+ break;
+ default: //any value not matched above selects round-to-nearest
+ setRoundMode__(FE_TONEAREST);
+ break;
+ }
+}
+
+bool condition(uint32_t type, int32_t value, int32_t imm32) { //evaluates one of 8 comparisons of value against imm32, chosen by the low 3 bits of type
+ switch (type & 7)
+ {
+ case 0:
+ return (uint32_t)value <= (uint32_t)imm32; //unsigned less-or-equal
+ case 1:
+ return (uint32_t)value > (uint32_t)imm32; //unsigned greater
+ case 2:
+ return safeSub(value, imm32) < 0; //sign of the difference as computed by safeSub
+ case 3:
+ return safeSub(value, imm32) >= 0;
+ case 4:
+ return subOverflow__(value, imm32); //the signed subtraction overflows
+ case 5:
+ return !subOverflow__(value, imm32);
+ case 6:
+ return value < imm32; //signed less-than
+ default: //7, the only value left after the 3-bit mask; using default means no path can fall off the end of the function
+ return value >= imm32;
+ }
+}
+
+void initFpu() { //puts the floating point environment into its initial state
+#ifdef __SSE2__
+ _mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
+#else
+ setRoundMode__(FE_TONEAREST); //raw setter takes the fenv constant; setRoundMode() expects a Round* flag and would reinterpret (FE_TONEAREST & 3)
+#endif
+}
+
namespace RandomX {
extern "C" {
-
- void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u64 + b.u64;
- }
-
- void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u32 + b.u32;
- }
-
- void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u64 - b.u64;
- }
-
- void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u32 - b.u32;
- }
-
- void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u64 * b.u64;
- }
-
- void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = umulhi64(a.u64, b.u64);
- }
-
- void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = (uint64_t)a.u32 * b.u32;
- }
-
- void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.i64 = (int64_t)a.i32 * b.i32;
- }
-
- void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.i64 = imulhi64(a.i64, b.i64);
- }
-
- void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+ /*void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
}
@@ -216,80 +225,6 @@ namespace RandomX {
c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1);
}
- void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u64 & b.u64;
- }
-
- void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u32 & b.u32;
- }
-
- void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u64 | b.u64;
- }
-
- void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u32 | b.u32;
- }
-
- void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u64 ^ b.u64;
- }
-
- void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u32 ^ b.u32;
- }
-
- void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u64 << (b.u64 & 63);
- }
-
- void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = a.u64 >> (b.u64 & 63);
- }
-
- void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = sar64(a.i64, b.u64 & 63);
- }
-
- void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = rol64(a.u64, (b.u64 & 63));
- }
-
- void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
- c.u64 = ror64(a.u64, (b.u64 & 63));
- }
-
- bool JMP_COND(uint8_t type, convertible_t& regb, int32_t imm32) {
- switch (type & 7)
- {
- case 0:
- return regb.u32 <= (uint32_t)imm32;
- case 1:
- return regb.u32 > (uint32_t)imm32;
- case 2:
- return safeSub(regb.i32, imm32) < 0;
- case 3:
- return safeSub(regb.i32, imm32) >= 0;
- case 4:
- return subOverflow(regb.i32, imm32);
- case 5:
- return !subOverflow(regb.i32, imm32);
- case 6:
- return regb.i32 < imm32;
- case 7:
- return regb.i32 >= imm32;
- }
- }
-
- void FPINIT() {
-#ifdef __SSE2__
- _mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
-#else
- setRoundMode(FE_TONEAREST);
-#endif
- }
-
void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
@@ -368,48 +303,8 @@ namespace RandomX {
c.lo.f64 = sqrt(std::abs(alo));
c.hi.f64 = sqrt(std::abs(ahi));
#endif
- }
+ }*/
+
- void FPROUND(convertible_t a, uint8_t rot) {
- uint64_t flag = ror64(a.u64, rot);
- switch (flag & 3) {
- case RoundDown:
-#ifdef DEBUG
- std::cout << "Round FE_DOWNWARD (" << FE_DOWNWARD << ") = " <<
-#endif
- setRoundMode(FE_DOWNWARD);
-#ifdef DEBUG
- std::cout << std::endl;
-#endif
- break;
- case RoundUp:
-#ifdef DEBUG
- std::cout << "Round FE_UPWARD (" << FE_UPWARD << ") = " <<
-#endif
- setRoundMode(FE_UPWARD);
-#ifdef DEBUG
- std::cout << std::endl;
-#endif
- break;
- case RoundToZero:
-#ifdef DEBUG
- std::cout << "Round FE_TOWARDZERO (" << FE_TOWARDZERO << ") = " <<
-#endif
- setRoundMode(FE_TOWARDZERO);
-#ifdef DEBUG
- std::cout << std::endl;
-#endif
- break;
- default:
-#ifdef DEBUG
- std::cout << "Round FE_TONEAREST (" << FE_TONEAREST << ") = " <<
-#endif
- setRoundMode(FE_TONEAREST);
-#ifdef DEBUG
- std::cout << std::endl;
-#endif
- break;
- }
- }
}
}
\ No newline at end of file
diff --git a/src/intrinPortable.h b/src/intrinPortable.h
index 3a473a2..3d2136c 100644
--- a/src/intrinPortable.h
+++ b/src/intrinPortable.h
@@ -19,6 +19,8 @@ along with RandomX. If not, see.
#pragma once
+#include <cstdint> // fixed-width integer types used by the declarations in this header
+
#if defined(_MSC_VER)
#if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
#define __SSE2__ 1
@@ -45,6 +47,18 @@ typedef union {
uint8_t u8[16];
} __m128i;
+typedef struct { // stand-in for the SSE2 vector of two doubles
+	double lo;
+	double hi;
+} __m128d;
+
+inline __m128d _mm_load_pd(const double* pd) { // loads pd[0] into lo and pd[1] into hi
+	__m128d pair;
+	pair.lo = pd[0];
+	pair.hi = pd[1];
+	return pair;
+}
+
static const char* platformError = "Platform doesn't support hardware AES";
inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {
@@ -131,4 +145,17 @@ inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {
return _A;
}
-#endif
\ No newline at end of file
+#endif
+
+constexpr int RoundToNearest = 0; // rounding-mode codes; presumably the values accepted by setRoundMode()
+constexpr int RoundDown = 1;
+constexpr int RoundUp = 2;
+constexpr int RoundToZero = 3;
+
+uint64_t mulh(uint64_t, uint64_t); // presumably the high 64 bits of the unsigned product - definition not shown here
+int64_t smulh(int64_t, int64_t); // presumably the signed counterpart of mulh - TODO confirm
+uint64_t rotl(uint64_t, int); // presumably 64-bit rotate left - TODO confirm
+uint64_t rotr(uint64_t, int); // presumably 64-bit rotate right - TODO confirm
+void initFpu(); // SSE2 builds write MXCSR = 0x9FC0; other builds select round-to-nearest
+void setRoundMode(uint32_t); // NOTE(review): argument looks like one of the Round* codes above - confirm
+bool condition(uint32_t, int32_t, int32_t); // (type, value, imm32): evaluates comparison number (type & 7)
diff --git a/src/main.cpp b/src/main.cpp
index 4f5a021..c761b97 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -35,6 +35,7 @@ along with RandomX. If not, see.
#include "dataset.hpp"
#include "Cache.hpp"
#include "Pcg32.hpp"
+#include "hashAes1Rx4.hpp"
const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };
@@ -153,7 +154,7 @@ void generateNative(int nonce) {
}
void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) {
- uint64_t hash[4];
+ alignas(16) uint64_t hash[8];
unsigned char blockTemplate[] = {
0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14,
0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e,
@@ -167,8 +168,8 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash
//std::cout << "Thread " << thread << " nonce " << nonce << std::endl;
*noncePtr = nonce;
blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
- int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8);
- vm->initializeScratchpad(scratchpad, spIndex);
+ fillAes1Rx4((void*)hash, RandomX::ScratchpadSize, scratchpad);
+ //vm->initializeScratchpad(scratchpad, spIndex);
vm->setScratchpad(scratchpad);
//dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt");
for (int chain = 0; chain < 16; ++chain) {
@@ -309,7 +310,7 @@ int main(int argc, char** argv) {
}
uint8_t* scratchpadMem;
if (largePages) {
- scratchpadMem = (uint8_t*)allocLargePagesMemory(RandomX::ScratchpadSize * (threadCount + 1) / 2);
+ scratchpadMem = (uint8_t*)allocLargePagesMemory(threadCount * RandomX::ScratchpadSize);
}
else {
scratchpadMem = (uint8_t*)_mm_malloc(threadCount * RandomX::ScratchpadSize, RandomX::CacheLineSize);
diff --git a/src/squareHash.S b/src/squareHash.S
new file mode 100644
index 0000000..4cd3b54
--- /dev/null
+++ b/src/squareHash.S
@@ -0,0 +1,17 @@
+.intel_syntax noprefix
+#if defined(__APPLE__)
+.text
+#else
+.section .text
+#endif
+#if defined(__WIN32__) || defined(__APPLE__)
+#define DECL(x) _##x
+#else
+#define DECL(x) x
+#endif
+
+.global DECL(squareHash)
+
+DECL(squareHash): /* uint64_t squareHash(uint64_t), declared in squareHash.h */
+	mov rcx, rdi /* System V passes the first integer argument in rdi (rsi is the second); the shared body presumably reads rcx, since the MASM wrapper includes it without any move */
+	#include "asm/squareHash.inc"
diff --git a/src/squareHash.asm b/src/squareHash.asm
new file mode 100644
index 0000000..4433719
--- /dev/null
+++ b/src/squareHash.asm
@@ -0,0 +1,9 @@
+PUBLIC squareHash ; make the symbol visible to the linker
+
+.code
+
+squareHash PROC ; MASM wrapper around the shared routine body
+ include asm/squareHash.inc ; same body that squareHash.S includes; no register shuffling is done here
+squareHash ENDP
+
+END
\ No newline at end of file
diff --git a/src/squareHash.h b/src/squareHash.h
new file mode 100644
index 0000000..f80b492
--- /dev/null
+++ b/src/squareHash.h
@@ -0,0 +1,71 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX. If not, see <http://www.gnu.org/licenses/>.
+*/
+#pragma once
+#include <stdint.h> /* uint64_t */
+
+#if !defined(_M_X64) && !defined(__x86_64__)
+
+typedef struct { /* 128-bit unsigned value stored as two 64-bit halves */
+	uint64_t lo;
+	uint64_t hi;
+} uint128_t;
+/* LO/HI pick the low/high 32 bits of a 64-bit value; they are undefined again right after square128. */
+#define LO(x) ((x)&0xffffffff)
+#define HI(x) ((x)>>32)
+static inline uint128_t square128(uint64_t x) { /* returns the full 128-bit value of x*x */
+	uint64_t xh = HI(x), xl = LO(x); /* x = xh*2^32 + xl */
+	uint64_t xll = xl * xl; /* weight 2^0 */
+	uint64_t xlh = xl * xh; /* weight 2^32, appears twice in the square */
+	uint64_t xhh = xh * xh; /* weight 2^64 */
+	uint64_t m1 = 2 * LO(xlh) + HI(xll); /* bits 32..63, carry kept in HI(m1) */
+	uint64_t m2 = 2 * HI(xlh) + LO(xhh) + HI(m1); /* bits 64..95, carry kept in HI(m2) */
+	uint64_t m3 = HI(xhh) + HI(m2); /* bits 96..127 */
+
+	uint128_t x2;
+
+	x2.lo = (m1 << 32) + LO(xll);
+	x2.hi = (m3 << 32) + LO(m2);
+
+	return x2;
+}
+#undef LO /* #undef takes a bare identifier, not a parameter list */
+#undef HI
+
+static inline uint64_t squareHash(uint64_t x) { /* static like square128: a plain C99 inline definition gives callers no external symbol to link against */
+	x += 1613783669344650115; /* constant offset added before the squaring rounds */
+	for (int i = 0; i < 42; ++i) { /* 42 rounds of square-and-fold */
+		uint128_t x2 = square128(x);
+		x = x2.lo - x2.hi; /* low half minus high half, wrapping modulo 2^64 */
+	}
+	return x;
+}
+
+#else
+
+#if defined(__cplusplus)
+extern "C" { /* C linkage so the unmangled assembly symbol resolves from C++ callers */
+#endif
+
+uint64_t squareHash(uint64_t); /* x86-64 builds: defined out of line in squareHash.S / squareHash.asm */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
\ No newline at end of file