mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-12-22 07:48:54 +00:00
Replaced division instructions with IMUL_RCP
This commit is contained in:
parent
9d5f621d5c
commit
f3b114af88
@ -19,8 +19,7 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
|
|||||||
|1/256|IMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64`|
|
|1/256|IMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64`|
|
||||||
|4/256|ISMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64` (signed)|
|
|4/256|ISMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64` (signed)|
|
||||||
|1/256|ISMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64` (signed)|
|
|1/256|ISMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64` (signed)|
|
||||||
|4/256|IDIV_C|R|-|-|`dst = dst + dst / imm32`|
|
|8/256|IMUL_RCP|R|-|-|<code>dst = 2<sup>x</sup> / imm32 * dst</code>|
|
||||||
|4/256|ISDIV_C|R|-|-|`dst = dst + dst / imm32` (signed)|
|
|
||||||
|2/256|INEG_R|R|-|-|`dst = -dst`|
|
|2/256|INEG_R|R|-|-|`dst = -dst`|
|
||||||
|16/256|IXOR_R|R|R|`src = imm32`|`dst = dst ^ src`|
|
|16/256|IXOR_R|R|R|`src = imm32`|`dst = dst ^ src`|
|
||||||
|4/256|IXOR_M|R|mem|`src = imm32`|`dst = dst ^ [src]`|
|
|4/256|IXOR_M|R|mem|`src = imm32`|`dst = dst ^ [src]`|
|
||||||
@ -30,8 +29,8 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
|
|||||||
#### IMULH and ISMULH
|
#### IMULH and ISMULH
|
||||||
These instructions output the high 64 bits of the whole 128-bit multiplication result. The result differs for signed and unsigned multiplication (`IMULH` is unsigned, `ISMULH` is signed). The variants with a register source operand do not use `imm32` (they perform a squaring operation if `dst` equals `src`).
|
These instructions output the high 64 bits of the whole 128-bit multiplication result. The result differs for signed and unsigned multiplication (`IMULH` is unsigned, `ISMULH` is signed). The variants with a register source operand do not use `imm32` (they perform a squaring operation if `dst` equals `src`).
|
||||||
|
|
||||||
#### IDIV_C and ISDIV_C
|
#### IMUL_RCP
|
||||||
The division instructions use a constant divisor, so they can be optimized into a [multiplication by fixed-point reciprocal](https://en.wikipedia.org/wiki/Division_algorithm#Division_by_a_constant). `IDIV_C` performs unsigned division (`imm32` is zero-extended to 64 bits), while `ISDIV_C` performs signed division. In the case of division by zero, the instructions become a no-op. In the very rare case of signed overflow, the destination register is set to zero.
|
This instruction multiplies the destination register by the fixed-point reciprocal of `imm32`, keeping the low 64 bits of the product. The reciprocal is calculated as <code>rcp = 2<sup>x</sup> / imm32</code> by choosing the largest integer `x` such that <code>rcp &lt; 2<sup>64</sup></code>. If `imm32` equals 0, this instruction is a no-op.
|
||||||
|
|
||||||
#### ISWAP_R
|
#### ISWAP_R
|
||||||
This instruction swaps the values of two registers. If source and destination refer to the same register, the result is a no-op.
|
This instruction swaps the values of two registers. If source and destination refer to the same register, the result is a no-op.
|
||||||
@ -54,7 +53,7 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
|
|||||||
|6/256|FSQRT_R|E|-|`(dst0, dst1) = (√dst0, √dst1)`|
|
|6/256|FSQRT_R|E|-|`(dst0, dst1) = (√dst0, √dst1)`|
|
||||||
|
|
||||||
#### FSCAL_R
|
#### FSCAL_R
|
||||||
This instruction negates the number and multiplies it by <code>2<sup>x</sup></code>. `x` is calculated by taking the 5 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{-1, +1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -31 to +31.
|
This instruction negates the number and multiplies it by <code>2<sup>x</sup></code>. `x` is calculated by taking the 5 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{+1, -1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -31 to +31.
|
||||||
|
|
||||||
The mathematical operation described above is equivalent to a bitwise XOR of the binary representation with the value of `0x81F0000000000000`.
|
The mathematical operation described above is equivalent to a bitwise XOR of the binary representation with the value of `0x81F0000000000000`.
|
||||||
|
|
||||||
|
10
makefile
10
makefile
@ -9,7 +9,7 @@ OBJDIR=obj
|
|||||||
LDFLAGS=-lpthread
|
LDFLAGS=-lpthread
|
||||||
CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp src/TestAluFpu.cpp src/blake2/blake2b.c
|
CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp src/TestAluFpu.cpp src/blake2/blake2b.c
|
||||||
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
|
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
|
||||||
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o hashAes1Rx4.o)
|
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o)
|
||||||
ifeq ($(PLATFORM),amd64)
|
ifeq ($(PLATFORM),amd64)
|
||||||
ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
|
ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
|
||||||
CXXFLAGS += -maes
|
CXXFLAGS += -maes
|
||||||
@ -53,7 +53,7 @@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blak
|
|||||||
$(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h blake2/endian.h) | $(OBJDIR)
|
$(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h blake2/endian.h) | $(OBJDIR)
|
||||||
$(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_ref.c -o $@
|
$(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_ref.c -o $@
|
||||||
|
|
||||||
$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h divideByConstantCodegen.h Program.hpp) | $(OBJDIR)
|
$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h reciprocal.h Program.hpp) | $(OBJDIR)
|
||||||
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@
|
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@
|
||||||
|
|
||||||
$(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h endian.h) | $(OBJDIR)
|
$(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h endian.h) | $(OBJDIR)
|
||||||
@ -65,13 +65,13 @@ $(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachin
|
|||||||
$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp) | $(OBJDIR)
|
$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp) | $(OBJDIR)
|
||||||
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@
|
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@
|
||||||
|
|
||||||
$(OBJDIR)/divideByConstantCodegen.o: $(addprefix $(SRCDIR)/,divideByConstantCodegen.c divideByConstantCodegen.h) | $(OBJDIR)
|
$(OBJDIR)/reciprocal.o: $(addprefix $(SRCDIR)/,reciprocal.c reciprocal.h) | $(OBJDIR)
|
||||||
$(CC) $(CCFLAGS) -c $(SRCDIR)/divideByConstantCodegen.c -o $@
|
$(CC) $(CCFLAGS) -c $(SRCDIR)/reciprocal.c -o $@
|
||||||
|
|
||||||
$(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h intrinPortable.h blake2/endian.h) | $(OBJDIR)
|
$(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h intrinPortable.h blake2/endian.h) | $(OBJDIR)
|
||||||
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/hashAes1Rx4.cpp -o $@
|
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/hashAes1Rx4.cpp -o $@
|
||||||
|
|
||||||
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp divideByConstantCodegen.h virtualMemory.hpp) | $(OBJDIR)
|
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp reciprocal.h virtualMemory.hpp) | $(OBJDIR)
|
||||||
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
|
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
|
||||||
|
|
||||||
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR)
|
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR)
|
||||||
|
@ -17,12 +17,10 @@ You should have received a copy of the GNU General Public License
|
|||||||
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
//#define TRACE
|
//#define TRACE
|
||||||
#define MAGIC_DIVISION
|
|
||||||
#include "AssemblyGeneratorX86.hpp"
|
#include "AssemblyGeneratorX86.hpp"
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#ifdef MAGIC_DIVISION
|
#include "reciprocal.h"
|
||||||
#include "divideByConstantCodegen.h"
|
|
||||||
#endif
|
|
||||||
#include "Program.hpp"
|
#include "Program.hpp"
|
||||||
|
|
||||||
namespace RandomX {
|
namespace RandomX {
|
||||||
@ -276,38 +274,12 @@ namespace RandomX {
|
|||||||
traceint(instr);
|
traceint(instr);
|
||||||
}
|
}
|
||||||
|
|
||||||
//~6 uOPs
|
//2 uOPs
|
||||||
void AssemblyGeneratorX86::h_IDIV_C(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) {
|
||||||
if (instr.imm32 != 0) {
|
if (instr.imm32 != 0) {
|
||||||
uint32_t divisor = instr.imm32;
|
uint32_t divisor = instr.imm32;
|
||||||
if (divisor & (divisor - 1)) {
|
asmCode << "\tmov rax, " << reciprocal(instr.imm32) << std::endl;
|
||||||
magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
|
asmCode << "\timul " << regR[instr.dst] << ", rax" << std::endl;
|
||||||
if (mi.pre_shift == 0 && !mi.increment) {
|
|
||||||
asmCode << "\tmov rax, " << mi.multiplier << std::endl;
|
|
||||||
asmCode << "\tmul " << regR[instr.dst] << std::endl;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
|
|
||||||
if (mi.pre_shift > 0)
|
|
||||||
asmCode << "\tshr rax, " << mi.pre_shift << std::endl;
|
|
||||||
if (mi.increment) {
|
|
||||||
asmCode << "\tadd rax, 1" << std::endl;
|
|
||||||
asmCode << "\tsbb rax, 0" << std::endl;
|
|
||||||
}
|
|
||||||
asmCode << "\tmov rcx, " << mi.multiplier << std::endl;
|
|
||||||
asmCode << "\tmul rcx" << std::endl;
|
|
||||||
}
|
|
||||||
if (mi.post_shift > 0)
|
|
||||||
asmCode << "\tshr rdx, " << mi.post_shift << std::endl;
|
|
||||||
asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
|
|
||||||
}
|
|
||||||
else { //divisor is a power of two
|
|
||||||
int shift = 0;
|
|
||||||
while (divisor >>= 1)
|
|
||||||
++shift;
|
|
||||||
if(shift > 0)
|
|
||||||
asmCode << "\tshr " << regR[instr.dst] << ", " << shift << std::endl;
|
|
||||||
}
|
|
||||||
traceint(instr);
|
traceint(instr);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@ -317,59 +289,7 @@ namespace RandomX {
|
|||||||
|
|
||||||
//~8.5 uOPs
|
//~8.5 uOPs
|
||||||
void AssemblyGeneratorX86::h_ISDIV_C(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::h_ISDIV_C(Instruction& instr, int i) {
|
||||||
int64_t divisor = (int32_t)instr.imm32;
|
tracenop(instr);
|
||||||
if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
|
|
||||||
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
|
|
||||||
// +/- power of two
|
|
||||||
bool negative = divisor < 0;
|
|
||||||
if (negative)
|
|
||||||
divisor = -divisor;
|
|
||||||
int shift = 0;
|
|
||||||
uint64_t unsignedDivisor = divisor;
|
|
||||||
while (unsignedDivisor >>= 1)
|
|
||||||
++shift;
|
|
||||||
if (shift > 0) {
|
|
||||||
asmCode << "\tmov rcx, rax" << std::endl;
|
|
||||||
asmCode << "\tsar rcx, 63" << std::endl;
|
|
||||||
uint32_t mask = (1ULL << shift) + 0xFFFFFFFF;
|
|
||||||
asmCode << "\tand ecx, 0" << std::hex << mask << std::dec << "h" << std::endl;
|
|
||||||
asmCode << "\tadd rax, rcx" << std::endl;
|
|
||||||
asmCode << "\tsar rax, " << shift << std::endl;
|
|
||||||
}
|
|
||||||
if (negative)
|
|
||||||
asmCode << "\tneg rax" << std::endl;
|
|
||||||
asmCode << "\tadd " << regR[instr.dst] << ", rax" << std::endl;
|
|
||||||
traceint(instr);
|
|
||||||
}
|
|
||||||
else if (divisor != 0) {
|
|
||||||
magics_info mi = compute_signed_magic_info(divisor);
|
|
||||||
asmCode << "\tmov rax, " << mi.multiplier << std::endl;
|
|
||||||
asmCode << "\timul " << regR[instr.dst] << std::endl;
|
|
||||||
//asmCode << "\tmov rax, rdx" << std::endl;
|
|
||||||
asmCode << "\txor eax, eax" << std::endl;
|
|
||||||
bool haveSF = false;
|
|
||||||
if (divisor > 0 && mi.multiplier < 0) {
|
|
||||||
asmCode << "\tadd rdx, " << regR[instr.dst] << std::endl;
|
|
||||||
haveSF = true;
|
|
||||||
}
|
|
||||||
if (divisor < 0 && mi.multiplier > 0) {
|
|
||||||
asmCode << "\tsub rdx, " << regR[instr.dst] << std::endl;
|
|
||||||
haveSF = true;
|
|
||||||
}
|
|
||||||
if (mi.shift > 0) {
|
|
||||||
asmCode << "\tsar rdx, " << mi.shift << std::endl;
|
|
||||||
haveSF = true;
|
|
||||||
}
|
|
||||||
if (!haveSF)
|
|
||||||
asmCode << "\ttest rdx, rdx" << std::endl;
|
|
||||||
asmCode << "\tsets al" << std::endl;
|
|
||||||
asmCode << "\tadd rdx, rax" << std::endl;
|
|
||||||
asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
|
|
||||||
traceint(instr);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
tracenop(instr);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//2 uOPs
|
//2 uOPs
|
||||||
@ -570,7 +490,7 @@ namespace RandomX {
|
|||||||
INST_HANDLE(IMULH_M)
|
INST_HANDLE(IMULH_M)
|
||||||
INST_HANDLE(ISMULH_R)
|
INST_HANDLE(ISMULH_R)
|
||||||
INST_HANDLE(ISMULH_M)
|
INST_HANDLE(ISMULH_M)
|
||||||
INST_HANDLE(IDIV_C)
|
INST_HANDLE(IMUL_RCP)
|
||||||
INST_HANDLE(ISDIV_C)
|
INST_HANDLE(ISDIV_C)
|
||||||
INST_HANDLE(INEG_R)
|
INST_HANDLE(INEG_R)
|
||||||
INST_HANDLE(IXOR_R)
|
INST_HANDLE(IXOR_R)
|
||||||
|
@ -61,7 +61,7 @@ namespace RandomX {
|
|||||||
void h_IMULH_M(Instruction&, int);
|
void h_IMULH_M(Instruction&, int);
|
||||||
void h_ISMULH_R(Instruction&, int);
|
void h_ISMULH_R(Instruction&, int);
|
||||||
void h_ISMULH_M(Instruction&, int);
|
void h_ISMULH_M(Instruction&, int);
|
||||||
void h_IDIV_C(Instruction&, int);
|
void h_IMUL_RCP(Instruction&, int);
|
||||||
void h_ISDIV_C(Instruction&, int);
|
void h_ISDIV_C(Instruction&, int);
|
||||||
void h_INEG_R(Instruction&, int);
|
void h_INEG_R(Instruction&, int);
|
||||||
void h_IXOR_R(Instruction&, int);
|
void h_IXOR_R(Instruction&, int);
|
||||||
|
@ -193,7 +193,7 @@ namespace RandomX {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Instruction::h_IDIV_C(std::ostream& os) const {
|
void Instruction::h_IMUL_RCP(std::ostream& os) const {
|
||||||
os << "r" << (int)dst << ", " << imm32 << std::endl;
|
os << "r" << (int)dst << ", " << imm32 << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -345,7 +345,7 @@ namespace RandomX {
|
|||||||
INST_NAME(IMULH_M)
|
INST_NAME(IMULH_M)
|
||||||
INST_NAME(ISMULH_R)
|
INST_NAME(ISMULH_R)
|
||||||
INST_NAME(ISMULH_M)
|
INST_NAME(ISMULH_M)
|
||||||
INST_NAME(IDIV_C)
|
INST_NAME(IMUL_RCP)
|
||||||
INST_NAME(ISDIV_C)
|
INST_NAME(ISDIV_C)
|
||||||
INST_NAME(INEG_R)
|
INST_NAME(INEG_R)
|
||||||
INST_NAME(IXOR_R)
|
INST_NAME(IXOR_R)
|
||||||
@ -396,7 +396,7 @@ namespace RandomX {
|
|||||||
INST_HANDLE(IMULH_M)
|
INST_HANDLE(IMULH_M)
|
||||||
INST_HANDLE(ISMULH_R)
|
INST_HANDLE(ISMULH_R)
|
||||||
INST_HANDLE(ISMULH_M)
|
INST_HANDLE(ISMULH_M)
|
||||||
INST_HANDLE(IDIV_C)
|
INST_HANDLE(IMUL_RCP)
|
||||||
INST_HANDLE(ISDIV_C)
|
INST_HANDLE(ISDIV_C)
|
||||||
INST_HANDLE(INEG_R)
|
INST_HANDLE(INEG_R)
|
||||||
INST_HANDLE(IXOR_R)
|
INST_HANDLE(IXOR_R)
|
||||||
|
@ -41,8 +41,8 @@ namespace RandomX {
|
|||||||
constexpr int IMULH_M = 9;
|
constexpr int IMULH_M = 9;
|
||||||
constexpr int ISMULH_R = 10;
|
constexpr int ISMULH_R = 10;
|
||||||
constexpr int ISMULH_M = 11;
|
constexpr int ISMULH_M = 11;
|
||||||
constexpr int IDIV_C = 12;
|
constexpr int IMUL_RCP = 12;
|
||||||
constexpr int ISDIV_C = 13;
|
//constexpr int ISDIV_C = 13;
|
||||||
constexpr int INEG_R = 14;
|
constexpr int INEG_R = 14;
|
||||||
constexpr int IXOR_R = 15;
|
constexpr int IXOR_R = 15;
|
||||||
constexpr int IXOR_M = 16;
|
constexpr int IXOR_M = 16;
|
||||||
@ -103,7 +103,7 @@ namespace RandomX {
|
|||||||
void h_IMULH_M(std::ostream&) const;
|
void h_IMULH_M(std::ostream&) const;
|
||||||
void h_ISMULH_R(std::ostream&) const;
|
void h_ISMULH_R(std::ostream&) const;
|
||||||
void h_ISMULH_M(std::ostream&) const;
|
void h_ISMULH_M(std::ostream&) const;
|
||||||
void h_IDIV_C(std::ostream&) const;
|
void h_IMUL_RCP(std::ostream&) const;
|
||||||
void h_ISDIV_C(std::ostream&) const;
|
void h_ISDIV_C(std::ostream&) const;
|
||||||
void h_INEG_R(std::ostream&) const;
|
void h_INEG_R(std::ostream&) const;
|
||||||
void h_IXOR_R(std::ostream&) const;
|
void h_IXOR_R(std::ostream&) const;
|
||||||
|
@ -30,6 +30,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
|||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include "intrinPortable.h"
|
#include "intrinPortable.h"
|
||||||
|
#include "reciprocal.h"
|
||||||
#ifdef STATS
|
#ifdef STATS
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#endif
|
#endif
|
||||||
@ -136,7 +137,7 @@ namespace RandomX {
|
|||||||
*ibc.idst += 8 * *ibc.idst + ibc.imm;
|
*ibc.idst += 8 * *ibc.idst + ibc.imm;
|
||||||
} break;
|
} break;
|
||||||
|
|
||||||
case InstructionType::IMUL_R: {
|
case InstructionType::IMUL_R: { //also handles IMUL_RCP
|
||||||
*ibc.idst *= *ibc.isrc;
|
*ibc.idst *= *ibc.isrc;
|
||||||
} break;
|
} break;
|
||||||
|
|
||||||
@ -160,24 +161,6 @@ namespace RandomX {
|
|||||||
*ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(scratchpad + (*ibc.isrc & ibc.memMask))));
|
*ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(scratchpad + (*ibc.isrc & ibc.memMask))));
|
||||||
} break;
|
} break;
|
||||||
|
|
||||||
case InstructionType::IDIV_C: {
|
|
||||||
uint64_t dividend = *ibc.idst;
|
|
||||||
uint64_t quotient = dividend / ibc.imm;
|
|
||||||
*ibc.idst += quotient;
|
|
||||||
} break;
|
|
||||||
|
|
||||||
case InstructionType::ISDIV_C: {
|
|
||||||
if (ibc.simm != -1) {
|
|
||||||
int64_t dividend = unsigned64ToSigned2sCompl(*ibc.idst);
|
|
||||||
int64_t quotient = dividend / ibc.simm;
|
|
||||||
*ibc.idst += quotient;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
uint64_t quotient = ~(*ibc.idst) + 1;
|
|
||||||
*ibc.idst += quotient;
|
|
||||||
}
|
|
||||||
} break;
|
|
||||||
|
|
||||||
case InstructionType::INEG_R: {
|
case InstructionType::INEG_R: {
|
||||||
*ibc.idst = ~(*ibc.idst) + 1; //two's complement negative
|
*ibc.idst = ~(*ibc.idst) + 1; //two's complement negative
|
||||||
} break;
|
} break;
|
||||||
@ -568,13 +551,14 @@ namespace RandomX {
|
|||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
|
||||||
CASE_REP(IDIV_C) {
|
CASE_REP(IMUL_RCP) {
|
||||||
uint32_t divisor = instr.imm32;
|
uint32_t divisor = instr.imm32;
|
||||||
if (divisor != 0) {
|
if (divisor != 0) {
|
||||||
auto dst = instr.dst % RegistersCount;
|
auto dst = instr.dst % RegistersCount;
|
||||||
ibc.type = InstructionType::IDIV_C;
|
ibc.type = InstructionType::IMUL_R;
|
||||||
ibc.idst = &r[dst];
|
ibc.idst = &r[dst];
|
||||||
ibc.imm = divisor;
|
ibc.imm = reciprocal(divisor);
|
||||||
|
ibc.isrc = &ibc.imm;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ibc.type = InstructionType::NOP;
|
ibc.type = InstructionType::NOP;
|
||||||
@ -582,16 +566,7 @@ namespace RandomX {
|
|||||||
} break;
|
} break;
|
||||||
|
|
||||||
CASE_REP(ISDIV_C) {
|
CASE_REP(ISDIV_C) {
|
||||||
int32_t divisor = unsigned32ToSigned2sCompl(instr.imm32);
|
ibc.type = InstructionType::NOP;
|
||||||
if (divisor != 0) {
|
|
||||||
auto dst = instr.dst % RegistersCount;
|
|
||||||
ibc.type = InstructionType::ISDIV_C;
|
|
||||||
ibc.idst = &r[dst];
|
|
||||||
ibc.simm = divisor;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
ibc.type = InstructionType::NOP;
|
|
||||||
}
|
|
||||||
} break;
|
} break;
|
||||||
|
|
||||||
CASE_REP(INEG_R) {
|
CASE_REP(INEG_R) {
|
||||||
|
@ -21,7 +21,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
|||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include "JitCompilerX86.hpp"
|
#include "JitCompilerX86.hpp"
|
||||||
#include "Program.hpp"
|
#include "Program.hpp"
|
||||||
#include "divideByConstantCodegen.h"
|
#include "reciprocal.h"
|
||||||
#include "virtualMemory.hpp"
|
#include "virtualMemory.hpp"
|
||||||
|
|
||||||
namespace RandomX {
|
namespace RandomX {
|
||||||
@ -395,106 +395,17 @@ namespace RandomX {
|
|||||||
emitByte(0xc2 + 8 * instr.dst);
|
emitByte(0xc2 + 8 * instr.dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitCompilerX86::h_IDIV_C(Instruction& instr) {
|
void JitCompilerX86::h_IMUL_RCP(Instruction& instr) {
|
||||||
if (instr.imm32 != 0) {
|
if (instr.imm32 != 0) {
|
||||||
uint32_t divisor = instr.imm32;
|
emit(MOV_RAX_I);
|
||||||
if (divisor & (divisor - 1)) {
|
emit64(reciprocal(instr.imm32));
|
||||||
magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
|
emit(REX_IMUL_RM);
|
||||||
if (mi.pre_shift == 0 && !mi.increment) {
|
emitByte(0xc0 + 8 * instr.dst);
|
||||||
emit(MOV_RAX_I);
|
|
||||||
emit64(mi.multiplier);
|
|
||||||
emit(REX_MUL_R);
|
|
||||||
emitByte(0xe0 + instr.dst);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
emit(REX_MOV_RR64);
|
|
||||||
emitByte(0xc0 + instr.dst);
|
|
||||||
if (mi.pre_shift > 0) {
|
|
||||||
emit(REX_SHR_RAX);
|
|
||||||
emitByte(mi.pre_shift);
|
|
||||||
}
|
|
||||||
if (mi.increment) {
|
|
||||||
emit(RAX_ADD_SBB_1);
|
|
||||||
}
|
|
||||||
emit(MOV_RCX_I);
|
|
||||||
emit64(mi.multiplier);
|
|
||||||
emit(MUL_RCX);
|
|
||||||
}
|
|
||||||
if (mi.post_shift > 0) {
|
|
||||||
emit(REX_SHR_RDX);
|
|
||||||
emitByte(mi.post_shift);
|
|
||||||
}
|
|
||||||
emit(REX_ADD_RM);
|
|
||||||
emitByte(0xc2 + 8 * instr.dst);
|
|
||||||
}
|
|
||||||
else { //divisor is a power of two
|
|
||||||
int shift = 0;
|
|
||||||
while (divisor >>= 1)
|
|
||||||
++shift;
|
|
||||||
if (shift > 0) {
|
|
||||||
emit(REX_SH);
|
|
||||||
emitByte(0xe8 + instr.dst);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitCompilerX86::h_ISDIV_C(Instruction& instr) {
|
void JitCompilerX86::h_ISDIV_C(Instruction& instr) {
|
||||||
int64_t divisor = (int32_t)instr.imm32;
|
|
||||||
if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
|
|
||||||
emit(REX_MOV_RR64);
|
|
||||||
emitByte(0xc0 + instr.dst);
|
|
||||||
// +/- power of two
|
|
||||||
bool negative = divisor < 0;
|
|
||||||
if (negative)
|
|
||||||
divisor = -divisor;
|
|
||||||
int shift = 0;
|
|
||||||
uint64_t unsignedDivisor = divisor;
|
|
||||||
while (unsignedDivisor >>= 1)
|
|
||||||
++shift;
|
|
||||||
if (shift > 0) {
|
|
||||||
emit(MOV_RCX_RAX_SAR_RCX_63);
|
|
||||||
uint32_t mask = (1ULL << shift) - 1;
|
|
||||||
emit(AND_ECX_I);
|
|
||||||
emit32(mask);
|
|
||||||
emit(ADD_RAX_RCX);
|
|
||||||
emit(SAR_RAX_I8);
|
|
||||||
emitByte(shift);
|
|
||||||
}
|
|
||||||
if (negative)
|
|
||||||
emit(NEG_RAX);
|
|
||||||
emit(ADD_R_RAX);
|
|
||||||
emitByte(0xc0 + instr.dst);
|
|
||||||
}
|
|
||||||
else if (divisor != 0) {
|
|
||||||
magics_info mi = compute_signed_magic_info(divisor);
|
|
||||||
emit(MOV_RAX_I);
|
|
||||||
emit64(mi.multiplier);
|
|
||||||
emit(REX_MUL_R);
|
|
||||||
emitByte(0xe8 + instr.dst);
|
|
||||||
emit(XOR_EAX_EAX);
|
|
||||||
bool haveSF = false;
|
|
||||||
if (divisor > 0 && mi.multiplier < 0) {
|
|
||||||
emit(ADD_RDX_R);
|
|
||||||
emitByte(0xc2 + 8 * instr.dst);
|
|
||||||
haveSF = true;
|
|
||||||
}
|
|
||||||
if (divisor < 0 && mi.multiplier > 0) {
|
|
||||||
emit(SUB_RDX_R);
|
|
||||||
emitByte(0xc2 + 8 * instr.dst);
|
|
||||||
haveSF = true;
|
|
||||||
}
|
|
||||||
if (mi.shift > 0) {
|
|
||||||
emit(SAR_RDX_I8);
|
|
||||||
emitByte(mi.shift);
|
|
||||||
haveSF = true;
|
|
||||||
}
|
|
||||||
if (!haveSF)
|
|
||||||
emit(TEST_RDX_RDX);
|
|
||||||
emit(SETS_AL_ADD_RDX_RAX);
|
|
||||||
emit(ADD_R_RAX);
|
|
||||||
emitByte(0xc2 + 8 * instr.dst);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitCompilerX86::h_INEG_R(Instruction& instr) {
|
void JitCompilerX86::h_INEG_R(Instruction& instr) {
|
||||||
@ -748,7 +659,7 @@ namespace RandomX {
|
|||||||
INST_HANDLE(IMULH_M)
|
INST_HANDLE(IMULH_M)
|
||||||
INST_HANDLE(ISMULH_R)
|
INST_HANDLE(ISMULH_R)
|
||||||
INST_HANDLE(ISMULH_M)
|
INST_HANDLE(ISMULH_M)
|
||||||
INST_HANDLE(IDIV_C)
|
INST_HANDLE(IMUL_RCP)
|
||||||
INST_HANDLE(ISDIV_C)
|
INST_HANDLE(ISDIV_C)
|
||||||
INST_HANDLE(INEG_R)
|
INST_HANDLE(INEG_R)
|
||||||
INST_HANDLE(IXOR_R)
|
INST_HANDLE(IXOR_R)
|
||||||
|
@ -101,7 +101,7 @@ namespace RandomX {
|
|||||||
void h_IMULH_M(Instruction&);
|
void h_IMULH_M(Instruction&);
|
||||||
void h_ISMULH_R(Instruction&);
|
void h_ISMULH_R(Instruction&);
|
||||||
void h_ISMULH_M(Instruction&);
|
void h_ISMULH_M(Instruction&);
|
||||||
void h_IDIV_C(Instruction&);
|
void h_IMUL_RCP(Instruction&);
|
||||||
void h_ISDIV_C(Instruction&);
|
void h_ISDIV_C(Instruction&);
|
||||||
void h_INEG_R(Instruction&);
|
void h_INEG_R(Instruction&);
|
||||||
void h_IXOR_R(Instruction&);
|
void h_IXOR_R(Instruction&);
|
||||||
|
@ -32,8 +32,8 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
|||||||
#define WT_IMULH_M 1
|
#define WT_IMULH_M 1
|
||||||
#define WT_ISMULH_R 4
|
#define WT_ISMULH_R 4
|
||||||
#define WT_ISMULH_M 1
|
#define WT_ISMULH_M 1
|
||||||
#define WT_IDIV_C 4
|
#define WT_IMUL_RCP 8
|
||||||
#define WT_ISDIV_C 4
|
#define WT_ISDIV_C 0
|
||||||
#define WT_INEG_R 2
|
#define WT_INEG_R 2
|
||||||
#define WT_IXOR_R 16
|
#define WT_IXOR_R 16
|
||||||
#define WT_IXOR_M 4
|
#define WT_IXOR_M 4
|
||||||
@ -71,7 +71,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
|||||||
|
|
||||||
constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
|
constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
|
||||||
WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \
|
WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \
|
||||||
WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
|
WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IMUL_RCP + WT_ISDIV_C + \
|
||||||
WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
|
WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
|
||||||
WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \
|
WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \
|
||||||
WT_FSCAL_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \
|
WT_FSCAL_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \
|
||||||
|
@ -341,7 +341,7 @@ int main(int argc, char** argv) {
|
|||||||
std::cout << "Calculated result: ";
|
std::cout << "Calculated result: ";
|
||||||
result.print(std::cout);
|
result.print(std::cout);
|
||||||
if(programCount == 1000)
|
if(programCount == 1000)
|
||||||
std::cout << "Reference result: fe31e8fd7ed1cec773e87c0684b66b38e58b23ab255e8f9c6b62745e43a26851" << std::endl;
|
std::cout << "Reference result: d3ae5a9365196ed48bb98ebfc3316498e29443ea7f056ecbd272f749c6af7730" << std::endl;
|
||||||
if (!miningMode) {
|
if (!miningMode) {
|
||||||
std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl;
|
std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl;
|
||||||
}
|
}
|
||||||
|
1378
src/program.inc
1378
src/program.inc
File diff suppressed because it is too large
Load Diff
60
src/reciprocal.c
Normal file
60
src/reciprocal.c
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2019 tevador
|
||||||
|
|
||||||
|
This file is part of RandomX.
|
||||||
|
|
||||||
|
RandomX is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
RandomX is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "reciprocal.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
Calculates rcp = 2**x / divisor for highest integer x such that rcp < 2**64.
|
||||||
|
|
||||||
|
Equivalent x86 assembly (divisor in rcx):
|
||||||
|
|
||||||
|
mov edx, 1
|
||||||
|
mov r8, rcx
|
||||||
|
xor eax, eax
|
||||||
|
bsr rcx, rcx
|
||||||
|
shl rdx, cl
|
||||||
|
div r8
|
||||||
|
ret
|
||||||
|
|
||||||
|
*/
|
||||||
|
/*
	Calculates rcp = 2**(63 + n) / divisor, where n is the number of significant
	bits in divisor (index of the highest set bit plus one).

	The quotient and remainder of 2**63 / divisor are computed directly and then
	extended by n more binary digits using long division, so no arithmetic wider
	than 64 bits is needed.

	divisor == 0 has no reciprocal; 0 is returned instead of dividing by zero.

	NOTE(review): for a power-of-two divisor the exact quotient is 2**64, which
	does not fit into 64 bits, so the returned value wraps around to 0.
*/
uint64_t reciprocal(uint64_t divisor) {
	if (divisor == 0)
		return 0; //integer division by zero is undefined behavior

	const uint64_t p2exp63 = 1ULL << 63;

	uint64_t quotient = p2exp63 / divisor;
	uint64_t remainder = p2exp63 % divisor;

	unsigned bsr = 0; //number of significant bits in divisor

	for (uint64_t bit = divisor; bit > 0; bit >>= 1)
		bsr++;

	for (unsigned shift = 0; shift < bsr; shift++) {
		//same test as (2 * remainder >= divisor), written so it cannot overflow
		if (remainder >= divisor - remainder) {
			quotient = quotient * 2 + 1;
			remainder = remainder * 2 - divisor;
		}
		else {
			quotient = quotient * 2;
			remainder = remainder * 2;
		}
	}

	return quotient;
}
|
31
src/reciprocal.h
Normal file
31
src/reciprocal.h
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2019 tevador
|
||||||
|
|
||||||
|
This file is part of RandomX.
|
||||||
|
|
||||||
|
RandomX is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
RandomX is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
	Returns the 64-bit fixed-point reciprocal of the given divisor
	(rcp = 2**x / divisor); the implementation is in reciprocal.c.
	The callers in this code base only call it with a non-zero divisor.
*/
uint64_t reciprocal(uint64_t);
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
Loading…
Reference in New Issue
Block a user