mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2025-01-05 06:38:53 +00:00
Merge pull request #281 from SChernykh/fix-x18
ARM64 JIT: don't use `x18` register
This commit is contained in:
commit
d3c96482ee
@ -130,8 +130,8 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
|
|||||||
// and w16, w10, ScratchpadL3Mask64
|
// and w16, w10, ScratchpadL3Mask64
|
||||||
emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
|
emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
|
||||||
|
|
||||||
// and w17, w18, ScratchpadL3Mask64
|
// and w17, w20, ScratchpadL3Mask64
|
||||||
emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
|
emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
|
||||||
|
|
||||||
codePos = PrologueSize;
|
codePos = PrologueSize;
|
||||||
literalPos = ImulRcpLiteralsEnd;
|
literalPos = ImulRcpLiteralsEnd;
|
||||||
@ -149,16 +149,16 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Update spMix2
|
// Update spMix2
|
||||||
// eor w18, config.readReg2, config.readReg3
|
// eor w20, config.readReg2, config.readReg3
|
||||||
emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
|
emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
|
||||||
|
|
||||||
// Jump back to the main loop
|
// Jump back to the main loop
|
||||||
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
|
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
|
||||||
emit32(ARMV8A::B | (offset / 4), code, codePos);
|
emit32(ARMV8A::B | (offset / 4), code, codePos);
|
||||||
|
|
||||||
// and w18, w18, CacheLineAlignMask
|
// and w20, w20, CacheLineAlignMask
|
||||||
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
|
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
|
||||||
emit32(0x121A0000 | 18 | (18 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
|
emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
|
||||||
|
|
||||||
// and w10, w10, CacheLineAlignMask
|
// and w10, w10, CacheLineAlignMask
|
||||||
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
|
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
|
||||||
@ -181,8 +181,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration
|
|||||||
// and w16, w10, ScratchpadL3Mask64
|
// and w16, w10, ScratchpadL3Mask64
|
||||||
emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
|
emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
|
||||||
|
|
||||||
// and w17, w18, ScratchpadL3Mask64
|
// and w17, w20, ScratchpadL3Mask64
|
||||||
emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
|
emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
|
||||||
|
|
||||||
codePos = PrologueSize;
|
codePos = PrologueSize;
|
||||||
literalPos = ImulRcpLiteralsEnd;
|
literalPos = ImulRcpLiteralsEnd;
|
||||||
@ -200,8 +200,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Update spMix2
|
// Update spMix2
|
||||||
// eor w18, config.readReg2, config.readReg3
|
// eor w20, config.readReg2, config.readReg3
|
||||||
emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
|
emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
|
||||||
|
|
||||||
// Jump back to the main loop
|
// Jump back to the main loop
|
||||||
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos;
|
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos;
|
||||||
@ -434,7 +434,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emitMovImmediate(tmp_reg, imm, code, k);
|
emitMovImmediate(tmp_reg, imm, code, k);
|
||||||
|
|
||||||
// add dst, src, tmp_reg
|
// add dst, src, tmp_reg
|
||||||
@ -483,7 +483,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co
|
|||||||
uint32_t k = codePos;
|
uint32_t k = codePos;
|
||||||
|
|
||||||
uint32_t imm = instr.getImm32();
|
uint32_t imm = instr.getImm32();
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 19;
|
||||||
|
|
||||||
imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
|
imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
|
||||||
emitAddImmediate(tmp_reg, src, imm, code, k);
|
emitAddImmediate(tmp_reg, src, imm, code, k);
|
||||||
@ -537,7 +537,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos)
|
|||||||
const uint32_t src = IntRegMap[instr.src];
|
const uint32_t src = IntRegMap[instr.src];
|
||||||
const uint32_t dst = IntRegMap[instr.dst];
|
const uint32_t dst = IntRegMap[instr.dst];
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
||||||
|
|
||||||
// add dst, dst, tmp_reg
|
// add dst, dst, tmp_reg
|
||||||
@ -575,7 +575,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos)
|
|||||||
const uint32_t src = IntRegMap[instr.src];
|
const uint32_t src = IntRegMap[instr.src];
|
||||||
const uint32_t dst = IntRegMap[instr.dst];
|
const uint32_t dst = IntRegMap[instr.dst];
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
||||||
|
|
||||||
// sub dst, dst, tmp_reg
|
// sub dst, dst, tmp_reg
|
||||||
@ -594,7 +594,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos)
|
|||||||
|
|
||||||
if (src == dst)
|
if (src == dst)
|
||||||
{
|
{
|
||||||
src = 18;
|
src = 20;
|
||||||
emitMovImmediate(src, instr.getImm32(), code, k);
|
emitMovImmediate(src, instr.getImm32(), code, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -612,7 +612,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
|
|||||||
const uint32_t src = IntRegMap[instr.src];
|
const uint32_t src = IntRegMap[instr.src];
|
||||||
const uint32_t dst = IntRegMap[instr.dst];
|
const uint32_t dst = IntRegMap[instr.dst];
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
||||||
|
|
||||||
// mul dst, dst, tmp_reg
|
// mul dst, dst, tmp_reg
|
||||||
@ -643,7 +643,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos)
|
|||||||
const uint32_t src = IntRegMap[instr.src];
|
const uint32_t src = IntRegMap[instr.src];
|
||||||
const uint32_t dst = IntRegMap[instr.dst];
|
const uint32_t dst = IntRegMap[instr.dst];
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
||||||
|
|
||||||
// umulh dst, dst, tmp_reg
|
// umulh dst, dst, tmp_reg
|
||||||
@ -674,7 +674,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos)
|
|||||||
const uint32_t src = IntRegMap[instr.src];
|
const uint32_t src = IntRegMap[instr.src];
|
||||||
const uint32_t dst = IntRegMap[instr.dst];
|
const uint32_t dst = IntRegMap[instr.dst];
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
||||||
|
|
||||||
// smulh dst, dst, tmp_reg
|
// smulh dst, dst, tmp_reg
|
||||||
@ -692,7 +692,7 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos)
|
|||||||
|
|
||||||
uint32_t k = codePos;
|
uint32_t k = codePos;
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
const uint32_t dst = IntRegMap[instr.dst];
|
const uint32_t dst = IntRegMap[instr.dst];
|
||||||
|
|
||||||
constexpr uint64_t N = 1ULL << 63;
|
constexpr uint64_t N = 1ULL << 63;
|
||||||
@ -711,9 +711,9 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos)
|
|||||||
literalPos -= sizeof(uint64_t);
|
literalPos -= sizeof(uint64_t);
|
||||||
*(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor);
|
*(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor);
|
||||||
|
|
||||||
if (literal_id < 13)
|
if (literal_id < 12)
|
||||||
{
|
{
|
||||||
static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 };
|
static constexpr uint32_t literal_regs[12] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 11 << 16, 0 };
|
||||||
|
|
||||||
// mul dst, dst, literal_reg
|
// mul dst, dst, literal_reg
|
||||||
emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k);
|
emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k);
|
||||||
@ -751,7 +751,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos)
|
|||||||
|
|
||||||
if (src == dst)
|
if (src == dst)
|
||||||
{
|
{
|
||||||
src = 18;
|
src = 20;
|
||||||
emitMovImmediate(src, instr.getImm32(), code, k);
|
emitMovImmediate(src, instr.getImm32(), code, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -769,7 +769,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos)
|
|||||||
const uint32_t src = IntRegMap[instr.src];
|
const uint32_t src = IntRegMap[instr.src];
|
||||||
const uint32_t dst = IntRegMap[instr.dst];
|
const uint32_t dst = IntRegMap[instr.dst];
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
||||||
|
|
||||||
// eor dst, dst, tmp_reg
|
// eor dst, dst, tmp_reg
|
||||||
@ -807,7 +807,7 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos)
|
|||||||
|
|
||||||
if (src != dst)
|
if (src != dst)
|
||||||
{
|
{
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
|
|
||||||
// sub tmp_reg, xzr, src
|
// sub tmp_reg, xzr, src
|
||||||
emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
|
emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
|
||||||
@ -835,7 +835,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos)
|
|||||||
|
|
||||||
uint32_t k = codePos;
|
uint32_t k = codePos;
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
|
emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
|
||||||
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
|
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
|
||||||
emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
|
emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
|
||||||
@ -984,7 +984,7 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
|
|||||||
|
|
||||||
const uint32_t src = IntRegMap[instr.src];
|
const uint32_t src = IntRegMap[instr.src];
|
||||||
|
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
constexpr uint32_t fpcr_tmp_reg = 8;
|
constexpr uint32_t fpcr_tmp_reg = 8;
|
||||||
|
|
||||||
// ror tmp_reg, src, imm
|
// ror tmp_reg, src, imm
|
||||||
@ -1008,7 +1008,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos)
|
|||||||
|
|
||||||
const uint32_t src = IntRegMap[instr.src];
|
const uint32_t src = IntRegMap[instr.src];
|
||||||
const uint32_t dst = IntRegMap[instr.dst];
|
const uint32_t dst = IntRegMap[instr.dst];
|
||||||
constexpr uint32_t tmp_reg = 18;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
|
|
||||||
uint32_t imm = instr.getImm32();
|
uint32_t imm = instr.getImm32();
|
||||||
|
|
||||||
|
@ -74,9 +74,9 @@
|
|||||||
# x15 -> "r7"
|
# x15 -> "r7"
|
||||||
# x16 -> spAddr0
|
# x16 -> spAddr0
|
||||||
# x17 -> spAddr1
|
# x17 -> spAddr1
|
||||||
# x18 -> temporary
|
# x18 -> unused (platform register, don't touch it)
|
||||||
# x19 -> temporary
|
# x19 -> temporary
|
||||||
# x20 -> literal for IMUL_RCP
|
# x20 -> temporary
|
||||||
# x21 -> literal for IMUL_RCP
|
# x21 -> literal for IMUL_RCP
|
||||||
# x22 -> literal for IMUL_RCP
|
# x22 -> literal for IMUL_RCP
|
||||||
# x23 -> literal for IMUL_RCP
|
# x23 -> literal for IMUL_RCP
|
||||||
@ -111,7 +111,7 @@ DECL(randomx_program_aarch64):
|
|||||||
# Save callee-saved registers
|
# Save callee-saved registers
|
||||||
sub sp, sp, 192
|
sub sp, sp, 192
|
||||||
stp x16, x17, [sp]
|
stp x16, x17, [sp]
|
||||||
stp x18, x19, [sp, 16]
|
str x19, [sp, 16]
|
||||||
stp x20, x21, [sp, 32]
|
stp x20, x21, [sp, 32]
|
||||||
stp x22, x23, [sp, 48]
|
stp x22, x23, [sp, 48]
|
||||||
stp x24, x25, [sp, 64]
|
stp x24, x25, [sp, 64]
|
||||||
@ -166,7 +166,6 @@ DECL(randomx_program_aarch64):
|
|||||||
# Read literals
|
# Read literals
|
||||||
ldr x0, literal_x0
|
ldr x0, literal_x0
|
||||||
ldr x11, literal_x11
|
ldr x11, literal_x11
|
||||||
ldr x20, literal_x20
|
|
||||||
ldr x21, literal_x21
|
ldr x21, literal_x21
|
||||||
ldr x22, literal_x22
|
ldr x22, literal_x22
|
||||||
ldr x23, literal_x23
|
ldr x23, literal_x23
|
||||||
@ -198,11 +197,11 @@ DECL(randomx_program_aarch64):
|
|||||||
DECL(randomx_program_aarch64_main_loop):
|
DECL(randomx_program_aarch64_main_loop):
|
||||||
# spAddr0 = spMix1 & ScratchpadL3Mask64;
|
# spAddr0 = spMix1 & ScratchpadL3Mask64;
|
||||||
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
|
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
|
||||||
lsr x18, x10, 32
|
lsr x20, x10, 32
|
||||||
|
|
||||||
# Actual mask will be inserted by JIT compiler
|
# Actual mask will be inserted by JIT compiler
|
||||||
and w16, w10, 1
|
and w16, w10, 1
|
||||||
and w17, w18, 1
|
and w17, w20, 1
|
||||||
|
|
||||||
# x16 = scratchpad + spAddr0
|
# x16 = scratchpad + spAddr0
|
||||||
# x17 = scratchpad + spAddr1
|
# x17 = scratchpad + spAddr1
|
||||||
@ -210,31 +209,31 @@ DECL(randomx_program_aarch64_main_loop):
|
|||||||
add x17, x17, x2
|
add x17, x17, x2
|
||||||
|
|
||||||
# xor integer registers with scratchpad data (spAddr0)
|
# xor integer registers with scratchpad data (spAddr0)
|
||||||
ldp x18, x19, [x16]
|
ldp x20, x19, [x16]
|
||||||
eor x4, x4, x18
|
eor x4, x4, x20
|
||||||
eor x5, x5, x19
|
eor x5, x5, x19
|
||||||
ldp x18, x19, [x16, 16]
|
ldp x20, x19, [x16, 16]
|
||||||
eor x6, x6, x18
|
eor x6, x6, x20
|
||||||
eor x7, x7, x19
|
eor x7, x7, x19
|
||||||
ldp x18, x19, [x16, 32]
|
ldp x20, x19, [x16, 32]
|
||||||
eor x12, x12, x18
|
eor x12, x12, x20
|
||||||
eor x13, x13, x19
|
eor x13, x13, x19
|
||||||
ldp x18, x19, [x16, 48]
|
ldp x20, x19, [x16, 48]
|
||||||
eor x14, x14, x18
|
eor x14, x14, x20
|
||||||
eor x15, x15, x19
|
eor x15, x15, x19
|
||||||
|
|
||||||
# Load group F registers (spAddr1)
|
# Load group F registers (spAddr1)
|
||||||
ldpsw x18, x19, [x17]
|
ldpsw x20, x19, [x17]
|
||||||
ins v16.d[0], x18
|
ins v16.d[0], x20
|
||||||
ins v16.d[1], x19
|
ins v16.d[1], x19
|
||||||
ldpsw x18, x19, [x17, 8]
|
ldpsw x20, x19, [x17, 8]
|
||||||
ins v17.d[0], x18
|
ins v17.d[0], x20
|
||||||
ins v17.d[1], x19
|
ins v17.d[1], x19
|
||||||
ldpsw x18, x19, [x17, 16]
|
ldpsw x20, x19, [x17, 16]
|
||||||
ins v18.d[0], x18
|
ins v18.d[0], x20
|
||||||
ins v18.d[1], x19
|
ins v18.d[1], x19
|
||||||
ldpsw x18, x19, [x17, 24]
|
ldpsw x20, x19, [x17, 24]
|
||||||
ins v19.d[0], x18
|
ins v19.d[0], x20
|
||||||
ins v19.d[1], x19
|
ins v19.d[1], x19
|
||||||
scvtf v16.2d, v16.2d
|
scvtf v16.2d, v16.2d
|
||||||
scvtf v17.2d, v17.2d
|
scvtf v17.2d, v17.2d
|
||||||
@ -242,17 +241,17 @@ DECL(randomx_program_aarch64_main_loop):
|
|||||||
scvtf v19.2d, v19.2d
|
scvtf v19.2d, v19.2d
|
||||||
|
|
||||||
# Load group E registers (spAddr1)
|
# Load group E registers (spAddr1)
|
||||||
ldpsw x18, x19, [x17, 32]
|
ldpsw x20, x19, [x17, 32]
|
||||||
ins v20.d[0], x18
|
ins v20.d[0], x20
|
||||||
ins v20.d[1], x19
|
ins v20.d[1], x19
|
||||||
ldpsw x18, x19, [x17, 40]
|
ldpsw x20, x19, [x17, 40]
|
||||||
ins v21.d[0], x18
|
ins v21.d[0], x20
|
||||||
ins v21.d[1], x19
|
ins v21.d[1], x19
|
||||||
ldpsw x18, x19, [x17, 48]
|
ldpsw x20, x19, [x17, 48]
|
||||||
ins v22.d[0], x18
|
ins v22.d[0], x20
|
||||||
ins v22.d[1], x19
|
ins v22.d[1], x19
|
||||||
ldpsw x18, x19, [x17, 56]
|
ldpsw x20, x19, [x17, 56]
|
||||||
ins v23.d[0], x18
|
ins v23.d[0], x20
|
||||||
ins v23.d[1], x19
|
ins v23.d[1], x19
|
||||||
scvtf v20.2d, v20.2d
|
scvtf v20.2d, v20.2d
|
||||||
scvtf v21.2d, v21.2d
|
scvtf v21.2d, v21.2d
|
||||||
@ -276,7 +275,6 @@ DECL(randomx_program_aarch64_vm_instructions):
|
|||||||
|
|
||||||
literal_x0: .fill 1,8,0
|
literal_x0: .fill 1,8,0
|
||||||
literal_x11: .fill 1,8,0
|
literal_x11: .fill 1,8,0
|
||||||
literal_x20: .fill 1,8,0
|
|
||||||
literal_x21: .fill 1,8,0
|
literal_x21: .fill 1,8,0
|
||||||
literal_x22: .fill 1,8,0
|
literal_x22: .fill 1,8,0
|
||||||
literal_x23: .fill 1,8,0
|
literal_x23: .fill 1,8,0
|
||||||
@ -312,17 +310,17 @@ DECL(randomx_program_aarch64_vm_instructions_end):
|
|||||||
lsr x10, x9, 32
|
lsr x10, x9, 32
|
||||||
|
|
||||||
# mx ^= r[readReg2] ^ r[readReg3];
|
# mx ^= r[readReg2] ^ r[readReg3];
|
||||||
eor x9, x9, x18
|
eor x9, x9, x20
|
||||||
|
|
||||||
# Calculate dataset pointer for dataset prefetch
|
# Calculate dataset pointer for dataset prefetch
|
||||||
mov w18, w9
|
mov w20, w9
|
||||||
DECL(randomx_program_aarch64_cacheline_align_mask1):
|
DECL(randomx_program_aarch64_cacheline_align_mask1):
|
||||||
# Actual mask will be inserted by JIT compiler
|
# Actual mask will be inserted by JIT compiler
|
||||||
and x18, x18, 1
|
and x20, x20, 1
|
||||||
add x18, x18, x1
|
add x20, x20, x1
|
||||||
|
|
||||||
# Prefetch dataset data
|
# Prefetch dataset data
|
||||||
prfm pldl2strm, [x18]
|
prfm pldl2strm, [x20]
|
||||||
|
|
||||||
# mx <-> ma
|
# mx <-> ma
|
||||||
ror x9, x9, 32
|
ror x9, x9, 32
|
||||||
@ -335,17 +333,17 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
|
|||||||
DECL(randomx_program_aarch64_xor_with_dataset_line):
|
DECL(randomx_program_aarch64_xor_with_dataset_line):
|
||||||
rx_program_xor_with_dataset_line:
|
rx_program_xor_with_dataset_line:
|
||||||
# xor integer registers with dataset data
|
# xor integer registers with dataset data
|
||||||
ldp x18, x19, [x10]
|
ldp x20, x19, [x10]
|
||||||
eor x4, x4, x18
|
eor x4, x4, x20
|
||||||
eor x5, x5, x19
|
eor x5, x5, x19
|
||||||
ldp x18, x19, [x10, 16]
|
ldp x20, x19, [x10, 16]
|
||||||
eor x6, x6, x18
|
eor x6, x6, x20
|
||||||
eor x7, x7, x19
|
eor x7, x7, x19
|
||||||
ldp x18, x19, [x10, 32]
|
ldp x20, x19, [x10, 32]
|
||||||
eor x12, x12, x18
|
eor x12, x12, x20
|
||||||
eor x13, x13, x19
|
eor x13, x13, x19
|
||||||
ldp x18, x19, [x10, 48]
|
ldp x20, x19, [x10, 48]
|
||||||
eor x14, x14, x18
|
eor x14, x14, x20
|
||||||
eor x15, x15, x19
|
eor x15, x15, x19
|
||||||
|
|
||||||
DECL(randomx_program_aarch64_update_spMix1):
|
DECL(randomx_program_aarch64_update_spMix1):
|
||||||
@ -388,7 +386,7 @@ DECL(randomx_program_aarch64_update_spMix1):
|
|||||||
|
|
||||||
# Restore callee-saved registers
|
# Restore callee-saved registers
|
||||||
ldp x16, x17, [sp]
|
ldp x16, x17, [sp]
|
||||||
ldp x18, x19, [sp, 16]
|
ldr x19, [sp, 16]
|
||||||
ldp x20, x21, [sp, 32]
|
ldp x20, x21, [sp, 32]
|
||||||
ldp x22, x23, [sp, 48]
|
ldp x22, x23, [sp, 48]
|
||||||
ldp x24, x25, [sp, 64]
|
ldp x24, x25, [sp, 64]
|
||||||
@ -409,7 +407,7 @@ DECL(randomx_program_aarch64_vm_instructions_end_light):
|
|||||||
stp x2, x30, [sp, 80]
|
stp x2, x30, [sp, 80]
|
||||||
|
|
||||||
# mx ^= r[readReg2] ^ r[readReg3];
|
# mx ^= r[readReg2] ^ r[readReg3];
|
||||||
eor x9, x9, x18
|
eor x9, x9, x20
|
||||||
|
|
||||||
# mx <-> ma
|
# mx <-> ma
|
||||||
ror x9, x9, 32
|
ror x9, x9, 32
|
||||||
@ -451,8 +449,8 @@ DECL(randomx_program_aarch64_light_dataset_offset):
|
|||||||
# x3 -> end item
|
# x3 -> end item
|
||||||
|
|
||||||
DECL(randomx_init_dataset_aarch64):
|
DECL(randomx_init_dataset_aarch64):
|
||||||
# Save x30 (return address)
|
# Save x20 (used as a temporary; it is a callee-saved register, so it must be preserved to comply with the ABI) and x30 (return address)
|
||||||
str x30, [sp, -16]!
|
stp x20, x30, [sp, -16]!
|
||||||
|
|
||||||
# Load pointer to cache memory
|
# Load pointer to cache memory
|
||||||
ldr x0, [x0]
|
ldr x0, [x0]
|
||||||
@ -464,8 +462,8 @@ DECL(randomx_init_dataset_aarch64_main_loop):
|
|||||||
cmp x2, x3
|
cmp x2, x3
|
||||||
bne DECL(randomx_init_dataset_aarch64_main_loop)
|
bne DECL(randomx_init_dataset_aarch64_main_loop)
|
||||||
|
|
||||||
# Restore x30 (return address)
|
# Restore x20 and x30
|
||||||
ldr x30, [sp], 16
|
ldp x20, x30, [sp], 16
|
||||||
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user