Mirror of https://git.wownero.com/wownero/RandomWOW.git (synced 2024-12-22 15:58:53 +00:00)
Abstracted away from x86 intrinsics

parent 3dd21ea93d, commit 1aa7865619
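The change is mechanical but broad: every direct x86 intrinsic (`_mm_*`) call is replaced by an `rx_*` alias, and intrin_portable.h maps each alias either to the SSE2 intrinsic or to a plain-C++ fallback. A minimal self-contained sketch of that dispatch pattern (illustrative names, not the actual RandomX identifiers):

	#include <cstdint>

	#if defined(__SSE2__)
	#include <emmintrin.h>
	typedef __m128i vec_i128;                  // hardware 128-bit vector
	#define vec_xor(a, b) _mm_xor_si128(a, b)  // compiles to a single PXOR
	#else
	typedef union {                            // portable stand-in, same layout
		uint64_t u64[2];
		uint32_t u32[4];
	} vec_i128;

	inline vec_i128 vec_xor(vec_i128 a, vec_i128 b) {
		vec_i128 c;
		c.u64[0] = a.u64[0] ^ b.u64[0];
		c.u64[1] = a.u64[1] ^ b.u64[1];
		return c;
	}
	#endif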
@@ -36,21 +36,21 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 	const uint8_t* inptr = (uint8_t*)input;
 	const uint8_t* inputEnd = inptr + inputSize;
 
-	__m128i state0, state1, state2, state3;
-	__m128i in0, in1, in2, in3;
+	rx_vec_i128 state0, state1, state2, state3;
+	rx_vec_i128 in0, in1, in2, in3;
 
 	//intial state
-	state0 = _mm_set_epi32(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00);
-	state1 = _mm_set_epi32(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b);
-	state2 = _mm_set_epi32(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a);
-	state3 = _mm_set_epi32(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a);
+	state0 = rx_set_int_vec_i128(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00);
+	state1 = rx_set_int_vec_i128(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b);
+	state2 = rx_set_int_vec_i128(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a);
+	state3 = rx_set_int_vec_i128(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a);
 
 	//process 64 bytes at a time in 4 lanes
 	while (inptr < inputEnd) {
-		in0 = _mm_load_si128((__m128i*)inptr + 0);
-		in1 = _mm_load_si128((__m128i*)inptr + 1);
-		in2 = _mm_load_si128((__m128i*)inptr + 2);
-		in3 = _mm_load_si128((__m128i*)inptr + 3);
+		in0 = rx_load_vec_i128((rx_vec_i128*)inptr + 0);
+		in1 = rx_load_vec_i128((rx_vec_i128*)inptr + 1);
+		in2 = rx_load_vec_i128((rx_vec_i128*)inptr + 2);
+		in3 = rx_load_vec_i128((rx_vec_i128*)inptr + 3);
 
 		state0 = aesenc<softAes>(state0, in0);
 		state1 = aesdec<softAes>(state1, in1);
@@ -61,8 +61,8 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 	}
 
 	//two extra rounds to achieve full diffusion
-	__m128i xkey0 = _mm_set_epi32(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247);
-	__m128i xkey1 = _mm_set_epi32(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95);
+	rx_vec_i128 xkey0 = rx_set_int_vec_i128(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247);
+	rx_vec_i128 xkey1 = rx_set_int_vec_i128(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95);
 
 	state0 = aesenc<softAes>(state0, xkey0);
 	state1 = aesdec<softAes>(state1, xkey0);
@@ -75,10 +75,10 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 	state3 = aesdec<softAes>(state3, xkey1);
 
 	//output hash
-	_mm_store_si128((__m128i*)hash + 0, state0);
-	_mm_store_si128((__m128i*)hash + 1, state1);
-	_mm_store_si128((__m128i*)hash + 2, state2);
-	_mm_store_si128((__m128i*)hash + 3, state3);
+	rx_store_vec_i128((rx_vec_i128*)hash + 0, state0);
+	rx_store_vec_i128((rx_vec_i128*)hash + 1, state1);
+	rx_store_vec_i128((rx_vec_i128*)hash + 2, state2);
+	rx_store_vec_i128((rx_vec_i128*)hash + 3, state3);
 }
 
 template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash);
@@ -99,18 +99,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
 	const uint8_t* outptr = (uint8_t*)buffer;
 	const uint8_t* outputEnd = outptr + outputSize;
 
-	__m128i state0, state1, state2, state3;
-	__m128i key0, key1, key2, key3;
+	rx_vec_i128 state0, state1, state2, state3;
+	rx_vec_i128 key0, key1, key2, key3;
 
-	key0 = _mm_set_epi32(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d);
-	key1 = _mm_set_epi32(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0);
-	key2 = _mm_set_epi32(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52);
-	key3 = _mm_set_epi32(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3);
+	key0 = rx_set_int_vec_i128(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d);
+	key1 = rx_set_int_vec_i128(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0);
+	key2 = rx_set_int_vec_i128(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52);
+	key3 = rx_set_int_vec_i128(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3);
 
-	state0 = _mm_load_si128((__m128i*)state + 0);
-	state1 = _mm_load_si128((__m128i*)state + 1);
-	state2 = _mm_load_si128((__m128i*)state + 2);
-	state3 = _mm_load_si128((__m128i*)state + 3);
+	state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
+	state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
+	state2 = rx_load_vec_i128((rx_vec_i128*)state + 2);
+	state3 = rx_load_vec_i128((rx_vec_i128*)state + 3);
 
 	while (outptr < outputEnd) {
 		state0 = aesdec<softAes>(state0, key0);
@@ -118,18 +118,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
 		state2 = aesdec<softAes>(state2, key2);
 		state3 = aesenc<softAes>(state3, key3);
 
-		_mm_store_si128((__m128i*)outptr + 0, state0);
-		_mm_store_si128((__m128i*)outptr + 1, state1);
-		_mm_store_si128((__m128i*)outptr + 2, state2);
-		_mm_store_si128((__m128i*)outptr + 3, state3);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3);
 
 		outptr += 64;
 	}
 
-	_mm_store_si128((__m128i*)state + 0, state0);
-	_mm_store_si128((__m128i*)state + 1, state1);
-	_mm_store_si128((__m128i*)state + 2, state2);
-	_mm_store_si128((__m128i*)state + 3, state3);
+	rx_store_vec_i128((rx_vec_i128*)state + 0, state0);
+	rx_store_vec_i128((rx_vec_i128*)state + 1, state1);
+	rx_store_vec_i128((rx_vec_i128*)state + 2, state2);
+	rx_store_vec_i128((rx_vec_i128*)state + 3, state3);
 }
 
 template void fillAes1Rx4<true>(void *state, size_t outputSize, void *buffer);
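For orientation, the call-site shape of the two functions above. The buffer sizes here are hypothetical, but the 16-byte alignment and 64-byte multiple follow the 4x16-byte lane loop in the diff; this sketch links against the definitions shown above rather than standing alone:

	#include <cstddef>
	#include <cstdint>

	template<bool softAes>
	void hashAes1Rx4(const void* input, size_t inputSize, void* hash);

	void example() {
		alignas(16) uint8_t scratchpad[2048] = {}; // multiple of 64 bytes
		alignas(16) uint8_t digest[64];            // 4 x 16-byte output lanes
		hashAes1Rx4<true>(scratchpad, sizeof(scratchpad), digest); // softAes path
	}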
@@ -27,7 +27,7 @@ namespace randomx {
 
 	template<size_t alignment>
 	void* AlignedAllocator<alignment>::allocMemory(size_t count) {
-		void *mem = _mm_malloc(count, alignment);
+		void *mem = rx_aligned_alloc(count, alignment);
 		if (mem == nullptr)
 			throw std::bad_alloc();
 		return mem;
@@ -35,11 +35,10 @@ namespace randomx {
 
 	template<size_t alignment>
 	void AlignedAllocator<alignment>::freeMemory(void* ptr, size_t count) {
-		_mm_free(ptr);
+		rx_aligned_free(ptr);
 	}
 
 	template class AlignedAllocator<CacheLineSize>;
-	template class AlignedAllocator<sizeof(__m128i)>;;
 
 	void* LargePageAllocator::allocMemory(size_t count) {
 		return allocLargePagesMemory(count);
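After this hunk the allocator no longer needs <xmmintrin.h>. A usage sketch (hypothetical sizes; declarations as in the diff, definitions link from the file above):

	#include <cstddef>

	namespace randomx {
		template<size_t alignment>
		struct AlignedAllocator {
			static void* allocMemory(size_t count);
			static void freeMemory(void* ptr, size_t count);
		};
	}

	void example() {
		void* p = randomx::AlignedAllocator<64>::allocMemory(1 << 20);
		randomx::AlignedAllocator<64>::freeMemory(p, 1 << 20);
	}

One caveat worth noting: the portable mapping introduced by this commit (rx_aligned_alloc expanding to plain malloc) ignores the alignment argument, which is only safe on targets whose malloc alignment already satisfies the request.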
@@ -148,7 +148,7 @@ namespace randomx {
 		rl[7] = rl[0] ^ superscalarAdd7;
 		for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
 			mixBlock = getMixBlock(registerValue, cache->memory);
-			PREFETCHNTA(mixBlock);
+			rx_prefetch_nta(mixBlock);
 			SuperscalarProgram& prog = cache->programs[i];
 
 			executeSuperscalar(rl, prog, &cache->reciprocalCache);
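rx_prefetch_nta is a pure performance hint (PREFETCHNTA on x86, nothing on the portable path), so eliding it cannot change results. The mapping, matching the defines added later in this commit:

	#if defined(__SSE2__)
	#include <xmmintrin.h>
	#define rx_prefetch_nta(x) _mm_prefetch((const char*)(x), _MM_HINT_NTA)
	#else
	#define rx_prefetch_nta(x)  // hint only; a no-op fallback is safe
	#endif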
@@ -123,8 +123,14 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
 #define HAVE_SMULH
 #endif
 
-void setRoundMode(uint32_t rcflag) {
-	switch (rcflag & 3) {
+#ifdef RANDOMX_DEFAULT_FENV
+
+void rx_reset_float_state() {
+	setRoundMode_(FE_TONEAREST);
+}
+
+void rx_set_rounding_mode(uint32_t mode) {
+	switch (mode & 3) {
 	case RoundDown:
 		setRoundMode_(FE_DOWNWARD);
 		break;
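On this non-SSE2 path the rounding helpers go through <cfenv>. setRoundMode_ is not shown in the hunk; it is assumed here to be a thin wrapper over std::fesetround, the standard portable way to select FE_TONEAREST/FE_DOWNWARD/FE_UPWARD/FE_TOWARDZERO:

	#include <cfenv>

	// Assumed shape of the helper the diff calls (not part of this hunk):
	static void setRoundMode_(int mode) {
		std::fesetround(mode); // e.g. FE_TONEAREST or FE_DOWNWARD
	}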
@@ -142,13 +148,7 @@ void setRoundMode(uint32_t rcflag) {
 	}
 }
 
-void initFpu() {
-#ifdef __SSE2__
-	_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
-#else
-	setRoundMode(FE_TONEAREST);
 #endif
-}
 
 union double_ser_t {
 	double f;
@@ -20,6 +20,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
 #pragma once
 
 #include <cstdint>
+#include "blake2/endian.h"
 
 constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
 	return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
@@ -33,6 +34,11 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
 	return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x);
 }
 
+constexpr int RoundToNearest = 0;
+constexpr int RoundDown = 1;
+constexpr int RoundUp = 2;
+constexpr int RoundToZero = 3;
+
 #if defined(_MSC_VER)
 #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
 #define __SSE2__ 1
@@ -46,185 +52,230 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
 #include <intrin.h>
 #endif
 
-#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
+typedef __m128i rx_vec_i128;
+typedef __m128d rx_vec_f128;
+
+#define rx_aligned_alloc(a, b) _mm_malloc(a,b)
+#define rx_aligned_free(a) _mm_free(a)
+#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
+
+#define rx_load_vec_f128 _mm_load_pd
+#define rx_store_vec_f128 _mm_store_pd
+#define rx_shuffle_vec_f128 _mm_shuffle_pd
+#define rx_add_vec_f128 _mm_add_pd
+#define rx_sub_vec_f128 _mm_sub_pd
+#define rx_mul_vec_f128 _mm_mul_pd
+#define rx_div_vec_f128 _mm_div_pd
+#define rx_sqrt_vec_f128 _mm_sqrt_pd
+#define rx_set1_long_vec_i128 _mm_set1_epi64x
+#define rx_vec_i128_vec_f128 _mm_castsi128_pd
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	return _mm_castsi128_pd(_mm_set_epi64x(x1, x0));
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	return _mm_castsi128_pd(_mm_set1_epi64x(x));
+}
+
+#define rx_xor_vec_f128 _mm_xor_pd
+#define rx_and_vec_f128 _mm_and_pd
+#define rx_or_vec_f128 _mm_or_pd
+#define rx_aesenc_vec_i128 _mm_aesenc_si128
+#define rx_aesdec_vec_i128 _mm_aesdec_si128
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	return _mm_cvtsi128_si32(a);
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa));
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff));
+}
+
+#define rx_set_int_vec_i128 _mm_set_epi32
+#define rx_xor_vec_i128 _mm_xor_si128
+#define rx_load_vec_i128 _mm_load_si128
+#define rx_store_vec_i128 _mm_store_si128
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	__m128i ix = _mm_loadl_epi64((const __m128i*)addr);
+	return _mm_cvtepi32_pd(ix);
+}
+
+constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
+
+FORCE_INLINE void rx_reset_float_state() {
+	_mm_setcsr(rx_mxcsr_default);
+}
+
+FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
+	_mm_setcsr(rx_mxcsr_default | (mode << 13));
+}
+
 #else
 #include <cstdint>
 #include <stdexcept>
 #include <cstdlib>
 #include <cmath>
-#include "blake2/endian.h"
 
-#define _mm_malloc(a,b) malloc(a)
-#define _mm_free(a) free(a)
-#define PREFETCHNTA(x)
 
 typedef union {
 	uint64_t u64[2];
 	uint32_t u32[4];
 	uint16_t u16[8];
 	uint8_t u8[16];
-} __m128i;
+} rx_vec_i128;
 
 typedef union {
 	struct {
 		double lo;
 		double hi;
 	};
-	__m128i i;
-} __m128d;
+	rx_vec_i128 i;
+} rx_vec_f128;
 
-inline __m128d _mm_load_pd(const double* pd) {
-	__m128d x;
+#define rx_aligned_alloc(a, b) malloc(a)
+#define rx_aligned_free(a) free(a)
+#define rx_prefetch_nta(x)
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
+	rx_vec_f128 x;
 	x.i.u64[0] = load64(pd + 0);
 	x.i.u64[1] = load64(pd + 1);
 	return x;
 }
 
-inline void _mm_store_pd(double* mem_addr, __m128d a) {
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
 	store64(mem_addr + 0, a.i.u64[0]);
 	store64(mem_addr + 1, a.i.u64[1]);
 }
 
-inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_shuffle_vec_f128(rx_vec_f128 a, rx_vec_f128 b, int imm8) {
+	rx_vec_f128 x;
 	x.lo = (imm8 & 1) ? a.hi : a.lo;
 	x.hi = (imm8 & 2) ? b.hi : b.lo;
 	return x;
 }
 
-inline __m128d _mm_add_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
 	x.lo = a.lo + b.lo;
 	x.hi = a.hi + b.hi;
 	return x;
 }
 
-inline __m128d _mm_sub_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
 	x.lo = a.lo - b.lo;
 	x.hi = a.hi - b.hi;
 	return x;
 }
 
-inline __m128d _mm_mul_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
 	x.lo = a.lo * b.lo;
 	x.hi = a.hi * b.hi;
 	return x;
 }
 
-inline __m128d _mm_div_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
	rx_vec_f128 x;
 	x.lo = a.lo / b.lo;
 	x.hi = a.hi / b.hi;
 	return x;
 }
 
-inline __m128d _mm_sqrt_pd(__m128d a) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
+	rx_vec_f128 x;
 	x.lo = sqrt(a.lo);
 	x.hi = sqrt(a.hi);
 	return x;
 }
 
-inline __m128i _mm_set1_epi64x(uint64_t a) {
-	__m128i x;
+FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
+	rx_vec_i128 x;
 	x.u64[0] = a;
 	x.u64[1] = a;
 	return x;
 }
 
-inline __m128d _mm_castsi128_pd(__m128i a) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
+	rx_vec_f128 x;
 	x.i = a;
 	return x;
 }
 
-inline __m128d _mm_abs(__m128d xd) {
-	xd.lo = std::fabs(xd.lo);
-	xd.hi = std::fabs(xd.hi);
-	return xd;
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	rx_vec_f128 v;
+	v.i.u64[0] = x0;
+	v.i.u64[1] = x1;
+	return v;
 }
 
-inline __m128d _mm_xor_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	rx_vec_f128 v;
+	v.i.u64[0] = x;
+	v.i.u64[1] = x;
+	return v;
+}
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1];
 	return x;
 }
 
-inline __m128d _mm_and_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] & b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] & b.i.u64[1];
 	return x;
 }
 
-inline __m128d _mm_or_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] | b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] | b.i.u64[1];
 	return x;
 }
 
-inline __m128d _mm_set_pd(double e1, double e0) {
-	__m128d x;
-	x.lo = e0;
-	x.hi = e1;
-	return x;
-}
-
-inline __m128d _mm_max_pd(__m128d a, __m128d b) {
-	__m128d x;
-	x.lo = a.lo > b.lo ? a.lo : b.lo;
-	x.hi = a.hi > b.hi ? a.hi : b.hi;
-	return x;
-}
-
-inline __m128d _mm_cvtepi32_pd(__m128i a) {
-	__m128d x;
-	x.lo = (double)unsigned32ToSigned2sCompl(a.u32[0]);
-	x.hi = (double)unsigned32ToSigned2sCompl(a.u32[1]);
-	return x;
-}
-
 static const char* platformError = "Platform doesn't support hardware AES";
 
-inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
 	throw std::runtime_error(platformError);
 }
 
-inline __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) {
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
 	throw std::runtime_error(platformError);
 }
 
-inline __m128i _mm_aesdec_si128(__m128i v, __m128i rkey) {
-	throw std::runtime_error(platformError);
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	return a.u32[0];
 }
 
-inline int _mm_cvtsi128_si32(__m128i v) {
-	return v.u32[0];
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	return a.u32[1];
 }
 
-inline __m128i _mm_cvtsi32_si128(int si32) {
-	__m128i v;
-	v.u32[0] = si32;
-	v.u32[1] = 0;
-	v.u32[2] = 0;
-	v.u32[3] = 0;
-	return v;
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	return a.u32[2];
 }
 
-inline __m128i _mm_set_epi64x(int64_t _I1, int64_t _I0) {
-	__m128i v;
-	v.u64[0] = _I0;
-	v.u64[1] = _I1;
-	return v;
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	return a.u32[3];
 }
 
-inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
-	__m128i v;
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	rx_vec_i128 v;
 	v.u32[0] = _I0;
 	v.u32[1] = _I1;
 	v.u32[2] = _I2;
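The SSE branch installs rounding modes with _mm_setcsr(rx_mxcsr_default | (mode << 13)). The shift works because MXCSR bits 13-14 are the rounding-control field, and the RoundToNearest..RoundToZero values 0..3 match its encoding (00 nearest, 01 down, 10 up, 11 toward zero). A quick check:

	#include <cstdint>
	#include <cstdio>

	int main() {
		const uint32_t mxcsrDefault = 0x9FC0; // FTZ/DAZ set, exceptions masked
		for (uint32_t mode = 0; mode < 4; ++mode) {
			uint32_t csr = mxcsrDefault | (mode << 13);
			printf("mode %u -> MXCSR 0x%04X, RC field %u\n",
			       mode, csr, (csr >> 13) & 3);
		}
	}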
@@ -232,8 +283,8 @@ inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
 	return v;
 };
 
-inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
-	__m128i c;
+FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
+	rx_vec_i128 c;
 	c.u32[0] = _A.u32[0] ^ _B.u32[0];
 	c.u32[1] = _A.u32[1] ^ _B.u32[1];
 	c.u32[2] = _A.u32[2] ^ _B.u32[2];
@@ -241,21 +292,12 @@ inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
 	return c;
 }
 
-inline __m128i _mm_shuffle_epi32(__m128i _A, int _Imm) {
-	__m128i c;
-	c.u32[0] = _A.u32[_Imm & 3];
-	c.u32[1] = _A.u32[(_Imm >> 2) & 3];
-	c.u32[2] = _A.u32[(_Imm >> 4) & 3];
-	c.u32[3] = _A.u32[(_Imm >> 6) & 3];
-	return c;
-}
-
-inline __m128i _mm_load_si128(__m128i const*_P) {
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const*_P) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	return *_P;
 #else
 	uint32_t* ptr = (uint32_t*)_P;
-	__m128i c;
+	rx_vec_i128 c;
 	c.u32[0] = load32(ptr + 0);
 	c.u32[1] = load32(ptr + 1);
 	c.u32[2] = load32(ptr + 2);
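The NATIVE_LITTLE_ENDIAN split above is what keeps hashes byte-identical on big-endian machines: each 32-bit lane is read explicitly as little-endian. load32 comes from blake2/endian.h; its portable form is equivalent to:

	#include <cstdint>
	#include <cstring>

	static uint32_t load32(const void* src) {
		uint8_t b[4];
		memcpy(b, src, 4);             // byte-order-independent read
		return (uint32_t)b[0] | ((uint32_t)b[1] << 8)
		     | ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
	}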
@@ -264,7 +306,7 @@ inline __m128i _mm_load_si128(__m128i const*_P) {
 #endif
 }
 
-inline void _mm_store_si128(__m128i *_P, __m128i _B) {
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	*_P = _B;
 #else
@@ -276,46 +318,23 @@ inline void _mm_store_si128(__m128i *_P, __m128i _B) {
 #endif
 }
 
-inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {
-	_Imm &= 255;
-	if (_Imm > 15) {
-		_A.u64[0] = 0;
-		_A.u64[1] = 0;
-	}
-	else {
-		for (int i = 15; i >= _Imm; --i) {
-			_A.u8[i] = _A.u8[i - _Imm];
-		}
-		for (int i = 0; i < _Imm; ++i) {
-			_A.u8[i] = 0;
-		}
-	}
-	return _A;
-}
-
-inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) {
-	__m128i x;
-	x.u32[0] = load32((uint8_t*)mem_addr + 0);
-	x.u32[1] = load32((uint8_t*)mem_addr + 4);
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	rx_vec_f128 x;
+	x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
+	x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
 	return x;
 }
 
+#define RANDOMX_DEFAULT_FENV
+
+void rx_reset_float_state();
+
+void rx_set_rounding_mode(uint32_t mode);
+
 #endif
 
-constexpr int RoundToNearest = 0;
-constexpr int RoundDown = 1;
-constexpr int RoundUp = 2;
-constexpr int RoundToZero = 3;
-
-inline __m128d load_cvt_i32x2(const void* addr) {
-	__m128i ix = _mm_loadl_epi64((const __m128i*)addr);
-	return _mm_cvtepi32_pd(ix);
-}
-
 double loadDoublePortable(const void* addr);
 uint64_t mulh(uint64_t, uint64_t);
 int64_t smulh(int64_t, int64_t);
 uint64_t rotl(uint64_t, int);
 uint64_t rotr(uint64_t, int);
-void initFpu();
-void setRoundMode(uint32_t);
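rx_cvt_packed_int_vec_f128 replaces the old load_cvt_i32x2 helper: it reads two packed signed 32-bit integers and widens each to a double (CVTDQ2PD on the SSE2 side). In scalar terms, and noting that every int32 is exactly representable as a double:

	#include <cstdint>
	#include <cstdio>

	int main() {
		int32_t src[2] = { -7, 42 };
		double lo = (double)src[0]; // -7.0
		double hi = (double)src[1]; // 42.0
		printf("%f %f\n", lo, hi);
	}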
@@ -318,38 +318,38 @@ alignas(16) const uint32_t lutDec3[256] = {
 	0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8,
 };
 
-__m128i soft_aesenc(__m128i in, __m128i key) {
+rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) {
 	uint32_t s0, s1, s2, s3;
 
-	s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff));
-	s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa));
-	s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
-	s3 = _mm_cvtsi128_si32(in);
+	s0 = rx_vec_i128_w(in);
+	s1 = rx_vec_i128_z(in);
+	s2 = rx_vec_i128_y(in);
+	s3 = rx_vec_i128_x(in);
 
-	__m128i out = _mm_set_epi32(
+	rx_vec_i128 out = rx_set_int_vec_i128(
 		(lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]),
 		(lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]),
 		(lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]),
 		(lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24])
 	);
 
-	return _mm_xor_si128(out, key);
+	return rx_xor_vec_i128(out, key);
 }
 
-__m128i soft_aesdec(__m128i in, __m128i key) {
+rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) {
 	uint32_t s0, s1, s2, s3;
 
-	s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff));
-	s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa));
-	s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
-	s3 = _mm_cvtsi128_si32(in);
+	s0 = rx_vec_i128_w(in);
+	s1 = rx_vec_i128_z(in);
+	s2 = rx_vec_i128_y(in);
+	s3 = rx_vec_i128_x(in);
 
-	__m128i out = _mm_set_epi32(
+	rx_vec_i128 out = rx_set_int_vec_i128(
 		(lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]),
 		(lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]),
 		(lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]),
 		(lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24])
 	);
 
-	return _mm_xor_si128(out, key);
+	return rx_xor_vec_i128(out, key);
 }
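The accessor swap above is behavior-preserving: _mm_shuffle_epi32(in, 0xff) broadcasts lane 3 into every lane and _mm_cvtsi128_si32 then reads lane 0, so the pair is just an element read, which is exactly what rx_vec_i128_w does. The same holds for 0xaa/lane 2 (z), 0x55/lane 1 (y), and the plain cvtsi/lane 0 (x):

	#include <cassert>
	#include <cstdint>

	typedef union { uint64_t u64[2]; uint32_t u32[4]; } vec_i128; // portable layout

	int main() {
		vec_i128 v = {};
		v.u32[0] = 10; v.u32[1] = 20; v.u32[2] = 30; v.u32[3] = 40;
		assert(v.u32[3] == 40); // rx_vec_i128_w: was shuffle(0xff) + cvtsi
		assert(v.u32[0] == 10); // rx_vec_i128_x: was plain cvtsi
		return 0;
	}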
@@ -22,16 +22,16 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
 #include <stdint.h>
 #include "intrin_portable.h"
 
-__m128i soft_aesenc(__m128i in, __m128i key);
+rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key);
 
-__m128i soft_aesdec(__m128i in, __m128i key);
+rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key);
 
 template<bool soft>
-inline __m128i aesenc(__m128i in, __m128i key) {
-	return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key);
+inline rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) {
+	return soft ? soft_aesenc(in, key) : rx_aesenc_vec_i128(in, key);
 }
 
 template<bool soft>
-inline __m128i aesdec(__m128i in, __m128i key) {
-	return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key);
+inline rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) {
+	return soft ? soft_aesdec(in, key) : rx_aesdec_vec_i128(in, key);
 }
@@ -32,7 +32,7 @@ randomx_vm::~randomx_vm() {
 }
 
 void randomx_vm::resetRoundingMode() {
-	initFpu();
+	rx_reset_float_state();
 }
 
 namespace randomx {
@@ -86,7 +86,7 @@ void randomx_vm::initialize() {
 
 namespace randomx {
 
-	alignas(16) volatile static __m128i aesDummy;
+	alignas(16) volatile static rx_vec_i128 aesDummy;
 
 	template<class Allocator, bool softAes>
 	VmBase<Allocator, softAes>::~VmBase() {
@@ -98,9 +98,9 @@ namespace randomx {
 		if (datasetPtr == nullptr)
 			throw std::invalid_argument("Cache/Dataset not set");
 		if (!softAes) { //if hardware AES is not supported, it's better to fail now than to return a ticking bomb
-			__m128i tmp = _mm_load_si128((const __m128i*)&aesDummy);
-			tmp = _mm_aesenc_si128(tmp, tmp);
-			_mm_store_si128((__m128i*)&aesDummy, tmp);
+			rx_vec_i128 tmp = rx_load_vec_i128((const rx_vec_i128*)&aesDummy);
+			tmp = rx_aesenc_vec_i128(tmp, tmp);
+			rx_store_vec_i128((rx_vec_i128*)&aesDummy, tmp);
 		}
 		scratchpad = (uint8_t*)Allocator::allocMemory(ScratchpadSize);
 	}
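The aesDummy probe still works after the renaming, but its failure mode now differs by build: on x86 without AES-NI, the real AESENC faults at VM construction (an early crash rather than a "ticking bomb" mid-hash), while the portable rx_aesenc_vec_i128 from this commit throws, so construction fails with a catchable exception. Shape of that portable stub, restated as a standalone sketch:

	#include <stdexcept>

	struct vec_i128_stub { unsigned long long u64[2]; };

	// Mirrors the throwing fallback defined earlier in intrin_portable.h:
	inline vec_i128_stub aesenc_stub(vec_i128_stub, vec_i128_stub) {
		throw std::runtime_error("Platform doesn't support hardware AES");
	}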
@@ -46,7 +46,7 @@ namespace randomx {
 	}
 
 	template<class Allocator, bool softAes>
-	void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
+	void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
 		for (int pc = 0; pc < RANDOMX_PROGRAM_SIZE; ++pc) {
 			executeBytecode(pc, r, f, e, a);
 		}
@@ -59,16 +59,16 @@ namespace randomx {
 	}
 
 	template<class Allocator, bool softAes>
-	FORCE_INLINE __m128d InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(__m128d x) {
-		const __m128d xmantissaMask = _mm_castsi128_pd(_mm_set_epi64x(dynamicMantissaMask, dynamicMantissaMask));
-		const __m128d xexponentMask = _mm_load_pd((const double*)&config.eMask);
-		x = _mm_and_pd(x, xmantissaMask);
-		x = _mm_or_pd(x, xexponentMask);
+	FORCE_INLINE rx_vec_f128 InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(rx_vec_f128 x) {
+		const rx_vec_f128 xmantissaMask = rx_set_vec_f128(dynamicMantissaMask, dynamicMantissaMask);
+		const rx_vec_f128 xexponentMask = rx_load_vec_f128((const double*)&config.eMask);
+		x = rx_and_vec_f128(x, xmantissaMask);
+		x = rx_or_vec_f128(x, xexponentMask);
 		return x;
 	}
 
 	template<class Allocator, bool softAes>
-	void InterpretedVm<Allocator, softAes>::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
+	void InterpretedVm<Allocator, softAes>::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
 		auto& ibc = byteCode[pc];
 		switch (ibc.type)
 		{
@@ -139,43 +139,43 @@ namespace randomx {
 			} break;
 
 			case InstructionType::FSWAP_R: {
-				*ibc.fdst = _mm_shuffle_pd(*ibc.fdst, *ibc.fdst, 1);
+				*ibc.fdst = rx_shuffle_vec_f128(*ibc.fdst, *ibc.fdst, 1);
 			} break;
 
 			case InstructionType::FADD_R: {
-				*ibc.fdst = _mm_add_pd(*ibc.fdst, *ibc.fsrc);
+				*ibc.fdst = rx_add_vec_f128(*ibc.fdst, *ibc.fsrc);
 			} break;
 
 			case InstructionType::FADD_M: {
-				__m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc));
-				*ibc.fdst = _mm_add_pd(*ibc.fdst, fsrc);
+				rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc));
+				*ibc.fdst = rx_add_vec_f128(*ibc.fdst, fsrc);
 			} break;
 
 			case InstructionType::FSUB_R: {
-				*ibc.fdst = _mm_sub_pd(*ibc.fdst, *ibc.fsrc);
+				*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, *ibc.fsrc);
 			} break;
 
 			case InstructionType::FSUB_M: {
-				__m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc));
-				*ibc.fdst = _mm_sub_pd(*ibc.fdst, fsrc);
+				rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc));
+				*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, fsrc);
 			} break;
 
 			case InstructionType::FSCAL_R: {
-				const __m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(0x81F0000000000000));
-				*ibc.fdst = _mm_xor_pd(*ibc.fdst, mask);
+				const rx_vec_f128 mask = rx_set1_vec_f128(0x81F0000000000000);
+				*ibc.fdst = rx_xor_vec_f128(*ibc.fdst, mask);
 			} break;
 
 			case InstructionType::FMUL_R: {
-				*ibc.fdst = _mm_mul_pd(*ibc.fdst, *ibc.fsrc);
+				*ibc.fdst = rx_mul_vec_f128(*ibc.fdst, *ibc.fsrc);
 			} break;
 
 			case InstructionType::FDIV_M: {
-				__m128d fsrc = maskRegisterExponentMantissa(load_cvt_i32x2(getScratchpadAddress(ibc)));
-				*ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc);
+				rx_vec_f128 fsrc = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc)));
+				*ibc.fdst = rx_div_vec_f128(*ibc.fdst, fsrc);
 			} break;
 
 			case InstructionType::FSQRT_R: {
-				*ibc.fdst = _mm_sqrt_pd(*ibc.fdst);
+				*ibc.fdst = rx_sqrt_vec_f128(*ibc.fdst);
 			} break;
 
 			case InstructionType::CBRANCH: {
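A note on FSCAL_R: XORing the register with 0x81F0000000000000 flips the sign bit (bit 63) and the five low exponent bits (bits 52-56) of the IEEE 754 double, scaling by a power of two and negating without any floating-point arithmetic, which is why the whole instruction is one vector XOR. Worked on one value:

	#include <cstdint>
	#include <cstring>
	#include <cstdio>

	int main() {
		double d = 1.0;               // bits 0x3FF0000000000000
		uint64_t bits;
		memcpy(&bits, &d, 8);
		bits ^= 0x81F0000000000000ULL; // flip sign + exponent bits 52-56
		memcpy(&d, &bits, 8);
		printf("%g\n", d);             // prints -4.65661e-10, i.e. -2^-31
	}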
@@ -186,7 +186,7 @@ namespace randomx {
 			} break;
 
 			case InstructionType::CFROUND: {
-				setRoundMode(rotr(*ibc.isrc, ibc.imm) % 4);
+				rx_set_rounding_mode(rotr(*ibc.isrc, ibc.imm) % 4);
 			} break;
 
 			case InstructionType::ISTORE: {
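CFROUND, in scalar form: rotate the source register right by the immediate, then reduce mod 4 to select one of the four rounding modes that rx_set_rounding_mode accepts. A minimal sketch (fixed rotation count for illustration; RandomX's own rotr also handles a zero count):

	#include <cstdint>
	#include <cstdio>

	static uint64_t rotr(uint64_t a, int b) {
		return (a >> b) | (a << (64 - b)); // b in 1..63 for this sketch
	}

	int main() {
		uint64_t reg = 0xDEADBEEFCAFEBABEull;
		unsigned mode = rotr(reg, 7) % 4;  // always one of {0,1,2,3}
		printf("rounding mode = %u\n", mode);
	}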
@@ -205,12 +205,12 @@ namespace randomx {
 	template<class Allocator, bool softAes>
 	void InterpretedVm<Allocator, softAes>::execute() {
 		int_reg_t r[RegistersCount] = { 0 };
-		__m128d f[RegisterCountFlt];
-		__m128d e[RegisterCountFlt];
-		__m128d a[RegisterCountFlt];
+		rx_vec_f128 f[RegisterCountFlt];
+		rx_vec_f128 e[RegisterCountFlt];
+		rx_vec_f128 a[RegisterCountFlt];
 
 		for(unsigned i = 0; i < RegisterCountFlt; ++i)
-			a[i] = _mm_load_pd(&reg.a[i].lo);
+			a[i] = rx_load_vec_f128(&reg.a[i].lo);
 
 		precompileProgram(r, f, e, a);
 
@@ -228,10 +228,10 @@ namespace randomx {
 				r[i] ^= load64(scratchpad + spAddr0 + 8 * i);
 
 			for (unsigned i = 0; i < RegisterCountFlt; ++i)
-				f[i] = load_cvt_i32x2(scratchpad + spAddr1 + 8 * i);
+				f[i] = rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * i);
 
 			for (unsigned i = 0; i < RegisterCountFlt; ++i)
-				e[i] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i)));
+				e[i] = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i)));
 
 			executeBytecode(r, f, e, a);
 
@@ -244,10 +244,10 @@ namespace randomx {
 				store64(scratchpad + spAddr1 + 8 * i, r[i]);
 
 			for (unsigned i = 0; i < RegisterCountFlt; ++i)
-				f[i] = _mm_xor_pd(f[i], e[i]);
+				f[i] = rx_xor_vec_f128(f[i], e[i]);
 
 			for (unsigned i = 0; i < RegisterCountFlt; ++i)
-				_mm_store_pd((double*)(scratchpad + spAddr0 + 16 * i), f[i]);
+				rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), f[i]);
 
 			spAddr0 = 0;
 			spAddr1 = 0;
@@ -257,10 +257,10 @@ namespace randomx {
 			store64(&reg.r[i], r[i]);
 
 		for (unsigned i = 0; i < RegisterCountFlt; ++i)
-			_mm_store_pd(&reg.f[i].lo, f[i]);
+			rx_store_vec_f128(&reg.f[i].lo, f[i]);
 
 		for (unsigned i = 0; i < RegisterCountFlt; ++i)
-			_mm_store_pd(&reg.e[i].lo, e[i]);
+			rx_store_vec_f128(&reg.e[i].lo, e[i]);
 	}
 
 	template<class Allocator, bool softAes>
@@ -273,7 +273,7 @@ namespace randomx {
 #include "instruction_weights.hpp"
 
 	template<class Allocator, bool softAes>
-	void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
+	void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
 		RegisterUsage registerUsage[RegistersCount];
 		for (unsigned i = 0; i < RegistersCount; ++i) {
 			registerUsage[i].lastUsed = -1;
@@ -31,11 +31,11 @@ namespace randomx {
 	struct InstructionByteCode {
 		union {
 			int_reg_t* idst;
-			__m128d* fdst;
+			rx_vec_f128* fdst;
 		};
 		union {
 			int_reg_t* isrc;
-			__m128d* fsrc;
+			rx_vec_f128* fsrc;
 		};
 		union {
 			uint64_t imm;
@@ -74,11 +74,11 @@ namespace randomx {
 		virtual void datasetRead(uint32_t blockNumber, int_reg_t(&r)[RegistersCount]);
 	private:
 		void execute();
-		void precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
-		void executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
-		void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
+		void precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
+		void executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
+		void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
 		void* getScratchpadAddress(InstructionByteCode& ibc);
-		__m128d maskRegisterExponentMantissa(__m128d);
+		rx_vec_f128 maskRegisterExponentMantissa(rx_vec_f128);
 
 		InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE];
 	};
Loading…
Reference in New Issue
Block a user