diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 6a97d7d..f44a391 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -29,6 +29,7 @@ along with RandomX. If not, see. #include #include #include +#include #include "intrinPortable.h" #include "reciprocal.h" #ifdef STATS @@ -108,9 +109,13 @@ namespace RandomX { } } + static bool isDenormal(double x) { + return std::fpclassify(x) == FP_SUBNORMAL; + } + FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { auto& ibc = byteCode[i]; - if(trace) printState(r, f, e, a); + //if(trace) printState(r, f, e, a); switch (ibc.type) { case InstructionType::IADD_R: { @@ -219,9 +224,8 @@ namespace RandomX { } break; case InstructionType::FDIV_M: { - __m128d fsrc = _mm_abs(load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask))); - __m128d fdst = _mm_div_pd(*ibc.fdst, fsrc); - *ibc.fdst = _mm_max_pd(fdst, _mm_set_pd(DBL_MIN, DBL_MIN)); + __m128d fsrc = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask))); + *ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc); } break; case InstructionType::FSQRT_R: { @@ -258,6 +262,18 @@ namespace RandomX { else //if(ibc.type >= 20 && ibc.type <= 30) print(0); } +#ifdef FPUCHECK + if (ibc.type >= 26 && ibc.type <= 30) { + double lo = *(((double*)ibc.fdst) + 0); + double hi = *(((double*)ibc.fdst) + 1); + if (lo <= 0 || hi <= 0) { + std::stringstream ss; + ss << "Underflow in operation " << ibc.type; + printState(r, f, e, a); + throw std::runtime_error(ss.str()); + } + } +#endif } void InterpretedVirtualMachine::execute() { @@ -304,10 +320,10 @@ namespace RandomX { f[1] = load_cvt_i32x2(scratchpad + spAddr1 + 8); f[2] = load_cvt_i32x2(scratchpad + spAddr1 + 16); f[3] = load_cvt_i32x2(scratchpad + spAddr1 + 24); - e[0] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 32)); - e[1] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 40)); - e[2] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 48)); - e[3] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 56)); + e[0] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 32)); + e[1] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 40)); + e[2] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 48)); + e[3] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 56)); if (trace) { std::cout << "iteration " << std::dec << iter << std::endl; @@ -357,10 +373,22 @@ namespace RandomX { store64(scratchpad + spAddr1 + 48, r[6]); store64(scratchpad + spAddr1 + 56, r[7]); - f[0] = _mm_mul_pd(f[0], e[0]); - f[1] = _mm_mul_pd(f[1], e[1]); - f[2] = _mm_mul_pd(f[2], e[2]); - f[3] = _mm_mul_pd(f[3], e[3]); + f[0] = _mm_xor_pd(f[0], e[0]); + f[1] = _mm_xor_pd(f[1], e[1]); + f[2] = _mm_xor_pd(f[2], e[2]); + f[3] = _mm_xor_pd(f[3], e[3]); + +#ifdef FPUCHECK + for(int i = 0; i < 4; ++i) { + double lo = *(((double*)&f[i]) + 0); + double hi = *(((double*)&f[i]) + 1); + if (isDenormal(lo) || isDenormal(hi)) { + std::stringstream ss; + ss << "Denormal f" << i; + throw std::runtime_error(ss.str()); + } + } +#endif _mm_store_pd((double*)(scratchpad + spAddr0 + 0), f[0]); _mm_store_pd((double*)(scratchpad + spAddr0 + 16), f[1]); diff --git a/src/intrinPortable.h b/src/intrinPortable.h index aea0335..3bee98f 100644 --- a/src/intrinPortable.h +++ b/src/intrinPortable.h @@ -46,11 +46,6 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) { #include #endif -inline __m128d _mm_abs(__m128d xd) { - const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63))); - return _mm_and_pd(xd, absmask); -} - #define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) #else @@ -159,6 +154,20 @@ inline __m128d _mm_xor_pd(__m128d a, __m128d b) { return x; } +inline __m128d _mm_and_pd(__m128d a, __m128d b) { + __m128d x; + x.i.u64[0] = a.i.u64[0] & b.i.u64[0]; + x.i.u64[1] = a.i.u64[1] & b.i.u64[1]; + return x; +} + +inline __m128d _mm_or_pd(__m128d a, __m128d b) { + __m128d x; + x.i.u64[0] = a.i.u64[0] | b.i.u64[0]; + x.i.u64[1] = a.i.u64[1] | b.i.u64[1]; + return x; +} + inline __m128d _mm_set_pd(double e1, double e0) { __m128d x; x.lo = e0; @@ -302,6 +311,18 @@ inline __m128d load_cvt_i32x2(const void* addr) { return _mm_cvtepi32_pd(ix); } +template +__m128d ieee_set_exponent(__m128d x) { + static_assert(E > -1023, "Invalid exponent value"); + constexpr uint64_t mantissaMask64 = (1ULL << 52) - 1; + const __m128d mantissaMask = _mm_castsi128_pd(_mm_set_epi64x(mantissaMask64, mantissaMask64)); + constexpr uint64_t exponent64 = (uint64_t)(E + 1023U) << 52; + const __m128d exponentMask = _mm_castsi128_pd(_mm_set_epi64x(exponent64, exponent64)); + x = _mm_and_pd(x, mantissaMask); + x = _mm_or_pd(x, exponentMask); + return x; +} + double loadDoublePortable(const void* addr); uint64_t mulh(uint64_t, uint64_t);