Removed some legacy code

2024-12-22 07:48:54 +00:00 · 2019-05-03 15:33:51 +02:00 · 2019-05-03 15:33:51 +02:00 · 1037cc0139
commit 1037cc0139
parent 9e5eac8645
16 changed files with 0 additions and 27702 deletions
--- a/tests/branch_prediction/branch_always.c
+++ b/tests/branch_prediction/branch_always.c
--- a/tests/branch_prediction/branch_mixed.c
+++ b/tests/branch_prediction/branch_mixed.c
--- a/tests/branch_prediction/branch_predictably.c
+++ b/tests/branch_prediction/branch_predictably.c
--- a/tests/branch_prediction/branch_randomly.c
+++ b/tests/branch_prediction/branch_randomly.c
--- a/tests/branch_prediction/makefile
+++ b/tests/branch_prediction/makefile
@ -1,16 +0,0 @@
-# Builds each branch-prediction test program with gcc at -O0.
-# `all` and `clean` name actions, not files, so they are declared phony.
-.PHONY: all clean
-
-all: branch_always branch_predictably branch_randomly branch_mixed
-
-branch_always: branch_always.c
-	gcc -O0 branch_always.c -o branch_always
-
-branch_predictably: branch_predictably.c
-	gcc -O0 branch_predictably.c -o branch_predictably
-
-branch_randomly: branch_randomly.c
-	gcc -O0 branch_randomly.c -o branch_randomly
-
-branch_mixed: branch_mixed.c
-	gcc -O0 branch_mixed.c -o branch_mixed
-
-# -f keeps `make clean` from failing when some binaries were never built.
-clean:
-	rm -f branch_always branch_predictably branch_randomly branch_mixed
--- a/tests/branch_prediction/prof.h
+++ b/tests/branch_prediction/prof.h
@ -1,333 +0,0 @@
-/*
- * Prof
- * ====
- *
- * Self-contained C/C++ profiler library for Linux.
- *
- * Prof offers a quick way to measure performance events (CPU clock cycles,
- * cache misses, branch mispredictions, etc.) of C/C++ code snippets. Prof is
- * just a wrapper around the `perf_event_open` system call; its main goal is to
- * be easy to set up and painless to use for targeted optimizations, namely, when
- * the hot spot has already been identified. Prof is in no way a replacement for
- * a fully-fledged profiler like perf, gprof, callgrind, etc.
- *
- * Please be aware that Prof uses `__attribute__((constructor))` to make setup
- * as straightforward as possible, so it cannot be included more than once
- * (that is, in more than one source file of the same program).
- *
- * Examples
- * --------
- *
- * ### Minimal
- *
- * The following snippet prints the rough number of CPU clock cycles spent in
- * executing the code between the two Prof calls:
- *
- * ```c
- * #include "prof.h"
- *
- * int main()
- * {
- *     PROF_START();
- *     // slow code goes here...
- *     PROF_STDOUT();
- * }
- * ```
- *
- * ### Custom options
- *
- * The following snippet instead counts both read and write faults of the level
- * 1 data cache that occur in the userland code between the two Prof calls:
- *
- * ```c
- * #include <stdio.h>
- *
- * #define PROF_USER_EVENTS_ONLY
- * #define PROF_EVENT_LIST \
- *     PROF_EVENT_CACHE(L1D, READ, MISS) \
- *     PROF_EVENT_CACHE(L1D, WRITE, MISS)
- * #include "prof.h"
- *
- * int main()
- * {
- *     uint64_t faults[2] = { 0 };
- *
- *     PROF_START();
- *     // slow code goes here...
- *     PROF_DO(faults[index] += counter);
- *
- *     // fast or uninteresting code goes here...
- *
- *     PROF_START();
- *     // slow code goes here...
- *     PROF_DO(faults[index] += counter);
- *
- *     printf("Total L1 faults: R = %lu; W = %lu\n", faults[0], faults[1]);
- * }
- * ```
- *
- * Installation
- * ------------
- *
- * Just include `prof.h`. Here is a quick way to fetch the latest version:
- *
- *     wget -q https://raw.githubusercontent.com/cyrus-and/prof/master/prof.h
- */
-#ifndef PROF_H
-#define PROF_H
-
-#include <errno.h>
-#include <linux/perf_event.h>
-#include <stdarg.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-
-/*
- * API
- * ---
- */
-
-/*
- * Reset the counters and (re)start counting the events.
- *
- * The events to be monitored are specified by setting the `PROF_EVENT_LIST`
- * macro before including this file to a list of `PROF_EVENT_*` invocations;
- * defaults to counting the number of CPU clock cycles.
- *
- * If the `PROF_USER_EVENTS_ONLY` macro is defined before including this file
- * then kernel and hypervisor events are excluded from the count.
- */
-#define PROF_START()                                                           \
-    do {                                                                       \
-        PROF_IOCTL_(ENABLE);                                                   \
-        PROF_IOCTL_(RESET);                                                    \
-    } while (0)
-
-/*
- * Specify an event to be monitored, `type` and `config` are defined in the
- * documentation of the `perf_event_open` system call.
- */
-#define PROF_EVENT(type, config)                                               \
-    (uint32_t)(type), (uint64_t)(config),
-
-/*
- * Same as `PROF_EVENT` but for hardware events; prefix `PERF_COUNT_HW_` must be
- * omitted from `config`.
- */
-#define PROF_EVENT_HW(config)                                                  \
-    PROF_EVENT(PERF_TYPE_HARDWARE, PERF_COUNT_HW_ ## config)
-
-/*
- * Same as `PROF_EVENT` but for software events; prefix `PERF_COUNT_SW_` must be
- * omitted from `config`.
- */
-#define PROF_EVENT_SW(config)                                                  \
-    PROF_EVENT(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ ## config)
-
-/*
- * Same as `PROF_EVENT` but for cache events; prefixes `PERF_COUNT_HW_CACHE_`,
- * `PERF_COUNT_HW_CACHE_OP_` and `PERF_COUNT_HW_CACHE_RESULT_` must be omitted
- * from `cache`, `op` and `result`, respectively. Again `cache`, `op` and
- * `result` are defined in the documentation of the `perf_event_open` system
- * call.
- */
-#define PROF_EVENT_CACHE(cache, op, result)                                    \
-    PROF_EVENT(PERF_TYPE_HW_CACHE,                                             \
-               (PERF_COUNT_HW_CACHE_ ## cache) |                               \
-               (PERF_COUNT_HW_CACHE_OP_ ## op << 8) |                          \
-               (PERF_COUNT_HW_CACHE_RESULT_ ## result << 16))
-
-/*
- * Stop counting the events. The counter array can then be accessed with
- * `PROF_COUNTERS`.
- */
-#define PROF_STOP()                                                            \
-    do {                                                                       \
-        PROF_IOCTL_(DISABLE);                                                  \
-        PROF_READ_COUNTERS_(prof_event_buf_);                                  \
-    } while (0)
-
-/*
- * Access the counter array. The order of the counters is the same as that of
- * the events defined in `PROF_EVENT_LIST`. Elements of this array are 64 bit
- * unsigned integers.
- */
-#define PROF_COUNTERS                                                          \
-    (prof_event_buf_ + 1)
-
-/*
- * Stop counting the events and execute the code provided by `block` for each
- * event. Within `block`: `index` refers to the event position index in the
- * counter array defined by `PROF_COUNTERS`; `counter` is the actual value of
- * the counter. Both `index` and `counter` are 64 bit unsigned integers.
- */
-#define PROF_DO(block)                                                         \
-    do {                                                                       \
-        uint64_t i_;                                                           \
-        PROF_STOP();                                                           \
-        for (i_ = 0; i_ < prof_event_cnt_; i_++) {                             \
-            uint64_t index = i_;                                               \
-            uint64_t counter = prof_event_buf_[i_ + 1];                        \
-            (void)index;                                                       \
-            (void)counter;                                                     \
-            block;                                                             \
-        }                                                                      \
-    } while (0)
-
-/*
- * Same as `PROF_DO` except that `callback` is the name of a *callable* object
- * (e.g. a function) which, for each event, is called with the two parameters
- * `index` and `counter`.
- */
-#define PROF_CALL(callback)                                                    \
-    PROF_DO(callback(index, counter))
-
-/*
- * Stop counting the events and write to `file` (a stdio.h `FILE *`) as many
- * lines as there are events in `PROF_EVENT_LIST`. Each line contains `index`
- * and `counter` (as defined by `PROF_DO`) separated by a tab character. If
- * there is only one event then `index` is omitted.
- */
-/*
- * The counters are `uint64_t`; they are cast to `unsigned long long` and
- * printed with `%llu` so the format matches the argument type on every ABI,
- * not only where `unsigned long` happens to be 64 bits wide.
- */
-#define PROF_FILE(file)                                                        \
-    PROF_DO(if (prof_event_cnt_ > 1) {                                         \
-            fprintf((file), "%llu\t%llu\n",                                    \
-                    (unsigned long long)index,                                 \
-                    (unsigned long long)counter);                              \
-        } else {                                                               \
-            fprintf((file), "%llu\n", (unsigned long long)counter);            \
-        }                                                                      \
-    )
-
-/*
- * Same as `PROF_FILE` except that `file` is `stdout`.
- */
-#define PROF_STDOUT()                                                          \
-    PROF_FILE(stdout)
-
-/*
- * Same as `PROF_FILE` except that `file` is `stderr`.
- */
-#define PROF_STDERR()                                                          \
-    PROF_FILE(stderr)
-
-/* DEFAULTS ----------------------------------------------------------------- */
-
-#ifndef PROF_EVENT_LIST
-#ifdef PERF_COUNT_HW_REF_CPU_CYCLES /* since Linux 3.3 */
-#define PROF_EVENT_LIST PROF_EVENT_HW(REF_CPU_CYCLES)
-#else
-#define PROF_EVENT_LIST PROF_EVENT_HW(CPU_CYCLES)
-#endif
-#endif
-
-/* UTILITY ------------------------------------------------------------------ */
-
-/*
- * Abort the program with a diagnostic when `x` evaluates to false. The whole
- * message, including its terminating newline, is written to stderr: it carries
- * the source location and, when `errno` is non-zero, the matching `strerror`
- * text.
- */
-#define PROF_ASSERT_(x)                                                        \
-    do {                                                                       \
-        if (!(x)) {                                                            \
-            fprintf(stderr, "# %s:%d: PROF error", __FILE__, __LINE__);        \
-            if (errno) {                                                       \
-                fprintf(stderr, " (%s)", strerror(errno));                     \
-            }                                                                  \
-            fprintf(stderr, "\n");                                             \
-            abort();                                                           \
-        }                                                                      \
-    } while (0)
-
-#define PROF_IOCTL_(mode)                                                      \
-    do {                                                                       \
-        PROF_ASSERT_(ioctl(prof_fd_,                                           \
-                           PERF_EVENT_IOC_ ## mode,                            \
-                           PERF_IOC_FLAG_GROUP) != -1);                        \
-    } while (0)
-
-#define PROF_READ_COUNTERS_(buffer)                                            \
-    do {                                                                       \
-        const ssize_t to_read = sizeof(uint64_t) * (prof_event_cnt_ + 1);      \
-        PROF_ASSERT_(read(prof_fd_, buffer, to_read) == to_read);              \
-    } while (0)
-
-/* SETUP -------------------------------------------------------------------- */
-
-static int prof_fd_;
-static uint64_t prof_event_cnt_;
-static uint64_t *prof_event_buf_;
-
-/*
- * Open one perf event per `(type, config)` pair found in the variadic argument
- * list and allocate the buffer the counters are later read into.
- *
- * The list is a sequence of `uint32_t type, uint64_t config` pairs terminated
- * by a `type` equal to `(uint32_t)-1`. `dummy` is never read; it is only the
- * named parameter that `va_start` needs.
- *
- * `prof_fd_` is passed as the group argument of every `perf_event_open` call:
- * it is -1 for the first event, whose descriptor is then stored in `prof_fd_`
- * and passed for all the following events. Any failure aborts through
- * `PROF_ASSERT_`.
- */
-static void prof_init_(uint64_t dummy, ...) {
-    uint32_t type;
-    va_list ap;
-
-    prof_fd_ = -1;
-    prof_event_cnt_ = 0;
-    va_start(ap, dummy);
-    while (type = va_arg(ap, uint32_t), type != (uint32_t)-1) {
-        struct perf_event_attr pe;
-        uint64_t config;
-        int fd;
-
-        config = va_arg(ap, uint64_t);
-
-        memset(&pe, 0, sizeof(struct perf_event_attr));
-        pe.size = sizeof(struct perf_event_attr);
-        pe.read_format = PERF_FORMAT_GROUP;
-        pe.type = type;
-        pe.config = config;
-        #ifdef PROF_USER_EVENTS_ONLY
-        pe.exclude_kernel = 1;
-        pe.exclude_hv = 1;
-        #endif
-
-        fd = syscall(__NR_perf_event_open, &pe, 0, -1, prof_fd_, 0);
-        PROF_ASSERT_(fd != -1);
-        if (prof_fd_ == -1) {
-            /* remember the first descriptor; later events are grouped on it */
-            prof_fd_ = fd;
-        }
-
-        prof_event_cnt_++;
-    }
-    va_end(ap);
-
-    /* one slot more than the event count: `PROF_COUNTERS` skips element 0 */
-    prof_event_buf_ = (uint64_t *)malloc((prof_event_cnt_ + 1) *
-                                         sizeof(uint64_t));
-    /* every later counter read writes into this buffer, so fail early */
-    PROF_ASSERT_(prof_event_buf_ != NULL);
-}
-
-void __attribute__((constructor)) prof_init()
-{
-    prof_init_(0, PROF_EVENT_LIST /*,*/ (uint32_t)-1);
-}
-
-void __attribute__((destructor)) prof_fini()
-{
-    PROF_ASSERT_(close(prof_fd_) != -1);
-    free(prof_event_buf_);
-}
-
-#endif
-
-/*
- * License
- * -------
- *
- * Copyright (c) 2017 Andrea Cardaci <cyrus.and@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
--- a/tests/performance/benchmark.cpp
+++ b/tests/performance/benchmark.cpp
@ -1,314 +0,0 @@
-//RandomX performance test for x86
-//https://github.com/tevador/RandomX
-//License: GPL v3
-
-#include <cstdint> 
-#include <random>
-#include <iostream>
-#include <chrono>
-#include <sstream>
-#include <cmath>
-#include <cstring>
-
-#if defined(_WIN32) || defined(__MINGW32__) || defined(__CYGWIN__) || defined(__CYGWIN32__)
-  #define WINDOWS
-  #include <io.h>
-  #include <fcntl.h>
-#endif
-
-#if defined(__GNUC__) && defined(__x86_64__)
-  #include <x86intrin.h>
-  typedef unsigned __int128 uint128_t;
-  typedef __int128 int128_t;
-  static inline uint64_t umulhi64(uint64_t a, uint64_t b) {
-     return ((uint128_t)a * b) >> 64;
-  }
-  static inline uint64_t imulhi64(int64_t a, int64_t b) {
-     return ((int128_t)a * b) >> 64;
-  }
-  #define ror64 __rorq
-  #define rol64 __rolq
-  #define forceinline inline
-  #ifdef __clang__
-  // 64-bit rotate helpers for the clang branch of this platform block.
-  // `b` is expected in 0..63 (callers mask it with `& 63`). The complementary
-  // shift count is masked as well: for b == 0 a plain `64 - b` would shift by
-  // 64, which is undefined for a 64-bit operand.
-  static inline uint64_t __rolq(uint64_t a, int b) {
-    return (a << b) | (a >> ((64 - b) & 63));
-  }
-  static inline uint64_t __rorq(uint64_t a, int b) {
-    return (a >> b) | (a << ((64 - b) & 63));
-  }
-  #endif
-#elif defined(_MSC_VER) && defined(_M_X64)
-  #include <intrin.h>
-  #include <stdlib.h>
-  #define umulhi64 __umulh
-  static inline uint64_t imulhi64(int64_t a, int64_t b) {
-     int64_t hi;
-     _mul128(a, b, &hi);
-     return hi;
-  }
-  #define ror64 _rotr64
-  #define rol64 _rotl64
-  #define forceinline __forceinline
-#else
-	#error "Unsupported platform"
-#endif
-
-typedef union {
-	double f64;
-	int64_t i64;
-	uint64_t u64;
-	int32_t i32;
-	uint32_t u32;
-} convertible_t;
-
-forceinline void NOOP(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64;
-}
-
-forceinline void FNOOP(convertible_t& a, convertible_t& b, convertible_t& c) {
-	c.f64 = (double)a.i64;
-}
-
-forceinline void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 + b.u64;
-}
-
-forceinline void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u32 + b.u32;
-}
-
-forceinline void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 - b.u64;
-}
-
-forceinline void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u32 - b.u32;
-}
-
-forceinline void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 * b.u64;
-}
-
-forceinline void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = umulhi64(a.u64, b.u64);
-}
-
-forceinline void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = (uint64_t)a.u32 * b.u32;
-}
-
-forceinline void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.i64 = (int64_t)a.i32 * b.i32;
-}
-
-forceinline void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.i64 = imulhi64(a.i64, b.i64);
-}
-
-forceinline void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
-}
-
-// Signed division of a.i64 by b.i32; a zero divisor is replaced by 1.
-// INT64_MIN / -1 is not representable in 64 bits (the division is undefined
-// and traps on x86), so that single case yields the wrapped value INT64_MIN.
-forceinline void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  if (a.i64 == INT64_MIN && b.i32 == -1)
-    c.i64 = INT64_MIN;
-  else
-    c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1);
-}
-
-forceinline void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 & b.u64;
-}
-
-forceinline void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u32 & b.u32;
-}
-
-forceinline void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 | b.u64;
-}
-
-forceinline void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u32 | b.u32;
-}
-
-forceinline void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 ^ b.u64;
-}
-
-forceinline void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u32 ^ b.u32;
-}
-
-forceinline void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 << (b.u64 & 63);
-}
-
-forceinline void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = a.u64 >> (b.u64 & 63);
-}
-
-forceinline void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.i64 = a.i64 >> (b.u64 & 63);
-}
-
-forceinline void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = rol64(a.u64, (b.u64 & 63));
-}
-
-forceinline void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.u64 = ror64(a.u64, (b.u64 & 63));
-}
-
-forceinline void FADD(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.f64 = (double)a.i64 + (double)b.i64;
-}
-
-forceinline void FSUB(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.f64 = (double)a.i64 - (double)b.i64;
-}
-
-forceinline void FMUL(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.f64 = (double)a.i64 * (double)b.i64;
-}
-
-forceinline void FDIV(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.f64 = (double)a.i64 / (double)b.i64;
-}
-
-// c.f64 = sqrt(|a.i64 converted to double|); b is not used.
-forceinline void FSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
-  double d = fabs((double)a.i64);
-  // _mm_set_sd places exactly one double in the low lane. A packed load
-  // (_mm_load_pd) would read 16 bytes and require 16-byte alignment, which a
-  // lone local double does not provide.
-  c.f64 = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_set_sd(d)));
-}
-
-static uint32_t mxcsr;
-
-forceinline void FROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
-  c.f64 = (double)a.i64;
-  _mm_setcsr(mxcsr | ((uint32_t)(a.u64 << 13) & _MM_ROUND_MASK));
-}
-
-inline void init_FPU() {
-  mxcsr = (_mm_getcsr() | _MM_FLUSH_ZERO_ON) & ~_MM_ROUND_MASK;
-  _mm_setcsr(mxcsr);
-}
-
-// Parses the whole of `buffer` as a value of type T and stores it in `out`.
-// Returns true on success. On failure (nothing parsable, or characters other
-// than whitespace left after the value) prints a message to stdout and
-// returns false.
-template<typename T>
-bool tryParse(char* buffer, T& out) {
-	std::istringstream ss(buffer);
-	// after the value only trailing whitespace may remain, so "12abc" is rejected
-	if (!(ss >> out) || !(ss >> std::ws).eof()) {
-		std::cout << "Invalid value '" << buffer << "'" << std::endl;
-		return false;
-	}
-	return true;
-}
-
-//#define ITERATIONS 10000000
-#define SCRATCHPAD_SIZE (16 * 1024)
-#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))
-#define SCRATCHPAD_MASK (SCRATCHPAD_SIZE / sizeof(convertible_t) - 1)
-#define SCRATCHPAD_16K(x) scratchpad[(x) & SCRATCHPAD_MASK]
-
-#define BENCHMARK(FUNC,TYPE) do { \
-	memcpy((void*)scratchpad, input, SCRATCHPAD_SIZE); \
-	tstart = std::chrono::high_resolution_clock::now(); \
-	for (uint64_t i = 0; i < iterations; ++i) { \
-		FUNC(SCRATCHPAD_16K(i + 8 + 0), r0, SCRATCHPAD_16K(i + 0)); \
-		SCRATCHPAD_16K(i + 0).u64 ^= r7.u64;\
-		FUNC(SCRATCHPAD_16K(i + 8 + 1), r1, SCRATCHPAD_16K(i + 1)); \
-		SCRATCHPAD_16K(i + 1).u64 ^= r6.u64;\
-		FUNC(SCRATCHPAD_16K(i + 8 + 2), r2, SCRATCHPAD_16K(i + 2)); \
-		SCRATCHPAD_16K(i + 2).u64 ^= r5.u64;\
-		FUNC(SCRATCHPAD_16K(i + 8 + 3), r3, SCRATCHPAD_16K(i + 3)); \
-		SCRATCHPAD_16K(i + 3).u64 ^= r4.u64;\
-		FUNC(SCRATCHPAD_16K(i + 8 + 4), r4, SCRATCHPAD_16K(i + 4)); \
-		SCRATCHPAD_16K(i + 4).u64 ^= r3.u64;\
-		FUNC(SCRATCHPAD_16K(i + 8 + 5), r5, SCRATCHPAD_16K(i + 5)); \
-		SCRATCHPAD_16K(i + 5).u64 ^= r2.u64;\
-		FUNC(SCRATCHPAD_16K(i + 8 + 6), r6, SCRATCHPAD_16K(i + 6)); \
-		SCRATCHPAD_16K(i + 6).u64 ^= r1.u64;\
-		FUNC(SCRATCHPAD_16K(i + 8 + 7), r7, SCRATCHPAD_16K(i + 7)); \
-		SCRATCHPAD_16K(i + 7).u64 ^= r0.u64;\
-	} \
-	tend = std::chrono::high_resolution_clock::now(); \
-	uint64_t acum = 0; \
-	for (int i = 0; i < SCRATCHPAD_LENGTH; ++i) \
-		acum += scratchpad[i].u64; \
-	std::cout << "| " << #FUNC << " | " << std::chrono::duration<double>(tend - tstart).count() << " | " << acum << " |" << std::endl; \
-  } while(false)
-
-
-int main(int argc, char** argv) {
-	uint64_t iterations;
-	if (argc > 1) {
-		if (!tryParse(argv[1], iterations))
-			return 1;
-	}
-	else {
-		iterations = 100000000;
-	}
-#ifdef WINDOWS
-  _setmode(_fileno(stdin), O_BINARY);
-#endif
-  convertible_t input[SCRATCHPAD_LENGTH];
-
-  std::cout << "Reading " << sizeof(input) << " bytes from STDIN..." << std::endl;
-  std::cin.read((char*)input, sizeof(input));
-
-  if (!std::cin) {
-	  std::cerr << "Insufficient input" << std::endl;
-	  return 1;
-  }
-
-  convertible_t scratchpad[SCRATCHPAD_LENGTH];
-  convertible_t r0, r1, r2, r3, r4, r5, r6, r7;
-  
-  r0.u64 = input[0].u64;
-  r1.u64 = input[1].u64;
-  r2.u64 = input[2].u64;
-  r3.u64 = input[3].u64;
-  r4.u64 = input[4].u64;
-  r5.u64 = input[5].u64;
-  r6.u64 = input[6].u64;
-  r7.u64 = input[7].u64;
-
-  std::chrono::high_resolution_clock::time_point tstart, tend;
-
-  std::cout << iterations << " iterations:" << std::endl << std::endl;
-
-  std::cout << "| operation | time [s] | (result) |" << std::endl;
-  std::cout << "|-----------|----------|----------|" << std::endl;
-
-  BENCHMARK(NOOP, u64);
-  BENCHMARK(ADD_64, u64);
-  BENCHMARK(ADD_32, u64);
-  BENCHMARK(SUB_64, u64);
-  BENCHMARK(SUB_32, u64);
-  BENCHMARK(MUL_64, u64);
-  BENCHMARK(MULH_64, u64);
-  BENCHMARK(MUL_32, u64);
-  BENCHMARK(IMUL_32, u64);
-  BENCHMARK(IMULH_64, u64);
-  BENCHMARK(DIV_64, u64);
-  BENCHMARK(IDIV_64, u64);
-  BENCHMARK(AND_64, u64);
-  BENCHMARK(AND_32, u64);
-  BENCHMARK(OR_64, u64);
-  BENCHMARK(OR_32, u64);
-  BENCHMARK(XOR_64, u64);
-  BENCHMARK(XOR_32, u64);
-  BENCHMARK(SHL_64, u64);
-  BENCHMARK(SHR_64, u64);
-  BENCHMARK(SAR_64, u64);
-  BENCHMARK(ROR_64, u64);
-  BENCHMARK(ROL_64, u64);
-  
-  init_FPU();
-
-  BENCHMARK(FNOOP, f64);
-  BENCHMARK(FADD, f64);
-  BENCHMARK(FSUB, f64);
-  BENCHMARK(FMUL, f64);
-  BENCHMARK(FDIV, f64);
-  BENCHMARK(FSQRT, f64);
-  BENCHMARK(FROUND, f64);
-
-  return 0;
-}
--- a/tests/performance/test1.data
+++ b/tests/performance/test1.data
--- a/tests/performance/test2.data
+++ b/tests/performance/test2.data
--- a/tests/rx2c.py
+++ b/tests/rx2c.py
@ -1,595 +0,0 @@
-import random
-import sys
-import os
-
-PROGRAM_SIZE = 512
-INSTRUCTION_COUNT = 1024 * 1024
-INSTRUCTION_WEIGHTS = [
-    ("ADD_64", 16),
-    ("ADD_32", 8),
-    ("SUB_64", 16),
-    ("SUB_32", 8),
-    ("MUL_64", 7),
-    ("MULH_64", 7),
-    ("MUL_32", 7),
-    ("IMUL_32", 7),
-    ("IMULH_64", 7),
-    ("DIV_64", 1),
-    ("IDIV_64", 1),
-    ("AND_64", 4),
-    ("AND_32", 3),
-    ("OR_64", 4),
-    ("OR_32", 3),
-    ("XOR_64", 4),
-    ("XOR_32", 3),
-    ("SHL_64", 6),
-    ("SHR_64", 6),
-    ("SAR_64", 6),
-    ("ROL_64", 9),
-    ("ROR_64", 9),
-    ("FADD", 22),
-    ("FSUB", 22),
-    ("FMUL", 22),
-    ("FDIV", 8),
-    ("FSQRT", 6),
-    ("FROUND", 2),
-    ("CALL", 17),
-    ("RET", 15),
-]
-        
-def genBytes(count):
-    return ', '.join(str(random.getrandbits(8)) for i in range(count))
-        
-class OperandType:
-    INT32 = 0
-    UINT32 = 1
-    INT64 = 2
-    UINT64 = 3
-    FLOAT = 4
-    SHIFT = 5
-
-def declareType(type):
-    converters = {
-        0: "int32_t",
-        1: "uint32_t",
-        2: "int64_t",
-        3: "uint64_t",
-        4: "double",
-        5: "int32_t"
-    }
-    return converters.get(type)
-    
-def toSigned32(x):
-    """Reinterpret x as signed 32-bit: subtract 2**32 when bit 31 is set."""
-    sign_bit = x & 0x80000000
-    return x - 2 * sign_bit
-    
-def toSigned64(x):
-    """Reinterpret x as signed 64-bit: subtract 2**64 when bit 63 is set."""
-    sign_bit = x & 0x8000000000000000
-    return x - 2 * sign_bit
-
-def immediateTo(symbol, type):
-    converters = {
-        0: toSigned32(symbol.imm1),
-        1: symbol.imm1,
-        2: toSigned32(symbol.imm1),
-        3: symbol.imm1,
-        4: float(toSigned32(symbol.imm1) << 32),
-        5: symbol.imm0 & 63
-    }
-    return repr(converters.get(type))
-    
-def registerTo(expr, type):
-    converters = {
-        0: "(int64_t){0}",
-        1: "{0}",
-        2: "(int64_t){0}",
-        3: "{0}",
-        4: "{0}",
-        5: "({0} & 63)"
-    }
-    return converters.get(type).format(expr)
-    
-def registerFrom(num, type):
-    converters = {
-        0: "r{0}",
-        1: "r{0}",
-        2: "r{0}",
-        3: "r{0}",
-        4: "((convertible_t)f{0}).u64",
-        5: "r{0}"
-    }
-    return converters.get(type).format(num)
-    
-def convertibleTo(expr, type):
-    converters = {
-        0: "{0}.i32",
-        1: "{0}.u32",
-        2: "{0}.i64",
-        3: "{0}.u64",
-        4: "(double){0}.i64",
-        5: "({0}.u64 & 63)"
-    }
-    return converters.get(type).format(expr)
-    
-def convertibleFrom(expr, type):
-    converters = {
-        0: "{0}.i32",
-        1: "{0}.u32",
-        2: "{0}.i64",
-        3: "{0}.u64",
-        4: "{0}.f64",
-        5: "({0}.u64 & 63)"
-    }
-    return converters.get(type).format(expr)
-
-def getRegister(num, type):
-    registers = {
-        0: "r{0}",
-        1: "r{0}",
-        2: "r{0}",
-        3: "r{0}",
-        4: "f{0}",
-        5: "r{0}"
-    }
-    return registers.get(type).format(num)
-
-def writeInitialValues(file):
-    file.write("#ifdef RAM\n")
-    file.write("\tmmu.buffer = (char*)_mm_malloc(DRAM_SIZE, 16);\n")
-    file.write("\tif(!mmu.buffer) {\n")
-    file.write('\t\tprintf("DRAM buffer allocation failed\\n");\n')
-    file.write("\t\treturn 1;\n")
-    file.write("\t}\n")
-    file.write('\tprintf("Initializing DRAM buffer...\\n");\n')
-    file.write("\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)mmu.buffer, DRAM_SIZE);\n")
-    file.write("#endif\n")
-    file.write("\tclock_t clockStart = clock(), clockEnd;\n")
-    for i in range(8):
-        file.write("\tr{0} = *(uint64_t*)(aesSeed + {1});\n".format(i, i * 8))
-    for i in range(8):
-        file.write("\tf{0} = *(int64_t*)(aesSeed + {1});\n".format(i, 64 + i * 8))
-    file.write("\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)scratchpad, SCRATCHPAD_SIZE);\n")
-    file.write("\tmmu.ma = *(addr_t*)(aesKey + 8) & ~7U;\n")
-    file.write("#ifdef PRNTADDR\n")
-    file.write('\tprintf("DRAM address = %#010x\\n", mmu.ma);\n')
-    file.write("#endif\n")
-    file.write("\tmmu.mx = 0;\n")
-    file.write("\tsp = 0;\n")
-    file.write("\tic = {0};\n".format(INSTRUCTION_COUNT))
-    file.write("\tmxcsr = (_mm_getcsr() | _MM_FLUSH_ZERO_ON) & ~_MM_ROUND_MASK; //flush denormals to zero, round to nearest\n")
-    file.write("\t_mm_setcsr(mxcsr);\n")
-    
-def writeEpilog(file):
-    file.write("\tend:\n")
-    file.write("\t\tclockEnd = clock();\n")
-    for i in range(8):
-        file.write('\t\tprintf("r{0} = %-36" PRIu64 " f{0} = %g\\n", r{0}, f{0});\n'.format(i))
-    file.write(("\t\tuint64_t spadsum = 0;\n"
-        "\t\tfor(int i = 0; i < SCRATCHPAD_LENGTH; ++i) {\n"
-        "\t\t	spadsum += scratchpad[i].u64;\n"
-        "\t\t}\n"
-        '\t\tprintf("scratchpad sum = %" PRIu64 "\\n", spadsum);\n'
-        '\t\tprintf("runtime: %f\\n", (clockEnd - clockStart) / (double)CLOCKS_PER_SEC);\n'
-        "#ifdef RAM\n"
-        "\t\t_mm_free((void*)mmu.buffer);\n"
-        "#endif\n"))
-    file.write("\t\treturn 0;")
-    file.write("}")
-
-def writeCommon(file, i, symbol, type, name):
-    file.write("\ti_{0}: {{ //{1}\n".format(i, name))
-    file.write("\t\tif(0 == ic--) goto end;\n")
-    file.write("\t\tr{0} ^= {1};\n".format(symbol.rega, symbol.addr0))
-    file.write("\t\taddr_t addr = r{0};\n".format(symbol.rega))
-
-def readA(symbol, type):
-    location = {
-        0: "readDram(&mmu, addr)",
-        1: "readDram(&mmu, addr)",
-        2: "readDram(&mmu, addr)",
-        3: "readDram(&mmu, addr)",
-        4: "SCRATCHPAD_256K(addr)",
-        5: "SCRATCHPAD_16K(addr)",
-        6: "SCRATCHPAD_16K(addr)",
-        7: "SCRATCHPAD_16K(addr)",
-    }
-    return convertibleTo(location.get(symbol.loca), type)
-
-def writeC(symbol, type):
-    location = {
-        0: "SCRATCHPAD_256K(r{0} ^ {1})",
-        1: "SCRATCHPAD_16K(r{0} ^ {1})",
-        2: "SCRATCHPAD_16K(r{0} ^ {1})",
-        3: "SCRATCHPAD_16K(r{0} ^ {1})",
-        4: "",
-        5: "",
-        6: "",
-        7: ""
-    }
-    c = location.get(symbol.locc)
-    if c == "":
-        c = getRegister(symbol.regc, type)
-    else:
-        c = convertibleFrom(c.format(symbol.regc, symbol.addr1), type)
-    return c
-
-def readB(symbol, type):
-    if symbol.locb < 6:
-        return registerTo(getRegister(symbol.regb, type), type)
-    else:
-        return immediateTo(symbol, type)
-
-class CodeSymbol:
-    """Fields unpacked from the integer instruction word `qi`.
-
-    Layout, by bit offset: opcode (0, 8 bits); loca, rega, locb, regb, locc,
-    regc (3 bits each at 8, 16, 24, 32, 40, 48); imm0 (56, 8 bits); addr0
-    (64, 32 bits); everything from bit 96 up is stored as both imm1 and addr1.
-    """
-    def __init__(self, qi):
-        def bits(shift, mask):
-            # field located `shift` bits up, limited to `mask`
-            return (qi >> shift) & mask
-
-        self.opcode = bits(0, 255)
-        self.loca = bits(8, 7)
-        self.rega = bits(16, 7)
-        self.locb = bits(24, 7)
-        self.regb = bits(32, 7)
-        self.locc = bits(40, 7)
-        self.regc = bits(48, 7)
-        self.imm0 = bits(56, 255)
-        self.addr0 = bits(64, 0xFFFFFFFF)
-        upper = qi >> 96
-        self.imm1 = upper
-        self.addr1 = upper
-
-def writeOperation(file, i, symbol, type, name, op):
-    writeCommon(file, i, symbol, type, name)
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
-    file.write("\t\t{0} = A {1} B; }}\n".format(writeC(symbol, type), op))
-
-def write_ADD_64(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT64, 'ADD_64', '+');
-
-def write_ADD_32(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT32, 'ADD_32', '+');
-
-def write_SUB_64(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT64, 'SUB_64', '-');
-
-def write_SUB_32(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT32, 'SUB_32', '-');
-
-def write_MUL_64(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT64, 'MUL_64', '*');
-
-def write_MULH_64(file, i, symbol):
-    type = OperandType.UINT64
-    writeCommon(file, i, symbol, type, 'MULH_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
-    file.write("\t\t{0} = ((uint128_t)A * B) >> 64; }}\n".format(writeC(symbol, type)))
-
-def write_MUL_32(file, i, symbol):
-    type = OperandType.UINT32
-    writeCommon(file, i, symbol, type, 'MUL_32')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
-    file.write("\t\t{0} = (uint64_t)A * B; }}\n".format(writeC(symbol, OperandType.UINT64)))
-
-def write_IMUL_32(file, i, symbol):
-    type = OperandType.INT32
-    writeCommon(file, i, symbol, type, 'IMUL_32')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
-    file.write("\t\t{0} = (int64_t)A * B; }}\n".format(writeC(symbol, OperandType.INT64)))
-
-def write_IMULH_64(file, i, symbol):
-    type = OperandType.INT64
-    writeCommon(file, i, symbol, type, 'IMULH_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
-    file.write("\t\t{0} = ((int128_t)A * B) >> 64; }}\n".format(writeC(symbol, type)))
-
-def write_DIV_64(file, i, symbol):
-    type = OperandType.UINT64
-    writeCommon(file, i, symbol, type, 'DIV_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.UINT32), readB(symbol, OperandType.UINT32)))
-    file.write("\t\tif(B == 0) B = 1;\n".format(declareType(type), readB(symbol, type)))
-    file.write("\t\t{0} = A / B; }}\n".format(writeC(symbol, type)))
-
-def write_IDIV_64(file, i, symbol):
-    type = OperandType.INT64
-    writeCommon(file, i, symbol, type, 'IDIV_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.INT32), readB(symbol, OperandType.INT32)))
-    file.write("\t\tif(B == 0) B = 1;\n".format(declareType(type), readB(symbol, type)))
-    file.write("\t\t{0} = A / B; }}\n".format(writeC(symbol, type)))
-
-def write_AND_64(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT64, 'AND_64', '&');
-
-def write_AND_32(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT32, 'AND_32', '&');
-
-def write_OR_64(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT64, 'OR_64', '|');
-
-def write_OR_32(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT32, 'OR_32', '|');
-
-def write_XOR_64(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT64, 'XOR_64', '^');
-
-def write_XOR_32(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.UINT32, 'XOR_32', '^');
-
-def write_SHL_64(file, i, symbol):
-    type = OperandType.UINT64
-    writeCommon(file, i, symbol, type, 'SHL_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
-    file.write("\t\t{0} = A << B; }}\n".format(writeC(symbol, type)))
-
-def write_SHR_64(file, i, symbol):
-    type = OperandType.UINT64
-    writeCommon(file, i, symbol, type, 'SHR_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
-    file.write("\t\t{0} = A >> B; }}\n".format(writeC(symbol, type)))
-
-def write_SAR_64(file, i, symbol):
-    type = OperandType.INT64
-    writeCommon(file, i, symbol, type, 'SAR_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
-    file.write("\t\t{0} = A >> B; }}\n".format(writeC(symbol, type)))
-
-def write_ROL_64(file, i, symbol):
-    type = OperandType.UINT64
-    writeCommon(file, i, symbol, type, 'ROL_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
-    file.write("\t\t{0} = __rolq(A, B); }}\n".format(writeC(symbol, type)))
-
-def write_ROR_64(file, i, symbol):
-    type = OperandType.UINT64
-    writeCommon(file, i, symbol, type, 'ROR_64')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
-    file.write("\t\t{0} = __rorq(A, B); }}\n".format(writeC(symbol, type)))
-
-def write_FADD(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.FLOAT, 'FADD', '+');
-
-def write_FSUB(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.FLOAT, 'FSUB', '-');
-
-def write_FMUL(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.FLOAT, 'FMUL', '*');
-
-def write_FDIV(file, i, symbol):
-    writeOperation(file, i, symbol, OperandType.FLOAT, 'FDIV', '/');
-
-def write_FSQRT(file, i, symbol):
-    type = OperandType.FLOAT
-    writeCommon(file, i, symbol, type, 'FSQRT')
-    file.write("\t\t{0} A = fabs({1});\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\t{0} = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_pd(&A))); }}\n".format(writeC(symbol, type)))
-
-def write_FROUND(file, i, symbol):
-    type = OperandType.FLOAT
-    writeCommon(file, i, symbol, type, 'FROUND')
-    file.write("\t\t{0} A = {1};\n".format(declareType(OperandType.INT64), readA(symbol, OperandType.INT64)))
-    file.write("\t\t{0} = A;\n".format(writeC(symbol, type)))
-    file.write("\t\t_mm_setcsr(mxcsr | ((uint32_t)(A << 13) & _MM_ROUND_MASK)); }\n")
-
-def write_CALL(file, i, symbol):
-    type = OperandType.UINT64
-    writeCommon(file, i, symbol, type, 'CALL')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    if symbol.locb < 6:
-        file.write("\t\tif((uint32_t)r{0} <= {1}) {{\n".format(symbol.regb, symbol.imm1))
-    file.write("\t\t\tPUSH_VALUE(A);\n");
-    file.write("\t\t\tPUSH_ADDRESS(&&i_{0});\n".format((i + 1) & (PROGRAM_SIZE - 1)));
-    file.write("\t\t\tgoto i_{0};\n".format((i + 1 + (symbol.imm0 & ((PROGRAM_SIZE >> 2) - 1))) & (PROGRAM_SIZE - 1)));
-    if symbol.locb < 6:
-        file.write("\t\t}}\n\t\t{0} = A;".format(writeC(symbol, type)))
-    file.write("\t\t}\n")
-
-def write_RET(file, i, symbol):
-    type = OperandType.UINT64
-    writeCommon(file, i, symbol, type, 'RET')
-    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
-    file.write("\t\tif(!STACK_IS_EMPTY()")
-    if symbol.locb < 6:
-        file.write(" && (uint32_t)r{0} <= {1}".format(symbol.regb, symbol.imm1))
-    file.write(") {\n")
-    file.write("\t\t\tvoid* target = POP_ADDRESS();\n")
-    file.write("\t\t\tuint64_t C = POP_VALUE();\n")
-    file.write("\t\t\t{0} = A ^ C;\n".format(writeC(symbol, type)))
-    file.write("\t\t\tgoto *target;\n")
-    file.write("\t\t}}\n\t\t{0} = A; }}\n".format(writeC(symbol, type)))
-
-opcodeMap = { }
-
-def buildOpcodeMap():
-    functions = globals()
-    totalWeight = 0;
-    for instruction, weight in INSTRUCTION_WEIGHTS:
-        func = functions['write_' + instruction]
-        for i in range(weight):
-            opcodeMap[totalWeight] = func
-            totalWeight = totalWeight + 1
-    assert totalWeight == 256
-
-def writeCode(file, i, symbol):
-    opcodeMap.get(symbol.opcode)(file, i, symbol)
-
-def writeMain(file):
-    file.write(('__attribute__((optimize("Os"))) int main() {\n'
-                "	register uint64_t r0, r1, r2, r3, r4, r5, r6, r7;\n"
-                "	register double f0, f1, f2, f3, f4, f5, f6, f7;\n"
-                "	register uint64_t ic, sp;\n"
-                "	stack_t stack[STACK_LENGTH];\n"
-                "	convertible_t scratchpad[SCRATCHPAD_LENGTH] __attribute__ ((aligned (16)));\n"
-                "	mmu_t mmu;\n"
-                "	uint32_t mxcsr;\n"
-                ))
-
-def writeProlog(file):
-    """Write the fixed C preamble of the generated program.
-
-    The text contains, in order: the #include lines, the integer/union/struct
-    typedefs (convertible_t, stack_t, mmu_t), the scratchpad/stack/DRAM
-    macros, the readDram helper, and the AES-NI helpers (sl_xor,
-    AES_GENKEY_SUB, aes_genkey, aes_round, aesInitialize).  It is emitted as
-    one constant string with a single write call.
-    """
-    file.write(("#include <stdint.h>\n"
-                "#include <time.h>\n"
-                "#include <stdio.h>\n"
-                "#include <x86intrin.h>\n"
-                "#include <emmintrin.h>\n"
-                "#include <wmmintrin.h>\n"
-                "#include <math.h>\n"
-                "#include <inttypes.h>\n"
-                "typedef uint32_t addr_t;\n"
-                "typedef unsigned __int128 uint128_t;\n"
-                "typedef __int128 int128_t;\n"
-                "typedef unsigned char byte;\n"
-                "typedef union {\n"
-                "	double f64;\n"
-                "	int64_t i64;\n"
-                "	uint64_t u64;\n"
-                "	int32_t i32;\n"
-                "	uint32_t u32;\n"
-                "} convertible_t;\n"
-                "typedef union {\n"
-                "	uint64_t value;\n"
-                "	void* address;\n"
-                "} stack_t;\n"
-                "typedef struct {\n"
-                "	addr_t ma;\n"
-                "	addr_t mx;\n"
-                "#ifdef RAM\n"
-                "	const char* buffer;\n"
-                "#endif\n"
-                "} mmu_t;\n"
-                "#define DRAM_SIZE (1ULL << 32)\n"
-                "#define SCRATCHPAD_SIZE (256 * 1024)\n"
-                "#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))\n"
-                "#define SCRATCHPAD_MASK14 (16 * 1024 / sizeof(convertible_t) - 1)\n"
-                "#define SCRATCHPAD_MASK18 (SCRATCHPAD_LENGTH - 1)\n"
-                "#define SCRATCHPAD_16K(x) scratchpad[(x) & SCRATCHPAD_MASK14]\n"
-                "#define SCRATCHPAD_256K(x) scratchpad[(x) & SCRATCHPAD_MASK18]\n"
-                "#define STACK_LENGTH (128 * 1024)\n"
-                "#ifdef RAM\n"
-                "#define DRAM_READ(mmu) (convertible_t)*(uint64_t*)((mmu)->buffer + (mmu)->ma)\n"
-                "#define PREFETCH(mmu) _mm_prefetch(((mmu)->buffer + (mmu)->ma), _MM_HINT_T0)\n"
-                "#else\n"
-                "#define DRAM_READ(mmu) (convertible_t)(uint64_t)__rolq(6364136223846793005ULL*((mmu)->ma)+1442695040888963407ULL,32)\n"
-                "#define PREFETCH(mmu)\n"
-                "#endif\n"
-                "#define PUSH_VALUE(x) stack[sp++].value = x\n"
-                "#define PUSH_ADDRESS(x) stack[sp++].address = x\n"
-                "#define STACK_IS_EMPTY() (sp == 0)\n"
-                "#define POP_VALUE() stack[--sp].value\n"
-                "#define POP_ADDRESS() stack[--sp].address\n"
-                "static convertible_t readDram(mmu_t* mmu, addr_t addr) {\n"
-                "	convertible_t data;\n"
-                "	data = DRAM_READ(mmu);\n"
-                "	mmu->ma += 8;\n"
-                "	mmu->mx ^= addr;\n"
-                "	if((mmu->mx & 0x1FFF) == 0) {\n"
-                "#ifdef PRNTADDR\n"
-                '		printf("DRAM jump %#010x -> %#010x\\n", mmu->ma, mmu->mx);\n'
-                "#endif\n"
-                "		mmu->ma = mmu->mx;\n"
-                "#ifdef PREF\n"
-                "		PREFETCH(mmu);\n"
-                "#endif\n"
-                "	}\n"
-                "	return data;\n"
-                "}\n"
-                "static inline __m128i sl_xor(__m128i tmp1) {\n"
-                "	__m128i tmp4;\n"
-                "	tmp4 = _mm_slli_si128(tmp1, 0x04);\n"
-                "	tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
-                "	tmp4 = _mm_slli_si128(tmp4, 0x04);\n"
-                "	tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
-                "	tmp4 = _mm_slli_si128(tmp4, 0x04);\n"
-                "	tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
-                "	return tmp1;\n"
-                "}\n"
-                "#define AES_GENKEY_SUB(rcon) do { \\\n"
-                "	__m128i xout1 = _mm_aeskeygenassist_si128(xout2, rcon);	\\\n"
-                "	xout1 = _mm_shuffle_epi32(xout1, 0xFF);	\\\n"
-                "	xout0 = sl_xor(xout0);  \\\n"
-                "	xout0 = _mm_xor_si128(xout0, xout1); \\\n"
-                "	xout1 = _mm_aeskeygenassist_si128(xout0, 0x00); \\\n"
-                "	xout1 = _mm_shuffle_epi32(xout1, 0xAA); \\\n"
-                "	xout2 = sl_xor(xout2); \\\n"
-                "	xout2 = _mm_xor_si128(xout2, xout1); } while(0)\n"
-                "static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) {\n"
-                "	__m128i xout0, xout2;\n"
-                "	xout0 = _mm_load_si128(memory);\n"
-                "	xout2 = _mm_load_si128(memory+1);\n"
-                "	*k0 = xout0;\n"
-                "	*k1 = xout2;\n"
-                "	AES_GENKEY_SUB(0x01);\n"
-                "	*k2 = xout0;\n"
-                "	*k3 = xout2;\n"
-                "	AES_GENKEY_SUB(0x02);\n"
-                "	*k4 = xout0;\n"
-                "	*k5 = xout2;\n"
-                "	AES_GENKEY_SUB(0x04);\n"
-                "	*k6 = xout0;\n"
-                "	*k7 = xout2;\n"
-                "	AES_GENKEY_SUB(0x08);\n"
-                "	*k8 = xout0;\n"
-                "	*k9 = xout2;\n"
-                "}\n"
-                "static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) {\n"
-                "	*x0 = _mm_aesenc_si128(*x0, key);\n"
-                "	*x1 = _mm_aesenc_si128(*x1, key);\n"
-                "	*x2 = _mm_aesenc_si128(*x2, key);\n"
-                "	*x3 = _mm_aesenc_si128(*x3, key);\n"
-                "	*x4 = _mm_aesenc_si128(*x4, key);\n"
-                "	*x5 = _mm_aesenc_si128(*x5, key);\n"
-                "	*x6 = _mm_aesenc_si128(*x6, key);\n"
-                "	*x7 = _mm_aesenc_si128(*x7, key);\n"
-                "}\n"
-                "static void aesInitialize(__m128i* key, __m128i* seed, __m128i* output, size_t count) {\n"
-                "	\n"
-                "	__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;\n"
-                "	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;\n"
-                "	\n"
-                "	aes_genkey(key, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);\n"
-                "	\n"
-                "	xin0 = _mm_load_si128(seed + 0);\n"
-                "	xin1 = _mm_load_si128(seed + 1);\n"
-                "	xin2 = _mm_load_si128(seed + 2);\n"
-                "	xin3 = _mm_load_si128(seed + 3);\n"
-                "	xin4 = _mm_load_si128(seed + 4);\n"
-                "	xin5 = _mm_load_si128(seed + 5);\n"
-                "	xin6 = _mm_load_si128(seed + 6);\n"
-                "	xin7 = _mm_load_si128(seed + 7);\n"
-                "	\n"
-                "	for (size_t i = 0; i < count / sizeof(__m128i); i += 8)\n"
-                "	{\n"
-                "		aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
-                "		\n"
-                "		_mm_store_si128(output + i + 0, xin0);\n"
-                "		_mm_store_si128(output + i + 1, xin1);\n"
-                "		_mm_store_si128(output + i + 2, xin2);\n"
-                "		_mm_store_si128(output + i + 3, xin3);\n"
-                "		_mm_store_si128(output + i + 4, xin4);\n"
-                "		_mm_store_si128(output + i + 5, xin5);\n"
-                "		_mm_store_si128(output + i + 6, xin6);\n"
-                "		_mm_store_si128(output + i + 7, xin7);\n"
-                "	}\n"
-                "}\n"))
-
-# Driver: writes the complete generated C program to standard output.
-# The order of the calls below fixes the order in which random values are
-# drawn (assuming genBytes uses the same generator as random.getrandbits --
-# TODO confirm), so it must not be rearranged.
-with sys.stdout as file:
-    buildOpcodeMap()
-    writeProlog(file)
-    # Random AES key (32 bytes) and seed (128 bytes) as C byte-array initializers.
-    file.write("const byte aesKey[32] = {{ {0} }};\n".format(genBytes(32)))
-    file.write("const byte aesSeed[128] = {{ {0} }};\n".format(genBytes(128)))
-    writeMain(file)
-    writeInitialValues(file)
-    # One random 128-bit instruction word per program slot.
-    for i in range(PROGRAM_SIZE):
-        writeCode(file, i, CodeSymbol(random.getrandbits(128)))
-    # After the last instruction, jump back to the first one.
-    if PROGRAM_SIZE > 0:
-        file.write("\t\tgoto i_0;\n")
-    writeEpilog(file)
--- a/tests/test_alu_fpu/Instructions.h
+++ b/tests/test_alu_fpu/Instructions.h
@ -1,69 +0,0 @@
-//RandomX ALU + FPU test
-//https://github.com/tevador/RandomX
-//License: GPL v3
-
-#include <cstdint>
-
-namespace RandomX {
-
-	// Rounding-mode selectors; FROUND uses the low two bits of its operand.
-	constexpr int RoundToNearest = 0;
-	constexpr int RoundDown = 1;
-	constexpr int RoundUp = 2;
-	constexpr int RoundToZero = 3;
-
-	// One 64-bit VM value, viewable as a double or as a signed/unsigned
-	// 64-bit or 32-bit integer.
-	typedef union {
-		double f64;
-		int64_t i64;
-		uint64_t u64;
-		int32_t i32;
-		uint32_t u32;
-	} convertible_t;
-
-	extern "C" {
-		// Integer arithmetic: c = a <op> b.
-		void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c);
-		void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c);
-		void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c);
-		void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c);
-		void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c);
-
-		// Bitwise logic.
-		void AND_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void AND_32(convertible_t& a, convertible_t& b, convertible_t& c);
-		void OR_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void OR_32(convertible_t& a, convertible_t& b, convertible_t& c);
-		void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c);
-
-		// Shifts and rotates.
-		void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c);
-		void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
-
-		// Floating point. The *_64 entry points take the second operand
-		// already converted to double.
-		void FPINIT();
-		void FADD_64(convertible_t& a, double b, convertible_t& c);
-		void FSUB_64(convertible_t& a, double b, convertible_t& c);
-		void FMUL_64(convertible_t& a, double b, convertible_t& c);
-		void FDIV_64(convertible_t& a, double b, convertible_t& c);
-		void FABSQRT(convertible_t& a, convertible_t& b, convertible_t& c);
-		void FROUND(convertible_t& a, convertible_t& b, convertible_t& c);
-
-		// Three-operand wrappers: convert b's signed 64-bit integer value to
-		// double and forward to the matching *_64 routine.
-		inline void FADD(convertible_t& a, convertible_t& b, convertible_t& c) {
-			const double rhs = static_cast<double>(b.i64);
-			FADD_64(a, rhs, c);
-		}
-
-		inline void FSUB(convertible_t& a, convertible_t& b, convertible_t& c) {
-			const double rhs = static_cast<double>(b.i64);
-			FSUB_64(a, rhs, c);
-		}
-
-		inline void FMUL(convertible_t& a, convertible_t& b, convertible_t& c) {
-			const double rhs = static_cast<double>(b.i64);
-			FMUL_64(a, rhs, c);
-		}
-
-		inline void FDIV(convertible_t& a, convertible_t& b, convertible_t& c) {
-			const double rhs = static_cast<double>(b.i64);
-			FDIV_64(a, rhs, c);
-		}
-	}
-}
--- a/tests/test_alu_fpu/InstructionsPortable.cpp
+++ b/tests/test_alu_fpu/InstructionsPortable.cpp
@ -1,247 +0,0 @@
-//RandomX ALU + FPU test
-//https://github.com/tevador/RandomX
-//License: GPL v3
-
-#include "Instructions.h"
-#include <cfenv>
-#include <cmath>
-
-// Compilers that provide a native 128-bit integer: compute the high half of
-// a 64x64-bit product directly from the 128-bit result.
-#if defined(__SIZEOF_INT128__)
-	typedef unsigned __int128 uint128_t;
-	typedef __int128 int128_t;
-	// Upper 64 bits of the unsigned product a * b.
-	static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
-		return ((uint128_t)a * b) >> 64;
-	}
-	// Upper 64 bits of the signed product a * b. Returns int64_t, matching
-	// the generic fallback implementation of imulhi64 in this file.
-	static inline int64_t __imulhi64(int64_t a, int64_t b) {
-		return ((int128_t)a * b) >> 64;
-	}
-	#define umulhi64 __umulhi64
-	#define imulhi64 __imulhi64
-#endif
-
-// MSVC: use compiler intrinsics for rotates and high multiplies.
-#if defined(_MSC_VER)
-	// EVAL_DEFINE(X) pastes a 0 after the expansion of X, so the result can be
-	// tested with #if.  NOTE(review): this presumably relies on <intrin.h>
-	// defining __MACHINEX64(x) / __MACHINEARM64_X64(x) to expand to their
-	// argument only on the matching targets -- confirm against the header.
-	#define HAS_VALUE(X) X ## 0
-	#define EVAL_DEFINE(X) HAS_VALUE(X)
-	#include <intrin.h>
-	#include <stdlib.h>
-	#define ror64 _rotr64
-	#define rol64 _rotl64
-	#if EVAL_DEFINE(__MACHINEARM64_X64(1))
-		#define umulhi64 __umulh
-	#endif
-	#if EVAL_DEFINE(__MACHINEX64(1))
-		// Upper 64 bits of the signed product, taken from _mul128's high output.
-		static inline uint64_t __imulhi64(int64_t a, int64_t b) {
-			int64_t hi;
-			_mul128(a, b, &hi);
-			return hi;
-		}
-		#define imulhi64 __imulhi64
-	#endif
-#endif
-
-// Portable fallback: rotate a right by b bits, for b in [0, 63].
-#ifndef ror64
-	static inline uint64_t __ror64(uint64_t a, int b) {
-		// Mask the complementary count so that b == 0 shifts by 0 rather
-		// than by 64, which would be undefined behaviour for a 64-bit value.
-		return (a >> b) | (a << ((64 - b) & 63));
-	}
-	#define ror64 __ror64
-#endif
-
-// Portable fallback: rotate a left by b bits, for b in [0, 63].
-#ifndef rol64
-	static inline uint64_t __rol64(uint64_t a, int b) {
-		// Mask the complementary count so that b == 0 shifts by 0 rather
-		// than by 64, which would be undefined behaviour for a 64-bit value.
-		return (a << b) | (a >> ((64 - b) & 63));
-	}
-	#define rol64 __rol64
-#endif
-
-// Portable fallback for an arithmetic (sign-propagating) right shift.
-#ifndef sar64
-	#include <type_traits>
-	// The compiler's own >> on a signed 64-bit value.
-	constexpr int64_t builtintShr64(int64_t value, int shift) noexcept {
-		return value >> shift;
-	}
-
-	// Compile-time probe: true when the built-in shift keeps the sign bit.
-	struct UsesArithmeticShift : std::integral_constant<bool, builtintShr64(-1LL, 1) == -1LL> {
-	};
-
-	static inline int64_t __sar64(int64_t a, int b) {
-		if (UsesArithmeticShift::value) {
-			return builtintShr64(a, b);
-		}
-		if (a < 0) {
-			// Shift the complement so that ones are shifted in from the top.
-			return ~(~a >> b);
-		}
-		return a >> b;
-	}
-	#define sar64 __sar64
-#endif
-
-// Portable fallback: upper 64 bits of an unsigned 64x64-bit product, built
-// from four 32x32-bit partial products.
-#ifndef umulhi64
-	#define LO(x) ((x)&0xffffffff)
-	#define HI(x) ((x)>>32)
-	static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
-		const uint64_t aLow = LO(a);
-		const uint64_t aHigh = HI(a);
-		const uint64_t bLow = LO(b);
-		const uint64_t bHigh = HI(b);
-
-		const uint64_t lowLow = aLow * bLow;
-		const uint64_t lowHigh = aLow * bHigh;
-		const uint64_t highLow = aHigh * bLow;
-		const uint64_t highHigh = aHigh * bHigh;
-
-		// Sum of everything that lands in bits 32..63; only its carry is kept.
-		const uint64_t middle = HI(lowLow) + LO(lowHigh) + LO(highLow);
-		// Bits 64..95 of the product, plus the carry into bits 96..127.
-		const uint64_t upperLow = HI(middle) + HI(lowHigh) + HI(highLow) + LO(highHigh);
-		const uint64_t upperHigh = HI(upperLow) + HI(highHigh);
-
-		return (upperHigh << 32) + LO(upperLow);
-	}
-	#define umulhi64 __umulhi64
-#endif
-
-// Portable fallback: upper 64 bits of a signed 64x64-bit product, derived
-// from the unsigned high product.
-#ifndef imulhi64
-	static inline int64_t __imulhi64(int64_t a, int64_t b) {
-		// The operands are reinterpreted as unsigned for the multiplication.
-		// Each negative operand then requires subtracting the other operand
-		// from the high half. The corrections are done in unsigned arithmetic
-		// because the intermediate value may exceed the int64_t range even
-		// though the final result always fits.
-		uint64_t hi = umulhi64(a, b);
-		if (a < 0LL) hi -= static_cast<uint64_t>(b);
-		if (b < 0LL) hi -= static_cast<uint64_t>(a);
-		return static_cast<int64_t>(hi);
-	}
-	#define imulhi64 __imulhi64
-#endif
-
-// Returns 0 for subnormal inputs and the input itself for every other value.
-static double FlushDenormal(double x) {
-	const bool isSubnormal = (std::fpclassify(x) == FP_SUBNORMAL);
-	return isSubnormal ? 0 : x;
-}
-
-#define FTZ(x) FlushDenormal(x)
-
-namespace RandomX {
-
-	// Portable implementations of the VM instructions declared in
-	// Instructions.h. Each routine reads operands a and b and stores the
-	// result in c.
-	extern "C" {
-
-		// --- Addition / subtraction. The 32-bit forms operate on the low
-		// --- 32 bits and store the 32-bit result zero-extended to 64 bits.
-
-		void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 + b.u64;
-		}
-
-		void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 + b.u32;
-		}
-
-		void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 - b.u64;
-		}
-
-		void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 - b.u32;
-		}
-
-		// --- Multiplication.
-
-		// Low 64 bits of the 64x64-bit product.
-		void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 * b.u64;
-		}
-
-		// High 64 bits of the unsigned 64x64-bit product.
-		void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = umulhi64(a.u64, b.u64);
-		}
-
-		// Full 64-bit product of the unsigned low 32 bits.
-		void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = (uint64_t)a.u32 * b.u32;
-		}
-
-		// Full 64-bit product of the signed low 32 bits.
-		void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.i64 = (int64_t)a.i32 * b.i32;
-		}
-
-		// High 64 bits of the signed 64x64-bit product.
-		void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.i64 = imulhi64(a.i64, b.i64);
-		}
-
-		// --- Division: 64-bit dividend, 32-bit divisor; a zero divisor is
-		// --- treated as 1.
-
-		void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
-		}
-
-		void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			// INT64_MIN / -1 does not fit in int64_t, so it is defined to
-			// yield INT64_MIN. The test must look at b.i32, the value that is
-			// actually used as the divisor: checking all 64 bits of b would
-			// miss operands whose low half is -1 but whose high half is not.
-			if (a.i64 == INT64_MIN && b.i32 == -1)
-				c.i64 = INT64_MIN;
-			else
-				c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1);
-		}
-
-		// --- Bitwise logic. The 32-bit forms store the result zero-extended.
-
-		void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 & b.u64;
-		}
-
-		void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 & b.u32;
-		}
-
-		void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 | b.u64;
-		}
-
-		void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 | b.u32;
-		}
-
-		void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 ^ b.u64;
-		}
-
-		void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u32 ^ b.u32;
-		}
-
-		// --- Shifts and rotates: only the low 6 bits of b are used as the count.
-
-		void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 << (b.u64 & 63);
-		}
-
-		void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = a.u64 >> (b.u64 & 63);
-		}
-
-		void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = sar64(a.i64, b.u64 & 63);
-		}
-
-		void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = rol64(a.u64, (b.u64 & 63));
-		}
-
-		void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.u64 = ror64(a.u64, (b.u64 & 63));
-		}
-
-		// --- Floating point. Operand a is converted from its signed 64-bit
-		// --- integer value; subnormal results are flushed to zero by FTZ.
-
-		// Selects round-to-nearest.
-		void FPINIT() {
-			fesetround(FE_TONEAREST);
-		}
-
-		void FADD_64(convertible_t& a, double b, convertible_t& c) {
-			c.f64 = FTZ((double)a.i64 + b);
-		}
-
-		void FSUB_64(convertible_t& a, double b, convertible_t& c) {
-			c.f64 = FTZ((double)a.i64 - b);
-		}
-
-		void FMUL_64(convertible_t& a, double b, convertible_t& c) {
-			c.f64 = FTZ((double)a.i64 * b);
-		}
-
-		void FDIV_64(convertible_t& a, double b, convertible_t& c) {
-			c.f64 = FTZ((double)a.i64 / b);
-		}
-
-		// Square root of |a|; b is not used.
-		void FABSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
-			double d = fabs((double)a.i64);
-			c.f64 = FTZ(sqrt(d));
-		}
-
-		// Stores a converted to double, then selects the rounding mode from
-		// the low two bits of a; b is not used.
-		void FROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
-			c.f64 = (double)a.i64;
-			switch (a.u64 & 3) {
-				case RoundDown:
-					fesetround(FE_DOWNWARD);
-					break;
-				case RoundUp:
-					fesetround(FE_UPWARD);
-					break;
-				case RoundToZero:
-					fesetround(FE_TOWARDZERO);
-					break;
-				default:
-					fesetround(FE_TONEAREST);
-					break;
-			}
-		}
-	}
-}
--- a/tests/test_alu_fpu/InstructionsX64.asm
+++ b/tests/test_alu_fpu/InstructionsX64.asm
@ -1,276 +0,0 @@
-;RandomX ALU + FPU test
-;https://github.com/tevador/RandomX
-;License: GPL v3
-
-PUBLIC ADD_64
-PUBLIC ADD_32
-PUBLIC SUB_64
-PUBLIC SUB_32
-PUBLIC MUL_64
-PUBLIC MULH_64
-PUBLIC MUL_32
-PUBLIC IMUL_32
-PUBLIC IMULH_64
-PUBLIC DIV_64
-PUBLIC IDIV_64
-PUBLIC AND_64
-PUBLIC AND_32
-PUBLIC OR_64
-PUBLIC OR_32
-PUBLIC XOR_64
-PUBLIC XOR_32
-PUBLIC SHL_64
-PUBLIC SHR_64
-PUBLIC SAR_64
-PUBLIC ROL_64
-PUBLIC ROR_64
-PUBLIC FPINIT
-PUBLIC FADD_64
-PUBLIC FSUB_64
-PUBLIC FMUL_64
-PUBLIC FDIV_64
-PUBLIC FABSQRT
-PUBLIC FROUND
-
-; Read-only data. __XMMABS is a 16-byte mask whose two qwords are
-; 7FFFFFFFFFFFFFFFh: ANDing it with packed doubles clears their sign bits.
-CONST	SEGMENT
-__XMMABS	DB	0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 07fH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 07fH
-CONST	ENDS
-
-.code
-
-; Integer add/subtract.  In every routine of this file rcx points to operand
-; a, rdx to operand b and r8 to the result c.
-
-; [r8] = [rcx] + [rdx], 64-bit.
-ADD_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	add	rax, QWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-ADD_64 ENDP
-
-; 32-bit add of the low dwords; writing eax zero-extends into rax, so the
-; stored qword has a zero upper half.
-ADD_32 PROC
-	mov	eax, DWORD PTR [rcx]
-	add	eax, DWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-ADD_32 ENDP
-
-; [r8] = [rcx] - [rdx], 64-bit.
-SUB_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	sub	rax, QWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-SUB_64 ENDP
-
-; 32-bit subtract of the low dwords, stored zero-extended to 64 bits.
-SUB_32 PROC
-	mov	eax, DWORD PTR [rcx]
-	sub	eax, DWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-SUB_32 ENDP
-
-; Low 64 bits of the product [rcx] * [rdx].
-MUL_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	imul	rax, QWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-MUL_64 ENDP
-
-; High 64 bits of the unsigned product: one-operand mul leaves them in rdx.
-MULH_64 PROC
-	mov	rax, QWORD PTR [rdx]
-	mul	QWORD PTR [rcx]
-	mov	QWORD PTR [r8], rdx
-	ret	0
-MULH_64 ENDP
-
-; 64-bit product of the low dwords, each zero-extended by the 32-bit loads.
-MUL_32 PROC
-	mov	r9d, DWORD PTR [rcx]
-	mov	eax, DWORD PTR [rdx]
-	imul	r9, rax
-	mov	QWORD PTR [r8], r9
-	ret	0
-MUL_32 ENDP
-
-; 64-bit product of the low dwords, each sign-extended by movsxd.
-IMUL_32 PROC
-	movsxd	r9, DWORD PTR [rcx]
-	movsxd	rax, DWORD PTR [rdx]
-	imul	r9, rax
-	mov	QWORD PTR [r8], r9
-	ret	0
-IMUL_32 ENDP
-
-; High 64 bits of the signed product: one-operand imul leaves them in rdx.
-IMULH_64 PROC
-	mov	rax, QWORD PTR [rdx]
-	imul	QWORD PTR [rcx]
-	mov	QWORD PTR [r8], rdx
-	ret	0
-IMULH_64 ENDP
-
-; Unsigned divide of the qword at [rcx] by the low dword of [rdx]; a zero
-; divisor is replaced by 1.
-DIV_64 PROC
-	mov	r9d, DWORD PTR [rdx]
-	mov	eax, 1
-	test	r9d, r9d
-	cmovne	eax, r9d		; eax = divisor, or 1 if the divisor was 0
-	xor	edx, edx		; clear the high half of the rdx:rax dividend
-	mov	r9d, eax
-	mov	rax, QWORD PTR [rcx]
-	div	r9
-	mov	QWORD PTR [r8], rax
-	ret 0
-DIV_64 ENDP
-
-; Signed divide of the qword at [rcx] by the sign-extended low dword of [rdx].
-; A zero divisor is replaced by 1.  INT64_MIN / -1 would raise a divide error,
-; so that case stores INT64_MIN without dividing.
-IDIV_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	mov	rcx, -9223372036854775808
-	cmp	rax, rcx
-	jne	SHORT SAFE_IDIV_64
-	; Compare the dword that is actually used as the divisor: a qword compare
-	; would let a divisor of -1 with a non-matching upper half reach idiv.
-	cmp	DWORD PTR [rdx], -1
-	jne	SHORT SAFE_IDIV_64
-	mov	QWORD PTR [r8], rcx
-	ret	0
-SAFE_IDIV_64:
-	mov	ecx, DWORD PTR [rdx]
-	test	ecx, ecx
-	mov	edx, 1
-	cmovne	edx, ecx		; edx = divisor, or 1 if the divisor was 0
-	movsxd	rcx, edx
-	cqo				; sign-extend rax into rdx:rax
-	idiv	rcx
-	mov	QWORD PTR [r8], rax
-	ret 0
-IDIV_64 ENDP
-
-; Bitwise logic.  The *_64 forms use the full qwords; the *_32 forms use the
-; low dwords and store the result zero-extended to 64 bits.
-
-AND_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	and	rax, QWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-AND_64 ENDP
-
-AND_32 PROC
-	mov	eax, DWORD PTR [rcx]
-	and	eax, DWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-AND_32 ENDP
-
-OR_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	or	rax, QWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-OR_64 ENDP
-
-OR_32 PROC
-	mov	eax, DWORD PTR [rcx]
-	or	eax, DWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-OR_32 ENDP
-
-XOR_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	xor	rax, QWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-XOR_64 ENDP
-
-XOR_32 PROC
-	mov	eax, DWORD PTR [rcx]
-	xor	eax, DWORD PTR [rdx]
-	mov	QWORD PTR [r8], rax
-	ret	0
-XOR_32 ENDP
-
-; Shifts and rotates of the qword at [rcx].  The count is taken from cl after
-; loading [rdx] into rcx; for 64-bit operands the CPU uses only the low 6 bits.
-
-; Logical shift left.
-SHL_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	mov	rcx, QWORD PTR [rdx]
-	shl	rax, cl
-	mov	QWORD PTR [r8], rax
-	ret	0
-SHL_64 ENDP
-
-; Logical shift right.
-SHR_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	mov	rcx, QWORD PTR [rdx]
-	shr	rax, cl
-	mov	QWORD PTR [r8], rax
-	ret	0
-SHR_64 ENDP
-
-; Arithmetic (sign-propagating) shift right.
-SAR_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	mov	rcx, QWORD PTR [rdx]
-	sar	rax, cl
-	mov	QWORD PTR [r8], rax
-	ret	0
-SAR_64 ENDP
-
-; Rotate left.
-ROL_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	mov	rcx, QWORD PTR [rdx]
-	rol	rax, cl
-	mov	QWORD PTR [r8], rax
-	ret	0
-ROL_64 ENDP
-
-; Rotate right.
-ROR_64 PROC
-	mov	rax, QWORD PTR [rcx]
-	mov	rcx, QWORD PTR [rdx]
-	ror	rax, cl
-	mov	QWORD PTR [r8], rax
-	ret	0
-ROR_64 ENDP
-
-; Loads MXCSR with 40896 (9FC0h): flush-to-zero and denormals-are-zero set,
-; all exceptions masked, rounding control 00 (round to nearest).
-; [rsp+8] is used as scratch space for the value.
-FPINIT PROC
-	mov	DWORD PTR [rsp+8], 40896
-	ldmxcsr	DWORD PTR [rsp+8]
-	ret	0
-FPINIT ENDP
-
-; Floating-point arithmetic.  The signed qword at [rcx] is converted to double
-; in xmm0, combined with the double already in xmm1 (the second argument), and
-; the result is stored to [r8].
-
-FADD_64 PROC
-	cvtsi2sd xmm0, QWORD PTR [rcx]
-	addsd	xmm0, xmm1
-	movsd	QWORD PTR [r8], xmm0
-	ret	0
-FADD_64 ENDP
-
-FSUB_64 PROC
-	cvtsi2sd xmm0, QWORD PTR [rcx]
-	subsd	xmm0, xmm1
-	movsd	QWORD PTR [r8], xmm0
-	ret	0
-FSUB_64 ENDP
-
-FMUL_64 PROC
-	cvtsi2sd xmm0, QWORD PTR [rcx]
-	mulsd	xmm0, xmm1
-	movsd	QWORD PTR [r8], xmm0
-	ret	0
-FMUL_64 ENDP
-
-FDIV_64 PROC
-	cvtsi2sd xmm0, QWORD PTR [rcx]
-	divsd	xmm0, xmm1
-	movsd	QWORD PTR [r8], xmm0
-	ret	0
-FDIV_64 ENDP
-
-; Square root of the absolute value of the signed qword at [rcx], converted
-; to double; the result is stored to [r8].  [rdx] is not read.
-FABSQRT PROC
-	cvtsi2sd xmm0, QWORD PTR [rcx]
-	andps	xmm0, XMMWORD PTR __XMMABS	; clear the sign bit
-	sqrtsd	xmm1, xmm0
-	movsd	QWORD PTR [r8], xmm1
-	ret	0
-FABSQRT ENDP
-
-; Stores the signed qword at [rcx] converted to double in [r8], then reloads
-; MXCSR with the FPINIT base value plus a rounding mode taken from the low two
-; bits of the operand.  [rdx] is not read.
-FROUND PROC
-	cvtsi2sd xmm0, QWORD PTR [rcx]
-	movsd	QWORD PTR [r8], xmm0
-	mov	rax, QWORD PTR [rcx]
-	shl	rax, 13			; move bits 0-1 to the rounding-control field (bits 13-14)
-	and	eax, 24576		; 6000h: keep only the rounding-control bits
-	or	eax, 40896		; 9FC0h: same base MXCSR value as FPINIT
-	mov	DWORD PTR [rsp+8], eax
-	ldmxcsr	DWORD PTR [rsp+8]
-	ret	0
-FROUND ENDP
-
-END
--- a/tests/test_alu_fpu/TestAluFpu.cpp
+++ b/tests/test_alu_fpu/TestAluFpu.cpp
@ -1,283 +0,0 @@
-//RandomX ALU + FPU test
-//https://github.com/tevador/RandomX
-//License: GPL v3
-
-#include <iostream>
-#include <iomanip>
-#include <limits>
-#include "Instructions.h"
-
-using namespace RandomX;
-
-typedef void(*VmOperation)(convertible_t&, convertible_t&, convertible_t&);
-
-double rxRound(uint32_t mode, int64_t x, int64_t y, VmOperation op) { // calls FROUND with `mode`, then runs op on (x, y) and returns its result as a double
-	convertible_t a, b, c;
-	a.u64 = mode;
-	FROUND(a, b, c); // a carries the mode; b has not been assigned yet at this point
-	a.i64 = x;
-	b.i64 = y;
-	op(a, b, c); // c is overwritten by the operation under test
-	return c.f64; // result is read back through the f64 member
-}
-
-#define CATCH_CONFIG_MAIN
-#include "catch.hpp"
-
-#define RX_EXECUTE_U64(va, vb, INST) do { /* assigns a.u64 and b.u64, then calls INST(a, b, c); a, b and c must exist at the use site */ \
-	a.u64 = va; \
-	b.u64 = vb; \
-	INST(a, b, c); \
-	} while(false) /* do/while(false) wrapper: the use site supplies the trailing semicolon */
-
-#define RX_EXECUTE_I64(va, vb, INST) do { /* assigns a.i64 and b.i64 (signed members), then calls INST(a, b, c); a, b and c must exist at the use site */ \
-	a.i64 = va; \
-	b.i64 = vb; \
-	INST(a, b, c); \
-	} while(false) /* do/while(false) wrapper: the use site supplies the trailing semicolon */
-
-TEST_CASE("Integer addition (64-bit)", "[ADD_64]") {
-	convertible_t a, b, c;
-	// Checks a carry out of the low 32 bits and wrap-around at 2^64.
-	RX_EXECUTE_U64(0xFFFFFFFF, 0x1, ADD_64);
-	REQUIRE(c.u64 == 0x100000000); // the carry propagates into bit 32
-
-	RX_EXECUTE_U64(0x8000000000000000, 0x8000000000000000, ADD_64);
-	REQUIRE(c.u64 == 0x0); // 2^63 + 2^63 wraps to 0
-}
-
-TEST_CASE("Integer addition (32-bit)", "[ADD_32]") {
-	convertible_t a, b, c;
-	// Checks that the sum wraps at 32 bits and that the upper halves of the inputs do not leak into the result.
-	RX_EXECUTE_U64(0xFFFFFFFF, 0x1, ADD_32);
-	REQUIRE(c.u64 == 0); // 32-bit wrap-around, with no carry into bit 32
-
-	RX_EXECUTE_U64(0xFF00000000000001, 0x0000000100000001, ADD_32);
-	REQUIRE(c.u64 == 2); // only the low dwords (1 + 1) contribute
-}
-
-TEST_CASE("Integer subtraction (64-bit)", "[SUB_64]") {
-	convertible_t a, b, c;
-	// Checks unsigned wrap-around when the subtrahend is larger than the minuend.
-	RX_EXECUTE_U64(1, 0xFFFFFFFF, SUB_64);
-	REQUIRE(c.u64 == 0xFFFFFFFF00000002); // 1 - 0xFFFFFFFF modulo 2^64
-}
-
-TEST_CASE("Integer subtraction (32-bit)", "[SUB_32]") {
-	convertible_t a, b, c;
-	// Same operands as the 64-bit subtraction test, but the difference is expected to wrap at 32 bits.
-	RX_EXECUTE_U64(1, 0xFFFFFFFF, SUB_32);
-	REQUIRE(c.u64 == 2); // 1 - 0xFFFFFFFF modulo 2^32, with a zero upper half
-}
-
-TEST_CASE("Unsigned multiplication (64-bit, low half)", "[MUL_64]") {
-	convertible_t a, b, c;
-	// Fixed test vector; per the test name the expected value is the low 64 bits of the product.
-	RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MUL_64);
-	REQUIRE(c.u64 == 0x28723424A9108E51);
-}
-
-TEST_CASE("Unsigned multiplication (64-bit, high half)", "[MULH_64]") {
-	convertible_t a, b, c;
-	// Same operands as the MUL_64 test; per the test name the expected value is the high 64 bits of the unsigned product.
-	RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MULH_64);
-	REQUIRE(c.u64 == 0xB4676D31D2B34883);
-}
-
-TEST_CASE("Unsigned multiplication (32-bit x 32-bit -> 64-bit)", "[MUL_32]") {
-	convertible_t a, b, c;
-	// The operands have non-zero upper halves; per the test name only the low 32 bits of each are multiplied (unsigned).
-	RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MUL_32);
-	REQUIRE(c.u64 == 0xB001AA5FA9108E51); // full 64-bit product of the two low dwords
-}
-
-TEST_CASE("Signed multiplication (32-bit x 32-bit -> 64-bit)", "[IMUL_32]") {
-	convertible_t a, b, c;
-	// Both low dwords (0xBA88A72B, 0xF18D6273) have bit 31 set, so as signed values they are negative and the product is positive.
-	RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, IMUL_32);
-	REQUIRE(c.u64 == 0x03EBA0C1A9108E51); // differs from the unsigned MUL_32 expectation only in the upper half
-}
-
-TEST_CASE("Signed multiplication (64-bit, high half)", "[IMULH_64]") {
-	convertible_t a, b, c;
-	// Both operands have bit 63 set, so as signed values they are negative and the high half of the product is positive.
-	RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, IMULH_64);
-	REQUIRE(c.u64 == 0x02D93EF1269D3EE5);
-}
-
-TEST_CASE("Unsigned division (64-bit / 32-bit -> 32-bit)", "[DIV_64]") {
-	convertible_t a, b, c;
-	// Covers a regular quotient, a zero divisor, and a divisor wider than 32 bits.
-	RX_EXECUTE_U64(8774217225983458895, 3014068202, DIV_64);
-	REQUIRE(c.u64 == 2911087818);
-
-	RX_EXECUTE_U64(8774217225983458895, 0, DIV_64);
-	REQUIRE(c.u64 == 8774217225983458895); // zero divisor: the expected result equals the dividend
-
-	RX_EXECUTE_U64(3014068202, 8774217225983458895, DIV_64);
-	REQUIRE(c.u64 == 2); // consistent with only the divisor's low 32 bits (1222344271) being used
-}
-
-TEST_CASE("Signed division (64-bit / 32-bit -> 32-bit)", "[IDIV_64]") {
-	convertible_t a, b, c;
-	// Covers a negative 32-bit divisor, a zero divisor, the INT64_MIN / -1 overflow case, and a divisor wider than 32 bits.
-	RX_EXECUTE_U64(8774217225983458895, 3014068202, IDIV_64);
-	REQUIRE(c.u64 == 0xFFFFFFFE67B4994E); // 3014068202 == 0xB3A707EA is negative as a signed 32-bit value, hence the negative quotient
-
-	RX_EXECUTE_U64(8774217225983458895, 0, IDIV_64);
-	REQUIRE(c.u64 == 8774217225983458895); // zero divisor: the expected result equals the dividend
-
-	RX_EXECUTE_U64(0x8000000000000000, 0xFFFFFFFFFFFFFFFF, IDIV_64);
-	REQUIRE(c.u64 == 0x8000000000000000); // INT64_MIN / -1 is not representable; the expected result is the dividend
-
-	RX_EXECUTE_U64(0xFFFFFFFFB3A707EA, 8774217225983458895, IDIV_64);
-	REQUIRE(c.u64 == 0xFFFFFFFFFFFFFFFF); // -1280899094 divided by the divisor's low 32 bits (1222344271) truncates to -1
-}
-
-TEST_CASE("Bitwise AND (64-bit)", "[AND_64]") {
-	convertible_t a, b, c;
-	// Repeating nibble patterns: 0xC & 0xA == 0x8 in every nibble.
-	RX_EXECUTE_U64(0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA, AND_64);
-	REQUIRE(c.u64 == 0x8888888888888888);
-}
-
-TEST_CASE("Bitwise AND (32-bit)", "[AND_32]") {
-	convertible_t a, b, c;
-	// Same operands as the 64-bit AND test; only the low 32 bits are expected in the result.
-	RX_EXECUTE_U64(0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA, AND_32);
-	REQUIRE(c.u64 == 0x88888888); // upper 32 bits of the result are zero
-}
-
-TEST_CASE("Bitwise OR (64-bit)", "[OR_64]") {
-	convertible_t a, b, c;
-	// Repeating nibble patterns: 0x4 | 0xA == 0xE in every nibble.
-	RX_EXECUTE_U64(0x4444444444444444, 0xAAAAAAAAAAAAAAAA, OR_64);
-	REQUIRE(c.u64 == 0xEEEEEEEEEEEEEEEE);
-}
-
-TEST_CASE("Bitwise OR (32-bit)", "[OR_32]") {
-	convertible_t a, b, c;
-	// Same operands as the 64-bit OR test; only the low 32 bits are expected in the result.
-	RX_EXECUTE_U64(0x4444444444444444, 0xAAAAAAAAAAAAAAAA, OR_32);
-	REQUIRE(c.u64 == 0xEEEEEEEE); // upper 32 bits of the result are zero
-}
-
-TEST_CASE("Bitwise XOR (64-bit)", "[XOR_64]") {
-	convertible_t a, b, c;
-	// Repeating nibble patterns: 0x8 ^ 0xA == 0x2 in every nibble.
-	RX_EXECUTE_U64(0x8888888888888888, 0xAAAAAAAAAAAAAAAA, XOR_64);
-	REQUIRE(c.u64 == 0x2222222222222222);
-}
-
-TEST_CASE("Bitwise XOR (32-bit)", "[XOR_32]") {
-	convertible_t a, b, c;
-	// Same operands as the 64-bit XOR test; only the low 32 bits are expected in the result.
-	RX_EXECUTE_U64(0x8888888888888888, 0xAAAAAAAAAAAAAAAA, XOR_32);
-	REQUIRE(c.u64 == 0x22222222); // upper 32 bits of the result are zero
-}
-
-TEST_CASE("Logical left shift (64-bit)", "[SHL_64]") {
-	convertible_t a, b, c;
-	// Covers a plain shift, an oversized shift count, and a bit shifted out of the top.
-	RX_EXECUTE_U64(0x3, 52, SHL_64);
-	REQUIRE(c.u64 == 0x30000000000000);
-
-	RX_EXECUTE_U64(953360005391419562, 4569451684712230561, SHL_64);
-	REQUIRE(c.u64 == 6978065200108797952); // the count is expected to be taken modulo 64 (4569451684712230561 % 64 == 33)
-
-	RX_EXECUTE_U64(0x8000000000000000, 1, SHL_64);
-	REQUIRE(c.u64 == 0); // the top bit is shifted out and not rotated back in
-}
-
-TEST_CASE("Logical right shift (64-bit)", "[SHR_64]") {
-	convertible_t a, b, c;
-	// Covers a shift that clears the value, an oversized shift count, and zero-fill of the top bit.
-	RX_EXECUTE_U64(0x3, 52, SHR_64);
-	REQUIRE(c.u64 == 0);
-
-	RX_EXECUTE_U64(953360005391419562, 4569451684712230561, SHR_64);
-	REQUIRE(c.u64 == 110985711); // the count is expected to be taken modulo 64 (4569451684712230561 % 64 == 33)
-
-	RX_EXECUTE_U64(0x8000000000000000, 1, SHR_64);
-	REQUIRE(c.u64 == 0x4000000000000000); // logical shift: the vacated top bit is filled with 0
-}
-
-TEST_CASE("Arithmetic right shift (64-bit)", "[SAR_64]") {
-	convertible_t a, b, c;
-	// Uses the signed macro; covers rounding toward negative infinity, sign fill, and an oversized shift count.
-	RX_EXECUTE_I64(-9, 2, SAR_64);
-	REQUIRE(c.i64 == -3); // -9 / 4 rounded down, not toward zero
-
-	RX_EXECUTE_I64(INT64_MIN, 63, SAR_64);
-	REQUIRE(c.i64 == -1); // the sign bit is replicated into every position
-
-	RX_EXECUTE_I64(INT64_MAX, 163768499474606398, SAR_64);
-	REQUIRE(c.i64 == 1); // the count is expected to be taken modulo 64 (163768499474606398 % 64 == 62)
-}
-
-TEST_CASE("Circular left shift (64-bit)", "[ROL_64]") {
-	convertible_t a, b, c;
-	// Covers a rotation without wrap, an oversized count, and the top bit wrapping around to bit 0.
-	RX_EXECUTE_U64(0x3, 52, ROL_64);
-	REQUIRE(c.u64 == 0x30000000000000);
-
-	RX_EXECUTE_U64(953360005391419562, 4569451684712230561, ROL_64);
-	REQUIRE(c.u64 == 6978065200552740799); // the count is expected to be taken modulo 64 (4569451684712230561 % 64 == 33)
-
-	RX_EXECUTE_U64(0x8000000000000000, 1, ROL_64);
-	REQUIRE(c.u64 == 1); // bit 63 wraps around to bit 0
-}
-
-TEST_CASE("Circular right shift (64-bit)", "[ROR_64]") {
-	convertible_t a, b, c;
-	// Covers low bits wrapping around to the top, an oversized count, and a rotation without wrap.
-	RX_EXECUTE_U64(0x3, 52, ROR_64);
-	REQUIRE(c.u64 == 12288); // rotating right by 52 equals rotating left by 12: 3 << 12
-
-	RX_EXECUTE_U64(953360005391419562, 4569451684712230561, ROR_64);
-	REQUIRE(c.u64 == 0xD835C455069D81EF); // the count is expected to be taken modulo 64 (4569451684712230561 % 64 == 33)
-
-	RX_EXECUTE_U64(0x8000000000000000, 1, ROR_64);
-	REQUIRE(c.u64 == 0x4000000000000000);
-}
-
-TEST_CASE("Denormal numbers are flushed to zero", "[FTZ]") {
-	FPINIT(); // set up the floating-point environment before the division
-	convertible_t a, c;
-	a.i64 = 1;
-	FDIV_64(a, std::numeric_limits<double>::max(), c); // 1 / DBL_MAX is below the smallest normal double
-	REQUIRE(c.f64 == 0.0); // the subnormal quotient is expected to be flushed to exactly zero
-}
-
-TEST_CASE("IEEE-754 compliance", "[FPU]") {
-	FPINIT();
-	convertible_t a, c;
-	// Division by zero must produce an infinity carrying the sign of the dividend.
-	a.i64 = 1;
-	FDIV_64(a, 0, c);
-	REQUIRE(c.f64 == std::numeric_limits<double>::infinity());
-	// Same check with a negative dividend.
-	a.i64 = -1;
-	FDIV_64(a, 0, c);
-	REQUIRE(c.f64 == -std::numeric_limits<double>::infinity());
-	// Addition: both operands exceed 2^53, so the expected values reflect rounding of the int64 -> double conversions. NOTE(review): FADD/FSUB/FDIV here vs FDIV_64 above; their declarations are not visible in this file.
-	REQUIRE(rxRound(RoundToNearest, 33073499373184121, -37713516328519941, &FADD) == -4640016955335824.0);
-	REQUIRE(rxRound(RoundDown, 33073499373184121, -37713516328519941, &FADD) == -4640016955335824.0);
-	REQUIRE(rxRound(RoundUp, 33073499373184121, -37713516328519941, &FADD) == -4640016955335812.0);
-	REQUIRE(rxRound(RoundToZero, 33073499373184121, -37713516328519941, &FADD) == -4640016955335816.0);
-	// Subtraction under each rounding mode.
-	REQUIRE(rxRound(RoundToNearest, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107858e+18);
-	REQUIRE(rxRound(RoundDown, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107868e+18);
-	REQUIRE(rxRound(RoundUp, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107848e+18);
-	REQUIRE(rxRound(RoundToZero, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107848e+18);
-	// Division: 1 / -10 is not exactly representable, so the result depends on the rounding direction.
-	REQUIRE(rxRound(RoundToNearest, 1, -10, &FDIV) == -0.10000000000000001);
-	REQUIRE(rxRound(RoundDown, 1, -10, &FDIV) == -0.10000000000000001);
-	REQUIRE(rxRound(RoundUp, 1, -10, &FDIV) == -0.099999999999999992);
-	REQUIRE(rxRound(RoundToZero, 1, -10, &FDIV) == -0.099999999999999992);
-	// Square root of a negative input: the expected values are roundings of sqrt(2), so the sign of -2 is dropped first.
-	REQUIRE(rxRound(RoundToNearest, -2, 0, &FABSQRT) == 1.4142135623730951);
-	REQUIRE(rxRound(RoundDown, -2, 0, &FABSQRT) == 1.4142135623730949);
-	REQUIRE(rxRound(RoundUp, -2, 0, &FABSQRT) == 1.4142135623730951);
-	REQUIRE(rxRound(RoundToZero, -2, 0, &FABSQRT) == 1.4142135623730949);
-}
--- a/tests/test_alu_fpu/catch.hpp
+++ b/tests/test_alu_fpu/catch.hpp
--- a/tests/test_alu_fpu/makefile
+++ b/tests/test_alu_fpu/makefile
@ -1,10 +0,0 @@
-CXXFLAGS=-Wall -std=c++17 -O0
-# Link step for the test binary; $@ expands to the target name (TestAluFpu).
-TestAluFpu: TestAluFpu.o InstructionsPortable.o
-	$(CXX) TestAluFpu.o InstructionsPortable.o -o $@
-  
-TestAluFpu.o: TestAluFpu.cpp
-InstructionsPortable.o: InstructionsPortable.cpp
-# The two .o rules above have no recipe, so make's built-in .cpp -> .o rule (which uses CXXFLAGS) compiles them. NOTE(review): clean is not declared .PHONY.
-clean:
-	rm -f TestAluFpu TestAluFpu.o InstructionsPortable.o