From 8b1102ee05200c0b22b8c89b6b204142725e1958 Mon Sep 17 00:00:00 2001
From: tevador <tevador@gmail.com>
Date: Tue, 15 Jan 2019 00:01:11 +0100
Subject: [PATCH] Interpreter + async mode

---
 src/AddressTransform.cpp          | 292 ++++++++++++++++++++++++++++++
 src/AssemblyGeneratorX86.cpp      |   4 +-
 src/Cache.hpp                     |   2 +-
 src/CompiledVirtualMachine.cpp    |  13 +-
 src/CompiledVirtualMachine.hpp    |   5 +-
 src/InterpretedVirtualMachine.cpp | 246 +++++++++++++++----------
 src/InterpretedVirtualMachine.hpp |  31 +++-
 src/JitCompilerX86.cpp            |   6 +-
 src/LightClientAsyncWorker.cpp    |  94 ++++++++++
 src/LightClientAsyncWorker.hpp    |  52 ++++++
 src/VirtualMachine.cpp            |  56 +-----
 src/VirtualMachine.hpp            |   9 +-
 src/common.hpp                    |  23 ++-
 src/dataset.cpp                   |  48 +++--
 src/dataset.hpp                   |   6 +-
 src/instructions.hpp              |   8 +-
 src/instructionsPortable.cpp      |   6 +-
 src/main.cpp                      |  16 +-
 18 files changed, 702 insertions(+), 215 deletions(-)
 create mode 100644 src/AddressTransform.cpp
 create mode 100644 src/LightClientAsyncWorker.cpp
 create mode 100644 src/LightClientAsyncWorker.hpp
diff --git a/src/AddressTransform.cpp b/src/AddressTransform.cpp
new file mode 100644
index 0000000..b8070a0
--- /dev/null
+++ b/src/AddressTransform.cpp
@@ -0,0 +1,292 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include "common.hpp"
+#include "InterpretedVirtualMachine.hpp"
+
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+
+namespace RandomX {
+
+	class Mul9Transform : public ITransform { //address transform x -> 9 * x + c with 32-bit wrap-around
+	public:
+		Mul9Transform(int32_t cc) : c(cc) {
+			std::ostringstream oss;
+			oss << "mul9_" << std::hex << (cc & 255); //name = "mul9_" + low byte of the constant in hex
+			name = oss.str();
+		}
+		int32_t apply(int32_t x) const override { //unsigned math: signed overflow here would be undefined behavior
+			return static_cast<int32_t>(9u * static_cast<uint32_t>(x) + static_cast<uint32_t>(c));
+		}
+		const char* getName() const override { //pointer stays valid for the lifetime of this object
+			return name.c_str();
+		}
+		std::ostream& printAsm(std::ostream& os) const override { //writes the matching x86 'lea' line with the signed constant
+			os << "lea ecx, [rcx+rcx*8" << std::showpos << c << "]" << std::noshowpos << std::endl;
+			return os;
+		}
+		std::ostream& printCxx(std::ostream& os) const override { //writes the C++ definition of this instance
+			os << "static const Mul9Transform " << name << "(" << c << ");" << std::endl;
+			return os;
+		}
+	private:
+		int32_t c;
+		std::string name;
+	};
+
+	class AddTransform : public ITransform { //address transform x -> x + c with 32-bit wrap-around
+	public:
+		AddTransform(int32_t cc) : c(cc) {
+			std::ostringstream oss;
+			oss << "add_" << std::hex << (cc & 255); //name = "add_" + low byte of the constant in hex
+			name = oss.str();
+		}
+		int32_t apply(int32_t x) const override { //unsigned math: signed overflow here would be undefined behavior
+			return static_cast<int32_t>(static_cast<uint32_t>(x) + static_cast<uint32_t>(c));
+		}
+		const char* getName() const override { //pointer stays valid for the lifetime of this object
+			return name.c_str();
+		}
+		std::ostream& printAsm(std::ostream& os) const override { //writes a 'db 64' line followed by the 'add ecx' line
+			os << "db 64" << std::endl;
+			os << "add ecx, " << c << std::endl;
+			return os;
+		}
+		std::ostream& printCxx(std::ostream& os) const override { //writes the C++ definition of this instance
+			os << "static const AddTransform " << name << "(" << c << ");" << std::endl;
+			return os;
+		}
+	private:
+		int32_t c;
+		std::string name;
+	};
+
+	class XorTransform : public ITransform { //address transform x -> x ^ c
+	public:
+		XorTransform(int32_t cc) : c(cc) {
+			std::ostringstream oss;
+			oss << "xor_" << std::hex << (cc & 255); //name = "xor_" + low byte of the constant in hex
+			name = oss.str();
+		}
+		int32_t apply(int32_t x) const override {
+			return x ^ c;
+		}
+		const char* getName() const override { //pointer stays valid for the lifetime of this object
+			return name.c_str();
+		}
+		std::ostream& printAsm(std::ostream& os) const override { //writes a 'db 64' line followed by the 'xor ecx' line
+			os << "db 64" << std::endl;
+			os << "xor ecx, " << c << std::endl;
+			return os;
+		}
+		std::ostream& printCxx(std::ostream& os) const override { //writes the C++ definition of this instance
+			os << "static const XorTransform " << name << "(" << c << ");" << std::endl;
+			return os;
+		}
+	private:
+		int32_t c;
+		std::string name;
+	};
+
+	static const Mul9Transform mul9_6d(109); //instances are named <kind>_<low byte of the constant in hex>: 109 = 0x6d, -80 & 255 = 0xb0
+	static const XorTransform xor_60(96);
+	static const Mul9Transform mul9_ed(-19);
+	static const AddTransform add_9e(-98);
+	static const AddTransform add_eb(-21);
+	static const XorTransform xor_b0(-80);
+	static const Mul9Transform mul9_a4(-92);
+	static const AddTransform add_71(113);
+	static const Mul9Transform mul9_64(100);
+	static const AddTransform add_d9(-39);
+	static const XorTransform xor_78(120);
+	static const Mul9Transform mul9_89(-119);
+	static const AddTransform add_8f(-113);
+	static const AddTransform add_6f(111);
+	static const XorTransform xor_68(104);
+	static const Mul9Transform mul9_ad(-83);
+	static const Mul9Transform mul9_7f(127);
+	static const XorTransform xor_90(-112);
+	static const AddTransform add_59(89);
+	static const AddTransform add_e0(-32);
+	static const AddTransform add_68(104);
+	static const XorTransform xor_88(-120);
+	static const XorTransform xor_18(24);
+	static const Mul9Transform mul9_9(9);
+	static const AddTransform add_e1(-31);
+	static const XorTransform xor_f0(-16);
+	static const AddTransform add_44(68);
+	static const Mul9Transform mul9_92(-110);
+	static const XorTransform xor_40(64);
+	static const XorTransform xor_d8(-40);
+	static const XorTransform xor_f8(-8);
+	static const AddTransform add_f6(-10);
+	static const XorTransform xor_e0(-32);
+	static const AddTransform add_e(14);
+	static const Mul9Transform mul9_d2(-46);
+	static const XorTransform xor_98(-104);
+	static const Mul9Transform mul9_24(36);
+	static const AddTransform add_64(100);
+	static const Mul9Transform mul9_bf(-65);
+	static const Mul9Transform mul9_1b(27);
+	static const Mul9Transform mul9_5b(91);
+	static const AddTransform add_9b(-101);
+	static const AddTransform add_a2(-94);
+	static const Mul9Transform mul9_f6(-10);
+	static const XorTransform xor_50(80);
+	static const AddTransform add_94(-108);
+	static const AddTransform add_c6(-58);
+	static const XorTransform xor_30(48);
+	static const Mul9Transform mul9_49(73);
+	static const XorTransform xor_d0(-48);
+	static const XorTransform xor_20(32);
+	static const XorTransform xor_a0(-96);
+	static const AddTransform add_76(118);
+	static const AddTransform add_5b(91);
+	static const Mul9Transform mul9_12(18);
+	static const AddTransform add_f5(-11);
+	static const Mul9Transform mul9_3f(63);
+	static const AddTransform add_72(114);
+	static const Mul9Transform mul9_2d(45);
+	static const AddTransform add_bd(-67);
+	static const AddTransform add_35(53);
+	static const Mul9Transform mul9_9b(-101);
+	static const Mul9Transform mul9_ff(-1);
+	static const XorTransform xor_10(16);
+	static const Mul9Transform mul9_db(-37);
+	static const Mul9Transform mul9_e4(-28);
+	static const Mul9Transform mul9_c9(-55);
+	static const XorTransform xor_a8(-88);
+	static const XorTransform xor_b8(-72);
+	static const AddTransform add_24(36);
+	static const XorTransform xor_c8(-56);
+	static const AddTransform add_74(116);
+	static const XorTransform xor_58(88);
+	static const XorTransform xor_80(-128);
+	static const AddTransform add_32(50);
+	static const AddTransform add_69(105);
+	static const AddTransform add_db(-37);
+	static const XorTransform xor_70(112);
+	static const XorTransform xor_8(8);
+	static const XorTransform xor_e8(-24);
+	static const Mul9Transform mul9_76(118);
+	static const XorTransform xor_48(72);
+	static const XorTransform xor_c0(-64);
+	static const AddTransform add_28(40);
+	static const Mul9Transform mul9_b6(-74);
+	static const Mul9Transform mul9_52(82);
+	static const Mul9Transform mul9_36(54);
+	static const XorTransform xor_38(56);
+	static const XorTransform xor_28(40);
+	static const AddTransform add_57(87);
+
+	const ITransform* InterpretedVirtualMachine::addressTransformations[TransformationCount] = { //90 entries; NOTE(review): confirm TransformationCount == 90, extra slots would be null
+		&mul9_6d,
+		&xor_60,
+		&mul9_ed,
+		&add_9e,
+		&add_eb,
+		&xor_b0,
+		&mul9_a4,
+		&add_71,
+		&mul9_64,
+		&add_d9,
+		&xor_78,
+		&mul9_89,
+		&add_8f,
+		&add_6f,
+		&xor_68,
+		&mul9_ad,
+		&mul9_7f,
+		&xor_90,
+		&add_59,
+		&add_e0,
+		&add_68,
+		&xor_88,
+		&xor_18,
+		&mul9_9,
+		&add_e1,
+		&xor_f0,
+		&add_44,
+		&mul9_92,
+		&xor_40,
+		&xor_d8,
+		&xor_f8,
+		&add_f6,
+		&xor_e0,
+		&add_e,
+		&mul9_d2,
+		&xor_98,
+		&mul9_24,
+		&add_64,
+		&mul9_bf,
+		&mul9_1b,
+		&mul9_5b,
+		&add_9b,
+		&add_a2,
+		&mul9_f6,
+		&xor_50,
+		&add_94,
+		&add_c6,
+		&xor_30,
+		&mul9_49,
+		&xor_d0,
+		&xor_20,
+		&xor_a0,
+		&add_76,
+		&add_5b,
+		&mul9_12,
+		&add_f5,
+		&mul9_3f,
+		&add_72,
+		&mul9_2d,
+		&add_bd,
+		&add_35,
+		&mul9_9b,
+		&mul9_ff,
+		&xor_10,
+		&mul9_db,
+		&mul9_e4,
+		&mul9_c9,
+		&xor_a8,
+		&xor_b8,
+		&add_24,
+		&xor_c8,
+		&add_74,
+		&xor_58,
+		&xor_80,
+		&add_32,
+		&add_69,
+		&add_db,
+		&xor_70,
+		&xor_8,
+		&xor_e8,
+		&mul9_76,
+		&xor_48,
+		&xor_c0,
+		&add_28,
+		&mul9_b6,
+		&mul9_52,
+		&mul9_36,
+		&xor_38,
+		&xor_28,
+		&add_57,
+	}; //plain '&obj' converts implicitly to const ITransform*; no cast that drops const is needed
+}
\ No newline at end of file
diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
index 8a4a0a1..25ae7ef 100644
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -67,12 +67,12 @@ namespace RandomX {
 	void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
 		asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
 		asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
-		if ((instr.loca & 192) == 0)
-			asmCode << "\txor " << regMx << ", rax" << std::endl;
 		asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
 		asmCode << "\tjnz short rx_body_" << i << std::endl;
 		asmCode << "\tcall rx_read" << std::endl;
 		asmCode << "rx_body_" << i << ":" << std::endl;
+		if ((instr.loca & 192) == 0)
+			asmCode << "\txor " << regMx << ", rax" << std::endl;
 		if (instr.loca & 3) {
 			asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
 		}
diff --git a/src/Cache.hpp b/src/Cache.hpp
index 4137b97..8a2b93a 100644
--- a/src/Cache.hpp
+++ b/src/Cache.hpp
@@ -46,7 +46,7 @@ namespace RandomX {
 			return keys;
 		}
 
-		const uint8_t* getCache() {
+		const uint8_t* getCache() const { //read-only accessor for 'memory'; const-qualified so it can be called through a const Cache
 			return memory;
 		}
 	private:
diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp
index ef78d2f..28a3cca 100644
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@@ -25,15 +25,16 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 
 namespace RandomX {
 
-	CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) {
+	CompiledVirtualMachine::CompiledVirtualMachine() { //takes no softAes flag any more; only resets totalSize
 		totalSize = 0;
 	}
 
-	void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) {
-		if (lightClient) {
-			throw std::runtime_error("Compiled VM does not support light-client mode");
-		}
-		VirtualMachine::setDataset(ds, lightClient);
+	void CompiledVirtualMachine::setDataset(dataset_t ds) { //stores the dataset handle without any mode check
+		mem.ds = ds; //initializeScratchpad later reads it through mem.ds.dataset
+	}
+
+	void CompiledVirtualMachine::initializeScratchpad(uint32_t index) { //copies ScratchpadSize bytes from the dataset slice selected by 'index'
+		memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); //NOTE(review): offset is in units of the dataset pointer's element type, presumably bytes - confirm, and confirm ScratchpadSize * index cannot overflow
 	}
 
 	void CompiledVirtualMachine::initializeProgram(const void* seed) {
diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp
index a77bdb8..98b0b78 100644
--- a/src/CompiledVirtualMachine.hpp
+++ b/src/CompiledVirtualMachine.hpp
@@ -37,8 +37,9 @@ namespace RandomX {
 		void operator delete(void* ptr) {
 			_mm_free(ptr);
 		}
-		CompiledVirtualMachine(bool softAes);
-		void setDataset(dataset_t ds, bool light = false) override;
+		CompiledVirtualMachine();
+		void setDataset(dataset_t ds) override;
+		void initializeScratchpad(uint32_t index) override;
 		void initializeProgram(const void* seed) override;
 		virtual void execute() override;
 		void* getProgram() {
diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp
index a6a3a0c..54d2279 100644
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@@ -21,11 +21,15 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "InterpretedVirtualMachine.hpp"
 #include "Pcg32.hpp"
 #include "instructions.hpp"
+#include "dataset.hpp"
+#include "Cache.hpp"
+#include "LightClientAsyncWorker.hpp"
 #include <iostream>
 #include <iomanip>
 #include <stdexcept>
 #include <sstream>
 #include <cmath>
+#include <thread>
 #ifdef STATS
 #include <algorithm>
 #endif
@@ -38,6 +42,57 @@ constexpr bool fpuCheck = false;
 
 namespace RandomX {
 
+	InterpretedVirtualMachine::~InterpretedVirtualMachine() { //frees the async worker created by setDataset
+		if (asyncWorker) { //only async mode allocates a worker
+			delete mem.ds.asyncWorker; //NOTE(review): deleted through ILightClientAsyncWorker*, which requires a virtual destructor in that base (declared outside this view) - confirm
+		}
+	}
+
+	void InterpretedVirtualMachine::setDataset(dataset_t ds) { //picks the dataset reader according to the asyncWorker and softAes flags
+		if (asyncWorker) { //async mode: allocate a worker over ds.cache; NOTE(review): a second call would overwrite the pointer without deleting the old worker
+			if (softAes) {
+				mem.ds.asyncWorker = new LightClientAsyncWorker<true>(ds.cache);
+			}
+			else {
+				mem.ds.asyncWorker = new LightClientAsyncWorker<false>(ds.cache);
+			}
+			readDataset = &datasetReadLightAsync;
+		}
+		else { //synchronous mode: keep the handle as passed and use the reader templated on softAes
+			mem.ds = ds;
+			if (softAes) {
+				readDataset = &datasetReadLight<true>;
+			}
+			else {
+				readDataset = &datasetReadLight<false>;
+			}
+		}
+	}
+
+	void InterpretedVirtualMachine::initializeScratchpad(uint32_t index) { //fills the scratchpad with ScratchpadSize / CacheLineSize blocks computed from the cache
+		uint32_t startingBlock = (ScratchpadSize / CacheLineSize) * index; //number of the first block for this scratchpad index
+		if (asyncWorker) {
+			ILightClientAsyncWorker* worker = mem.ds.asyncWorker;
+			const uint32_t blocksPerThread = (ScratchpadSize / CacheLineSize) / 2;
+			worker->prepareBlocks(scratchpad, startingBlock, blocksPerThread);                                       //async first half
+			worker->getBlocks(scratchpad + ScratchpadLength / 2, startingBlock + blocksPerThread, blocksPerThread);  //sync second half
+			worker->sync(); //wait until the worker has finished the first half
+		}
+		else {
+			auto cache = mem.ds.cache;
+			if (softAes) { //the two loops differ only in the initBlock template argument
+				for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) {
+					initBlock<true>(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys());
+				}
+			}
+			else {
+				for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) {
+					initBlock<false>(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys());
+				}
+			}
+		}
+	}
+
 	void InterpretedVirtualMachine::initializeProgram(const void* seed) {
 		Pcg32 gen(seed);
 		for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) {
@@ -50,6 +105,7 @@ namespace RandomX {
 		}
 		//std::cout << reg;
 		p.initialize(gen);
+		currentTransform = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
 		mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7;
 		mem.mx = *(((uint32_t*)seed) + 5);
 		pc = 0;
@@ -74,61 +130,61 @@ namespace RandomX {
 #endif
 	}
 
-	convertible_t InterpretedVirtualMachine::loada(Instruction& inst) {
-		convertible_t& rega = reg.r[inst.rega % RegistersCount];
-		rega.i64 ^= inst.addra; //sign-extend addra
+	convertible_t InterpretedVirtualMachine::loada(Instruction& instr) { //loads operand A; the returned value always comes from the scratchpad
+		convertible_t& rega = reg.r[instr.rega % RegistersCount];
+		rega.i64 ^= instr.addra; //sign-extend addra
 		addr_t addr = rega.u32;
-		switch (inst.loca & 7)
-		{
-			case 0:
-			case 1:
-			case 2:
-			case 3:
-				return readDataset(addr, mem);
 
-			case 4:
-				return scratchpad[addr % ScratchpadL2];
+		if ((ic % 64) == 0) { //when ic is a multiple of 64: transform the address and invoke the dataset reader
+			addr = currentTransform->apply(addr);
+#ifdef STATS
+			datasetAccess[mem.ma / (DatasetBlockCount / 256) / CacheLineSize]++;
+#endif
+			readDataset(addr, mem, reg); //result, if any, is discarded
+		}
 
-			case 5:
-			case 6:
-			case 7:
-				return scratchpad[addr % ScratchpadL1];
+		if ((instr.loca & 192) == 0) { //top two bits of loca clear: mix the (possibly transformed) address into mem.mx
+			mem.mx ^= addr;
+		}
+
+		if (instr.loca & 3) { //low two bits of loca nonzero: index modulo ScratchpadL1, otherwise modulo ScratchpadL2
+			return scratchpad[addr % ScratchpadL1];
+		}
+		else {
+			return scratchpad[addr % ScratchpadL2];
 		}
 	}
 
-	convertible_t InterpretedVirtualMachine::loadbr1(Instruction& inst) {
-		switch (inst.locb & 7)
-		{
-			case 0:
-			case 1:
-			case 2:
-			case 3:
-			case 4:
-			case 5:
-				return reg.r[inst.regb % RegistersCount];
-			case 6:
-			case 7:
-				convertible_t temp;
-				temp.i64 = inst.imm32; //sign-extend imm32
-				return temp;
+	convertible_t InterpretedVirtualMachine::loadbia(Instruction& instr) { //operand B for the ALU_INST handlers
+		if (instr.locb & 3) { //low two bits of locb nonzero: use register regb
+			return reg.r[instr.regb % RegistersCount];
+		}
+		else { //otherwise use the immediate
+			convertible_t temp;
+			temp.i64 = instr.imm32; //sign-extend imm32
+			return temp;
 		}
 	}
 
-	convertible_t InterpretedVirtualMachine::loadbr0(Instruction& inst) {
-		switch (inst.locb & 7)
-		{
-			case 0:
-			case 1:
-			case 2:
-			case 3:
-				return reg.r[inst.regb % RegistersCount];
-			case 4:
-			case 5:
-			case 6:
-			case 7:
-				convertible_t temp;
-				temp.u64 = inst.imm8;
-				return temp;
+	convertible_t InterpretedVirtualMachine::loadbiashift(Instruction& instr) { //operand B for the ALU_INST_SR handlers
+		if (instr.locb & 1) { //lowest bit of locb set: use register regb
+			return reg.r[instr.regb % RegistersCount];
+		}
+		else { //otherwise use the 8-bit immediate
+			convertible_t temp;
+			temp.u64 = instr.imm8;
+			return temp;
+		}
+	}
+
+	convertible_t InterpretedVirtualMachine::loadbiadiv(Instruction& instr) { //operand B for the ALU_INST_DIV handlers (DIV_64, IDIV_64)
+		if (instr.locb & 3) { //selection is the reverse of loadbia: nonzero low bits pick the immediate
+			convertible_t temp;
+			temp.u64 = instr.imm32;
+			return temp;
+		}
+		else { //low two bits of locb clear: use register regb
+			return reg.r[instr.regb % RegistersCount];
 		}
 	}
 
@@ -174,26 +230,6 @@ namespace RandomX {
 		}
 	}
 
-	void InterpretedVirtualMachine::writecflo(Instruction& inst, fpu_reg_t& regc) {
-		addr_t addr;
-		switch (inst.locc & 7)
-		{
-			case 4:
-				addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
-				scratchpad[addr % ScratchpadL2] = regc.lo;
-				break;
-
-			case 5:
-			case 6:
-			case 7:
-				addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
-				scratchpad[addr % ScratchpadL1] = regc.lo;
-
-			default:
-				break;
-		}
-	}
-
 #define ALU_RETIRE(x) x(a, b, c); \
 	if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
 
@@ -242,7 +278,7 @@ namespace RandomX {
 #define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
 	INC_COUNT(x) \
 	convertible_t a = loada(inst); \
-	convertible_t b = loadbr1(inst); \
+	convertible_t b = loadbia(inst); /* register regb or sign-extended imm32 */ \
 	convertible_t& c = getcr(inst); \
 	ALU_RETIRE(x) \
 	}
@@ -250,7 +286,15 @@ namespace RandomX {
 #define ALU_INST_SR(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
 	INC_COUNT(x) \
 	convertible_t a = loada(inst); \
-	convertible_t b = loadbr0(inst); \
+	convertible_t b = loadbiashift(inst); /* register regb or imm8 */ \
+	convertible_t& c = getcr(inst); \
+	ALU_RETIRE(x) \
+	}
+
+#define ALU_INST_DIV(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
+	INC_COUNT(x) \
+	convertible_t a = loada(inst); \
+	convertible_t b = loadbiadiv(inst); /* imm32 or register regb; handler shape is otherwise the same as ALU_INST */ \
 	convertible_t& c = getcr(inst); \
 	ALU_RETIRE(x) \
 	}
@@ -282,8 +326,8 @@ namespace RandomX {
 	ALU_INST(MUL_32)
 	ALU_INST(IMUL_32)
 	ALU_INST(IMULH_64)
-	ALU_INST(DIV_64)
-	ALU_INST(IDIV_64)
+	ALU_INST_DIV(DIV_64)
+	ALU_INST_DIV(IDIV_64)
 	ALU_INST(AND_64)
 	ALU_INST(AND_32)
 	ALU_INST(OR_64)
@@ -301,42 +345,68 @@ namespace RandomX {
 	FPU_INST(FPSUB)
 	FPU_INST(FPMUL)
 	FPU_INST(FPDIV)
-
 	FPU_INST_NB(FPSQRT)
-	FPU_INST_NB(FPROUND)
+
+	void InterpretedVirtualMachine::h_FPROUND(Instruction& inst) { //NOTE(review): hand-written handler has no INC_COUNT(FPROUND) - confirm count_FPROUND is still meant to be updated
+		convertible_t a = loada(inst);
+		convertible_t& c = getcr(inst);
+		c.u64 = a.u64; //operand A is copied to C unchanged
+		if (trace) std::cout << std::hex << a.u64 << std::endl;
+		FPROUND(a, inst.imm8); //called after the copy, with A and imm8 as arguments
+	}
+
+	void InterpretedVirtualMachine::h_JUMP(Instruction& inst) { //conditional forward jump; pushes nothing on the stack
+		convertible_t a = loada(inst);
+		convertible_t& c = getcr(inst);
+		c.u64 = a.u64; //operand A is copied to C whether or not the jump is taken
+		if (trace) std::cout << std::hex << a.u64 << std::endl;
+		if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) {
+#ifdef STATS
+			count_JUMP_taken++;
+			count_jump_taken[inst.locb & 7]++;
+#endif
+			pc += (inst.imm8 & 127) + 1; //advance by 1..128 instructions
+			pc = pc % ProgramLength; //wrap around the end of the program
+		}
+#ifdef STATS
+		else {
+			count_JUMP_not_taken++;
+			count_jump_not_taken[inst.locb & 7]++;
+		}
+#endif
+	}
 
 	void InterpretedVirtualMachine::h_CALL(Instruction& inst) {
 		convertible_t a = loada(inst);
+		convertible_t& c = getcr(inst); //destination is now resolved before the condition is tested
+		c.u64 = a.u64; //operand A is copied to C whether or not the call is taken
+		if (trace) std::cout << std::hex << a.u64 << std::endl;
 		if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) {
 #ifdef STATS
 			count_CALL_taken++;
 			count_jump_taken[inst.locb & 7]++;
 			count_retdepth = std::max(0, count_retdepth - 1);
 #endif
-			stackPush(a);
 			stackPush(pc);
 #ifdef STATS
 			count_max_stack = std::max(count_max_stack, (int)stack.size());
 #endif
 			pc += (inst.imm8 & 127) + 1;
 			pc = pc % ProgramLength;
-			if (trace) std::cout << std::hex << a.u64 << std::endl;
 		}
-		else {
-			convertible_t& c = getcr(inst);
 #ifdef STATS
+		else { //this branch exists only in STATS builds; it just updates counters
 			count_CALL_not_taken++;
 			count_jump_not_taken[inst.locb & 7]++;
-#endif
-			c.u64 = a.u64;
-			if (trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl;
 		}
+#endif
 	}
 
 	void InterpretedVirtualMachine::h_RET(Instruction& inst) {
 		convertible_t a = loada(inst);
-		convertible_t b = loadbr1(inst);
 		convertible_t& c = getcr(inst);
+		c.u64 = a.u64;
+		if (trace) std::cout << std::hex << a.u64 << std::endl;
 		if (stack.size() > 0) {
 #ifdef STATS
 			count_RET_taken++;
@@ -344,22 +414,13 @@ namespace RandomX {
 			count_retdepth_max = std::max(count_retdepth_max, count_retdepth);
 #endif
 			auto raddr = stackPopAddress();
-			auto retval = stackPopValue();
-			c.u64 = a.u64 ^ retval.u64;
 			pc = raddr;
 		}
-		else {
 #ifdef STATS
-			if (stack.size() == 0)
-				count_RET_stack_empty++;
-			else {
-				count_RET_not_taken++;
-				count_jump_not_taken[inst.locb & 7]++;
-			}
-#endif
-			c.u64 = a.u64;
+		else {
+			count_RET_stack_empty++;
 		}
-		if (trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl;
+#endif
 	}
 
 #include "instructionWeights.hpp"
@@ -394,6 +455,7 @@ namespace RandomX {
 		INST_HANDLE(FPDIV)
 		INST_HANDLE(FPSQRT)
 		INST_HANDLE(FPROUND)
+		INST_HANDLE(JUMP)
 		INST_HANDLE(CALL)
 		INST_HANDLE(RET)
 	};
diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp
index 8c34936..7745cad 100644
--- a/src/InterpretedVirtualMachine.hpp
+++ b/src/InterpretedVirtualMachine.hpp
@@ -25,23 +25,37 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 
 namespace RandomX {
 
+	class ITransform { //interface of an address transformation; NOTE(review): no virtual destructor, so instances must not be deleted through ITransform*
+	public:
+		virtual int32_t apply(int32_t) const = 0; //returns the transformed value
+		virtual const char* getName() const = 0; //identifier of the transform
+		virtual std::ostream& printAsm(std::ostream&) const = 0; //writes assembly text for the transform to the stream
+		virtual std::ostream& printCxx(std::ostream&) const = 0; //writes a C++ definition of the transform to the stream
+	};
+
 	class InterpretedVirtualMachine;
 
 	typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&);
 
 	class InterpretedVirtualMachine : public VirtualMachine {
 	public:
-		InterpretedVirtualMachine(bool softAes) : VirtualMachine(softAes) {}
-		virtual void initializeProgram(const void* seed) override;
-		virtual void execute() override;
+		InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {}
+		~InterpretedVirtualMachine();
+		void setDataset(dataset_t ds) override;
+		void initializeScratchpad(uint32_t index) override;
+		void initializeProgram(const void* seed) override;
+		void execute() override;
 		const Program& getProgam() {
 			return p;
 		}
 	private:
 		static InstructionHandler engine[256];
+		static const ITransform* addressTransformations[TransformationCount];
+		bool softAes, asyncWorker;
 		Program p;
 		std::vector<convertible_t> stack;
 		uint64_t pc, ic;
+		const ITransform* currentTransform;
 #ifdef STATS
 		int count_ADD_64 = 0;
 		int count_ADD_32 = 0;
@@ -71,11 +85,12 @@ namespace RandomX {
 		int count_FPDIV = 0;
 		int count_FPSQRT = 0;
 		int count_FPROUND = 0;
+		int count_JUMP_taken = 0;
+		int count_JUMP_not_taken = 0;
 		int count_CALL_taken = 0;
 		int count_CALL_not_taken = 0;
 		int count_RET_stack_empty = 0;
 		int count_RET_taken = 0;
-		int count_RET_not_taken = 0;
 		int count_jump_taken[8] = { 0 };
 		int count_jump_not_taken[8] = { 0 };
 		int count_max_stack = 0;
@@ -89,14 +104,15 @@ namespace RandomX {
 		int count_FPSUB_nop2 = 0;
 		int count_FPMUL_nop = 0;
 		int count_FPMUL_nop2 = 0;
+		int datasetAccess[256] = { 0 };
 #endif
 
 		convertible_t loada(Instruction&);
-		convertible_t loadbr0(Instruction&);
-		convertible_t loadbr1(Instruction&);
+		convertible_t loadbiashift(Instruction&);
+		convertible_t loadbiadiv(Instruction&);
+		convertible_t loadbia(Instruction&);
 		convertible_t& getcr(Instruction&);
 		void writecf(Instruction&, fpu_reg_t&);
-		void writecflo(Instruction&, fpu_reg_t&);
 
 		void stackPush(convertible_t& c) {
 			stack.push_back(c);
@@ -148,6 +164,7 @@ namespace RandomX {
 		void h_FPDIV(Instruction&);
 		void h_FPSQRT(Instruction&);
 		void h_FPROUND(Instruction&);
+		void h_JUMP(Instruction&);
 		void h_CALL(Instruction&);
 		void h_RET(Instruction&);
 	};
diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp
index f76ab74..b41f7b5 100644
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -170,13 +170,13 @@ namespace RandomX {
 		emit(instr.addra);
 		emit(uint16_t(0x8b41)); //mov
 		emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
+		emit(0x753fc3f6); //test bl,0x3f; jne
+		emit(uint16_t(0xe805));
+		emit(readDatasetOffset - (codePos + 4));
 		if ((instr.loca & 192) == 0) { //A.LOC.X
 			emit(uint16_t(0x3348));
 			emitByte(0xe8); //xor rbp, rax
 		}
-		emit(0x753fc3f6); //test bl,0x3f; jne
-		emit(uint16_t(0xe805));
-		emit(readDatasetOffset - (codePos + 4));
 		emitByte(0x25); //and eax,
 		if (instr.loca & 3) {
 			emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp
new file mode 100644
index 0000000..c069f3f
--- /dev/null
+++ b/src/LightClientAsyncWorker.cpp
@@ -0,0 +1,94 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include "LightClientAsyncWorker.hpp"
+#include "dataset.hpp"
+#include "Cache.hpp"
+
+namespace RandomX {
+
+	template<bool softAes>
+	LightClientAsyncWorker<softAes>::LightClientAsyncWorker(const Cache* c) : ILightClientAsyncWorker(c), output(nullptr), startBlock(0), blockCount(0), hasWork(false), workerThread(&LightClientAsyncWorker::runWorker, this) {
+		//startBlock/blockCount are zeroed because getBlock() compares startBlock before any job has been queued
+	}
+
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::prepareBlock(addr_t addr) { //queues one block for the worker thread; the result is written to currentLine
+		{
+			std::lock_guard<std::mutex> lk(mutex); //job fields are published under the mutex
+			startBlock = addr / CacheLineSize;
+			blockCount = 1;
+			output = currentLine.data();
+			hasWork = true;
+		}
+		notifier.notify_all(); //wake the worker after the lock has been released
+	}
+
+	template<bool softAes>
+	const uint64_t* LightClientAsyncWorker<softAes>::getBlock(addr_t addr) { //returns the line for 'addr', reusing the prefetched one on a hit
+		uint32_t currentBlock = addr / CacheLineSize;
+		//wait in every case: a queued prepareBlock() job may still be pending or running on currentLine,
+		sync(); //and overwriting the line below without waiting would race with the worker
+		if (currentBlock != startBlock || output != currentLine.data()) {
+			//miss: the last queued job was for another block or another buffer, so compute the line here
+			initBlock<softAes>(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock, cache->getKeys());
+		}
+		return currentLine.data();
+	}
+
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { //queues a run of blocks for the worker thread, written to 'out'
+		{
+			std::lock_guard<std::mutex> lk(mutex); //job fields are published under the mutex
+			this->startBlock = startBlock; //the parameters shadow the members, so the members must be named explicitly
+			this->blockCount = blockCount;
+			output = out;
+			hasWork = true;
+		}
+		notifier.notify_all(); //wake the worker after the lock has been released
+	}
+
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { //computes blockCount blocks on the calling thread; uses the parameters, not the members of the same name
+		for (uint32_t i = 0; i < blockCount; ++i) {
+			initBlock<softAes>(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i, cache->getKeys()); //block i is written at byte offset CacheLineSize * i of 'out'
+		}
+	}
+
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::sync() { //blocks the caller until no job is pending
+		std::unique_lock<std::mutex> lk(mutex);
+		notifier.wait(lk, [this] { return !hasWork; }); //runWorker clears hasWork and notifies when a job is done
+	}
+
+	template<bool softAes>
+	void LightClientAsyncWorker<softAes>::runWorker() { //body of workerThread: wait for a job, run it, signal completion
+		for (;;) { //NOTE(review): no exit path, and the class declares no destructor that stops or joins workerThread - confirm shutdown handling
+			std::unique_lock<std::mutex> lk(mutex);
+			notifier.wait(lk, [this] { return hasWork; });
+			getBlocks(output, startBlock, blockCount); //runs with the mutex held, so sync() cannot return before it finishes
+			hasWork = false;
+			lk.unlock();
+			notifier.notify_all(); //wakes any thread waiting in sync()
+		}
+	}
+
+	template class LightClientAsyncWorker<true>; //explicit instantiations: the member definitions live in this file, so both variants are generated here
+	template class LightClientAsyncWorker<false>;
+}
\ No newline at end of file
diff --git a/src/LightClientAsyncWorker.hpp b/src/LightClientAsyncWorker.hpp
new file mode 100644
index 0000000..7596fd5
--- /dev/null
+++ b/src/LightClientAsyncWorker.hpp
@@ -0,0 +1,52 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include "common.hpp"
+
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <array>
+
+namespace RandomX {
+
+	class Cache;
+
+	using DatasetLine = std::array<uint64_t, CacheLineSize / sizeof(uint64_t)>;
+
+	template<bool softAes>
+	class LightClientAsyncWorker : public ILightClientAsyncWorker { //computes dataset blocks from the cache, optionally on a background thread
+	public:
+		LightClientAsyncWorker(const Cache*);
+		void prepareBlock(addr_t) final; //queue one block for the worker thread
+		void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; //queue a run of blocks to be written to 'out'
+		const uint64_t* getBlock(addr_t) final; //get one block, using the queued result when it matches
+		void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; //compute blocks on the calling thread
+		void sync() final; //wait until the queued job has finished
+	private:
+		void runWorker(); //entry point of workerThread
+		std::condition_variable notifier;
+		std::mutex mutex; //guards the job fields below
+		DatasetLine currentLine; //buffer for single-block requests
+		void* output; //destination of the queued job
+		uint32_t startBlock, blockCount; //range of the queued job
+		bool hasWork; //true while a job is queued or running
+		std::thread workerThread; //declared last so it starts after the other members are constructed
+	};
+}
\ No newline at end of file
diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp
index 6e8cfad..7a2be48 100644
--- a/src/VirtualMachine.cpp
+++ b/src/VirtualMachine.cpp
@@ -19,8 +19,6 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 
 #include "VirtualMachine.hpp"
 #include "common.hpp"
-#include "dataset.hpp"
-#include "Cache.hpp"
 #include "t1ha/t1ha.h"
 #include "blake2/blake2.h"
 #include <cstring>
@@ -37,62 +35,10 @@ std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
 
 namespace RandomX {
 
-	VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) {
+	VirtualMachine::VirtualMachine() : readDataset(nullptr) { //start with a null read function instead of an indeterminate pointer
 		mem.ds.dataset = nullptr;
 	}
 
-	VirtualMachine::~VirtualMachine() {
-		if (lightClient) {
-			delete mem.ds.lightDataset->block;
-			delete mem.ds.lightDataset;
-		}
-	}
-
-	void VirtualMachine::setDataset(dataset_t ds, bool light) {
-		if (mem.ds.dataset != nullptr) {
-			throw std::runtime_error("Dataset is already initialized");
-		}
-		lightClient = light;
-		if (light) {
-			auto lds = mem.ds.lightDataset = new LightClientDataset();
-			lds->cache = ds.cache;
-			//lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i));
-			lds->blockNumber = -1;
-			if (lds->block == nullptr) {
-				throw std::bad_alloc();
-			}
-			if (softAes) {
-				readDataset = &datasetReadLight<true>;
-			}
-			else {
-				readDataset = &datasetReadLight<false>;
-			}
-		}
-		else {
-			mem.ds = ds;
-			readDataset = &datasetRead;
-		}
-	}
-
-	void VirtualMachine::initializeScratchpad(uint32_t index) {
-		if (lightClient) {
-			auto cache = mem.ds.lightDataset->cache;
-			if (softAes) {
-				for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) {
-					initBlock<true>(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys());
-				}
-			}
-			else {
-				for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) {
-					initBlock<false>(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys());
-				}
-			}
-		}
-		else {
-			memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize);
-		}
-	}
-
 	void VirtualMachine::getResult(void* out) {
 		constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 2;
 		uint64_t smallState[smallStateLength];
diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp
index bbcfec3..78f7cf6 100644
--- a/src/VirtualMachine.hpp
+++ b/src/VirtualMachine.hpp
@@ -25,10 +25,10 @@ namespace RandomX {
 
 	class VirtualMachine {
 	public:
-		VirtualMachine(bool softAes);
-		virtual ~VirtualMachine();
-		virtual void setDataset(dataset_t ds, bool light = false);
-		void initializeScratchpad(uint32_t index);
+		VirtualMachine();
+		virtual ~VirtualMachine() {}
+		virtual void setDataset(dataset_t ds) = 0;
+		virtual void initializeScratchpad(uint32_t index) = 0;
 		virtual void initializeProgram(const void* seed) = 0;
 		virtual void execute() = 0;
 		void getResult(void*);
@@ -36,7 +36,6 @@ namespace RandomX {
 			return reg;
 		}
 	protected:
-		bool softAes, lightClient;
 		DatasetReadFunc readDataset;
 		alignas(16) RegisterFile reg;
 		MemoryRegisters mem;
diff --git a/src/common.hpp b/src/common.hpp
index 3831175..62fae70 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -38,7 +38,7 @@ namespace RandomX {
 	constexpr int CacheLineSize = 64;
 	constexpr int BlockExpansionRatio = 64;
 	constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount;
-	constexpr int DatasetIterations = 64;
+	constexpr int DatasetIterations = 16;
 	constexpr uint32_t CacheSize = CacheBlockCount * CacheLineSize;
 	constexpr uint64_t DatasetSize = (uint64_t)CacheSize * BlockExpansionRatio;
 
@@ -86,16 +86,31 @@ namespace RandomX {
 		return i % RandomX::ProgramLength;
 	}
 
-	struct LightClientDataset {
-		Cache* cache;
-		uint8_t* block;
-		uint32_t blockNumber;
+	//Abstract source of dataset lines for light-client mode; stored in dataset_t::asyncWorker.
+	//datasetReadLightAsync() consumes a line with getBlock() and then announces the next
+	//address with prepareBlock().
+	class ILightClientAsyncWorker {
+	public:
+		//Implementations are handed around as ILightClientAsyncWorker* (see dataset_t),
+		//so destroying one through the base pointer must be well defined.
+		virtual ~ILightClientAsyncWorker() {}
+		virtual void prepareBlock(addr_t) = 0;
+		virtual void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0;
+		virtual const uint64_t* getBlock(addr_t) = 0;
+		virtual void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0;
+		virtual void sync() = 0;
+		const Cache* getCache() const {
+			return cache;
+		}
+	protected:
+		ILightClientAsyncWorker(const Cache* c) : cache(c) {}
+		const Cache* cache; //not owned: only the pointer is stored
 	};
 
 	union dataset_t {
 		uint8_t* dataset;
 		Cache* cache;
-		LightClientDataset* lightDataset;
+		ILightClientAsyncWorker* asyncWorker;
 	};
 
 	struct MemoryRegisters {
@@ -112,7 +127,7 @@ namespace RandomX {
 
 	static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
 
-	typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&);
+	typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&); //the read result is applied to the register file instead of being returned
 
 	typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*);
 
diff --git a/src/dataset.cpp b/src/dataset.cpp
index d9c7b3f..ae31963 100644
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@@ -30,7 +30,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 
 #if defined(__SSE2__)
 #include <wmmintrin.h>
-#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_T0)
+#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA) //prefetch the line at ds.dataset + ma with the non-temporal hint; expands to nothing without SSE2
 #else
 #define PREFETCH(memory)
 #endif
@@ -106,32 +106,44 @@ namespace RandomX {
 	template
 		void initBlock<false>(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&);
 
-	convertible_t datasetRead(addr_t addr, MemoryRegisters& memory) {
-		convertible_t data;
-		data.u64 = *(uint64_t*)(memory.ds.dataset + memory.ma);
-		memory.ma += 8;
+	void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { //XORs the dataset line at memory.ma into reg.r and selects the next line
+		uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset + memory.ma); //line to consume; taken before ma is replaced below
 		memory.mx ^= addr;
-		if ((memory.mx & 0xFFF8) == 0) {
-			memory.ma = memory.mx & ~7;
-			PREFETCH(memory);
-		}
-		return data;
+		memory.mx &= -64; //align to cache line
+		std::swap(memory.mx, memory.ma); //ma now holds the freshly aligned address, mx the address that was just read
+		PREFETCH(memory); //prefetch the line at the new memory.ma
+		for (int i = 0; i < RegistersCount; ++i) //NOTE(review): assumes RegistersCount 64-bit words fit in one dataset line
+			reg.r[i].u64 ^= datasetLine[i];
 	}
 
 	template<bool softAes>
-	convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory) {
-		convertible_t data;
-		LightClientDataset* lds = memory.ds.lightDataset;
-		auto blockNumber = memory.ma / CacheLineSize;
-
-		return data;
+	void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { //same register/address update as datasetRead, but the line is generated from the cache on demand
+		Cache* cache = memory.ds.cache;
+		uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; //scratch storage for one line; NOTE(review): only uint64_t-aligned, confirm initBlock needs no stricter alignment
+		initBlock<softAes>(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); //block index = memory.ma / CacheLineSize
+		for (int i = 0; i < RegistersCount; ++i) //NOTE(review): assumes RegistersCount 64-bit words fit in one dataset line
+			reg.r[i].u64 ^= datasetLine[i];
+		memory.mx ^= addr;
+		memory.mx &= -64; //align to cache line
+		std::swap(memory.mx, memory.ma); //ma now holds the freshly aligned address, mx the address that was just read
 	}
 
 	template
-		convertible_t datasetReadLight<false>(addr_t addr, MemoryRegisters& memory);
+		void datasetReadLight<false>(addr_t addr, MemoryRegisters& memory, RegisterFile& reg);
 
 	template
-		convertible_t datasetReadLight<true>(addr_t addr, MemoryRegisters& memory);
+		void datasetReadLight<true>(addr_t addr, MemoryRegisters& memory, RegisterFile& reg);
+
+	void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { //same register/address update as datasetReadLight, with the line obtained from memory.ds.asyncWorker
+		ILightClientAsyncWorker* aw = memory.ds.asyncWorker;
+		const uint64_t* datasetLine = aw->getBlock(memory.ma); //line for the current memory.ma
+		for (int i = 0; i < RegistersCount; ++i) //NOTE(review): assumes getBlock() returns at least RegistersCount 64-bit words
+			reg.r[i].u64 ^= datasetLine[i];
+		memory.mx ^= addr;
+		memory.mx &= -64; //align to cache line
+		std::swap(memory.mx, memory.ma); //ma now holds the freshly aligned address, mx the address that was just read
+		aw->prepareBlock(memory.ma); //announce the next address to the worker (presumably so it can be generated ahead of the next read)
+	}
 
 	void datasetAlloc(dataset_t& ds, bool largePages) {
 		if (sizeof(size_t) <= 4)
diff --git a/src/dataset.hpp b/src/dataset.hpp
index 5f9836c..0103271 100644
--- a/src/dataset.hpp
+++ b/src/dataset.hpp
@@ -40,12 +40,14 @@ namespace RandomX {
 	template<bool softAes>
 	void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount);
 
-	convertible_t datasetRead(addr_t addr, MemoryRegisters& memory);
+	void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); //XORs the dataset line at memory.ma into reg
 
 	template<bool softAes>
 	void datasetInitCache(const void* seed, dataset_t& dataset);
 
 	template<bool softAes>
-	convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory);
+	void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); //same, with the line generated from the cache
+
+	void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); //same, with the line supplied by memory.ds.asyncWorker
 }
 
diff --git a/src/instructions.hpp b/src/instructions.hpp
index 2321be6..dc5d4ee 100644
--- a/src/instructions.hpp
+++ b/src/instructions.hpp
@@ -22,12 +22,6 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 
 namespace RandomX {
 
-	//Clears the 11 least-significant bits before conversion. This is done so the number
-	//fits exactly into the 52-bit mantissa without rounding.
-	inline double convertSigned52(int64_t x) {
-		return (double)(x & -2048L);
-	}
-
 	extern "C" {
 		void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c);
 		void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c);
@@ -53,11 +47,11 @@ namespace RandomX {
 		void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
 		bool JMP_COND(uint8_t, convertible_t&, int32_t);
 		void FPINIT();
+		void FPROUND(convertible_t, uint8_t);
 		void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
 		void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
 		void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
 		void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
 		void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
-		void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
 	}
 }
\ No newline at end of file
diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp
index 790506b..78bdb6f 100644
--- a/src/instructionsPortable.cpp
+++ b/src/instructionsPortable.cpp
@@ -370,9 +370,9 @@ namespace RandomX {
 #endif
 		}
 
-		void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
-			c.lo.f64 = convertSigned52(a.i64);
-			switch (a.u64 & 3) {
+		void FPROUND(convertible_t a, uint8_t rot) {
+			uint64_t flag = ror64(a.u64, rot);
+			switch (flag & 3) {
 				case RoundDown:
 #ifdef DEBUG
 					std::cout << "Round FE_DOWNWARD (" << FE_DOWNWARD << ") = " <<
diff --git a/src/main.cpp b/src/main.cpp
index 6366821..3295500 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -162,7 +162,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
 }
 
 int main(int argc, char** argv) {
-	bool softAes, lightClient, genAsm, compiled, help, largePages;
+	bool softAes, lightClient, genAsm, compiled, help, largePages, async;
 	int programCount, threadCount;
 	readOption("--help", argc, argv, help);
 
@@ -178,6 +178,7 @@ int main(int argc, char** argv) {
 	readIntOption("--threads", argc, argv, threadCount, 1);
 	readIntOption("--nonces", argc, argv, programCount, 1000);
 	readOption("--largePages", argc, argv, largePages);
+	readOption("--async", argc, argv, async);
 
 	if (genAsm) {
 		generateAsm(programCount);
@@ -250,12 +251,12 @@ int main(int argc, char** argv) {
 		for (int i = 0; i < threadCount; ++i) {
 			RandomX::VirtualMachine* vm;
 			if (compiled) {
-				vm = new RandomX::CompiledVirtualMachine(softAes);
+				vm = new RandomX::CompiledVirtualMachine();
 			}
 			else {
-				vm = new RandomX::InterpretedVirtualMachine(softAes);
+				vm = new RandomX::InterpretedVirtualMachine(softAes, async);
 			}
-			vm->setDataset(dataset, lightClient);
+			vm->setDataset(dataset);
 			vms.push_back(vm);
 		}
 		std::cout << "Running benchmark (" << programCount << " programs) ..." << std::endl;
@@ -278,7 +279,12 @@ int main(int argc, char** argv) {
 		result.print(std::cout);
 		if(programCount == 1000)
 		std::cout << "Reference result:  3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
-		std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
+		if (lightClient) {
+			std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per program" << std::endl;
+		}
+		else {
+			std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
+		}
 	}
 	catch (std::exception& e) {
 		std::cout << "ERROR: " << e.what() << std::endl;