feat(crypto): RandomX PoW hash via CGo bridge

Vendor RandomX source, add bridge_randomx_hash() with static
VM lifecycle. Key: LetheanRandomXv1. Input: header_hash || nonce.

Co-Authored-By: Charon <charon@lethean.io>
This commit is contained in:
Claude 2026-02-21 01:01:23 +00:00
parent 7abac5e011
commit 4fe3fdfbd2
No known key found for this signature in database
GPG key ID: AF404715446AEB41
102 changed files with 19678 additions and 2 deletions

View file

@ -11,6 +11,8 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
include(CheckCCompilerFlag)
# Include paths: upstream sources + compat stubs
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/upstream
@ -43,6 +45,65 @@ set(CXX_SOURCES
bridge.cpp
)
# --- RandomX PoW library ---
set(RANDOMX_SOURCES
randomx/aes_hash.cpp
randomx/argon2_ref.c
randomx/argon2_ssse3.c
randomx/argon2_avx2.c
randomx/bytecode_machine.cpp
randomx/cpu.cpp
randomx/dataset.cpp
randomx/soft_aes.cpp
randomx/virtual_memory.c
randomx/vm_interpreted.cpp
randomx/allocator.cpp
randomx/assembly_generator_x86.cpp
randomx/instruction.cpp
randomx/randomx.cpp
randomx/superscalar.cpp
randomx/vm_compiled.cpp
randomx/vm_interpreted_light.cpp
randomx/argon2_core.c
randomx/blake2_generator.cpp
randomx/instructions_portable.cpp
randomx/reciprocal.c
randomx/virtual_machine.cpp
randomx/vm_compiled_light.cpp
randomx/blake2/blake2b.c
randomx/jit_compiler_x86.cpp
randomx/jit_compiler_x86_static.S
)
add_library(randomx STATIC ${RANDOMX_SOURCES})
target_include_directories(randomx PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/randomx
)
set_property(TARGET randomx PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET randomx PROPERTY CXX_STANDARD 11)
set_property(TARGET randomx PROPERTY CXX_STANDARD_REQUIRED ON)
# Platform-specific flags for RandomX
enable_language(ASM)
target_compile_options(randomx PRIVATE -maes)
check_c_compiler_flag(-mssse3 HAVE_SSSE3)
if(HAVE_SSSE3)
set_source_files_properties(randomx/argon2_ssse3.c PROPERTIES COMPILE_FLAGS -mssse3)
endif()
check_c_compiler_flag(-mavx2 HAVE_AVX2)
if(HAVE_AVX2)
set_source_files_properties(randomx/argon2_avx2.c PROPERTIES COMPILE_FLAGS -mavx2)
endif()
target_compile_options(randomx PRIVATE
-Wno-unused-variable
-Wno-unused-function
-Wno-sign-compare
-Wno-unused-parameter
-Wno-implicit-fallthrough
)
# --- Find system dependencies ---
find_package(OpenSSL REQUIRED)
find_package(Boost REQUIRED)
@ -53,12 +114,14 @@ add_library(cryptonote STATIC ${C_SOURCES} ${CXX_SOURCES})
target_include_directories(cryptonote PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/upstream
${CMAKE_CURRENT_SOURCE_DIR}/compat
${CMAKE_CURRENT_SOURCE_DIR}/randomx
${OPENSSL_INCLUDE_DIR}
${Boost_INCLUDE_DIRS}
)
target_link_libraries(cryptonote PRIVATE
OpenSSL::Crypto
randomx
)
# Suppress warnings from upstream code (we don't modify it)

View file

@ -11,6 +11,7 @@
#include "crypto-ops.h"
#include "clsag.h"
#include "hash-ops.h"
#include "randomx.h"
extern "C" {
@ -417,4 +418,33 @@ int cn_zarcanum_verify(const uint8_t /*hash*/[32], const uint8_t * /*proof*/,
return -1; // not implemented
}
// ── RandomX PoW Hashing ──────────────────────────────────
int bridge_randomx_hash(const uint8_t* key, size_t key_size,
const uint8_t* input, size_t input_size,
uint8_t* output) {
// Static RandomX state — initialised on first call.
// Thread safety: not thread-safe; Go wrapper must serialise calls.
static randomx_cache* rx_cache = nullptr;
static randomx_vm* rx_vm = nullptr;
if (rx_cache == nullptr) {
randomx_flags flags = randomx_get_flags();
// Use light mode (no dataset) for verification.
flags = (randomx_flags)(flags | RANDOMX_FLAG_DEFAULT);
rx_cache = randomx_alloc_cache(flags);
if (rx_cache == nullptr) return -1;
randomx_init_cache(rx_cache, key, key_size);
rx_vm = randomx_create_vm(flags, rx_cache, nullptr);
if (rx_vm == nullptr) {
randomx_release_cache(rx_cache);
rx_cache = nullptr;
return -1;
}
}
randomx_calculate_hash(rx_vm, input, input_size, output);
return 0;
}
} // extern "C"

View file

@ -105,6 +105,15 @@ int cn_bge_verify(const uint8_t context[32], const uint8_t *ring,
int cn_zarcanum_verify(const uint8_t hash[32], const uint8_t *proof,
size_t proof_len);
// ── RandomX PoW Hashing ──────────────────────────────────
// key/key_size: RandomX cache key (e.g. "LetheanRandomXv1")
// input/input_size: block header hash (32 bytes) + nonce (8 bytes LE)
// output: 32-byte hash result
// Returns 0 on success.
int bridge_randomx_hash(const uint8_t* key, size_t key_size,
const uint8_t* input, size_t input_size,
uint8_t* output);
#ifdef __cplusplus
}
#endif

View file

@ -3,8 +3,8 @@
package crypto
/*
#cgo CPPFLAGS: -I${SRCDIR}/upstream -I${SRCDIR}/compat
#cgo LDFLAGS: -L${SRCDIR}/build -lcryptonote -lstdc++ -lssl -lcrypto
#cgo CPPFLAGS: -I${SRCDIR}/upstream -I${SRCDIR}/compat -I${SRCDIR}/randomx
#cgo LDFLAGS: -L${SRCDIR}/build -lcryptonote -lrandomx -lstdc++ -lssl -lcrypto -lpthread
#include "bridge.h"
*/
import "C"

29
crypto/pow.go Normal file
View file

@ -0,0 +1,29 @@
// Copyright (c) 2017-2026 Lethean (https://lt.hn)
//
// Licensed under the European Union Public Licence (EUPL) version 1.2.
// SPDX-Licence-Identifier: EUPL-1.2
package crypto
// #include "bridge.h"
import "C"
import (
"fmt"
"unsafe"
)
// RandomXHash computes the RandomX PoW hash. The key is the cache
// initialisation key (e.g. "LetheanRandomXv1"). Input is typically
// the block header hash (32 bytes) concatenated with the nonce (8 bytes LE).
func RandomXHash(key, input []byte) ([32]byte, error) {
var output [32]byte
ret := C.bridge_randomx_hash(
(*C.uint8_t)(unsafe.Pointer(&key[0])), C.size_t(len(key)),
(*C.uint8_t)(unsafe.Pointer(&input[0])), C.size_t(len(input)),
(*C.uint8_t)(unsafe.Pointer(&output[0])),
)
if ret != 0 {
return output, fmt.Errorf("crypto: RandomX hash failed with code %d", ret)
}
return output, nil
}

40
crypto/pow_test.go Normal file
View file

@ -0,0 +1,40 @@
// Copyright (c) 2017-2026 Lethean (https://lt.hn)
//
// Licensed under the European Union Public Licence (EUPL) version 1.2.
// SPDX-Licence-Identifier: EUPL-1.2
package crypto
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestRandomXHash_Good(t *testing.T) {
key := []byte("LetheanRandomXv1")
input := make([]byte, 40) // 32-byte hash + 8-byte nonce
hash, err := RandomXHash(key, input)
require.NoError(t, err)
assert.NotEqual(t, [32]byte{}, hash, "hash should be non-zero")
// Determinism: same input must produce the same output.
hash2, err := RandomXHash(key, input)
require.NoError(t, err)
assert.Equal(t, hash, hash2, "hash must be deterministic")
}
func TestRandomXHash_Bad(t *testing.T) {
key := []byte("LetheanRandomXv1")
input1 := make([]byte, 40)
input2 := make([]byte, 40)
input2[0] = 1
hash1, err := RandomXHash(key, input1)
require.NoError(t, err)
hash2, err := RandomXHash(key, input2)
require.NoError(t, err)
assert.NotEqual(t, hash1, hash2, "different inputs must produce different hashes")
}

379
crypto/randomx/aes_hash.cpp Executable file
View file

@ -0,0 +1,379 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "soft_aes.h"
#include "cpu.hpp"
#include <cassert>
#ifdef __riscv
#include "aes_hash_rv64_zvkned.hpp"
#include "aes_hash_rv64_vector.hpp"
#endif
//NOTE: The functions below were tuned for maximum performance
//and are not cryptographically secure outside of the scope of RandomX.
//It's not recommended to use them as general hash functions and PRNGs.
//AesHash1R:
//state0, state1, state2, state3 = Blake2b-512("RandomX AesHash1R state")
//xkey0, xkey1 = Blake2b-256("RandomX AesHash1R xkeys")
#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
#define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
#define AES_HASH_1R_STATE2 0xe8a07ce4, 0x5079506b, 0xae62c7d0, 0x6a770017
#define AES_HASH_1R_STATE3 0x7e994948, 0x79a10005, 0x07ad828d, 0x630a240c
#define AES_HASH_1R_XKEY0 0x06890201, 0x90dc56bf, 0x8b24949f, 0xf6fa8389
#define AES_HASH_1R_XKEY1 0xed18f99b, 0xee1043c6, 0x51f4e03c, 0x61b263d1
/*
Calculate a 512-bit hash of 'input' using 4 lanes of AES.
The input is treated as a set of round keys for the encryption
of the initial state.
'inputSize' must be a multiple of 64.
For a 2 MiB input, this has the same security as 32768-round
AES encryption.
Hashing throughput: >20 GiB/s per CPU core with hardware AES
*/
template<bool softAes>
void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
assert(inputSize % 64 == 0);
#ifdef __riscv
if (randomx::cpu.hasAes()) {
hashAes1Rx4_zvkned(input, inputSize, hash);
return;
}
if (randomx::cpu.hasRVV() && (randomx::cpu.getRVV_Length() >= 256)) {
hashAes1Rx4_RVV(input, inputSize, hash);
return;
}
#endif
const uint8_t* inptr = (uint8_t*)input;
const uint8_t* inputEnd = inptr + inputSize;
rx_vec_i128 state0, state1, state2, state3;
rx_vec_i128 in0, in1, in2, in3;
//initial state
state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0);
state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1);
state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2);
state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3);
//process 64 bytes at a time in 4 lanes
while (inptr < inputEnd) {
in0 = rx_load_vec_i128((rx_vec_i128*)inptr + 0);
in1 = rx_load_vec_i128((rx_vec_i128*)inptr + 1);
in2 = rx_load_vec_i128((rx_vec_i128*)inptr + 2);
in3 = rx_load_vec_i128((rx_vec_i128*)inptr + 3);
state0 = aesenc<softAes>(state0, in0);
state1 = aesdec<softAes>(state1, in1);
state2 = aesenc<softAes>(state2, in2);
state3 = aesdec<softAes>(state3, in3);
inptr += 64;
}
//two extra rounds to achieve full diffusion
rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0);
rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1);
state0 = aesenc<softAes>(state0, xkey0);
state1 = aesdec<softAes>(state1, xkey0);
state2 = aesenc<softAes>(state2, xkey0);
state3 = aesdec<softAes>(state3, xkey0);
state0 = aesenc<softAes>(state0, xkey1);
state1 = aesdec<softAes>(state1, xkey1);
state2 = aesenc<softAes>(state2, xkey1);
state3 = aesdec<softAes>(state3, xkey1);
//output hash
rx_store_vec_i128((rx_vec_i128*)hash + 0, state0);
rx_store_vec_i128((rx_vec_i128*)hash + 1, state1);
rx_store_vec_i128((rx_vec_i128*)hash + 2, state2);
rx_store_vec_i128((rx_vec_i128*)hash + 3, state3);
}
template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash);
template void hashAes1Rx4<true>(const void *input, size_t inputSize, void *hash);
//AesGenerator1R:
//key0, key1, key2, key3 = Blake2b-512("RandomX AesGenerator1R keys")
#define AES_GEN_1R_KEY0 0xb4f44917, 0xdbb5552b, 0x62716609, 0x6daca553
#define AES_GEN_1R_KEY1 0x0da1dc4e, 0x1725d378, 0x846a710d, 0x6d7caf07
#define AES_GEN_1R_KEY2 0x3e20e345, 0xf4c0794f, 0x9f947ec6, 0x3f1262f1
#define AES_GEN_1R_KEY3 0x49169154, 0x16314c88, 0xb1ba317c, 0x6aef8135
/*
Fill 'buffer' with pseudorandom data based on 512-bit 'state'.
The state is encrypted using a single AES round per 16 bytes of output
in 4 lanes.
'outputSize' must be a multiple of 64.
The modified state is written back to 'state' to allow multiple
calls to this function.
*/
template<bool softAes>
void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
assert(outputSize % 64 == 0);
#ifdef __riscv
if (randomx::cpu.hasAes()) {
fillAes1Rx4_zvkned(state, outputSize, buffer);
return;
}
if (randomx::cpu.hasRVV() && (randomx::cpu.getRVV_Length() >= 256)) {
fillAes1Rx4_RVV(state, outputSize, buffer);
return;
}
#endif
const uint8_t* outptr = (uint8_t*)buffer;
const uint8_t* outputEnd = outptr + outputSize;
rx_vec_i128 state0, state1, state2, state3;
rx_vec_i128 key0, key1, key2, key3;
key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0);
key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1);
key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2);
key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3);
state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
state2 = rx_load_vec_i128((rx_vec_i128*)state + 2);
state3 = rx_load_vec_i128((rx_vec_i128*)state + 3);
while (outptr < outputEnd) {
state0 = aesdec<softAes>(state0, key0);
state1 = aesenc<softAes>(state1, key1);
state2 = aesdec<softAes>(state2, key2);
state3 = aesenc<softAes>(state3, key3);
rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2);
rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3);
outptr += 64;
}
rx_store_vec_i128((rx_vec_i128*)state + 0, state0);
rx_store_vec_i128((rx_vec_i128*)state + 1, state1);
rx_store_vec_i128((rx_vec_i128*)state + 2, state2);
rx_store_vec_i128((rx_vec_i128*)state + 3, state3);
}
template void fillAes1Rx4<true>(void *state, size_t outputSize, void *buffer);
template void fillAes1Rx4<false>(void *state, size_t outputSize, void *buffer);
//AesGenerator4R:
//key0, key1, key2, key3 = Blake2b-512("RandomX AesGenerator4R keys 0-3")
//key4, key5, key6, key7 = Blake2b-512("RandomX AesGenerator4R keys 4-7")
#define AES_GEN_4R_KEY0 0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd
#define AES_GEN_4R_KEY1 0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450
#define AES_GEN_4R_KEY2 0x171c02bf, 0x0aa4679f, 0x515e7baf, 0x5c3ed904
#define AES_GEN_4R_KEY3 0xd8ded291, 0xcd673785, 0xe78f5d08, 0x85623763
#define AES_GEN_4R_KEY4 0x229effb4, 0x3d518b6d, 0xe3d6a7a6, 0xb5826f73
#define AES_GEN_4R_KEY5 0xb272b7d2, 0xe9024d4e, 0x9c10b3d9, 0xc7566bf3
#define AES_GEN_4R_KEY6 0xf63befa7, 0x2ba9660a, 0xf765a38b, 0xf273c9e7
#define AES_GEN_4R_KEY7 0xc0b0762d, 0x0c06d1fd, 0x915839de, 0x7a7cd609
template<bool softAes>
void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
assert(outputSize % 64 == 0);
#ifdef __riscv
if (randomx::cpu.hasAes()) {
fillAes4Rx4_zvkned(state, outputSize, buffer);
return;
}
if (randomx::cpu.hasRVV() && (randomx::cpu.getRVV_Length() >= 256)) {
fillAes4Rx4_RVV(state, outputSize, buffer);
return;
}
#endif
const uint8_t* outptr = (uint8_t*)buffer;
const uint8_t* outputEnd = outptr + outputSize;
rx_vec_i128 state0, state1, state2, state3;
rx_vec_i128 key0, key1, key2, key3, key4, key5, key6, key7;
key0 = rx_set_int_vec_i128(AES_GEN_4R_KEY0);
key1 = rx_set_int_vec_i128(AES_GEN_4R_KEY1);
key2 = rx_set_int_vec_i128(AES_GEN_4R_KEY2);
key3 = rx_set_int_vec_i128(AES_GEN_4R_KEY3);
key4 = rx_set_int_vec_i128(AES_GEN_4R_KEY4);
key5 = rx_set_int_vec_i128(AES_GEN_4R_KEY5);
key6 = rx_set_int_vec_i128(AES_GEN_4R_KEY6);
key7 = rx_set_int_vec_i128(AES_GEN_4R_KEY7);
state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
state2 = rx_load_vec_i128((rx_vec_i128*)state + 2);
state3 = rx_load_vec_i128((rx_vec_i128*)state + 3);
while (outptr < outputEnd) {
state0 = aesdec<softAes>(state0, key0);
state1 = aesenc<softAes>(state1, key0);
state2 = aesdec<softAes>(state2, key4);
state3 = aesenc<softAes>(state3, key4);
state0 = aesdec<softAes>(state0, key1);
state1 = aesenc<softAes>(state1, key1);
state2 = aesdec<softAes>(state2, key5);
state3 = aesenc<softAes>(state3, key5);
state0 = aesdec<softAes>(state0, key2);
state1 = aesenc<softAes>(state1, key2);
state2 = aesdec<softAes>(state2, key6);
state3 = aesenc<softAes>(state3, key6);
state0 = aesdec<softAes>(state0, key3);
state1 = aesenc<softAes>(state1, key3);
state2 = aesdec<softAes>(state2, key7);
state3 = aesenc<softAes>(state3, key7);
rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2);
rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3);
outptr += 64;
}
}
template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
template<bool softAes>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
#ifdef __riscv
if (randomx::cpu.hasAes()) {
hashAndFillAes1Rx4_zvkned(scratchpad, scratchpadSize, hash, fill_state);
return;
}
if (randomx::cpu.hasRVV() && (randomx::cpu.getRVV_Length() >= 256)) {
hashAndFillAes1Rx4_RVV(scratchpad, scratchpadSize, hash, fill_state);
return;
}
#endif
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
// initial state
rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0);
rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1);
rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2);
rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3);
const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0);
const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1);
const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2);
const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3);
rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0);
rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1);
rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2);
rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3);
constexpr int PREFETCH_DISTANCE = 4096;
const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE;
scratchpadEnd -= PREFETCH_DISTANCE;
for (int i = 0; i < 2; ++i) {
//process 64 bytes at a time in 4 lanes
while (scratchpadPtr < scratchpadEnd) {
hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0));
hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1));
hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2));
hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3));
fill_state0 = aesdec<softAes>(fill_state0, key0);
fill_state1 = aesenc<softAes>(fill_state1, key1);
fill_state2 = aesdec<softAes>(fill_state2, key2);
fill_state3 = aesenc<softAes>(fill_state3, key3);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3);
rx_prefetch_t0(prefetchPtr);
scratchpadPtr += 64;
prefetchPtr += 64;
}
prefetchPtr = (const char*) scratchpad;
scratchpadEnd += PREFETCH_DISTANCE;
}
rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0);
rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1);
rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2);
rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3);
//two extra rounds to achieve full diffusion
rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0);
rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1);
hash_state0 = aesenc<softAes>(hash_state0, xkey0);
hash_state1 = aesdec<softAes>(hash_state1, xkey0);
hash_state2 = aesenc<softAes>(hash_state2, xkey0);
hash_state3 = aesdec<softAes>(hash_state3, xkey0);
hash_state0 = aesenc<softAes>(hash_state0, xkey1);
hash_state1 = aesdec<softAes>(hash_state1, xkey1);
hash_state2 = aesenc<softAes>(hash_state2, xkey1);
hash_state3 = aesdec<softAes>(hash_state3, xkey1);
//output hash
rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0);
rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1);
rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2);
rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3);
}
template void hashAndFillAes1Rx4<false>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
template void hashAndFillAes1Rx4<true>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);

43
crypto/randomx/aes_hash.hpp Executable file
View file

@ -0,0 +1,43 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstddef>
template<bool softAes>
void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
template<bool softAes>
void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
template<bool softAes>
void fillAes4Rx4(void *state, size_t outputSize, void *buffer);
template<bool softAes>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);

View file

@ -0,0 +1,274 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <riscv_vector.h>
#include "soft_aes.h"
#include "randomx.h"
#include "blake2/endian.h"
static FORCE_INLINE vuint32m1_t softaes_vector_double(
vuint32m1_t in,
vuint32m1_t key,
vuint8m1_t i0, vuint8m1_t i1, vuint8m1_t i2, vuint8m1_t i3,
const uint32_t* lut0, const uint32_t* lut1, const uint32_t *lut2, const uint32_t* lut3)
{
const vuint8m1_t in8 = __riscv_vreinterpret_v_u32m1_u8m1(in);
const vuint32m1_t index0 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i0, 32));
const vuint32m1_t index1 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i1, 32));
const vuint32m1_t index2 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i2, 32));
const vuint32m1_t index3 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i3, 32));
vuint32m1_t s0 = __riscv_vluxei32_v_u32m1(lut0, __riscv_vsll_vx_u32m1(index0, 2, 8), 8);
vuint32m1_t s1 = __riscv_vluxei32_v_u32m1(lut1, __riscv_vsll_vx_u32m1(index1, 2, 8), 8);
vuint32m1_t s2 = __riscv_vluxei32_v_u32m1(lut2, __riscv_vsll_vx_u32m1(index2, 2, 8), 8);
vuint32m1_t s3 = __riscv_vluxei32_v_u32m1(lut3, __riscv_vsll_vx_u32m1(index3, 2, 8), 8);
s0 = __riscv_vxor_vv_u32m1(s0, s1, 8);
s2 = __riscv_vxor_vv_u32m1(s2, s3, 8);
s0 = __riscv_vxor_vv_u32m1(s0, s2, 8);
return __riscv_vxor_vv_u32m1(s0, key, 8);
}
static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 };
static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 };
static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 };
static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 };
static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 };
static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b };
static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 12, 8, 4, 0, 76, 72, 68, 64 };
void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash) {
const uint8_t* inptr = (const uint8_t*)input;
const uint8_t* inputEnd = inptr + inputSize;
//intial state
vuint32m1_t state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
vuint32m1_t state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);
const vuint8m1_t& lutdec_index0 = lutenc_index0;
const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
const vuint8m1_t& lutdec_index2 = lutenc_index2;
const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);
//process 64 bytes at a time in 4 lanes
while (inptr < inputEnd) {
state02 = softaes_vector_double(state02, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
state13 = softaes_vector_double(state13, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
inptr += 64;
}
//two extra rounds to achieve full diffusion
const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);
state02 = softaes_vector_double(state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
state13 = softaes_vector_double(state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
state02 = softaes_vector_double(state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
state13 = softaes_vector_double(state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
//output hash
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, state13, 8);
}
void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer) {
const uint8_t* outptr = (uint8_t*)buffer;
const uint8_t* outputEnd = outptr + outputSize;
const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);
const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);
const vuint8m1_t& lutdec_index0 = lutenc_index0;
const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
const vuint8m1_t& lutdec_index2 = lutenc_index2;
const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);
while (outptr < outputEnd) {
state02 = softaes_vector_double(state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
state13 = softaes_vector_double(state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);
outptr += 64;
}
__riscv_vsuxei32_v_u32m1((uint32_t*)state + 0, stride, state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)state + 4, stride, state13, 8);
}
static constexpr uint32_t fillAes4Rx4_Key[] = {
0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd,
0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450,
0x171c02bf, 0x0aa4679f, 0x515e7baf, 0x5c3ed904,
0xd8ded291, 0xcd673785, 0xe78f5d08, 0x85623763,
0x229effb4, 0x3d518b6d, 0xe3d6a7a6, 0xb5826f73,
0xb272b7d2, 0xe9024d4e, 0x9c10b3d9, 0xc7566bf3,
0xf63befa7, 0x2ba9660a, 0xf765a38b, 0xf273c9e7,
0xc0b0762d, 0x0c06d1fd, 0x915839de, 0x7a7cd609,
};
void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer) {
const uint8_t* outptr = (uint8_t*)buffer;
const uint8_t* outputEnd = outptr + outputSize;
const vuint32m1_t stride4 = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X4, 8);
const vuint32m1_t key04 = __riscv_vluxei32_v_u32m1(fillAes4Rx4_Key + 0, stride4, 8);
const vuint32m1_t key15 = __riscv_vluxei32_v_u32m1(fillAes4Rx4_Key + 4, stride4, 8);
const vuint32m1_t key26 = __riscv_vluxei32_v_u32m1(fillAes4Rx4_Key + 8, stride4, 8);
const vuint32m1_t key37 = __riscv_vluxei32_v_u32m1(fillAes4Rx4_Key + 12, stride4, 8);
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);
const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);
const vuint8m1_t& lutdec_index0 = lutenc_index0;
const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
const vuint8m1_t& lutdec_index2 = lutenc_index2;
const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);
while (outptr < outputEnd) {
state02 = softaes_vector_double(state02, key04, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
state13 = softaes_vector_double(state13, key04, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
state02 = softaes_vector_double(state02, key15, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
state13 = softaes_vector_double(state13, key15, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
state02 = softaes_vector_double(state02, key26, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
state13 = softaes_vector_double(state13, key26, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
state02 = softaes_vector_double(state02, key37, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
state13 = softaes_vector_double(state13, key37, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);
outptr += 64;
}
}
void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);
const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8);
vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8);
const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);
const vuint8m1_t& lutdec_index0 = lutenc_index0;
const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
const vuint8m1_t& lutdec_index2 = lutenc_index2;
const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);
//process 64 bytes at a time in 4 lanes
while (scratchpadPtr < scratchpadEnd) {
#define HASH_STATE(k) \
hash_state02 = softaes_vector_double(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
hash_state13 = softaes_vector_double(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
#define FILL_STATE(k) \
fill_state02 = softaes_vector_double(fill_state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); \
fill_state13 = softaes_vector_double(fill_state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, fill_state02, 8); \
__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, fill_state13, 8);
HASH_STATE(0);
HASH_STATE(1);
FILL_STATE(0);
FILL_STATE(1);
scratchpadPtr += 128;
}
#undef HASH_STATE
#undef FILL_STATE
__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8);
//two extra rounds to achieve full diffusion
const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);
hash_state02 = softaes_vector_double(hash_state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
hash_state13 = softaes_vector_double(hash_state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
hash_state02 = softaes_vector_double(hash_state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
hash_state13 = softaes_vector_double(hash_state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
//output hash
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8);
}

View file

@ -0,0 +1,35 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash);
void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer);
void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer);
void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);

View file

@ -0,0 +1,210 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "aes_hash.hpp"
#include "randomx.h"
#include "blake2/endian.h"
#include <riscv_vector.h>
static FORCE_INLINE vuint32m1_t aesenc_zvkned(vuint32m1_t a, vuint32m1_t b) { return __riscv_vaesem_vv_u32m1(a, b, 8); }
static FORCE_INLINE vuint32m1_t aesdec_zvkned(vuint32m1_t a, vuint32m1_t b, vuint32m1_t zero) { return __riscv_vxor_vv_u32m1(__riscv_vaesdm_vv_u32m1(a, zero, 8), b, 8); }
static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 };
static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 };
static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 };
static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 };
static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 };
static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b };
static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 12, 8, 4, 0, 76, 72, 68, 64 };
void hashAes1Rx4_zvkned(const void *input, size_t inputSize, void *hash)
{
const uint8_t* inptr = (const uint8_t*)input;
const uint8_t* inputEnd = inptr + inputSize;
//intial state
vuint32m1_t state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
vuint32m1_t state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
const vuint32m1_t zero = {};
//process 64 bytes at a time in 4 lanes
while (inptr < inputEnd) {
state02 = aesenc_zvkned(state02, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 0, stride, 8));
state13 = aesdec_zvkned(state13, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 4, stride, 8), zero);
inptr += 64;
}
//two extra rounds to achieve full diffusion
const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);
state02 = aesenc_zvkned(state02, xkey00);
state13 = aesdec_zvkned(state13, xkey00, zero);
state02 = aesenc_zvkned(state02, xkey11);
state13 = aesdec_zvkned(state13, xkey11, zero);
//output hash
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, state13, 8);
}
void fillAes1Rx4_zvkned(void *state, size_t outputSize, void *buffer)
{
const uint8_t* outptr = (uint8_t*)buffer;
const uint8_t* outputEnd = outptr + outputSize;
const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
const vuint32m1_t zero = {};
vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);
while (outptr < outputEnd) {
state02 = aesdec_zvkned(state02, key02, zero);
state13 = aesenc_zvkned(state13, key13);
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);
outptr += 64;
}
__riscv_vsuxei32_v_u32m1((uint32_t*)state + 0, stride, state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)state + 4, stride, state13, 8);
}
static constexpr uint32_t fillAes4Rx4_Key[] = {
0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd,
0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450,
0x171c02bf, 0x0aa4679f, 0x515e7baf, 0x5c3ed904,
0xd8ded291, 0xcd673785, 0xe78f5d08, 0x85623763,
0x229effb4, 0x3d518b6d, 0xe3d6a7a6, 0xb5826f73,
0xb272b7d2, 0xe9024d4e, 0x9c10b3d9, 0xc7566bf3,
0xf63befa7, 0x2ba9660a, 0xf765a38b, 0xf273c9e7,
0xc0b0762d, 0x0c06d1fd, 0x915839de, 0x7a7cd609,
};
void fillAes4Rx4_zvkned(void *state, size_t outputSize, void *buffer)
{
const uint8_t* outptr = (uint8_t*)buffer;
const uint8_t* outputEnd = outptr + outputSize;
const vuint32m1_t stride4 = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X4, 8);
const vuint32m1_t key04 = __riscv_vluxei32_v_u32m1(fillAes4Rx4_Key + 0, stride4, 8);
const vuint32m1_t key15 = __riscv_vluxei32_v_u32m1(fillAes4Rx4_Key + 4, stride4, 8);
const vuint32m1_t key26 = __riscv_vluxei32_v_u32m1(fillAes4Rx4_Key + 8, stride4, 8);
const vuint32m1_t key37 = __riscv_vluxei32_v_u32m1(fillAes4Rx4_Key + 12, stride4, 8);
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
const vuint32m1_t zero = {};
vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);
while (outptr < outputEnd) {
state02 = aesdec_zvkned(state02, key04, zero);
state13 = aesenc_zvkned(state13, key04);
state02 = aesdec_zvkned(state02, key15, zero);
state13 = aesenc_zvkned(state13, key15);
state02 = aesdec_zvkned(state02, key26, zero);
state13 = aesenc_zvkned(state13, key26);
state02 = aesdec_zvkned(state02, key37, zero);
state13 = aesenc_zvkned(state13, key37);
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);
outptr += 64;
}
}
void hashAndFillAes1Rx4_zvkned(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state)
{
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);
const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
const vuint32m1_t zero = {};
vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8);
vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8);
//process 64 bytes at a time in 4 lanes
while (scratchpadPtr < scratchpadEnd) {
hash_state02 = aesenc_zvkned(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + 0, stride, 8));
hash_state13 = aesdec_zvkned(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + 4, stride, 8), zero);
fill_state02 = aesdec_zvkned(fill_state02, key02, zero);
fill_state13 = aesenc_zvkned(fill_state13, key13);
__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + 0, stride, fill_state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + 4, stride, fill_state13, 8);
scratchpadPtr += 64;
}
__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8);
//two extra rounds to achieve full diffusion
const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);
hash_state02 = aesenc_zvkned(hash_state02, xkey00);
hash_state13 = aesdec_zvkned(hash_state13, xkey00, zero);
hash_state02 = aesenc_zvkned(hash_state02, xkey11);
hash_state13 = aesdec_zvkned(hash_state13, xkey11, zero);
//output hash
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8);
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8);
}

View file

@ -0,0 +1,35 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
void hashAes1Rx4_zvkned(const void *input, size_t inputSize, void *hash);
void fillAes1Rx4_zvkned(void *state, size_t outputSize, void *buffer);
void fillAes4Rx4_zvkned(void *state, size_t outputSize, void *buffer);
void hashAndFillAes1Rx4_zvkned(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);

63
crypto/randomx/allocator.cpp Executable file
View file

@ -0,0 +1,63 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <new>
#include "allocator.hpp"
#include "intrin_portable.h"
#include "virtual_memory.h"
#include "common.hpp"
namespace randomx {
template<size_t alignment>
void* AlignedAllocator<alignment>::allocMemory(size_t count) {
void *mem = rx_aligned_alloc(count, alignment);
if (mem == nullptr)
throw std::bad_alloc();
return mem;
}
template<size_t alignment>
void AlignedAllocator<alignment>::freeMemory(void* ptr, size_t count) {
rx_aligned_free(ptr);
}
template struct AlignedAllocator<CacheLineSize>;
void* LargePageAllocator::allocMemory(size_t count) {
void *mem = allocLargePagesMemory(count);
if (mem == nullptr)
throw std::bad_alloc();
return mem;
}
void LargePageAllocator::freeMemory(void* ptr, size_t count) {
freePagedMemory(ptr, count);
};
}

46
crypto/randomx/allocator.hpp Executable file
View file

@ -0,0 +1,46 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstddef>
namespace randomx {
template<size_t alignment>
struct AlignedAllocator {
static void* allocMemory(size_t);
static void freeMemory(void*, size_t);
};
struct LargePageAllocator {
static void* allocMemory(size_t);
static void freeMemory(void*, size_t);
};
}

261
crypto/randomx/argon2.h Executable file
View file

@ -0,0 +1,261 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#pragma once
#include <stdint.h>
#include <stddef.h>
#include <limits.h>
/*
* Argon2 input parameter restrictions
*/
/* Minimum and maximum number of lanes (degree of parallelism) */
#define ARGON2_MIN_LANES UINT32_C(1)
#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF)
/* Minimum and maximum number of threads */
#define ARGON2_MIN_THREADS UINT32_C(1)
#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF)
/* Number of synchronization points between lanes per pass */
#define ARGON2_SYNC_POINTS UINT32_C(4)
/* Minimum and maximum digest size in bytes */
#define ARGON2_MIN_OUTLEN UINT32_C(4)
#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF)
/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */
#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b))
/* Max memory size is addressing-space/2, topping at 2^32 blocks (4 TB) */
#define ARGON2_MAX_MEMORY_BITS \
ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1))
#define ARGON2_MAX_MEMORY \
ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS)
/* Minimum and maximum number of passes */
#define ARGON2_MIN_TIME UINT32_C(1)
#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF)
/* Minimum and maximum password length in bytes */
#define ARGON2_MIN_PWD_LENGTH UINT32_C(0)
#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF)
/* Minimum and maximum associated data length in bytes */
#define ARGON2_MIN_AD_LENGTH UINT32_C(0)
#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF)
/* Minimum and maximum salt length in bytes */
#define ARGON2_MIN_SALT_LENGTH UINT32_C(8)
#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF)
/* Minimum and maximum key length in bytes */
#define ARGON2_MIN_SECRET UINT32_C(0)
#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF)
/* Flags to determine which fields are securely wiped (default = no wipe). */
#define ARGON2_DEFAULT_FLAGS UINT32_C(0)
#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0)
#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1)
/* Error codes */
typedef enum Argon2_ErrorCodes {
ARGON2_OK = 0,
ARGON2_OUTPUT_PTR_NULL = -1,
ARGON2_OUTPUT_TOO_SHORT = -2,
ARGON2_OUTPUT_TOO_LONG = -3,
ARGON2_PWD_TOO_SHORT = -4,
ARGON2_PWD_TOO_LONG = -5,
ARGON2_SALT_TOO_SHORT = -6,
ARGON2_SALT_TOO_LONG = -7,
ARGON2_AD_TOO_SHORT = -8,
ARGON2_AD_TOO_LONG = -9,
ARGON2_SECRET_TOO_SHORT = -10,
ARGON2_SECRET_TOO_LONG = -11,
ARGON2_TIME_TOO_SMALL = -12,
ARGON2_TIME_TOO_LARGE = -13,
ARGON2_MEMORY_TOO_LITTLE = -14,
ARGON2_MEMORY_TOO_MUCH = -15,
ARGON2_LANES_TOO_FEW = -16,
ARGON2_LANES_TOO_MANY = -17,
ARGON2_PWD_PTR_MISMATCH = -18, /* NULL ptr with non-zero length */
ARGON2_SALT_PTR_MISMATCH = -19, /* NULL ptr with non-zero length */
ARGON2_SECRET_PTR_MISMATCH = -20, /* NULL ptr with non-zero length */
ARGON2_AD_PTR_MISMATCH = -21, /* NULL ptr with non-zero length */
ARGON2_MEMORY_ALLOCATION_ERROR = -22,
ARGON2_FREE_MEMORY_CBK_NULL = -23,
ARGON2_ALLOCATE_MEMORY_CBK_NULL = -24,
ARGON2_INCORRECT_PARAMETER = -25,
ARGON2_INCORRECT_TYPE = -26,
ARGON2_OUT_PTR_MISMATCH = -27,
ARGON2_THREADS_TOO_FEW = -28,
ARGON2_THREADS_TOO_MANY = -29,
ARGON2_MISSING_ARGS = -30,
ARGON2_ENCODING_FAIL = -31,
ARGON2_DECODING_FAIL = -32,
ARGON2_THREAD_FAIL = -33,
ARGON2_DECODING_LENGTH_FAIL = -34,
ARGON2_VERIFY_MISMATCH = -35
} argon2_error_codes;
/* Memory allocator types --- for external allocation */
typedef int(*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate);
typedef void(*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate);
/* Argon2 external data structures */
/*
*****
* Context: structure to hold Argon2 inputs:
* output array and its length,
* password and its length,
* salt and its length,
* secret and its length,
* associated data and its length,
* number of passes, amount of used memory (in KBytes, can be rounded up a bit)
* number of parallel threads that will be run.
* All the parameters above affect the output hash value.
* Additionally, two function pointers can be provided to allocate and
* deallocate the memory (if NULL, memory will be allocated internally).
* Also, three flags indicate whether to erase password, secret as soon as they
* are pre-hashed (and thus not needed anymore), and the entire memory
*****
* Simplest situation: you have output array out[8], password is stored in
* pwd[32], salt is stored in salt[16], you do not have keys nor associated
* data. You need to spend 1 GB of RAM and you run 5 passes of Argon2d with
* 4 parallel lanes.
* You want to erase the password, but you're OK with last pass not being
* erased. You want to use the default memory allocator.
* Then you initialize:
Argon2_Context(out,8,pwd,32,salt,16,NULL,0,NULL,0,5,1<<20,4,4,NULL,NULL,true,false,false,false)
*/
typedef struct Argon2_Context {
uint8_t *out; /* output array */
uint32_t outlen; /* digest length */
uint8_t *pwd; /* password array */
uint32_t pwdlen; /* password length */
uint8_t *salt; /* salt array */
uint32_t saltlen; /* salt length */
uint8_t *secret; /* key array */
uint32_t secretlen; /* key length */
uint8_t *ad; /* associated data array */
uint32_t adlen; /* associated data length */
uint32_t t_cost; /* number of passes */
uint32_t m_cost; /* amount of memory requested (KB) */
uint32_t lanes; /* number of lanes */
uint32_t threads; /* maximum number of threads */
uint32_t version; /* version number */
allocate_fptr allocate_cbk; /* pointer to memory allocator */
deallocate_fptr free_cbk; /* pointer to memory deallocator */
uint32_t flags; /* array of bool options */
} argon2_context;
/* Argon2 primitive type */
typedef enum Argon2_type {
Argon2_d = 0,
Argon2_i = 1,
Argon2_id = 2
} argon2_type;
/* Version of the algorithm */
typedef enum Argon2_version {
ARGON2_VERSION_10 = 0x10,
ARGON2_VERSION_13 = 0x13,
ARGON2_VERSION_NUMBER = ARGON2_VERSION_13
} argon2_version;
//Argon2 instance - forward declaration
typedef struct Argon2_instance_t argon2_instance_t;
//Argon2 position = forward declaration
typedef struct Argon2_position_t argon2_position_t;
//Argon2 implementation function
typedef void randomx_argon2_impl(const argon2_instance_t* instance,
argon2_position_t position);
#if defined(__cplusplus)
extern "C" {
#endif
/*
* Function that fills the segment using previous segments also from other
* threads
* @param context current context
* @param instance Pointer to the current instance
* @param position Current position
* @pre all block pointers must be valid
*/
void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl *randomx_argon2_impl_ssse3();
randomx_argon2_impl *randomx_argon2_impl_avx2();
#if defined(__cplusplus)
}
#endif

174
crypto/randomx/argon2_avx2.c Executable file
View file

@ -0,0 +1,174 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include "argon2.h"
void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl* randomx_argon2_impl_avx2() {
#if defined(__AVX2__)
return &randomx_argon2_fill_segment_avx2;
#endif
return NULL;
}
#if defined(__AVX2__)
#include "argon2_core.h"
#include "blake2/blamka-round-avx2.h"
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void fill_block(__m256i* state, const block* ref_block,
block* next_block, int with_xor) {
__m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i));
block_XY[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)next_block->v + i));
}
}
else {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i));
}
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_2(state[0 + i], state[4 + i], state[8 + i], state[12 + i],
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
}
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
_mm256_storeu_si256((__m256i*)next_block->v + i, state[i]);
}
}
void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance,
argon2_position_t position) {
block* ref_block = NULL, * curr_block = NULL;
block address_block, input_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
__m256i state[ARGON2_HWORDS_IN_BLOCK];
if (instance == NULL) {
return;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
}
/* Offset of the current block */
curr_offset = position.lane * instance->lane_length +
position.slice * instance->segment_length + starting_index;
if (0 == curr_offset % instance->lane_length) {
/* Last block in this lane */
prev_offset = curr_offset + instance->lane_length - 1;
}
else {
/* Previous block */
prev_offset = curr_offset - 1;
}
memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
for (i = starting_index; i < instance->segment_length;
++i, ++curr_offset, ++prev_offset) {
/*1.1 Rotating prev_offset if needed */
if (curr_offset % instance->lane_length == 1) {
prev_offset = curr_offset - 1;
}
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
if ((position.pass == 0) && (position.slice == 0)) {
/* Can not reference other lanes yet */
ref_lane = position.lane;
}
/* 1.2.3 Computing the number of possible reference block within the
* lane.
*/
position.index = i;
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(state, ref_block, curr_block, 0);
}
else {
if (0 == position.pass) {
fill_block(state, ref_block, curr_block, 0);
}
else {
fill_block(state, ref_block, curr_block, 1);
}
}
}
}
#endif

411
crypto/randomx/argon2_core.c Executable file
View file

@ -0,0 +1,411 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
/*For memory wiping*/
#ifdef _MSC_VER
#include <windows.h>
#include <winbase.h> /* For SecureZeroMemory */
#endif
#if defined __STDC_LIB_EXT1__
#define __STDC_WANT_LIB_EXT1__ 1
#endif
#define VC_GE_2005(version) (version >= 1400)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "argon2_core.h"
#include "blake2/blake2.h"
#include "blake2/blake2-impl.h"
#ifdef GENKAT
#include "genkat.h"
#endif
#if defined(__clang__)
#if __has_attribute(optnone)
#define NOT_OPTIMIZED __attribute__((optnone))
#endif
#elif defined(__GNUC__)
#define GCC_VERSION \
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#if GCC_VERSION >= 40400
#define NOT_OPTIMIZED __attribute__((optimize("O0")))
#endif
#endif
#ifndef NOT_OPTIMIZED
#define NOT_OPTIMIZED
#endif
/***************Instance and Position constructors**********/
static void load_block(block *dst, const void *input) {
unsigned i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i]));
}
}
static void store_block(void *output, const block *src) {
unsigned i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]);
}
}
uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance,
const argon2_position_t *position, uint32_t pseudo_rand,
int same_lane) {
/*
* Pass 0:
* This lane : all already finished segments plus already constructed
* blocks in this segment
* Other lanes : all already finished segments
* Pass 1+:
* This lane : (SYNC_POINTS - 1) last segments plus already constructed
* blocks in this segment
* Other lanes : (SYNC_POINTS - 1) last segments
*/
uint32_t reference_area_size;
uint64_t relative_position;
uint32_t start_position, absolute_position;
if (0 == position->pass) {
/* First pass */
if (0 == position->slice) {
/* First slice */
reference_area_size =
position->index - 1; /* all but the previous */
}
else {
if (same_lane) {
/* The same lane => add current segment */
reference_area_size =
position->slice * instance->segment_length +
position->index - 1;
}
else {
reference_area_size =
position->slice * instance->segment_length +
((position->index == 0) ? (-1) : 0);
}
}
}
else {
/* Second pass */
if (same_lane) {
reference_area_size = instance->lane_length -
instance->segment_length + position->index -
1;
}
else {
reference_area_size = instance->lane_length -
instance->segment_length +
((position->index == 0) ? (-1) : 0);
}
}
/* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and produce
* relative position */
relative_position = pseudo_rand;
relative_position = relative_position * relative_position >> 32;
relative_position = reference_area_size - 1 -
(reference_area_size * relative_position >> 32);
/* 1.2.5 Computing starting position */
start_position = 0;
if (0 != position->pass) {
start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
? 0
: (position->slice + 1) * instance->segment_length;
}
/* 1.2.6. Computing absolute position */
absolute_position = (start_position + relative_position) %
instance->lane_length; /* absolute position */
return absolute_position;
}
/* Single-threaded version for p=1 case */
static int fill_memory_blocks_st(argon2_instance_t *instance) {
uint32_t r, s, l;
for (r = 0; r < instance->passes; ++r) {
for (s = 0; s < ARGON2_SYNC_POINTS; ++s) {
for (l = 0; l < instance->lanes; ++l) {
argon2_position_t position = { r, l, (uint8_t)s, 0 };
//fill the segment using the selected implementation
instance->impl(instance, position);
}
}
}
return ARGON2_OK;
}
int randomx_argon2_fill_memory_blocks(argon2_instance_t *instance) {
if (instance == NULL || instance->lanes == 0) {
return ARGON2_INCORRECT_PARAMETER;
}
return fill_memory_blocks_st(instance);
}
int randomx_argon2_validate_inputs(const argon2_context *context) {
if (NULL == context) {
return ARGON2_INCORRECT_PARAMETER;
}
/* Validate password (required param) */
if (NULL == context->pwd) {
if (0 != context->pwdlen) {
return ARGON2_PWD_PTR_MISMATCH;
}
}
if (ARGON2_MIN_PWD_LENGTH > context->pwdlen) {
return ARGON2_PWD_TOO_SHORT;
}
if (ARGON2_MAX_PWD_LENGTH < context->pwdlen) {
return ARGON2_PWD_TOO_LONG;
}
/* Validate salt (required param) */
if (NULL == context->salt) {
if (0 != context->saltlen) {
return ARGON2_SALT_PTR_MISMATCH;
}
}
if (ARGON2_MIN_SALT_LENGTH > context->saltlen) {
return ARGON2_SALT_TOO_SHORT;
}
if (ARGON2_MAX_SALT_LENGTH < context->saltlen) {
return ARGON2_SALT_TOO_LONG;
}
/* Validate secret (optional param) */
if (NULL == context->secret) {
if (0 != context->secretlen) {
return ARGON2_SECRET_PTR_MISMATCH;
}
}
else {
if (ARGON2_MIN_SECRET > context->secretlen) {
return ARGON2_SECRET_TOO_SHORT;
}
if (ARGON2_MAX_SECRET < context->secretlen) {
return ARGON2_SECRET_TOO_LONG;
}
}
/* Validate associated data (optional param) */
if (NULL == context->ad) {
if (0 != context->adlen) {
return ARGON2_AD_PTR_MISMATCH;
}
}
else {
if (ARGON2_MIN_AD_LENGTH > context->adlen) {
return ARGON2_AD_TOO_SHORT;
}
if (ARGON2_MAX_AD_LENGTH < context->adlen) {
return ARGON2_AD_TOO_LONG;
}
}
/* Validate memory cost */
if (ARGON2_MIN_MEMORY > context->m_cost) {
return ARGON2_MEMORY_TOO_LITTLE;
}
if (ARGON2_MAX_MEMORY < context->m_cost) {
return ARGON2_MEMORY_TOO_MUCH;
}
if (context->m_cost < 8 * context->lanes) {
return ARGON2_MEMORY_TOO_LITTLE;
}
/* Validate time cost */
if (ARGON2_MIN_TIME > context->t_cost) {
return ARGON2_TIME_TOO_SMALL;
}
if (ARGON2_MAX_TIME < context->t_cost) {
return ARGON2_TIME_TOO_LARGE;
}
/* Validate lanes */
if (ARGON2_MIN_LANES > context->lanes) {
return ARGON2_LANES_TOO_FEW;
}
if (ARGON2_MAX_LANES < context->lanes) {
return ARGON2_LANES_TOO_MANY;
}
/* Validate threads */
if (ARGON2_MIN_THREADS > context->threads) {
return ARGON2_THREADS_TOO_FEW;
}
if (ARGON2_MAX_THREADS < context->threads) {
return ARGON2_THREADS_TOO_MANY;
}
if (NULL != context->allocate_cbk && NULL == context->free_cbk) {
return ARGON2_FREE_MEMORY_CBK_NULL;
}
if (NULL == context->allocate_cbk && NULL != context->free_cbk) {
return ARGON2_ALLOCATE_MEMORY_CBK_NULL;
}
return ARGON2_OK;
}
void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
uint32_t l;
/* Make the first and second block in each lane as G(H0||0||i) or
G(H0||1||i) */
uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
for (l = 0; l < instance->lanes; ++l) {
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l);
blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash,
ARGON2_PREHASH_SEED_LENGTH);
load_block(&instance->memory[l * instance->lane_length + 0],
blockhash_bytes);
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash,
ARGON2_PREHASH_SEED_LENGTH);
load_block(&instance->memory[l * instance->lane_length + 1],
blockhash_bytes);
}
}
void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type) {
blake2b_state BlakeHash;
uint8_t value[sizeof(uint32_t)];
if (NULL == context || NULL == blockhash) {
return;
}
blake2b_init(&BlakeHash, ARGON2_PREHASH_DIGEST_LENGTH);
store32(&value, context->lanes);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, context->outlen);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, context->m_cost);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, context->t_cost);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, context->version);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, (uint32_t)type);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, context->pwdlen);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
if (context->pwd != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
context->pwdlen);
}
store32(&value, context->saltlen);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
if (context->salt != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->salt, context->saltlen);
}
store32(&value, context->secretlen);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
if (context->secret != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->secret,
context->secretlen);
}
store32(&value, context->adlen);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
if (context->ad != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->ad,
context->adlen);
}
blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
}
int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context) {
uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
int result = ARGON2_OK;
if (instance == NULL || context == NULL)
return ARGON2_INCORRECT_PARAMETER;
instance->context_ptr = context;
/* 1. Memory allocation */
//RandomX takes care of memory allocation
/* 2. Initial hashing */
/* H_0 + 8 extra bytes to produce the first blocks */
/* uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; */
/* Hashing all inputs */
rxa2_initial_hash(blockhash, context, instance->type);
/* Zeroing 8 extra bytes */
/*rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
ARGON2_PREHASH_SEED_LENGTH -
ARGON2_PREHASH_DIGEST_LENGTH);*/
/* 3. Creating first blocks, we always have at least two blocks in a slice
*/
rxa2_fill_first_blocks(blockhash, instance);
return ARGON2_OK;
}

163
crypto/randomx/argon2_core.h Executable file
View file

@ -0,0 +1,163 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef ARGON2_CORE_H
#define ARGON2_CORE_H
#include <stdint.h>
#include "argon2.h"
#if defined(__cplusplus)
extern "C" {
#endif
#define CONST_CAST(x) (x)(uintptr_t)
/**********************Argon2 internal constants*******************************/
enum argon2_core_constants {
/* Memory block size in bytes */
ARGON2_BLOCK_SIZE = 1024,
ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16,
ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32,
ARGON2_512BIT_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 64,
/* Number of pseudo-random values generated by one call to Blake in Argon2i
to
generate reference block positions */
ARGON2_ADDRESSES_IN_BLOCK = 128,
/* Pre-hashing digest length and its extension*/
ARGON2_PREHASH_DIGEST_LENGTH = 64,
ARGON2_PREHASH_SEED_LENGTH = 72
};
/*************************Argon2 internal data types***********************/
/*
* Structure for the (1KB) memory block implemented as 128 64-bit words.
* Memory blocks can be copied, XORed. Internal words can be accessed by [] (no
* bounds checking).
*/
typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block;
/*
* Argon2 instance: memory pointer, number of passes, amount of memory, type,
* and derived values.
* Used to evaluate the number and location of blocks to construct in each
* thread
*/
typedef struct Argon2_instance_t {
block *memory; /* Memory pointer */
uint32_t version;
uint32_t passes; /* Number of passes */
uint32_t memory_blocks; /* Number of blocks in memory */
uint32_t segment_length;
uint32_t lane_length;
uint32_t lanes;
uint32_t threads;
argon2_type type;
int print_internals; /* whether to print the memory blocks */
argon2_context *context_ptr; /* points back to original context */
randomx_argon2_impl *impl;
} argon2_instance_t;
/*
* Argon2 position: where we construct the block right now. Used to distribute
* work between threads.
*/
typedef struct Argon2_position_t {
uint32_t pass;
uint32_t lane;
uint8_t slice;
uint32_t index;
} argon2_position_t;
/*Struct that holds the inputs for thread handling FillSegment*/
typedef struct Argon2_thread_data {
argon2_instance_t *instance_ptr;
argon2_position_t pos;
} argon2_thread_data;
/*************************Argon2 core functions********************************/
/*
* Computes absolute position of reference block in the lane following a skewed
* distribution and using a pseudo-random value as input
* @param instance Pointer to the current instance
* @param position Pointer to the current position
* @param pseudo_rand 32-bit pseudo-random value used to determine the position
* @param same_lane Indicates if the block will be taken from the current lane.
* If so we can reference the current segment
* @pre All pointers must be valid
*/
uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance,
const argon2_position_t *position, uint32_t pseudo_rand,
int same_lane);
/*
* Function that validates all inputs against predefined restrictions and return
* an error code
* @param context Pointer to current Argon2 context
* @return ARGON2_OK if everything is all right, otherwise one of error codes
* (all defined in <argon2.h>
*/
int randomx_argon2_validate_inputs(const argon2_context *context);
/*
* Function allocates memory, hashes the inputs with Blake, and creates first
* two blocks. Returns the pointer to the main memory with 2 blocks per lane
* initialized
* @param context Pointer to the Argon2 internal structure containing memory
* pointer, and parameters for time and space requirements.
* @param instance Current Argon2 instance
* @return Zero if successful, -1 if memory failed to allocate. @context->state
* will be modified if successful.
*/
int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context);
/*
* Function that fills the entire memory t_cost times based on the first two
* blocks in each lane
* @param instance Pointer to the current instance
* @return ARGON2_OK if successful, @context->state
*/
int randomx_argon2_fill_memory_blocks(argon2_instance_t* instance);
#if defined(__cplusplus)
}
#endif
#endif

187
crypto/randomx/argon2_ref.c Executable file
View file

@ -0,0 +1,187 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include "argon2.h"
#include "argon2_core.h"
#include "blake2/blamka-round-ref.h"
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void copy_block(block* dst, const block* src) {
memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK);
}
static void xor_block(block* dst, const block* src) {
int i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
dst->v[i] ^= src->v[i];
}
}
/*
* Function fills a new memory block and optionally XORs the old block over the new one.
* @next_block must be initialized.
* @param prev_block Pointer to the previous block
* @param ref_block Pointer to the reference block
* @param next_block Pointer to the block to be constructed
* @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
* @pre all block pointers must be valid
*/
static void fill_block(const block *prev_block, const block *ref_block,
block *next_block, int with_xor) {
block blockR, block_tmp;
unsigned i;
copy_block(&blockR, ref_block);
xor_block(&blockR, prev_block);
copy_block(&block_tmp, &blockR);
/* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */
if (with_xor) {
/* Saving the next block contents for XOR over: */
xor_block(&block_tmp, next_block);
/* Now blockR = ref_block + prev_block and
block_tmp = ref_block + prev_block + next_block */
}
/* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then
(16,17,..31)... finally (112,113,...127) */
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND_NOMSG(
blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2],
blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5],
blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8],
blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11],
blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14],
blockR.v[16 * i + 15]);
}
/* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then
(2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */
for (i = 0; i < 8; i++) {
BLAKE2_ROUND_NOMSG(
blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16],
blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33],
blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64],
blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81],
blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112],
blockR.v[2 * i + 113]);
}
copy_block(next_block, &block_tmp);
xor_block(next_block, &blockR);
}
void randomx_argon2_fill_segment_ref(const argon2_instance_t *instance,
argon2_position_t position) {
block *ref_block = NULL, *curr_block = NULL;
block address_block, input_block, zero_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index;
uint32_t i;
if (instance == NULL) {
return;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
}
/* Offset of the current block */
curr_offset = position.lane * instance->lane_length +
position.slice * instance->segment_length + starting_index;
if (0 == curr_offset % instance->lane_length) {
/* Last block in this lane */
prev_offset = curr_offset + instance->lane_length - 1;
}
else {
/* Previous block */
prev_offset = curr_offset - 1;
}
for (i = starting_index; i < instance->segment_length;
++i, ++curr_offset, ++prev_offset) {
/*1.1 Rotating prev_offset if needed */
if (curr_offset % instance->lane_length == 1) {
prev_offset = curr_offset - 1;
}
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
if ((position.pass == 0) && (position.slice == 0)) {
/* Can not reference other lanes yet */
ref_lane = position.lane;
}
/* 1.2.3 Computing the number of possible reference block within the
* lane.
*/
position.index = i;
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(instance->memory + prev_offset, ref_block, curr_block, 0);
}
else {
if (0 == position.pass) {
fill_block(instance->memory + prev_offset, ref_block,
curr_block, 0);
}
else {
fill_block(instance->memory + prev_offset, ref_block,
curr_block, 1);
}
}
}
}

182
crypto/randomx/argon2_ssse3.c Executable file
View file

@ -0,0 +1,182 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include "argon2.h"
#if defined(_MSC_VER) //MSVC doesn't define SSSE3
#define __SSSE3__
#endif
void randomx_argon2_fill_segment_ssse3(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl* randomx_argon2_impl_ssse3() {
#if defined(__SSSE3__)
return &randomx_argon2_fill_segment_ssse3;
#endif
return NULL;
}
#if defined(__SSSE3__)
#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
#include "argon2_core.h"
#include "blake2/blamka-round-ssse3.h"
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void fill_block(__m128i* state, const block* ref_block,
block* next_block, int with_xor) {
__m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i));
block_XY[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)next_block->v + i));
}
}
else {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i));
}
}
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
state[8 * i + 6], state[8 * i + 7]);
}
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
state[8 * 6 + i], state[8 * 7 + i]);
}
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(state[i], block_XY[i]);
_mm_storeu_si128((__m128i*)next_block->v + i, state[i]);
}
}
void randomx_argon2_fill_segment_ssse3(const argon2_instance_t* instance,
argon2_position_t position) {
block* ref_block = NULL, * curr_block = NULL;
block address_block, input_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
__m128i state[ARGON2_OWORDS_IN_BLOCK];
if (instance == NULL) {
return;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
}
/* Offset of the current block */
curr_offset = position.lane * instance->lane_length +
position.slice * instance->segment_length + starting_index;
if (0 == curr_offset % instance->lane_length) {
/* Last block in this lane */
prev_offset = curr_offset + instance->lane_length - 1;
}
else {
/* Previous block */
prev_offset = curr_offset - 1;
}
memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
for (i = starting_index; i < instance->segment_length;
++i, ++curr_offset, ++prev_offset) {
/*1.1 Rotating prev_offset if needed */
if (curr_offset % instance->lane_length == 1) {
prev_offset = curr_offset - 1;
}
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
if ((position.pass == 0) && (position.slice == 0)) {
/* Can not reference other lanes yet */
ref_lane = position.lane;
}
/* 1.2.3 Computing the number of possible reference block within the
* lane.
*/
position.index = i;
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(state, ref_block, curr_block, 0);
}
else {
if (0 == position.pass) {
fill_block(state, ref_block, curr_block, 0);
}
else {
fill_block(state, ref_block, curr_block, 1);
}
}
}
}
#endif

View file

@ -0,0 +1,48 @@
; File start: ..\src\configuration.h
RANDOMX_ARGON_MEMORY EQU 262144t
RANDOMX_ARGON_ITERATIONS EQU 3t
RANDOMX_ARGON_LANES EQU 1t
RANDOMX_ARGON_SALT TEXTEQU <"RandomX\x03">
RANDOMX_CACHE_ACCESSES EQU 8t
RANDOMX_SUPERSCALAR_LATENCY EQU 170t
RANDOMX_DATASET_BASE_SIZE EQU 2147483648t
RANDOMX_DATASET_EXTRA_SIZE EQU 33554368t
RANDOMX_PROGRAM_SIZE EQU 256t
RANDOMX_PROGRAM_ITERATIONS EQU 2048t
RANDOMX_PROGRAM_COUNT EQU 8t
RANDOMX_SCRATCHPAD_L3 EQU 2097152t
RANDOMX_SCRATCHPAD_L2 EQU 262144t
RANDOMX_SCRATCHPAD_L1 EQU 16384t
RANDOMX_JUMP_BITS EQU 8t
RANDOMX_JUMP_OFFSET EQU 8t
RANDOMX_FREQ_IADD_RS EQU 16t
RANDOMX_FREQ_IADD_M EQU 7t
RANDOMX_FREQ_ISUB_R EQU 16t
RANDOMX_FREQ_ISUB_M EQU 7t
RANDOMX_FREQ_IMUL_R EQU 16t
RANDOMX_FREQ_IMUL_M EQU 4t
RANDOMX_FREQ_IMULH_R EQU 4t
RANDOMX_FREQ_IMULH_M EQU 1t
RANDOMX_FREQ_ISMULH_R EQU 4t
RANDOMX_FREQ_ISMULH_M EQU 1t
RANDOMX_FREQ_IMUL_RCP EQU 8t
RANDOMX_FREQ_INEG_R EQU 2t
RANDOMX_FREQ_IXOR_R EQU 15t
RANDOMX_FREQ_IXOR_M EQU 5t
RANDOMX_FREQ_IROR_R EQU 8t
RANDOMX_FREQ_IROL_R EQU 2t
RANDOMX_FREQ_ISWAP_R EQU 4t
RANDOMX_FREQ_FSWAP_R EQU 4t
RANDOMX_FREQ_FADD_R EQU 16t
RANDOMX_FREQ_FADD_M EQU 5t
RANDOMX_FREQ_FSUB_R EQU 16t
RANDOMX_FREQ_FSUB_M EQU 5t
RANDOMX_FREQ_FSCAL_R EQU 6t
RANDOMX_FREQ_FMUL_R EQU 32t
RANDOMX_FREQ_FDIV_M EQU 4t
RANDOMX_FREQ_FSQRT_R EQU 6t
RANDOMX_FREQ_CBRANCH EQU 25t
RANDOMX_FREQ_CFROUND EQU 1t
RANDOMX_FREQ_ISTORE EQU 16t
RANDOMX_FREQ_NOP EQU 0t
; File end: ..\src\configuration.h

View file

@ -0,0 +1,10 @@
;# restore callee-saved registers - System V AMD64 ABI
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
;# program finished
ret 0

View file

@ -0,0 +1,19 @@
;# save VM register values
pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
movdqa xmmword ptr [rcx+64], xmm0
movdqa xmmword ptr [rcx+80], xmm1
movdqa xmmword ptr [rcx+96], xmm2
movdqa xmmword ptr [rcx+112], xmm3
lea rcx, [rcx+64]
movdqa xmmword ptr [rcx+64], xmm4
movdqa xmmword ptr [rcx+80], xmm5
movdqa xmmword ptr [rcx+96], xmm6
movdqa xmmword ptr [rcx+112], xmm7

View file

@ -0,0 +1,24 @@
;# restore callee-saved registers - Microsoft x64 calling convention
movdqu xmm15, xmmword ptr [rsp]
movdqu xmm14, xmmword ptr [rsp+16]
movdqu xmm13, xmmword ptr [rsp+32]
movdqu xmm12, xmmword ptr [rsp+48]
movdqu xmm11, xmmword ptr [rsp+64]
add rsp, 80
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
;# program finished
ret

View file

@ -0,0 +1,28 @@
lea rcx, [rsi+rax]
push rcx
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
lea rcx, [rsi+rdx]
push rcx
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16]
cvtdq2pd xmm3, qword ptr [rcx+24]
cvtdq2pd xmm4, qword ptr [rcx+32]
cvtdq2pd xmm5, qword ptr [rcx+40]
cvtdq2pd xmm6, qword ptr [rcx+48]
cvtdq2pd xmm7, qword ptr [rcx+56]
andps xmm4, xmm13
andps xmm5, xmm13
andps xmm6, xmm13
andps xmm7, xmm13
orps xmm4, xmm14
orps xmm5, xmm14
orps xmm6, xmm14
orps xmm7, xmm14

View file

@ -0,0 +1,18 @@
pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
pop rcx
xorpd xmm0, xmm4
xorpd xmm1, xmm5
xorpd xmm2, xmm6
xorpd xmm3, xmm7
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3

View file

@ -0,0 +1,35 @@
;# callee-saved registers - System V AMD64 ABI
push rbx
push rbp
push r12
push r13
push r14
push r15
;# function arguments
mov rbx, rcx ;# loop counter
push rdi ;# RegisterFile& registerFile
mov rcx, rdi
mov rbp, qword ptr [rsi] ;# "mx", "ma"
mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset
mov rsi, rdx ;# uint8_t* scratchpad
mov rax, rbp
ror rbp, 32
;# zero integer registers
xor r8, r8
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
;# load constant registers
lea rcx, [rcx+120]
movapd xmm8, xmmword ptr [rcx+72]
movapd xmm9, xmmword ptr [rcx+88]
movapd xmm10, xmmword ptr [rcx+104]
movapd xmm11, xmmword ptr [rcx+120]

View file

@ -0,0 +1,48 @@
;# callee-saved registers - Microsoft x64 calling convention
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm11
movdqu xmmword ptr [rsp+48], xmm12
movdqu xmmword ptr [rsp+32], xmm13
movdqu xmmword ptr [rsp+16], xmm14
movdqu xmmword ptr [rsp+0], xmm15
;# function arguments
push rcx ;# RegisterFile& registerFile
mov rbp, qword ptr [rdx] ;# "mx", "ma"
mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset
mov rsi, r8 ;# uint8_t* scratchpad
mov rbx, r9 ;# loop counter
mov rax, rbp
ror rbp, 32
;# zero integer registers
xor r8, r8
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
;# load constant registers
lea rcx, [rcx+120]
movapd xmm8, xmmword ptr [rcx+72]
movapd xmm9, xmmword ptr [rcx+88]
movapd xmm10, xmmword ptr [rcx+104]
movapd xmm11, xmmword ptr [rcx+120]

View file

@ -0,0 +1,16 @@
mov ecx, ebp ;# ecx = ma
and ecx, RANDOMX_DATASET_BASE_MASK
xor r8, qword ptr [rdi+rcx]
ror rbp, 32 ;# swap "ma" and "mx"
xor rbp, rax ;# modify "mx"
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx]
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]

View file

@ -0,0 +1,10 @@
mov rbx, qword ptr [rsp+64]
xor r8, qword ptr [rsp+56]
xor r9, qword ptr [rsp+48]
xor r10, qword ptr [rsp+40]
xor r11, qword ptr [rsp+32]
xor r12, qword ptr [rsp+24]
xor r13, qword ptr [rsp+16]
xor r14, qword ptr [rsp+8]
xor r15, qword ptr [rsp+0]
add rsp, 72

View file

@ -0,0 +1,17 @@
sub rsp, 72
mov qword ptr [rsp+64], rbx
mov qword ptr [rsp+56], r8
mov qword ptr [rsp+48], r9
mov qword ptr [rsp+40], r10
mov qword ptr [rsp+32], r11
mov qword ptr [rsp+24], r12
mov qword ptr [rsp+16], r13
mov qword ptr [rsp+8], r14
mov qword ptr [rsp+0], r15
ror rbp, 32 ;# swap "ma" and "mx"
xor rbp, rax ;# modify "mx"
mov rbx, rbp ;# ebx = ma
shr rbx, 38
and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
;# add ebx, datasetOffset / 64
;# call 32768

View file

@ -0,0 +1,24 @@
r0_mul:
;#/ 6364136223846793005
db 45, 127, 149, 76, 45, 244, 81, 88
r1_add:
;#/ 9298411001130361340
db 252, 161, 245, 89, 138, 151, 10, 129
r2_add:
;#/ 12065312585734608966
db 70, 216, 194, 56, 223, 153, 112, 167
r3_add:
;#/ 9306329213124626780
db 92, 73, 34, 191, 28, 185, 38, 129
r4_add:
;#/ 5281919268842080866
db 98, 138, 159, 23, 151, 37, 77, 73
r5_add:
;#/ 10536153434571861004
db 12, 236, 170, 206, 185, 239, 55, 146
r6_add:
;#/ 3398623926847679864
db 120, 45, 230, 108, 116, 86, 42, 47
r7_add:
;#/ 9549104520008361294
db 78, 229, 44, 182, 247, 59, 133, 132

View file

@ -0,0 +1,8 @@
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]

View file

@ -0,0 +1,4 @@
and rbx, RANDOMX_CACHE_MASK
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]

View file

@ -0,0 +1,6 @@
mantissaMask:
db 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255, 255, 0
exp240:
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
scaleMask:
db 0, 0, 0, 0, 0, 0, 240, 128, 0, 0, 0, 0, 0, 0, 240, 128

View file

@ -0,0 +1,7 @@
mov edx, 1
mov r8, rcx
xor eax, eax
bsr rcx, rcx
shl rdx, cl
div r8
ret

View file

@ -0,0 +1,611 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <climits>
#include "assembly_generator_x86.hpp"
#include "common.hpp"
#include "reciprocal.h"
#include "program.hpp"
#include "superscalar.hpp"
namespace randomx {
static const char* regR[] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" };
static const char* regR32[] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" };
static const char* regFE[] = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" };
static const char* regF[] = { "xmm0", "xmm1", "xmm2", "xmm3" };
static const char* regE[] = { "xmm4", "xmm5", "xmm6", "xmm7" };
static const char* regA[] = { "xmm8", "xmm9", "xmm10", "xmm11" };
static const char* tempRegx = "xmm12";
static const char* mantissaMaskReg = "xmm13";
static const char* exponentMaskReg = "xmm14";
static const char* scaleMaskReg = "xmm15";
static const char* regIc = "rbx";
static const char* regIc32 = "ebx";
static const char* regIc8 = "bl";
static const char* regScratchpadAddr = "rsi";
void AssemblyGeneratorX86::generateProgram(Program& prog) {
for (unsigned i = 0; i < RegistersCount; ++i) {
registerUsage[i] = -1;
}
asmCode.str(std::string()); //clear
for (unsigned i = 0; i < prog.getSize(); ++i) {
asmCode << "randomx_isn_" << i << ":" << std::endl;
Instruction& instr = prog(i);
instr.src %= RegistersCount;
instr.dst %= RegistersCount;
generateCode(instr, i);
}
}
void AssemblyGeneratorX86::generateAsm(SuperscalarProgram& prog) {
asmCode.str(std::string()); //clear
#ifdef RANDOMX_ALIGN
asmCode << "ALIGN 16" << std::endl;
#endif
for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i);
switch ((SuperscalarInstructionType)instr.opcode)
{
case SuperscalarInstructionType::ISUB_R:
asmCode << "sub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
break;
case SuperscalarInstructionType::IXOR_R:
asmCode << "xor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
break;
case SuperscalarInstructionType::IADD_RS:
asmCode << "lea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.getModShift())) << "]" << std::endl;
break;
case SuperscalarInstructionType::IMUL_R:
asmCode << "imul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
break;
case SuperscalarInstructionType::IROR_C:
asmCode << "ror " << regR[instr.dst] << ", " << instr.getImm32() << std::endl;
break;
case SuperscalarInstructionType::IADD_C7:
asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
break;
case SuperscalarInstructionType::IXOR_C7:
asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
break;
case SuperscalarInstructionType::IADD_C8:
asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
#ifdef RANDOMX_ALIGN
asmCode << "nop" << std::endl;
#endif
break;
case SuperscalarInstructionType::IXOR_C8:
asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
#ifdef RANDOMX_ALIGN
asmCode << "nop" << std::endl;
#endif
break;
case SuperscalarInstructionType::IADD_C9:
asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
#ifdef RANDOMX_ALIGN
asmCode << "xchg ax, ax ;nop" << std::endl;
#endif
break;
case SuperscalarInstructionType::IXOR_C9:
asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
#ifdef RANDOMX_ALIGN
asmCode << "xchg ax, ax ;nop" << std::endl;
#endif
break;
case SuperscalarInstructionType::IMULH_R:
asmCode << "mov rax, " << regR[instr.dst] << std::endl;
asmCode << "mul " << regR[instr.src] << std::endl;
asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl;
break;
case SuperscalarInstructionType::ISMULH_R:
asmCode << "mov rax, " << regR[instr.dst] << std::endl;
asmCode << "imul " << regR[instr.src] << std::endl;
asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl;
break;
case SuperscalarInstructionType::IMUL_RCP:
asmCode << "mov rax, " << (int64_t)randomx_reciprocal(instr.getImm32()) << std::endl;
asmCode << "imul " << regR[instr.dst] << ", rax" << std::endl;
break;
default:
UNREACHABLE;
}
}
}
void AssemblyGeneratorX86::generateC(SuperscalarProgram& prog) {
asmCode.str(std::string()); //clear
asmCode << "#include <stdint.h>" << std::endl;
asmCode << "#if defined(__SIZEOF_INT128__)" << std::endl;
asmCode << " static inline uint64_t mulh(uint64_t a, uint64_t b) {" << std::endl;
asmCode << " return ((unsigned __int128)a * b) >> 64;" << std::endl;
asmCode << " }" << std::endl;
asmCode << " static inline int64_t smulh(int64_t a, int64_t b) {" << std::endl;
asmCode << " return ((__int128)a * b) >> 64;" << std::endl;
asmCode << " }" << std::endl;
asmCode << " #define HAVE_MULH" << std::endl;
asmCode << " #define HAVE_SMULH" << std::endl;
asmCode << "#endif" << std::endl;
asmCode << "#if defined(_MSC_VER)" << std::endl;
asmCode << " #define HAS_VALUE(X) X ## 0" << std::endl;
asmCode << " #define EVAL_DEFINE(X) HAS_VALUE(X)" << std::endl;
asmCode << " #include <intrin.h>" << std::endl;
asmCode << " #include <stdlib.h>" << std::endl;
asmCode << " static __inline uint64_t rotr(uint64_t x , int c) {" << std::endl;
asmCode << " return _rotr64(x, c);" << std::endl;
asmCode << " }" << std::endl;
asmCode << " #define HAVE_ROTR" << std::endl;
asmCode << " #if EVAL_DEFINE(__MACHINEARM64_X64(1))" << std::endl;
asmCode << " static __inline uint64_t mulh(uint64_t a, uint64_t b) {" << std::endl;
asmCode << " return __umulh(a, b);" << std::endl;
asmCode << " }" << std::endl;
asmCode << " #define HAVE_MULH" << std::endl;
asmCode << " #endif" << std::endl;
asmCode << " #if EVAL_DEFINE(__MACHINEX64(1))" << std::endl;
asmCode << " static __inline int64_t smulh(int64_t a, int64_t b) {" << std::endl;
asmCode << " int64_t hi;" << std::endl;
asmCode << " _mul128(a, b, &hi);" << std::endl;
asmCode << " return hi;" << std::endl;
asmCode << " }" << std::endl;
asmCode << " #define HAVE_SMULH" << std::endl;
asmCode << " #endif" << std::endl;
asmCode << "#endif" << std::endl;
asmCode << "#ifndef HAVE_ROTR" << std::endl;
asmCode << " static inline uint64_t rotr(uint64_t a, int b) {" << std::endl;
asmCode << " return (a >> b) | (a << (64 - b));" << std::endl;
asmCode << " }" << std::endl;
asmCode << " #define HAVE_ROTR" << std::endl;
asmCode << "#endif" << std::endl;
asmCode << "#if !defined(HAVE_MULH) || !defined(HAVE_SMULH) || !defined(HAVE_ROTR)" << std::endl;
asmCode << " #error \"Required functions are not defined\"" << std::endl;
asmCode << "#endif" << std::endl;
asmCode << "void superScalar(uint64_t r[8]) {" << std::endl;
asmCode << "uint64_t r8 = r[0], r9 = r[1], r10 = r[2], r11 = r[3], r12 = r[4], r13 = r[5], r14 = r[6], r15 = r[7];" << std::endl;
for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i);
switch ((SuperscalarInstructionType)instr.opcode)
{
case SuperscalarInstructionType::ISUB_R:
asmCode << regR[instr.dst] << " -= " << regR[instr.src] << ";" << std::endl;
break;
case SuperscalarInstructionType::IXOR_R:
asmCode << regR[instr.dst] << " ^= " << regR[instr.src] << ";" << std::endl;
break;
case SuperscalarInstructionType::IADD_RS:
asmCode << regR[instr.dst] << " += " << regR[instr.src] << "*" << (1 << (instr.getModShift())) << ";" << std::endl;
break;
case SuperscalarInstructionType::IMUL_R:
asmCode << regR[instr.dst] << " *= " << regR[instr.src] << ";" << std::endl;
break;
case SuperscalarInstructionType::IROR_C:
asmCode << regR[instr.dst] << " = rotr(" << regR[instr.dst] << ", " << instr.getImm32() << ");" << std::endl;
break;
case SuperscalarInstructionType::IADD_C7:
case SuperscalarInstructionType::IADD_C8:
case SuperscalarInstructionType::IADD_C9:
asmCode << regR[instr.dst] << " += " << (int32_t)instr.getImm32() << ";" << std::endl;
break;
case SuperscalarInstructionType::IXOR_C7:
case SuperscalarInstructionType::IXOR_C8:
case SuperscalarInstructionType::IXOR_C9:
asmCode << regR[instr.dst] << " ^= " << (int32_t)instr.getImm32() << ";" << std::endl;
break;
case SuperscalarInstructionType::IMULH_R:
asmCode << regR[instr.dst] << " = mulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl;
break;
case SuperscalarInstructionType::ISMULH_R:
asmCode << regR[instr.dst] << " = smulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl;
break;
case SuperscalarInstructionType::IMUL_RCP:
asmCode << regR[instr.dst] << " *= " << (int64_t)randomx_reciprocal(instr.getImm32()) << ";" << std::endl;
break;
default:
UNREACHABLE;
}
}
asmCode << "r[0] = r8; r[1] = r9; r[2] = r10; r[3] = r11; r[4] = r12; r[5] = r13; r[6] = r14; r[7] = r15;" << std::endl;
asmCode << "}" << std::endl;
}
void AssemblyGeneratorX86::traceint(Instruction& instr) {
if (trace) {
asmCode << "\tpush " << regR[instr.dst] << std::endl;
}
}
void AssemblyGeneratorX86::traceflt(Instruction& instr) {
if (trace) {
asmCode << "\tpush 0" << std::endl;
}
}
void AssemblyGeneratorX86::tracenop(Instruction& instr) {
if (trace) {
asmCode << "\tpush 0" << std::endl;
}
}
void AssemblyGeneratorX86::generateCode(Instruction& instr, int i) {
asmCode << "\t; " << instr;
auto generator = engine[instr.opcode];
(this->*generator)(instr, i);
}
void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") {
asmCode << "\tlea " << reg << ", [" << regR32[instr.src] << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl;
asmCode << "\tand " << reg << ", " << ((instr.getModMem()) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
}
void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) {
asmCode << "\tlea eax, [" << regR32[instr.dst] << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl;
int mask;
if (instr.getModCond() < StoreL3Condition) {
mask = instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask;
}
else {
mask = ScratchpadL3Mask;
}
asmCode << "\tand eax" << ", " << (mask & (-maskAlign)) << std::endl;
}
int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) {
return (int32_t)instr.getImm32() & ScratchpadL3Mask;
}
void AssemblyGeneratorX86::h_IADD_RS(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if(instr.dst == RegisterNeedsDisplacement)
asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.getModShift())) << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl;
else
asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.getModShift())) << "]" << std::endl;
traceint(instr);
}
void AssemblyGeneratorX86::h_IADD_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr);
asmCode << "\tadd " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl;
}
else {
asmCode << "\tadd " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_ISUB_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
asmCode << "\tsub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
}
else {
asmCode << "\tsub " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_ISUB_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr);
asmCode << "\tsub " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl;
}
else {
asmCode << "\tsub " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_IMUL_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
asmCode << "\timul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
}
else {
asmCode << "\timul " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_IMUL_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr);
asmCode << "\timul " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl;
}
else {
asmCode << "\timul " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_IMULH_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\tmul " << regR[instr.src] << std::endl;
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl;
traceint(instr);
}
void AssemblyGeneratorX86::h_IMULH_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr, "ecx");
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\tmul qword ptr [" << regScratchpadAddr << "+rcx]" << std::endl;
}
else {
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\tmul qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl;
}
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl;
traceint(instr);
}
void AssemblyGeneratorX86::h_ISMULH_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\timul " << regR[instr.src] << std::endl;
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl;
traceint(instr);
}
void AssemblyGeneratorX86::h_ISMULH_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr, "ecx");
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\timul qword ptr [" << regScratchpadAddr << "+rcx]" << std::endl;
}
else {
asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
asmCode << "\timul qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl;
}
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl;
traceint(instr);
}
void AssemblyGeneratorX86::h_INEG_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
asmCode << "\tneg " << regR[instr.dst] << std::endl;
traceint(instr);
}
void AssemblyGeneratorX86::h_IXOR_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
asmCode << "\txor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
}
else {
asmCode << "\txor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_IXOR_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr);
asmCode << "\txor " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl;
}
else {
asmCode << "\txor " << regR[instr.dst] << ", qword ptr [" << regScratchpadAddr << "+" << genAddressImm(instr) << "]" << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_IROR_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
asmCode << "\tmov ecx, " << regR32[instr.src] << std::endl;
asmCode << "\tror " << regR[instr.dst] << ", cl" << std::endl;
}
else {
asmCode << "\tror " << regR[instr.dst] << ", " << (instr.getImm32() & 63) << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_IROL_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
asmCode << "\tmov ecx, " << regR32[instr.src] << std::endl;
asmCode << "\trol " << regR[instr.dst] << ", cl" << std::endl;
}
else {
asmCode << "\trol " << regR[instr.dst] << ", " << (instr.getImm32() & 63) << std::endl;
}
traceint(instr);
}
void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) {
const uint32_t divisor = instr.getImm32();
if (!isZeroOrPowerOf2(divisor)) {
registerUsage[instr.dst] = i;
asmCode << "\tmov rax, " << randomx_reciprocal(divisor) << std::endl;
asmCode << "\timul " << regR[instr.dst] << ", rax" << std::endl;
traceint(instr);
}
else {
tracenop(instr);
}
}
void AssemblyGeneratorX86::h_ISWAP_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
registerUsage[instr.dst] = i;
registerUsage[instr.src] = i;
asmCode << "\txchg " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
traceint(instr);
}
else {
tracenop(instr);
}
}
void AssemblyGeneratorX86::h_FSWAP_R(Instruction& instr, int i) {
asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_FADD_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt;
asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_FADD_M(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
genAddressReg(instr);
asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl;
asmCode << "\taddpd " << regF[instr.dst] << ", " << tempRegx << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_FSUB_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt;
asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_FSUB_M(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
genAddressReg(instr);
asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl;
asmCode << "\tsubpd " << regF[instr.dst] << ", " << tempRegx << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_FSCAL_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
asmCode << "\txorps " << regF[instr.dst] << ", " << scaleMaskReg << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_FMUL_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt;
asmCode << "\tmulpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_FDIV_M(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
genAddressReg(instr);
asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl;
asmCode << "\tandps " << tempRegx << ", " << mantissaMaskReg << std::endl;
asmCode << "\torps " << tempRegx << ", " << exponentMaskReg << std::endl;
asmCode << "\tdivpd " << regE[instr.dst] << ", " << tempRegx << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_FSQRT_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
asmCode << "\tsqrtpd " << regE[instr.dst] << ", " << regE[instr.dst] << std::endl;
traceflt(instr);
}
void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) {
asmCode << "\tmov rax, " << regR[instr.src] << std::endl;
int rotate = (13 - (instr.getImm32() & 63)) & 63;
if (rotate != 0)
asmCode << "\trol rax, " << rotate << std::endl;
asmCode << "\tand eax, 24576" << std::endl;
asmCode << "\tor eax, 40896" << std::endl;
asmCode << "\tpush rax" << std::endl;
asmCode << "\tldmxcsr dword ptr [rsp]" << std::endl;
asmCode << "\tpop rax" << std::endl;
tracenop(instr);
}
void AssemblyGeneratorX86::h_CBRANCH(Instruction& instr, int i) {
int reg = instr.dst;
int target = registerUsage[reg] + 1;
int shift = instr.getModCond() + ConditionOffset;
int32_t imm = instr.getImm32() | (1L << shift);
if (ConditionOffset > 0 || shift > 0)
imm &= ~(1L << (shift - 1));
asmCode << "\tadd " << regR[reg] << ", " << imm << std::endl;
asmCode << "\ttest " << regR[reg] << ", " << (ConditionMask << shift) << std::endl;
asmCode << "\tjz randomx_isn_" << target << std::endl;
//mark all registers as used
for (unsigned j = 0; j < RegistersCount; ++j) {
registerUsage[j] = i;
}
}
void AssemblyGeneratorX86::h_ISTORE(Instruction& instr, int i) {
genAddressRegDst(instr);
asmCode << "\tmov qword ptr [" << regScratchpadAddr << "+rax], " << regR[instr.src] << std::endl;
tracenop(instr);
}
void AssemblyGeneratorX86::h_NOP(Instruction& instr, int i) {
asmCode << "\tnop" << std::endl;
tracenop(instr);
}
#include "instruction_weights.hpp"
#define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x))
InstructionGenerator AssemblyGeneratorX86::engine[256] = {
INST_HANDLE(IADD_RS)
INST_HANDLE(IADD_M)
INST_HANDLE(ISUB_R)
INST_HANDLE(ISUB_M)
INST_HANDLE(IMUL_R)
INST_HANDLE(IMUL_M)
INST_HANDLE(IMULH_R)
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
INST_HANDLE(IMUL_RCP)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
INST_HANDLE(ISWAP_R)
INST_HANDLE(FSWAP_R)
INST_HANDLE(FADD_R)
INST_HANDLE(FADD_M)
INST_HANDLE(FSUB_R)
INST_HANDLE(FSUB_M)
INST_HANDLE(FSCAL_R)
INST_HANDLE(FMUL_R)
INST_HANDLE(FDIV_M)
INST_HANDLE(FSQRT_R)
INST_HANDLE(CBRANCH)
INST_HANDLE(CFROUND)
INST_HANDLE(ISTORE)
INST_HANDLE(NOP)
};
}

View file

@ -0,0 +1,94 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "common.hpp"
#include <sstream>
namespace randomx {
class Program;
class SuperscalarProgram;
class AssemblyGeneratorX86;
class Instruction;
typedef void(AssemblyGeneratorX86::*InstructionGenerator)(Instruction&, int);
class AssemblyGeneratorX86 {
public:
void generateProgram(Program& prog);
void generateAsm(SuperscalarProgram& prog);
void generateC(SuperscalarProgram& prog);
void printCode(std::ostream& os) {
os << asmCode.rdbuf();
}
private:
void genAddressReg(Instruction&, const char*);
void genAddressRegDst(Instruction&, int);
int32_t genAddressImm(Instruction&);
void generateCode(Instruction&, int);
void traceint(Instruction&);
void traceflt(Instruction&);
void tracenop(Instruction&);
void h_IADD_RS(Instruction&, int);
void h_IADD_M(Instruction&, int);
void h_ISUB_R(Instruction&, int);
void h_ISUB_M(Instruction&, int);
void h_IMUL_R(Instruction&, int);
void h_IMUL_M(Instruction&, int);
void h_IMULH_R(Instruction&, int);
void h_IMULH_M(Instruction&, int);
void h_ISMULH_R(Instruction&, int);
void h_ISMULH_M(Instruction&, int);
void h_IMUL_RCP(Instruction&, int);
void h_INEG_R(Instruction&, int);
void h_IXOR_R(Instruction&, int);
void h_IXOR_M(Instruction&, int);
void h_IROR_R(Instruction&, int);
void h_IROL_R(Instruction&, int);
void h_ISWAP_R(Instruction&, int);
void h_FSWAP_R(Instruction&, int);
void h_FADD_R(Instruction&, int);
void h_FADD_M(Instruction&, int);
void h_FSUB_R(Instruction&, int);
void h_FSUB_M(Instruction&, int);
void h_FSCAL_R(Instruction&, int);
void h_FMUL_R(Instruction&, int);
void h_FDIV_M(Instruction&, int);
void h_FSQRT_R(Instruction&, int);
void h_CBRANCH(Instruction&, int);
void h_CFROUND(Instruction&, int);
void h_ISTORE(Instruction&, int);
void h_NOP(Instruction&, int);
static InstructionGenerator engine[256];
std::stringstream asmCode;
int registerUsage[RegistersCount];
};
}

View file

@ -0,0 +1,76 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef PORTABLE_BLAKE2_IMPL_H
#define PORTABLE_BLAKE2_IMPL_H
#include <stdint.h>
#include "endian.h"
static FORCE_INLINE uint64_t load48(const void *src) {
const uint8_t *p = (const uint8_t *)src;
uint64_t w = *p++;
w |= (uint64_t)(*p++) << 8;
w |= (uint64_t)(*p++) << 16;
w |= (uint64_t)(*p++) << 24;
w |= (uint64_t)(*p++) << 32;
w |= (uint64_t)(*p++) << 40;
return w;
}
static FORCE_INLINE void store48(void *dst, uint64_t w) {
uint8_t *p = (uint8_t *)dst;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
}
static FORCE_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) {
return (w >> c) | (w << (32 - c));
}
static FORCE_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {
return (w >> c) | (w << (64 - c));
}
#endif

116
crypto/randomx/blake2/blake2.h Executable file
View file

@ -0,0 +1,116 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef PORTABLE_BLAKE2_H
#define PORTABLE_BLAKE2_H
#include <stdint.h>
#include <limits.h>
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2b_constant {
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};
#pragma pack(push, 1)
typedef struct __blake2b_param {
uint8_t digest_length; /* 1 */
uint8_t key_length; /* 2 */
uint8_t fanout; /* 3 */
uint8_t depth; /* 4 */
uint32_t leaf_length; /* 8 */
uint64_t node_offset; /* 16 */
uint8_t node_depth; /* 17 */
uint8_t inner_length; /* 18 */
uint8_t reserved[14]; /* 32 */
uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
} blake2b_param;
#pragma pack(pop)
typedef struct __blake2b_state {
uint64_t h[8];
uint64_t t[2];
uint64_t f[2];
uint8_t buf[BLAKE2B_BLOCKBYTES];
unsigned buflen;
unsigned outlen;
uint8_t last_node;
} blake2b_state;
/* Ensure param structs have not been wrongly padded */
/* Poor man's static_assert */
enum {
blake2_size_check_0 = 1 / !!(CHAR_BIT == 8),
blake2_size_check_2 =
1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT)
};
//randomx namespace
#define blake2b_init randomx_blake2b_init
#define blake2b_init_key randomx_blake2b_init_key
#define blake2b_init_param randomx_blake2b_init_param
#define blake2b_update randomx_blake2b_update
#define blake2b_final randomx_blake2b_final
#define blake2b randomx_blake2b
#define blake2b_long randomx_blake2b_long
/* Streaming API */
int blake2b_init(blake2b_state *S, size_t outlen);
int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
size_t keylen);
int blake2b_init_param(blake2b_state *S, const blake2b_param *P);
int blake2b_update(blake2b_state *S, const void *in, size_t inlen);
int blake2b_final(blake2b_state *S, void *out, size_t outlen);
/* Simple API */
int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
const void *key, size_t keylen);
/* Argon2 Team - Begin Code */
int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
/* Argon2 Team - End Code */
#if defined(__cplusplus)
}
#endif
#endif

409
crypto/randomx/blake2/blake2b.c Executable file
View file

@ -0,0 +1,409 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake2.h"
#include "blake2-impl.h"
static const uint64_t blake2b_IV[8] = {
UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) };
static const unsigned int blake2b_sigma[12][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
};
static FORCE_INLINE void blake2b_set_lastnode(blake2b_state *S) {
S->f[1] = (uint64_t)-1;
}
static FORCE_INLINE void blake2b_set_lastblock(blake2b_state *S) {
if (S->last_node) {
blake2b_set_lastnode(S);
}
S->f[0] = (uint64_t)-1;
}
static FORCE_INLINE void blake2b_increment_counter(blake2b_state *S,
uint64_t inc) {
S->t[0] += inc;
S->t[1] += (S->t[0] < inc);
}
static FORCE_INLINE void blake2b_invalidate_state(blake2b_state *S) {
//clear_internal_memory(S, sizeof(*S)); /* wipe */
blake2b_set_lastblock(S); /* invalidate for further use */
}
static FORCE_INLINE void blake2b_init0(blake2b_state *S) {
memset(S, 0, sizeof(*S));
memcpy(S->h, blake2b_IV, sizeof(S->h));
}
int blake2b_init_param(blake2b_state *S, const blake2b_param *P) {
const unsigned char *p = (const unsigned char *)P;
unsigned int i;
if (NULL == P || NULL == S) {
return -1;
}
blake2b_init0(S);
/* IV XOR Parameter Block */
for (i = 0; i < 8; ++i) {
S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
}
S->outlen = P->digest_length;
return 0;
}
/* Sequential blake2b initialization */
int blake2b_init(blake2b_state *S, size_t outlen) {
blake2b_param P;
if (S == NULL) {
return -1;
}
if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) {
blake2b_invalidate_state(S);
return -1;
}
/* Setup Parameter Block for unkeyed BLAKE2 */
P.digest_length = (uint8_t)outlen;
P.key_length = 0;
P.fanout = 1;
P.depth = 1;
P.leaf_length = 0;
P.node_offset = 0;
P.node_depth = 0;
P.inner_length = 0;
memset(P.reserved, 0, sizeof(P.reserved));
memset(P.salt, 0, sizeof(P.salt));
memset(P.personal, 0, sizeof(P.personal));
return blake2b_init_param(S, &P);
}
int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen) {
blake2b_param P;
if (S == NULL) {
return -1;
}
if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) {
blake2b_invalidate_state(S);
return -1;
}
if ((key == 0) || (keylen == 0) || (keylen > BLAKE2B_KEYBYTES)) {
blake2b_invalidate_state(S);
return -1;
}
/* Setup Parameter Block for keyed BLAKE2 */
P.digest_length = (uint8_t)outlen;
P.key_length = (uint8_t)keylen;
P.fanout = 1;
P.depth = 1;
P.leaf_length = 0;
P.node_offset = 0;
P.node_depth = 0;
P.inner_length = 0;
memset(P.reserved, 0, sizeof(P.reserved));
memset(P.salt, 0, sizeof(P.salt));
memset(P.personal, 0, sizeof(P.personal));
if (blake2b_init_param(S, &P) < 0) {
blake2b_invalidate_state(S);
return -1;
}
{
uint8_t block[BLAKE2B_BLOCKBYTES];
memset(block, 0, BLAKE2B_BLOCKBYTES);
memcpy(block, key, keylen);
blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
/* Burn the key from stack */
//clear_internal_memory(block, BLAKE2B_BLOCKBYTES);
}
return 0;
}
static void blake2b_compress(blake2b_state *S, const uint8_t *block) {
uint64_t m[16];
uint64_t v[16];
unsigned int i, r;
for (i = 0; i < 16; ++i) {
m[i] = load64(block + i * sizeof(m[i]));
}
for (i = 0; i < 8; ++i) {
v[i] = S->h[i];
}
v[8] = blake2b_IV[0];
v[9] = blake2b_IV[1];
v[10] = blake2b_IV[2];
v[11] = blake2b_IV[3];
v[12] = blake2b_IV[4] ^ S->t[0];
v[13] = blake2b_IV[5] ^ S->t[1];
v[14] = blake2b_IV[6] ^ S->f[0];
v[15] = blake2b_IV[7] ^ S->f[1];
#define G(r, i, a, b, c, d) \
do { \
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while ((void)0, 0)
#define ROUND(r) \
do { \
G(r, 0, v[0], v[4], v[8], v[12]); \
G(r, 1, v[1], v[5], v[9], v[13]); \
G(r, 2, v[2], v[6], v[10], v[14]); \
G(r, 3, v[3], v[7], v[11], v[15]); \
G(r, 4, v[0], v[5], v[10], v[15]); \
G(r, 5, v[1], v[6], v[11], v[12]); \
G(r, 6, v[2], v[7], v[8], v[13]); \
G(r, 7, v[3], v[4], v[9], v[14]); \
} while ((void)0, 0)
for (r = 0; r < 12; ++r) {
ROUND(r);
}
for (i = 0; i < 8; ++i) {
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
}
#undef G
#undef ROUND
}
int blake2b_update(blake2b_state *S, const void *in, size_t inlen) {
const uint8_t *pin = (const uint8_t *)in;
if (inlen == 0) {
return 0;
}
/* Sanity check */
if (S == NULL || in == NULL) {
return -1;
}
/* Is this a reused state? */
if (S->f[0] != 0) {
return -1;
}
if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) {
/* Complete current block */
size_t left = S->buflen;
size_t fill = BLAKE2B_BLOCKBYTES - left;
memcpy(&S->buf[left], pin, fill);
blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
blake2b_compress(S, S->buf);
S->buflen = 0;
inlen -= fill;
pin += fill;
/* Avoid buffer copies when possible */
while (inlen > BLAKE2B_BLOCKBYTES) {
blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
blake2b_compress(S, pin);
inlen -= BLAKE2B_BLOCKBYTES;
pin += BLAKE2B_BLOCKBYTES;
}
}
memcpy(&S->buf[S->buflen], pin, inlen);
S->buflen += (unsigned int)inlen;
return 0;
}
int blake2b_final(blake2b_state *S, void *out, size_t outlen) {
uint8_t buffer[BLAKE2B_OUTBYTES] = { 0 };
unsigned int i;
/* Sanity checks */
if (S == NULL || out == NULL || outlen < S->outlen) {
return -1;
}
/* Is this a reused state? */
if (S->f[0] != 0) {
return -1;
}
blake2b_increment_counter(S, S->buflen);
blake2b_set_lastblock(S);
memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
blake2b_compress(S, S->buf);
for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
}
memcpy(out, buffer, S->outlen);
//clear_internal_memory(buffer, sizeof(buffer));
//clear_internal_memory(S->buf, sizeof(S->buf));
//clear_internal_memory(S->h, sizeof(S->h));
return 0;
}
int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
const void *key, size_t keylen) {
blake2b_state S;
int ret = -1;
/* Verify parameters */
if (NULL == in && inlen > 0) {
goto fail;
}
if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) {
goto fail;
}
if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
goto fail;
}
if (keylen > 0) {
if (blake2b_init_key(&S, outlen, key, keylen) < 0) {
goto fail;
}
}
else {
if (blake2b_init(&S, outlen) < 0) {
goto fail;
}
}
if (blake2b_update(&S, in, inlen) < 0) {
goto fail;
}
ret = blake2b_final(&S, out, outlen);
fail:
//clear_internal_memory(&S, sizeof(S));
return ret;
}
/* Argon2 Team - Begin Code */
int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
uint8_t *out = (uint8_t *)pout;
blake2b_state blake_state;
uint8_t outlen_bytes[sizeof(uint32_t)] = { 0 };
int ret = -1;
if (outlen > UINT32_MAX) {
goto fail;
}
/* Ensure little-endian byte order! */
store32(outlen_bytes, (uint32_t)outlen);
#define TRY(statement) \
do { \
ret = statement; \
if (ret < 0) { \
goto fail; \
} \
} while ((void)0, 0)
if (outlen <= BLAKE2B_OUTBYTES) {
TRY(blake2b_init(&blake_state, outlen));
TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
TRY(blake2b_update(&blake_state, in, inlen));
TRY(blake2b_final(&blake_state, out, outlen));
}
else {
uint32_t toproduce;
uint8_t out_buffer[BLAKE2B_OUTBYTES];
uint8_t in_buffer[BLAKE2B_OUTBYTES];
TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
TRY(blake2b_update(&blake_state, in, inlen));
TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
while (toproduce > BLAKE2B_OUTBYTES) {
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
BLAKE2B_OUTBYTES, NULL, 0));
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
toproduce -= BLAKE2B_OUTBYTES / 2;
}
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
0));
memcpy(out, out_buffer, toproduce);
}
fail:
//clear_internal_memory(&blake_state, sizeof(blake_state));
return ret;
#undef TRY
}
/* Argon2 Team - End Code */

View file

@ -0,0 +1,189 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H
#include "blake2-impl.h"
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr32(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
\
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr24(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D1 = rotr32(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
\
B1 = _mm256_xor_si256(B1, C1); \
B1 = rotr24(B1); \
} while((void)0, 0);
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr16(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr63(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D1 = rotr16(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
B1 = _mm256_xor_si256(B1, C1); \
B1 = rotr63(B1); \
} while((void)0, 0);
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while((void)0, 0);
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while((void)0, 0);
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);
#endif /* BLAKE_ROUND_MKA_OPT_H */

View file

@ -0,0 +1,73 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef BLAKE_ROUND_MKA_H
#define BLAKE_ROUND_MKA_H
#include "blake2.h"
#include "blake2-impl.h"
/* designed by the Lyra PHC team */
static FORCE_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
const uint64_t m = UINT64_C(0xFFFFFFFF);
const uint64_t xy = (x & m) * (y & m);
return x + y + 2 * xy;
}
#define G(a, b, c, d) \
do { \
a = fBlaMka(a, b); \
d = rotr64(d ^ a, 32); \
c = fBlaMka(c, d); \
b = rotr64(b ^ c, 24); \
a = fBlaMka(a, b); \
d = rotr64(d ^ a, 16); \
c = fBlaMka(c, d); \
b = rotr64(b ^ c, 63); \
} while ((void)0, 0)
#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
v12, v13, v14, v15) \
do { \
G(v0, v4, v8, v12); \
G(v1, v5, v9, v13); \
G(v2, v6, v10, v14); \
G(v3, v7, v11, v15); \
G(v0, v5, v10, v15); \
G(v1, v6, v11, v12); \
G(v2, v7, v8, v13); \
G(v3, v4, v9, v14); \
} while ((void)0, 0)
#endif

View file

@ -0,0 +1,162 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H
#include "blake2-impl.h"
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif
#ifdef _mm_roti_epi64 //clang defines it using the XOP instruction set
#undef _mm_roti_epi64
#endif
#define r16 \
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
static FORCE_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
const __m128i z = _mm_mul_epu32(x, y);
return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
}
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -32); \
D1 = _mm_roti_epi64(D1, -32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -24); \
B1 = _mm_roti_epi64(B1, -24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -16); \
D1 = _mm_roti_epi64(D1, -16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -63); \
B1 = _mm_roti_epi64(B1, -63); \
} while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
__m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D1, D0, 8); \
t1 = _mm_alignr_epi8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
__m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D0, D1, 8); \
t1 = _mm_alignr_epi8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#endif /* BLAKE_ROUND_MKA_OPT_H */

107
crypto/randomx/blake2/endian.h Executable file
View file

@ -0,0 +1,107 @@
#pragma once
#include <stdint.h>
#include <string.h>
#if defined(_MSC_VER)
#define FORCE_INLINE __inline
#elif defined(__GNUC__) || defined(__clang__)
#define FORCE_INLINE __inline__
#else
#define FORCE_INLINE
#endif
/* Argon2 Team - Begin Code */
/*
Not an exhaustive list, but should cover the majority of modern platforms
Additionally, the code will always be correct---this is only a performance
tweak.
*/
#if (defined(__BYTE_ORDER__) && \
(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \
defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \
defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \
defined(_M_ARM)
#define NATIVE_LITTLE_ENDIAN
#endif
/* Argon2 Team - End Code */
static FORCE_INLINE uint32_t load32(const void *src) {
#if defined(NATIVE_LITTLE_ENDIAN)
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = (const uint8_t *)src;
uint32_t w = *p++;
w |= (uint32_t)(*p++) << 8;
w |= (uint32_t)(*p++) << 16;
w |= (uint32_t)(*p++) << 24;
return w;
#endif
}
static FORCE_INLINE uint64_t load64_native(const void *src) {
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
}
static FORCE_INLINE uint64_t load64(const void *src) {
#if defined(NATIVE_LITTLE_ENDIAN)
return load64_native(src);
#else
const uint8_t *p = (const uint8_t *)src;
uint64_t w = *p++;
w |= (uint64_t)(*p++) << 8;
w |= (uint64_t)(*p++) << 16;
w |= (uint64_t)(*p++) << 24;
w |= (uint64_t)(*p++) << 32;
w |= (uint64_t)(*p++) << 40;
w |= (uint64_t)(*p++) << 48;
w |= (uint64_t)(*p++) << 56;
return w;
#endif
}
static FORCE_INLINE void store32(void *dst, uint32_t w) {
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = (uint8_t *)dst;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
#endif
}
static FORCE_INLINE void store64_native(void *dst, uint64_t w) {
memcpy(dst, &w, sizeof w);
}
static FORCE_INLINE void store64(void *dst, uint64_t w) {
#if defined(NATIVE_LITTLE_ENDIAN)
store64_native(dst, w);
#else
uint8_t *p = (uint8_t *)dst;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
#endif
}

View file

@ -0,0 +1,62 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stddef.h>
#include "blake2/blake2.h"
#include "blake2/endian.h"
#include "blake2_generator.hpp"
namespace randomx {
constexpr int maxSeedSize = 60;
Blake2Generator::Blake2Generator(const void* seed, size_t seedSize, int nonce) : dataIndex(sizeof(data)) {
memset(data, 0, sizeof(data));
memcpy(data, seed, seedSize > maxSeedSize ? maxSeedSize : seedSize);
store32(&data[maxSeedSize], nonce);
}
uint8_t Blake2Generator::getByte() {
checkData(1);
return data[dataIndex++];
}
uint32_t Blake2Generator::getUInt32() {
checkData(4);
auto ret = load32(&data[dataIndex]);
dataIndex += 4;
return ret;
}
void Blake2Generator::checkData(const size_t bytesNeeded) {
if (dataIndex + bytesNeeded > sizeof(data)) {
blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0);
dataIndex = 0;
}
}
}

View file

@ -0,0 +1,46 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
namespace randomx {
class Blake2Generator {
public:
Blake2Generator(const void* seed, size_t seedSize, int nonce = 0);
uint8_t getByte();
uint32_t getUInt32();
private:
void checkData(const size_t);
uint8_t data[64];
size_t dataIndex;
};
}

View file

@ -0,0 +1,482 @@
/*
Copyright (c) 2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bytecode_machine.hpp"
#include "reciprocal.h"
namespace randomx {
const int_reg_t BytecodeMachine::zero = 0;
#define INSTR_CASE(x) case InstructionType::x: \
exe_ ## x(ibc, pc, scratchpad, config); \
break;
void BytecodeMachine::executeInstruction(RANDOMX_EXE_ARGS) {
switch (ibc.type)
{
INSTR_CASE(IADD_RS)
INSTR_CASE(IADD_M)
INSTR_CASE(ISUB_R)
INSTR_CASE(ISUB_M)
INSTR_CASE(IMUL_R)
INSTR_CASE(IMUL_M)
INSTR_CASE(IMULH_R)
INSTR_CASE(IMULH_M)
INSTR_CASE(ISMULH_R)
INSTR_CASE(ISMULH_M)
INSTR_CASE(INEG_R)
INSTR_CASE(IXOR_R)
INSTR_CASE(IXOR_M)
INSTR_CASE(IROR_R)
INSTR_CASE(IROL_R)
INSTR_CASE(ISWAP_R)
INSTR_CASE(FSWAP_R)
INSTR_CASE(FADD_R)
INSTR_CASE(FADD_M)
INSTR_CASE(FSUB_R)
INSTR_CASE(FSUB_M)
INSTR_CASE(FSCAL_R)
INSTR_CASE(FMUL_R)
INSTR_CASE(FDIV_M)
INSTR_CASE(FSQRT_R)
INSTR_CASE(CBRANCH)
INSTR_CASE(CFROUND)
INSTR_CASE(ISTORE)
case InstructionType::NOP:
break;
case InstructionType::IMUL_RCP: //executed as IMUL_R
default:
UNREACHABLE;
}
}
void BytecodeMachine::compileInstruction(RANDOMX_GEN_ARGS) {
int opcode = instr.opcode;
if (opcode < ceil_IADD_RS) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IADD_RS;
ibc.idst = &nreg->r[dst];
if (dst != RegisterNeedsDisplacement) {
ibc.isrc = &nreg->r[src];
ibc.shift = instr.getModShift();
ibc.imm = 0;
}
else {
ibc.isrc = &nreg->r[src];
ibc.shift = instr.getModShift();
ibc.imm = signExtend2sCompl(instr.getImm32());
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IADD_M) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IADD_M;
ibc.idst = &nreg->r[dst];
ibc.imm = signExtend2sCompl(instr.getImm32());
if (src != dst) {
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
}
else {
ibc.isrc = &zero;
ibc.memMask = ScratchpadL3Mask;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_ISUB_R) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::ISUB_R;
ibc.idst = &nreg->r[dst];
if (src != dst) {
ibc.isrc = &nreg->r[src];
}
else {
ibc.imm = signExtend2sCompl(instr.getImm32());
ibc.isrc = &ibc.imm;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_ISUB_M) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::ISUB_M;
ibc.idst = &nreg->r[dst];
ibc.imm = signExtend2sCompl(instr.getImm32());
if (src != dst) {
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
}
else {
ibc.isrc = &zero;
ibc.memMask = ScratchpadL3Mask;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IMUL_R) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IMUL_R;
ibc.idst = &nreg->r[dst];
if (src != dst) {
ibc.isrc = &nreg->r[src];
}
else {
ibc.imm = signExtend2sCompl(instr.getImm32());
ibc.isrc = &ibc.imm;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IMUL_M) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IMUL_M;
ibc.idst = &nreg->r[dst];
ibc.imm = signExtend2sCompl(instr.getImm32());
if (src != dst) {
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
}
else {
ibc.isrc = &zero;
ibc.memMask = ScratchpadL3Mask;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IMULH_R) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IMULH_R;
ibc.idst = &nreg->r[dst];
ibc.isrc = &nreg->r[src];
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IMULH_M) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IMULH_M;
ibc.idst = &nreg->r[dst];
ibc.imm = signExtend2sCompl(instr.getImm32());
if (src != dst) {
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
}
else {
ibc.isrc = &zero;
ibc.memMask = ScratchpadL3Mask;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_ISMULH_R) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::ISMULH_R;
ibc.idst = &nreg->r[dst];
ibc.isrc = &nreg->r[src];
registerUsage[dst] = i;
return;
}
if (opcode < ceil_ISMULH_M) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::ISMULH_M;
ibc.idst = &nreg->r[dst];
ibc.imm = signExtend2sCompl(instr.getImm32());
if (src != dst) {
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
}
else {
ibc.isrc = &zero;
ibc.memMask = ScratchpadL3Mask;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IMUL_RCP) {
const uint32_t divisor = instr.getImm32();
if (!isZeroOrPowerOf2(divisor)) {
auto dst = instr.dst % RegistersCount;
ibc.type = InstructionType::IMUL_R;
ibc.idst = &nreg->r[dst];
ibc.imm = randomx_reciprocal(divisor);
ibc.isrc = &ibc.imm;
registerUsage[dst] = i;
}
else {
ibc.type = InstructionType::NOP;
}
return;
}
if (opcode < ceil_INEG_R) {
auto dst = instr.dst % RegistersCount;
ibc.type = InstructionType::INEG_R;
ibc.idst = &nreg->r[dst];
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IXOR_R) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IXOR_R;
ibc.idst = &nreg->r[dst];
if (src != dst) {
ibc.isrc = &nreg->r[src];
}
else {
ibc.imm = signExtend2sCompl(instr.getImm32());
ibc.isrc = &ibc.imm;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IXOR_M) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IXOR_M;
ibc.idst = &nreg->r[dst];
ibc.imm = signExtend2sCompl(instr.getImm32());
if (src != dst) {
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
}
else {
ibc.isrc = &zero;
ibc.memMask = ScratchpadL3Mask;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IROR_R) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IROR_R;
ibc.idst = &nreg->r[dst];
if (src != dst) {
ibc.isrc = &nreg->r[src];
}
else {
ibc.imm = instr.getImm32();
ibc.isrc = &ibc.imm;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_IROL_R) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IROL_R;
ibc.idst = &nreg->r[dst];
if (src != dst) {
ibc.isrc = &nreg->r[src];
}
else {
ibc.imm = instr.getImm32();
ibc.isrc = &ibc.imm;
}
registerUsage[dst] = i;
return;
}
if (opcode < ceil_ISWAP_R) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
if (src != dst) {
ibc.idst = &nreg->r[dst];
ibc.isrc = &nreg->r[src];
ibc.type = InstructionType::ISWAP_R;
registerUsage[dst] = i;
registerUsage[src] = i;
}
else {
ibc.type = InstructionType::NOP;
}
return;
}
if (opcode < ceil_FSWAP_R) {
auto dst = instr.dst % RegistersCount;
ibc.type = InstructionType::FSWAP_R;
if (dst < RegisterCountFlt)
ibc.fdst = &nreg->f[dst];
else
ibc.fdst = &nreg->e[dst - RegisterCountFlt];
return;
}
if (opcode < ceil_FADD_R) {
auto dst = instr.dst % RegisterCountFlt;
auto src = instr.src % RegisterCountFlt;
ibc.type = InstructionType::FADD_R;
ibc.fdst = &nreg->f[dst];
ibc.fsrc = &nreg->a[src];
return;
}
if (opcode < ceil_FADD_M) {
auto dst = instr.dst % RegisterCountFlt;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::FADD_M;
ibc.fdst = &nreg->f[dst];
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
ibc.imm = signExtend2sCompl(instr.getImm32());
return;
}
if (opcode < ceil_FSUB_R) {
auto dst = instr.dst % RegisterCountFlt;
auto src = instr.src % RegisterCountFlt;
ibc.type = InstructionType::FSUB_R;
ibc.fdst = &nreg->f[dst];
ibc.fsrc = &nreg->a[src];
return;
}
if (opcode < ceil_FSUB_M) {
auto dst = instr.dst % RegisterCountFlt;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::FSUB_M;
ibc.fdst = &nreg->f[dst];
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
ibc.imm = signExtend2sCompl(instr.getImm32());
return;
}
if (opcode < ceil_FSCAL_R) {
auto dst = instr.dst % RegisterCountFlt;
ibc.fdst = &nreg->f[dst];
ibc.type = InstructionType::FSCAL_R;
return;
}
if (opcode < ceil_FMUL_R) {
auto dst = instr.dst % RegisterCountFlt;
auto src = instr.src % RegisterCountFlt;
ibc.type = InstructionType::FMUL_R;
ibc.fdst = &nreg->e[dst];
ibc.fsrc = &nreg->a[src];
return;
}
if (opcode < ceil_FDIV_M) {
auto dst = instr.dst % RegisterCountFlt;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::FDIV_M;
ibc.fdst = &nreg->e[dst];
ibc.isrc = &nreg->r[src];
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
ibc.imm = signExtend2sCompl(instr.getImm32());
return;
}
if (opcode < ceil_FSQRT_R) {
auto dst = instr.dst % RegisterCountFlt;
ibc.type = InstructionType::FSQRT_R;
ibc.fdst = &nreg->e[dst];
return;
}
if (opcode < ceil_CBRANCH) {
ibc.type = InstructionType::CBRANCH;
//jump condition
int creg = instr.dst % RegistersCount;
ibc.idst = &nreg->r[creg];
ibc.target = registerUsage[creg];
int shift = instr.getModCond() + ConditionOffset;
ibc.imm = signExtend2sCompl(instr.getImm32()) | (1ULL << shift);
if (ConditionOffset > 0 || shift > 0) //clear the bit below the condition mask - this limits the number of successive jumps to 2
ibc.imm &= ~(1ULL << (shift - 1));
ibc.memMask = ConditionMask << shift;
//mark all registers as used
for (unsigned j = 0; j < RegistersCount; ++j) {
registerUsage[j] = i;
}
return;
}
if (opcode < ceil_CFROUND) {
auto src = instr.src % RegistersCount;
ibc.isrc = &nreg->r[src];
ibc.type = InstructionType::CFROUND;
ibc.imm = instr.getImm32() & 63;
return;
}
if (opcode < ceil_ISTORE) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::ISTORE;
ibc.idst = &nreg->r[dst];
ibc.isrc = &nreg->r[src];
ibc.imm = signExtend2sCompl(instr.getImm32());
if (instr.getModCond() < StoreL3Condition)
ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
else
ibc.memMask = ScratchpadL3Mask;
return;
}
if (opcode < ceil_NOP) {
ibc.type = InstructionType::NOP;
return;
}
UNREACHABLE;
}
}

View file

@ -0,0 +1,322 @@
/*
Copyright (c) 2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "common.hpp"
#include "intrin_portable.h"
#include "instruction.hpp"
#include "program.hpp"
namespace randomx {
//register file in machine byte order
struct NativeRegisterFile {
int_reg_t r[RegistersCount] = { 0 };
rx_vec_f128 f[RegisterCountFlt];
rx_vec_f128 e[RegisterCountFlt];
rx_vec_f128 a[RegisterCountFlt];
};
struct InstructionByteCode {
union {
int_reg_t* idst;
rx_vec_f128* fdst;
};
union {
const int_reg_t* isrc;
const rx_vec_f128* fsrc;
};
union {
uint64_t imm;
int64_t simm;
};
InstructionType type;
union {
int16_t target;
uint16_t shift;
};
uint32_t memMask;
};
#define OPCODE_CEIL_DECLARE(curr, prev) constexpr int ceil_ ## curr = ceil_ ## prev + RANDOMX_FREQ_ ## curr;
constexpr int ceil_NULL = 0;
OPCODE_CEIL_DECLARE(IADD_RS, NULL);
OPCODE_CEIL_DECLARE(IADD_M, IADD_RS);
OPCODE_CEIL_DECLARE(ISUB_R, IADD_M);
OPCODE_CEIL_DECLARE(ISUB_M, ISUB_R);
OPCODE_CEIL_DECLARE(IMUL_R, ISUB_M);
OPCODE_CEIL_DECLARE(IMUL_M, IMUL_R);
OPCODE_CEIL_DECLARE(IMULH_R, IMUL_M);
OPCODE_CEIL_DECLARE(IMULH_M, IMULH_R);
OPCODE_CEIL_DECLARE(ISMULH_R, IMULH_M);
OPCODE_CEIL_DECLARE(ISMULH_M, ISMULH_R);
OPCODE_CEIL_DECLARE(IMUL_RCP, ISMULH_M);
OPCODE_CEIL_DECLARE(INEG_R, IMUL_RCP);
OPCODE_CEIL_DECLARE(IXOR_R, INEG_R);
OPCODE_CEIL_DECLARE(IXOR_M, IXOR_R);
OPCODE_CEIL_DECLARE(IROR_R, IXOR_M);
OPCODE_CEIL_DECLARE(IROL_R, IROR_R);
OPCODE_CEIL_DECLARE(ISWAP_R, IROL_R);
OPCODE_CEIL_DECLARE(FSWAP_R, ISWAP_R);
OPCODE_CEIL_DECLARE(FADD_R, FSWAP_R);
OPCODE_CEIL_DECLARE(FADD_M, FADD_R);
OPCODE_CEIL_DECLARE(FSUB_R, FADD_M);
OPCODE_CEIL_DECLARE(FSUB_M, FSUB_R);
OPCODE_CEIL_DECLARE(FSCAL_R, FSUB_M);
OPCODE_CEIL_DECLARE(FMUL_R, FSCAL_R);
OPCODE_CEIL_DECLARE(FDIV_M, FMUL_R);
OPCODE_CEIL_DECLARE(FSQRT_R, FDIV_M);
OPCODE_CEIL_DECLARE(CBRANCH, FSQRT_R);
OPCODE_CEIL_DECLARE(CFROUND, CBRANCH);
OPCODE_CEIL_DECLARE(ISTORE, CFROUND);
OPCODE_CEIL_DECLARE(NOP, ISTORE);
#undef OPCODE_CEIL_DECLARE
#define RANDOMX_EXE_ARGS InstructionByteCode& ibc, int& pc, uint8_t* scratchpad, ProgramConfiguration& config
#define RANDOMX_GEN_ARGS Instruction& instr, int i, InstructionByteCode& ibc
class BytecodeMachine;
typedef void(BytecodeMachine::*InstructionGenBytecode)(RANDOMX_GEN_ARGS);
class BytecodeMachine {
public:
void beginCompilation(NativeRegisterFile& regFile) {
for (unsigned i = 0; i < RegistersCount; ++i) {
registerUsage[i] = -1;
}
nreg = &regFile;
}
void compileProgram(Program& program, InstructionByteCode bytecode[RANDOMX_PROGRAM_SIZE], NativeRegisterFile& regFile) {
beginCompilation(regFile);
for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) {
auto& instr = program(i);
auto& ibc = bytecode[i];
compileInstruction(instr, i, ibc);
}
}
static void executeBytecode(InstructionByteCode bytecode[RANDOMX_PROGRAM_SIZE], uint8_t* scratchpad, ProgramConfiguration& config) {
for (int pc = 0; pc < RANDOMX_PROGRAM_SIZE; ++pc) {
auto& ibc = bytecode[pc];
executeInstruction(ibc, pc, scratchpad, config);
}
}
void compileInstruction(RANDOMX_GEN_ARGS)
#ifdef RANDOMX_GEN_TABLE
{
auto generator = genTable[instr.opcode];
(this->*generator)(instr, i, ibc);
}
#else
;
#endif
static void executeInstruction(RANDOMX_EXE_ARGS);
static void exe_IADD_RS(RANDOMX_EXE_ARGS) {
*ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm;
}
static void exe_IADD_M(RANDOMX_EXE_ARGS) {
*ibc.idst += load64(getScratchpadAddress(ibc, scratchpad));
}
static void exe_ISUB_R(RANDOMX_EXE_ARGS) {
*ibc.idst -= *ibc.isrc;
}
static void exe_ISUB_M(RANDOMX_EXE_ARGS) {
*ibc.idst -= load64(getScratchpadAddress(ibc, scratchpad));
}
static void exe_IMUL_R(RANDOMX_EXE_ARGS) {
*ibc.idst *= *ibc.isrc;
}
static void exe_IMUL_M(RANDOMX_EXE_ARGS) {
*ibc.idst *= load64(getScratchpadAddress(ibc, scratchpad));
}
static void exe_IMULH_R(RANDOMX_EXE_ARGS) {
*ibc.idst = mulh(*ibc.idst, *ibc.isrc);
}
static void exe_IMULH_M(RANDOMX_EXE_ARGS) {
*ibc.idst = mulh(*ibc.idst, load64(getScratchpadAddress(ibc, scratchpad)));
}
static void exe_ISMULH_R(RANDOMX_EXE_ARGS) {
*ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(*ibc.isrc));
}
static void exe_ISMULH_M(RANDOMX_EXE_ARGS) {
*ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(getScratchpadAddress(ibc, scratchpad))));
}
static void exe_INEG_R(RANDOMX_EXE_ARGS) {
*ibc.idst = ~(*ibc.idst) + 1; //two's complement negative
}
static void exe_IXOR_R(RANDOMX_EXE_ARGS) {
*ibc.idst ^= *ibc.isrc;
}
static void exe_IXOR_M(RANDOMX_EXE_ARGS) {
*ibc.idst ^= load64(getScratchpadAddress(ibc, scratchpad));
}
static void exe_IROR_R(RANDOMX_EXE_ARGS) {
*ibc.idst = rotr(*ibc.idst, *ibc.isrc & 63);
}
static void exe_IROL_R(RANDOMX_EXE_ARGS) {
*ibc.idst = rotl(*ibc.idst, *ibc.isrc & 63);
}
static void exe_ISWAP_R(RANDOMX_EXE_ARGS) {
int_reg_t temp = *ibc.isrc;
*(int_reg_t*)ibc.isrc = *ibc.idst;
*ibc.idst = temp;
}
static void exe_FSWAP_R(RANDOMX_EXE_ARGS) {
*ibc.fdst = rx_swap_vec_f128(*ibc.fdst);
}
static void exe_FADD_R(RANDOMX_EXE_ARGS) {
*ibc.fdst = rx_add_vec_f128(*ibc.fdst, *ibc.fsrc);
}
static void exe_FADD_M(RANDOMX_EXE_ARGS) {
rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad));
*ibc.fdst = rx_add_vec_f128(*ibc.fdst, fsrc);
}
static void exe_FSUB_R(RANDOMX_EXE_ARGS) {
*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, *ibc.fsrc);
}
static void exe_FSUB_M(RANDOMX_EXE_ARGS) {
rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad));
*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, fsrc);
}
static void exe_FSCAL_R(RANDOMX_EXE_ARGS) {
const rx_vec_f128 mask = rx_set1_vec_f128(0x80F0000000000000);
*ibc.fdst = rx_xor_vec_f128(*ibc.fdst, mask);
}
static void exe_FMUL_R(RANDOMX_EXE_ARGS) {
*ibc.fdst = rx_mul_vec_f128(*ibc.fdst, *ibc.fsrc);
}
static void exe_FDIV_M(RANDOMX_EXE_ARGS) {
rx_vec_f128 fsrc = maskRegisterExponentMantissa(
config,
rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad))
);
*ibc.fdst = rx_div_vec_f128(*ibc.fdst, fsrc);
}
static void exe_FSQRT_R(RANDOMX_EXE_ARGS) {
*ibc.fdst = rx_sqrt_vec_f128(*ibc.fdst);
}
static void exe_CBRANCH(RANDOMX_EXE_ARGS) {
*ibc.idst += ibc.imm;
if ((*ibc.idst & ibc.memMask) == 0) {
pc = ibc.target;
}
}
static void exe_CFROUND(RANDOMX_EXE_ARGS) {
rx_set_rounding_mode(rotr(*ibc.isrc, ibc.imm) % 4);
}
static void exe_ISTORE(RANDOMX_EXE_ARGS) {
store64(scratchpad + ((*ibc.idst + ibc.imm) & ibc.memMask), *ibc.isrc);
}
protected:
static rx_vec_f128 maskRegisterExponentMantissa(ProgramConfiguration& config, rx_vec_f128 x) {
const rx_vec_f128 xmantissaMask = rx_set_vec_f128(dynamicMantissaMask, dynamicMantissaMask);
const rx_vec_f128 xexponentMask = rx_load_vec_f128((const double*)&config.eMask);
x = rx_and_vec_f128(x, xmantissaMask);
x = rx_or_vec_f128(x, xexponentMask);
return x;
}
private:
static const int_reg_t zero;
int registerUsage[RegistersCount];
NativeRegisterFile* nreg;
static void* getScratchpadAddress(InstructionByteCode& ibc, uint8_t* scratchpad) {
uint32_t addr = (*ibc.isrc + ibc.imm) & ibc.memMask;
return scratchpad + addr;
}
#ifdef RANDOMX_GEN_TABLE
static InstructionGenBytecode genTable[256];
void gen_IADD_RS(RANDOMX_GEN_ARGS);
void gen_IADD_M(RANDOMX_GEN_ARGS);
void gen_ISUB_R(RANDOMX_GEN_ARGS);
void gen_ISUB_M(RANDOMX_GEN_ARGS);
void gen_IMUL_R(RANDOMX_GEN_ARGS);
void gen_IMUL_M(RANDOMX_GEN_ARGS);
void gen_IMULH_R(RANDOMX_GEN_ARGS);
void gen_IMULH_M(RANDOMX_GEN_ARGS);
void gen_ISMULH_R(RANDOMX_GEN_ARGS);
void gen_ISMULH_M(RANDOMX_GEN_ARGS);
void gen_IMUL_RCP(RANDOMX_GEN_ARGS);
void gen_INEG_R(RANDOMX_GEN_ARGS);
void gen_IXOR_R(RANDOMX_GEN_ARGS);
void gen_IXOR_M(RANDOMX_GEN_ARGS);
void gen_IROR_R(RANDOMX_GEN_ARGS);
void gen_IROL_R(RANDOMX_GEN_ARGS);
void gen_ISWAP_R(RANDOMX_GEN_ARGS);
void gen_FSWAP_R(RANDOMX_GEN_ARGS);
void gen_FADD_R(RANDOMX_GEN_ARGS);
void gen_FADD_M(RANDOMX_GEN_ARGS);
void gen_FSUB_R(RANDOMX_GEN_ARGS);
void gen_FSUB_M(RANDOMX_GEN_ARGS);
void gen_FSCAL_R(RANDOMX_GEN_ARGS);
void gen_FMUL_R(RANDOMX_GEN_ARGS);
void gen_FDIV_M(RANDOMX_GEN_ARGS);
void gen_FSQRT_R(RANDOMX_GEN_ARGS);
void gen_CBRANCH(RANDOMX_GEN_ARGS);
void gen_CFROUND(RANDOMX_GEN_ARGS);
void gen_ISTORE(RANDOMX_GEN_ARGS);
void gen_NOP(RANDOMX_GEN_ARGS);
#endif
};
}

194
crypto/randomx/common.hpp Executable file
View file

@ -0,0 +1,194 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <iostream>
#include <climits>
#include "blake2/endian.h"
#include "configuration.h"
#include "randomx.h"
namespace randomx {
static_assert(RANDOMX_ARGON_MEMORY >= 8, "RANDOMX_ARGON_MEMORY must be at least 8.");
static_assert(RANDOMX_ARGON_MEMORY <= 2097152, "RANDOMX_ARGON_MEMORY must not exceed 2097152.");
static_assert((RANDOMX_ARGON_MEMORY & (RANDOMX_ARGON_MEMORY - 1)) == 0, "RANDOMX_ARGON_MEMORY must be a power of 2.");
static_assert(RANDOMX_ARGON_ITERATIONS > 0 && RANDOMX_ARGON_ITERATIONS < UINT32_MAX, "RANDOMX_ARGON_ITERATIONS must be a positive 32-bit integer.");
static_assert(RANDOMX_ARGON_LANES > 0 && RANDOMX_ARGON_LANES <= 16777215, "RANDOMX_ARGON_LANES out of range");
static_assert(RANDOMX_DATASET_BASE_SIZE >= 64, "RANDOMX_DATASET_BASE_SIZE must be at least 64.");
static_assert((RANDOMX_DATASET_BASE_SIZE & (RANDOMX_DATASET_BASE_SIZE - 1)) == 0, "RANDOMX_DATASET_BASE_SIZE must be a power of 2.");
static_assert(RANDOMX_DATASET_BASE_SIZE <= 4294967296ULL, "RANDOMX_DATASET_BASE_SIZE must not exceed 4294967296.");
static_assert(RANDOMX_DATASET_EXTRA_SIZE % 64 == 0, "RANDOMX_DATASET_EXTRA_SIZE must be divisible by 64.");
static_assert((uint64_t)RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE <= 17179869184, "Dataset size must not exceed 16 GiB.");
static_assert(RANDOMX_PROGRAM_SIZE > 0, "RANDOMX_PROGRAM_SIZE must be greater than 0");
static_assert(RANDOMX_PROGRAM_SIZE <= 32768, "RANDOMX_PROGRAM_SIZE must not exceed 32768");
static_assert(RANDOMX_PROGRAM_ITERATIONS > 0, "RANDOMX_PROGRAM_ITERATIONS must be greater than 0");
static_assert(RANDOMX_PROGRAM_COUNT > 0, "RANDOMX_PROGRAM_COUNT must be greater than 0");
static_assert((RANDOMX_SCRATCHPAD_L3 & (RANDOMX_SCRATCHPAD_L3 - 1)) == 0, "RANDOMX_SCRATCHPAD_L3 must be a power of 2.");
static_assert(RANDOMX_SCRATCHPAD_L3 >= RANDOMX_SCRATCHPAD_L2, "RANDOMX_SCRATCHPAD_L3 must be greater than or equal to RANDOMX_SCRATCHPAD_L2.");
static_assert((RANDOMX_SCRATCHPAD_L2 & (RANDOMX_SCRATCHPAD_L2 - 1)) == 0, "RANDOMX_SCRATCHPAD_L2 must be a power of 2.");
static_assert(RANDOMX_SCRATCHPAD_L2 >= RANDOMX_SCRATCHPAD_L1, "RANDOMX_SCRATCHPAD_L2 must be greater than or equal to RANDOMX_SCRATCHPAD_L1.");
static_assert(RANDOMX_SCRATCHPAD_L1 >= 64, "RANDOMX_SCRATCHPAD_L1 must be at least 64.");
static_assert((RANDOMX_SCRATCHPAD_L1 & (RANDOMX_SCRATCHPAD_L1 - 1)) == 0, "RANDOMX_SCRATCHPAD_L1 must be a power of 2.");
static_assert(RANDOMX_CACHE_ACCESSES > 1, "RANDOMX_CACHE_ACCESSES must be greater than 1");
static_assert(RANDOMX_SUPERSCALAR_LATENCY > 0, "RANDOMX_SUPERSCALAR_LATENCY must be greater than 0");
static_assert(RANDOMX_SUPERSCALAR_LATENCY <= 10000, "RANDOMX_SUPERSCALAR_LATENCY must not exceed 10000");
static_assert(RANDOMX_JUMP_BITS > 0, "RANDOMX_JUMP_BITS must be greater than 0.");
static_assert(RANDOMX_JUMP_OFFSET >= 0, "RANDOMX_JUMP_OFFSET must be greater than or equal to 0.");
static_assert(RANDOMX_JUMP_BITS + RANDOMX_JUMP_OFFSET <= 16, "RANDOMX_JUMP_BITS + RANDOMX_JUMP_OFFSET must not exceed 16.");
constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + \
RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \
RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \
RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + \
RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + \
RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R + RANDOMX_FREQ_CBRANCH + \
RANDOMX_FREQ_CFROUND + RANDOMX_FREQ_ISTORE + RANDOMX_FREQ_NOP;
static_assert(wtSum == 256, "Sum of instruction frequencies must be 256.");
constexpr uint32_t ArgonBlockSize = 1024;
constexpr int ArgonSaltSize = sizeof("" RANDOMX_ARGON_SALT) - 1;
static_assert(ArgonSaltSize >= 8, "RANDOMX_ARGON_SALT must be at least 8 characters long");
constexpr int SuperscalarMaxSize = 3 * RANDOMX_SUPERSCALAR_LATENCY + 2;
constexpr size_t CacheLineSize = RANDOMX_DATASET_ITEM_SIZE;
constexpr int ScratchpadSize = RANDOMX_SCRATCHPAD_L3;
constexpr uint32_t CacheLineAlignMask = (RANDOMX_DATASET_BASE_SIZE - 1) & ~(CacheLineSize - 1);
constexpr uint32_t CacheSize = RANDOMX_ARGON_MEMORY * ArgonBlockSize;
constexpr uint64_t DatasetSize = RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE;
constexpr uint32_t DatasetExtraItems = RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE;
constexpr uint32_t ConditionMask = ((1 << RANDOMX_JUMP_BITS) - 1);
constexpr int ConditionOffset = RANDOMX_JUMP_OFFSET;
constexpr int StoreL3Condition = 14;
//Prevent some unsafe configurations.
#ifndef RANDOMX_UNSAFE
static_assert((uint64_t)ArgonBlockSize * RANDOMX_CACHE_ACCESSES * RANDOMX_ARGON_MEMORY + 33554432 >= (uint64_t)RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE, "Unsafe configuration: Memory-time tradeoffs");
static_assert((128 + RANDOMX_PROGRAM_SIZE * RANDOMX_FREQ_ISTORE / 256) * (RANDOMX_PROGRAM_COUNT * RANDOMX_PROGRAM_ITERATIONS) >= RANDOMX_SCRATCHPAD_L3, "Unsafe configuration: Insufficient Scratchpad writes");
static_assert(RANDOMX_PROGRAM_COUNT > 1, "Unsafe configuration: Program filtering strategies");
static_assert(RANDOMX_PROGRAM_SIZE >= 64, "Unsafe configuration: Low program entropy");
static_assert(RANDOMX_PROGRAM_ITERATIONS >= 400, "Unsafe configuration: High compilation overhead");
#endif
#ifdef TRACE
constexpr bool trace = true;
#else
constexpr bool trace = false;
#endif
#ifndef UNREACHABLE
#ifdef __GNUC__
#define UNREACHABLE __builtin_unreachable()
#elif _MSC_VER
#define UNREACHABLE __assume(false)
#else
#define UNREACHABLE
#endif
#endif
#if defined(_M_X64) || defined(__x86_64__)
#define RANDOMX_HAVE_COMPILER 1
#define RANDOMX_COMPILER_X86
class JitCompilerX86;
using JitCompiler = JitCompilerX86;
#elif defined(__aarch64__)
#define RANDOMX_HAVE_COMPILER 1
#define RANDOMX_COMPILER_A64
class JitCompilerA64;
using JitCompiler = JitCompilerA64;
#elif defined(__riscv) && __riscv_xlen == 64
#define RANDOMX_HAVE_COMPILER 1
#define RANDOMX_COMPILER_RV64
class JitCompilerRV64;
using JitCompiler = JitCompilerRV64;
#else
#define RANDOMX_HAVE_COMPILER 0
class JitCompilerFallback;
using JitCompiler = JitCompilerFallback;
#endif
using addr_t = uint32_t;
using int_reg_t = uint64_t;
struct fpu_reg_t {
double lo;
double hi;
};
constexpr uint32_t ScratchpadL1 = RANDOMX_SCRATCHPAD_L1 / sizeof(int_reg_t);
constexpr uint32_t ScratchpadL2 = RANDOMX_SCRATCHPAD_L2 / sizeof(int_reg_t);
constexpr uint32_t ScratchpadL3 = RANDOMX_SCRATCHPAD_L3 / sizeof(int_reg_t);
constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8;
constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8;
constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16;
constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16;
constexpr int ScratchpadL3Mask = (ScratchpadL3 - 1) * 8;
constexpr int ScratchpadL3Mask64 = (ScratchpadL3 / 8 - 1) * 64;
constexpr int RegistersCount = 8;
constexpr int RegisterCountFlt = RegistersCount / 2;
constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register
constexpr int RegisterNeedsSib = 4; //x86 r12 register
inline bool isZeroOrPowerOf2(uint64_t x) {
return (x & (x - 1)) == 0;
}
constexpr int mantissaSize = 52;
constexpr int exponentSize = 11;
constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1;
constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1;
constexpr int exponentBias = 1023;
constexpr int dynamicExponentBits = 4;
constexpr int staticExponentBits = 4;
constexpr uint64_t constExponentBits = 0x300;
constexpr uint64_t dynamicMantissaMask = (1ULL << (mantissaSize + dynamicExponentBits)) - 1;
struct MemoryRegisters {
addr_t mx, ma;
uint8_t* memory = nullptr;
};
//register file in little-endian byte order
struct RegisterFile {
int_reg_t r[RegistersCount];
fpu_reg_t f[RegisterCountFlt];
fpu_reg_t e[RegisterCountFlt];
fpu_reg_t a[RegisterCountFlt];
};
typedef void(ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t);
typedef void(DatasetInitFunc)(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
typedef void(DatasetDeallocFunc)(randomx_dataset*);
typedef void(CacheDeallocFunc)(randomx_cache*);
typedef void(CacheInitializeFunc)(randomx_cache*, const void*, size_t);
}

125
crypto/randomx/configuration.h Executable file
View file

@ -0,0 +1,125 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
//Cache size in KiB. Must be a power of 2.
#define RANDOMX_ARGON_MEMORY 262144
//Number of Argon2d iterations for Cache initialization.
#define RANDOMX_ARGON_ITERATIONS 3
//Number of parallel lanes for Cache initialization.
#define RANDOMX_ARGON_LANES 1
//Argon2d salt
#define RANDOMX_ARGON_SALT "RandomX\x03"
//Number of random Cache accesses per Dataset item. Minimum is 2.
#define RANDOMX_CACHE_ACCESSES 8
//Target latency for SuperscalarHash (in cycles of the reference CPU).
#define RANDOMX_SUPERSCALAR_LATENCY 170
//Dataset base size in bytes. Must be a power of 2.
#define RANDOMX_DATASET_BASE_SIZE 2147483648
//Dataset extra size. Must be divisible by 64.
#define RANDOMX_DATASET_EXTRA_SIZE 33554368
//Number of instructions in a RandomX program. Must be divisible by 8.
#define RANDOMX_PROGRAM_SIZE 256
//Number of iterations during VM execution.
#define RANDOMX_PROGRAM_ITERATIONS 2048
//Number of chained VM executions per hash.
#define RANDOMX_PROGRAM_COUNT 8
//Scratchpad L3 size in bytes. Must be a power of 2.
#define RANDOMX_SCRATCHPAD_L3 2097152
//Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3.
#define RANDOMX_SCRATCHPAD_L2 262144
//Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2.
#define RANDOMX_SCRATCHPAD_L1 16384
//Jump condition mask size in bits.
#define RANDOMX_JUMP_BITS 8
//Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16.
#define RANDOMX_JUMP_OFFSET 8
/*
Instruction frequencies (per 256 opcodes)
Total sum of frequencies must be 256
*/
//Integer instructions
#define RANDOMX_FREQ_IADD_RS 16
#define RANDOMX_FREQ_IADD_M 7
#define RANDOMX_FREQ_ISUB_R 16
#define RANDOMX_FREQ_ISUB_M 7
#define RANDOMX_FREQ_IMUL_R 16
#define RANDOMX_FREQ_IMUL_M 4
#define RANDOMX_FREQ_IMULH_R 4
#define RANDOMX_FREQ_IMULH_M 1
#define RANDOMX_FREQ_ISMULH_R 4
#define RANDOMX_FREQ_ISMULH_M 1
#define RANDOMX_FREQ_IMUL_RCP 8
#define RANDOMX_FREQ_INEG_R 2
#define RANDOMX_FREQ_IXOR_R 15
#define RANDOMX_FREQ_IXOR_M 5
#define RANDOMX_FREQ_IROR_R 8
#define RANDOMX_FREQ_IROL_R 2
#define RANDOMX_FREQ_ISWAP_R 4
//Floating point instructions
#define RANDOMX_FREQ_FSWAP_R 4
#define RANDOMX_FREQ_FADD_R 16
#define RANDOMX_FREQ_FADD_M 5
#define RANDOMX_FREQ_FSUB_R 16
#define RANDOMX_FREQ_FSUB_M 5
#define RANDOMX_FREQ_FSCAL_R 6
#define RANDOMX_FREQ_FMUL_R 32
#define RANDOMX_FREQ_FDIV_M 4
#define RANDOMX_FREQ_FSQRT_R 6
//Control instructions
#define RANDOMX_FREQ_CBRANCH 25
#define RANDOMX_FREQ_CFROUND 1
//Store instruction
#define RANDOMX_FREQ_ISTORE 16
//No-op instruction
#define RANDOMX_FREQ_NOP 0
/* ------
256
*/

128
crypto/randomx/cpu.cpp Executable file
View file

@ -0,0 +1,128 @@
/*
Copyright (c) 2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpu.hpp"
#include <cstring>
#if defined(_M_X64) || defined(__x86_64__)
#define HAVE_CPUID
#if defined(_MSC_VER)
#include <intrin.h>
#define cpuid(info, x) __cpuidex(info, x, 0)
#else //GCC
#include <cpuid.h>
void cpuid(int info[4], int InfoType) {
__cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
}
#endif
#endif
#if defined(HAVE_HWCAP)
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
#ifdef __riscv
#include <signal.h>
#include <setjmp.h>
#include <cstdint>
extern "C" uint64_t rv64_test_vector();
extern "C" uint64_t rv64_test_vector_aes();
static sigjmp_buf jump_buffer;
static void sigill_handler(int) { siglongjmp(jump_buffer, 1); }
void hashAes1Rx4_zvkned(const void *input, size_t inputSize, void *hash);
#endif
namespace randomx {
Cpu::Cpu()
{
#ifdef HAVE_CPUID
int info[4];
cpuid(info, 0);
int nIds = info[0];
if (nIds >= 0x00000001) {
cpuid(info, 0x00000001);
ssse3_ = (info[2] & (1 << 9)) != 0;
aes_ = (info[2] & (1 << 25)) != 0;
}
if (nIds >= 0x00000007) {
cpuid(info, 0x00000007);
avx2_ = (info[1] & (1 << 5)) != 0;
}
#elif defined(__aarch64__)
#if defined(HWCAP_AES)
long hwcaps = getauxval(AT_HWCAP);
aes_ = (hwcaps & HWCAP_AES) != 0;
#elif defined(__APPLE__)
aes_ = true;
#endif
#elif defined(__riscv)
struct sigaction new_action, old_action;
new_action.sa_handler = sigill_handler;
sigemptyset(&new_action.sa_mask);
new_action.sa_flags = 0;
if (sigaction(SIGILL, &new_action, &old_action) == 0) {
if (sigsetjmp(jump_buffer, 1) == 0) {
rvv_length = static_cast<int>(rv64_test_vector());
// If execution gets here, vector instructions executed successfully
rvv_ = true;
}
if (sigsetjmp(jump_buffer, 1) == 0) {
if (rv64_test_vector_aes() == 0) {
// If execution gets here, vector AES instructions executed successfully
// Now need to check that they actually do what they're supposed to do
uint64_t input[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
uint64_t hash[8] = {};
static constexpr uint64_t ref_hash[8] = {
0x195268637f56cab0ull, 0xdf7d7d3553e9e1d1ull, 0x3067fb6e5efcbee2ull, 0xfab778b414feaf77ull,
0x99905a5000820817ull, 0xae359bff2379ff97ull, 0x0d87373e6505c4c3ull, 0xf3f5cffd57f2dd62ull
};
hashAes1Rx4_zvkned(input, sizeof(input), hash);
aes_ = (memcmp(hash, ref_hash, sizeof(hash)) == 0);
}
}
sigaction(SIGILL, &old_action, nullptr);
}
#endif
//TODO POWER8 AES
}
const Cpu cpu;
}

56
crypto/randomx/cpu.hpp Executable file
View file

@ -0,0 +1,56 @@
/*
Copyright (c) 2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
namespace randomx {
class Cpu {
public:
Cpu();
inline bool hasAes() const { return aes_; }
inline bool hasSsse3() const { return ssse3_; }
inline bool hasAvx2() const { return avx2_; }
#ifdef __riscv
inline bool hasRVV() const { return rvv_; }
inline int getRVV_Length() const { return rvv_length; }
#endif
private:
bool aes_ = false;
bool ssse3_ = false;
bool avx2_ = false;
#ifdef __riscv
bool rvv_ = false;
int rvv_length = 0;
#endif
};
extern const Cpu cpu;
}

47
crypto/randomx/cpu_rv64.S Executable file
View file

@ -0,0 +1,47 @@
/*
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.text
.option arch, rv64gcv_zvkned
.global rv64_test_vector
.global rv64_test_vector_aes
// Returns vector register length (in bits)
rv64_test_vector:
li x10, 1024
vsetvli x10, x10, e64, m1, ta, ma
slli x10, x10, 6
ret
rv64_test_vector_aes:
vsetivli zero, 8, e32, m1, ta, ma
vaesem.vv v0, v0
vaesdm.vv v0, v0
li x10, 0
ret

196
crypto/randomx/dataset.cpp Executable file
View file

@ -0,0 +1,196 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <new>
#include <algorithm>
#include <stdexcept>
#include <cstring>
#include <limits>
#include <cstring>
#include <cassert>
#include "common.hpp"
#include "dataset.hpp"
#include "virtual_memory.h"
#include "superscalar.hpp"
#include "blake2_generator.hpp"
#include "reciprocal.h"
#include "blake2/endian.h"
#include "argon2.h"
#include "argon2_core.h"
#include "jit_compiler.hpp"
#include "intrin_portable.h"
static_assert(RANDOMX_ARGON_MEMORY % (RANDOMX_ARGON_LANES * ARGON2_SYNC_POINTS) == 0, "RANDOMX_ARGON_MEMORY - invalid value");
static_assert(ARGON2_BLOCK_SIZE == randomx::ArgonBlockSize, "Unpexpected value of ARGON2_BLOCK_SIZE");
namespace randomx {
template<class Allocator>
void deallocCache(randomx_cache* cache) {
if (cache->memory != nullptr)
Allocator::freeMemory(cache->memory, CacheSize);
if (cache->jit != nullptr)
delete cache->jit;
}
template void deallocCache<DefaultAllocator>(randomx_cache* cache);
template void deallocCache<LargePageAllocator>(randomx_cache* cache);
void initCache(randomx_cache* cache, const void* key, size_t keySize) {
uint32_t memory_blocks, segment_length;
argon2_instance_t instance;
argon2_context context;
context.out = nullptr;
context.outlen = 0;
context.pwd = CONST_CAST(uint8_t *)key;
context.pwdlen = (uint32_t)keySize;
context.salt = CONST_CAST(uint8_t *)RANDOMX_ARGON_SALT;
context.saltlen = (uint32_t)randomx::ArgonSaltSize;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.t_cost = RANDOMX_ARGON_ITERATIONS;
context.m_cost = RANDOMX_ARGON_MEMORY;
context.lanes = RANDOMX_ARGON_LANES;
context.threads = 1;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = ARGON2_DEFAULT_FLAGS;
context.version = ARGON2_VERSION_NUMBER;
int inputsValid = randomx_argon2_validate_inputs(&context);
assert(inputsValid == ARGON2_OK);
/* 2. Align memory size */
/* Minimum memory_blocks = 8L blocks, where L is the number of lanes */
memory_blocks = context.m_cost;
segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS);
instance.version = context.version;
instance.memory = NULL;
instance.passes = context.t_cost;
instance.memory_blocks = memory_blocks;
instance.segment_length = segment_length;
instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
instance.lanes = context.lanes;
instance.threads = context.threads;
instance.type = Argon2_d;
instance.memory = (block*)cache->memory;
instance.impl = cache->argonImpl;
if (instance.threads > instance.lanes) {
instance.threads = instance.lanes;
}
/* 3. Initialization: Hashing inputs, allocating memory, filling first
* blocks
*/
randomx_argon2_initialize(&instance, &context);
randomx_argon2_fill_memory_blocks(&instance);
cache->reciprocalCache.clear();
randomx::Blake2Generator gen(key, keySize);
for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
randomx::generateSuperscalar(cache->programs[i], gen);
for (unsigned j = 0; j < cache->programs[i].getSize(); ++j) {
auto& instr = cache->programs[i](j);
if ((SuperscalarInstructionType)instr.opcode == SuperscalarInstructionType::IMUL_RCP) {
auto rcp = randomx_reciprocal(instr.getImm32());
instr.setImm32(cache->reciprocalCache.size());
cache->reciprocalCache.push_back(rcp);
}
}
}
}
void initCacheCompile(randomx_cache* cache, const void* key, size_t keySize) {
initCache(cache, key, keySize);
cache->jit->enableWriting();
cache->jit->generateSuperscalarHash(cache->programs, cache->reciprocalCache);
cache->jit->generateDatasetInitCode();
cache->jit->enableExecution();
}
constexpr uint64_t superscalarMul0 = 6364136223846793005ULL;
constexpr uint64_t superscalarAdd1 = 9298411001130361340ULL;
constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL;
constexpr uint64_t superscalarAdd3 = 9306329213124626780ULL;
constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL;
constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL;
constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL;
constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL;
static inline uint8_t* getMixBlock(uint64_t registerValue, uint8_t *memory) {
constexpr uint32_t mask = CacheSize / CacheLineSize - 1;
return memory + (registerValue & mask) * CacheLineSize;
}
void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t itemNumber) {
int_reg_t rl[8];
uint8_t* mixBlock;
uint64_t registerValue = itemNumber;
rl[0] = (itemNumber + 1) * superscalarMul0;
rl[1] = rl[0] ^ superscalarAdd1;
rl[2] = rl[0] ^ superscalarAdd2;
rl[3] = rl[0] ^ superscalarAdd3;
rl[4] = rl[0] ^ superscalarAdd4;
rl[5] = rl[0] ^ superscalarAdd5;
rl[6] = rl[0] ^ superscalarAdd6;
rl[7] = rl[0] ^ superscalarAdd7;
for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
mixBlock = getMixBlock(registerValue, cache->memory);
rx_prefetch_nta(mixBlock);
SuperscalarProgram& prog = cache->programs[i];
executeSuperscalar(rl, prog, &cache->reciprocalCache);
for (unsigned q = 0; q < 8; ++q)
rl[q] ^= load64_native(mixBlock + 8 * q);
registerValue = rl[prog.getAddressRegister()];
}
memcpy(out, &rl, CacheLineSize);
}
void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startItem, uint32_t endItem) {
for (uint32_t itemNumber = startItem; itemNumber < endItem; ++itemNumber, dataset += CacheLineSize)
initDatasetItem(cache, dataset, itemNumber);
}
}

103
crypto/randomx/dataset.hpp Executable file
View file

@ -0,0 +1,103 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <vector>
#include <type_traits>
#include "common.hpp"
#include "superscalar_program.hpp"
#include "allocator.hpp"
#include "argon2.h"
/* Global scope for C binding */
struct randomx_dataset {
uint8_t* memory = nullptr;
randomx::DatasetDeallocFunc* dealloc;
};
/* Global scope for C binding */
struct randomx_cache {
uint8_t* memory = nullptr;
randomx::CacheDeallocFunc* dealloc;
randomx::JitCompiler* jit;
randomx::CacheInitializeFunc* initialize;
randomx::DatasetInitFunc* datasetInit;
randomx::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES];
std::vector<uint64_t> reciprocalCache;
std::string cacheKey;
randomx_argon2_impl* argonImpl;
bool isInitialized() {
return programs[0].getSize() != 0;
}
};
//A pointer to a standard-layout struct object points to its initial member
static_assert(std::is_standard_layout<randomx_dataset>(), "randomx_dataset must be a standard-layout struct");
//the following assert fails when compiling Debug in Visual Studio (JIT mode will crash in Debug)
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && defined(_DEBUG)
#define TO_STR(x) #x
#define STR(x) TO_STR(x)
#pragma message ( __FILE__ "(" STR(__LINE__) ") warning: check std::is_standard_layout<randomx_cache>() is disabled for Debug configuration. JIT mode will crash." )
#undef STR
#undef TO_STR
#else
static_assert(std::is_standard_layout<randomx_cache>(), "randomx_cache must be a standard-layout struct");
#endif
namespace randomx {
using DefaultAllocator = AlignedAllocator<CacheLineSize>;
template<class Allocator>
void deallocDataset(randomx_dataset* dataset) {
if (dataset->memory != nullptr)
Allocator::freeMemory(dataset->memory, DatasetSize);
}
template<class Allocator>
void deallocCache(randomx_cache* cache);
void initCache(randomx_cache*, const void*, size_t);
void initCacheCompile(randomx_cache*, const void*, size_t);
void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t blockNumber);
void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) {
if (flags & RANDOMX_FLAG_ARGON2_AVX2) {
return randomx_argon2_impl_avx2();
}
if (flags & RANDOMX_FLAG_ARGON2_SSSE3) {
return randomx_argon2_impl_ssse3();
}
return &randomx_argon2_fill_segment_ref;
}
}

390
crypto/randomx/instruction.cpp Executable file
View file

@ -0,0 +1,390 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "instruction.hpp"
#include "common.hpp"
namespace randomx {
void Instruction::print(std::ostream& os) const {
os << names[opcode] << " ";
auto handler = engine[opcode];
(this->*handler)(os);
}
void Instruction::genAddressReg(std::ostream& os, int srcIndex) const {
os << (getModMem() ? "L1" : "L2") << "[r" << srcIndex << std::showpos << (int32_t)getImm32() << std::noshowpos << "]";
}
void Instruction::genAddressRegDst(std::ostream& os, int dstIndex) const {
if (getModCond() < StoreL3Condition)
os << (getModMem() ? "L1" : "L2");
else
os << "L3";
os << "[r" << dstIndex << std::showpos << (int32_t)getImm32() << std::noshowpos << "]";
}
void Instruction::genAddressImm(std::ostream& os) const {
os << "L3" << "[" << (getImm32() & ScratchpadL3Mask) << "]";
}
void Instruction::h_IADD_RS(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
os << "r" << dstIndex << ", r" << srcIndex;
if(dstIndex == RegisterNeedsDisplacement) {
os << ", " << (int32_t)getImm32();
}
os << ", SHFT " << getModShift() << std::endl;
}
void Instruction::h_IADD_M(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
else {
os << "r" << dstIndex << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_ISUB_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
else {
os << "r" << dstIndex << ", " << (int32_t)getImm32() << std::endl;
}
}
void Instruction::h_ISUB_M(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
else {
os << "r" << dstIndex << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_IMUL_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
else {
os << "r" << dstIndex << ", " << (int32_t)getImm32() << std::endl;
}
}
void Instruction::h_IMUL_M(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
else {
os << "r" << dstIndex << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_IMULH_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
void Instruction::h_IMULH_M(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
else {
os << "r" << dstIndex << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_ISMULH_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
void Instruction::h_ISMULH_M(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
else {
os << "r" << dstIndex << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_INEG_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
os << "r" << dstIndex << std::endl;
}
void Instruction::h_IXOR_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
else {
os << "r" << dstIndex << ", " << (int32_t)getImm32() << std::endl;
}
}
void Instruction::h_IXOR_M(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
else {
os << "r" << dstIndex << ", ";
genAddressImm(os);
os << std::endl;
}
}
void Instruction::h_IROR_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
else {
os << "r" << dstIndex << ", " << (getImm32() & 63) << std::endl;
}
}
void Instruction::h_IROL_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
if (dstIndex != srcIndex) {
os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
else {
os << "r" << dstIndex << ", " << (getImm32() & 63) << std::endl;
}
}
void Instruction::h_IMUL_RCP(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
os << "r" << dstIndex << ", " << getImm32() << std::endl;
}
void Instruction::h_ISWAP_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
void Instruction::h_FSWAP_R(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
const char reg = (dstIndex >= RegisterCountFlt) ? 'e' : 'f';
dstIndex %= RegisterCountFlt;
os << reg << dstIndex << std::endl;
}
void Instruction::h_FADD_R(std::ostream& os) const {
auto dstIndex = dst % RegisterCountFlt;
auto srcIndex = src % RegisterCountFlt;
os << "f" << dstIndex << ", a" << srcIndex << std::endl;
}
void Instruction::h_FADD_M(std::ostream& os) const {
auto dstIndex = dst % RegisterCountFlt;
auto srcIndex = src % RegistersCount;
os << "f" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
void Instruction::h_FSUB_R(std::ostream& os) const {
auto dstIndex = dst % RegisterCountFlt;
auto srcIndex = src % RegisterCountFlt;
os << "f" << dstIndex << ", a" << srcIndex << std::endl;
}
void Instruction::h_FSUB_M(std::ostream& os) const {
auto dstIndex = dst % RegisterCountFlt;
auto srcIndex = src % RegistersCount;
os << "f" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
void Instruction::h_FSCAL_R(std::ostream& os) const {
auto dstIndex = dst % RegisterCountFlt;
os << "f" << dstIndex << std::endl;
}
void Instruction::h_FMUL_R(std::ostream& os) const {
auto dstIndex = dst % RegisterCountFlt;
auto srcIndex = src % RegisterCountFlt;
os << "e" << dstIndex << ", a" << srcIndex << std::endl;
}
void Instruction::h_FDIV_M(std::ostream& os) const {
auto dstIndex = dst % RegisterCountFlt;
auto srcIndex = src % RegistersCount;
os << "e" << dstIndex << ", ";
genAddressReg(os, srcIndex);
os << std::endl;
}
void Instruction::h_FSQRT_R(std::ostream& os) const {
auto dstIndex = dst % RegisterCountFlt;
os << "e" << dstIndex << std::endl;
}
void Instruction::h_CFROUND(std::ostream& os) const {
auto srcIndex = src % RegistersCount;
os << "r" << srcIndex << ", " << (getImm32() & 63) << std::endl;
}
void Instruction::h_CBRANCH(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
os << "r" << dstIndex << ", " << (int32_t)getImm32() << ", COND " << (int)(getModCond()) << std::endl;
}
void Instruction::h_ISTORE(std::ostream& os) const {
auto dstIndex = dst % RegistersCount;
auto srcIndex = src % RegistersCount;
genAddressRegDst(os, dstIndex);
os << ", r" << srcIndex << std::endl;
}
void Instruction::h_NOP(std::ostream& os) const {
os << std::endl;
}
#include "instruction_weights.hpp"
#define INST_NAME(x) REPN(#x, WT(x))
#define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x))
const char* Instruction::names[256] = {
INST_NAME(IADD_RS)
INST_NAME(IADD_M)
INST_NAME(ISUB_R)
INST_NAME(ISUB_M)
INST_NAME(IMUL_R)
INST_NAME(IMUL_M)
INST_NAME(IMULH_R)
INST_NAME(IMULH_M)
INST_NAME(ISMULH_R)
INST_NAME(ISMULH_M)
INST_NAME(IMUL_RCP)
INST_NAME(INEG_R)
INST_NAME(IXOR_R)
INST_NAME(IXOR_M)
INST_NAME(IROR_R)
INST_NAME(IROL_R)
INST_NAME(ISWAP_R)
INST_NAME(FSWAP_R)
INST_NAME(FADD_R)
INST_NAME(FADD_M)
INST_NAME(FSUB_R)
INST_NAME(FSUB_M)
INST_NAME(FSCAL_R)
INST_NAME(FMUL_R)
INST_NAME(FDIV_M)
INST_NAME(FSQRT_R)
INST_NAME(CBRANCH)
INST_NAME(CFROUND)
INST_NAME(ISTORE)
INST_NAME(NOP)
};
InstructionFormatter Instruction::engine[256] = {
INST_HANDLE(IADD_RS)
INST_HANDLE(IADD_M)
INST_HANDLE(ISUB_R)
INST_HANDLE(ISUB_M)
INST_HANDLE(IMUL_R)
INST_HANDLE(IMUL_M)
INST_HANDLE(IMULH_R)
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
INST_HANDLE(IMUL_RCP)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
INST_HANDLE(ISWAP_R)
INST_HANDLE(FSWAP_R)
INST_HANDLE(FADD_R)
INST_HANDLE(FADD_M)
INST_HANDLE(FSUB_R)
INST_HANDLE(FSUB_M)
INST_HANDLE(FSCAL_R)
INST_HANDLE(FMUL_R)
INST_HANDLE(FDIV_M)
INST_HANDLE(FSQRT_R)
INST_HANDLE(CBRANCH)
INST_HANDLE(CFROUND)
INST_HANDLE(ISTORE)
INST_HANDLE(NOP)
};
}

149
crypto/randomx/instruction.hpp Executable file
View file

@ -0,0 +1,149 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <iostream>
#include <type_traits>
#include "blake2/endian.h"
namespace randomx {
class Instruction;
typedef void(Instruction::*InstructionFormatter)(std::ostream&) const;
enum class InstructionType : uint16_t {
IADD_RS = 0,
IADD_M = 1,
ISUB_R = 2,
ISUB_M = 3,
IMUL_R = 4,
IMUL_M = 5,
IMULH_R = 6,
IMULH_M = 7,
ISMULH_R = 8,
ISMULH_M = 9,
IMUL_RCP = 10,
INEG_R = 11,
IXOR_R = 12,
IXOR_M = 13,
IROR_R = 14,
IROL_R = 15,
ISWAP_R = 16,
FSWAP_R = 17,
FADD_R = 18,
FADD_M = 19,
FSUB_R = 20,
FSUB_M = 21,
FSCAL_R = 22,
FMUL_R = 23,
FDIV_M = 24,
FSQRT_R = 25,
CBRANCH = 26,
CFROUND = 27,
ISTORE = 28,
NOP = 29,
};
class Instruction {
public:
uint32_t getImm32() const {
return load32(&imm32);
}
void setImm32(uint32_t val) {
return store32(&imm32, val);
}
const char* getName() const {
return names[opcode];
}
friend std::ostream& operator<<(std::ostream& os, const Instruction& i) {
i.print(os);
return os;
}
int getModMem() const {
return mod % 4; //bits 0-1
}
int getModShift() const {
return (mod >> 2) % 4; //bits 2-3
}
int getModCond() const {
return mod >> 4; //bits 4-7
}
void setMod(uint8_t val) {
mod = val;
}
uint8_t opcode;
uint8_t dst;
uint8_t src;
uint8_t mod;
uint32_t imm32;
private:
void print(std::ostream&) const;
static const char* names[256];
static InstructionFormatter engine[256];
void genAddressReg(std::ostream& os, int) const;
void genAddressImm(std::ostream& os) const;
void genAddressRegDst(std::ostream&, int) const;
void h_IADD_RS(std::ostream&) const;
void h_IADD_M(std::ostream&) const;
void h_ISUB_R(std::ostream&) const;
void h_ISUB_M(std::ostream&) const;
void h_IMUL_R(std::ostream&) const;
void h_IMUL_M(std::ostream&) const;
void h_IMULH_R(std::ostream&) const;
void h_IMULH_M(std::ostream&) const;
void h_ISMULH_R(std::ostream&) const;
void h_ISMULH_M(std::ostream&) const;
void h_IMUL_RCP(std::ostream&) const;
void h_INEG_R(std::ostream&) const;
void h_IXOR_R(std::ostream&) const;
void h_IXOR_M(std::ostream&) const;
void h_IROR_R(std::ostream&) const;
void h_IROL_R(std::ostream&) const;
void h_ISWAP_R(std::ostream&) const;
void h_FSWAP_R(std::ostream&) const;
void h_FADD_R(std::ostream&) const;
void h_FADD_M(std::ostream&) const;
void h_FSUB_R(std::ostream&) const;
void h_FSUB_M(std::ostream&) const;
void h_FSCAL_R(std::ostream&) const;
void h_FMUL_R(std::ostream&) const;
void h_FDIV_M(std::ostream&) const;
void h_FSQRT_R(std::ostream&) const;
void h_CBRANCH(std::ostream&) const;
void h_CFROUND(std::ostream&) const;
void h_ISTORE(std::ostream&) const;
void h_NOP(std::ostream&) const;
};
static_assert(sizeof(Instruction) == 8, "Invalid size of struct randomx::Instruction");
static_assert(std::is_standard_layout<Instruction>(), "randomx::Instruction must be a standard-layout struct");
}

View file

@ -0,0 +1,73 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#define REP0(x)
#define REP1(x) x,
#define REP2(x) REP1(x) x,
#define REP3(x) REP2(x) x,
#define REP4(x) REP3(x) x,
#define REP5(x) REP4(x) x,
#define REP6(x) REP5(x) x,
#define REP7(x) REP6(x) x,
#define REP8(x) REP7(x) x,
#define REP9(x) REP8(x) x,
#define REP10(x) REP9(x) x,
#define REP11(x) REP10(x) x,
#define REP12(x) REP11(x) x,
#define REP13(x) REP12(x) x,
#define REP14(x) REP13(x) x,
#define REP15(x) REP14(x) x,
#define REP16(x) REP15(x) x,
#define REP17(x) REP16(x) x,
#define REP18(x) REP17(x) x,
#define REP19(x) REP18(x) x,
#define REP20(x) REP19(x) x,
#define REP21(x) REP20(x) x,
#define REP22(x) REP21(x) x,
#define REP23(x) REP22(x) x,
#define REP24(x) REP23(x) x,
#define REP25(x) REP24(x) x,
#define REP26(x) REP25(x) x,
#define REP27(x) REP26(x) x,
#define REP28(x) REP27(x) x,
#define REP29(x) REP28(x) x,
#define REP30(x) REP29(x) x,
#define REP31(x) REP30(x) x,
#define REP32(x) REP31(x) x,
#define REP33(x) REP32(x) x,
#define REP40(x) REP32(x) REP8(x)
#define REP64(x) REP32(x) REP32(x)
#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
#define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x)
#define REP256(x) REP128(x) REP128(x)
#define REPNX(x,N) REP##N(x)
#define REPN(x,N) REPNX(x,N)
#define NUM(x) x
#define WT(x) NUM(RANDOMX_FREQ_##x)

View file

@ -0,0 +1,208 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cfenv>
#include <cmath>
#include "common.hpp"
#include "intrin_portable.h"
#include "blake2/endian.h"
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
typedef __int128 int128_t;
uint64_t mulh(uint64_t a, uint64_t b) {
return ((uint128_t)a * b) >> 64;
}
int64_t smulh(int64_t a, int64_t b) {
return ((int128_t)a * b) >> 64;
}
#define HAVE_MULH
#define HAVE_SMULH
#endif
#if defined(_MSC_VER)
#define HAS_VALUE(X) X ## 0
#define EVAL_DEFINE(X) HAS_VALUE(X)
#include <intrin.h>
#include <stdlib.h>
uint64_t rotl(uint64_t x, unsigned int c) {
return _rotl64(x, c);
}
uint64_t rotr(uint64_t x, unsigned int c) {
return _rotr64(x, c);
}
#define HAVE_ROTL
#define HAVE_ROTR
#if EVAL_DEFINE(__MACHINEARM64_X64(1))
uint64_t mulh(uint64_t a, uint64_t b) {
return __umulh(a, b);
}
#define HAVE_MULH
#endif
#if EVAL_DEFINE(__MACHINEX64(1))
int64_t smulh(int64_t a, int64_t b) {
int64_t hi;
_mul128(a, b, &hi);
return hi;
}
#define HAVE_SMULH
#endif
static void setRoundMode_(uint32_t mode) {
_controlfp(mode, _MCW_RC);
}
#define HAVE_SETROUNDMODE_IMPL
#endif
#ifndef HAVE_SETROUNDMODE_IMPL
static void setRoundMode_(uint32_t mode) {
fesetround(mode);
}
#endif
#ifndef HAVE_ROTR
uint64_t rotr(uint64_t a, unsigned int b) {
return (a >> b) | (a << (-b & 63));
}
#define HAVE_ROTR
#endif
#ifndef HAVE_ROTL
uint64_t rotl(uint64_t a, unsigned int b) {
return (a << b) | (a >> (-b & 63));
}
#define HAVE_ROTL
#endif
#ifndef HAVE_MULH
#define LO(x) ((x)&0xffffffff)
#define HI(x) ((x)>>32)
uint64_t mulh(uint64_t a, uint64_t b) {
uint64_t ah = HI(a), al = LO(a);
uint64_t bh = HI(b), bl = LO(b);
uint64_t x00 = al * bl;
uint64_t x01 = al * bh;
uint64_t x10 = ah * bl;
uint64_t x11 = ah * bh;
uint64_t m1 = LO(x10) + LO(x01) + HI(x00);
uint64_t m2 = HI(x10) + HI(x01) + LO(x11) + HI(m1);
uint64_t m3 = HI(x11) + HI(m2);
return (m3 << 32) + LO(m2);
}
#define HAVE_MULH
#endif
#ifndef HAVE_SMULH
int64_t smulh(int64_t a, int64_t b) {
int64_t hi = mulh(a, b);
if (a < 0LL) hi -= b;
if (b < 0LL) hi -= a;
return hi;
}
#define HAVE_SMULH
#endif
#ifdef RANDOMX_DEFAULT_FENV
void rx_reset_float_state() {
setRoundMode_(FE_TONEAREST);
rx_set_double_precision(); //set precision to 53 bits if needed by the platform
}
void rx_set_rounding_mode(uint32_t mode) {
switch (mode & 3) {
case RoundDown:
setRoundMode_(FE_DOWNWARD);
break;
case RoundUp:
setRoundMode_(FE_UPWARD);
break;
case RoundToZero:
setRoundMode_(FE_TOWARDZERO);
break;
case RoundToNearest:
setRoundMode_(FE_TONEAREST);
break;
default:
UNREACHABLE;
}
}
uint32_t rx_get_rounding_mode() {
switch (fegetround()) {
case FE_DOWNWARD:
return RoundDown;
case FE_UPWARD:
return RoundUp;
case FE_TOWARDZERO:
return RoundToZero;
case FE_TONEAREST:
return RoundToNearest;
default:
UNREACHABLE;
}
}
#endif
#ifdef RANDOMX_USE_X87
#if defined(_MSC_VER) && defined(_M_IX86)
void rx_set_double_precision() {
_control87(_PC_53, _MCW_PC);
}
#elif defined(__i386)
void rx_set_double_precision() {
uint16_t volatile x87cw;
asm volatile("fstcw %0" : "=m" (x87cw));
x87cw &= ~0x300;
x87cw |= 0x200;
asm volatile("fldcw %0" : : "m" (x87cw));
}
#endif
#endif //RANDOMX_USE_X87
union double_ser_t {
double f;
uint64_t i;
};
double loadDoublePortable(const void* addr) {
double_ser_t ds;
ds.i = load64(addr);
return ds.f;
}

751
crypto/randomx/intrin_portable.h Executable file
View file

@ -0,0 +1,751 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include "blake2/endian.h"
constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
}
constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) {
return (-1 == ~0) ? (int64_t)x : (x > INT64_MAX ? (-(int64_t)(UINT64_MAX - x) - 1) : (int64_t)x);
}
constexpr uint64_t signExtend2sCompl(uint32_t x) {
return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x);
}
constexpr int RoundToNearest = 0;
constexpr int RoundDown = 1;
constexpr int RoundUp = 2;
constexpr int RoundToZero = 3;
//MSVC doesn't define __SSE2__, so we have to define it manually if SSE2 is available
#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2))
#define __SSE2__ 1
#endif
//MSVC doesn't define __AES__
#if defined(_MSC_VER) && defined(__SSE2__)
#define __AES__
#endif
//the library "sqrt" function provided by MSVC for x86 targets doesn't give
//the correct results, so we have to use inline assembly to call x87 fsqrt directly
#if !defined(__SSE2__)
#if defined(_MSC_VER) && defined(_M_IX86)
inline double __cdecl rx_sqrt(double x) {
__asm {
fld x
fsqrt
}
}
#define rx_sqrt rx_sqrt
void rx_set_double_precision();
#define RANDOMX_USE_X87
#elif defined(__i386)
void rx_set_double_precision();
#define RANDOMX_USE_X87
#endif
#endif //__SSE2__
#if !defined(rx_sqrt)
#define rx_sqrt sqrt
#endif
#if !defined(RANDOMX_USE_X87)
#define rx_set_double_precision(x)
#endif
#ifdef __SSE2__
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif
typedef __m128i rx_vec_i128;
typedef __m128d rx_vec_f128;
#define rx_aligned_alloc(a, b) _mm_malloc(a,b)
#define rx_aligned_free(a) _mm_free(a)
#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0)
#define rx_load_vec_f128 _mm_load_pd
#define rx_store_vec_f128 _mm_store_pd
#define rx_add_vec_f128 _mm_add_pd
#define rx_sub_vec_f128 _mm_sub_pd
#define rx_mul_vec_f128 _mm_mul_pd
#define rx_div_vec_f128 _mm_div_pd
#define rx_sqrt_vec_f128 _mm_sqrt_pd
FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
return _mm_shuffle_pd(a, a, 1);
}
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
return _mm_castsi128_pd(_mm_set_epi64x(x1, x0));
}
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
return _mm_castsi128_pd(_mm_set1_epi64x(x));
}
#define rx_xor_vec_f128 _mm_xor_pd
#define rx_and_vec_f128 _mm_and_pd
#define rx_or_vec_f128 _mm_or_pd
#ifdef __AES__
#define rx_aesenc_vec_i128 _mm_aesenc_si128
#define rx_aesdec_vec_i128 _mm_aesdec_si128
#define HAVE_AES 1
#endif //__AES__
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
return _mm_cvtsi128_si32(a);
}
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
}
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa));
}
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff));
}
#define rx_set_int_vec_i128 _mm_set_epi32
#define rx_xor_vec_i128 _mm_xor_si128
#define rx_load_vec_i128 _mm_load_si128
#define rx_store_vec_i128 _mm_store_si128
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
__m128i ix = _mm_loadl_epi64((const __m128i*)addr);
return _mm_cvtepi32_pd(ix);
}
constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
FORCE_INLINE void rx_reset_float_state() {
_mm_setcsr(rx_mxcsr_default);
}
FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
_mm_setcsr(rx_mxcsr_default | (mode << 13));
}
FORCE_INLINE uint32_t rx_get_rounding_mode() {
return (_mm_getcsr() >> 13) & 3;
}
#elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors cant use doubles or 64 bit integers with SIMD
#include <cstdint>
#include <stdexcept>
#include <cstdlib>
#include <altivec.h>
#undef vector
#undef pixel
#undef bool
typedef __vector uint8_t __m128i;
typedef __vector uint32_t __m128l;
typedef __vector int __m128li;
typedef __vector uint64_t __m128ll;
typedef __vector double __m128d;
typedef __m128i rx_vec_i128;
typedef __m128d rx_vec_f128;
typedef union{
rx_vec_i128 i;
rx_vec_f128 d;
uint64_t u64[2];
double d64[2];
uint32_t u32[4];
int i32[4];
} vec_u;
#define rx_aligned_alloc(a, b) malloc(a)
#define rx_aligned_free(a) free(a)
#define rx_prefetch_nta(x)
#define rx_prefetch_t0(x)
/* Splat 64-bit long long to 2 64-bit long longs */
FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
{ return (__m128i) vec_splats (scalar); }
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
#if defined(NATIVE_LITTLE_ENDIAN)
return (rx_vec_f128)vec_vsx_ld(0,pd);
#else
vec_u t;
t.u64[0] = load64(pd + 0);
t.u64[1] = load64(pd + 1);
return (rx_vec_f128)t.d;
#endif
}
FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
#if defined(NATIVE_LITTLE_ENDIAN)
vec_vsx_st(a,0,(rx_vec_f128*)mem_addr);
#else
vec_u _a;
_a.d = a;
store64(mem_addr + 0, _a.u64[0]);
store64(mem_addr + 1, _a.u64[1]);
#endif
}
FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
return (rx_vec_f128)vec_perm((__m128i)a,(__m128i)a,(__m128i){8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7});
}
FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return (rx_vec_f128)vec_add(a,b);
}
FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return (rx_vec_f128)vec_sub(a,b);
}
FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return (rx_vec_f128)vec_mul(a,b);
}
FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return (rx_vec_f128)vec_div(a,b);
}
FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
return (rx_vec_f128)vec_sqrt(a);
}
FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
return (rx_vec_i128)vec_splat2sd(a);
}
FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
return (rx_vec_f128)a;
}
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
return (rx_vec_f128)(__m128ll){x0,x1};
}
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
return (rx_vec_f128)vec_splat2sd(x);
}
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return (rx_vec_f128)vec_xor(a,b);
}
FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return (rx_vec_f128)vec_and(a,b);
}
FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return (rx_vec_f128)vec_or(a,b);
}
#if defined(__CRYPTO__)
FORCE_INLINE __m128ll vrev(__m128i v){
#if defined(NATIVE_LITTLE_ENDIAN)
return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0});
#else
return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12});
#endif
}
FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
__m128ll _v = vrev(v);
__m128ll _rkey = vrev(rkey);
__m128ll result = vrev((__m128i)__builtin_crypto_vcipher(_v,_rkey));
return (rx_vec_i128)result;
}
FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
__m128ll _v = vrev(v);
__m128ll zero = (__m128ll){0};
__m128ll out = vrev((__m128i)__builtin_crypto_vncipher(_v,zero));
return (rx_vec_i128)vec_xor((__m128i)out,rkey);
}
#define HAVE_AES 1
#endif //__CRYPTO__
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
vec_u _a;
_a.i = a;
return _a.i32[0];
}
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
vec_u _a;
_a.i = a;
return _a.i32[1];
}
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
vec_u _a;
_a.i = a;
return _a.i32[2];
}
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
vec_u _a;
_a.i = a;
return _a.i32[3];
}
FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int i3, int i2, int i1, int i0) {
return (rx_vec_i128)((__m128li){i0,i1,i2,i3});
};
FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 a, rx_vec_i128 b) {
return (rx_vec_i128)vec_xor(a,b);
}
FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *p) {
#if defined(NATIVE_LITTLE_ENDIAN)
return *p;
#else
const uint32_t* ptr = (const uint32_t*)p;
vec_u c;
c.u32[0] = load32(ptr + 0);
c.u32[1] = load32(ptr + 1);
c.u32[2] = load32(ptr + 2);
c.u32[3] = load32(ptr + 3);
return (rx_vec_i128)c.i;
#endif
}
FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) {
#if defined(NATIVE_LITTLE_ENDIAN)
*p = b;
#else
uint32_t* ptr = (uint32_t*)p;
vec_u B;
B.i = b;
store32(ptr + 0, B.u32[0]);
store32(ptr + 1, B.u32[1]);
store32(ptr + 2, B.u32[2]);
store32(ptr + 3, B.u32[3]);
#endif
}
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
vec_u x;
x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0));
x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4));
return (rx_vec_f128)x.d;
}
#define RANDOMX_DEFAULT_FENV
#elif defined(__aarch64__)
#include <stdlib.h>
#include <arm_neon.h>
#include <arm_acle.h>
typedef uint8x16_t rx_vec_i128;
typedef float64x2_t rx_vec_f128;
inline void* rx_aligned_alloc(size_t size, size_t align) {
void* p;
if (posix_memalign(&p, align, size) == 0)
return p;
return 0;
};
#define rx_aligned_free(a) free(a)
inline void rx_prefetch_nta(void* ptr) {
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
}
inline void rx_prefetch_t0(const void* ptr) {
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
}
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
return vld1q_f64((const float64_t*)pd);
}
FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 val) {
vst1q_f64((float64_t*)mem_addr, val);
}
FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
float64x2_t temp;
temp = vcopyq_laneq_f64(temp, 1, a, 1);
a = vcopyq_laneq_f64(a, 1, a, 0);
return vcopyq_laneq_f64(a, 0, temp, 1);
}
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
uint64x2_t temp0 = vdupq_n_u64(x0);
uint64x2_t temp1 = vdupq_n_u64(x1);
return vreinterpretq_f64_u64(vcopyq_laneq_u64(temp0, 1, temp1, 0));
}
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
return vreinterpretq_f64_u64(vdupq_n_u64(x));
}
#define rx_add_vec_f128 vaddq_f64
#define rx_sub_vec_f128 vsubq_f64
#define rx_mul_vec_f128 vmulq_f64
#define rx_div_vec_f128 vdivq_f64
#define rx_sqrt_vec_f128 vsqrtq_f64
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
}
FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
}
FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
}
#ifdef __ARM_FEATURE_CRYPTO
FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
const uint8x16_t zero = { 0 };
return vaesmcq_u8(vaeseq_u8(a, zero)) ^ key;
}
FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
const uint8x16_t zero = { 0 };
return vaesimcq_u8(vaesdq_u8(a, zero)) ^ key;
}
#define HAVE_AES 1
#endif
#define rx_xor_vec_i128 veorq_u8
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 0);
}
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 1);
}
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 2);
}
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 3);
}
FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int i3, int i2, int i1, int i0) {
int32_t data[4];
data[0] = i0;
data[1] = i1;
data[2] = i2;
data[3] = i3;
return vreinterpretq_u8_s32(vld1q_s32(data));
};
#define rx_xor_vec_i128 veorq_u8
FORCE_INLINE rx_vec_i128 rx_load_vec_i128(const rx_vec_i128* mem_addr) {
return vld1q_u8((const uint8_t*)mem_addr);
}
FORCE_INLINE void rx_store_vec_i128(rx_vec_i128* mem_addr, rx_vec_i128 val) {
vst1q_u8((uint8_t*)mem_addr, val);
}
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
double lo = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
double hi = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
rx_vec_f128 x;
x = vsetq_lane_f64(lo, x, 0);
x = vsetq_lane_f64(hi, x, 1);
return x;
}
#define RANDOMX_DEFAULT_FENV
#else //portable fallback
#include <cstdint>
#include <stdexcept>
#include <cstdlib>
#include <cmath>
typedef union {
uint64_t u64[2];
uint32_t u32[4];
uint16_t u16[8];
uint8_t u8[16];
} rx_vec_i128;
typedef union {
struct {
double lo;
double hi;
};
rx_vec_i128 i;
} rx_vec_f128;
#define rx_aligned_alloc(a, b) malloc(a)
#define rx_aligned_free(a) free(a)
#define rx_prefetch_nta(x)
#define rx_prefetch_t0(x)
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
rx_vec_f128 x;
x.i.u64[0] = load64(pd + 0);
x.i.u64[1] = load64(pd + 1);
return x;
}
FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
store64(mem_addr + 0, a.i.u64[0]);
store64(mem_addr + 1, a.i.u64[1]);
}
FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
double temp = a.hi;
a.hi = a.lo;
a.lo = temp;
return a;
}
FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.lo = a.lo + b.lo;
x.hi = a.hi + b.hi;
return x;
}
FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.lo = a.lo - b.lo;
x.hi = a.hi - b.hi;
return x;
}
FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.lo = a.lo * b.lo;
x.hi = a.hi * b.hi;
return x;
}
FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.lo = a.lo / b.lo;
x.hi = a.hi / b.hi;
return x;
}
FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
rx_vec_f128 x;
x.lo = rx_sqrt(a.lo);
x.hi = rx_sqrt(a.hi);
return x;
}
FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
rx_vec_i128 x;
x.u64[0] = a;
x.u64[1] = a;
return x;
}
FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
rx_vec_f128 x;
x.i = a;
return x;
}
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
rx_vec_f128 v;
v.i.u64[0] = x0;
v.i.u64[1] = x1;
return v;
}
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
rx_vec_f128 v;
v.i.u64[0] = x;
v.i.u64[1] = x;
return v;
}
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1];
return x;
}
FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.i.u64[0] = a.i.u64[0] & b.i.u64[0];
x.i.u64[1] = a.i.u64[1] & b.i.u64[1];
return x;
}
FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.i.u64[0] = a.i.u64[0] | b.i.u64[0];
x.i.u64[1] = a.i.u64[1] | b.i.u64[1];
return x;
}
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
return a.u32[0];
}
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
return a.u32[1];
}
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
return a.u32[2];
}
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
return a.u32[3];
}
FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int i3, int i2, int i1, int i0) {
rx_vec_i128 v;
v.u32[0] = i0;
v.u32[1] = i1;
v.u32[2] = i2;
v.u32[3] = i3;
return v;
};
FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 a, rx_vec_i128 b) {
rx_vec_i128 c;
c.u32[0] = a.u32[0] ^ b.u32[0];
c.u32[1] = a.u32[1] ^ b.u32[1];
c.u32[2] = a.u32[2] ^ b.u32[2];
c.u32[3] = a.u32[3] ^ b.u32[3];
return c;
}
FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const* p) {
#if defined(NATIVE_LITTLE_ENDIAN)
return *p;
#else
const uint32_t* ptr = (const uint32_t*)p;
rx_vec_i128 c;
c.u32[0] = load32(ptr + 0);
c.u32[1] = load32(ptr + 1);
c.u32[2] = load32(ptr + 2);
c.u32[3] = load32(ptr + 3);
return c;
#endif
}
FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) {
#if defined(NATIVE_LITTLE_ENDIAN)
*p = b;
#else
uint32_t* ptr = (uint32_t*)p;
store32(ptr + 0, b.u32[0]);
store32(ptr + 1, b.u32[1]);
store32(ptr + 2, b.u32[2]);
store32(ptr + 3, b.u32[3]);
#endif
}
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
rx_vec_f128 x;
x.lo = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0));
x.hi = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4));
return x;
}
#define RANDOMX_DEFAULT_FENV
#endif
#ifndef HAVE_AES
static const char* platformError = "Platform doesn't support hardware AES";
#include <stdexcept>
FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
throw std::runtime_error(platformError);
}
FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
throw std::runtime_error(platformError);
}
#define HAVE_AES 0
#endif
#ifdef RANDOMX_DEFAULT_FENV
void rx_reset_float_state();
void rx_set_rounding_mode(uint32_t mode);
uint32_t rx_get_rounding_mode();
#endif
double loadDoublePortable(const void* addr);
uint64_t mulh(uint64_t, uint64_t);
int64_t smulh(int64_t, int64_t);
uint64_t rotl(uint64_t, unsigned int);
uint64_t rotr(uint64_t, unsigned int);

79
crypto/randomx/jit_compiler.hpp Executable file
View file

@ -0,0 +1,79 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "common.hpp"
namespace randomx {
struct CodeBuffer {
uint8_t* code;
int32_t codePos;
int32_t rcpCount;
void emit(const uint8_t* src, int32_t len) {
memcpy(&code[codePos], src, len);
codePos += len;
}
template<typename T>
void emit(T src) {
memcpy(&code[codePos], &src, sizeof(src));
codePos += sizeof(src);
}
void emitAt(int32_t codePos, const uint8_t* src, int32_t len) {
memcpy(&code[codePos], src, len);
}
template<typename T>
void emitAt(int32_t codePos, T src) {
memcpy(&code[codePos], &src, sizeof(src));
}
};
struct CompilerState : public CodeBuffer {
int32_t instructionOffsets[RANDOMX_PROGRAM_SIZE];
int registerUsage[RegistersCount];
};
}
#if defined(RANDOMX_COMPILER_X86)
#include "jit_compiler_x86.hpp"
#elif defined(RANDOMX_COMPILER_A64)
#include "jit_compiler_a64.hpp"
#elif defined(RANDOMX_COMPILER_RV64)
#include "jit_compiler_rv64.hpp"
#else
#include "jit_compiler_fallback.hpp"
#endif
#if defined(__OpenBSD__) || defined(__NetBSD__) || (defined(__APPLE__) && defined(__aarch64__))
#define RANDOMX_FORCE_SECURE
#endif

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,128 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
Copyright (c) 2019, SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <vector>
#include <stdexcept>
#include "common.hpp"
#include "jit_compiler_a64_static.hpp"
namespace randomx {
class Program;
struct ProgramConfiguration;
class SuperscalarProgram;
class Instruction;
typedef void(JitCompilerA64::*InstructionGeneratorA64)(Instruction&, uint32_t&);
class JitCompilerA64 {
public:
JitCompilerA64();
~JitCompilerA64();
void generateProgram(Program&, ProgramConfiguration&);
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
template<size_t N>
void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &);
void generateDatasetInitCode() {}
ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(code); }
DatasetInitFunc* getDatasetInitFunc();
uint8_t* getCode() { return code; }
size_t getCodeSize();
void enableWriting();
void enableExecution();
void enableAll();
private:
static InstructionGeneratorA64 engine[256];
uint32_t reg_changed_offset[8];
uint8_t* code;
uint32_t literalPos;
uint32_t num32bitLiterals;
static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos)
{
*(uint32_t*)(code + codePos) = val;
codePos += sizeof(val);
}
static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos)
{
memcpy(code + codePos, &val, sizeof(val));
codePos += sizeof(val);
}
void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos);
void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos);
template<uint32_t tmp_reg>
void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
template<uint32_t tmp_reg_fp>
void emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
void h_IADD_RS(Instruction&, uint32_t&);
void h_IADD_M(Instruction&, uint32_t&);
void h_ISUB_R(Instruction&, uint32_t&);
void h_ISUB_M(Instruction&, uint32_t&);
void h_IMUL_R(Instruction&, uint32_t&);
void h_IMUL_M(Instruction&, uint32_t&);
void h_IMULH_R(Instruction&, uint32_t&);
void h_IMULH_M(Instruction&, uint32_t&);
void h_ISMULH_R(Instruction&, uint32_t&);
void h_ISMULH_M(Instruction&, uint32_t&);
void h_IMUL_RCP(Instruction&, uint32_t&);
void h_INEG_R(Instruction&, uint32_t&);
void h_IXOR_R(Instruction&, uint32_t&);
void h_IXOR_M(Instruction&, uint32_t&);
void h_IROR_R(Instruction&, uint32_t&);
void h_IROL_R(Instruction&, uint32_t&);
void h_ISWAP_R(Instruction&, uint32_t&);
void h_FSWAP_R(Instruction&, uint32_t&);
void h_FADD_R(Instruction&, uint32_t&);
void h_FADD_M(Instruction&, uint32_t&);
void h_FSUB_R(Instruction&, uint32_t&);
void h_FSUB_M(Instruction&, uint32_t&);
void h_FSCAL_R(Instruction&, uint32_t&);
void h_FMUL_R(Instruction&, uint32_t&);
void h_FDIV_M(Instruction&, uint32_t&);
void h_FSQRT_R(Instruction&, uint32_t&);
void h_CBRANCH(Instruction&, uint32_t&);
void h_CFROUND(Instruction&, uint32_t&);
void h_ISTORE(Instruction&, uint32_t&);
void h_NOP(Instruction&, uint32_t&);
};
}

View file

@ -0,0 +1,589 @@
# Copyright (c) 2018-2019, tevador <tevador@gmail.com>
# Copyright (c) 2019, SChernykh <https://github.com/SChernykh>
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the copyright holder nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__APPLE__)
#define DECL(x) _##x
#else
#define DECL(x) x
#endif
.arch armv8-a
.text
.global DECL(randomx_program_aarch64)
.global DECL(randomx_program_aarch64_main_loop)
.global DECL(randomx_program_aarch64_vm_instructions)
.global DECL(randomx_program_aarch64_imul_rcp_literals_end)
.global DECL(randomx_program_aarch64_vm_instructions_end)
.global DECL(randomx_program_aarch64_cacheline_align_mask1)
.global DECL(randomx_program_aarch64_cacheline_align_mask2)
.global DECL(randomx_program_aarch64_update_spMix1)
.global DECL(randomx_program_aarch64_vm_instructions_end_light)
.global DECL(randomx_program_aarch64_light_cacheline_align_mask)
.global DECL(randomx_program_aarch64_light_dataset_offset)
.global DECL(randomx_init_dataset_aarch64)
.global DECL(randomx_init_dataset_aarch64_end)
.global DECL(randomx_calc_dataset_item_aarch64)
.global DECL(randomx_calc_dataset_item_aarch64_prefetch)
.global DECL(randomx_calc_dataset_item_aarch64_mix)
.global DECL(randomx_calc_dataset_item_aarch64_store_result)
.global DECL(randomx_calc_dataset_item_aarch64_end)
#include "configuration.h"
# Register allocation
# x0 -> pointer to reg buffer and then literal for IMUL_RCP
# x1 -> pointer to mem buffer and then to dataset
# x2 -> pointer to scratchpad
# x3 -> loop counter
# x4 -> "r0"
# x5 -> "r1"
# x6 -> "r2"
# x7 -> "r3"
# x8 -> fpcr (reversed bits)
# x9 -> mx, ma
# x10 -> spMix1
# x11 -> literal for IMUL_RCP
# x12 -> "r4"
# x13 -> "r5"
# x14 -> "r6"
# x15 -> "r7"
# x16 -> spAddr0
# x17 -> spAddr1
# x18 -> unused (platform register, don't touch it)
# x19 -> temporary
# x20 -> temporary
# x21 -> literal for IMUL_RCP
# x22 -> literal for IMUL_RCP
# x23 -> literal for IMUL_RCP
# x24 -> literal for IMUL_RCP
# x25 -> literal for IMUL_RCP
# x26 -> literal for IMUL_RCP
# x27 -> literal for IMUL_RCP
# x28 -> literal for IMUL_RCP
# x29 -> literal for IMUL_RCP
# x30 -> literal for IMUL_RCP
# v0-v15 -> store 32-bit literals
# v16 -> "f0"
# v17 -> "f1"
# v18 -> "f2"
# v19 -> "f3"
# v20 -> "e0"
# v21 -> "e1"
# v22 -> "e2"
# v23 -> "e3"
# v24 -> "a0"
# v25 -> "a1"
# v26 -> "a2"
# v27 -> "a3"
# v28 -> temporary
# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
# v30 -> E 'or' mask = 0x3*00000000******3*00000000******
# v31 -> scale mask = 0x81f000000000000081f0000000000000
.balign 4
DECL(randomx_program_aarch64):
# Save callee-saved registers
sub sp, sp, 192
stp x16, x17, [sp]
str x19, [sp, 16]
stp x20, x21, [sp, 32]
stp x22, x23, [sp, 48]
stp x24, x25, [sp, 64]
stp x26, x27, [sp, 80]
stp x28, x29, [sp, 96]
stp x8, x30, [sp, 112]
stp d8, d9, [sp, 128]
stp d10, d11, [sp, 144]
stp d12, d13, [sp, 160]
stp d14, d15, [sp, 176]
# Zero integer registers
mov x4, xzr
mov x5, xzr
mov x6, xzr
mov x7, xzr
mov x12, xzr
mov x13, xzr
mov x14, xzr
mov x15, xzr
# Load ma, mx and dataset pointer
ldp x9, x1, [x1]
# Load initial spMix value
mov x10, x9
# Load group A registers
ldp q24, q25, [x0, 192]
ldp q26, q27, [x0, 224]
# Load E 'and' mask
mov x16, 0x00FFFFFFFFFFFFFF
ins v29.d[0], x16
ins v29.d[1], x16
# Load E 'or' mask (stored in reg.f[0])
ldr q30, [x0, 64]
# Load scale mask
mov x16, 0x80f0000000000000
ins v31.d[0], x16
ins v31.d[1], x16
# Read fpcr
mrs x8, fpcr
rbit x8, x8
# Save x0
str x0, [sp, -16]!
# Read literals
ldr x0, literal_x0
ldr x11, literal_x11
ldr x21, literal_x21
ldr x22, literal_x22
ldr x23, literal_x23
ldr x24, literal_x24
ldr x25, literal_x25
ldr x26, literal_x26
ldr x27, literal_x27
ldr x28, literal_x28
ldr x29, literal_x29
ldr x30, literal_x30
ldr q0, literal_v0
ldr q1, literal_v1
ldr q2, literal_v2
ldr q3, literal_v3
ldr q4, literal_v4
ldr q5, literal_v5
ldr q6, literal_v6
ldr q7, literal_v7
ldr q8, literal_v8
ldr q9, literal_v9
ldr q10, literal_v10
ldr q11, literal_v11
ldr q12, literal_v12
ldr q13, literal_v13
ldr q14, literal_v14
ldr q15, literal_v15
DECL(randomx_program_aarch64_main_loop):
# spAddr0 = spMix1 & ScratchpadL3Mask64;
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
lsr x20, x10, 32
# Actual mask will be inserted by JIT compiler
and w16, w10, 1
and w17, w20, 1
# x16 = scratchpad + spAddr0
# x17 = scratchpad + spAddr1
add x16, x16, x2
add x17, x17, x2
# xor integer registers with scratchpad data (spAddr0)
ldp x20, x19, [x16]
eor x4, x4, x20
eor x5, x5, x19
ldp x20, x19, [x16, 16]
eor x6, x6, x20
eor x7, x7, x19
ldp x20, x19, [x16, 32]
eor x12, x12, x20
eor x13, x13, x19
ldp x20, x19, [x16, 48]
eor x14, x14, x20
eor x15, x15, x19
# Load group F registers (spAddr1)
ldpsw x20, x19, [x17]
ins v16.d[0], x20
ins v16.d[1], x19
ldpsw x20, x19, [x17, 8]
ins v17.d[0], x20
ins v17.d[1], x19
ldpsw x20, x19, [x17, 16]
ins v18.d[0], x20
ins v18.d[1], x19
ldpsw x20, x19, [x17, 24]
ins v19.d[0], x20
ins v19.d[1], x19
scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d
# Load group E registers (spAddr1)
ldpsw x20, x19, [x17, 32]
ins v20.d[0], x20
ins v20.d[1], x19
ldpsw x20, x19, [x17, 40]
ins v21.d[0], x20
ins v21.d[1], x19
ldpsw x20, x19, [x17, 48]
ins v22.d[0], x20
ins v22.d[1], x19
ldpsw x20, x19, [x17, 56]
ins v23.d[0], x20
ins v23.d[1], x19
scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d
scvtf v23.2d, v23.2d
and v20.16b, v20.16b, v29.16b
and v21.16b, v21.16b, v29.16b
and v22.16b, v22.16b, v29.16b
and v23.16b, v23.16b, v29.16b
orr v20.16b, v20.16b, v30.16b
orr v21.16b, v21.16b, v30.16b
orr v22.16b, v22.16b, v30.16b
orr v23.16b, v23.16b, v30.16b
# Execute VM instructions
DECL(randomx_program_aarch64_vm_instructions):
# buffer for generated instructions
# FDIV_M is the largest instruction taking up to 12 ARMv8 instructions
.fill RANDOMX_PROGRAM_SIZE*12,4,0
literal_x0: .fill 1,8,0
literal_x11: .fill 1,8,0
literal_x21: .fill 1,8,0
literal_x22: .fill 1,8,0
literal_x23: .fill 1,8,0
literal_x24: .fill 1,8,0
literal_x25: .fill 1,8,0
literal_x26: .fill 1,8,0
literal_x27: .fill 1,8,0
literal_x28: .fill 1,8,0
literal_x29: .fill 1,8,0
literal_x30: .fill 1,8,0
DECL(randomx_program_aarch64_imul_rcp_literals_end):
literal_v0: .fill 2,8,0
literal_v1: .fill 2,8,0
literal_v2: .fill 2,8,0
literal_v3: .fill 2,8,0
literal_v4: .fill 2,8,0
literal_v5: .fill 2,8,0
literal_v6: .fill 2,8,0
literal_v7: .fill 2,8,0
literal_v8: .fill 2,8,0
literal_v9: .fill 2,8,0
literal_v10: .fill 2,8,0
literal_v11: .fill 2,8,0
literal_v12: .fill 2,8,0
literal_v13: .fill 2,8,0
literal_v14: .fill 2,8,0
literal_v15: .fill 2,8,0
DECL(randomx_program_aarch64_vm_instructions_end):
# Calculate dataset pointer for dataset read
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
lsr x10, x9, 32
# mx ^= r[readReg2] ^ r[readReg3];
eor x9, x9, x20
# Calculate dataset pointer for dataset prefetch
mov w20, w9
DECL(randomx_program_aarch64_cacheline_align_mask1):
# Actual mask will be inserted by JIT compiler
and x20, x20, 1
add x20, x20, x1
# Prefetch dataset data
prfm pldl2strm, [x20]
# mx <-> ma
ror x9, x9, 32
DECL(randomx_program_aarch64_cacheline_align_mask2):
# Actual mask will be inserted by JIT compiler
and x10, x10, 1
add x10, x10, x1
DECL(randomx_program_aarch64_xor_with_dataset_line):
rx_program_xor_with_dataset_line:
# xor integer registers with dataset data
ldp x20, x19, [x10]
eor x4, x4, x20
eor x5, x5, x19
ldp x20, x19, [x10, 16]
eor x6, x6, x20
eor x7, x7, x19
ldp x20, x19, [x10, 32]
eor x12, x12, x20
eor x13, x13, x19
ldp x20, x19, [x10, 48]
eor x14, x14, x20
eor x15, x15, x19
DECL(randomx_program_aarch64_update_spMix1):
# JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1"
eor x10, x0, x0
# Store integer registers to scratchpad (spAddr1)
stp x4, x5, [x17, 0]
stp x6, x7, [x17, 16]
stp x12, x13, [x17, 32]
stp x14, x15, [x17, 48]
# xor group F and group E registers
eor v16.16b, v16.16b, v20.16b
eor v17.16b, v17.16b, v21.16b
eor v18.16b, v18.16b, v22.16b
eor v19.16b, v19.16b, v23.16b
# Store FP registers to scratchpad (spAddr0)
stp q16, q17, [x16, 0]
stp q18, q19, [x16, 32]
subs x3, x3, 1
bne DECL(randomx_program_aarch64_main_loop)
# Restore x0
ldr x0, [sp], 16
# Store integer registers
stp x4, x5, [x0, 0]
stp x6, x7, [x0, 16]
stp x12, x13, [x0, 32]
stp x14, x15, [x0, 48]
# Store FP registers
stp q16, q17, [x0, 64]
stp q18, q19, [x0, 96]
stp q20, q21, [x0, 128]
stp q22, q23, [x0, 160]
# Restore callee-saved registers
ldp x16, x17, [sp]
ldr x19, [sp, 16]
ldp x20, x21, [sp, 32]
ldp x22, x23, [sp, 48]
ldp x24, x25, [sp, 64]
ldp x26, x27, [sp, 80]
ldp x28, x29, [sp, 96]
ldp x8, x30, [sp, 112]
ldp d8, d9, [sp, 128]
ldp d10, d11, [sp, 144]
ldp d12, d13, [sp, 160]
ldp d14, d15, [sp, 176]
add sp, sp, 192
ret
DECL(randomx_program_aarch64_vm_instructions_end_light):
sub sp, sp, 96
stp x0, x1, [sp, 64]
stp x2, x30, [sp, 80]
# mx ^= r[readReg2] ^ r[readReg3];
eor x9, x9, x20
# mx <-> ma
ror x9, x9, 32
# x0 -> pointer to cache memory
mov x0, x1
# x1 -> pointer to output
mov x1, sp
DECL(randomx_program_aarch64_light_cacheline_align_mask):
# Actual mask will be inserted by JIT compiler
and w2, w9, 1
# x2 -> item number
lsr x2, x2, 6
DECL(randomx_program_aarch64_light_dataset_offset):
# Apply dataset offset (filled in by JIT compiler)
add x2, x2, 0
add x2, x2, 0
bl rx_calc_dataset_item
mov x10, sp
ldp x0, x1, [sp, 64]
ldp x2, x30, [sp, 80]
add sp, sp, 96
b rx_program_xor_with_dataset_line
# Input parameters
#
# x0 -> pointer to cache
# x1 -> pointer to dataset memory at startItem
# x2 -> start item
# x3 -> end item
DECL(randomx_init_dataset_aarch64):
# Save x20 (used as temporary, but must be saved to not break ABI) and x30 (return address)
stp x20, x30, [sp, -16]!
# Load pointer to cache memory
ldr x0, [x0]
DECL(randomx_init_dataset_aarch64_main_loop):
bl rx_calc_dataset_item
add x1, x1, 64
add x2, x2, 1
cmp x2, x3
bne DECL(randomx_init_dataset_aarch64_main_loop)
# Restore x20 and x30
ldp x20, x30, [sp], 16
ret
DECL(randomx_init_dataset_aarch64_end):
# Input parameters
#
# x0 -> pointer to cache memory
# x1 -> pointer to output
# x2 -> item number
#
# Register allocation
#
# x0-x7 -> output value (calculated dataset item)
# x8 -> pointer to cache memory
# x9 -> pointer to output
# x10 -> registerValue
# x11 -> mixBlock
# x12 -> temporary
# x13 -> temporary
DECL(randomx_calc_dataset_item_aarch64):
rx_calc_dataset_item:
sub sp, sp, 112
stp x0, x1, [sp]
stp x2, x3, [sp, 16]
stp x4, x5, [sp, 32]
stp x6, x7, [sp, 48]
stp x8, x9, [sp, 64]
stp x10, x11, [sp, 80]
stp x12, x13, [sp, 96]
ldr x12, superscalarMul0
mov x8, x0
mov x9, x1
mov x10, x2
# rl[0] = (itemNumber + 1) * superscalarMul0;
madd x0, x2, x12, x12
# rl[1] = rl[0] ^ superscalarAdd1;
ldr x12, superscalarAdd1
eor x1, x0, x12
# rl[2] = rl[0] ^ superscalarAdd2;
ldr x12, superscalarAdd2
eor x2, x0, x12
# rl[3] = rl[0] ^ superscalarAdd3;
ldr x12, superscalarAdd3
eor x3, x0, x12
# rl[4] = rl[0] ^ superscalarAdd4;
ldr x12, superscalarAdd4
eor x4, x0, x12
# rl[5] = rl[0] ^ superscalarAdd5;
ldr x12, superscalarAdd5
eor x5, x0, x12
# rl[6] = rl[0] ^ superscalarAdd6;
ldr x12, superscalarAdd6
eor x6, x0, x12
# rl[7] = rl[0] ^ superscalarAdd7;
ldr x12, superscalarAdd7
eor x7, x0, x12
b rx_calc_dataset_item_prefetch
superscalarMul0: .quad 6364136223846793005
superscalarAdd1: .quad 9298411001130361340
superscalarAdd2: .quad 12065312585734608966
superscalarAdd3: .quad 9306329213124626780
superscalarAdd4: .quad 5281919268842080866
superscalarAdd5: .quad 10536153434571861004
superscalarAdd6: .quad 3398623926847679864
superscalarAdd7: .quad 9549104520008361294
# Prefetch -> SuperScalar hash -> Mix will be repeated N times
DECL(randomx_calc_dataset_item_aarch64_prefetch):
rx_calc_dataset_item_prefetch:
# Actual mask will be inserted by JIT compiler
and x11, x10, 1
add x11, x8, x11, lsl 6
prfm pldl2strm, [x11]
# Generated SuperScalar hash program goes here
DECL(randomx_calc_dataset_item_aarch64_mix):
ldp x12, x13, [x11]
eor x0, x0, x12
eor x1, x1, x13
ldp x12, x13, [x11, 16]
eor x2, x2, x12
eor x3, x3, x13
ldp x12, x13, [x11, 32]
eor x4, x4, x12
eor x5, x5, x13
ldp x12, x13, [x11, 48]
eor x6, x6, x12
eor x7, x7, x13
DECL(randomx_calc_dataset_item_aarch64_store_result):
stp x0, x1, [x9]
stp x2, x3, [x9, 16]
stp x4, x5, [x9, 32]
stp x6, x7, [x9, 48]
ldp x0, x1, [sp]
ldp x2, x3, [sp, 16]
ldp x4, x5, [sp, 32]
ldp x6, x7, [sp, 48]
ldp x8, x9, [sp, 64]
ldp x10, x11, [sp, 80]
ldp x12, x13, [sp, 96]
add sp, sp, 112
ret
DECL(randomx_calc_dataset_item_aarch64_end):

View file

@ -0,0 +1,51 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
Copyright (c) 2019, SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
extern "C" {
void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
void randomx_program_aarch64_main_loop();
void randomx_program_aarch64_vm_instructions();
void randomx_program_aarch64_imul_rcp_literals_end();
void randomx_program_aarch64_vm_instructions_end();
void randomx_program_aarch64_cacheline_align_mask1();
void randomx_program_aarch64_cacheline_align_mask2();
void randomx_program_aarch64_update_spMix1();
void randomx_program_aarch64_vm_instructions_end_light();
void randomx_program_aarch64_light_cacheline_align_mask();
void randomx_program_aarch64_light_dataset_offset();
void randomx_init_dataset_aarch64();
void randomx_init_dataset_aarch64_end();
void randomx_calc_dataset_item_aarch64();
void randomx_calc_dataset_item_aarch64_prefetch();
void randomx_calc_dataset_item_aarch64_mix();
void randomx_calc_dataset_item_aarch64_store_result();
void randomx_calc_dataset_item_aarch64_end();
}

View file

@ -0,0 +1,76 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <vector>
#include <stdexcept>
#include "common.hpp"
namespace randomx {
class Program;
struct ProgramConfiguration;
class SuperscalarProgram;
class JitCompilerFallback {
public:
JitCompilerFallback() {
throw std::runtime_error("JIT compilation is not supported on this platform");
}
void generateProgram(Program&, ProgramConfiguration&) {
}
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) {
}
template<size_t N>
void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &) {
}
void generateDatasetInitCode() {
}
ProgramFunc* getProgramFunc() {
return nullptr;
}
DatasetInitFunc* getDatasetInitFunc() {
return nullptr;
}
uint8_t* getCode() {
return nullptr;
}
size_t getCodeSize() {
return 0;
}
void enableWriting() {}
void enableExecution() {}
void enableAll() {}
};
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,78 @@
/*
Copyright (c) 2023 tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <cstring>
#include <vector>
#include "jit_compiler.hpp"
namespace randomx {
class Program;
struct ProgramConfiguration;
class SuperscalarProgram;
class Instruction;
class JitCompilerRV64 {
public:
JitCompilerRV64();
~JitCompilerRV64();
void generateProgram(Program&, ProgramConfiguration&);
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
void generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector<uint64_t>&);
void generateDatasetInitCode() {}
ProgramFunc* getProgramFunc() {
return (ProgramFunc*)(vectorCode ? entryProgramVector : entryProgram);
}
DatasetInitFunc* getDatasetInitFunc() {
return (DatasetInitFunc*)((vectorCode && (vectorRegisterLength >= 256)) ? entryDataInitVector : entryDataInit);
}
uint8_t* getCode() {
return vectorCode ? vectorCode : state.code;
}
size_t getCodeSize();
void enableWriting();
void enableExecution();
void enableAll();
static uint8_t instMap[256];
private:
CompilerState state;
uint8_t* vectorCode = nullptr;
size_t vectorCodeSize = 0;
int vectorRegisterLength = 0;
void* entryDataInit = nullptr;
void* entryDataInitVector = nullptr;
void* entryProgram = nullptr;
void* entryProgramVector = nullptr;
};
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,53 @@
/*
Copyright (c) 2023 tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
extern "C" {
void randomx_riscv64_literals();
void randomx_riscv64_literals_end();
void randomx_riscv64_data_init();
void randomx_riscv64_fix_data_call();
void randomx_riscv64_prologue();
void randomx_riscv64_loop_begin();
void randomx_riscv64_data_read();
void randomx_riscv64_data_read_light();
void randomx_riscv64_fix_loop_call();
void randomx_riscv64_spad_store();
void randomx_riscv64_spad_store_hardaes();
void randomx_riscv64_spad_store_softaes();
void randomx_riscv64_loop_end();
void randomx_riscv64_fix_continue_loop();
void randomx_riscv64_epilogue();
void randomx_riscv64_softaes();
void randomx_riscv64_program_end();
void randomx_riscv64_ssh_init();
void randomx_riscv64_ssh_load();
void randomx_riscv64_ssh_prefetch();
void randomx_riscv64_ssh_end();
}

View file

@ -0,0 +1,903 @@
/*
Copyright (c) 2023, tevador <tevador@gmail.com>
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "configuration.h"
#include "jit_compiler_rv64_vector.h"
#include "jit_compiler_rv64_vector_static.h"
#include "reciprocal.h"
#include "superscalar.hpp"
#include "program.hpp"
namespace randomx {
constexpr int maskLog2(uint32_t x, int prev) {
return x == 1 ? prev : maskLog2(x >> 1, prev + 1);
}
constexpr int MaskL1Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L1, 0);
constexpr int MaskL2Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L2, 0);
constexpr int MaskL3Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L3, 0);
#define ADDR(x) ((uint8_t*) &(x))
#define DIST(x, y) (ADDR(y) - ADDR(x))
void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs, std::vector<uint64_t>& reciprocalCache)
{
uint8_t* p = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions);
uint8_t* literals = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_imul_rcp_literals);
uint8_t* cur_literal = literals;
for (size_t i = 0; i < num_programs; ++i) {
// Step 4
size_t k = DIST(randomx_riscv64_vector_sshash_cache_prefetch, randomx_riscv64_vector_sshash_xor);
memcpy(p, reinterpret_cast<void*>(randomx_riscv64_vector_sshash_cache_prefetch), k);
p += k;
// Step 5
for (uint32_t j = 0; j < programs[i].size; ++j) {
const uint32_t dst = programs[i].programBuffer[j].dst & 7;
const uint32_t src = programs[i].programBuffer[j].src & 7;
const uint32_t modShift = (programs[i].programBuffer[j].mod >> 2) & 3;
const uint32_t imm32 = programs[i].programBuffer[j].imm32;
uint32_t inst;
#define EMIT(data) inst = (data); memcpy(p, &inst, 4); p += 4
switch (static_cast<SuperscalarInstructionType>(programs[i].programBuffer[j].opcode)) {
case SuperscalarInstructionType::ISUB_R:
// 57 00 00 0A vsub.vv v0, v0, v0
EMIT(0x0A000057 | (dst << 7) | (src << 15) | (dst << 20));
break;
case SuperscalarInstructionType::IXOR_R:
// 57 00 00 2E vxor.vv v0, v0, v0
EMIT(0x2E000057 | (dst << 7) | (src << 15) | (dst << 20));
break;
case SuperscalarInstructionType::IADD_RS:
if (modShift == 0) {
// 57 00 00 02 vadd.vv v0, v0, v0
EMIT(0x02000057 | (dst << 7) | (src << 15) | (dst << 20));
}
else {
// 57 39 00 96 vsll.vi v18, v0, 0
// 57 00 09 02 vadd.vv v0, v0, v18
EMIT(0x96003957 | (modShift << 15) | (src << 20));
EMIT(0x02090057 | (dst << 7) | (dst << 20));
}
break;
case SuperscalarInstructionType::IMUL_R:
// 57 20 00 96 vmul.vv v0, v0, v0
EMIT(0x96002057 | (dst << 7) | (src << 15) | (dst << 20));
break;
case SuperscalarInstructionType::IROR_C:
{
#ifdef __riscv_zvkb
// 57 30 00 52 vror.vi v0, v0, 0
EMIT(0x52003057 | (dst << 7) | (dst << 20) | ((imm32 & 31) << 15) | ((imm32 & 32) << 21));
#else // __riscv_zvkb
const uint32_t shift_right = imm32 & 63;
const uint32_t shift_left = 64 - shift_right;
if (shift_right < 32) {
// 57 39 00 A2 vsrl.vi v18, v0, 0
EMIT(0xA2003957 | (shift_right << 15) | (dst << 20));
}
else {
// 93 02 00 00 li x5, 0
// 57 C9 02 A2 vsrl.vx v18, v0, x5
EMIT(0x00000293 | (shift_right << 20));
EMIT(0xA202C957 | (dst << 20));
}
if (shift_left < 32) {
// 57 30 00 96 vsll.vi v0, v0, 0
EMIT(0x96003057 | (dst << 7) | (shift_left << 15) | (dst << 20));
}
else {
// 93 02 00 00 li x5, 0
// 57 C0 02 96 vsll.vx v0, v0, x5
EMIT(0x00000293 | (shift_left << 20));
EMIT(0x9602C057 | (dst << 7) | (dst << 20));
}
// 57 00 20 2B vor.vv v0, v18, v0
EMIT(0x2B200057 | (dst << 7) | (dst << 15));
#endif // __riscv_zvkb
}
break;
case SuperscalarInstructionType::IADD_C7:
case SuperscalarInstructionType::IADD_C8:
case SuperscalarInstructionType::IADD_C9:
// B7 02 00 00 lui x5, 0
// 9B 82 02 00 addiw x5, x5, 0
// 57 C0 02 02 vadd.vx v0, v0, x5
EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000));
EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20));
EMIT(0x0202C057 | (dst << 7) | (dst << 20));
break;
case SuperscalarInstructionType::IXOR_C7:
case SuperscalarInstructionType::IXOR_C8:
case SuperscalarInstructionType::IXOR_C9:
// B7 02 00 00 lui x5, 0
// 9B 82 02 00 addiw x5, x5, 0
// 57 C0 02 2E vxor.vx v0, v0, x5
EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000));
EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20));
EMIT(0x2E02C057 | (dst << 7) | (dst << 20));
break;
case SuperscalarInstructionType::IMULH_R:
// 57 20 00 92 vmulhu.vv v0, v0, v0
EMIT(0x92002057 | (dst << 7) | (src << 15) | (dst << 20));
break;
case SuperscalarInstructionType::ISMULH_R:
// 57 20 00 9E vmulh.vv v0, v0, v0
EMIT(0x9E002057 | (dst << 7) | (src << 15) | (dst << 20));
break;
case SuperscalarInstructionType::IMUL_RCP:
{
uint32_t offset = cur_literal - literals;
if (offset == 2040) {
literals += 2040;
offset = 0;
// 93 87 87 7F add x15, x15, 2040
EMIT(0x7F878793);
}
const uint64_t r = reciprocalCache[imm32];
memcpy(cur_literal, &r, 8);
cur_literal += 8;
// 83 B2 07 00 ld x5, (x15)
// 57 E0 02 96 vmul.vx v0, v0, x5
EMIT(0x0007B283 | (offset << 20));
EMIT(0x9602E057 | (dst << 7) | (dst << 20));
}
break;
default:
UNREACHABLE;
}
}
// Step 6
k = DIST(randomx_riscv64_vector_sshash_xor, randomx_riscv64_vector_sshash_end);
memcpy(p, reinterpret_cast<void*>(randomx_riscv64_vector_sshash_xor), k);
p += k;
// Step 7. Set cacheIndex to the value of the register that has the longest dependency chain in the SuperscalarHash function executed in step 5.
if (i + 1 < num_programs) {
// vmv.v.v v9, v0 + programs[i].getAddressRegister()
const uint32_t t = 0x5E0004D7 + (static_cast<uint32_t>(programs[i].getAddressRegister()) << 15);
memcpy(p, &t, 4);
p += 4;
}
}
// Emit "J randomx_riscv64_vector_sshash_generated_instructions_end" instruction
const uint8_t* e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions_end);
const uint32_t k = e - p;
const uint32_t j = 0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000);
memcpy(p, &j, 4);
char* result = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_dataset_init));
#ifdef __GNUC__
__builtin___clear_cache(result, (char*)(buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_end)));
#endif
return result;
}
#define emit16(value) { const uint16_t t = value; memcpy(p, &t, 2); p += 2; }
#define emit32(value) { const uint32_t t = value; memcpy(p, &t, 4); p += 4; }
#define emit64(value) { const uint64_t t = value; memcpy(p, &t, 8); p += 8; }
#define emit_data(arr) { memcpy(p, arr, sizeof(arr)); p += sizeof(arr); }
static void imm_to_x5(uint32_t imm, uint8_t*& p)
{
const uint32_t imm_hi = (imm + ((imm & 0x800) << 1)) & 0xFFFFF000U;
const uint32_t imm_lo = imm & 0x00000FFFU;
if (imm_hi == 0) {
// li x5, imm_lo
emit32(0x00000293 + (imm_lo << 20));
return;
}
if (imm_lo == 0) {
// lui x5, imm_hi
emit32(0x000002B7 + imm_hi);
return;
}
if (imm_hi < (32 << 12)) {
//c.lui x5, imm_hi
emit16(0x6281 + (imm_hi >> 10));
}
else {
// lui x5, imm_hi
emit32(0x000002B7 + imm_hi);
}
// addiw x5, x5, imm_lo
emit32(0x0002829B | (imm_lo << 20));
}
static void loadFromScratchpad(uint32_t src, uint32_t dst, uint32_t mod, uint32_t imm, uint8_t*& p)
{
if (src == dst) {
imm &= RANDOMX_SCRATCHPAD_L3 - 8;
if (imm <= 2047) {
// ld x5, imm(x12)
emit32(0x00063283 | (imm << 20));
}
else if (imm <= 2047 * 2) {
// addi x5, x12, 2047
emit32(0x7FF60293);
// ld x5, (imm - 2047)(x5)
emit32(0x0002B283 | ((imm - 2047) << 20));
}
else {
// lui x5, imm & 0xFFFFF000U
emit32(0x000002B7 | ((imm + ((imm & 0x800) << 1)) & 0xFFFFF000U));
// c.add x5, x12
emit16(0x92B2);
// ld x5, (imm & 0xFFF)(x5)
emit32(0x0002B283 | ((imm & 0xFFF) << 20));
}
return;
}
uint32_t shift;
uint32_t mask_reg;
if ((mod & 3) == 0) {
shift = MaskL2Shift;
mask_reg = 17;
}
else {
shift = MaskL1Shift;
mask_reg = 16;
}
imm = static_cast<uint32_t>(static_cast<int32_t>(imm << shift) >> shift);
// 0-0x7FF, 0xFFFFF800-0xFFFFFFFF fit into 12 bit (a single addi instruction)
if (imm - 0xFFFFF800U < 0x1000U) {
// addi x5, x20 + src, imm
emit32(0x000A0293 + (src << 15) + (imm << 20));
}
else {
imm_to_x5(imm, p);
// c.add x5, x20 + src
emit16(0x92D2 + (src << 2));
}
// and x5, x5, mask_reg
emit32(0x0002F2B3 + (mask_reg << 20));
// c.add x5, x12
emit16(0x92B2);
// ld x5, 0(x5)
emit32(0x0002B283);
}
void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset)
{
uint64_t* params = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params));
params[0] = RANDOMX_SCRATCHPAD_L1 - 8;
params[1] = RANDOMX_SCRATCHPAD_L2 - 8;
params[2] = RANDOMX_SCRATCHPAD_L3 - 8;
params[3] = RANDOMX_DATASET_BASE_SIZE - 64;
params[4] = (1 << RANDOMX_JUMP_BITS) - 1;
uint64_t* imul_rcp_literals = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_imul_rcp_literals));
uint64_t* cur_literal = imul_rcp_literals;
uint32_t* spaddr_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_spaddr_xor));
uint32_t* spaddr_xor2 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch));
uint32_t* mx_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor));
uint32_t* mx_xor_light = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor_light_mode));
*spaddr_xor = 0x014A47B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x15, readReg0, readReg1
*spaddr_xor2 = 0x014A42B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x5, readReg0, readReg1
const uint32_t mx_xor_value = 0x014A42B3 + (pcfg.readReg2 << 15) + (pcfg.readReg3 << 20); // xor x5, readReg2, readReg3
*mx_xor = mx_xor_value;
*mx_xor_light = mx_xor_value;
if (entryDataInitScalar) {
void* light_mode_data = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_light_mode_data);
const uint64_t data[2] = { reinterpret_cast<uint64_t>(entryDataInitScalar), datasetOffset };
memcpy(light_mode_data, &data, sizeof(data));
}
uint8_t* p = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions));
// 57C8025E vmv.v.x v16, x5
// 57A9034B vsext.vf2 v18, v16
// 5798214B vfcvt.f.x.v v16, v18
static constexpr uint8_t group_f_convert[] = {
0x57, 0xC8, 0x02, 0x5E, 0x57, 0xA9, 0x03, 0x4B, 0x57, 0x98, 0x21, 0x4B
};
// 57080627 vand.vv v16, v16, v12
// 5788062B vor.vv v16, v16, v13
static constexpr uint8_t group_e_post_process[] = { 0x57, 0x08, 0x06, 0x27, 0x57, 0x88, 0x06, 0x2B };
uint8_t* last_modified[RegistersCount] = { p, p, p, p, p, p, p, p };
uint8_t readReg01[RegistersCount] = {};
readReg01[pcfg.readReg0] = 1;
readReg01[pcfg.readReg1] = 1;
uint32_t scratchpad_prefetch_pos = 0;
for (int32_t i = static_cast<int32_t>(prog.getSize()) - 1; i >= 0; --i) {
Instruction instr = prog(i);
const InstructionType inst_type = static_cast<InstructionType>(inst_map[instr.opcode]);
if (inst_type == InstructionType::CBRANCH) {
scratchpad_prefetch_pos = i;
break;
}
if (inst_type < InstructionType::FSWAP_R) {
const uint32_t src = instr.src % RegistersCount;
const uint32_t dst = instr.dst % RegistersCount;
if ((inst_type == InstructionType::ISWAP_R) && (src != dst) && (readReg01[src] || readReg01[dst])) {
scratchpad_prefetch_pos = i;
break;
}
if ((inst_type == InstructionType::IMUL_RCP) && readReg01[dst] && !isZeroOrPowerOf2(instr.getImm32())) {
scratchpad_prefetch_pos = i;
break;
}
if (readReg01[dst]) {
scratchpad_prefetch_pos = i;
break;
}
}
}
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
Instruction instr = prog(i);
uint32_t src = instr.src % RegistersCount;
uint32_t dst = instr.dst % RegistersCount;
const uint32_t shift = instr.getModShift();
uint32_t imm = instr.getImm32();
const uint32_t mod = instr.mod;
switch (static_cast<InstructionType>(inst_map[instr.opcode])) {
case InstructionType::IADD_RS:
if (shift == 0) {
// c.add x20 + dst, x20 + src
emit16(0x9A52 + (src << 2) + (dst << 7));
}
else {
#ifdef __riscv_zba
// sh{shift}add x20 + dst, x20 + src, x20 + dst
emit32(0x214A0A33 + (shift << 13) + (dst << 7) + (src << 15) + (dst << 20));
#else // __riscv_zba
// slli x5, x20 + src, shift
emit32(0x000A1293 + (src << 15) + (shift << 20));
// c.add x20 + dst, x5
emit16(0x9A16 + (dst << 7));
#endif // __riscv_zba
}
if (dst == RegisterNeedsDisplacement) {
imm_to_x5(imm, p);
// c.add x20 + dst, x5
emit16(0x9A16 + (dst << 7));
}
last_modified[dst] = p;
break;
case InstructionType::IADD_M:
loadFromScratchpad(src, dst, mod, imm, p);
// c.add x20 + dst, x5
emit16(0x9A16 + (dst << 7));
last_modified[dst] = p;
break;
case InstructionType::ISUB_R:
if (src != dst) {
// sub x20 + dst, x20 + dst, x20 + src
emit32(0x414A0A33 + (dst << 7) + (dst << 15) + (src << 20));
}
else {
imm_to_x5(-imm, p);
// c.add x20 + dst, x5
emit16(0x9A16 + (dst << 7));
}
last_modified[dst] = p;
break;
case InstructionType::ISUB_M:
loadFromScratchpad(src, dst, mod, imm, p);
// sub x20 + dst, x20 + dst, x5
emit32(0x405A0A33 + (dst << 7) + (dst << 15));
last_modified[dst] = p;
break;
case InstructionType::IMUL_R:
if (src != dst) {
// mul x20 + dst, x20 + dst, x20 + src
emit32(0x034A0A33 + (dst << 7) + (dst << 15) + (src << 20));
}
else {
imm_to_x5(imm, p);
// mul x20 + dst, x20 + dst, x5
emit32(0x025A0A33 + (dst << 7) + (dst << 15));
}
last_modified[dst] = p;
break;
case InstructionType::IMUL_M:
loadFromScratchpad(src, dst, mod, imm, p);
// mul x20 + dst, x20 + dst, x5
emit32(0x025A0A33 + (dst << 7) + (dst << 15));
last_modified[dst] = p;
break;
case InstructionType::IMULH_R:
// mulhu x20 + dst, x20 + dst, x20 + src
emit32(0x034A3A33 + (dst << 7) + (dst << 15) + (src << 20));
last_modified[dst] = p;
break;
case InstructionType::IMULH_M:
loadFromScratchpad(src, dst, mod, imm, p);
// mulhu x20 + dst, x20 + dst, x5
emit32(0x025A3A33 + (dst << 7) + (dst << 15));
last_modified[dst] = p;
break;
case InstructionType::ISMULH_R:
// mulh x20 + dst, x20 + dst, x20 + src
emit32(0x034A1A33 + (dst << 7) + (dst << 15) + (src << 20));
last_modified[dst] = p;
break;
case InstructionType::ISMULH_M:
loadFromScratchpad(src, dst, mod, imm, p);
// mulh x20 + dst, x20 + dst, x5
emit32(0x025A1A33 + (dst << 7) + (dst << 15));
last_modified[dst] = p;
break;
case InstructionType::IMUL_RCP:
if (!isZeroOrPowerOf2(imm)) {
const uint64_t offset = (cur_literal - imul_rcp_literals) * 8;
*(cur_literal++) = randomx_reciprocal_fast(imm);
static constexpr uint32_t rcp_regs[26] = {
/* Integer */ 8, 10, 28, 29, 30, 31,
/* Float */ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 28, 29, 30, 31
};
if (offset < 6 * 8) {
// mul x20 + dst, x20 + dst, rcp_reg
emit32(0x020A0A33 + (dst << 7) + (dst << 15) + (rcp_regs[offset / 8] << 20));
}
else if (offset < 26 * 8) {
// fmv.x.d x5, rcp_reg
emit32(0xE20002D3 + (rcp_regs[offset / 8] << 15));
// mul x20 + dst, x20 + dst, x5
emit32(0x025A0A33 + (dst << 7) + (dst << 15));
}
else {
// ld x5, offset(x18)
emit32(0x00093283 + (offset << 20));
// mul x20 + dst, x20 + dst, x5
emit32(0x025A0A33 + (dst << 7) + (dst << 15));
}
last_modified[dst] = p;
}
break;
case InstructionType::INEG_R:
// sub x20 + dst, x0, x20 + dst
emit32(0x41400A33 + (dst << 7) + (dst << 20));
last_modified[dst] = p;
break;
case InstructionType::IXOR_R:
if (src != dst) {
// xor x20 + dst, x20 + dst, x20 + src
emit32(0x014A4A33 + (dst << 7) + (dst << 15) + (src << 20));
}
else {
imm_to_x5(imm, p);
// xor x20, x20, x5
emit32(0x005A4A33 + (dst << 7) + (dst << 15));
}
last_modified[dst] = p;
break;
case InstructionType::IXOR_M:
loadFromScratchpad(src, dst, mod, imm, p);
// xor x20, x20, x5
emit32(0x005A4A33 + (dst << 7) + (dst << 15));
last_modified[dst] = p;
break;
#ifdef __riscv_zbb
case InstructionType::IROR_R:
if (src != dst) {
// ror x20 + dst, x20 + dst, x20 + src
emit32(0x614A5A33 + (dst << 7) + (dst << 15) + (src << 20));
}
else {
// rori x20 + dst, x20 + dst, imm
emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((imm & 63) << 20));
}
last_modified[dst] = p;
break;
case InstructionType::IROL_R:
if (src != dst) {
// rol x20 + dst, x20 + dst, x20 + src
emit32(0x614A1A33 + (dst << 7) + (dst << 15) + (src << 20));
}
else {
// rori x20 + dst, x20 + dst, -imm
emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((-imm & 63) << 20));
}
last_modified[dst] = p;
break;
#else // __riscv_zbb
case InstructionType::IROR_R:
if (src != dst) {
// sub x5, x0, x20 + src
emit32(0x414002B3 + (src << 20));
// srl x6, x20 + dst, x20 + src
emit32(0x014A5333 + (dst << 15) + (src << 20));
// sll x20 + dst, x20 + dst, x5
emit32(0x005A1A33 + (dst << 7) + (dst << 15));
// or x20 + dst, x20 + dst, x6
emit32(0x006A6A33 + (dst << 7) + (dst << 15));
}
else {
// srli x5, x20 + dst, imm
emit32(0x000A5293 + (dst << 15) + ((imm & 63) << 20));
// slli x6, x20 + dst, -imm
emit32(0x000A1313 + (dst << 15) + ((-imm & 63) << 20));
// or x20 + dst, x5, x6
emit32(0x0062EA33 + (dst << 7));
}
last_modified[dst] = p;
break;
case InstructionType::IROL_R:
if (src != dst) {
// sub x5, x0, x20 + src
emit32(0x414002B3 + (src << 20));
// sll x6, x20 + dst, x20 + src
emit32(0x014A1333 + (dst << 15) + (src << 20));
// srl x20 + dst, x20 + dst, x5
emit32(0x005A5A33 + (dst << 7) + (dst << 15));
// or x20 + dst, x20 + dst, x6
emit32(0x006A6A33 + (dst << 7) + (dst << 15));
}
else {
// srli x5, x20 + dst, -imm
emit32(0x000A5293 + (dst << 15) + ((-imm & 63) << 20));
// slli x6, x20 + dst, imm
emit32(0x000A1313 + (dst << 15) + ((imm & 63) << 20));
// or x20 + dst, x5, x6
emit32(0x0062EA33 + (dst << 7));
}
last_modified[dst] = p;
break;
#endif // __riscv_zbb
case InstructionType::ISWAP_R:
if (src != dst) {
// c.mv x5, x20 + dst
emit16(0x82D2 + (dst << 2));
// c.mv x20 + dst, x20 + src
emit16(0x8A52 + (src << 2) + (dst << 7));
// c.mv x20 + src, x5
emit16(0x8A16 + (src << 7));
last_modified[src] = p;
last_modified[dst] = p;
}
break;
case InstructionType::FSWAP_R:
// vmv.x.s x5, v0 + dst
emit32(0x420022D7 + (dst << 20));
// vslide1down.vx v0 + dst, v0 + dst, x5
emit32(0x3E02E057 + (dst << 7) + (dst << 20));
break;
case InstructionType::FADD_R:
src %= RegisterCountFlt;
dst %= RegisterCountFlt;
// vfadd.vv v0 + dst, v0 + dst, v8 + src
emit32(0x02041057 + (dst << 7) + (src << 15) + (dst << 20));
break;
case InstructionType::FADD_M:
dst %= RegisterCountFlt;
loadFromScratchpad(src, RegistersCount, mod, imm, p);
emit_data(group_f_convert);
// vfadd.vv v0 + dst, v0 + dst, v16
emit32(0x02081057 + (dst << 7) + (dst << 20));
break;
case InstructionType::FSUB_R:
src %= RegisterCountFlt;
dst %= RegisterCountFlt;
// vfsub.vv v0 + dst, v0 + dst, v8 + src
emit32(0x0A041057 + (dst << 7) + (src << 15) + (dst << 20));
break;
case InstructionType::FSUB_M:
dst %= RegisterCountFlt;
loadFromScratchpad(src, RegistersCount, mod, imm, p);
emit_data(group_f_convert);
// vfsub.vv v0 + dst, v0 + dst, v16
emit32(0x0A081057 + (dst << 7) + (dst << 20));
break;
case InstructionType::FSCAL_R:
dst %= RegisterCountFlt;
// vxor.vv v0, v0, v14
emit32(0x2E070057 + (dst << 7) + (dst << 20));
break;
case InstructionType::FMUL_R:
src %= RegisterCountFlt;
dst %= RegisterCountFlt;
// vfmul.vv v4 + dst, v4 + dst, v8 + src
emit32(0x92441257 + (dst << 7) + (src << 15) + (dst << 20));
break;
case InstructionType::FDIV_M:
dst %= RegisterCountFlt;
loadFromScratchpad(src, RegistersCount, mod, imm, p);
emit_data(group_f_convert);
emit_data(group_e_post_process);
// vfdiv.vv v0 + dst, v0 + dst, v16
emit32(0x82481257 + (dst << 7) + (dst << 20));
break;
case InstructionType::FSQRT_R:
dst %= RegisterCountFlt;
// vfsqrt.v v4 + dst, v4 + dst
emit32(0x4E401257 + (dst << 7) + (dst << 20));
break;
case InstructionType::CBRANCH:
{
const uint32_t shift = (mod >> 4) + RANDOMX_JUMP_OFFSET;
imm |= (1UL << shift);
if (RANDOMX_JUMP_OFFSET > 0 || shift > 0) {
imm &= ~(1UL << (shift - 1));
}
// slli x6, x7, shift
// x6 = branchMask
emit32(0x00039313 + (shift << 20));
// x5 = imm
imm_to_x5(imm, p);
// c.add x20 + dst, x5
emit16(0x9A16 + (dst << 7));
// and x5, x20 + dst, x6
emit32(0x006A72B3 + (dst << 15));
const int offset = static_cast<int>(last_modified[dst] - p);
if (offset >= -4096) {
// beqz x5, offset
const uint32_t k = static_cast<uint32_t>(offset);
emit32(0x80028063 | ((k & 0x1E) << 7) | ((k & 0x7E0) << 20) | ((k & 0x800) >> 4));
}
else {
// bnez x5, 8
emit32(0x00029463);
// j offset
const uint32_t k = static_cast<uint32_t>(offset - 4);
emit32(0x8000006F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000));
}
for (uint32_t j = 0; j < RegistersCount; ++j) {
last_modified[j] = p;
}
}
break;
case InstructionType::CFROUND:
if ((imm - 1) & 63) {
#ifdef __riscv_zbb
// rori x5, x20 + src, imm - 1
emit32(0x600A5293 + (src << 15) + (((imm - 1) & 63) << 20));
#else // __riscv_zbb
// srli x5, x20 + src, imm - 1
emit32(0x000A5293 + (src << 15) + (((imm - 1) & 63) << 20));
// slli x6, x20 + src, 1 - imm
emit32(0x000A1313 + (src << 15) + (((1 - imm) & 63) << 20));
// or x5, x5, x6
emit32(0x0062E2B3);
#endif // __riscv_zbb
// andi x5, x5, 6
emit32(0x0062F293);
}
else {
// andi x5, x20 + src, 6
emit32(0x006A7293 + (src << 15));
}
// li x6, 01111000b
// x6 = CFROUND lookup table
emit32(0x07800313);
// srl x5, x6, x5
emit32(0x005352B3);
// andi x5, x5, 3
emit32(0x0032F293);
// csrw frm, x5
emit32(0x00229073);
break;
case InstructionType::ISTORE:
{
uint32_t mask_reg;
uint32_t shift;
if ((mod >> 4) >= 14) {
shift = MaskL3Shift;
mask_reg = 1; // x1 = L3 mask
}
else {
if ((mod & 3) == 0) {
shift = MaskL2Shift;
mask_reg = 17; // x17 = L2 mask
}
else {
shift = MaskL1Shift;
mask_reg = 16; // x16 = L1 mask
}
}
imm = static_cast<uint32_t>(static_cast<int32_t>(imm << shift) >> shift);
imm_to_x5(imm, p);
// c.add x5, x20 + dst
emit16(0x92D2 + (dst << 2));
// and x5, x5, x0 + mask_reg
emit32(0x0002F2B3 + (mask_reg << 20));
// c.add x5, x12
emit16(0x92B2);
// sd x20 + src, 0(x5)
emit32(0x0142B023 + (src << 20));
}
break;
case InstructionType::NOP:
break;
default:
UNREACHABLE;
}
// Prefetch scratchpad lines for the next main loop iteration
// scratchpad_prefetch_pos is a conservative estimate of the earliest place in the code where we can do it
if (i == scratchpad_prefetch_pos) {
uint8_t* e = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch_end));
const size_t n = e - ((uint8_t*)spaddr_xor2);
memcpy(p, spaddr_xor2, n);
p += n;
}
}
const uint8_t* e;
if (entryDataInitScalar) {
// Emit "J randomx_riscv64_vector_program_main_loop_instructions_end_light_mode" instruction
e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end_light_mode);
}
else {
// Emit "J randomx_riscv64_vector_program_main_loop_instructions_end" instruction
e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end);
}
const uint32_t k = e - p;
emit32(0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000));
#ifdef __GNUC__
char* p1 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params));
char* p2 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_end));
__builtin___clear_cache(p1, p2);
#endif
return buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_begin);
}
} // namespace randomx

View file

@ -0,0 +1,45 @@
/*
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <cstdlib>
#include <vector>
namespace randomx {
class SuperscalarProgram;
struct ProgramConfiguration;
class Program;
void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs, std::vector<uint64_t>& reciprocalCache);
void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset);
} // namespace randomx

View file

@ -0,0 +1,873 @@
/*
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "configuration.h"
// Compatibility macros
#if !defined(RANDOMX_CACHE_ACCESSES) && defined(RANDOMX_CACHE_MAX_ACCESSES)
#define RANDOMX_CACHE_ACCESSES RANDOMX_CACHE_MAX_ACCESSES
#endif
#if defined(RANDOMX_ARGON_MEMORY)
#define RANDOMX_CACHE_MASK RANDOMX_ARGON_MEMORY * 1024 / 64 - 1
#elif defined(RANDOMX_CACHE_MAX_SIZE)
#define RANDOMX_CACHE_MASK RANDOMX_CACHE_MAX_SIZE / 64 - 1
#endif
#define DECL(x) x
.text
#ifndef __riscv_v
#error This file requires rv64gcv
#endif
.option pic
.global DECL(randomx_riscv64_vector_code_begin)
.global DECL(randomx_riscv64_vector_sshash_begin)
.global DECL(randomx_riscv64_vector_sshash_imul_rcp_literals)
.global DECL(randomx_riscv64_vector_sshash_dataset_init)
.global DECL(randomx_riscv64_vector_sshash_generated_instructions)
.global DECL(randomx_riscv64_vector_sshash_generated_instructions_end)
.global DECL(randomx_riscv64_vector_sshash_cache_prefetch)
.global DECL(randomx_riscv64_vector_sshash_xor)
.global DECL(randomx_riscv64_vector_sshash_end)
.global DECL(randomx_riscv64_vector_program_params)
.global DECL(randomx_riscv64_vector_program_imul_rcp_literals)
.global DECL(randomx_riscv64_vector_program_begin)
.global DECL(randomx_riscv64_vector_program_main_loop_instructions)
.global DECL(randomx_riscv64_vector_program_main_loop_instructions_end)
.global DECL(randomx_riscv64_vector_program_main_loop_mx_xor)
.global DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor)
.global DECL(randomx_riscv64_vector_program_main_loop_light_mode_data)
.global DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode)
.global DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode)
.global DECL(randomx_riscv64_vector_program_scratchpad_prefetch)
.global DECL(randomx_riscv64_vector_program_scratchpad_prefetch_end)
.global DECL(randomx_riscv64_vector_program_end)
.global DECL(randomx_riscv64_vector_code_end)
.balign 8
DECL(randomx_riscv64_vector_code_begin):
DECL(randomx_riscv64_vector_sshash_begin):
sshash_constant_0: .dword 6364136223846793005
sshash_constant_1: .dword 9298411001130361340
sshash_constant_2: .dword 12065312585734608966
sshash_constant_3: .dword 9306329213124626780
sshash_constant_4: .dword 5281919268842080866
sshash_constant_5: .dword 10536153434571861004
sshash_constant_6: .dword 3398623926847679864
sshash_constant_7: .dword 9549104520008361294
sshash_offsets: .dword 0,1,2,3
store_offsets: .dword 0,64,128,192
DECL(randomx_riscv64_vector_sshash_imul_rcp_literals): .fill 512,8,0
/*
Reference: https://github.com/tevador/RandomX/blob/master/doc/specs.md#73-dataset-block-generation
Register layout
---------------
x5 = temporary
x10 = randomx cache
x11 = output buffer
x12 = startBlock
x13 = endBlock
x14 = cache mask
x15 = imul_rcp literal pointer
v0-v7 = r0-r7
v8 = itemNumber
v9 = cacheIndex, then a pointer into cache->memory (for prefetch), then a byte offset into cache->memory
v10-v17 = sshash constants
v18 = temporary
v19 = dataset item store offsets
*/
DECL(randomx_riscv64_vector_sshash_dataset_init):
// Process 4 64-bit values at a time
vsetivli zero, 4, e64, m1, ta, ma
// Load cache->memory pointer
ld x10, (x10)
// Init cache mask
li x14, RANDOMX_CACHE_MASK
// Init dataset item store offsets
lla x5, store_offsets
vle64.v v19, (x5)
// Init itemNumber vector to (startBlock, startBlock + 1, startBlock + 2, startBlock + 3)
lla x5, sshash_offsets
vle64.v v8, (x5)
vadd.vx v8, v8, x12
// Load constants (stride = x0 = 0, so a 64-bit value will be broadcast into each element of a vector)
lla x5, sshash_constant_0
vlse64.v v10, (x5), x0
lla x5, sshash_constant_1
vlse64.v v11, (x5), x0
lla x5, sshash_constant_2
vlse64.v v12, (x5), x0
lla x5, sshash_constant_3
vlse64.v v13, (x5), x0
lla x5, sshash_constant_4
vlse64.v v14, (x5), x0
lla x5, sshash_constant_5
vlse64.v v15, (x5), x0
lla x5, sshash_constant_6
vlse64.v v16, (x5), x0
lla x5, sshash_constant_7
vlse64.v v17, (x5), x0
// Calculate the end pointer for dataset init
sub x13, x13, x12
slli x13, x13, 6
add x13, x13, x11
init_item:
// Step 1. Init r0-r7
// r0 = (itemNumber + 1) * 6364136223846793005
vmv.v.v v0, v8
vmadd.vv v0, v10, v10
// r_i = r0 ^ c_i for i = 1..7
vxor.vv v1, v0, v11
vxor.vv v2, v0, v12
vxor.vv v3, v0, v13
vxor.vv v4, v0, v14
vxor.vv v5, v0, v15
vxor.vv v6, v0, v16
vxor.vv v7, v0, v17
// Step 2. Let cacheIndex = itemNumber
vmv.v.v v9, v8
// Step 3 is implicit (all iterations are inlined, there is no "i")
// Init imul_rcp literal pointer
lla x15, randomx_riscv64_vector_sshash_imul_rcp_literals
DECL(randomx_riscv64_vector_sshash_generated_instructions):
// Generated by JIT compiler
//
// Step 4. randomx_riscv64_vector_sshash_cache_prefetch
// Step 5. SuperscalarHash[i]
// Step 6. randomx_riscv64_vector_sshash_xor
//
// Above steps will be repeated RANDOMX_CACHE_ACCESSES times
.fill RANDOMX_CACHE_ACCESSES * 2048, 4, 0
DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
vsuxei64.v v0, (x11), v19
add x5, x11, 8
vsuxei64.v v1, (x5), v19
add x5, x11, 16
vsuxei64.v v2, (x5), v19
add x5, x11, 24
vsuxei64.v v3, (x5), v19
add x5, x11, 32
vsuxei64.v v4, (x5), v19
add x5, x11, 40
vsuxei64.v v5, (x5), v19
add x5, x11, 48
vsuxei64.v v6, (x5), v19
add x5, x11, 56
vsuxei64.v v7, (x5), v19
// Iterate to the next 4 items
vadd.vi v8, v8, 4
add x11, x11, 256
bltu x11, x13, init_item
ret
// Step 4. Load a 64-byte item from the Cache. The item index is given by cacheIndex modulo the total number of 64-byte items in Cache.
DECL(randomx_riscv64_vector_sshash_cache_prefetch):
// v9 = convert from cacheIndex to a direct pointer into cache->memory
vand.vx v9, v9, x14
vsll.vi v9, v9, 6
vadd.vx v9, v9, x10
// Prefetch element 0
vmv.x.s x5, v9
#ifdef __riscv_zicbop
prefetch.r (x5)
#else
ld x5, (x5)
#endif
// Prefetch element 1
vslidedown.vi v18, v9, 1
vmv.x.s x5, v18
#ifdef __riscv_zicbop
prefetch.r (x5)
#else
ld x5, (x5)
#endif
// Prefetch element 2
vslidedown.vi v18, v9, 2
vmv.x.s x5, v18
#ifdef __riscv_zicbop
prefetch.r (x5)
#else
ld x5, (x5)
#endif
// Prefetch element 3
vslidedown.vi v18, v9, 3
vmv.x.s x5, v18
#ifdef __riscv_zicbop
prefetch.r (x5)
#else
ld x5, (x5)
#endif
// v9 = byte offset into cache->memory
vsub.vx v9, v9, x10
// Step 6. XOR all registers with data loaded from randomx cache
DECL(randomx_riscv64_vector_sshash_xor):
vluxei64.v v18, (x10), v9
vxor.vv v0, v0, v18
add x5, x10, 8
vluxei64.v v18, (x5), v9
vxor.vv v1, v1, v18
add x5, x10, 16
vluxei64.v v18, (x5), v9
vxor.vv v2, v2, v18
add x5, x10, 24
vluxei64.v v18, (x5), v9
vxor.vv v3, v3, v18
add x5, x10, 32
vluxei64.v v18, (x5), v9
vxor.vv v4, v4, v18
add x5, x10, 40
vluxei64.v v18, (x5), v9
vxor.vv v5, v5, v18
add x5, x10, 48
vluxei64.v v18, (x5), v9
vxor.vv v6, v6, v18
add x5, x10, 56
vluxei64.v v18, (x5), v9
vxor.vv v7, v7, v18
DECL(randomx_riscv64_vector_sshash_end):
/*
Reference: https://github.com/tevador/RandomX/blob/master/doc/specs.md#46-vm-execution
C declarations:
struct RegisterFile {
uint64_t r[8];
double f[4][2];
double e[4][2];
double a[4][2];
};
struct MemoryRegisters {
uint32_t mx, ma;
uint8_t* memory; // dataset (fast mode) or cache (light mode)
};
void ProgramFunc(RegisterFile* reg, MemoryRegisters* mem, uint8_t* scratchpad, uint64_t iterations);
Register layout
---------------
x0 = zero
x1 = scratchpad L3 mask
x2 = stack pointer
x3 = global pointer (unused)
x4 = thread pointer (unused)
x5 = temporary
x6 = temporary
x7 = branch mask (unshifted)
x8 = frame pointer, also 64-bit literal inside the loop
x9 = scratchpad L3 mask (64-byte aligned)
x10 = RegisterFile* reg, also 64-bit literal inside the loop
x11 = MemoryRegisters* mem, then dataset/cache pointer
x12 = scratchpad
x13 = iterations
x14 = mx, ma (always stored with dataset mask applied)
x15 = spAddr0, spAddr1
x16 = scratchpad L1 mask
x17 = scratchpad L2 mask
x18 = IMUL_RCP literals pointer
x19 = dataset mask
x20-x27 = r0-r7
x28-x31 = 64-bit literals
f0-f7 = 64-bit literals
f10-f17 = 64-bit literals
f28-f31 = 64-bit literals
v0-v3 = f0-f3
v4-v7 = e0-e3
v8-v11 = a0-a3
v12 = E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
v13 = E 'or' mask = 0x3*00000000******'3*00000000******
v14 = scale mask = 0x80f0000000000000'80f0000000000000
v15 = unused
v16 = temporary
v17 = unused
v18 = temporary
v19-v31 = unused
*/
.balign 8
DECL(randomx_riscv64_vector_program_params):
// JIT compiler will adjust these values for different RandomX variants
randomx_masks: .dword 16376, 262136, 2097144, 2147483584, 255
DECL(randomx_riscv64_vector_program_imul_rcp_literals):
imul_rcp_literals: .fill RANDOMX_PROGRAM_SIZE, 8, 0
DECL(randomx_riscv64_vector_program_begin):
addi sp, sp, -112
sd x8, 96(sp) // save old frame pointer
addi x8, sp, 112 // setup new frame pointer
sd x1, 104(sp) // save return address
// Save callee-saved registers
sd x9, 0(sp)
sd x18, 8(sp)
sd x19, 16(sp)
sd x20, 24(sp)
sd x21, 32(sp)
sd x22, 40(sp)
sd x23, 48(sp)
sd x24, 56(sp)
sd x25, 64(sp)
sd x26, 72(sp)
sd x27, 80(sp)
// Save x10 as it will be used as an IMUL_RCP literal
sd x10, 88(sp)
// Load mx, ma and dataset pointer
ld x14, (x11)
ld x11, 8(x11)
// Initialize spAddr0-spAddr1
mv x15, x14
// Set registers r0-r7 to zero
li x20, 0
li x21, 0
li x22, 0
li x23, 0
li x24, 0
li x25, 0
li x26, 0
li x27, 0
// Load masks
lla x5, randomx_masks
ld x16, 0(x5)
ld x17, 8(x5)
ld x1, 16(x5)
ld x19, 24(x5)
ld x7, 32(x5)
addi x9, x1, -56
// Set vector registers to 2x64 bit
vsetivli zero, 2, e64, m1, ta, ma
// Apply dataset mask to mx, ma
slli x5, x19, 32
or x5, x5, x19
and x14, x14, x5
// Load group A registers
addi x5, x10, 192
vle64.v v8, (x5)
addi x5, x10, 208
vle64.v v9, (x5)
addi x5, x10, 224
vle64.v v10, (x5)
addi x5, x10, 240
vle64.v v11, (x5)
// Load E 'and' mask
vmv.v.i v12, -1
vsrl.vi v12, v12, 8
// Load E 'or' mask (stored in reg.f[0])
addi x5, x10, 64
vle64.v v13, (x5)
// Load scale mask
lui x5, 0x80f00
slli x5, x5, 32
vmv.v.x v14, x5
// IMUL_RCP literals pointer
lla x18, imul_rcp_literals
// Load IMUL_RCP literals
ld x8, 0(x18)
ld x10, 8(x18)
ld x28, 16(x18)
ld x29, 24(x18)
ld x30, 32(x18)
ld x31, 40(x18)
fld f0, 48(x18)
fld f1, 56(x18)
fld f2, 64(x18)
fld f3, 72(x18)
fld f4, 80(x18)
fld f5, 88(x18)
fld f6, 96(x18)
fld f7, 104(x18)
fld f10, 112(x18)
fld f11, 120(x18)
fld f12, 128(x18)
fld f13, 136(x18)
fld f14, 144(x18)
fld f15, 152(x18)
fld f16, 160(x18)
fld f17, 168(x18)
fld f28, 176(x18)
fld f29, 184(x18)
fld f30, 192(x18)
fld f31, 200(x18)
randomx_riscv64_vector_program_main_loop:
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
// read a 64-byte line from scratchpad (indexed by spAddr0) and XOR it with r0-r7
ld x6, 0(x5)
xor x20, x20, x6
ld x6, 8(x5)
xor x21, x21, x6
ld x6, 16(x5)
xor x22, x22, x6
ld x6, 24(x5)
xor x23, x23, x6
ld x6, 32(x5)
xor x24, x24, x6
ld x6, 40(x5)
xor x25, x25, x6
ld x6, 48(x5)
xor x26, x26, x6
ld x6, 56(x5)
xor x27, x27, x6
srli x5, x15, 32 // x5 = spAddr1
and x5, x5, x9 // x5 = spAddr1 & 64-byte aligned L3 mask
add x5, x5, x12 // x5 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
// read a 64-byte line from scratchpad (indexed by spAddr1) and initialize f0-f3, e0-e3 registers
// Set vector registers to 2x32 bit
vsetivli zero, 2, e32, m1, ta, ma
// load f0
vle32.v v16, (x5)
vfwcvt.f.x.v v0, v16
// load f1
addi x6, x5, 8
vle32.v v1, (x6)
// Use v16 as an intermediary register because vfwcvt accepts only registers with even numbers here
vfwcvt.f.x.v v16, v1
vmv1r.v v1, v16
// load f2
addi x6, x5, 16
vle32.v v16, (x6)
vfwcvt.f.x.v v2, v16
// load f3
addi x6, x5, 24
vle32.v v3, (x6)
vfwcvt.f.x.v v16, v3
vmv1r.v v3, v16
// load e0
addi x6, x5, 32
vle32.v v16, (x6)
vfwcvt.f.x.v v4, v16
// load e1
addi x6, x5, 40
vle32.v v5, (x6)
vfwcvt.f.x.v v16, v5
vmv1r.v v5, v16
// load e2
addi x6, x5, 48
vle32.v v16, (x6)
vfwcvt.f.x.v v6, v16
// load e3
addi x6, x5, 56
vle32.v v7, (x6)
vfwcvt.f.x.v v16, v7
vmv1r.v v7, v16
// Set vector registers back to 2x64 bit
vsetivli zero, 2, e64, m1, ta, ma
// post-process e0-e3
vand.vv v4, v4, v12
vand.vv v5, v5, v12
vand.vv v6, v6, v12
vand.vv v7, v7, v12
vor.vv v4, v4, v13
vor.vv v5, v5, v13
vor.vv v6, v6, v13
vor.vv v7, v7, v13
DECL(randomx_riscv64_vector_program_main_loop_instructions):
// Generated by JIT compiler
// FDIV_M can generate up to 50 bytes of code (round it up to 52 - a multiple of 4)
// +32 bytes for the scratchpad prefetch and the final jump instruction
.fill RANDOMX_PROGRAM_SIZE * 52 + 32, 1, 0
DECL(randomx_riscv64_vector_program_main_loop_instructions_end):
// Calculate dataset pointer for dataset read
// Do it here to break false dependency from readReg2 and readReg3 (see below)
srli x6, x14, 32 // x6 = ma & dataset mask
DECL(randomx_riscv64_vector_program_main_loop_mx_xor):
xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers)
and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask
xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask
and x5, x14, x19 // x5 = mx & dataset mask
add x5, x5, x11 // x5 = &dataset[mx & dataset mask]
#ifdef __riscv_zicbop
prefetch.r (x5)
#else
ld x5, (x5)
#endif
add x5, x6, x11 // x5 = &dataset[ma & dataset mask]
// read a 64-byte line from dataset and XOR it with r0-r7
ld x6, 0(x5)
xor x20, x20, x6
ld x6, 8(x5)
xor x21, x21, x6
ld x6, 16(x5)
xor x22, x22, x6
ld x6, 24(x5)
xor x23, x23, x6
ld x6, 32(x5)
xor x24, x24, x6
ld x6, 40(x5)
xor x25, x25, x6
ld x6, 48(x5)
xor x26, x26, x6
ld x6, 56(x5)
xor x27, x27, x6
randomx_riscv64_vector_program_main_loop_swap_mx_ma:
// swap mx <-> ma
#ifdef __riscv_zbb
rori x14, x14, 32
#else
srli x5, x14, 32
slli x14, x14, 32
or x14, x14, x5
#endif
srli x5, x15, 32 // x5 = spAddr1
and x5, x5, x9 // x5 = spAddr1 & 64-byte aligned L3 mask
add x5, x5, x12 // x5 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
// store registers r0-r7 to the scratchpad
sd x20, 0(x5)
sd x21, 8(x5)
sd x22, 16(x5)
sd x23, 24(x5)
sd x24, 32(x5)
sd x25, 40(x5)
sd x26, 48(x5)
sd x27, 56(x5)
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor):
xor x15, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers)
// store registers f0-f3 to the scratchpad (f0-f3 are first combined with e0-e3)
vxor.vv v0, v0, v4
vxor.vv v1, v1, v5
vxor.vv v2, v2, v6
vxor.vv v3, v3, v7
vse64.v v0, (x5)
addi x6, x5, 16
vse64.v v1, (x6)
addi x6, x5, 32
vse64.v v2, (x6)
addi x6, x5, 48
vse64.v v3, (x6)
addi x13, x13, -1
beqz x13, randomx_riscv64_vector_program_main_loop_end
j randomx_riscv64_vector_program_main_loop
randomx_riscv64_vector_program_main_loop_end:
// Restore x8 and x10
addi x8, sp, 112
ld x10, 88(sp)
// Store integer registers
sd x20, 0(x10)
sd x21, 8(x10)
sd x22, 16(x10)
sd x23, 24(x10)
sd x24, 32(x10)
sd x25, 40(x10)
sd x26, 48(x10)
sd x27, 56(x10)
// Store FP registers
addi x5, x10, 64
vse64.v v0, (x5)
addi x5, x10, 80
vse64.v v1, (x5)
addi x5, x10, 96
vse64.v v2, (x5)
addi x5, x10, 112
vse64.v v3, (x5)
addi x5, x10, 128
vse64.v v4, (x5)
addi x5, x10, 144
vse64.v v5, (x5)
addi x5, x10, 160
vse64.v v6, (x5)
addi x5, x10, 176
vse64.v v7, (x5)
// Restore callee-saved registers
ld x9, 0(sp)
ld x18, 8(sp)
ld x19, 16(sp)
ld x20, 24(sp)
ld x21, 32(sp)
ld x22, 40(sp)
ld x23, 48(sp)
ld x24, 56(sp)
ld x25, 64(sp)
ld x26, 72(sp)
ld x27, 80(sp)
ld x8, 96(sp) // old frame pointer
ld x1, 104(sp) // return address
addi sp, sp, 112
ret
DECL(randomx_riscv64_vector_program_main_loop_light_mode_data):
// 1) Pointer to the scalar dataset init function
// 2) Dataset offset
.dword 0, 0
DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode):
// Calculate dataset pointer for dataset read
// Do it here to break false dependency from readReg2 and readReg3 (see below)
srli x6, x14, 32 // x6 = ma & dataset mask
DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode):
xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers)
and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask
xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask
// Save all registers modified when calling dataset_init_scalar_func_ptr
addi sp, sp, -192
// bytes [0, 127] - saved registers
// bytes [128, 191] - output buffer
sd x1, 0(sp)
sd x7, 16(sp)
sd x10, 24(sp)
sd x11, 32(sp)
sd x12, 40(sp)
sd x13, 48(sp)
sd x14, 56(sp)
sd x15, 64(sp)
sd x16, 72(sp)
sd x17, 80(sp)
sd x28, 88(sp)
sd x29, 96(sp)
sd x30, 104(sp)
sd x31, 112(sp)
// setup randomx_riscv64_vector_sshash_dataset_init's parameters
// x10 = pointer to pointer to cache memory
// pointer to cache memory was saved in "sd x11, 32(sp)", so x10 = sp + 32
addi x10, sp, 32
// x11 = output buffer (64 bytes)
addi x11, sp, 128
// x12 = start block
lla x5, randomx_riscv64_vector_program_main_loop_light_mode_data
ld x12, 8(x5)
add x12, x12, x6
srli x12, x12, 6
// x13 = end block
addi x13, x12, 1
ld x5, 0(x5)
jalr x1, 0(x5)
// restore registers
ld x1, 0(sp)
ld x7, 16(sp)
ld x10, 24(sp)
ld x11, 32(sp)
ld x12, 40(sp)
ld x13, 48(sp)
ld x14, 56(sp)
ld x15, 64(sp)
ld x16, 72(sp)
ld x17, 80(sp)
ld x28, 88(sp)
ld x29, 96(sp)
ld x30, 104(sp)
ld x31, 112(sp)
// read a 64-byte line from dataset and XOR it with r0-r7
ld x5, 128(sp)
xor x20, x20, x5
ld x5, 136(sp)
xor x21, x21, x5
ld x5, 144(sp)
xor x22, x22, x5
ld x5, 152(sp)
xor x23, x23, x5
ld x5, 160(sp)
xor x24, x24, x5
ld x5, 168(sp)
xor x25, x25, x5
ld x5, 176(sp)
xor x26, x26, x5
ld x5, 184(sp)
xor x27, x27, x5
addi sp, sp, 192
j randomx_riscv64_vector_program_main_loop_swap_mx_ma
DECL(randomx_riscv64_vector_program_scratchpad_prefetch):
xor x5, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers)
srli x6, x5, 32 // x6 = spAddr1
and x5, x5, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
and x6, x6, x9 // x6 = spAddr1 & 64-byte aligned L3 mask
c.add x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
c.add x6, x12 // x6 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
#ifdef __riscv_zicbop
prefetch.r (x5)
prefetch.r (x6)
#else
ld x5, (x5)
ld x6, (x6)
#endif
DECL(randomx_riscv64_vector_program_scratchpad_prefetch_end):
DECL(randomx_riscv64_vector_program_end):
DECL(randomx_riscv64_vector_code_end):

View file

@ -0,0 +1,74 @@
/*
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#if defined(__cplusplus)
#include <cstdint>
#else
#include <stdint.h>
#endif
#if defined(__cplusplus)
extern "C" {
#endif
struct randomx_cache;
void randomx_riscv64_vector_code_begin();
void randomx_riscv64_vector_sshash_begin();
void randomx_riscv64_vector_sshash_imul_rcp_literals();
void randomx_riscv64_vector_sshash_dataset_init(struct randomx_cache* cache, uint8_t* output_buf, uint32_t startBlock, uint32_t endBlock);
void randomx_riscv64_vector_sshash_cache_prefetch();
void randomx_riscv64_vector_sshash_generated_instructions();
void randomx_riscv64_vector_sshash_generated_instructions_end();
void randomx_riscv64_vector_sshash_cache_prefetch();
void randomx_riscv64_vector_sshash_xor();
void randomx_riscv64_vector_sshash_end();
void randomx_riscv64_vector_program_params();
void randomx_riscv64_vector_program_imul_rcp_literals();
void randomx_riscv64_vector_program_begin();
void randomx_riscv64_vector_program_main_loop_instructions();
void randomx_riscv64_vector_program_main_loop_instructions_end();
void randomx_riscv64_vector_program_main_loop_mx_xor();
void randomx_riscv64_vector_program_main_loop_spaddr_xor();
void randomx_riscv64_vector_program_main_loop_light_mode_data();
void randomx_riscv64_vector_program_main_loop_instructions_end_light_mode();
void randomx_riscv64_vector_program_main_loop_mx_xor_light_mode();
void randomx_riscv64_vector_program_end();
void randomx_riscv64_vector_program_scratchpad_prefetch();
void randomx_riscv64_vector_program_scratchpad_prefetch_end();
void randomx_riscv64_vector_code_end();
#if defined(__cplusplus)
}
#endif

View file

@ -0,0 +1,847 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdexcept>
#include <cstring>
#include <climits>
#include "jit_compiler_x86.hpp"
#include "jit_compiler_x86_static.hpp"
#include "superscalar.hpp"
#include "program.hpp"
#include "reciprocal.h"
#include "virtual_memory.h"
namespace randomx {
/*
REGISTER ALLOCATION:
; rax -> temporary
; rbx -> iteration counter "ic"
; rcx -> temporary
; rdx -> temporary
; rsi -> scratchpad pointer
; rdi -> dataset pointer
; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits)
; rsp -> stack pointer
; r8 -> "r0"
; r9 -> "r1"
; r10 -> "r2"
; r11 -> "r3"
; r12 -> "r4"
; r13 -> "r5"
; r14 -> "r6"
; r15 -> "r7"
; xmm0 -> "f0"
; xmm1 -> "f1"
; xmm2 -> "f2"
; xmm3 -> "f3"
; xmm4 -> "e0"
; xmm5 -> "e1"
; xmm6 -> "e2"
; xmm7 -> "e3"
; xmm8 -> "a0"
; xmm9 -> "a1"
; xmm10 -> "a2"
; xmm11 -> "a3"
; xmm12 -> temporary
; xmm13 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
; xmm14 -> E 'or' mask = 0x3*00000000******3*00000000******
; xmm15 -> scale mask = 0x81f000000000000081f0000000000000
*/
//Calculate the required code buffer size that is sufficient for the largest possible program:
constexpr size_t MaxRandomXInstrCodeSize = 32; //FDIV_M requires up to 32 bytes of x86 code
constexpr size_t MaxSuperscalarInstrSize = 14; //IMUL_RCP requires 14 bytes of x86 code
constexpr size_t SuperscalarProgramHeader = 128; //overhead per superscalar program
constexpr size_t CodeAlign = 4096; //align code size to a multiple of 4 KiB
constexpr size_t ReserveCodeSize = CodeAlign; //function prologue/epilogue + reserve
constexpr size_t RandomXCodeSize = alignSize(ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_SIZE, CodeAlign);
constexpr size_t SuperscalarSize = alignSize(ReserveCodeSize + (SuperscalarProgramHeader + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign);
static_assert(RandomXCodeSize < INT32_MAX / 2, "RandomXCodeSize is too large");
static_assert(SuperscalarSize < INT32_MAX / 2, "SuperscalarSize is too large");
constexpr uint32_t CodeSize = RandomXCodeSize + SuperscalarSize;
constexpr int32_t superScalarHashOffset = RandomXCodeSize;
#if defined(_MSC_VER) && (defined(_DEBUG) || defined (RELWITHDEBINFO))
#define ADDR(x) ((((uint8_t*)&x)[0] == 0xE9) ? (((uint8_t*)&x) + *(const int32_t*)(((uint8_t*)&x) + 1) + 5) : ((uint8_t*)&x))
#else
#define ADDR(x) ((uint8_t*)&x)
#endif
const uint8_t* codePrologue = ADDR(randomx_program_prologue);
const uint8_t* codeLoopBegin = ADDR(randomx_program_loop_begin);
const uint8_t* codeLoopLoad = ADDR(randomx_program_loop_load);
const uint8_t* codeProgamStart = ADDR(randomx_program_start);
const uint8_t* codeReadDataset = ADDR(randomx_program_read_dataset);
const uint8_t* codeReadDatasetLightSshInit = ADDR(randomx_program_read_dataset_sshash_init);
const uint8_t* codeReadDatasetLightSshFin = ADDR(randomx_program_read_dataset_sshash_fin);
const uint8_t* codeDatasetInit = ADDR(randomx_dataset_init);
const uint8_t* codeLoopStore = ADDR(randomx_program_loop_store);
const uint8_t* codeLoopEnd = ADDR(randomx_program_loop_end);
const uint8_t* codeEpilogue = ADDR(randomx_program_epilogue);
const uint8_t* codeProgramEnd = ADDR(randomx_program_end);
const uint8_t* codeShhLoad = ADDR(randomx_sshash_load);
const uint8_t* codeShhPrefetch = ADDR(randomx_sshash_prefetch);
const uint8_t* codeShhEnd = ADDR(randomx_sshash_end);
const uint8_t* codeShhInit = ADDR(randomx_sshash_init);
const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset;
const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit;
const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin;
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
const int32_t datasetInitSize = codeEpilogue - codeDatasetInit;
const int32_t epilogueSize = codeShhLoad - codeEpilogue;
const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad;
const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch;
const int32_t codeSshInitSize = codeProgramEnd - codeShhInit;
const int32_t epilogueOffset = CodeSize - epilogueSize;
static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 };
static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 };
static const uint8_t REX_SUB_RR[] = { 0x4d, 0x2b };
static const uint8_t REX_SUB_RM[] = { 0x4c, 0x2b };
static const uint8_t REX_MOV_RR[] = { 0x41, 0x8b };
static const uint8_t REX_MOV_RR64[] = { 0x49, 0x8b };
static const uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b };
static const uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf };
static const uint8_t REX_IMUL_RRI[] = { 0x4d, 0x69 };
static const uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf };
static const uint8_t REX_MUL_R[] = { 0x49, 0xf7 };
static const uint8_t REX_MUL_M[] = { 0x48, 0xf7 };
static const uint8_t REX_81[] = { 0x49, 0x81 };
static const uint8_t AND_EAX_I = 0x25;
static const uint8_t MOV_EAX_I = 0xb8;
static const uint8_t MOV_RAX_I[] = { 0x48, 0xb8 };
static const uint8_t MOV_RCX_I[] = { 0x48, 0xb9 };
static const uint8_t REX_LEA[] = { 0x4f, 0x8d };
static const uint8_t REX_MUL_MEM[] = { 0x48, 0xf7, 0x24, 0x0e };
static const uint8_t REX_IMUL_MEM[] = { 0x48, 0xf7, 0x2c, 0x0e };
static const uint8_t REX_SHR_RAX[] = { 0x48, 0xc1, 0xe8 };
static const uint8_t RAX_ADD_SBB_1[] = { 0x48, 0x83, 0xC0, 0x01, 0x48, 0x83, 0xD8, 0x00 };
static const uint8_t MUL_RCX[] = { 0x48, 0xf7, 0xe1 };
static const uint8_t REX_SHR_RDX[] = { 0x48, 0xc1, 0xea };
static const uint8_t REX_SH[] = { 0x49, 0xc1 };
static const uint8_t MOV_RCX_RAX_SAR_RCX_63[] = { 0x48, 0x89, 0xc1, 0x48, 0xc1, 0xf9, 0x3f };
static const uint8_t AND_ECX_I[] = { 0x81, 0xe1 };
static const uint8_t ADD_RAX_RCX[] = { 0x48, 0x01, 0xC8 };
static const uint8_t SAR_RAX_I8[] = { 0x48, 0xC1, 0xF8 };
static const uint8_t NEG_RAX[] = { 0x48, 0xF7, 0xD8 };
static const uint8_t ADD_R_RAX[] = { 0x4C, 0x03 };
static const uint8_t XOR_EAX_EAX[] = { 0x33, 0xC0 };
static const uint8_t ADD_RDX_R[] = { 0x4c, 0x01 };
static const uint8_t SUB_RDX_R[] = { 0x4c, 0x29 };
static const uint8_t SAR_RDX_I8[] = { 0x48, 0xC1, 0xFA };
static const uint8_t TEST_RDX_RDX[] = { 0x48, 0x85, 0xD2 };
static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x03, 0xD0 };
static const uint8_t REX_NEG[] = { 0x49, 0xF7 };
static const uint8_t REX_XOR_RR[] = { 0x4D, 0x33 };
static const uint8_t REX_XOR_RI[] = { 0x49, 0x81 };
static const uint8_t REX_XOR_RM[] = { 0x4c, 0x33 };
static const uint8_t REX_ROT_CL[] = { 0x49, 0xd3 };
static const uint8_t REX_ROT_I8[] = { 0x49, 0xc1 };
static const uint8_t SHUFPD[] = { 0x66, 0x0f, 0xc6 };
static const uint8_t REX_ADDPD[] = { 0x66, 0x41, 0x0f, 0x58 };
static const uint8_t REX_CVTDQ2PD_XMM12[] = { 0xf3, 0x44, 0x0f, 0xe6, 0x24, 0x06 };
static const uint8_t REX_SUBPD[] = { 0x66, 0x41, 0x0f, 0x5c };
static const uint8_t REX_XORPS[] = { 0x41, 0x0f, 0x57 };
static const uint8_t REX_MULPD[] = { 0x66, 0x41, 0x0f, 0x59 };
static const uint8_t REX_MAXPD[] = { 0x66, 0x41, 0x0f, 0x5f };
static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e };
static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 };
static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x50, 0x0F, 0xAE, 0x14, 0x24, 0x58 };
static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 };
static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 };
static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 };
static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 };
static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 };
static const uint8_t REX_MOV_MR[] = { 0x4c, 0x89 };
static const uint8_t REX_XOR_EAX[] = { 0x41, 0x33 };
static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 };
static const uint8_t JNZ[] = { 0x0f, 0x85 };
static const uint8_t JMP = 0xe9;
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
static const uint8_t CALL = 0xe8;
static const uint8_t REX_ADD_I[] = { 0x49, 0x81 };
static const uint8_t REX_TEST[] = { 0x49, 0xF7 };
static const uint8_t JZ[] = { 0x0f, 0x84 };
static const uint8_t RET = 0xc3;
static const uint8_t LEA_32[] = { 0x41, 0x8d };
static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 };
static const uint8_t ADD_EBX_I[] = { 0x81, 0xc3 };
static const uint8_t NOP1[] = { 0x90 };
static const uint8_t NOP2[] = { 0x66, 0x90 };
static const uint8_t NOP3[] = { 0x66, 0x66, 0x90 };
static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 };
static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
size_t JitCompilerX86::getCodeSize() {
return CodeSize;
}
JitCompilerX86::JitCompilerX86() {
code = (uint8_t*)allocMemoryPages(CodeSize);
if (code == nullptr)
throw std::runtime_error("allocMemoryPages");
memcpy(code, codePrologue, prologueSize);
memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
}
JitCompilerX86::~JitCompilerX86() {
freePagedMemory(code, CodeSize);
}
void JitCompilerX86::enableAll() {
setPagesRWX(code, CodeSize);
}
void JitCompilerX86::enableWriting() {
setPagesRW(code, CodeSize);
}
void JitCompilerX86::enableExecution() {
setPagesRX(code, CodeSize);
}
void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
generateProgramPrologue(prog, pcfg);
memcpy(code + codePos, codeReadDataset, readDatasetSize);
codePos += readDatasetSize;
generateProgramEpilogue(prog, pcfg);
}
void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
generateProgramPrologue(prog, pcfg);
emit(codeReadDatasetLightSshInit, readDatasetLightInitSize);
emit(ADD_EBX_I);
emit32(datasetOffset / CacheLineSize);
emitByte(CALL);
emit32(superScalarHashOffset - (codePos + 4));
emit(codeReadDatasetLightSshFin, readDatasetLightFinSize);
generateProgramEpilogue(prog, pcfg);
}
template<size_t N>
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &reciprocalCache) {
memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
codePos = superScalarHashOffset + codeSshInitSize;
for (unsigned j = 0; j < N; ++j) {
SuperscalarProgram& prog = programs[j];
for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i);
generateSuperscalarCode(instr, reciprocalCache);
}
emit(codeShhLoad, codeSshLoadSize);
if (j < N - 1) {
emit(REX_MOV_RR64);
emitByte(0xd8 + prog.getAddressRegister());
emit(codeShhPrefetch, codeSshPrefetchSize);
#ifdef RANDOMX_ALIGN
int align = (codePos % 16);
while (align != 0) {
int nopSize = 16 - align;
if (nopSize > 8) nopSize = 8;
emit(NOPX[nopSize - 1], nopSize);
align = (codePos % 16);
}
#endif
}
}
emitByte(RET);
}
template
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES], std::vector<uint64_t> &reciprocalCache);
void JitCompilerX86::generateDatasetInitCode() {
memcpy(code, codeDatasetInit, datasetInitSize);
}
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
instructionOffsets.clear();
for (unsigned i = 0; i < RegistersCount; ++i) {
registerUsage[i] = -1;
}
codePos = prologueSize;
memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
memcpy(code + codePos, codeLoopLoad, loopLoadSize);
codePos += loopLoadSize;
for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i);
instr.src %= RegistersCount;
instr.dst %= RegistersCount;
generateCode(instr, i);
}
emit(REX_MOV_RR);
emitByte(0xc0 + pcfg.readReg2);
emit(REX_XOR_EAX);
emitByte(0xc0 + pcfg.readReg3);
}
void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) {
emit(REX_MOV_RR64);
emitByte(0xc0 + pcfg.readReg0);
emit(REX_XOR_RAX_R64);
emitByte(0xc0 + pcfg.readReg1);
emit(ADDR(randomx_prefetch_scratchpad), ADDR(randomx_prefetch_scratchpad_end) - ADDR(randomx_prefetch_scratchpad));
memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize;
emit(SUB_EBX);
emit(JNZ);
emit32(prologueSize - codePos - 4);
emitByte(JMP);
emit32(epilogueOffset - codePos - 4);
}
void JitCompilerX86::generateCode(Instruction& instr, int i) {
instructionOffsets.push_back(codePos);
auto generator = engine[instr.opcode];
(this->*generator)(instr, i);
}
void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector<uint64_t> &reciprocalCache) {
switch ((SuperscalarInstructionType)instr.opcode)
{
case randomx::SuperscalarInstructionType::ISUB_R:
emit(REX_SUB_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
break;
case randomx::SuperscalarInstructionType::IXOR_R:
emit(REX_XOR_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
break;
case randomx::SuperscalarInstructionType::IADD_RS:
emit(REX_LEA);
emitByte(0x04 + 8 * instr.dst);
genSIB(instr.getModShift(), instr.src, instr.dst);
break;
case randomx::SuperscalarInstructionType::IMUL_R:
emit(REX_IMUL_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
break;
case randomx::SuperscalarInstructionType::IROR_C:
emit(REX_ROT_I8);
emitByte(0xc8 + instr.dst);
emitByte(instr.getImm32() & 63);
break;
case randomx::SuperscalarInstructionType::IADD_C7:
emit(REX_81);
emitByte(0xc0 + instr.dst);
emit32(instr.getImm32());
break;
case randomx::SuperscalarInstructionType::IXOR_C7:
emit(REX_XOR_RI);
emitByte(0xf0 + instr.dst);
emit32(instr.getImm32());
break;
case randomx::SuperscalarInstructionType::IADD_C8:
emit(REX_81);
emitByte(0xc0 + instr.dst);
emit32(instr.getImm32());
#ifdef RANDOMX_ALIGN
emit(NOP1);
#endif
break;
case randomx::SuperscalarInstructionType::IXOR_C8:
emit(REX_XOR_RI);
emitByte(0xf0 + instr.dst);
emit32(instr.getImm32());
#ifdef RANDOMX_ALIGN
emit(NOP1);
#endif
break;
case randomx::SuperscalarInstructionType::IADD_C9:
emit(REX_81);
emitByte(0xc0 + instr.dst);
emit32(instr.getImm32());
#ifdef RANDOMX_ALIGN
emit(NOP2);
#endif
break;
case randomx::SuperscalarInstructionType::IXOR_C9:
emit(REX_XOR_RI);
emitByte(0xf0 + instr.dst);
emit32(instr.getImm32());
#ifdef RANDOMX_ALIGN
emit(NOP2);
#endif
break;
case randomx::SuperscalarInstructionType::IMULH_R:
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_R);
emitByte(0xe0 + instr.src);
emit(REX_MOV_R64R);
emitByte(0xc2 + 8 * instr.dst);
break;
case randomx::SuperscalarInstructionType::ISMULH_R:
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_R);
emitByte(0xe8 + instr.src);
emit(REX_MOV_R64R);
emitByte(0xc2 + 8 * instr.dst);
break;
case randomx::SuperscalarInstructionType::IMUL_RCP:
emit(MOV_RAX_I);
emit64(reciprocalCache[instr.getImm32()]);
emit(REX_IMUL_RM);
emitByte(0xc0 + 8 * instr.dst);
break;
default:
UNREACHABLE;
}
}
void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) {
emit(LEA_32);
emitByte(0x80 + instr.src + (rax ? 0 : 8));
if (instr.src == RegisterNeedsSib) {
emitByte(0x24);
}
emit32(instr.getImm32());
if (rax)
emitByte(AND_EAX_I);
else
emit(AND_ECX_I);
emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
}
void JitCompilerX86::genAddressRegDst(Instruction& instr) {
emit(LEA_32);
emitByte(0x80 + instr.dst);
if (instr.dst == RegisterNeedsSib) {
emitByte(0x24);
}
emit32(instr.getImm32());
emitByte(AND_EAX_I);
if (instr.getModCond() < StoreL3Condition) {
emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask);
}
else {
emit32(ScratchpadL3Mask);
}
}
void JitCompilerX86::genAddressImm(Instruction& instr) {
emit32(instr.getImm32() & ScratchpadL3Mask);
}
void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
emit(REX_LEA);
if (instr.dst == RegisterNeedsDisplacement)
emitByte(0xac);
else
emitByte(0x04 + 8 * instr.dst);
genSIB(instr.getModShift(), instr.src, instr.dst);
if (instr.dst == RegisterNeedsDisplacement)
emit32(instr.getImm32());
}
void JitCompilerX86::h_IADD_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr);
emit(REX_ADD_RM);
emitByte(0x04 + 8 * instr.dst);
emitByte(0x06);
}
else {
emit(REX_ADD_RM);
emitByte(0x86 + 8 * instr.dst);
genAddressImm(instr);
}
}
void JitCompilerX86::genSIB(int scale, int index, int base) {
emitByte((scale << 6) | (index << 3) | base);
}
void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
emit(REX_SUB_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
}
else {
emit(REX_81);
emitByte(0xe8 + instr.dst);
emit32(instr.getImm32());
}
}
void JitCompilerX86::h_ISUB_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr);
emit(REX_SUB_RM);
emitByte(0x04 + 8 * instr.dst);
emitByte(0x06);
}
else {
emit(REX_SUB_RM);
emitByte(0x86 + 8 * instr.dst);
genAddressImm(instr);
}
}
void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
emit(REX_IMUL_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
}
else {
emit(REX_IMUL_RRI);
emitByte(0xc0 + 9 * instr.dst);
emit32(instr.getImm32());
}
}
void JitCompilerX86::h_IMUL_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr);
emit(REX_IMUL_RM);
emitByte(0x04 + 8 * instr.dst);
emitByte(0x06);
}
else {
emit(REX_IMUL_RM);
emitByte(0x86 + 8 * instr.dst);
genAddressImm(instr);
}
}
void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_R);
emitByte(0xe0 + instr.src);
emit(REX_MOV_R64R);
emitByte(0xc2 + 8 * instr.dst);
}
void JitCompilerX86::h_IMULH_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr, false);
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_MEM);
}
else {
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_M);
emitByte(0xa6);
genAddressImm(instr);
}
emit(REX_MOV_R64R);
emitByte(0xc2 + 8 * instr.dst);
}
void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_R);
emitByte(0xe8 + instr.src);
emit(REX_MOV_R64R);
emitByte(0xc2 + 8 * instr.dst);
}
void JitCompilerX86::h_ISMULH_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr, false);
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_IMUL_MEM);
}
else {
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_M);
emitByte(0xae);
genAddressImm(instr);
}
emit(REX_MOV_R64R);
emitByte(0xc2 + 8 * instr.dst);
}
void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) {
const uint32_t divisor = instr.getImm32();
if (!isZeroOrPowerOf2(divisor)) {
registerUsage[instr.dst] = i;
emit(MOV_RAX_I);
emit64(randomx_reciprocal_fast(divisor));
emit(REX_IMUL_RM);
emitByte(0xc0 + 8 * instr.dst);
}
}
void JitCompilerX86::h_INEG_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
emit(REX_NEG);
emitByte(0xd8 + instr.dst);
}
void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
emit(REX_XOR_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
}
else {
emit(REX_XOR_RI);
emitByte(0xf0 + instr.dst);
emit32(instr.getImm32());
}
}
void JitCompilerX86::h_IXOR_M(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
genAddressReg(instr);
emit(REX_XOR_RM);
emitByte(0x04 + 8 * instr.dst);
emitByte(0x06);
}
else {
emit(REX_XOR_RM);
emitByte(0x86 + 8 * instr.dst);
genAddressImm(instr);
}
}
void JitCompilerX86::h_IROR_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
emit(REX_MOV_RR);
emitByte(0xc8 + instr.src);
emit(REX_ROT_CL);
emitByte(0xc8 + instr.dst);
}
else {
emit(REX_ROT_I8);
emitByte(0xc8 + instr.dst);
emitByte(instr.getImm32() & 63);
}
}
void JitCompilerX86::h_IROL_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
emit(REX_MOV_RR);
emitByte(0xc8 + instr.src);
emit(REX_ROT_CL);
emitByte(0xc0 + instr.dst);
}
else {
emit(REX_ROT_I8);
emitByte(0xc0 + instr.dst);
emitByte(instr.getImm32() & 63);
}
}
void JitCompilerX86::h_ISWAP_R(Instruction& instr, int i) {
if (instr.src != instr.dst) {
registerUsage[instr.dst] = i;
registerUsage[instr.src] = i;
emit(REX_XCHG);
emitByte(0xc0 + instr.src + 8 * instr.dst);
}
}
void JitCompilerX86::h_FSWAP_R(Instruction& instr, int i) {
emit(SHUFPD);
emitByte(0xc0 + 9 * instr.dst);
emitByte(1);
}
void JitCompilerX86::h_FADD_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt;
emit(REX_ADDPD);
emitByte(0xc0 + instr.src + 8 * instr.dst);
}
void JitCompilerX86::h_FADD_M(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
genAddressReg(instr);
emit(REX_CVTDQ2PD_XMM12);
emit(REX_ADDPD);
emitByte(0xc4 + 8 * instr.dst);
}
void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt;
emit(REX_SUBPD);
emitByte(0xc0 + instr.src + 8 * instr.dst);
}
void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
genAddressReg(instr);
emit(REX_CVTDQ2PD_XMM12);
emit(REX_SUBPD);
emitByte(0xc4 + 8 * instr.dst);
}
void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
emit(REX_XORPS);
emitByte(0xc7 + 8 * instr.dst);
}
void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt;
emit(REX_MULPD);
emitByte(0xe0 + instr.src + 8 * instr.dst);
}
void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
genAddressReg(instr);
emit(REX_CVTDQ2PD_XMM12);
emit(REX_ANDPS_XMM12);
emit(REX_DIVPD);
emitByte(0xe4 + 8 * instr.dst);
}
void JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) {
instr.dst %= RegisterCountFlt;
emit(SQRTPD);
emitByte(0xe4 + 9 * instr.dst);
}
void JitCompilerX86::h_CFROUND(Instruction& instr, int i) {
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.src);
int rotate = (13 - (instr.getImm32() & 63)) & 63;
if (rotate != 0) {
emit(ROL_RAX);
emitByte(rotate);
}
emit(AND_OR_MOV_LDMXCSR);
}
void JitCompilerX86::h_CBRANCH(Instruction& instr, int i) {
int reg = instr.dst;
int target = registerUsage[reg] + 1;
emit(REX_ADD_I);
emitByte(0xc0 + reg);
int shift = instr.getModCond() + ConditionOffset;
uint32_t imm = instr.getImm32() | (1UL << shift);
if (ConditionOffset > 0 || shift > 0)
imm &= ~(1UL << (shift - 1));
emit32(imm);
emit(REX_TEST);
emitByte(0xc0 + reg);
emit32(ConditionMask << shift);
emit(JZ);
emit32(instructionOffsets[target] - (codePos + 4));
//mark all registers as used
for (unsigned j = 0; j < RegistersCount; ++j) {
registerUsage[j] = i;
}
}
void JitCompilerX86::h_ISTORE(Instruction& instr, int i) {
genAddressRegDst(instr);
emit(REX_MOV_MR);
emitByte(0x04 + 8 * instr.src);
emitByte(0x06);
}
void JitCompilerX86::h_NOP(Instruction& instr, int i) {
emit(NOP1);
}
#include "instruction_weights.hpp"
#define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x))
InstructionGeneratorX86 JitCompilerX86::engine[256] = {
INST_HANDLE(IADD_RS)
INST_HANDLE(IADD_M)
INST_HANDLE(ISUB_R)
INST_HANDLE(ISUB_M)
INST_HANDLE(IMUL_R)
INST_HANDLE(IMUL_M)
INST_HANDLE(IMULH_R)
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
INST_HANDLE(IMUL_RCP)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
INST_HANDLE(ISWAP_R)
INST_HANDLE(FSWAP_R)
INST_HANDLE(FADD_R)
INST_HANDLE(FADD_M)
INST_HANDLE(FSUB_R)
INST_HANDLE(FSUB_M)
INST_HANDLE(FSCAL_R)
INST_HANDLE(FMUL_R)
INST_HANDLE(FDIV_M)
INST_HANDLE(FSQRT_R)
INST_HANDLE(CBRANCH)
INST_HANDLE(CFROUND)
INST_HANDLE(ISTORE)
INST_HANDLE(NOP)
};
}

View file

@ -0,0 +1,142 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <cstring>
#include <vector>
#include "common.hpp"
namespace randomx {
class Program;
struct ProgramConfiguration;
class SuperscalarProgram;
class JitCompilerX86;
class Instruction;
typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int);
class JitCompilerX86 {
public:
JitCompilerX86();
~JitCompilerX86();
void generateProgram(Program&, ProgramConfiguration&);
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
template<size_t N>
void generateSuperscalarHash(SuperscalarProgram (&programs)[N], std::vector<uint64_t> &);
void generateDatasetInitCode();
ProgramFunc* getProgramFunc() {
return (ProgramFunc*)code;
}
DatasetInitFunc* getDatasetInitFunc() {
return (DatasetInitFunc*)code;
}
uint8_t* getCode() {
return code;
}
size_t getCodeSize();
void enableWriting();
void enableExecution();
void enableAll();
private:
static InstructionGeneratorX86 engine[256];
std::vector<int32_t> instructionOffsets;
int registerUsage[RegistersCount];
uint8_t* code;
int32_t codePos;
void generateProgramPrologue(Program&, ProgramConfiguration&);
void generateProgramEpilogue(Program&, ProgramConfiguration&);
void genAddressReg(Instruction&, bool);
void genAddressRegDst(Instruction&);
void genAddressImm(Instruction&);
void genSIB(int scale, int index, int base);
void generateCode(Instruction&, int);
void generateSuperscalarCode(Instruction &, std::vector<uint64_t> &);
void emitByte(uint8_t val) {
code[codePos] = val;
codePos++;
}
void emit32(uint32_t val) {
memcpy(code + codePos, &val, sizeof val);
codePos += sizeof val;
}
void emit64(uint64_t val) {
memcpy(code + codePos, &val, sizeof val);
codePos += sizeof val;
}
template<size_t N>
void emit(const uint8_t (&src)[N]) {
emit(src, N);
}
void emit(const uint8_t* src, size_t count) {
memcpy(code + codePos, src, count);
codePos += count;
}
void h_IADD_RS(Instruction&, int);
void h_IADD_M(Instruction&, int);
void h_ISUB_R(Instruction&, int);
void h_ISUB_M(Instruction&, int);
void h_IMUL_R(Instruction&, int);
void h_IMUL_M(Instruction&, int);
void h_IMULH_R(Instruction&, int);
void h_IMULH_M(Instruction&, int);
void h_ISMULH_R(Instruction&, int);
void h_ISMULH_M(Instruction&, int);
void h_IMUL_RCP(Instruction&, int);
void h_INEG_R(Instruction&, int);
void h_IXOR_R(Instruction&, int);
void h_IXOR_M(Instruction&, int);
void h_IROR_R(Instruction&, int);
void h_IROL_R(Instruction&, int);
void h_ISWAP_R(Instruction&, int);
void h_FSWAP_R(Instruction&, int);
void h_FADD_R(Instruction&, int);
void h_FADD_M(Instruction&, int);
void h_FSUB_R(Instruction&, int);
void h_FSUB_M(Instruction&, int);
void h_FSCAL_R(Instruction&, int);
void h_FMUL_R(Instruction&, int);
void h_FDIV_M(Instruction&, int);
void h_FSQRT_R(Instruction&, int);
void h_CBRANCH(Instruction&, int);
void h_CFROUND(Instruction&, int);
void h_ISTORE(Instruction&, int);
void h_NOP(Instruction&, int);
};
}

View file

@ -0,0 +1,230 @@
# Copyright (c) 2018-2019, tevador <tevador@gmail.com>
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the copyright holder nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.intel_syntax noprefix
#if defined(__APPLE__)
.text
#define DECL(x) _##x
#else
.section .text
#define DECL(x) x
#endif
#if defined(__WIN32__) || defined(__CYGWIN__)
#define WINABI
#endif
.global DECL(randomx_prefetch_scratchpad)
.global DECL(randomx_prefetch_scratchpad_end)
.global DECL(randomx_program_prologue)
.global DECL(randomx_program_loop_begin)
.global DECL(randomx_program_loop_load)
.global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_read_dataset_sshash_init)
.global DECL(randomx_program_read_dataset_sshash_fin)
.global DECL(randomx_program_loop_store)
.global DECL(randomx_program_loop_end)
.global DECL(randomx_dataset_init)
.global DECL(randomx_program_epilogue)
.global DECL(randomx_sshash_load)
.global DECL(randomx_sshash_prefetch)
.global DECL(randomx_sshash_end)
.global DECL(randomx_sshash_init)
.global DECL(randomx_program_end)
.global DECL(randomx_reciprocal_fast)
#include "configuration.h"
#define RANDOMX_SCRATCHPAD_MASK (RANDOMX_SCRATCHPAD_L3-64)
#define RANDOMX_DATASET_BASE_MASK (RANDOMX_DATASET_BASE_SIZE-64)
#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1)
#define RANDOMX_ALIGN 4096
#define SUPERSCALAR_OFFSET ((((RANDOMX_ALIGN + 32 * RANDOMX_PROGRAM_SIZE) - 1) / (RANDOMX_ALIGN) + 1) * (RANDOMX_ALIGN))
#define db .byte
DECL(randomx_prefetch_scratchpad):
mov rdx, rax
and eax, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rax]
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rdx]
DECL(randomx_prefetch_scratchpad_end):
.balign 64
DECL(randomx_program_prologue):
#if defined(WINABI)
#include "asm/program_prologue_win64.inc"
#else
#include "asm/program_prologue_linux.inc"
#endif
movapd xmm13, xmmword ptr [mantissaMask+rip]
movapd xmm14, xmmword ptr [exp240+rip]
movapd xmm15, xmmword ptr [scaleMask+rip]
mov rdx, rax
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
jmp rx_program_loop_begin
.balign 64
#include "asm/program_xmm_constants.inc"
.balign 64
DECL(randomx_program_loop_begin):
rx_program_loop_begin:
nop
DECL(randomx_program_loop_load):
#include "asm/program_loop_load.inc"
DECL(randomx_program_start):
nop
DECL(randomx_program_read_dataset):
#include "asm/program_read_dataset.inc"
DECL(randomx_program_read_dataset_sshash_init):
#include "asm/program_read_dataset_sshash_init.inc"
DECL(randomx_program_read_dataset_sshash_fin):
#include "asm/program_read_dataset_sshash_fin.inc"
DECL(randomx_program_loop_store):
#include "asm/program_loop_store.inc"
DECL(randomx_program_loop_end):
nop
.balign 64
DECL(randomx_dataset_init):
rx_dataset_init:
push rbx
push rbp
push r12
push r13
push r14
push r15
#if defined(WINABI)
push rdi
push rsi
mov rdi, qword ptr [rcx] ;# cache->memory
mov rsi, rdx ;# dataset
mov rbp, r8 ;# block index
push r9 ;# max. block index
#else
mov rdi, qword ptr [rdi] ;# cache->memory
;# dataset in rsi
mov rbp, rdx ;# block index
push rcx ;# max. block index
#endif
init_block_loop:
prefetchw byte ptr [rsi]
mov rbx, rbp
.byte 232 ;# 0xE8 = call
.int SUPERSCALAR_OFFSET - (call_offset - rx_dataset_init)
call_offset:
mov qword ptr [rsi+0], r8
mov qword ptr [rsi+8], r9
mov qword ptr [rsi+16], r10
mov qword ptr [rsi+24], r11
mov qword ptr [rsi+32], r12
mov qword ptr [rsi+40], r13
mov qword ptr [rsi+48], r14
mov qword ptr [rsi+56], r15
add rbp, 1
add rsi, 64
cmp rbp, qword ptr [rsp]
jb init_block_loop
pop rax
#if defined(WINABI)
pop rsi
pop rdi
#endif
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
ret
.balign 64
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_store.inc"
#if defined(WINABI)
#include "asm/program_epilogue_win64.inc"
#else
#include "asm/program_epilogue_linux.inc"
#endif
.balign 64
DECL(randomx_sshash_load):
#include "asm/program_sshash_load.inc"
DECL(randomx_sshash_prefetch):
#include "asm/program_sshash_prefetch.inc"
DECL(randomx_sshash_end):
nop
.balign 64
DECL(randomx_sshash_init):
lea r8, [rbx+1]
#include "asm/program_sshash_prefetch.inc"
imul r8, qword ptr [r0_mul+rip]
mov r9, qword ptr [r1_add+rip]
xor r9, r8
mov r10, qword ptr [r2_add+rip]
xor r10, r8
mov r11, qword ptr [r3_add+rip]
xor r11, r8
mov r12, qword ptr [r4_add+rip]
xor r12, r8
mov r13, qword ptr [r5_add+rip]
xor r13, r8
mov r14, qword ptr [r6_add+rip]
xor r14, r8
mov r15, qword ptr [r7_add+rip]
xor r15, r8
jmp rx_program_end
.balign 64
#include "asm/program_sshash_constants.inc"
.balign 64
DECL(randomx_program_end):
rx_program_end:
nop
DECL(randomx_reciprocal_fast):
#if !defined(WINABI)
mov rcx, rdi
#endif
#include "asm/randomx_reciprocal.inc"

View file

@ -0,0 +1,223 @@
; Copyright (c) 2018-2019, tevador <tevador@gmail.com>
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
; * Neither the name of the copyright holder nor the
; names of its contributors may be used to endorse or promote products
; derived from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
IFDEF RAX
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_prefetch_scratchpad
PUBLIC randomx_prefetch_scratchpad_end
PUBLIC randomx_program_prologue
PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load
PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_sshash_init
PUBLIC randomx_program_read_dataset_sshash_fin
PUBLIC randomx_dataset_init
PUBLIC randomx_program_loop_store
PUBLIC randomx_program_loop_end
PUBLIC randomx_program_epilogue
PUBLIC randomx_sshash_load
PUBLIC randomx_sshash_prefetch
PUBLIC randomx_sshash_end
PUBLIC randomx_sshash_init
PUBLIC randomx_program_end
PUBLIC randomx_reciprocal_fast
include asm/configuration.asm
RANDOMX_SCRATCHPAD_MASK EQU (RANDOMX_SCRATCHPAD_L3-64)
RANDOMX_DATASET_BASE_MASK EQU (RANDOMX_DATASET_BASE_SIZE-64)
RANDOMX_CACHE_MASK EQU (RANDOMX_ARGON_MEMORY*16-1)
RANDOMX_ALIGN EQU 4096
SUPERSCALAR_OFFSET EQU ((((RANDOMX_ALIGN + 32 * RANDOMX_PROGRAM_SIZE) - 1) / (RANDOMX_ALIGN) + 1) * (RANDOMX_ALIGN))
randomx_prefetch_scratchpad PROC
mov rdx, rax
and eax, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rax]
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rdx]
randomx_prefetch_scratchpad ENDP
randomx_prefetch_scratchpad_end PROC
randomx_prefetch_scratchpad_end ENDP
ALIGN 64
randomx_program_prologue PROC
include asm/program_prologue_win64.inc
movapd xmm13, xmmword ptr [mantissaMask]
movapd xmm14, xmmword ptr [exp240]
movapd xmm15, xmmword ptr [scaleMask]
mov rdx, rax
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
jmp rx_program_loop_begin
randomx_program_prologue ENDP
ALIGN 64
include asm/program_xmm_constants.inc
ALIGN 64
randomx_program_loop_begin PROC
rx_program_loop_begin::
nop
randomx_program_loop_begin ENDP
randomx_program_loop_load PROC
include asm/program_loop_load.inc
randomx_program_loop_load ENDP
randomx_program_start PROC
nop
randomx_program_start ENDP
randomx_program_read_dataset PROC
include asm/program_read_dataset.inc
randomx_program_read_dataset ENDP
randomx_program_read_dataset_sshash_init PROC
include asm/program_read_dataset_sshash_init.inc
randomx_program_read_dataset_sshash_init ENDP
randomx_program_read_dataset_sshash_fin PROC
include asm/program_read_dataset_sshash_fin.inc
randomx_program_read_dataset_sshash_fin ENDP
randomx_program_loop_store PROC
include asm/program_loop_store.inc
randomx_program_loop_store ENDP
randomx_program_loop_end PROC
nop
randomx_program_loop_end ENDP
ALIGN 64
randomx_dataset_init PROC
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
mov rdi, qword ptr [rcx] ;# cache->memory
mov rsi, rdx ;# dataset
mov rbp, r8 ;# block index
push r9 ;# max. block index
init_block_loop:
prefetchw byte ptr [rsi]
mov rbx, rbp
db 232 ;# 0xE8 = call
dd SUPERSCALAR_OFFSET - distance
distance equ $ - offset randomx_dataset_init
mov qword ptr [rsi+0], r8
mov qword ptr [rsi+8], r9
mov qword ptr [rsi+16], r10
mov qword ptr [rsi+24], r11
mov qword ptr [rsi+32], r12
mov qword ptr [rsi+40], r13
mov qword ptr [rsi+48], r14
mov qword ptr [rsi+56], r15
add rbp, 1
add rsi, 64
cmp rbp, qword ptr [rsp]
jb init_block_loop
pop r9
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
ret
randomx_dataset_init ENDP
ALIGN 64
randomx_program_epilogue PROC
include asm/program_epilogue_store.inc
include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP
ALIGN 64
randomx_sshash_load PROC
include asm/program_sshash_load.inc
randomx_sshash_load ENDP
randomx_sshash_prefetch PROC
include asm/program_sshash_prefetch.inc
randomx_sshash_prefetch ENDP
randomx_sshash_end PROC
nop
randomx_sshash_end ENDP
ALIGN 64
randomx_sshash_init PROC
lea r8, [rbx+1]
include asm/program_sshash_prefetch.inc
imul r8, qword ptr [r0_mul]
mov r9, qword ptr [r1_add]
xor r9, r8
mov r10, qword ptr [r2_add]
xor r10, r8
mov r11, qword ptr [r3_add]
xor r11, r8
mov r12, qword ptr [r4_add]
xor r12, r8
mov r13, qword ptr [r5_add]
xor r13, r8
mov r14, qword ptr [r6_add]
xor r14, r8
mov r15, qword ptr [r7_add]
xor r15, r8
jmp rx_program_end
randomx_sshash_init ENDP
ALIGN 64
include asm/program_sshash_constants.inc
ALIGN 64
randomx_program_end PROC
rx_program_end::
nop
randomx_program_end ENDP
randomx_reciprocal_fast PROC
include asm/randomx_reciprocal.inc
randomx_reciprocal_fast ENDP
_RANDOMX_JITX86_STATIC ENDS
ENDIF
END

View file

@ -0,0 +1,50 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
extern "C" {
void randomx_prefetch_scratchpad();
void randomx_prefetch_scratchpad_end();
void randomx_program_prologue();
void randomx_program_loop_begin();
void randomx_program_loop_load();
void randomx_program_start();
void randomx_program_read_dataset();
void randomx_program_read_dataset_sshash_init();
void randomx_program_read_dataset_sshash_fin();
void randomx_program_loop_store();
void randomx_program_loop_end();
void randomx_dataset_init();
void randomx_program_epilogue();
void randomx_sshash_load();
void randomx_sshash_prefetch();
void randomx_sshash_end();
void randomx_sshash_init();
void randomx_program_end();
}

71
crypto/randomx/program.hpp Executable file
View file

@ -0,0 +1,71 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <ostream>
#include "common.hpp"
#include "instruction.hpp"
#include "blake2/endian.h"
namespace randomx {
struct ProgramConfiguration {
uint64_t eMask[2];
uint32_t readReg0, readReg1, readReg2, readReg3;
};
class Program {
public:
Instruction& operator()(int pc) {
return programBuffer[pc];
}
friend std::ostream& operator<<(std::ostream& os, const Program& p) {
p.print(os);
return os;
}
uint64_t getEntropy(int i) {
return load64(&entropyBuffer[i]);
}
uint32_t getSize() {
return RANDOMX_PROGRAM_SIZE;
}
private:
void print(std::ostream& os) const {
for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) {
auto instr = programBuffer[i];
os << instr;
}
}
uint64_t entropyBuffer[16];
Instruction programBuffer[RANDOMX_PROGRAM_SIZE];
};
static_assert(sizeof(Program) % 64 == 0, "Invalid size of class randomx::Program");
}

450
crypto/randomx/randomx.cpp Executable file
View file

@ -0,0 +1,450 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "randomx.h"
#include "dataset.hpp"
#include "vm_interpreted.hpp"
#include "vm_interpreted_light.hpp"
#include "vm_compiled.hpp"
#include "vm_compiled_light.hpp"
#include "blake2/blake2.h"
#include "cpu.hpp"
#include <cassert>
#include <limits>
#if defined(__SSE__) || defined(__SSE2__) || (defined(_M_IX86_FP) && (_M_IX86_FP > 0))
#define USE_CSR_INTRINSICS
#include <xmmintrin.h>
#else
#include <cfenv>
#endif
extern "C" {
randomx_flags randomx_get_flags() {
randomx_flags flags = RANDOMX_HAVE_COMPILER ? RANDOMX_FLAG_JIT : RANDOMX_FLAG_DEFAULT;
randomx::Cpu cpu;
#ifdef RANDOMX_FORCE_SECURE
if (flags == RANDOMX_FLAG_JIT) {
flags |= RANDOMX_FLAG_SECURE;
}
#endif
if (HAVE_AES && cpu.hasAes()) {
flags |= RANDOMX_FLAG_HARD_AES;
}
if (randomx_argon2_impl_avx2() != nullptr && cpu.hasAvx2()) {
flags |= RANDOMX_FLAG_ARGON2_AVX2;
}
if (randomx_argon2_impl_ssse3() != nullptr && cpu.hasSsse3()) {
flags |= RANDOMX_FLAG_ARGON2_SSSE3;
}
return flags;
}
randomx_cache *randomx_alloc_cache(randomx_flags flags) {
randomx_cache *cache = nullptr;
auto impl = randomx::selectArgonImpl(flags);
if (impl == nullptr) {
return cache;
}
try {
cache = new randomx_cache();
cache->argonImpl = impl;
switch ((int)(flags & (RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES))) {
case RANDOMX_FLAG_DEFAULT:
cache->dealloc = &randomx::deallocCache<randomx::DefaultAllocator>;
cache->jit = nullptr;
cache->initialize = &randomx::initCache;
cache->datasetInit = &randomx::initDataset;
cache->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(randomx::CacheSize);
break;
case RANDOMX_FLAG_JIT:
cache->dealloc = &randomx::deallocCache<randomx::DefaultAllocator>;
cache->jit = new randomx::JitCompiler();
cache->initialize = &randomx::initCacheCompile;
cache->datasetInit = cache->jit->getDatasetInitFunc();
cache->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(randomx::CacheSize);
break;
case RANDOMX_FLAG_LARGE_PAGES:
cache->dealloc = &randomx::deallocCache<randomx::LargePageAllocator>;
cache->jit = nullptr;
cache->initialize = &randomx::initCache;
cache->datasetInit = &randomx::initDataset;
cache->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(randomx::CacheSize);
break;
case RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES:
cache->dealloc = &randomx::deallocCache<randomx::LargePageAllocator>;
cache->jit = new randomx::JitCompiler();
cache->initialize = &randomx::initCacheCompile;
cache->datasetInit = cache->jit->getDatasetInitFunc();
cache->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(randomx::CacheSize);
break;
default:
UNREACHABLE;
}
}
catch (std::exception &ex) {
if (cache != nullptr) {
randomx_release_cache(cache);
cache = nullptr;
}
}
if (cache && cache->memory == nullptr) {
randomx_release_cache(cache);
cache = nullptr;
}
return cache;
}
void randomx_init_cache(randomx_cache *cache, const void *key, size_t keySize) {
assert(cache != nullptr);
assert(keySize == 0 || key != nullptr);
std::string cacheKey;
cacheKey.assign((const char *)key, keySize);
if (cache->cacheKey != cacheKey || !cache->isInitialized()) {
cache->initialize(cache, key, keySize);
cache->cacheKey = cacheKey;
}
}
void *randomx_get_cache_memory(randomx_cache *cache) {
assert(cache != nullptr);
return cache->memory;
}
void randomx_release_cache(randomx_cache* cache) {
assert(cache != nullptr);
cache->dealloc(cache);
delete cache;
}
randomx_dataset *randomx_alloc_dataset(randomx_flags flags) {
//fail on 32-bit systems if DatasetSize is >= 4 GiB
if (randomx::DatasetSize > std::numeric_limits<size_t>::max()) {
return nullptr;
}
randomx_dataset *dataset = nullptr;
try {
dataset = new randomx_dataset();
if (flags & RANDOMX_FLAG_LARGE_PAGES) {
dataset->dealloc = &randomx::deallocDataset<randomx::LargePageAllocator>;
dataset->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(randomx::DatasetSize);
}
else {
dataset->dealloc = &randomx::deallocDataset<randomx::DefaultAllocator>;
dataset->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(randomx::DatasetSize);
}
}
catch (std::exception &ex) {
if (dataset != nullptr) {
randomx_release_dataset(dataset);
dataset = nullptr;
}
}
if (dataset && dataset->memory == nullptr) {
randomx_release_dataset(dataset);
dataset = nullptr;
}
return dataset;
}
constexpr unsigned long DatasetItemCount = randomx::DatasetSize / RANDOMX_DATASET_ITEM_SIZE;
unsigned long randomx_dataset_item_count() {
return DatasetItemCount;
}
void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startItem, unsigned long itemCount) {
assert(dataset != nullptr);
assert(cache != nullptr);
assert(startItem < DatasetItemCount && itemCount <= DatasetItemCount);
assert(startItem + itemCount <= DatasetItemCount);
if (itemCount < 4) {
uint8_t buf[randomx::CacheLineSize * 4];
cache->datasetInit(cache, buf, startItem, startItem + 4);
memcpy(dataset->memory + startItem * randomx::CacheLineSize, buf, itemCount * randomx::CacheLineSize);
}
else if ((itemCount % 4) == 0) {
cache->datasetInit(cache, dataset->memory + startItem * randomx::CacheLineSize, startItem, startItem + itemCount);
}
else {
cache->datasetInit(cache, dataset->memory + startItem * randomx::CacheLineSize, startItem, startItem + itemCount - (itemCount % 4));
startItem += itemCount - 4;
cache->datasetInit(cache, dataset->memory + startItem * randomx::CacheLineSize, startItem, startItem + 4);
}
}
void *randomx_get_dataset_memory(randomx_dataset *dataset) {
assert(dataset != nullptr);
return dataset->memory;
}
void randomx_release_dataset(randomx_dataset *dataset) {
assert(dataset != nullptr);
dataset->dealloc(dataset);
delete dataset;
}
randomx_vm *randomx_create_vm(randomx_flags flags, randomx_cache *cache, randomx_dataset *dataset) {
assert(cache != nullptr || (flags & RANDOMX_FLAG_FULL_MEM));
assert(cache == nullptr || cache->isInitialized());
assert(dataset != nullptr || !(flags & RANDOMX_FLAG_FULL_MEM));
randomx_vm *vm = nullptr;
try {
switch ((int)(flags & (RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES))) {
case RANDOMX_FLAG_DEFAULT:
vm = new randomx::InterpretedLightVmDefault();
break;
case RANDOMX_FLAG_FULL_MEM:
vm = new randomx::InterpretedVmDefault();
break;
case RANDOMX_FLAG_JIT:
if (flags & RANDOMX_FLAG_SECURE) {
vm = new randomx::CompiledLightVmDefaultSecure();
}
else {
vm = new randomx::CompiledLightVmDefault();
}
break;
case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT:
if (flags & RANDOMX_FLAG_SECURE) {
vm = new randomx::CompiledVmDefaultSecure();
}
else {
vm = new randomx::CompiledVmDefault();
}
break;
case RANDOMX_FLAG_HARD_AES:
vm = new randomx::InterpretedLightVmHardAes();
break;
case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_HARD_AES:
vm = new randomx::InterpretedVmHardAes();
break;
case RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES:
if (flags & RANDOMX_FLAG_SECURE) {
vm = new randomx::CompiledLightVmHardAesSecure();
}
else {
vm = new randomx::CompiledLightVmHardAes();
}
break;
case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES:
if (flags & RANDOMX_FLAG_SECURE) {
vm = new randomx::CompiledVmHardAesSecure();
}
else {
vm = new randomx::CompiledVmHardAes();
}
break;
case RANDOMX_FLAG_LARGE_PAGES:
vm = new randomx::InterpretedLightVmLargePage();
break;
case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_LARGE_PAGES:
vm = new randomx::InterpretedVmLargePage();
break;
case RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES:
if (flags & RANDOMX_FLAG_SECURE) {
vm = new randomx::CompiledLightVmLargePageSecure();
}
else {
vm = new randomx::CompiledLightVmLargePage();
}
break;
case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES:
if (flags & RANDOMX_FLAG_SECURE) {
vm = new randomx::CompiledVmLargePageSecure();
}
else {
vm = new randomx::CompiledVmLargePage();
}
break;
case RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES:
vm = new randomx::InterpretedLightVmLargePageHardAes();
break;
case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES:
vm = new randomx::InterpretedVmLargePageHardAes();
break;
case RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES:
if (flags & RANDOMX_FLAG_SECURE) {
vm = new randomx::CompiledLightVmLargePageHardAesSecure();
}
else {
vm = new randomx::CompiledLightVmLargePageHardAes();
}
break;
case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES:
if (flags & RANDOMX_FLAG_SECURE) {
vm = new randomx::CompiledVmLargePageHardAesSecure();
}
else {
vm = new randomx::CompiledVmLargePageHardAes();
}
break;
default:
UNREACHABLE;
}
if(cache != nullptr) {
vm->setCache(cache);
vm->cacheKey = cache->cacheKey;
}
if(dataset != nullptr)
vm->setDataset(dataset);
vm->allocate();
}
catch (std::exception &ex) {
delete vm;
vm = nullptr;
}
return vm;
}
void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache) {
assert(machine != nullptr);
assert(cache != nullptr && cache->isInitialized());
if (machine->cacheKey != cache->cacheKey || machine->getMemory() != cache->memory) {
machine->setCache(cache);
machine->cacheKey = cache->cacheKey;
}
}
void randomx_vm_set_dataset(randomx_vm *machine, randomx_dataset *dataset) {
assert(machine != nullptr);
assert(dataset != nullptr);
machine->setDataset(dataset);
}
void randomx_destroy_vm(randomx_vm *machine) {
assert(machine != nullptr);
delete machine;
}
void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output) {
assert(machine != nullptr);
assert(inputSize == 0 || input != nullptr);
assert(output != nullptr);
#ifdef USE_CSR_INTRINSICS
const unsigned int fpstate = _mm_getcsr();
#else
fenv_t fpstate;
fegetenv(&fpstate);
#endif
alignas(16) uint64_t tempHash[8];
int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
assert(blakeResult == 0);
machine->initScratchpad(&tempHash);
machine->resetRoundingMode();
for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
machine->run(&tempHash);
blakeResult = blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
assert(blakeResult == 0);
}
machine->run(&tempHash);
machine->getFinalResult(output, RANDOMX_HASH_SIZE);
#ifdef USE_CSR_INTRINSICS
_mm_setcsr(fpstate);
#else
fesetenv(&fpstate);
#endif
}
void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize) {
blake2b(machine->tempHash, sizeof(machine->tempHash), input, inputSize, nullptr, 0);
machine->initScratchpad(machine->tempHash);
}
void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output) {
machine->resetRoundingMode();
for (uint32_t chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
machine->run(machine->tempHash);
blake2b(machine->tempHash, sizeof(machine->tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
}
machine->run(machine->tempHash);
// Finish current hash and fill the scratchpad for the next hash at the same time
blake2b(machine->tempHash, sizeof(machine->tempHash), nextInput, nextInputSize, nullptr, 0);
machine->hashAndFill(output, RANDOMX_HASH_SIZE, machine->tempHash);
}
void randomx_calculate_hash_last(randomx_vm* machine, void* output) {
machine->resetRoundingMode();
for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
machine->run(machine->tempHash);
blake2b(machine->tempHash, sizeof(machine->tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
}
machine->run(machine->tempHash);
machine->getFinalResult(output, RANDOMX_HASH_SIZE);
}
void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out) {
assert(inputSize == 0 || input != nullptr);
assert(hash_in != nullptr);
assert(com_out != nullptr);
blake2b_state state;
blake2b_init(&state, RANDOMX_HASH_SIZE);
blake2b_update(&state, input, inputSize);
blake2b_update(&state, hash_in, RANDOMX_HASH_SIZE);
blake2b_final(&state, com_out, RANDOMX_HASH_SIZE);
}
}

288
crypto/randomx/randomx.h Executable file
View file

@ -0,0 +1,288 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef RANDOMX_H
#define RANDOMX_H
#include <stddef.h>
#include <stdint.h>
#define RANDOMX_HASH_SIZE 32
#define RANDOMX_DATASET_ITEM_SIZE 64
#ifndef RANDOMX_EXPORT
#define RANDOMX_EXPORT
#endif
typedef enum {
RANDOMX_FLAG_DEFAULT = 0,
RANDOMX_FLAG_LARGE_PAGES = 1,
RANDOMX_FLAG_HARD_AES = 2,
RANDOMX_FLAG_FULL_MEM = 4,
RANDOMX_FLAG_JIT = 8,
RANDOMX_FLAG_SECURE = 16,
RANDOMX_FLAG_ARGON2_SSSE3 = 32,
RANDOMX_FLAG_ARGON2_AVX2 = 64,
RANDOMX_FLAG_ARGON2 = 96
} randomx_flags;
typedef struct randomx_dataset randomx_dataset;
typedef struct randomx_cache randomx_cache;
typedef struct randomx_vm randomx_vm;
#if defined(__cplusplus)
#ifdef __cpp_constexpr
#define CONSTEXPR constexpr
#else
#define CONSTEXPR
#endif
inline CONSTEXPR randomx_flags operator |(randomx_flags a, randomx_flags b) {
return static_cast<randomx_flags>(static_cast<int>(a) | static_cast<int>(b));
}
inline CONSTEXPR randomx_flags operator &(randomx_flags a, randomx_flags b) {
return static_cast<randomx_flags>(static_cast<int>(a) & static_cast<int>(b));
}
inline randomx_flags& operator |=(randomx_flags& a, randomx_flags b) {
return a = a | b;
}
extern "C" {
#endif
/**
* @return The recommended flags to be used on the current machine.
* Does not include:
* RANDOMX_FLAG_LARGE_PAGES
* RANDOMX_FLAG_FULL_MEM
* RANDOMX_FLAG_SECURE
* These flags must be added manually if desired.
* On OpenBSD RANDOMX_FLAG_SECURE is enabled by default in JIT mode as W^X is enforced by the OS.
*/
RANDOMX_EXPORT randomx_flags randomx_get_flags(void);
/**
* Creates a randomx_cache structure and allocates memory for RandomX Cache.
*
* @param flags is any combination of these 2 flags (each flag can be set or not set):
* RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages
* RANDOMX_FLAG_JIT - create cache structure with JIT compilation support; this makes
* subsequent Dataset initialization faster
* Optionally, one of these two flags may be selected:
* RANDOMX_FLAG_ARGON2_SSSE3 - optimized Argon2 for CPUs with the SSSE3 instruction set
* makes subsequent cache initialization faster
* RANDOMX_FLAG_ARGON2_AVX2 - optimized Argon2 for CPUs with the AVX2 instruction set
* makes subsequent cache initialization faster
*
* @return Pointer to an allocated randomx_cache structure.
* Returns NULL if:
* (1) memory allocation fails
* (2) the RANDOMX_FLAG_JIT is set and JIT compilation is not supported on the current platform
* (3) an invalid or unsupported RANDOMX_FLAG_ARGON2 value is set
*/
RANDOMX_EXPORT randomx_cache *randomx_alloc_cache(randomx_flags flags);
/**
* Initializes the cache memory and SuperscalarHash using the provided key value.
* Does nothing if called again with the same key value.
*
* @param cache is a pointer to a previously allocated randomx_cache structure. Must not be NULL.
* @param key is a pointer to memory which contains the key value. Must not be NULL.
* @param keySize is the number of bytes of the key.
*/
RANDOMX_EXPORT void randomx_init_cache(randomx_cache *cache, const void *key, size_t keySize);
/**
* Returns a pointer to the internal memory buffer of the cache structure. The size
* of the internal memory buffer is RANDOMX_ARGON_MEMORY KiB.
*
* @param cache is a pointer to a previously allocated randomx_cache structure. Must not be NULL.
*
* @return Pointer to the internal memory buffer of the cache structure.
*/
RANDOMX_EXPORT void *randomx_get_cache_memory(randomx_cache *cache);
/**
* Releases all memory occupied by the randomx_cache structure.
*
* @param cache is a pointer to a previously allocated randomx_cache structure.
*/
RANDOMX_EXPORT void randomx_release_cache(randomx_cache* cache);
/**
* Creates a randomx_dataset structure and allocates memory for RandomX Dataset.
*
* @param flags is the initialization flags. Only one flag is supported (can be set or not set):
* RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages
*
* @return Pointer to an allocated randomx_dataset structure.
* NULL is returned if memory allocation fails.
*/
RANDOMX_EXPORT randomx_dataset *randomx_alloc_dataset(randomx_flags flags);
/**
* Gets the number of items contained in the dataset.
*
* @return the number of items contained in the dataset.
*/
RANDOMX_EXPORT unsigned long randomx_dataset_item_count(void);
/**
* Initializes dataset items.
*
* Note: In order to use the Dataset, all items from 0 to (randomx_dataset_item_count() - 1) must be initialized.
* This may be done by several calls to this function using non-overlapping item sequences.
*
* @param dataset is a pointer to a previously allocated randomx_dataset structure. Must not be NULL.
* @param cache is a pointer to a previously allocated and initialized randomx_cache structure. Must not be NULL.
* @param startItem is the item number where initialization should start.
* @param itemCount is the number of items that should be initialized.
*/
RANDOMX_EXPORT void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startItem, unsigned long itemCount);
/**
* Returns a pointer to the internal memory buffer of the dataset structure. The size
* of the internal memory buffer is randomx_dataset_item_count() * RANDOMX_DATASET_ITEM_SIZE.
*
* @param dataset is a pointer to a previously allocated randomx_dataset structure. Must not be NULL.
*
* @return Pointer to the internal memory buffer of the dataset structure.
*/
RANDOMX_EXPORT void *randomx_get_dataset_memory(randomx_dataset *dataset);
/**
* Releases all memory occupied by the randomx_dataset structure.
*
* @param dataset is a pointer to a previously allocated randomx_dataset structure.
*/
RANDOMX_EXPORT void randomx_release_dataset(randomx_dataset *dataset);
/**
* Creates and initializes a RandomX virtual machine.
*
* @param flags is any combination of these 5 flags (each flag can be set or not set):
* RANDOMX_FLAG_LARGE_PAGES - allocate scratchpad memory in large pages
* RANDOMX_FLAG_HARD_AES - virtual machine will use hardware accelerated AES
* RANDOMX_FLAG_FULL_MEM - virtual machine will use the full dataset
* RANDOMX_FLAG_JIT - virtual machine will use a JIT compiler
* RANDOMX_FLAG_SECURE - when combined with RANDOMX_FLAG_JIT, the JIT pages are never
* writable and executable at the same time (W^X policy)
* The numeric values of the first 4 flags are ordered so that a higher value will provide
* faster hash calculation and a lower numeric value will provide higher portability.
* Using RANDOMX_FLAG_DEFAULT (all flags not set) works on all platforms, but is the slowest.
* @param cache is a pointer to an initialized randomx_cache structure. Can be
* NULL if RANDOMX_FLAG_FULL_MEM is set.
* @param dataset is a pointer to a randomx_dataset structure. Can be NULL
* if RANDOMX_FLAG_FULL_MEM is not set.
*
* @return Pointer to an initialized randomx_vm structure.
* Returns NULL if:
* (1) Scratchpad memory allocation fails.
* (2) The requested initialization flags are not supported on the current platform.
* (3) cache parameter is NULL and RANDOMX_FLAG_FULL_MEM is not set
* (4) dataset parameter is NULL and RANDOMX_FLAG_FULL_MEM is set
*/
RANDOMX_EXPORT randomx_vm *randomx_create_vm(randomx_flags flags, randomx_cache *cache, randomx_dataset *dataset);
/**
* Reinitializes a virtual machine with a new Cache. This function should be called anytime
* the Cache is reinitialized with a new key. Does nothing if called with a Cache containing
* the same key value as already set.
*
* @param machine is a pointer to a randomx_vm structure that was initialized
* without RANDOMX_FLAG_FULL_MEM. Must not be NULL.
* @param cache is a pointer to an initialized randomx_cache structure. Must not be NULL.
*/
RANDOMX_EXPORT void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache);
/**
* Reinitializes a virtual machine with a new Dataset.
*
* @param machine is a pointer to a randomx_vm structure that was initialized
* with RANDOMX_FLAG_FULL_MEM. Must not be NULL.
* @param dataset is a pointer to an initialized randomx_dataset structure. Must not be NULL.
*/
RANDOMX_EXPORT void randomx_vm_set_dataset(randomx_vm *machine, randomx_dataset *dataset);
/**
* Releases all memory occupied by the randomx_vm structure.
*
* @param machine is a pointer to a previously created randomx_vm structure.
*/
RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine);
/**
* Calculates a RandomX hash value.
*
* @param machine is a pointer to a randomx_vm structure. Must not be NULL.
* @param input is a pointer to memory to be hashed. Must not be NULL.
* @param inputSize is the number of bytes to be hashed.
* @param output is a pointer to memory where the hash will be stored. Must not
* be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing.
*/
RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output);
/**
* Set of functions used to calculate multiple RandomX hashes more efficiently.
* randomx_calculate_hash_first will begin a hash calculation.
* randomx_calculate_hash_next will output the hash value of the previous input
* and begin the calculation of the next hash.
* randomx_calculate_hash_last will output the hash value of the previous input.
*
* WARNING: These functions may alter the floating point rounding mode of the calling thread.
*
* @param machine is a pointer to a randomx_vm structure. Must not be NULL.
* @param input is a pointer to memory to be hashed. Must not be NULL.
* @param inputSize is the number of bytes to be hashed.
* @param nextInput is a pointer to memory to be hashed for the next hash. Must not be NULL.
* @param nextInputSize is the number of bytes to be hashed for the next hash.
* @param output is a pointer to memory where the hash will be stored. Must not
* be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing.
*/
RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize);
RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output);
RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output);
/**
* Calculate a RandomX commitment from a RandomX hash and its input.
*
* @param input is a pointer to memory that was hashed. Must not be NULL.
* @param inputSize is the number of bytes in the input.
* @param hash_in is the output from randomx_calculate_hash* (RANDOMX_HASH_SIZE bytes).
* @param com_out is a pointer to memory where the commitment will be stored. Must not
* be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing.
*/
RANDOMX_EXPORT void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out);
#if defined(__cplusplus)
}
#endif
#endif

72
crypto/randomx/reciprocal.c Executable file
View file

@ -0,0 +1,72 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include "reciprocal.h"
/*
Calculates rcp = 2**x / divisor for highest integer x such that rcp < 2**64.
divisor must not be 0 or a power of 2
Equivalent x86 assembly (divisor in rcx):
mov edx, 1
mov r8, rcx
xor eax, eax
bsr rcx, rcx
shl rdx, cl
div r8
ret
*/
uint64_t randomx_reciprocal(uint32_t divisor) {
assert(divisor != 0);
const uint64_t p2exp63 = 1ULL << 63;
const uint64_t q = p2exp63 / divisor;
const uint64_t r = p2exp63 % divisor;
#ifdef __GNUC__
const uint32_t shift = 64 - __builtin_clzll(divisor);
#else
uint32_t shift = 32;
for (uint32_t k = 1U << 31; (k & divisor) == 0; k >>= 1)
--shift;
#endif
return (q << shift) + ((r << shift) / divisor);
}
#if !RANDOMX_HAVE_FAST_RECIPROCAL
uint64_t randomx_reciprocal_fast(uint32_t divisor) {
return randomx_reciprocal(divisor);
}
#endif

48
crypto/randomx/reciprocal.h Executable file
View file

@ -0,0 +1,48 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <stdint.h>
#if defined(_M_X64) || defined(__x86_64__)
#define RANDOMX_HAVE_FAST_RECIPROCAL 1
#else
#define RANDOMX_HAVE_FAST_RECIPROCAL 0
#endif
#if defined(__cplusplus)
extern "C" {
#endif
uint64_t randomx_reciprocal(uint32_t);
uint64_t randomx_reciprocal_fast(uint32_t);
#if defined(__cplusplus)
}
#endif

378
crypto/randomx/soft_aes.cpp Executable file
View file

@ -0,0 +1,378 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "soft_aes.h"
alignas(16) const uint8_t sbox[256] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
};
alignas(16) const uint32_t lutEnc0[256] = {
0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56, 0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec,
0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa, 0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb,
0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45, 0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b,
0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c, 0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83,
0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9, 0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a,
0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d, 0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f,
0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df, 0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea,
0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34, 0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b,
0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d, 0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413,
0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1, 0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6,
0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972, 0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85,
0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed, 0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511,
0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe, 0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b,
0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05, 0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1,
0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142, 0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf,
0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3, 0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e,
0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a, 0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6,
0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3, 0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b,
0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428, 0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad,
0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14, 0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8,
0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4, 0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2,
0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda, 0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949,
0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf, 0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810,
0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c, 0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697,
0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e, 0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f,
0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc, 0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c,
0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969, 0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27,
0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122, 0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433,
0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9, 0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5,
0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a, 0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0,
0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e, 0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c,
};
alignas(16) const uint32_t lutEnc1[256] = {
0x6363c6a5, 0x7c7cf884, 0x7777ee99, 0x7b7bf68d, 0xf2f2ff0d, 0x6b6bd6bd, 0x6f6fdeb1, 0xc5c59154,
0x30306050, 0x01010203, 0x6767cea9, 0x2b2b567d, 0xfefee719, 0xd7d7b562, 0xabab4de6, 0x7676ec9a,
0xcaca8f45, 0x82821f9d, 0xc9c98940, 0x7d7dfa87, 0xfafaef15, 0x5959b2eb, 0x47478ec9, 0xf0f0fb0b,
0xadad41ec, 0xd4d4b367, 0xa2a25ffd, 0xafaf45ea, 0x9c9c23bf, 0xa4a453f7, 0x7272e496, 0xc0c09b5b,
0xb7b775c2, 0xfdfde11c, 0x93933dae, 0x26264c6a, 0x36366c5a, 0x3f3f7e41, 0xf7f7f502, 0xcccc834f,
0x3434685c, 0xa5a551f4, 0xe5e5d134, 0xf1f1f908, 0x7171e293, 0xd8d8ab73, 0x31316253, 0x15152a3f,
0x0404080c, 0xc7c79552, 0x23234665, 0xc3c39d5e, 0x18183028, 0x969637a1, 0x05050a0f, 0x9a9a2fb5,
0x07070e09, 0x12122436, 0x80801b9b, 0xe2e2df3d, 0xebebcd26, 0x27274e69, 0xb2b27fcd, 0x7575ea9f,
0x0909121b, 0x83831d9e, 0x2c2c5874, 0x1a1a342e, 0x1b1b362d, 0x6e6edcb2, 0x5a5ab4ee, 0xa0a05bfb,
0x5252a4f6, 0x3b3b764d, 0xd6d6b761, 0xb3b37dce, 0x2929527b, 0xe3e3dd3e, 0x2f2f5e71, 0x84841397,
0x5353a6f5, 0xd1d1b968, 0x00000000, 0xededc12c, 0x20204060, 0xfcfce31f, 0xb1b179c8, 0x5b5bb6ed,
0x6a6ad4be, 0xcbcb8d46, 0xbebe67d9, 0x3939724b, 0x4a4a94de, 0x4c4c98d4, 0x5858b0e8, 0xcfcf854a,
0xd0d0bb6b, 0xefefc52a, 0xaaaa4fe5, 0xfbfbed16, 0x434386c5, 0x4d4d9ad7, 0x33336655, 0x85851194,
0x45458acf, 0xf9f9e910, 0x02020406, 0x7f7ffe81, 0x5050a0f0, 0x3c3c7844, 0x9f9f25ba, 0xa8a84be3,
0x5151a2f3, 0xa3a35dfe, 0x404080c0, 0x8f8f058a, 0x92923fad, 0x9d9d21bc, 0x38387048, 0xf5f5f104,
0xbcbc63df, 0xb6b677c1, 0xdadaaf75, 0x21214263, 0x10102030, 0xffffe51a, 0xf3f3fd0e, 0xd2d2bf6d,
0xcdcd814c, 0x0c0c1814, 0x13132635, 0xececc32f, 0x5f5fbee1, 0x979735a2, 0x444488cc, 0x17172e39,
0xc4c49357, 0xa7a755f2, 0x7e7efc82, 0x3d3d7a47, 0x6464c8ac, 0x5d5dbae7, 0x1919322b, 0x7373e695,
0x6060c0a0, 0x81811998, 0x4f4f9ed1, 0xdcdca37f, 0x22224466, 0x2a2a547e, 0x90903bab, 0x88880b83,
0x46468cca, 0xeeeec729, 0xb8b86bd3, 0x1414283c, 0xdedea779, 0x5e5ebce2, 0x0b0b161d, 0xdbdbad76,
0xe0e0db3b, 0x32326456, 0x3a3a744e, 0x0a0a141e, 0x494992db, 0x06060c0a, 0x2424486c, 0x5c5cb8e4,
0xc2c29f5d, 0xd3d3bd6e, 0xacac43ef, 0x6262c4a6, 0x919139a8, 0x959531a4, 0xe4e4d337, 0x7979f28b,
0xe7e7d532, 0xc8c88b43, 0x37376e59, 0x6d6ddab7, 0x8d8d018c, 0xd5d5b164, 0x4e4e9cd2, 0xa9a949e0,
0x6c6cd8b4, 0x5656acfa, 0xf4f4f307, 0xeaeacf25, 0x6565caaf, 0x7a7af48e, 0xaeae47e9, 0x08081018,
0xbaba6fd5, 0x7878f088, 0x25254a6f, 0x2e2e5c72, 0x1c1c3824, 0xa6a657f1, 0xb4b473c7, 0xc6c69751,
0xe8e8cb23, 0xdddda17c, 0x7474e89c, 0x1f1f3e21, 0x4b4b96dd, 0xbdbd61dc, 0x8b8b0d86, 0x8a8a0f85,
0x7070e090, 0x3e3e7c42, 0xb5b571c4, 0x6666ccaa, 0x484890d8, 0x03030605, 0xf6f6f701, 0x0e0e1c12,
0x6161c2a3, 0x35356a5f, 0x5757aef9, 0xb9b969d0, 0x86861791, 0xc1c19958, 0x1d1d3a27, 0x9e9e27b9,
0xe1e1d938, 0xf8f8eb13, 0x98982bb3, 0x11112233, 0x6969d2bb, 0xd9d9a970, 0x8e8e0789, 0x949433a7,
0x9b9b2db6, 0x1e1e3c22, 0x87871592, 0xe9e9c920, 0xcece8749, 0x5555aaff, 0x28285078, 0xdfdfa57a,
0x8c8c038f, 0xa1a159f8, 0x89890980, 0x0d0d1a17, 0xbfbf65da, 0xe6e6d731, 0x424284c6, 0x6868d0b8,
0x414182c3, 0x999929b0, 0x2d2d5a77, 0x0f0f1e11, 0xb0b07bcb, 0x5454a8fc, 0xbbbb6dd6, 0x16162c3a,
};
alignas(16) const uint32_t lutEnc2[256] = {
0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b, 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5,
0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b, 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76,
0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d, 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0,
0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf, 0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0,
0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26, 0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc,
0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1, 0x71e29371, 0xd8ab73d8, 0x31625331, 0x152a3f15,
0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3, 0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a,
0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2, 0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75,
0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a, 0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0,
0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3, 0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784,
0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced, 0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b,
0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39, 0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf,
0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb, 0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485,
0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f, 0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8,
0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f, 0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5,
0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321, 0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2,
0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec, 0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917,
0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d, 0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573,
0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc, 0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388,
0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14, 0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db,
0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a, 0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c,
0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662, 0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79,
0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d, 0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9,
0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea, 0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808,
0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e, 0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6,
0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f, 0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a,
0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66, 0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e,
0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9, 0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e,
0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311, 0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794,
0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9, 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf,
0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d, 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868,
0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f, 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16,
};
alignas(16) const uint32_t lutEnc3[256] = {
0xc6a56363, 0xf8847c7c, 0xee997777, 0xf68d7b7b, 0xff0df2f2, 0xd6bd6b6b, 0xdeb16f6f, 0x9154c5c5,
0x60503030, 0x02030101, 0xcea96767, 0x567d2b2b, 0xe719fefe, 0xb562d7d7, 0x4de6abab, 0xec9a7676,
0x8f45caca, 0x1f9d8282, 0x8940c9c9, 0xfa877d7d, 0xef15fafa, 0xb2eb5959, 0x8ec94747, 0xfb0bf0f0,
0x41ecadad, 0xb367d4d4, 0x5ffda2a2, 0x45eaafaf, 0x23bf9c9c, 0x53f7a4a4, 0xe4967272, 0x9b5bc0c0,
0x75c2b7b7, 0xe11cfdfd, 0x3dae9393, 0x4c6a2626, 0x6c5a3636, 0x7e413f3f, 0xf502f7f7, 0x834fcccc,
0x685c3434, 0x51f4a5a5, 0xd134e5e5, 0xf908f1f1, 0xe2937171, 0xab73d8d8, 0x62533131, 0x2a3f1515,
0x080c0404, 0x9552c7c7, 0x46652323, 0x9d5ec3c3, 0x30281818, 0x37a19696, 0x0a0f0505, 0x2fb59a9a,
0x0e090707, 0x24361212, 0x1b9b8080, 0xdf3de2e2, 0xcd26ebeb, 0x4e692727, 0x7fcdb2b2, 0xea9f7575,
0x121b0909, 0x1d9e8383, 0x58742c2c, 0x342e1a1a, 0x362d1b1b, 0xdcb26e6e, 0xb4ee5a5a, 0x5bfba0a0,
0xa4f65252, 0x764d3b3b, 0xb761d6d6, 0x7dceb3b3, 0x527b2929, 0xdd3ee3e3, 0x5e712f2f, 0x13978484,
0xa6f55353, 0xb968d1d1, 0x00000000, 0xc12ceded, 0x40602020, 0xe31ffcfc, 0x79c8b1b1, 0xb6ed5b5b,
0xd4be6a6a, 0x8d46cbcb, 0x67d9bebe, 0x724b3939, 0x94de4a4a, 0x98d44c4c, 0xb0e85858, 0x854acfcf,
0xbb6bd0d0, 0xc52aefef, 0x4fe5aaaa, 0xed16fbfb, 0x86c54343, 0x9ad74d4d, 0x66553333, 0x11948585,
0x8acf4545, 0xe910f9f9, 0x04060202, 0xfe817f7f, 0xa0f05050, 0x78443c3c, 0x25ba9f9f, 0x4be3a8a8,
0xa2f35151, 0x5dfea3a3, 0x80c04040, 0x058a8f8f, 0x3fad9292, 0x21bc9d9d, 0x70483838, 0xf104f5f5,
0x63dfbcbc, 0x77c1b6b6, 0xaf75dada, 0x42632121, 0x20301010, 0xe51affff, 0xfd0ef3f3, 0xbf6dd2d2,
0x814ccdcd, 0x18140c0c, 0x26351313, 0xc32fecec, 0xbee15f5f, 0x35a29797, 0x88cc4444, 0x2e391717,
0x9357c4c4, 0x55f2a7a7, 0xfc827e7e, 0x7a473d3d, 0xc8ac6464, 0xbae75d5d, 0x322b1919, 0xe6957373,
0xc0a06060, 0x19988181, 0x9ed14f4f, 0xa37fdcdc, 0x44662222, 0x547e2a2a, 0x3bab9090, 0x0b838888,
0x8cca4646, 0xc729eeee, 0x6bd3b8b8, 0x283c1414, 0xa779dede, 0xbce25e5e, 0x161d0b0b, 0xad76dbdb,
0xdb3be0e0, 0x64563232, 0x744e3a3a, 0x141e0a0a, 0x92db4949, 0x0c0a0606, 0x486c2424, 0xb8e45c5c,
0x9f5dc2c2, 0xbd6ed3d3, 0x43efacac, 0xc4a66262, 0x39a89191, 0x31a49595, 0xd337e4e4, 0xf28b7979,
0xd532e7e7, 0x8b43c8c8, 0x6e593737, 0xdab76d6d, 0x018c8d8d, 0xb164d5d5, 0x9cd24e4e, 0x49e0a9a9,
0xd8b46c6c, 0xacfa5656, 0xf307f4f4, 0xcf25eaea, 0xcaaf6565, 0xf48e7a7a, 0x47e9aeae, 0x10180808,
0x6fd5baba, 0xf0887878, 0x4a6f2525, 0x5c722e2e, 0x38241c1c, 0x57f1a6a6, 0x73c7b4b4, 0x9751c6c6,
0xcb23e8e8, 0xa17cdddd, 0xe89c7474, 0x3e211f1f, 0x96dd4b4b, 0x61dcbdbd, 0x0d868b8b, 0x0f858a8a,
0xe0907070, 0x7c423e3e, 0x71c4b5b5, 0xccaa6666, 0x90d84848, 0x06050303, 0xf701f6f6, 0x1c120e0e,
0xc2a36161, 0x6a5f3535, 0xaef95757, 0x69d0b9b9, 0x17918686, 0x9958c1c1, 0x3a271d1d, 0x27b99e9e,
0xd938e1e1, 0xeb13f8f8, 0x2bb39898, 0x22331111, 0xd2bb6969, 0xa970d9d9, 0x07898e8e, 0x33a79494,
0x2db69b9b, 0x3c221e1e, 0x15928787, 0xc920e9e9, 0x8749cece, 0xaaff5555, 0x50782828, 0xa57adfdf,
0x038f8c8c, 0x59f8a1a1, 0x09808989, 0x1a170d0d, 0x65dabfbf, 0xd731e6e6, 0x84c64242, 0xd0b86868,
0x82c34141, 0x29b09999, 0x5a772d2d, 0x1e110f0f, 0x7bcbb0b0, 0xa8fc5454, 0x6dd6bbbb, 0x2c3a1616,
};
alignas(16) const uint32_t lutDec0[256] = {
0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5,
0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b,
0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295, 0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e,
0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927, 0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d,
0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362, 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9,
0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52, 0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566,
0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3, 0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed,
0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e, 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4,
0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4, 0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd,
0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d, 0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060,
0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967, 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879,
0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000, 0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c,
0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36, 0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624,
0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b, 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c,
0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12, 0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14,
0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3, 0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b,
0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8, 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684,
0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7, 0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177,
0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947, 0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322,
0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498, 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f,
0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54, 0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382,
0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf, 0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb,
0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83, 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef,
0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029, 0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235,
0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733, 0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117,
0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4, 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546,
0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb, 0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d,
0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb, 0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a,
0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478,
0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff,
0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664, 0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0,
};
alignas(16) const uint32_t lutDec1[256] = {
0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb, 0x459d1ff1, 0x58faacab, 0x03e34b93,
0xfa302055, 0x6d76adf6, 0x76cc8891, 0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680, 0xa362b58f,
0x5ab1de49, 0x1bba2567, 0x0eea4598, 0xc0fe5de1, 0x752fc302, 0xf04c8112, 0x97468da3, 0xf9d36bc6,
0x5f8f03e7, 0x9c921595, 0x7a6dbfeb, 0x595295da, 0x83bed42d, 0x217458d3, 0x69e04929, 0xc8c98e44,
0x89c2756a, 0x798ef478, 0x3e58996b, 0x71b927dd, 0x4fe1beb6, 0xad88f017, 0xac20c966, 0x3ace7db4,
0x4adf6318, 0x311ae582, 0x33519760, 0x7f536245, 0x7764b1e0, 0xae6bbb84, 0xa081fe1c, 0x2b08f994,
0x68487058, 0xfd458f19, 0x6cde9487, 0xf87b52b7, 0xd373ab23, 0x024b72e2, 0x8f1fe357, 0xab55662a,
0x28ebb207, 0xc2b52f03, 0x7bc5869a, 0x0837d3a5, 0x872830f2, 0xa5bf23b2, 0x6a0302ba, 0x8216ed5c,
0x1ccf8a2b, 0xb479a792, 0xf207f3f0, 0xe2694ea1, 0xf4da65cd, 0xbe0506d5, 0x6234d11f, 0xfea6c48a,
0x532e349d, 0x55f3a2a0, 0xe18a0532, 0xebf6a475, 0xec830b39, 0xef6040aa, 0x9f715e06, 0x106ebd51,
0x8a213ef9, 0x06dd963d, 0x053eddae, 0xbde64d46, 0x8d5491b5, 0x5dc47105, 0xd406046f, 0x155060ff,
0xfb981924, 0xe9bdd697, 0x434089cc, 0x9ed96777, 0x42e8b0bd, 0x8b890788, 0x5b19e738, 0xeec879db,
0x0a7ca147, 0x0f427ce9, 0x1e84f8c9, 0x00000000, 0x86800983, 0xed2b3248, 0x70111eac, 0x725a6c4e,
0xff0efdfb, 0x38850f56, 0xd5ae3d1e, 0x392d3627, 0xd90f0a64, 0xa65c6821, 0x545b9bd1, 0x2e36243a,
0x670a0cb1, 0xe757930f, 0x96eeb4d2, 0x919b1b9e, 0xc5c0804f, 0x20dc61a2, 0x4b775a69, 0x1a121c16,
0xba93e20a, 0x2aa0c0e5, 0xe0223c43, 0x171b121d, 0x0d090e0b, 0xc78bf2ad, 0xa8b62db9, 0xa91e14c8,
0x19f15785, 0x0775af4c, 0xdd99eebb, 0x607fa3fd, 0x2601f79f, 0xf5725cbc, 0x3b6644c5, 0x7efb5b34,
0x29438b76, 0xc623cbdc, 0xfcedb668, 0xf1e4b863, 0xdc31d7ca, 0x85634210, 0x22971340, 0x11c68420,
0x244a857d, 0x3dbbd2f8, 0x32f9ae11, 0xa129c76d, 0x2f9e1d4b, 0x30b2dcf3, 0x52860dec, 0xe3c177d0,
0x16b32b6c, 0xb970a999, 0x489411fa, 0x64e94722, 0x8cfca8c4, 0x3ff0a01a, 0x2c7d56d8, 0x903322ef,
0x4e4987c7, 0xd138d9c1, 0xa2ca8cfe, 0x0bd49836, 0x81f5a6cf, 0xde7aa528, 0x8eb7da26, 0xbfad3fa4,
0x9d3a2ce4, 0x9278500d, 0xcc5f6a9b, 0x467e5462, 0x138df6c2, 0xb8d890e8, 0xf7392e5e, 0xafc382f5,
0x805d9fbe, 0x93d0697c, 0x2dd56fa9, 0x1225cfb3, 0x99acc83b, 0x7d1810a7, 0x639ce86e, 0xbb3bdb7b,
0x7826cd09, 0x18596ef4, 0xb79aec01, 0x9a4f83a8, 0x6e95e665, 0xe6ffaa7e, 0xcfbc2108, 0xe815efe6,
0x9be7bad9, 0x366f4ace, 0x099fead4, 0x7cb029d6, 0xb2a431af, 0x233f2a31, 0x94a5c630, 0x66a235c0,
0xbc4e7437, 0xca82fca6, 0xd090e0b0, 0xd8a73315, 0x9804f14a, 0xdaec41f7, 0x50cd7f0e, 0xf691172f,
0xd64d768d, 0xb0ef434d, 0x4daacc54, 0x0496e4df, 0xb5d19ee3, 0x886a4c1b, 0x1f2cc1b8, 0x5165467f,
0xea5e9d04, 0x358c015d, 0x7487fa73, 0x410bfb2e, 0x1d67b35a, 0xd2db9252, 0x5610e933, 0x47d66d13,
0x61d79a8c, 0x0ca1377a, 0x14f8598e, 0x3c13eb89, 0x27a9ceee, 0xc961b735, 0xe51ce1ed, 0xb1477a3c,
0xdfd29c59, 0x73f2553f, 0xce141879, 0x37c773bf, 0xcdf753ea, 0xaafd5f5b, 0x6f3ddf14, 0xdb447886,
0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672, 0x25e2bc0c, 0x493c288b, 0x950dff41,
0x01a83971, 0xb30c08de, 0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874, 0x57b8d042,
};
alignas(16) const uint32_t lutDec2[256] = {
0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b, 0x9d1ff145, 0xfaacab58, 0xe34b9303,
0x302055fa, 0x76adf66d, 0xcc889176, 0x02f5254c, 0xe54ffcd7, 0x2ac5d7cb, 0x35268044, 0x62b58fa3,
0xb1de495a, 0xba25671b, 0xea45980e, 0xfe5de1c0, 0x2fc30275, 0x4c8112f0, 0x468da397, 0xd36bc6f9,
0x8f03e75f, 0x9215959c, 0x6dbfeb7a, 0x5295da59, 0xbed42d83, 0x7458d321, 0xe0492969, 0xc98e44c8,
0xc2756a89, 0x8ef47879, 0x58996b3e, 0xb927dd71, 0xe1beb64f, 0x88f017ad, 0x20c966ac, 0xce7db43a,
0xdf63184a, 0x1ae58231, 0x51976033, 0x5362457f, 0x64b1e077, 0x6bbb84ae, 0x81fe1ca0, 0x08f9942b,
0x48705868, 0x458f19fd, 0xde94876c, 0x7b52b7f8, 0x73ab23d3, 0x4b72e202, 0x1fe3578f, 0x55662aab,
0xebb20728, 0xb52f03c2, 0xc5869a7b, 0x37d3a508, 0x2830f287, 0xbf23b2a5, 0x0302ba6a, 0x16ed5c82,
0xcf8a2b1c, 0x79a792b4, 0x07f3f0f2, 0x694ea1e2, 0xda65cdf4, 0x0506d5be, 0x34d11f62, 0xa6c48afe,
0x2e349d53, 0xf3a2a055, 0x8a0532e1, 0xf6a475eb, 0x830b39ec, 0x6040aaef, 0x715e069f, 0x6ebd5110,
0x213ef98a, 0xdd963d06, 0x3eddae05, 0xe64d46bd, 0x5491b58d, 0xc471055d, 0x06046fd4, 0x5060ff15,
0x981924fb, 0xbdd697e9, 0x4089cc43, 0xd967779e, 0xe8b0bd42, 0x8907888b, 0x19e7385b, 0xc879dbee,
0x7ca1470a, 0x427ce90f, 0x84f8c91e, 0x00000000, 0x80098386, 0x2b3248ed, 0x111eac70, 0x5a6c4e72,
0x0efdfbff, 0x850f5638, 0xae3d1ed5, 0x2d362739, 0x0f0a64d9, 0x5c6821a6, 0x5b9bd154, 0x36243a2e,
0x0a0cb167, 0x57930fe7, 0xeeb4d296, 0x9b1b9e91, 0xc0804fc5, 0xdc61a220, 0x775a694b, 0x121c161a,
0x93e20aba, 0xa0c0e52a, 0x223c43e0, 0x1b121d17, 0x090e0b0d, 0x8bf2adc7, 0xb62db9a8, 0x1e14c8a9,
0xf1578519, 0x75af4c07, 0x99eebbdd, 0x7fa3fd60, 0x01f79f26, 0x725cbcf5, 0x6644c53b, 0xfb5b347e,
0x438b7629, 0x23cbdcc6, 0xedb668fc, 0xe4b863f1, 0x31d7cadc, 0x63421085, 0x97134022, 0xc6842011,
0x4a857d24, 0xbbd2f83d, 0xf9ae1132, 0x29c76da1, 0x9e1d4b2f, 0xb2dcf330, 0x860dec52, 0xc177d0e3,
0xb32b6c16, 0x70a999b9, 0x9411fa48, 0xe9472264, 0xfca8c48c, 0xf0a01a3f, 0x7d56d82c, 0x3322ef90,
0x4987c74e, 0x38d9c1d1, 0xca8cfea2, 0xd498360b, 0xf5a6cf81, 0x7aa528de, 0xb7da268e, 0xad3fa4bf,
0x3a2ce49d, 0x78500d92, 0x5f6a9bcc, 0x7e546246, 0x8df6c213, 0xd890e8b8, 0x392e5ef7, 0xc382f5af,
0x5d9fbe80, 0xd0697c93, 0xd56fa92d, 0x25cfb312, 0xacc83b99, 0x1810a77d, 0x9ce86e63, 0x3bdb7bbb,
0x26cd0978, 0x596ef418, 0x9aec01b7, 0x4f83a89a, 0x95e6656e, 0xffaa7ee6, 0xbc2108cf, 0x15efe6e8,
0xe7bad99b, 0x6f4ace36, 0x9fead409, 0xb029d67c, 0xa431afb2, 0x3f2a3123, 0xa5c63094, 0xa235c066,
0x4e7437bc, 0x82fca6ca, 0x90e0b0d0, 0xa73315d8, 0x04f14a98, 0xec41f7da, 0xcd7f0e50, 0x91172ff6,
0x4d768dd6, 0xef434db0, 0xaacc544d, 0x96e4df04, 0xd19ee3b5, 0x6a4c1b88, 0x2cc1b81f, 0x65467f51,
0x5e9d04ea, 0x8c015d35, 0x87fa7374, 0x0bfb2e41, 0x67b35a1d, 0xdb9252d2, 0x10e93356, 0xd66d1347,
0xd79a8c61, 0xa1377a0c, 0xf8598e14, 0x13eb893c, 0xa9ceee27, 0x61b735c9, 0x1ce1ede5, 0x477a3cb1,
0xd29c59df, 0xf2553f73, 0x141879ce, 0xc773bf37, 0xf753eacd, 0xfd5f5baa, 0x3ddf146f, 0x447886db,
0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3, 0xe2bc0c25, 0x3c288b49, 0x0dff4195,
0xa8397101, 0x0c08deb3, 0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c, 0xb8d04257,
};
alignas(16) const uint32_t lutDec3[256] = {
0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab, 0x1ff1459d, 0xacab58fa, 0x4b9303e3,
0x2055fa30, 0xadf66d76, 0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435, 0xb58fa362,
0xde495ab1, 0x25671bba, 0x45980eea, 0x5de1c0fe, 0xc302752f, 0x8112f04c, 0x8da39746, 0x6bc6f9d3,
0x03e75f8f, 0x15959c92, 0xbfeb7a6d, 0x95da5952, 0xd42d83be, 0x58d32174, 0x492969e0, 0x8e44c8c9,
0x756a89c2, 0xf478798e, 0x996b3e58, 0x27dd71b9, 0xbeb64fe1, 0xf017ad88, 0xc966ac20, 0x7db43ace,
0x63184adf, 0xe582311a, 0x97603351, 0x62457f53, 0xb1e07764, 0xbb84ae6b, 0xfe1ca081, 0xf9942b08,
0x70586848, 0x8f19fd45, 0x94876cde, 0x52b7f87b, 0xab23d373, 0x72e2024b, 0xe3578f1f, 0x662aab55,
0xb20728eb, 0x2f03c2b5, 0x869a7bc5, 0xd3a50837, 0x30f28728, 0x23b2a5bf, 0x02ba6a03, 0xed5c8216,
0x8a2b1ccf, 0xa792b479, 0xf3f0f207, 0x4ea1e269, 0x65cdf4da, 0x06d5be05, 0xd11f6234, 0xc48afea6,
0x349d532e, 0xa2a055f3, 0x0532e18a, 0xa475ebf6, 0x0b39ec83, 0x40aaef60, 0x5e069f71, 0xbd51106e,
0x3ef98a21, 0x963d06dd, 0xddae053e, 0x4d46bde6, 0x91b58d54, 0x71055dc4, 0x046fd406, 0x60ff1550,
0x1924fb98, 0xd697e9bd, 0x89cc4340, 0x67779ed9, 0xb0bd42e8, 0x07888b89, 0xe7385b19, 0x79dbeec8,
0xa1470a7c, 0x7ce90f42, 0xf8c91e84, 0x00000000, 0x09838680, 0x3248ed2b, 0x1eac7011, 0x6c4e725a,
0xfdfbff0e, 0x0f563885, 0x3d1ed5ae, 0x3627392d, 0x0a64d90f, 0x6821a65c, 0x9bd1545b, 0x243a2e36,
0x0cb1670a, 0x930fe757, 0xb4d296ee, 0x1b9e919b, 0x804fc5c0, 0x61a220dc, 0x5a694b77, 0x1c161a12,
0xe20aba93, 0xc0e52aa0, 0x3c43e022, 0x121d171b, 0x0e0b0d09, 0xf2adc78b, 0x2db9a8b6, 0x14c8a91e,
0x578519f1, 0xaf4c0775, 0xeebbdd99, 0xa3fd607f, 0xf79f2601, 0x5cbcf572, 0x44c53b66, 0x5b347efb,
0x8b762943, 0xcbdcc623, 0xb668fced, 0xb863f1e4, 0xd7cadc31, 0x42108563, 0x13402297, 0x842011c6,
0x857d244a, 0xd2f83dbb, 0xae1132f9, 0xc76da129, 0x1d4b2f9e, 0xdcf330b2, 0x0dec5286, 0x77d0e3c1,
0x2b6c16b3, 0xa999b970, 0x11fa4894, 0x472264e9, 0xa8c48cfc, 0xa01a3ff0, 0x56d82c7d, 0x22ef9033,
0x87c74e49, 0xd9c1d138, 0x8cfea2ca, 0x98360bd4, 0xa6cf81f5, 0xa528de7a, 0xda268eb7, 0x3fa4bfad,
0x2ce49d3a, 0x500d9278, 0x6a9bcc5f, 0x5462467e, 0xf6c2138d, 0x90e8b8d8, 0x2e5ef739, 0x82f5afc3,
0x9fbe805d, 0x697c93d0, 0x6fa92dd5, 0xcfb31225, 0xc83b99ac, 0x10a77d18, 0xe86e639c, 0xdb7bbb3b,
0xcd097826, 0x6ef41859, 0xec01b79a, 0x83a89a4f, 0xe6656e95, 0xaa7ee6ff, 0x2108cfbc, 0xefe6e815,
0xbad99be7, 0x4ace366f, 0xead4099f, 0x29d67cb0, 0x31afb2a4, 0x2a31233f, 0xc63094a5, 0x35c066a2,
0x7437bc4e, 0xfca6ca82, 0xe0b0d090, 0x3315d8a7, 0xf14a9804, 0x41f7daec, 0x7f0e50cd, 0x172ff691,
0x768dd64d, 0x434db0ef, 0xcc544daa, 0xe4df0496, 0x9ee3b5d1, 0x4c1b886a, 0xc1b81f2c, 0x467f5165,
0x9d04ea5e, 0x015d358c, 0xfa737487, 0xfb2e410b, 0xb35a1d67, 0x9252d2db, 0xe9335610, 0x6d1347d6,
0x9a8c61d7, 0x377a0ca1, 0x598e14f8, 0xeb893c13, 0xceee27a9, 0xb735c961, 0xe1ede51c, 0x7a3cb147,
0x9c59dfd2, 0x553f73f2, 0x1879ce14, 0x73bf37c7, 0x53eacdf7, 0x5f5baafd, 0xdf146f3d, 0x7886db44,
0xca81f3af, 0xb93ec468, 0x382c3424, 0xc25f40a3, 0x1672c31d, 0xbc0c25e2, 0x288b493c, 0xff41950d,
0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8,
};
alignas(16) const uint8_t lutEncIndex[4][32] = {
{ 0, 255, 255, 255, 4, 255, 255, 255, 8, 255, 255, 255, 12, 255, 255, 255, 16, 255, 255, 255, 20, 255, 255, 255, 24, 255, 255, 255, 28, 255, 255, 255 },
{ 5, 255, 255, 255, 9, 255, 255, 255, 13, 255, 255, 255, 1, 255, 255, 255, 21, 255, 255, 255, 25, 255, 255, 255, 29, 255, 255, 255, 17, 255, 255, 255 },
{ 10, 255, 255, 255, 14, 255, 255, 255, 2, 255, 255, 255, 6, 255, 255, 255, 26, 255, 255, 255, 30, 255, 255, 255, 18, 255, 255, 255, 22, 255, 255, 255 },
{ 15, 255, 255, 255, 3, 255, 255, 255, 7, 255, 255, 255, 11, 255, 255, 255, 31, 255, 255, 255, 19, 255, 255, 255, 23, 255, 255, 255, 27, 255, 255, 255 }
};
alignas(16) const uint8_t lutDecIndex[4][32] = {
{ 0, 255, 255, 255, 4, 255, 255, 255, 8, 255, 255, 255, 12, 255, 255, 255, 16, 255, 255, 255, 20, 255, 255, 255, 24, 255, 255, 255, 28, 255, 255, 255 },
{ 13, 255, 255, 255, 1, 255, 255, 255, 5, 255, 255, 255, 9, 255, 255, 255, 29, 255, 255, 255, 17, 255, 255, 255, 21, 255, 255, 255, 25, 255, 255, 255 },
{ 10, 255, 255, 255, 14, 255, 255, 255, 2, 255, 255, 255, 6, 255, 255, 255, 26, 255, 255, 255, 30, 255, 255, 255, 18, 255, 255, 255, 22, 255, 255, 255 },
{ 7, 255, 255, 255, 11, 255, 255, 255, 15, 255, 255, 255, 3, 255, 255, 255, 23, 255, 255, 255, 27, 255, 255, 255, 31, 255, 255, 255, 19, 255, 255, 255 }
};
rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) {
uint32_t s0, s1, s2, s3;
s0 = rx_vec_i128_w(in);
s1 = rx_vec_i128_z(in);
s2 = rx_vec_i128_y(in);
s3 = rx_vec_i128_x(in);
rx_vec_i128 out = rx_set_int_vec_i128(
(lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]),
(lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]),
(lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]),
(lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24])
);
return rx_xor_vec_i128(out, key);
}
rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) {
uint32_t s0, s1, s2, s3;
s0 = rx_vec_i128_w(in);
s1 = rx_vec_i128_z(in);
s2 = rx_vec_i128_y(in);
s3 = rx_vec_i128_x(in);
rx_vec_i128 out = rx_set_int_vec_i128(
(lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]),
(lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]),
(lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]),
(lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24])
);
return rx_xor_vec_i128(out, key);
}

58
crypto/randomx/soft_aes.h Executable file
View file

@ -0,0 +1,58 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <stdint.h>
#include "intrin_portable.h"
extern const uint32_t lutEnc0[256];
extern const uint32_t lutEnc1[256];
extern const uint32_t lutEnc2[256];
extern const uint32_t lutEnc3[256];
extern const uint32_t lutDec0[256];
extern const uint32_t lutDec1[256];
extern const uint32_t lutDec2[256];
extern const uint32_t lutDec3[256];
extern const uint8_t lutEncIndex[4][32];
extern const uint8_t lutDecIndex[4][32];
rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key);
rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key);
template<bool soft>
inline rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) {
return soft ? soft_aesenc(in, key) : rx_aesenc_vec_i128(in, key);
}
template<bool soft>
inline rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) {
return soft ? soft_aesdec(in, key) : rx_aesdec_vec_i128(in, key);
}

903
crypto/randomx/superscalar.cpp Executable file
View file

@ -0,0 +1,903 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "configuration.h"
#include "program.hpp"
#include "blake2/endian.h"
#include <iostream>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <iomanip>
#include "superscalar.hpp"
#include "intrin_portable.h"
#include "reciprocal.h"
#include "common.hpp"
namespace randomx {
static bool isMultiplication(SuperscalarInstructionType type) {
return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP;
}
//uOPs (micro-ops) are represented only by the execution port they can go to
namespace ExecutionPort {
using type = int;
constexpr type Null = 0;
constexpr type P0 = 1;
constexpr type P1 = 2;
constexpr type P5 = 4;
constexpr type P01 = P0 | P1;
constexpr type P05 = P0 | P5;
constexpr type P015 = P0 | P1 | P5;
}
//Macro-operation as output of the x86 decoder
//Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op
//Macro-op can consist of 1 or 2 uOPs.
class MacroOp {
public:
MacroOp(const char* name, int size)
: name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {}
MacroOp(const char* name, int size, int latency, ExecutionPort::type uop)
: name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {}
MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2)
: name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {}
MacroOp(const MacroOp& parent, bool dependent)
: name_(parent.name_), size_(parent.size_), latency_(parent.latency_), uop1_(parent.uop1_), uop2_(parent.uop2_), dependent_(dependent) {}
const char* getName() const {
return name_;
}
int getSize() const {
return size_;
}
int getLatency() const {
return latency_;
}
ExecutionPort::type getUop1() const {
return uop1_;
}
ExecutionPort::type getUop2() const {
return uop2_;
}
bool isSimple() const {
return uop2_ == ExecutionPort::Null;
}
bool isEliminated() const {
return uop1_ == ExecutionPort::Null;
}
bool isDependent() const {
return dependent_;
}
static const MacroOp Add_rr;
static const MacroOp Add_ri;
static const MacroOp Lea_sib;
static const MacroOp Sub_rr;
static const MacroOp Imul_rr;
static const MacroOp Imul_r;
static const MacroOp Mul_r;
static const MacroOp Mov_rr;
static const MacroOp Mov_ri64;
static const MacroOp Xor_rr;
static const MacroOp Xor_ri;
static const MacroOp Ror_rcl;
static const MacroOp Ror_ri;
static const MacroOp TestJz_fused;
static const MacroOp Xor_self;
static const MacroOp Cmp_ri;
static const MacroOp Setcc_r;
private:
const char* name_;
int size_;
int latency_;
ExecutionPort::type uop1_;
ExecutionPort::type uop2_;
bool dependent_ = false;
};
//Size: 3 bytes
const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015);
const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015);
const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015);
const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3);
//Size: 4 bytes
const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01);
const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1);
const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05);
//Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes)
const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015);
const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015);
//Size: 10 bytes
const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015);
//Unused:
const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5);
const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3);
const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015);
const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05);
const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5);
const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr };
const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr };
const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) };
class SuperscalarInstructionInfo {
public:
const char* getName() const {
return name_;
}
int getSize() const {
return ops_.size();
}
bool isSimple() const {
return getSize() == 1;
}
int getLatency() const {
return latency_;
}
const MacroOp& getOp(int index) const {
return ops_[index];
}
SuperscalarInstructionType getType() const {
return type_;
}
int getResultOp() const {
return resultOp_;
}
int getDstOp() const {
return dstOp_;
}
int getSrcOp() const {
return srcOp_;
}
static const SuperscalarInstructionInfo ISUB_R;
static const SuperscalarInstructionInfo IXOR_R;
static const SuperscalarInstructionInfo IADD_RS;
static const SuperscalarInstructionInfo IMUL_R;
static const SuperscalarInstructionInfo IROR_C;
static const SuperscalarInstructionInfo IADD_C7;
static const SuperscalarInstructionInfo IXOR_C7;
static const SuperscalarInstructionInfo IADD_C8;
static const SuperscalarInstructionInfo IXOR_C8;
static const SuperscalarInstructionInfo IADD_C9;
static const SuperscalarInstructionInfo IXOR_C9;
static const SuperscalarInstructionInfo IMULH_R;
static const SuperscalarInstructionInfo ISMULH_R;
static const SuperscalarInstructionInfo IMUL_RCP;
static const SuperscalarInstructionInfo NOP;
private:
const char* name_;
SuperscalarInstructionType type_;
std::vector<MacroOp> ops_;
int latency_;
int resultOp_ = 0;
int dstOp_ = 0;
int srcOp_;
SuperscalarInstructionInfo(const char* name)
: name_(name), type_(SuperscalarInstructionType::INVALID), latency_(0) {}
SuperscalarInstructionInfo(const char* name, SuperscalarInstructionType type, const MacroOp& op, int srcOp)
: name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) {
ops_.push_back(MacroOp(op));
}
template <size_t N>
SuperscalarInstructionInfo(const char* name, SuperscalarInstructionType type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp)
: name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) {
for (unsigned i = 0; i < N; ++i) {
ops_.push_back(MacroOp(arr[i]));
latency_ += ops_.back().getLatency();
}
static_assert(N > 1, "Invalid array size");
}
};
const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1);
const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP");
//these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions.
//RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate).
//Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction.
const int buffer0[] = { 4, 8, 4 };
const int buffer1[] = { 7, 3, 3, 3 };
const int buffer2[] = { 3, 7, 3, 3 };
const int buffer3[] = { 4, 9, 3 };
const int buffer4[] = { 4, 4, 4, 4 };
const int buffer5[] = { 3, 3, 10 };
class DecoderBuffer {
public:
static const DecoderBuffer Default;
template <size_t N>
DecoderBuffer(const char* name, int index, const int(&arr)[N])
: name_(name), index_(index), counts_(arr), opsCount_(N) {}
const int* getCounts() const {
return counts_;
}
int getSize() const {
return opsCount_;
}
int getIndex() const {
return index_;
}
const char* getName() const {
return name_;
}
const DecoderBuffer* fetchNext(SuperscalarInstructionType instrType, int cycle, int mulCount, Blake2Generator& gen) const {
//If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10
//because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs.
//Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops.
if (instrType == SuperscalarInstructionType::IMULH_R || instrType == SuperscalarInstructionType::ISMULH_R)
return &decodeBuffer3310;
//To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications
//is lower than the number of cycles.
if (mulCount < cycle + 1)
return &decodeBuffer4444;
//If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication.
if(instrType == SuperscalarInstructionType::IMUL_RCP)
return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493;
//Default: select a random fetch configuration.
return fetchNextDefault(gen);
}
private:
const char* name_;
int index_;
const int* counts_;
int opsCount_;
DecoderBuffer() : index_(-1) {}
static const DecoderBuffer decodeBuffer484;
static const DecoderBuffer decodeBuffer7333;
static const DecoderBuffer decodeBuffer3733;
static const DecoderBuffer decodeBuffer493;
static const DecoderBuffer decodeBuffer4444;
static const DecoderBuffer decodeBuffer3310;
static const DecoderBuffer* decodeBuffers[4];
const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const {
return decodeBuffers[gen.getByte() & 3];
}
};
const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0);
const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1);
const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2);
const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer("4,9,3", 3, buffer3);
const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4);
const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 5, buffer5);
const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = {
&DecoderBuffer::decodeBuffer484,
&DecoderBuffer::decodeBuffer7333,
&DecoderBuffer::decodeBuffer3733,
&DecoderBuffer::decodeBuffer493,
};
const DecoderBuffer DecoderBuffer::Default = DecoderBuffer();
const SuperscalarInstructionInfo* slot_3[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R };
const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R };
const SuperscalarInstructionInfo* slot_4[] = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS };
const SuperscalarInstructionInfo* slot_7[] = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 };
const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 };
const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 };
const SuperscalarInstructionInfo* slot_10 = &SuperscalarInstructionInfo::IMUL_RCP;
static bool selectRegister(std::vector<int>& availableRegisters, Blake2Generator& gen, int& reg) {
int index;
if (availableRegisters.size() == 0)
return false;
if (availableRegisters.size() > 1) {
index = gen.getUInt32() % availableRegisters.size();
}
else {
index = 0;
}
reg = availableRegisters[index];
return true;
}
class RegisterInfo {
public:
RegisterInfo() : latency(0), lastOpGroup(SuperscalarInstructionType::INVALID), lastOpPar(-1), value(0) {}
int latency;
SuperscalarInstructionType lastOpGroup;
int lastOpPar;
int value;
};
//"SuperscalarInstruction" consists of one or more macro-ops
class SuperscalarInstruction {
public:
void toInstr(Instruction& instr) { //translate to a RandomX instruction format
instr.opcode = (int)getType();
instr.dst = dst_;
instr.src = src_ >= 0 ? src_ : dst_;
instr.setMod(mod_);
instr.setImm32(imm32_);
}
void createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) {
switch (slotSize)
{
case 3:
//if this is the last slot, we can also select "IMULH" instructions
if (isLast) {
create(slot_3L[gen.getByte() & 3], gen);
}
else {
create(slot_3[gen.getByte() & 1], gen);
}
break;
case 4:
//if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions
if (fetchType == 4 && !isLast) {
create(&SuperscalarInstructionInfo::IMUL_R, gen);
}
else {
create(slot_4[gen.getByte() & 1], gen);
}
break;
case 7:
create(slot_7[gen.getByte() & 1], gen);
break;
case 8:
create(slot_8[gen.getByte() & 1], gen);
break;
case 9:
create(slot_9[gen.getByte() & 1], gen);
break;
case 10:
create(slot_10, gen);
break;
default:
UNREACHABLE;
}
}
void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) {
info_ = info;
reset();
switch (info->getType())
{
case SuperscalarInstructionType::ISUB_R: {
mod_ = 0;
imm32_ = 0;
opGroup_ = SuperscalarInstructionType::IADD_RS;
groupParIsSource_ = true;
} break;
case SuperscalarInstructionType::IXOR_R: {
mod_ = 0;
imm32_ = 0;
opGroup_ = SuperscalarInstructionType::IXOR_R;
groupParIsSource_ = true;
} break;
case SuperscalarInstructionType::IADD_RS: {
mod_ = gen.getByte();
imm32_ = 0;
opGroup_ = SuperscalarInstructionType::IADD_RS;
groupParIsSource_ = true;
} break;
case SuperscalarInstructionType::IMUL_R: {
mod_ = 0;
imm32_ = 0;
opGroup_ = SuperscalarInstructionType::IMUL_R;
groupParIsSource_ = true;
} break;
case SuperscalarInstructionType::IROR_C: {
mod_ = 0;
do {
imm32_ = gen.getByte() & 63;
} while (imm32_ == 0);
opGroup_ = SuperscalarInstructionType::IROR_C;
opGroupPar_ = -1;
} break;
case SuperscalarInstructionType::IADD_C7:
case SuperscalarInstructionType::IADD_C8:
case SuperscalarInstructionType::IADD_C9: {
mod_ = 0;
imm32_ = gen.getUInt32();
opGroup_ = SuperscalarInstructionType::IADD_C7;
opGroupPar_ = -1;
} break;
case SuperscalarInstructionType::IXOR_C7:
case SuperscalarInstructionType::IXOR_C8:
case SuperscalarInstructionType::IXOR_C9: {
mod_ = 0;
imm32_ = gen.getUInt32();
opGroup_ = SuperscalarInstructionType::IXOR_C7;
opGroupPar_ = -1;
} break;
case SuperscalarInstructionType::IMULH_R: {
canReuse_ = true;
mod_ = 0;
imm32_ = 0;
opGroup_ = SuperscalarInstructionType::IMULH_R;
opGroupPar_ = gen.getUInt32();
} break;
case SuperscalarInstructionType::ISMULH_R: {
canReuse_ = true;
mod_ = 0;
imm32_ = 0;
opGroup_ = SuperscalarInstructionType::ISMULH_R;
opGroupPar_ = gen.getUInt32();
} break;
case SuperscalarInstructionType::IMUL_RCP: {
mod_ = 0;
do {
imm32_ = gen.getUInt32();
} while (isZeroOrPowerOf2(imm32_));
opGroup_ = SuperscalarInstructionType::IMUL_RCP;
opGroupPar_ = -1;
} break;
default:
break;
}
}
bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (&registers)[8], Blake2Generator& gen) {
/*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R)
std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/
std::vector<int> availableRegisters;
//Conditions for the destination register:
// * value must be ready at the required cycle
// * cannot be the same as the source register unless the instruction allows it
// - this avoids optimizable instructions such as "xor r, r" or "sub r, r"
// * register cannot be multiplied twice in a row unless allowChainedMul is true
// - this avoids accumulation of trailing zeroes in registers due to excessive multiplication
// - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator)
// * either the last instruction applied to the register or its source must be different than this instruction
// - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2"
// * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction)
for (unsigned i = 0; i < 8; ++i) {
if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != RegisterNeedsDisplacement))
availableRegisters.push_back(i);
}
return selectRegister(availableRegisters, gen, dst_);
}
bool selectSource(int cycle, RegisterInfo(&registers)[8], Blake2Generator& gen) {
std::vector<int> availableRegisters;
//all registers that are ready at the cycle
for (unsigned i = 0; i < 8; ++i) {
if (registers[i].latency <= cycle)
availableRegisters.push_back(i);
}
//if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination
if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) {
if (availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement) {
opGroupPar_ = src_ = RegisterNeedsDisplacement;
return true;
}
}
if (selectRegister(availableRegisters, gen, src_)) {
if (groupParIsSource_)
opGroupPar_ = src_;
return true;
}
return false;
}
SuperscalarInstructionType getType() {
return info_->getType();
}
int getSource() {
return src_;
}
int getDestination() {
return dst_;
}
SuperscalarInstructionType getGroup() {
return opGroup_;
}
int getGroupPar() {
return opGroupPar_;
}
const SuperscalarInstructionInfo& getInfo() const {
return *info_;
}
static const SuperscalarInstruction Null;
private:
const SuperscalarInstructionInfo* info_;
int src_ = -1;
int dst_ = -1;
int mod_;
uint32_t imm32_;
SuperscalarInstructionType opGroup_;
int opGroupPar_;
bool canReuse_ = false;
bool groupParIsSource_ = false;
void reset() {
src_ = dst_ = -1;
canReuse_ = groupParIsSource_ = false;
}
SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) {
}
};
const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP);
constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4;
constexpr int LOOK_FORWARD_CYCLES = 4;
constexpr int MAX_THROWAWAY_COUNT = 256;
template<bool commit>
static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) {
//The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload
//port P1 (multiplication) by instructions that can go to any port.
for (; cycle < CYCLE_MAP_SIZE; ++cycle) {
if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) {
if (commit) {
if (trace) std::cout << "; P5 at cycle " << cycle << std::endl;
portBusy[cycle][2] = uop;
}
return cycle;
}
if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) {
if (commit) {
if (trace) std::cout << "; P0 at cycle " << cycle << std::endl;
portBusy[cycle][0] = uop;
}
return cycle;
}
if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) {
if (commit) {
if (trace) std::cout << "; P1 at cycle " << cycle << std::endl;
portBusy[cycle][1] = uop;
}
return cycle;
}
}
return -1;
}
template<bool commit>
static int scheduleMop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) {
//if this macro-op depends on the previous one, increase the starting cycle if needed
//this handles an explicit dependency chain in IMUL_RCP
if (mop.isDependent()) {
cycle = std::max(cycle, depCycle);
}
//move instructions are eliminated and don't need an execution unit
if (mop.isEliminated()) {
if (commit)
if (trace) std::cout << "; (eliminated)" << std::endl;
return cycle;
}
else if (mop.isSimple()) {
//this macro-op has only one uOP
return scheduleUop<commit>(mop.getUop1(), portBusy, cycle);
}
else {
//macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle
for (; cycle < CYCLE_MAP_SIZE; ++cycle) {
int cycle1 = scheduleUop<false>(mop.getUop1(), portBusy, cycle);
int cycle2 = scheduleUop<false>(mop.getUop2(), portBusy, cycle);
if (cycle1 >= 0 && cycle1 == cycle2) {
if (commit) {
scheduleUop<true>(mop.getUop1(), portBusy, cycle1);
scheduleUop<true>(mop.getUop2(), portBusy, cycle2);
}
return cycle1;
}
}
}
return -1;
}
void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) {
ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3];
memset(portBusy, 0, sizeof(portBusy));
RegisterInfo registers[8];
const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default;
SuperscalarInstruction currentInstruction = SuperscalarInstruction::Null;
int macroOpIndex = 0;
int codeSize = 0;
int macroOpCount = 0;
int cycle = 0;
int depCycle = 0;
int retireCycle = 0;
bool portsSaturated = false;
int programSize = 0;
int mulCount = 0;
int decodeCycle;
int throwAwayCount = 0;
//decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated.
//Each decode cycle decodes 16 bytes of x86 code.
//Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always
//saturated first. The cycle limit is present only to guarantee loop termination.
//Program size is limited to SuperscalarMaxSize instructions.
for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && programSize < SuperscalarMaxSize; ++decodeCycle) {
//select a decode configuration
decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen);
if (trace) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl;
int bufferIndex = 0;
//fill all instruction slots in the current decode buffer
while (bufferIndex < decodeBuffer->getSize()) {
int topCycle = cycle;
//if we have issued all macro-ops for the current RandomX instruction, create a new instruction
if (macroOpIndex >= currentInstruction.getInfo().getSize()) {
if (portsSaturated || programSize >= SuperscalarMaxSize)
break;
//select an instruction so that the first macro-op fits into the current slot
currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0);
macroOpIndex = 0;
if (trace) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl;
}
const MacroOp& mop = currentInstruction.getInfo().getOp(macroOpIndex);
if (trace) std::cout << mop.getName() << " ";
//calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution
int scheduleCycle = scheduleMop<false>(mop, portBusy, cycle, depCycle);
if (scheduleCycle < 0) {
if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl;
//__debugbreak();
portsSaturated = true;
break;
}
//find a source register (if applicable) that will be ready when this instruction executes
if (macroOpIndex == currentInstruction.getInfo().getSrcOp()) {
int forward;
//if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward
for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) {
if (trace) std::cout << "; src STALL at cycle " << cycle << std::endl;
++scheduleCycle;
++cycle;
}
//if no register was found, throw the instruction away and try another one
if (forward == LOOK_FORWARD_CYCLES) {
if (throwAwayCount < MAX_THROWAWAY_COUNT) {
throwAwayCount++;
macroOpIndex = currentInstruction.getInfo().getSize();
if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl;
//cycle = topCycle;
continue;
}
//abort this decode buffer
if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl;
currentInstruction = SuperscalarInstruction::Null;
break;
}
if (trace) std::cout << "; src = r" << currentInstruction.getSource() << std::endl;
}
//find a destination register that will be ready when this instruction executes
if (macroOpIndex == currentInstruction.getInfo().getDstOp()) {
int forward;
for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) {
if (trace) std::cout << "; dst STALL at cycle " << cycle << std::endl;
++scheduleCycle;
++cycle;
}
if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away
if (throwAwayCount < MAX_THROWAWAY_COUNT) {
throwAwayCount++;
macroOpIndex = currentInstruction.getInfo().getSize();
if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl;
//cycle = topCycle;
continue;
}
//abort this decode buffer
if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl;
currentInstruction = SuperscalarInstruction::Null;
break;
}
if (trace) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl;
}
throwAwayCount = 0;
//recalculate when the instruction can be scheduled for execution based on operand availability
scheduleCycle = scheduleMop<true>(mop, portBusy, scheduleCycle, scheduleCycle);
if (scheduleCycle < 0) {
if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << scheduleCycle << ")" << std::endl;
portsSaturated = true;
break;
}
//calculate when the result will be ready
depCycle = scheduleCycle + mop.getLatency();
//if this instruction writes the result, modify register information
// RegisterInfo.latency - which cycle the register will be ready
// RegisterInfo.lastOpGroup - the last operation that was applied to the register
// RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register)
if (macroOpIndex == currentInstruction.getInfo().getResultOp()) {
int dst = currentInstruction.getDestination();
RegisterInfo& ri = registers[dst];
retireCycle = depCycle;
ri.latency = retireCycle;
ri.lastOpGroup = currentInstruction.getGroup();
ri.lastOpPar = currentInstruction.getGroupPar();
if (trace) std::cout << "; RETIRED at cycle " << retireCycle << std::endl;
}
codeSize += mop.getSize();
bufferIndex++;
macroOpIndex++;
macroOpCount++;
//terminating condition
if (scheduleCycle >= RANDOMX_SUPERSCALAR_LATENCY) {
portsSaturated = true;
}
cycle = topCycle;
//when all macro-ops of the current instruction have been issued, add the instruction into the program
if (macroOpIndex >= currentInstruction.getInfo().getSize()) {
currentInstruction.toInstr(prog(programSize++));
mulCount += isMultiplication(currentInstruction.getType());
}
}
++cycle;
}
double ipc = (macroOpCount / (double)retireCycle);
memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies));
//Calculate ASIC latency:
//Assumes 1 cycle latency for all operations and unlimited parallelization.
for (int i = 0; i < programSize; ++i) {
Instruction& instr = prog(i);
int latDst = prog.asicLatencies[instr.dst] + 1;
int latSrc = instr.dst != instr.src ? prog.asicLatencies[instr.src] + 1 : 0;
prog.asicLatencies[instr.dst] = std::max(latDst, latSrc);
}
//address register is the register with the highest ASIC latency
int asicLatencyMax = 0;
int addressReg = 0;
for (int i = 0; i < 8; ++i) {
if (prog.asicLatencies[i] > asicLatencyMax) {
asicLatencyMax = prog.asicLatencies[i];
addressReg = i;
}
prog.cpuLatencies[i] = registers[i].latency;
}
prog.setSize(programSize);
prog.setAddressRegister(addressReg);
prog.cpuLatency = retireCycle;
prog.asicLatency = asicLatencyMax;
prog.codeSize = codeSize;
prog.macroOps = macroOpCount;
prog.decodeCycles = decodeCycle;
prog.ipc = ipc;
prog.mulCount = mulCount;
/*if(INFO) std::cout << "; ALU port utilization:" << std::endl;
if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl;
int portCycles = 0;
for (int i = 0; i < CYCLE_MAP_SIZE; ++i) {
std::cout << "; " << std::setw(3) << i << " ";
for (int j = 0; j < 3; ++j) {
std::cout << (portBusy[i][j] ? '*' : '_');
portCycles += !!portBusy[i][j];
}
std::cout << std::endl;
}*/
}
void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t> *reciprocals) {
for (unsigned j = 0; j < prog.getSize(); ++j) {
Instruction& instr = prog(j);
switch ((SuperscalarInstructionType)instr.opcode)
{
case SuperscalarInstructionType::ISUB_R:
r[instr.dst] -= r[instr.src];
break;
case SuperscalarInstructionType::IXOR_R:
r[instr.dst] ^= r[instr.src];
break;
case SuperscalarInstructionType::IADD_RS:
r[instr.dst] += r[instr.src] << instr.getModShift();
break;
case SuperscalarInstructionType::IMUL_R:
r[instr.dst] *= r[instr.src];
break;
case SuperscalarInstructionType::IROR_C:
r[instr.dst] = rotr(r[instr.dst], instr.getImm32());
break;
case SuperscalarInstructionType::IADD_C7:
case SuperscalarInstructionType::IADD_C8:
case SuperscalarInstructionType::IADD_C9:
r[instr.dst] += signExtend2sCompl(instr.getImm32());
break;
case SuperscalarInstructionType::IXOR_C7:
case SuperscalarInstructionType::IXOR_C8:
case SuperscalarInstructionType::IXOR_C9:
r[instr.dst] ^= signExtend2sCompl(instr.getImm32());
break;
case SuperscalarInstructionType::IMULH_R:
r[instr.dst] = mulh(r[instr.dst], r[instr.src]);
break;
case SuperscalarInstructionType::ISMULH_R:
r[instr.dst] = smulh(r[instr.dst], r[instr.src]);
break;
case SuperscalarInstructionType::IMUL_RCP:
if (reciprocals != nullptr)
r[instr.dst] *= (*reciprocals)[instr.getImm32()];
else
r[instr.dst] *= randomx_reciprocal(instr.getImm32());
break;
default:
UNREACHABLE;
}
}
}
}

60
crypto/randomx/superscalar.hpp Executable file
View file

@ -0,0 +1,60 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include <vector>
#include "superscalar_program.hpp"
#include "blake2_generator.hpp"
namespace randomx {
// Intel Ivy Bridge reference
enum class SuperscalarInstructionType { //uOPs (decode) execution ports latency code size
ISUB_R = 0, //1 p015 1 3 (sub)
IXOR_R = 1, //1 p015 1 3 (xor)
IADD_RS = 2, //1 p01 1 4 (lea)
IMUL_R = 3, //1 p1 3 4 (imul)
IROR_C = 4, //1 p05 1 4 (ror)
IADD_C7 = 5, //1 p015 1 7 (add)
IXOR_C7 = 6, //1 p015 1 7 (xor)
IADD_C8 = 7, //1+0 p015 1 7+1 (add+nop)
IXOR_C8 = 8, //1+0 p015 1 7+1 (xor+nop)
IADD_C9 = 9, //1+0 p015 1 7+2 (add+nop)
IXOR_C9 = 10, //1+0 p015 1 7+2 (xor+nop)
IMULH_R = 11, //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+mul+mov)
ISMULH_R = 12, //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+imul+mov)
IMUL_RCP = 13, //1+1 p015+p1 4 10+4 (mov+imul)
COUNT = 14,
INVALID = -1
};
void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen);
void executeSuperscalar(uint64_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t> *reciprocals = nullptr);
}

View file

@ -0,0 +1,84 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include "instruction.hpp"
#include "common.hpp"
namespace randomx {
class SuperscalarProgram {
public:
Instruction& operator()(int pc) {
return programBuffer[pc];
}
friend std::ostream& operator<<(std::ostream& os, const SuperscalarProgram& p) {
p.print(os);
return os;
}
uint32_t getSize() {
return size;
}
void setSize(uint32_t val) {
size = val;
}
int getAddressRegister() {
return addrReg;
}
void setAddressRegister(int val) {
addrReg = val;
}
Instruction programBuffer[SuperscalarMaxSize];
uint32_t size
#ifndef NDEBUG
= 0
#endif
;
int addrReg;
double ipc;
int codeSize;
int macroOps;
int decodeCycles;
int cpuLatency;
int asicLatency;
int mulCount;
int cpuLatencies[8];
int asicLatencies[8];
private:
void print(std::ostream& os) const {
for (unsigned i = 0; i < size; ++i) {
auto instr = programBuffer[i];
os << instr;
}
}
};
}

View file

@ -0,0 +1,145 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cstring>
#include <iomanip>
#include <stdexcept>
#include "virtual_machine.hpp"
#include "common.hpp"
#include "aes_hash.hpp"
#include "blake2/blake2.h"
#include "intrin_portable.h"
#include "allocator.hpp"
randomx_vm::~randomx_vm() {
}
void randomx_vm::resetRoundingMode() {
rx_reset_float_state();
}
namespace randomx {
static inline uint64_t getSmallPositiveFloatBits(uint64_t entropy) {
auto exponent = entropy >> 59; //0..31
auto mantissa = entropy & mantissaMask;
exponent += exponentBias;
exponent &= exponentMask;
exponent <<= mantissaSize;
return exponent | mantissa;
}
static inline uint64_t getStaticExponent(uint64_t entropy) {
auto exponent = constExponentBits;
exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits;
exponent <<= mantissaSize;
return exponent;
}
static inline uint64_t getFloatMask(uint64_t entropy) {
constexpr uint64_t mask22bit = (1ULL << 22) - 1;
return (entropy & mask22bit) | getStaticExponent(entropy);
}
}
void randomx_vm::initialize() {
store64(&reg.a[0].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(0)));
store64(&reg.a[0].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(1)));
store64(&reg.a[1].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(2)));
store64(&reg.a[1].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(3)));
store64(&reg.a[2].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(4)));
store64(&reg.a[2].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(5)));
store64(&reg.a[3].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(6)));
store64(&reg.a[3].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(7)));
mem.ma = program.getEntropy(8) & randomx::CacheLineAlignMask;
mem.mx = program.getEntropy(10);
auto addressRegisters = program.getEntropy(12);
config.readReg0 = 0 + (addressRegisters & 1);
addressRegisters >>= 1;
config.readReg1 = 2 + (addressRegisters & 1);
addressRegisters >>= 1;
config.readReg2 = 4 + (addressRegisters & 1);
addressRegisters >>= 1;
config.readReg3 = 6 + (addressRegisters & 1);
datasetOffset = (program.getEntropy(13) % (randomx::DatasetExtraItems + 1)) * randomx::CacheLineSize;
store64(&config.eMask[0], randomx::getFloatMask(program.getEntropy(14)));
store64(&config.eMask[1], randomx::getFloatMask(program.getEntropy(15)));
}
namespace randomx {
alignas(16) volatile static rx_vec_i128 aesDummy;
template<class Allocator, bool softAes>
VmBase<Allocator, softAes>::~VmBase() {
Allocator::freeMemory(scratchpad, ScratchpadSize);
}
template<class Allocator, bool softAes>
void VmBase<Allocator, softAes>::allocate() {
if (datasetPtr == nullptr)
throw std::invalid_argument("Cache/Dataset not set");
#ifndef __riscv
if (!softAes) { //if hardware AES is not supported, it's better to fail now than to return a ticking bomb
rx_vec_i128 tmp = rx_load_vec_i128((const rx_vec_i128*)&aesDummy);
tmp = rx_aesenc_vec_i128(tmp, tmp);
rx_store_vec_i128((rx_vec_i128*)&aesDummy, tmp);
}
#endif
scratchpad = (uint8_t*)Allocator::allocMemory(ScratchpadSize);
}
template<class Allocator, bool softAes>
void VmBase<Allocator, softAes>::getFinalResult(void* out, size_t outSize) {
hashAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a);
blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
}
template<class Allocator, bool softAes>
void VmBase<Allocator, softAes>::hashAndFill(void* out, size_t outSize, uint64_t *fill_state) {
hashAndFillAes1Rx4<softAes>((void*) getScratchpad(), ScratchpadSize, &reg.a, fill_state);
blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
}
template<class Allocator, bool softAes>
void VmBase<Allocator, softAes>::initScratchpad(void* seed) {
fillAes1Rx4<softAes>(seed, ScratchpadSize, scratchpad);
}
template<class Allocator, bool softAes>
void VmBase<Allocator, softAes>::generateProgram(void* seed) {
fillAes4Rx4<softAes>(seed, sizeof(program), &program);
}
template class VmBase<AlignedAllocator<CacheLineSize>, false>;
template class VmBase<AlignedAllocator<CacheLineSize>, true>;
template class VmBase<LargePageAllocator, false>;
template class VmBase<LargePageAllocator, true>;
}

View file

@ -0,0 +1,91 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <cstdint>
#include "common.hpp"
#include "program.hpp"
/* Global namespace for C binding */
class randomx_vm {
public:
virtual ~randomx_vm() = 0;
virtual void allocate() = 0;
virtual void getFinalResult(void* out, size_t outSize) = 0;
virtual void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) = 0;
virtual void setDataset(randomx_dataset* dataset) { }
virtual void setCache(randomx_cache* cache) { }
virtual void initScratchpad(void* seed) = 0;
virtual void run(void* seed) = 0;
void resetRoundingMode();
randomx::RegisterFile *getRegisterFile() {
return &reg;
}
const void* getScratchpad() {
return scratchpad;
}
const randomx::Program& getProgram()
{
return program;
}
const uint8_t* getMemory() const {
return mem.memory;
}
protected:
void initialize();
alignas(64) randomx::Program program;
alignas(64) randomx::RegisterFile reg;
alignas(16) randomx::ProgramConfiguration config;
randomx::MemoryRegisters mem;
uint8_t* scratchpad = nullptr;
union {
randomx_cache* cachePtr = nullptr;
randomx_dataset* datasetPtr;
};
uint64_t datasetOffset;
public:
std::string cacheKey;
alignas(16) uint64_t tempHash[8]; //8 64-bit values used to store intermediate data
};
namespace randomx {
template<class Allocator, bool softAes>
class VmBase : public randomx_vm {
public:
~VmBase() override;
void allocate() override;
void initScratchpad(void* seed) override;
void getFinalResult(void* out, size_t outSize) override;
void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) override;
protected:
void generateProgram(void* seed);
};
}

243
crypto/randomx/virtual_memory.c Executable file
View file

@ -0,0 +1,243 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_WIN32) || defined(__CYGWIN__)
#include <windows.h>
#else
#define _GNU_SOURCE 1 /* needed for MAP_ANONYMOUS on older platforms */
#ifdef __APPLE__
#include <mach/vm_statistics.h>
#include <TargetConditionals.h>
#include <AvailabilityMacros.h>
# if TARGET_OS_OSX
# define USE_PTHREAD_JIT_WP 1
# include <pthread.h>
# include <sys/utsname.h>
# include <stdio.h>
# endif
#endif
#include <sys/types.h>
#include <sys/mman.h>
#include <errno.h>
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#define PAGE_READONLY PROT_READ
#define PAGE_READWRITE (PROT_READ | PROT_WRITE)
#define PAGE_EXECUTE_READ (PROT_READ | PROT_EXEC)
#define PAGE_EXECUTE_READWRITE (PROT_READ | PROT_WRITE | PROT_EXEC)
#endif
#include "virtual_memory.h"
#if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \
&& MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0
static int MacOSchecked, MacOSver;
/* This function is used implicitly by clang's __builtin_available() checker.
* When cross-compiling, the library containing this function doesn't exist,
* and linking will fail because the symbol is unresolved. The function here
* is a quick and dirty hack to get close enough to identify MacOSX 11.0.
*/
static int32_t __isOSVersionAtLeast(int32_t major, int32_t minor, int32_t subminor) {
if (!MacOSchecked) {
struct utsname ut;
int mmaj, mmin;
uname(&ut);
sscanf(ut.release, "%d.%d", &mmaj, &mmin);
// The utsname release version is 9 greater than the canonical OS version
mmaj -= 9;
MacOSver = (mmaj << 8) | mmin;
MacOSchecked = 1;
}
return MacOSver >= ((major << 8) | minor);
}
#endif
#if defined(_WIN32) || defined(__CYGWIN__)
#define Fail(func) do {*errfunc = func; return GetLastError();} while(0)
int setPrivilege(const char* pszPrivilege, BOOL bEnable, char **errfunc) {
HANDLE hToken;
TOKEN_PRIVILEGES tp;
BOOL status;
DWORD error = 0;
*errfunc = NULL;
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))
Fail("OpenProcessToken");
if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) {
*errfunc = "LookupPrivilegeValue";
error = GetLastError();
goto out;
}
tp.PrivilegeCount = 1;
if (bEnable)
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
else
tp.Privileges[0].Attributes = 0;
status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
error = GetLastError();
if (!status || (error != ERROR_SUCCESS)) {
*errfunc = "AdjustTokenPrivileges";
goto out;
}
out:
if (!CloseHandle(hToken)) {
if (*errfunc == NULL) {
*errfunc = "CloseHandle";
error = GetLastError();
}
}
return error;
}
#else
#define Fail(func) do {*errfunc = func; return errno;} while(0)
#endif
void* allocMemoryPages(size_t bytes) {
void* mem;
#if defined(_WIN32) || defined(__CYGWIN__)
mem = VirtualAlloc(NULL, bytes, MEM_COMMIT, PAGE_READWRITE);
#else
#if defined(__NetBSD__)
#define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC)
#else
#define RESERVED_FLAGS 0
#endif
#ifdef USE_PTHREAD_JIT_WP
#define MEXTRA MAP_JIT
#define PEXTRA PROT_EXEC
#else
#define MEXTRA 0
#define PEXTRA 0
#endif
mem = mmap(NULL, bytes, PAGE_READWRITE | RESERVED_FLAGS | PEXTRA, MAP_ANONYMOUS | MAP_PRIVATE | MEXTRA, -1, 0);
if (mem == MAP_FAILED)
mem = NULL;
#if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \
&& MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0
if (__builtin_available(macOS 11.0, *)) {
pthread_jit_write_protect_np(0);
}
#endif
#endif
return mem;
}
static inline int pageProtect(void* ptr, size_t bytes, int rules, char **errfunc) {
#if defined(_WIN32) || defined(__CYGWIN__)
DWORD oldp;
if (!VirtualProtect(ptr, bytes, (DWORD)rules, &oldp)) {
Fail("VirtualProtect");
}
#else
if (-1 == mprotect(ptr, bytes, rules))
Fail("mprotect");
#endif
return 0;
}
void setPagesRW(void* ptr, size_t bytes) {
char *errfunc;
#if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \
&& MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0
if (__builtin_available(macOS 11.0, *)) {
pthread_jit_write_protect_np(0);
} else {
pageProtect(ptr, bytes, PAGE_READWRITE, &errfunc);
}
#else
pageProtect(ptr, bytes, PAGE_READWRITE, &errfunc);
#endif
}
void setPagesRX(void* ptr, size_t bytes) {
char *errfunc;
#if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \
&& MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0
if (__builtin_available(macOS 11.0, *)) {
pthread_jit_write_protect_np(1);
__builtin___clear_cache((char*)ptr, ((char*)ptr) + bytes);
} else {
pageProtect(ptr, bytes, PAGE_EXECUTE_READ, &errfunc);
}
#else
pageProtect(ptr, bytes, PAGE_EXECUTE_READ, &errfunc);
#endif
}
void setPagesRWX(void* ptr, size_t bytes) {
char *errfunc;
pageProtect(ptr, bytes, PAGE_EXECUTE_READWRITE, &errfunc);
}
void* allocLargePagesMemory(size_t bytes) {
void* mem;
char *errfunc;
#if defined(_WIN32) || defined(__CYGWIN__)
if (setPrivilege("SeLockMemoryPrivilege", 1, &errfunc))
return NULL;
size_t pageMinimum = GetLargePageMinimum();
if (!pageMinimum) {
errfunc = "No large pages";
return NULL;
}
mem = VirtualAlloc(NULL, alignSize(bytes, pageMinimum), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE);
#else
#ifdef __APPLE__
mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
#elif defined(__FreeBSD__)
mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0);
#elif defined(__OpenBSD__) || defined(__NetBSD__)
mem = MAP_FAILED; // OpenBSD does not support huge pages
#else
mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
#endif
if (mem == MAP_FAILED)
mem = NULL;
#endif
return mem;
}
void freePagedMemory(void* ptr, size_t bytes) {
#if defined(_WIN32) || defined(__CYGWIN__)
VirtualFree(ptr, 0, MEM_RELEASE);
#else
// some munmap implementations can crash on null pointer, despite what the manpage says
if (ptr) {
munmap(ptr, bytes);
}
#endif
}

48
crypto/randomx/virtual_memory.h Executable file
View file

@ -0,0 +1,48 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include <stddef.h>
#define alignSize(pos, align) (((pos - 1) / align + 1) * align)
void* allocMemoryPages(size_t);
void setPagesRW(void*, size_t);
void setPagesRX(void*, size_t);
void setPagesRWX(void*, size_t);
void* allocLargePagesMemory(size_t);
void freePagedMemory(void*, size_t);
#ifdef __cplusplus
}
#endif

80
crypto/randomx/vm_compiled.cpp Executable file
View file

@ -0,0 +1,80 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vm_compiled.hpp"
#include "common.hpp"
namespace randomx {
static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct randomx::MemoryRegisters");
static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct randomx::RegisterFile");
template<class Allocator, bool softAes, bool secureJit>
CompiledVm<Allocator, softAes, secureJit>::CompiledVm() {
if (!secureJit) {
compiler.enableAll(); //make JIT buffer both writable and executable
}
}
template<class Allocator, bool softAes, bool secureJit>
void CompiledVm<Allocator, softAes, secureJit>::setDataset(randomx_dataset* dataset) {
datasetPtr = dataset;
}
template<class Allocator, bool softAes, bool secureJit>
void CompiledVm<Allocator, softAes, secureJit>::run(void* seed) {
VmBase<Allocator, softAes>::generateProgram(seed);
randomx_vm::initialize();
if (secureJit) {
compiler.enableWriting();
}
compiler.generateProgram(program, config);
if (secureJit) {
compiler.enableExecution();
}
mem.memory = datasetPtr->memory + datasetOffset;
execute();
}
template<class Allocator, bool softAes, bool secureJit>
void CompiledVm<Allocator, softAes, secureJit>::execute() {
#if defined(__aarch64__) || defined(__riscv)
memcpy(reg.f, config.eMask, sizeof(config.eMask));
#endif
compiler.getProgramFunc()(reg, mem, scratchpad, RANDOMX_PROGRAM_ITERATIONS);
}
template class CompiledVm<AlignedAllocator<CacheLineSize>, false, false>;
template class CompiledVm<AlignedAllocator<CacheLineSize>, true, false>;
template class CompiledVm<LargePageAllocator, false, false>;
template class CompiledVm<LargePageAllocator, true, false>;
template class CompiledVm<AlignedAllocator<CacheLineSize>, false, true>;
template class CompiledVm<AlignedAllocator<CacheLineSize>, true, true>;
template class CompiledVm<LargePageAllocator, false, true>;
template class CompiledVm<LargePageAllocator, true, true>;
}

77
crypto/randomx/vm_compiled.hpp Executable file
View file

@ -0,0 +1,77 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <new>
#include <cstdint>
#include "virtual_machine.hpp"
#include "jit_compiler.hpp"
#include "allocator.hpp"
#include "dataset.hpp"
namespace randomx {
template<class Allocator, bool softAes, bool secureJit>
class CompiledVm : public VmBase<Allocator, softAes> {
public:
void* operator new(size_t size) {
void* ptr = AlignedAllocator<CacheLineSize>::allocMemory(size);
if (ptr == nullptr)
throw std::bad_alloc();
return ptr;
}
void operator delete(void* ptr) {
AlignedAllocator<CacheLineSize>::freeMemory(ptr, sizeof(CompiledVm));
}
CompiledVm();
void setDataset(randomx_dataset* dataset) override;
void run(void* seed) override;
using VmBase<Allocator, softAes>::mem;
using VmBase<Allocator, softAes>::program;
using VmBase<Allocator, softAes>::config;
using VmBase<Allocator, softAes>::reg;
using VmBase<Allocator, softAes>::scratchpad;
using VmBase<Allocator, softAes>::datasetPtr;
using VmBase<Allocator, softAes>::datasetOffset;
protected:
void execute();
JitCompiler compiler;
};
using CompiledVmDefault = CompiledVm<AlignedAllocator<CacheLineSize>, true, false>;
using CompiledVmHardAes = CompiledVm<AlignedAllocator<CacheLineSize>, false, false>;
using CompiledVmLargePage = CompiledVm<LargePageAllocator, true, false>;
using CompiledVmLargePageHardAes = CompiledVm<LargePageAllocator, false, false>;
using CompiledVmDefaultSecure = CompiledVm<AlignedAllocator<CacheLineSize>, true, true>;
using CompiledVmHardAesSecure = CompiledVm<AlignedAllocator<CacheLineSize>, false, true>;
using CompiledVmLargePageSecure = CompiledVm<LargePageAllocator, true, true>;
using CompiledVmLargePageHardAesSecure = CompiledVm<LargePageAllocator, false, true>;
}

View file

@ -0,0 +1,70 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vm_compiled_light.hpp"
#include "common.hpp"
#include <stdexcept>
namespace randomx {
template<class Allocator, bool softAes, bool secureJit>
void CompiledLightVm<Allocator, softAes, secureJit>::setCache(randomx_cache* cache) {
cachePtr = cache;
mem.memory = cache->memory;
if (secureJit) {
compiler.enableWriting();
}
compiler.generateSuperscalarHash(cache->programs, cache->reciprocalCache);
if (secureJit) {
compiler.enableExecution();
}
}
template<class Allocator, bool softAes, bool secureJit>
void CompiledLightVm<Allocator, softAes, secureJit>::run(void* seed) {
VmBase<Allocator, softAes>::generateProgram(seed);
randomx_vm::initialize();
if (secureJit) {
compiler.enableWriting();
}
compiler.generateProgramLight(program, config, datasetOffset);
if (secureJit) {
compiler.enableExecution();
}
CompiledVm<Allocator, softAes, secureJit>::execute();
}
template class CompiledLightVm<AlignedAllocator<CacheLineSize>, false, false>;
template class CompiledLightVm<AlignedAllocator<CacheLineSize>, true, false>;
template class CompiledLightVm<LargePageAllocator, false, false>;
template class CompiledLightVm<LargePageAllocator, true, false>;
template class CompiledLightVm<AlignedAllocator<CacheLineSize>, false, true>;
template class CompiledLightVm<AlignedAllocator<CacheLineSize>, true, true>;
template class CompiledLightVm<LargePageAllocator, false, true>;
template class CompiledLightVm<LargePageAllocator, true, true>;
}

View file

@ -0,0 +1,68 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <new>
#include "vm_compiled.hpp"
namespace randomx {
template<class Allocator, bool softAes, bool secureJit>
class CompiledLightVm : public CompiledVm<Allocator, softAes, secureJit> {
public:
void* operator new(size_t size) {
void* ptr = AlignedAllocator<CacheLineSize>::allocMemory(size);
if (ptr == nullptr)
throw std::bad_alloc();
return ptr;
}
void operator delete(void* ptr) {
AlignedAllocator<CacheLineSize>::freeMemory(ptr, sizeof(CompiledLightVm));
}
void setCache(randomx_cache* cache) override;
void setDataset(randomx_dataset* dataset) override { }
void run(void* seed) override;
using CompiledVm<Allocator, softAes, secureJit>::mem;
using CompiledVm<Allocator, softAes, secureJit>::compiler;
using CompiledVm<Allocator, softAes, secureJit>::program;
using CompiledVm<Allocator, softAes, secureJit>::config;
using CompiledVm<Allocator, softAes, secureJit>::cachePtr;
using CompiledVm<Allocator, softAes, secureJit>::datasetOffset;
};
using CompiledLightVmDefault = CompiledLightVm<AlignedAllocator<CacheLineSize>, true, false>;
using CompiledLightVmHardAes = CompiledLightVm<AlignedAllocator<CacheLineSize>, false, false>;
using CompiledLightVmLargePage = CompiledLightVm<LargePageAllocator, true, false>;
using CompiledLightVmLargePageHardAes = CompiledLightVm<LargePageAllocator, false, false>;
using CompiledLightVmDefaultSecure = CompiledLightVm<AlignedAllocator<CacheLineSize>, true, true>;
using CompiledLightVmHardAesSecure = CompiledLightVm<AlignedAllocator<CacheLineSize>, false, true>;
using CompiledLightVmLargePageSecure = CompiledLightVm<LargePageAllocator, true, true>;
using CompiledLightVmLargePageHardAesSecure = CompiledLightVm<LargePageAllocator, false, true>;
}

131
crypto/randomx/vm_interpreted.cpp Executable file
View file

@ -0,0 +1,131 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <iostream>
#include <iomanip>
#include <stdexcept>
#include <sstream>
#include <cmath>
#include <cfloat>
#include "vm_interpreted.hpp"
#include "dataset.hpp"
#include "intrin_portable.h"
#include "reciprocal.h"
namespace randomx {
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::setDataset(randomx_dataset* dataset) {
datasetPtr = dataset;
mem.memory = dataset->memory;
}
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::run(void* seed) {
VmBase<Allocator, softAes>::generateProgram(seed);
randomx_vm::initialize();
execute();
}
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::execute() {
NativeRegisterFile nreg;
for(unsigned i = 0; i < RegisterCountFlt; ++i)
nreg.a[i] = rx_load_vec_f128(&reg.a[i].lo);
compileProgram(program, bytecode, nreg);
uint32_t spAddr0 = mem.mx;
uint32_t spAddr1 = mem.ma;
for(unsigned ic = 0; ic < RANDOMX_PROGRAM_ITERATIONS; ++ic) {
uint64_t spMix = nreg.r[config.readReg0] ^ nreg.r[config.readReg1];
spAddr0 ^= spMix;
spAddr0 &= ScratchpadL3Mask64;
spAddr1 ^= spMix >> 32;
spAddr1 &= ScratchpadL3Mask64;
for (unsigned i = 0; i < RegistersCount; ++i)
nreg.r[i] ^= load64(scratchpad + spAddr0 + 8 * i);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
nreg.f[i] = rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * i);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
nreg.e[i] = maskRegisterExponentMantissa(config, rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i)));
executeBytecode(bytecode, scratchpad, config);
mem.mx ^= nreg.r[config.readReg2] ^ nreg.r[config.readReg3];
mem.mx &= CacheLineAlignMask;
datasetPrefetch(datasetOffset + mem.mx);
datasetRead(datasetOffset + mem.ma, nreg.r);
std::swap(mem.mx, mem.ma);
for (unsigned i = 0; i < RegistersCount; ++i)
store64(scratchpad + spAddr1 + 8 * i, nreg.r[i]);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
nreg.f[i] = rx_xor_vec_f128(nreg.f[i], nreg.e[i]);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), nreg.f[i]);
spAddr0 = 0;
spAddr1 = 0;
}
for (unsigned i = 0; i < RegistersCount; ++i)
store64(&reg.r[i], nreg.r[i]);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
rx_store_vec_f128(&reg.f[i].lo, nreg.f[i]);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
rx_store_vec_f128(&reg.e[i].lo, nreg.e[i]);
}
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::datasetRead(uint64_t address, int_reg_t(&r)[RegistersCount]) {
uint64_t* datasetLine = (uint64_t*)(mem.memory + address);
for (int i = 0; i < RegistersCount; ++i)
r[i] ^= datasetLine[i];
}
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::datasetPrefetch(uint64_t address) {
rx_prefetch_nta(mem.memory + address);
}
template class InterpretedVm<AlignedAllocator<CacheLineSize>, false>;
template class InterpretedVm<AlignedAllocator<CacheLineSize>, true>;
template class InterpretedVm<LargePageAllocator, false>;
template class InterpretedVm<LargePageAllocator, true>;
}

View file

@ -0,0 +1,75 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <new>
#include <vector>
#include "common.hpp"
#include "virtual_machine.hpp"
#include "bytecode_machine.hpp"
#include "intrin_portable.h"
#include "allocator.hpp"
namespace randomx {
template<class Allocator, bool softAes>
class InterpretedVm : public VmBase<Allocator, softAes>, public BytecodeMachine {
public:
using VmBase<Allocator, softAes>::mem;
using VmBase<Allocator, softAes>::scratchpad;
using VmBase<Allocator, softAes>::program;
using VmBase<Allocator, softAes>::config;
using VmBase<Allocator, softAes>::reg;
using VmBase<Allocator, softAes>::datasetPtr;
using VmBase<Allocator, softAes>::datasetOffset;
void* operator new(size_t size) {
void* ptr = AlignedAllocator<CacheLineSize>::allocMemory(size);
if (ptr == nullptr)
throw std::bad_alloc();
return ptr;
}
void operator delete(void* ptr) {
AlignedAllocator<CacheLineSize>::freeMemory(ptr, sizeof(InterpretedVm));
}
void run(void* seed) override;
void setDataset(randomx_dataset* dataset) override;
protected:
virtual void datasetRead(uint64_t blockNumber, int_reg_t(&r)[RegistersCount]);
virtual void datasetPrefetch(uint64_t blockNumber);
private:
void execute();
InstructionByteCode bytecode[RANDOMX_PROGRAM_SIZE];
};
using InterpretedVmDefault = InterpretedVm<AlignedAllocator<CacheLineSize>, true>;
using InterpretedVmHardAes = InterpretedVm<AlignedAllocator<CacheLineSize>, false>;
using InterpretedVmLargePage = InterpretedVm<LargePageAllocator, true>;
using InterpretedVmLargePageHardAes = InterpretedVm<LargePageAllocator, false>;
}

Some files were not shown because too many files have changed in this diff Show more