mirror of
https://git.wownero.com/wownero/wownero.git
synced 2025-01-05 16:28:52 +00:00
slow-hash: cache TLS references locally once at function start
This commit is contained in:
parent
5bbbe3902b
commit
7ac3334217
@ -274,10 +274,10 @@ static inline int use_v4_jit(void)
|
|||||||
#define VARIANT2_2() \
|
#define VARIANT2_2() \
|
||||||
do if (variant == 2 || variant == 3) \
|
do if (variant == 2 || variant == 3) \
|
||||||
{ \
|
{ \
|
||||||
*U64(hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \
|
*U64(local_hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \
|
||||||
*(U64(hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \
|
*(U64(local_hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \
|
||||||
hi ^= SWAP64LE(*U64(hp_state + (j ^ 0x20))); \
|
hi ^= SWAP64LE(*U64(local_hp_state + (j ^ 0x20))); \
|
||||||
lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \
|
lo ^= SWAP64LE(*(U64(local_hp_state + (j ^ 0x20)) + 1)); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define V4_REG_LOAD(dst, src) \
|
#define V4_REG_LOAD(dst, src) \
|
||||||
@ -405,7 +405,7 @@ static inline int use_v4_jit(void)
|
|||||||
|
|
||||||
#define pre_aes() \
|
#define pre_aes() \
|
||||||
j = state_index(a); \
|
j = state_index(a); \
|
||||||
_c = _mm_load_si128(R128(&hp_state[j])); \
|
_c = _mm_load_si128(R128(&local_hp_state[j])); \
|
||||||
_a = _mm_load_si128(R128(a)); \
|
_a = _mm_load_si128(R128(a)); \
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -418,20 +418,20 @@ static inline int use_v4_jit(void)
|
|||||||
* This code is based upon an optimized implementation by dga.
|
* This code is based upon an optimized implementation by dga.
|
||||||
*/
|
*/
|
||||||
#define post_aes() \
|
#define post_aes() \
|
||||||
VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \
|
VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \
|
||||||
_mm_store_si128(R128(c), _c); \
|
_mm_store_si128(R128(c), _c); \
|
||||||
_mm_store_si128(R128(&hp_state[j]), _mm_xor_si128(_b, _c)); \
|
_mm_store_si128(R128(&local_hp_state[j]), _mm_xor_si128(_b, _c)); \
|
||||||
VARIANT1_1(&hp_state[j]); \
|
VARIANT1_1(&local_hp_state[j]); \
|
||||||
j = state_index(c); \
|
j = state_index(c); \
|
||||||
p = U64(&hp_state[j]); \
|
p = U64(&local_hp_state[j]); \
|
||||||
b[0] = p[0]; b[1] = p[1]; \
|
b[0] = p[0]; b[1] = p[1]; \
|
||||||
VARIANT2_INTEGER_MATH_SSE2(b, c); \
|
VARIANT2_INTEGER_MATH_SSE2(b, c); \
|
||||||
VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
|
VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
|
||||||
__mul(); \
|
__mul(); \
|
||||||
VARIANT2_2(); \
|
VARIANT2_2(); \
|
||||||
VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \
|
VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \
|
||||||
a[0] += hi; a[1] += lo; \
|
a[0] += hi; a[1] += lo; \
|
||||||
p = U64(&hp_state[j]); \
|
p = U64(&local_hp_state[j]); \
|
||||||
p[0] = a[0]; p[1] = a[1]; \
|
p[0] = a[0]; p[1] = a[1]; \
|
||||||
a[0] ^= b[0]; a[1] ^= b[1]; \
|
a[0] ^= b[0]; a[1] ^= b[1]; \
|
||||||
VARIANT1_2(p + 1); \
|
VARIANT1_2(p + 1); \
|
||||||
@ -893,6 +893,10 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
if(hp_state == NULL)
|
if(hp_state == NULL)
|
||||||
slow_hash_allocate_state();
|
slow_hash_allocate_state();
|
||||||
|
|
||||||
|
// locals to avoid constant TLS dereferencing
|
||||||
|
uint8_t *local_hp_state = hp_state;
|
||||||
|
v4_random_math_JIT_func local_hp_jitfunc = hp_jitfunc;
|
||||||
|
|
||||||
/* CryptoNight Step 1: Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */
|
/* CryptoNight Step 1: Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */
|
||||||
if (prehashed) {
|
if (prehashed) {
|
||||||
memcpy(&state.hs, data, length);
|
memcpy(&state.hs, data, length);
|
||||||
@ -915,7 +919,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
||||||
{
|
{
|
||||||
aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
|
aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
|
||||||
memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
|
memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -927,7 +931,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
for(j = 0; j < INIT_SIZE_BLK; j++)
|
for(j = 0; j < INIT_SIZE_BLK; j++)
|
||||||
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
|
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
|
||||||
|
|
||||||
memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
|
memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -975,7 +979,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
||||||
{
|
{
|
||||||
// add the xor to the pseudo round
|
// add the xor to the pseudo round
|
||||||
aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
|
aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -985,7 +989,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
{
|
{
|
||||||
for(j = 0; j < INIT_SIZE_BLK; j++)
|
for(j = 0; j < INIT_SIZE_BLK; j++)
|
||||||
{
|
{
|
||||||
xor_blocks(&text[j * AES_BLOCK_SIZE], &hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
|
xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
|
||||||
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
|
aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1065,24 +1069,24 @@ union cn_slow_hash_state
|
|||||||
|
|
||||||
#define pre_aes() \
|
#define pre_aes() \
|
||||||
j = state_index(a); \
|
j = state_index(a); \
|
||||||
_c = vld1q_u8(&hp_state[j]); \
|
_c = vld1q_u8(&local_hp_state[j]); \
|
||||||
_a = vld1q_u8((const uint8_t *)a); \
|
_a = vld1q_u8((const uint8_t *)a); \
|
||||||
|
|
||||||
#define post_aes() \
|
#define post_aes() \
|
||||||
VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \
|
VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \
|
||||||
vst1q_u8((uint8_t *)c, _c); \
|
vst1q_u8((uint8_t *)c, _c); \
|
||||||
vst1q_u8(&hp_state[j], veorq_u8(_b, _c)); \
|
vst1q_u8(&local_hp_state[j], veorq_u8(_b, _c)); \
|
||||||
VARIANT1_1(&hp_state[j]); \
|
VARIANT1_1(&local_hp_state[j]); \
|
||||||
j = state_index(c); \
|
j = state_index(c); \
|
||||||
p = U64(&hp_state[j]); \
|
p = U64(&local_hp_state[j]); \
|
||||||
b[0] = p[0]; b[1] = p[1]; \
|
b[0] = p[0]; b[1] = p[1]; \
|
||||||
VARIANT2_PORTABLE_INTEGER_MATH(b, c); \
|
VARIANT2_PORTABLE_INTEGER_MATH(b, c); \
|
||||||
VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
|
VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
|
||||||
__mul(); \
|
__mul(); \
|
||||||
VARIANT2_2(); \
|
VARIANT2_2(); \
|
||||||
VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \
|
VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \
|
||||||
a[0] += hi; a[1] += lo; \
|
a[0] += hi; a[1] += lo; \
|
||||||
p = U64(&hp_state[j]); \
|
p = U64(&local_hp_state[j]); \
|
||||||
p[0] = a[0]; p[1] = a[1]; \
|
p[0] = a[0]; p[1] = a[1]; \
|
||||||
a[0] ^= b[0]; a[1] ^= b[1]; \
|
a[0] ^= b[0]; a[1] ^= b[1]; \
|
||||||
VARIANT1_2(p + 1); \
|
VARIANT1_2(p + 1); \
|
||||||
@ -1245,9 +1249,9 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
RDATA_ALIGN16 uint8_t expandedKey[240];
|
RDATA_ALIGN16 uint8_t expandedKey[240];
|
||||||
|
|
||||||
#ifndef FORCE_USE_HEAP
|
#ifndef FORCE_USE_HEAP
|
||||||
RDATA_ALIGN16 uint8_t hp_state[MEMORY];
|
RDATA_ALIGN16 uint8_t local_hp_state[MEMORY];
|
||||||
#else
|
#else
|
||||||
uint8_t *hp_state = (uint8_t *)aligned_malloc(MEMORY,16);
|
uint8_t *local_hp_state = (uint8_t *)aligned_malloc(MEMORY,16);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
uint8_t text[INIT_SIZE_BYTE];
|
uint8_t text[INIT_SIZE_BYTE];
|
||||||
@ -1287,7 +1291,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
||||||
{
|
{
|
||||||
aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
|
aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
|
||||||
memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
|
memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
|
||||||
}
|
}
|
||||||
|
|
||||||
U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0];
|
U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0];
|
||||||
@ -1322,7 +1326,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
||||||
{
|
{
|
||||||
// add the xor to the pseudo round
|
// add the xor to the pseudo round
|
||||||
aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
|
aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* CryptoNight Step 5: Apply Keccak to the state again, and then
|
/* CryptoNight Step 5: Apply Keccak to the state again, and then
|
||||||
@ -1337,7 +1341,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
|||||||
extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
|
extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
|
||||||
|
|
||||||
#ifdef FORCE_USE_HEAP
|
#ifdef FORCE_USE_HEAP
|
||||||
aligned_free(hp_state);
|
aligned_free(local_hp_state);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#else /* aarch64 && crypto */
|
#else /* aarch64 && crypto */
|
||||||
|
Loading…
Reference in New Issue
Block a user