/* See LICENSE file for copyright and license details. */
#define WARN_UNKNOWN_ENDIAN
#include "common.h"
#include <stdatomic.h>
#if defined(__x86_64__)
# include <immintrin.h>
#endif
struct threaded_fill_segments_params {
struct block *memory;
const uint_least64_t *sbox;
struct libar2_argon2_parameters *params;
uint_least32_t seglen;
uint_least32_t lanelen;
uint_least32_t blocks;
uint_least32_t pass;
uint_least32_t lane;
uint_least32_t slice;
};
static const struct libblake_blake2b_params b2params = {
.digest_len = 64,
.key_len = 0,
.fanout = 1,
.depth = 1,
.leaf_len = 0,
.node_offset = 0,
.node_depth = 0,
.inner_len = 0
};
static const SIMD_ALIGNED struct block zerob; /* implicitly zeroed via `static` */
#if defined(__x86_64__) && defined(LIBAR2_TARGET__)
/* Relative, approximate, execution times for blockxor:
* Over large memory Repeatedly over one block
* plain old xor: 1 1
* _mm_xor_si128: 1.055 1.031 (removed becaused it's slow)
* _mm256_xor_si256: 0.828 0.514
* _mm512_xor_si512: I don't have the means to test this
*
* No difference noted between having a function pointer
* to the plain-old-xor version and just implementing it
* with a regular function.
*
* Similar performance is observed for blockxor3 and blockcpy
*/
LIBAR2_TARGET__("avx2")
static void
blockxor_avx2(struct block *a_, const struct block *b_)
{
__m256i *a = (__m256i *)a_;
const __m256i *b = (const __m256i *)b_;
size_t i;
for (i = 0; i < sizeof(*a_) / (256 / 8); i++)
a[i] = _mm256_xor_si256(a[i], b[i]);
}
/* $covered{$ (removing from coverage test because my machine does not spport AVX512F) */
LIBAR2_TARGET__("avx512f")
static void
blockxor_avx512f(struct block *a_, const struct block *b_)
{
__m512i *a = (__m512i *)a_;
const __m512i *b = (const __m512i *)b_;
size_t i;
for (i = 0; i < sizeof(*a_) / (512 / 8); i++)
a[i] = _mm512_xor_si512(a[i], b[i]);
}
/* $covered}$ */
static void
blockxor_vanilla(struct block *a, const struct block *b)
{
size_t i;
for (i = 0; i < ELEMSOF(a->w); i++)
a->w[i] ^= b->w[i];
}
static void (*blockxor)(struct block *a, const struct block *b) = &blockxor_vanilla;
LIBAR2_TARGET__("avx2")
static void
blockxor3_avx2(struct block *a_, const struct block *b_, const struct block *c_)
{
__m256i *a = (__m256i *)a_;
const __m256i *b = (const __m256i *)b_;
const __m256i *c = (const __m256i *)c_;
size_t i;
for (i = 0; i < sizeof(*a_) / (256 / 8); i++)
a[i] = _mm256_xor_si256(b[i], c[i]);
}
/* $covered{$ (removing from coverage test because my machine does not spport AVX512F) */
LIBAR2_TARGET__("avx512f")
static void
blockxor3_avx512f(struct block *a_, const struct block *b_, const struct block *c_)
{
__m512i *a = (__m512i *)a_;
const __m512i *b = (const __m512i *)b_;
const __m512i *c = (const __m512i *)c_;
size_t i;
for (i = 0; i < sizeof(*a_) / (512 / 8); i++)
a[i] = _mm512_xor_si512(b[i], c[i]);
}
/* $covered}$ */
static void
blockxor3_vanilla(struct block *a, const struct block *b, const struct block *c)
{
size_t i;
for (i = 0; i < ELEMSOF(a->w); i++)
a->w[i] = b->w[i] ^ c->w[i];
}
#define DEFINED_BLOCKXOR3
static void (*blockxor3)(struct block *a, const struct block *b, const struct block *c) = &blockxor3_vanilla;
LIBAR2_TARGET__("avx2")
static void
blockcpy_avx2(struct block *a_, const struct block *b_)
{
__m256i *a = (__m256i *)a_;
const __m256i *b = (const __m256i *)b_;
size_t i;
for (i = 0; i < sizeof(*a_) / (256 / 8); i++)
a[i] = _mm256_load_si256(&b[i]);
}
/* $covered{$ (removing from coverage test because my machine does not spport AVX512F) */
LIBAR2_TARGET__("avx512f")
static void
blockcpy_avx512f(struct block *a_, const struct block *b_)
{
__m512i *a = (__m512i *)a_;
const __m512i *b = (const __m512i *)b_;
size_t i;
for (i = 0; i < sizeof(*a_) / (512 / 8); i++)
a[i] = _mm512_load_si512(&b[i]);
}
/* $covered}$ */
static void
blockcpy_vanilla(struct block *a, const struct block *b)
{
size_t i;
for (i = 0; i < ELEMSOF(a->w); i++)
a->w[i] = b->w[i];
}
#define DEFINED_BLOCKCPY
static void (*blockcpy)(struct block *a, const struct block *b) = &blockcpy_vanilla;
#else
static void
blockxor(struct block *a, const struct block *b)
{
size_t i;
for (i = 0; i < ELEMSOF(a->w); i++)
a->w[i] ^= b->w[i];
}
#endif
static size_t
store32(unsigned char *out, uint_least32_t value)
{
out[0] = (unsigned char)((value >> 0) & 255);
out[1] = (unsigned char)((value >> 8) & 255);
out[2] = (unsigned char)((value >> 16) & 255);
out[3] = (unsigned char)((value >> 24) & 255);
return 4;
}
#ifndef USING_LITTLE_ENDIAN
static void
store64(unsigned char *out, uint_least64_t value)
{
out[0] = (unsigned char)((value >> 0) & 255);
out[1] = (unsigned char)((value >> 8) & 255);
out[2] = (unsigned char)((value >> 16) & 255);
out[3] = (unsigned char)((value >> 24) & 255);
out[4] = (unsigned char)((value >> 32) & 255);
out[5] = (unsigned char)((value >> 40) & 255);
out[6] = (unsigned char)((value >> 48) & 255);
out[7] = (unsigned char)((value >> 56) & 255);
}
static void
load64(uint_least64_t *out, const unsigned char *data)
{
*out = ((uint_least64_t)(data[0] & 255) << 0)
| ((uint_least64_t)(data[1] & 255) << 8)
| ((uint_least64_t)(data[2] & 255) << 16)
| ((uint_least64_t)(data[3] & 255) << 24)
| ((uint_least64_t)(data[4] & 255) << 32)
| ((uint_least64_t)(data[5] & 255) << 40)
| ((uint_least64_t)(data[6] & 255) << 48)
| ((uint_least64_t)(data[7] & 255) << 56);
}
static void
store_block(unsigned char *block8, const struct block *block64)
{
size_t i, j;
for (i = 0, j = 0; i < 1024; i += 8, j += 1)
store64(&block8[i], block64->w[j]);
}
static void
load_block(struct block *block64, const unsigned char *block8)
{
size_t i, j;
for (i = 0, j = 0; i < 1024; i += 8, j += 1)
load64(&block64->w[j], &block8[i]);
}
#endif
static size_t
storemem(unsigned char *out, const void *mem, size_t len, size_t max)
{
size_t n = MIN(len, max);
memcpy(out, mem, n);
return n;
}
static uint_least64_t
rotr64(uint_least64_t x, int n)
{
return ((x >> n) | (x << (64 - n))) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
}
static uint_least64_t
fBlaMka(uint_least64_t x, uint_least64_t y)
{
return x + y + 2 * (x & UINT_LEAST64_C(0xFFffFFff)) * (y & UINT_LEAST64_C(0xFFffFFff));
}
static void
fill_block(struct block *block, const struct block *prevblock, const struct block *refblock,
int with_xor, const uint_least64_t *sbox)
{
uint_least64_t x = 0;
uint_least32_t x_hi, x_lo;
size_t i;
SIMD_ALIGNED struct block tmpblock;
#if defined(DEFINED_BLOCKXOR3) && defined(DEFINED_BLOCKCPY)
if (with_xor) {
blockxor3(&tmpblock, refblock, prevblock);
blockxor(block, &tmpblock);
} else {
blockxor3(&tmpblock, refblock, prevblock);
blockcpy(block, &tmpblock);
}
#else
if (with_xor) {
for (i = 0; i < ELEMSOF(refblock->w); i++)
block->w[i] ^= tmpblock.w[i] = refblock->w[i] ^ prevblock->w[i];
} else {
for (i = 0; i < ELEMSOF(refblock->w); i++)
block->w[i] = tmpblock.w[i] = refblock->w[i] ^ prevblock->w[i];
}
#endif
if (sbox) {
x = tmpblock.w[0] ^ tmpblock.w[ELEMSOF(tmpblock.w) - 1];
for (i = 0; i < 96; i++) {
x_hi = (uint_least32_t)(x >> 32);
x_lo = (uint_least32_t)x & UINT_LEAST32_C(0xFFFFffff);
x = (uint_least64_t)x_hi * (uint_least64_t)x_lo;
x += sbox[(x_hi & UINT_LEAST32_C(0x1FF)) + 0];
x ^= sbox[(x_lo & UINT_LEAST32_C(0x1FF)) + 512];
}
}
/* TODO optimise (also fBlaMka) once similar BLAKE2b code in libblake has been optimised { */
#define BLAMKA_G(A, B, C, D)\
A = fBlaMka(A, B);\
D = rotr64(D ^ A, 32);\
C = fBlaMka(C, D);\
B = rotr64(B ^ C, 24);\
A = fBlaMka(A, B);\
D = rotr64(D ^ A, 16);\
C = fBlaMka(C, D);\
B = rotr64(B ^ C, 63)
#define BLAMKA_ROUND(W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, WA, WB, WC, WD, WE, WF)\
BLAMKA_G(W0, W4, W8, WC);\
BLAMKA_G(W1, W5, W9, WD);\
BLAMKA_G(W2, W6, WA, WE);\
BLAMKA_G(W3, W7, WB, WF);\
BLAMKA_G(W0, W5, WA, WF);\
BLAMKA_G(W1, W6, WB, WC);\
BLAMKA_G(W2, W7, W8, WD);\
BLAMKA_G(W3, W4, W9, WE)
#define BLAMKA_ROUND_(ARR, OFF, W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, WA, WB, WC, WD, WE, WF)\
BLAMKA_ROUND(ARR[OFF + W0], ARR[OFF + W1], ARR[OFF + W2], ARR[OFF + W3],\
ARR[OFF + W4], ARR[OFF + W5], ARR[OFF + W6], ARR[OFF + W7],\
ARR[OFF + W8], ARR[OFF + W9], ARR[OFF + WA], ARR[OFF + WB],\
ARR[OFF + WC], ARR[OFF + WD], ARR[OFF + WE], ARR[OFF + WF])
for (i = 0; i < 8; i++) {
BLAMKA_ROUND_(tmpblock.w, i * 16,
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15);
}
for (i = 0; i < 8; i++) {
BLAMKA_ROUND_(tmpblock.w, i * 2,
0, 1, 16, 17,
32, 33, 48, 49,
64, 65, 80, 81,
96, 97, 112, 113);
}
/* } */
blockxor(block, &tmpblock);
block->w[0] += x;
block->w[ELEMSOF(block->w) - 1] += x;
block->w[0] &= UINT_LEAST64_C(0xFFFFffffFFFFffff);
block->w[ELEMSOF(block->w) - 1] &= UINT_LEAST64_C(0xFFFFffffFFFFffff);
}
static void
generate_sbox(uint_least64_t *sbox, struct block *memory)
{
void *next, *prev = memory;
size_t i;
for (i = 0; i < 8; i++) {
next = &sbox[i * 128];
fill_block(next, &zerob, prev, 0, NULL);
fill_block(next, &zerob, next, 0, NULL);
prev = next;
}
}
static void
next_address_block(struct block *addrb, struct block *inputb)
{
inputb->w[6] += 1;
fill_block(addrb, &zerob, inputb, 0, NULL);
fill_block(addrb, &zerob, addrb, 0, NULL);
}
static uint_least32_t
get_rindex(uint_least32_t seglen, uint_least32_t lanelen, uint_least32_t pass,
uint_least32_t slice, uint_least32_t index, uint_least64_t prand, int same_lane)
{
uint_least32_t size, startpos;
uint_least64_t relpos;
if (!pass) {
if (!slice)
size = index - 1;
else if (same_lane)
size = slice * seglen + index - 1;
else
size = slice * seglen - !index;
} else {
if (same_lane)
size = lanelen - seglen + index - 1;
else
size = lanelen - seglen - !index;
}
prand &= UINT_LEAST64_C(0xFFffFFff);
relpos = (prand * prand) >> 32;
relpos = ((uint_least64_t)size * relpos) >> 32;
relpos = (uint_least64_t)size - 1 - relpos;
startpos = pass ? slice == 3 ? 0 : (slice + 1) * seglen : 0;
return (startpos + (uint_least32_t)relpos) % lanelen;
}
static void
fill_segment(struct block *memory, const uint_least64_t *sbox, struct libar2_argon2_parameters *params,
uint_least32_t seglen, uint_least32_t lanelen, uint_least32_t blocks,
uint_least32_t pass, uint_least32_t lane, uint_least32_t slice)
{
int data_independent;
SIMD_ALIGNED struct block inputb, addrb;
uint_least32_t off, prevoff, rlane, rindex;
uint_least32_t index = 0, i;
uint_least64_t prand;
data_independent =
(params->type == LIBAR2_ARGON2I) ||
(params->type == LIBAR2_ARGON2ID && !pass && slice < 2);
if (data_independent) {
memset(&inputb.w[6], 0, sizeof(*inputb.w) * (ELEMSOF(inputb.w) - 6));
inputb.w[0] = pass;
inputb.w[1] = lane;
inputb.w[2] = slice;
inputb.w[3] = blocks;
inputb.w[4] = params->t_cost;
inputb.w[5] = (uint_least32_t)params->type;
if (!pass && !slice) {
next_address_block(&addrb, &inputb);
index = 2;
}
} else if (!pass && !slice) {
index = 2;
}
off = lane * lanelen + slice * seglen + index;
prevoff = off - 1 + (off % lanelen ? 0 : lanelen);
for (; index < seglen; index++, off++, prevoff++) {
if (off % lanelen == 1)
prevoff = off - 1;
if (data_independent) {
i = index % ELEMSOF(addrb.w);
if (!i)
next_address_block(&addrb, &inputb);
prand = addrb.w[i];
} else {
prand = memory[prevoff].w[0];
}
rlane = (!pass && !slice) ? lane : (uint_least32_t)(prand >> 32) % params->lanes;
rindex = get_rindex(seglen, lanelen, pass, slice, index, prand, rlane == lane);
fill_block(&memory[off], &memory[prevoff], &memory[rlane * lanelen + rindex],
params->version > LIBAR2_ARGON2_VERSION_10 && pass, sbox);
}
}
static void
threaded_fill_segment(void *data)
{
struct threaded_fill_segments_params *tparams = data;
fill_segment(tparams->memory, tparams->sbox, tparams->params,
tparams->seglen, tparams->lanelen, tparams->blocks,
tparams->pass, tparams->lane, tparams->slice);
}
static void
initial_hash(unsigned char hash[static 64], void *msg, size_t msglen,
struct libar2_argon2_parameters *params, struct libar2_context *ctx)
{
#define SEGMENT(DATA, LEN, OFF) &((const unsigned char *)(DATA))[(OFF)], (LEN) - (OFF)
struct libblake_blake2b_state state;
unsigned char block[128 + 3];
size_t n = 0, off;
libblake_blake2b_init(&state, &b2params);
n += store32(&block[n], params->lanes);
n += store32(&block[n], (uint_least32_t)params->hashlen);
n += store32(&block[n], params->m_cost);
n += store32(&block[n], params->t_cost);
n += store32(&block[n], (uint_least32_t)(params->version ? params->version : LIBAR2_ARGON2_VERSION_10));
n += store32(&block[n], (uint_least32_t)params->type);
n += store32(&block[n], (uint_least32_t)msglen);
if (msglen) {
n += off = storemem(&block[n], msg, msglen, 128 - n);
if (n == 128) {
libblake_blake2b_force_update(&state, block, n);
n = 0;
if (off < msglen) {
off += libblake_blake2b_force_update(&state, SEGMENT(msg, msglen, off));
memcpy(block, SEGMENT(msg, msglen, off));
n = msglen - off;
}
}
if (ctx->autoerase_message)
libar2_erase(msg, msglen);
}
n += store32(&block[n], (uint_least32_t)params->saltlen);
if (n >= 128) {
n -= libblake_blake2b_force_update(&state, block, n);
memcpy(block, &block[128], n); /* overlap is impossible */
}
if (params->saltlen) {
if (!n)
off = 0;
else
n += off = storemem(&block[n], params->salt, params->saltlen, 128 - n);
if (n == 128) {
libblake_blake2b_force_update(&state, block, n);
n = 0;
}
if (n == 0 && off < params->saltlen) {
off += libblake_blake2b_force_update(&state, SEGMENT(params->salt, params->saltlen, off));
memcpy(block, SEGMENT(params->salt, params->saltlen, off));
n = params->saltlen - off;
}
if (ctx->autoerase_salt)
libar2_erase(params->salt, params->saltlen);
}
n += store32(&block[n], (uint_least32_t)params->keylen);
if (n >= 128) {
n -= libblake_blake2b_force_update(&state, block, n);
memcpy(block, &block[128], n); /* overlap is impossible */
}
if (params->keylen) {
if (!n)
off = 0;
else
n += off = storemem(&block[n], params->key, params->keylen, 128 - n);
if (n == 128) {
libblake_blake2b_force_update(&state, block, n);
n = 0;
}
if (n == 0 && off < params->keylen) {
off += libblake_blake2b_force_update(&state, SEGMENT(params->key, params->keylen, off));
memcpy(block, SEGMENT(params->key, params->keylen, off));
n = params->keylen - off;
}
if (ctx->autoerase_secret)
libar2_erase(params->key, params->keylen);
}
n += store32(&block[n], (uint_least32_t)params->adlen);
if (n > 128 || (n == 128 && params->adlen)) {
n -= libblake_blake2b_force_update(&state, block, n);
memcpy(block, &block[128], n); /* overlap is impossible */
}
if (params->adlen) {
if (!n)
off = 0;
else
n += off = storemem(&block[n], params->ad, params->adlen, 128 - n);
if (off < params->adlen) {
if (n == 128) {
libblake_blake2b_force_update(&state, block, n);
n = 0;
}
if (n == 0) {
off += libblake_blake2b_update(&state, SEGMENT(params->ad, params->adlen, off));
if (params->adlen - off > 128) {
/* $covered{$ (not really possible, but just to be safe) */
off += libblake_blake2b_force_update(&state, SEGMENT(params->ad, params->adlen, off));
/* $covered}$ */
}
memcpy(block, SEGMENT(params->ad, params->adlen, off));
n = params->adlen - off;
}
}
if (ctx->autoerase_associated_data)
libar2_erase(params->ad, params->adlen);
}
libblake_blake2b_digest(&state, block, n, 0, 64, hash);
ERASE_ARRAY(block);
ERASE_STRUCT(state);
#undef SEGMENT
}
static void /* this is not BLAKE2Xb, but something Argon2-specific */
argon2_blake2b_exthash(void *hash_, size_t hashlen, void *msg_, size_t msglen)
{
struct libblake_blake2b_params params;
struct libblake_blake2b_state state;
unsigned char *msg = msg_;
unsigned char block[128];
unsigned char *hash = hash_;
size_t n, off;
params = b2params;
params.digest_len = (uint_least8_t)MIN(hashlen, (size_t)params.digest_len);
libblake_blake2b_init(&state, ¶ms);
n = store32(block, (uint_least32_t)hashlen);
n += off = storemem(&block[n], msg, msglen, 128 - n);
if (off == msglen) {
libblake_blake2b_digest(&state, block, n, 0, params.digest_len, hash);
} else {
libblake_blake2b_force_update(&state, block, 128);
libblake_blake2b_digest(&state, &msg[off], msglen - off, 0, params.digest_len, hash);
}
if (hashlen > 64) {
hashlen -= 32;
params.digest_len = 64;
while (hashlen > 64) {
libblake_blake2b_init(&state, ¶ms);
libblake_blake2b_digest(&state, hash, 64, 0, 64, &hash[32]);
hash += 32;
hashlen -= 32;
}
params.digest_len = (uint_least8_t)hashlen;
libblake_blake2b_init(&state, ¶ms);
libblake_blake2b_digest(&state, hash, 64, 0, hashlen, &hash[32]);
}
ERASE_STRUCT(state);
ERASE_ARRAY(block);
}
#if defined(__x86_64__) && defined(LIBAR2_TARGET__)
void
libar2_internal_use_generic__(void)
{
blockxor = &blockxor_vanilla;
blockxor3 = &blockxor3_vanilla;
blockcpy = &blockcpy_vanilla;
}
void
libar2_internal_use_sse2__(void)
{
libar2_internal_use_generic__();
}
void
libar2_internal_use_avx2__(void)
{
blockxor = &blockxor_avx2;
blockxor3 = &blockxor3_avx2;
blockcpy = &blockcpy_avx2;
}
void
libar2_internal_use_avx512f__(void)
{
blockxor = &blockxor_avx512f;
blockxor3 = &blockxor3_avx512f;
blockcpy = &blockcpy_avx512f;
}
#endif
LIBAR2_INITIALISER__ /* ignored if statically linked, so this function shall
* by the application, we just use the constructor (init)
* attribute in case that is forgotten, as it will only
* improve performance, but the library with function
* perfectly fine even if it's not called */
void
libar2_init(void)
{
#if defined(__x86_64__) && defined(LIBAR2_TARGET__)
static volatile int initialised = 0;
static volatile atomic_flag spinlock = ATOMIC_FLAG_INIT;
if (initialised)
return;
while (atomic_flag_test_and_set(&spinlock));
if (!initialised) {
# if 0
__builtin_cpu_init();
/* $covered{$ (we know that it works, but the test cannot enter every branch) */
if (__builtin_cpu_supports("avx512f"))
libar2_internal_use_avx512f__();
else if (__builtin_cpu_supports("avx2"))
libar2_internal_use_avx2__();
else if (__builtin_cpu_supports("sse2"))
libar2_internal_use_sse2__();
else
libar2_internal_use_generic__();
/* $covered}$ */
# else
uint32_t x;
__asm__ volatile("cpuid" : "=b"(x) : "a"(7), "c"(0) : "edx");
/* $covered{$ (we know that it works, but the test cannot enter every branch) */
if (x & ((uint32_t)1 << 16)) {
libar2_internal_use_avx512f__();
} else if (x & ((uint32_t)1 << 5)) {
libar2_internal_use_avx2__();
} else {
__asm__ volatile("cpuid" : "=d"(x) : "a"(1) : "ebx", "ecx");
if (x & ((uint32_t)1 << 26))
libar2_internal_use_sse2__();
else
libar2_internal_use_generic__();
}
/* $covered}$ */
# endif
initialised = 1;
}
atomic_flag_clear(&spinlock);
#endif
}
int
libar2_hash(void *hash, void *msg, size_t msglen, struct libar2_argon2_parameters *params, struct libar2_context *ctx)
{
#ifndef USING_LITTLE_ENDIAN
unsigned char block[1024 + 128];
#endif
_Alignas(4) unsigned char hash0[256];
uint_least32_t blocks, seglen, lanelen;
struct block *memory;
size_t i, p, s, nthreads, ts[16], ti, tn, bufsize;
struct threaded_fill_segments_params *tparams = NULL;
uint_least64_t *sbox = NULL; /* This is 8K large (assuming support for uint64_t), so we allocate it dynamically */
size_t alignment;
libar2_init();
if (libar2_validate_params(params, NULL) || msglen >> 31 > 1) {
errno = EINVAL;
return -1;
}
blocks = MAX(params->m_cost, 8 * params->lanes); /* 8 * params->lanes <= 0x07FFfff8 */
seglen = blocks / (4 * params->lanes);
blocks -= blocks % (4 * params->lanes);
lanelen = seglen * 4;
alignment = MAX(MAX(ALIGNOF(struct block), CACHE_LINE_SIZE), MAX_SIMD_ALIGNMENT);
#ifdef USING_LITTLE_ENDIAN
/* We are allocating one extra block, this gives use 1024 extra bytes,
* but we only need 128, to ensure that `argon2_blake2b_exthash` does
* not write on unallocated memory. Preferable we would just request
* 128 bytes bytes, but this would require an undesirable API/ABI
* change. */
memory = ctx->allocate(blocks + 1, sizeof(struct block), alignment, ctx);
#else
memory = ctx->allocate(blocks, sizeof(struct block), alignment, ctx);
#endif
if (!memory)
return -1;
if (params->type == LIBAR2_ARGON2DS) {
alignment = MAX(ALIGNOF(uint_least64_t), MAX_SIMD_ALIGNMENT);
sbox = ctx->allocate(1024, sizeof(*sbox), alignment, ctx);
if (!sbox) {
ctx->deallocate(memory, ctx);
return -1;
}
}
initial_hash(hash0, msg, msglen, params, ctx);
for (i = 0; i < params->lanes; i++) { /* direction is important for little-endian optimisation */
store32(&hash0[64], 0);
store32(&hash0[68], (uint_least32_t)i);
#ifdef USING_LITTLE_ENDIAN
argon2_blake2b_exthash(&memory[i * lanelen + 0], 1024, hash0, 72);
#else
argon2_blake2b_exthash(block, 1024, hash0, 72);
load_block(&memory[i * lanelen + 0], block);
#endif
store32(&hash0[64], 1);
#ifdef USING_LITTLE_ENDIAN
argon2_blake2b_exthash(&memory[i * lanelen + 1], 1024, hash0, 72);
#else
argon2_blake2b_exthash(block, 1024, hash0, 72);
load_block(&memory[i * lanelen + 1], block);
#endif
}
ERASE_ARRAY(hash0);
if (ctx->init_thread_pool(params->lanes, &nthreads, ctx))
goto fail;
if (nthreads == 1) {
nthreads = 0;
if (ctx->destroy_thread_pool(ctx))
goto fail;
}
if (!nthreads) {
for (p = 0; p < params->t_cost; p++) {
if (sbox)
generate_sbox(sbox, memory);
for (s = 0; s < 4; s++) {
for (i = 0; i < params->lanes; i++) {
fill_segment(memory, sbox, params, seglen, lanelen, blocks,
(uint_least32_t)p, (uint_least32_t)i, (uint_least32_t)s);
}
}
}
} else {
tparams = ctx->allocate(nthreads, sizeof(*tparams), ALIGNOF(struct threaded_fill_segments_params), ctx);
if (!tparams) {
ctx->destroy_thread_pool(ctx);
goto fail;
}
for (i = 0; i < nthreads; i++) {
tparams[i].memory = memory;
tparams[i].sbox = sbox;
tparams[i].params = params;
tparams[i].seglen = seglen;
tparams[i].lanelen = lanelen;
tparams[i].blocks = blocks;
}
for (p = 0; p < params->t_cost; p++) {
if (sbox)
generate_sbox(sbox, memory);
for (s = 0; s < 4; s++) {
ti = tn = 0;
for (i = 0; i < params->lanes; i++) {
if (ti == tn) {
tn = ctx->get_ready_threads(ts, ELEMSOF(ts), ctx);
if (!tn)
goto fail;
ti = 0;
}
tparams[ts[ti]].pass = (uint_least32_t)p;
tparams[ts[ti]].lane = (uint_least32_t)i;
tparams[ts[ti]].slice = (uint_least32_t)s;
if (ctx->run_thread(ts[ti], threaded_fill_segment, &tparams[ts[ti]], ctx))
goto fail;
ti++;
}
if (ctx->join_thread_pool(ctx))
goto fail;
}
}
if (ctx->destroy_thread_pool(ctx))
goto fail;
ctx->deallocate(tparams, ctx);
tparams = NULL;
}
for (i = 1; i < params->lanes; i++)
blockxor(&memory[lanelen - 1], &memory[i * lanelen + lanelen - 1]);
#ifdef USING_LITTLE_ENDIAN
argon2_blake2b_exthash(hash, params->hashlen, &memory[lanelen - 1], 1024);
#else
store_block(block, &memory[lanelen - 1]);
argon2_blake2b_exthash(hash, params->hashlen, block, 1024);
#endif
bufsize = libar2_hash_buf_size(params);
if (bufsize) /* should never be 0 as that would indicate the user provided a too small buffer */
libar2_erase(&((char *)hash)[params->hashlen], bufsize - params->hashlen);
#ifndef USING_LITTLE_ENDIAN
ERASE_ARRAY(block);
#endif
if (sbox)
ctx->deallocate(sbox, ctx);
ctx->deallocate(memory, ctx);
return 0;
fail:
if (tparams)
ctx->deallocate(tparams, ctx);
if (sbox)
ctx->deallocate(sbox, ctx);
ctx->deallocate(memory, ctx);
return -1;
}