aboutsummaryrefslogtreecommitdiffstats
path: root/libar2_hash.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--libar2_hash.c612
1 files changed, 612 insertions, 0 deletions
diff --git a/libar2_hash.c b/libar2_hash.c
new file mode 100644
index 0000000..0eb9211
--- /dev/null
+++ b/libar2_hash.c
@@ -0,0 +1,612 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+
+
+struct threaded_fill_segments_params {
+ struct block *memory;
+ const uint_least64_t *sbox;
+ struct libar2_argon2_parameters *params;
+ uint_least32_t seglen;
+ uint_least32_t lanelen;
+ uint_least32_t blocks;
+ uint_least32_t pass;
+ uint_least32_t lane;
+ uint_least32_t slice;
+};
+
+
+static const struct libblake_blake2b_params b2params = {
+ .digest_len = 64,
+ .key_len = 0,
+ .fanout = 1,
+ .depth = 1,
+ .leaf_len = 0,
+ .node_offset = 0,
+ .node_depth = 0,
+ .inner_len = 0
+};
+
+
+static const struct block zerob; /* implicitly zeroed via `static` */
+
+
+static void
+memxor(void *a_, const void *b_, size_t n)
+{
+ unsigned char *a = a_;
+ const unsigned char *b = b_;
+ size_t i;
+ for (i = 0; i < n; i++)
+ a[i] ^= b[i];
+}
+
+
+static size_t
+store32(unsigned char *out, uint_least32_t value)
+{
+ out[0] = (unsigned char)((value >> 0) & 255);
+ out[1] = (unsigned char)((value >> 8) & 255);
+ out[2] = (unsigned char)((value >> 16) & 255);
+ out[3] = (unsigned char)((value >> 24) & 255);
+ return 4;
+}
+
+
+static void
+store64(unsigned char *out, uint_least64_t value)
+{
+ out[0] = (unsigned char)((value >> 0) & 255);
+ out[1] = (unsigned char)((value >> 8) & 255);
+ out[2] = (unsigned char)((value >> 16) & 255);
+ out[3] = (unsigned char)((value >> 24) & 255);
+ out[4] = (unsigned char)((value >> 32) & 255);
+ out[5] = (unsigned char)((value >> 40) & 255);
+ out[6] = (unsigned char)((value >> 48) & 255);
+ out[7] = (unsigned char)((value >> 56) & 255);
+}
+
+
+static void
+load64(uint_least64_t *out, const unsigned char *data)
+{
+ *out = ((uint_least64_t)(data[0] & 255) << 0)
+ | ((uint_least64_t)(data[1] & 255) << 8)
+ | ((uint_least64_t)(data[2] & 255) << 16)
+ | ((uint_least64_t)(data[3] & 255) << 24)
+ | ((uint_least64_t)(data[4] & 255) << 32)
+ | ((uint_least64_t)(data[5] & 255) << 40)
+ | ((uint_least64_t)(data[6] & 255) << 48)
+ | ((uint_least64_t)(data[7] & 255) << 56);
+}
+
+
+static void
+store_block(unsigned char *block8, const struct block *block64)
+{
+ size_t i, j;
+ for (i = 0, j = 0; i < 1024; i += 8, j += 1)
+ store64(&block8[i], block64->w[j]);
+}
+
+
+static void
+load_block(struct block *block64, const unsigned char *block8)
+{
+ size_t i, j;
+ for (i = 0, j = 0; i < 1024; i += 8, j += 1)
+ load64(&block64->w[j], &block8[i]);
+}
+
+
+static size_t
+storemem(unsigned char *out, const void *mem, size_t len, size_t max)
+{
+ size_t n = MIN(len, max);
+ memcpy(out, mem, n);
+ return n;
+}
+
+
+static uint_least64_t
+rotr64(uint_least64_t x, int n)
+{
+ return ((x >> n) | (x << (64 - n))) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
+}
+
+
+static uint_least64_t
+fBlaMka(uint_least64_t x, uint_least64_t y)
+{
+ return x + y + 2 * (x & UINT_LEAST64_C(0xFFffFFff)) * (y & UINT_LEAST64_C(0xFFffFFff));
+}
+
+
+static void
+fill_block(struct block *block, const struct block *prevblock, const struct block *refblock,
+ int with_xor, const uint_least64_t *sbox)
+{
+ uint_least64_t x = 0;
+ uint_least32_t x_hi, x_lo;
+ struct block tmpblock;
+ size_t i;
+
+ if (with_xor) {
+ for (i = 0; i < ELEMSOF(refblock->w); i++)
+ block->w[i] ^= tmpblock.w[i] = refblock->w[i] ^ prevblock->w[i];
+ } else {
+ for (i = 0; i < ELEMSOF(refblock->w); i++)
+ block->w[i] = tmpblock.w[i] = refblock->w[i] ^ prevblock->w[i];
+ }
+
+ if (sbox) {
+ x = tmpblock.w[0] ^ tmpblock.w[ELEMSOF(tmpblock.w) - 1];
+ for (i = 0; i < 96; i++) {
+ x_hi = (uint_least32_t)(x >> 32);
+ x_lo = (uint_least32_t)x & UINT_LEAST32_C(0xFFFFffff);
+ x = (uint_least64_t)x_hi * (uint_least64_t)x_lo;
+ x += sbox[(x_hi & UINT_LEAST32_C(0x1FF)) + 0];
+ x ^= sbox[(x_lo & UINT_LEAST32_C(0x1FF)) + 512];
+ }
+ }
+
+#define BLAMKA_G(A, B, C, D)\
+ A = fBlaMka(A, B);\
+ D = rotr64(D ^ A, 32);\
+ C = fBlaMka(C, D);\
+ B = rotr64(B ^ C, 24);\
+ A = fBlaMka(A, B);\
+ D = rotr64(D ^ A, 16);\
+ C = fBlaMka(C, D);\
+ B = rotr64(B ^ C, 63)
+
+#define BLAMKA_ROUND(W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, WA, WB, WC, WD, WE, WF)\
+ BLAMKA_G(W0, W4, W8, WC);\
+ BLAMKA_G(W1, W5, W9, WD);\
+ BLAMKA_G(W2, W6, WA, WE);\
+ BLAMKA_G(W3, W7, WB, WF);\
+ BLAMKA_G(W0, W5, WA, WF);\
+ BLAMKA_G(W1, W6, WB, WC);\
+ BLAMKA_G(W2, W7, W8, WD);\
+ BLAMKA_G(W3, W4, W9, WE)
+
+#define BLAMKA_ROUND_(ARR, OFF, W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, WA, WB, WC, WD, WE, WF)\
+ BLAMKA_ROUND(ARR[OFF + W0], ARR[OFF + W1], ARR[OFF + W2], ARR[OFF + W3],\
+ ARR[OFF + W4], ARR[OFF + W5], ARR[OFF + W6], ARR[OFF + W7],\
+ ARR[OFF + W8], ARR[OFF + W9], ARR[OFF + WA], ARR[OFF + WB],\
+ ARR[OFF + WC], ARR[OFF + WD], ARR[OFF + WE], ARR[OFF + WF])
+
+ for (i = 0; i < 8; i++) {
+ BLAMKA_ROUND_(tmpblock.w, i * 16,
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15);
+ }
+ for (i = 0; i < 8; i++) {
+ BLAMKA_ROUND_(tmpblock.w, i * 2,
+ 0, 1, 16, 17,
+ 32, 33, 48, 49,
+ 64, 65, 80, 81,
+ 96, 97, 112, 113);
+ }
+
+ for (i = 0; i < ELEMSOF(refblock->w); i++)
+ block->w[i] ^= tmpblock.w[i];
+
+ block->w[0] += x;
+ block->w[ELEMSOF(block->w) - 1] += x;
+ block->w[0] &= UINT_LEAST64_C(0xFFFFffffFFFFffff);
+ block->w[ELEMSOF(block->w) - 1] &= UINT_LEAST64_C(0xFFFFffffFFFFffff);
+}
+
+
+static void
+generate_sbox(uint_least64_t *sbox, struct block *memory)
+{
+ void *next, *prev = memory;
+ size_t i;
+
+ for (i = 0; i < 8; i++) {
+ next = &sbox[i * 128];
+ fill_block(next, &zerob, prev, 0, NULL);
+ fill_block(next, &zerob, next, 0, NULL);
+ prev = next;
+ }
+}
+
+
+static void
+next_address_block(struct block *addrb, struct block *inputb)
+{
+ inputb->w[6] += 1;
+ fill_block(addrb, &zerob, inputb, 0, NULL);
+ fill_block(addrb, &zerob, addrb, 0, NULL);
+}
+
+
+static uint_least32_t
+get_rindex(uint_least32_t seglen, uint_least32_t lanelen, uint_least32_t pass,
+ uint_least32_t slice, uint_least32_t index, uint_least64_t prand, int same_lane)
+{
+ uint_least32_t size, startpos;
+ uint_least64_t relpos;
+
+ if (!pass) {
+ if (!slice)
+ size = index - 1;
+ else if (same_lane)
+ size = slice * seglen + index - 1;
+ else
+ size = slice * seglen - !index;
+ } else {
+ if (same_lane)
+ size = lanelen - seglen + index - 1;
+ else
+ size = lanelen - seglen - !index;
+ }
+
+ prand &= UINT_LEAST64_C(0xFFffFFff);
+ relpos = (prand * prand) >> 32;
+ relpos = ((uint_least64_t)size * relpos) >> 32;
+ relpos = (uint_least64_t)size - 1 - relpos;
+
+ startpos = pass ? slice == 3 ? 0 : (slice + 1) * seglen : 0;
+
+ return (startpos + (uint_least32_t)relpos) % lanelen;
+}
+
+
+static void
+fill_segment(struct block *memory, const uint_least64_t *sbox, struct libar2_argon2_parameters *params,
+ uint_least32_t seglen, uint_least32_t lanelen, uint_least32_t blocks,
+ uint_least32_t pass, uint_least32_t lane, uint_least32_t slice)
+{
+ int data_independent;
+ struct block inputb, addrb;
+ uint_least32_t off, prevoff, rlane, rindex;
+ uint_least32_t index = 0, i;
+ uint_least64_t prand;
+
+ data_independent =
+ (params->type == LIBAR2_ARGON2I) ||
+ (params->type == LIBAR2_ARGON2ID && !pass && slice < 2);
+
+ if (data_independent) {
+ memset(&inputb.w[6], 0, sizeof(*inputb.w) * (ELEMSOF(inputb.w) - 6));
+ inputb.w[0] = pass;
+ inputb.w[1] = lane;
+ inputb.w[2] = slice;
+ inputb.w[3] = blocks;
+ inputb.w[4] = params->t_cost;
+ inputb.w[5] = (uint_least32_t)params->type;
+ }
+
+ if (!pass && !slice) {
+ if (data_independent) {
+ next_address_block(&addrb, &inputb);
+ }
+ index = 2;
+ }
+
+ off = lane * lanelen + slice * seglen + index;
+ prevoff = off - 1 + (off % lanelen ? 0 : lanelen);
+
+ for (; index < seglen; index++, off++, prevoff++) {
+ if (off % lanelen == 1)
+ prevoff = off - 1;
+ if (data_independent) {
+ i = index % ELEMSOF(addrb.w);
+ if (!i) {
+ next_address_block(&addrb, &inputb);
+ }
+ prand = addrb.w[i];
+ } else {
+ prand = memory[prevoff].w[0];
+ }
+
+ rlane = (!pass && !slice) ? lane : (uint_least32_t)(prand >> 32) % params->lanes;
+ rindex = get_rindex(seglen, lanelen, pass, slice, index, prand, rlane == lane);
+
+ fill_block(&memory[off], &memory[prevoff], &memory[rlane * lanelen + rindex],
+ params->version > LIBAR2_ARGON2_VERSION_10 && pass, sbox);
+ }
+}
+
+
+static void
+threaded_fill_segment(void *data)
+{
+ struct threaded_fill_segments_params *tparams = data;
+ fill_segment(tparams->memory, tparams->sbox, tparams->params,
+ tparams->seglen, tparams->lanelen, tparams->blocks,
+ tparams->pass, tparams->lane, tparams->slice);
+}
+
+
+static void
+initial_hash(unsigned char hash[static 64], void *msg, size_t msglen,
+ struct libar2_argon2_parameters *params, struct libar2_context *ctx)
+{
+#define SEGMENT(DATA, LEN, OFF) &((const unsigned char *)(DATA))[(OFF)], (LEN) - (OFF)
+
+ struct libblake_blake2b_state state;
+ unsigned char block[128 + 3];
+ size_t n = 0, off;
+
+ libblake_blake2b_init(&state, &b2params, NULL);
+
+ n += store32(&block[n], params->lanes);
+ n += store32(&block[n], (uint_least32_t)params->hashlen);
+ n += store32(&block[n], params->m_cost);
+ n += store32(&block[n], params->t_cost);
+ n += store32(&block[n], (uint_least32_t)params->version);
+ n += store32(&block[n], (uint_least32_t)params->type);
+ n += store32(&block[n], (uint_least32_t)msglen);
+ if (msglen) {
+ n += off = storemem(&block[n], msg, msglen, 128 - n);
+ if (n == 128) {
+ libblake_blake2b_force_update(&state, block, n);
+ n = 0;
+ if (off < msglen) {
+ off += libblake_blake2b_force_update(&state, SEGMENT(msg, msglen, off));
+ memcpy(block, SEGMENT(msg, msglen, off));
+ n = msglen - off;
+ }
+ }
+ if (ctx->autoerase_message)
+ ERASE(msg, msglen);
+ }
+
+ n += store32(&block[n], (uint_least32_t)params->saltlen);
+ if (n >= 128) {
+ n -= libblake_blake2b_force_update(&state, block, n);
+ memcpy(block, &block[128], n); /* overlap is impossible */
+ }
+ if (params->saltlen) {
+ if (!n)
+ off = 0;
+ else
+ n += off = storemem(&block[n], params->salt, params->saltlen, 128 - n);
+ if (n == 128) {
+ libblake_blake2b_force_update(&state, block, n);
+ n = 0;
+ }
+ if (n == 0 && off < params->saltlen) {
+ off += libblake_blake2b_force_update(&state, SEGMENT(params->salt, params->saltlen, off));
+ memcpy(block, SEGMENT(params->salt, params->saltlen, off));
+ n = params->saltlen - off;
+ }
+ if (ctx->autoerase_salt)
+ ERASE(params->salt, params->saltlen);
+ }
+
+ n += store32(&block[n], (uint_least32_t)params->keylen);
+ if (n >= 128) {
+ n -= libblake_blake2b_force_update(&state, block, n);
+ memcpy(block, &block[128], n); /* overlap is impossible */
+ }
+ if (params->keylen) {
+ if (!n)
+ off = 0;
+ else
+ n += off = storemem(&block[n], params->key, params->keylen, 128 - n);
+ if (n == 128) {
+ libblake_blake2b_force_update(&state, block, n);
+ n = 0;
+ }
+ if (n == 0 && off < params->keylen) {
+ off += libblake_blake2b_force_update(&state, SEGMENT(params->key, params->keylen, off));
+ memcpy(block, SEGMENT(params->key, params->keylen, off));
+ n = params->keylen - off;
+ }
+ if (ctx->autoerase_secret)
+ ERASE(params->key, params->keylen);
+ }
+
+ n += store32(&block[n], (uint_least32_t)params->adlen);
+ if (n > 128 || (n == 128 && params->adlen)) {
+ n -= libblake_blake2b_force_update(&state, block, n);
+ memcpy(block, &block[128], n); /* overlap is impossible */
+ }
+ if (params->adlen) {
+ if (!n)
+ off = 0;
+ else
+ n += off = storemem(&block[n], params->ad, params->adlen, 128 - n);
+ if (off < params->adlen) {
+ if (n == 128) {
+ libblake_blake2b_force_update(&state, block, n);
+ n = 0;
+ }
+ if (n == 0) {
+ off += libblake_blake2b_update(&state, SEGMENT(params->ad, params->adlen, off));
+ if (params->adlen - off > 128)
+ off += libblake_blake2b_force_update(&state, SEGMENT(params->ad, params->adlen, off));
+ memcpy(block, SEGMENT(params->ad, params->adlen, off));
+ n = params->adlen - off;
+ }
+ }
+ if (ctx->autoerase_associated_data)
+ ERASE(params->ad, params->adlen);
+ }
+
+ libblake_blake2b_digest(&state, block, n, 0, 64, hash);
+
+ ERASE_ARRAY(block);
+ ERASE_STRUCT(state);
+
+#undef SEGMENT
+}
+
+
+static void /* this is not BLAKE2Xb, but something Argon2-specific */
+argon2_blake2b_exthash(void *hash_, size_t hashlen, void *msg_, size_t msglen)
+{
+ struct libblake_blake2b_params params;
+ struct libblake_blake2b_state state;
+ unsigned char *msg = msg_;
+ unsigned char block[128];
+ unsigned char *hash = hash_;
+ size_t n, off;
+
+ params = b2params;
+ params.digest_len = (uint_least8_t)MIN(hashlen, (size_t)params.digest_len);
+
+ libblake_blake2b_init(&state, &params, NULL);
+ n = store32(block, (uint_least32_t)hashlen);
+ n += off = storemem(&block[n], msg, msglen, 128 - n);
+ if (off == msglen) {
+ libblake_blake2b_digest(&state, block, n, 0, params.digest_len, hash);
+ } else {
+ libblake_blake2b_force_update(&state, block, 128);
+ libblake_blake2b_digest(&state, &msg[off], msglen - off, 0, params.digest_len, hash);
+ }
+
+ if (hashlen > 64) {
+ hashlen -= 32;
+ params.digest_len = 64;
+ while (hashlen > 64) {
+ libblake_blake2b_init(&state, &params, NULL);
+ libblake_blake2b_digest(&state, hash, 64, 0, 64, &hash[32]);
+ hash += 32;
+ hashlen -= 32;
+ }
+ params.digest_len = (uint_least8_t)hashlen;
+ libblake_blake2b_init(&state, &params, NULL);
+ libblake_blake2b_digest(&state, hash, 64, 0, hashlen, &hash[32]);
+ }
+
+ ERASE_STRUCT(state);
+ ERASE_ARRAY(block);
+}
+
+
+int
+libar2_hash(void *hash, void *msg, size_t msglen, struct libar2_argon2_parameters *params, struct libar2_context *ctx)
+{
+ unsigned char block[1024 + 128], hash0[256];
+ uint_least32_t blocks, seglen, lanelen;
+ struct block *memory;
+ size_t i, p, s, nthreads, ts[16], ti, tn;
+ struct threaded_fill_segments_params *tparams = NULL;
+ uint_least64_t *sbox = NULL; /* This is 8K large (assuming support for uint64_t), so we allocate it dynamically */
+
+ if (libar2_validate_params(params, NULL) || msglen >> 31 > 1) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ blocks = MAX(params->m_cost, 8 * params->lanes);
+ seglen = blocks / (4 * params->lanes);
+ blocks -= blocks % (4 * params->lanes);
+ lanelen = seglen * 4;
+
+ memory = ctx->allocate(blocks, sizeof(struct block), MAX(ALIGNOF(struct block), CACHE_LINE_SIZE), ctx);
+ if (!memory)
+ return -1;
+
+ if (params->type == LIBAR2_ARGON2DS) {
+ sbox = ctx->allocate(1024, sizeof(*sbox), ALIGNOF(uint_least64_t), ctx);
+ if (!sbox) {
+ ctx->deallocate(memory, ctx);
+ return -1;
+ }
+ }
+
+ initial_hash(hash0, msg, msglen, params, ctx);
+ for (i = 0; i < params->lanes; i++) {
+ store32(&hash0[64], 0);
+ store32(&hash0[68], (uint_least32_t)i);
+ argon2_blake2b_exthash(block, 1024, hash0, 72);
+ load_block(&memory[i * lanelen + 0], block); /* TODO this is a copy function on LE-machines */
+
+ store32(&hash0[64], 1);
+ argon2_blake2b_exthash(block, 1024, hash0, 72);
+ load_block(&memory[i * lanelen + 1], block);
+ }
+
+ ERASE_ARRAY(hash0);
+
+ if (ctx->init_thread_pool(params->lanes, &nthreads, ctx))
+ goto fail;
+ if (nthreads == 1) {
+ nthreads = 0;
+ if (ctx->destroy_thread_pool(ctx))
+ goto fail;
+ }
+
+ if (!nthreads) {
+ for (p = 0; p < params->t_cost; p++) {
+ if (sbox)
+ generate_sbox(sbox, memory);
+ for (s = 0; s < 4; s++) {
+ for (i = 0; i < params->lanes; i++) {
+ fill_segment(memory, sbox, params, seglen, lanelen, blocks,
+ (uint_least32_t)p, (uint_least32_t)i, (uint_least32_t)s);
+ }
+ }
+ }
+
+ } else {
+ tparams = ctx->allocate(nthreads, sizeof(*tparams), ALIGNOF(struct threaded_fill_segments_params), ctx);
+ if (!tparams) {
+ ctx->destroy_thread_pool(ctx);
+ goto fail;
+ }
+ for (i = 0; i < nthreads; i++) {
+ tparams[i].memory = memory;
+ tparams[i].sbox = sbox;
+ tparams[i].params = params;
+ tparams[i].seglen = seglen;
+ tparams[i].lanelen = lanelen;
+ tparams[i].blocks = blocks;
+ }
+
+ for (p = 0; p < params->t_cost; p++) {
+ if (sbox)
+ generate_sbox(sbox, memory);
+ for (s = 0; s < 4; s++) {
+ ti = tn = 0;
+ for (i = 0; i < params->lanes; i++) {
+ if (ti == tn) {
+ tn = ctx->get_ready_threads(ts, ELEMSOF(ts), ctx);
+ if (!tn)
+ goto fail;
+ }
+ tparams[ti].pass = (uint_least32_t)p;
+ tparams[ti].lane = (uint_least32_t)i;
+ tparams[ti].slice = (uint_least32_t)s;
+ if (ctx->run_thread(ts[ti], threaded_fill_segment, &tparams[ti], ctx))
+ goto fail;
+ ti++;
+ }
+ if (ctx->join_thread_pool(ctx))
+ goto fail;
+ }
+ }
+
+ if (ctx->destroy_thread_pool(ctx))
+ goto fail;
+ ctx->deallocate(tparams, ctx);
+ tparams = NULL;
+ }
+
+ for (i = 1; i < params->lanes; i++)
+ memxor(&memory[lanelen - 1], &memory[i * lanelen + lanelen - 1], sizeof(*memory));
+ store_block(block, &memory[lanelen - 1]);
+ argon2_blake2b_exthash(hash, params->hashlen, block, 1024);
+
+ ERASE_ARRAY(block);
+ if (sbox)
+ ctx->deallocate(sbox, ctx);
+ ctx->deallocate(memory, ctx);
+ return 0;
+
+fail:
+ if (tparams)
+ ctx->deallocate(tparams, ctx);
+ if (sbox)
+ ctx->deallocate(sbox, ctx);
+ ctx->deallocate(memory, ctx);
+ return -1;
+}