From 8bdc2e4929068210d523c34c0b171b51ce96057f Mon Sep 17 00:00:00 2001
From: Mattias Andrée
Date: Sun, 16 Jan 2022 19:58:57 +0100
Subject: First commit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Mattias Andrée
---
 libar2_hash.c | 612 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 612 insertions(+)
 create mode 100644 libar2_hash.c

diff --git a/libar2_hash.c b/libar2_hash.c
new file mode 100644
index 0000000..0eb9211
--- /dev/null
+++ b/libar2_hash.c
@@ -0,0 +1,612 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+
+
+struct threaded_fill_segments_params {
+	struct block *memory;
+	const uint_least64_t *sbox;
+	struct libar2_argon2_parameters *params;
+	uint_least32_t seglen;
+	uint_least32_t lanelen;
+	uint_least32_t blocks;
+	uint_least32_t pass;
+	uint_least32_t lane;
+	uint_least32_t slice;
+};
+
+
+static const struct libblake_blake2b_params b2params = {
+	.digest_len = 64,
+	.key_len = 0,
+	.fanout = 1,
+	.depth = 1,
+	.leaf_len = 0,
+	.node_offset = 0,
+	.node_depth = 0,
+	.inner_len = 0
+};
+
+
+static const struct block zerob; /* implicitly zeroed (static storage duration) */
+
+
+static void
+memxor(void *a_, const void *b_, size_t n)
+{
+	unsigned char *a = a_;
+	const unsigned char *b = b_;
+	size_t i;
+	for (i = 0; i < n; i++)
+		a[i] ^= b[i];
+}
+
+
+static size_t
+store32(unsigned char *out, uint_least32_t value)
+{
+	out[0] = (unsigned char)((value >> 0) & 255);
+	out[1] = (unsigned char)((value >> 8) & 255);
+	out[2] = (unsigned char)((value >> 16) & 255);
+	out[3] = (unsigned char)((value >> 24) & 255);
+	return 4;
+}
+
+
+static void
+store64(unsigned char *out, uint_least64_t value)
+{
+	out[0] = (unsigned char)((value >> 0) & 255);
+	out[1] = (unsigned char)((value >> 8) & 255);
+	out[2] = (unsigned char)((value >> 16) & 255);
+	out[3] = (unsigned char)((value >> 24) & 255);
+	out[4] = (unsigned char)((value >> 32) & 255);
+	out[5] = (unsigned char)((value >> 40) & 255);
+	out[6] = (unsigned char)((value >> 48) & 255);
+	out[7] = (unsigned char)((value >> 56) & 255);
+}
+
+
+static void
+load64(uint_least64_t *out, const unsigned char *data)
+{
+	*out = ((uint_least64_t)(data[0] & 255) << 0)
+	     | ((uint_least64_t)(data[1] & 255) << 8)
+	     | ((uint_least64_t)(data[2] & 255) << 16)
+	     | ((uint_least64_t)(data[3] & 255) << 24)
+	     | ((uint_least64_t)(data[4] & 255) << 32)
+	     | ((uint_least64_t)(data[5] & 255) << 40)
+	     | ((uint_least64_t)(data[6] & 255) << 48)
+	     | ((uint_least64_t)(data[7] & 255) << 56);
+}
+
+
+static void
+store_block(unsigned char *block8, const struct block *block64)
+{
+	size_t i, j;
+	for (i = 0, j = 0; i < 1024; i += 8, j += 1)
+		store64(&block8[i], block64->w[j]);
+}
+
+
+static void
+load_block(struct block *block64, const unsigned char *block8)
+{
+	size_t i, j;
+	for (i = 0, j = 0; i < 1024; i += 8, j += 1)
+		load64(&block64->w[j], &block8[i]);
+}
+
+
+static size_t
+storemem(unsigned char *out, const void *mem, size_t len, size_t max)
+{
+	size_t n = MIN(len, max);
+	memcpy(out, mem, n);
+	return n;
+}
+
+
+static uint_least64_t
+rotr64(uint_least64_t x, int n)
+{
+	return ((x >> n) | (x << (64 - n))) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
+}
+
+
+static uint_least64_t
+fBlaMka(uint_least64_t x, uint_least64_t y)
+{
+	/* mask in case uint_least64_t is wider than 64 bits, as in rotr64 above */
+	return (x + y + 2 * (x & UINT_LEAST64_C(0xFFffFFff)) * (y & UINT_LEAST64_C(0xFFffFFff)))
+	       & UINT_LEAST64_C(0xFFFFffffFFFFffff);
+}
+
+
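+/*
+ * Compute G(prevblock XOR refblock) into `block`, where G is the
+ * BlaMka-based compression function. When `with_xor` is set (passes
+ * after the first, under versions newer than 1.0), the result is
+ * XORed into the previous contents of `block` instead of overwriting
+ * them. `sbox` is non-NULL only for Argon2ds, where it enables the
+ * S-box hardening loop below.
+ */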
+static void
+fill_block(struct block *block, const struct block *prevblock, const struct block *refblock,
+           int with_xor, const uint_least64_t *sbox)
+{
+	uint_least64_t x = 0;
+	uint_least32_t x_hi, x_lo;
+	struct block tmpblock;
+	size_t i;
+
+	if (with_xor) {
+		for (i = 0; i < ELEMSOF(refblock->w); i++)
+			block->w[i] ^= tmpblock.w[i] = refblock->w[i] ^ prevblock->w[i];
+	} else {
+		for (i = 0; i < ELEMSOF(refblock->w); i++)
+			block->w[i] = tmpblock.w[i] = refblock->w[i] ^ prevblock->w[i];
+	}
+
+	if (sbox) { /* Argon2ds only */
+		x = tmpblock.w[0] ^ tmpblock.w[ELEMSOF(tmpblock.w) - 1];
+		for (i = 0; i < 96; i++) {
+			x_hi = (uint_least32_t)(x >> 32);
+			x_lo = (uint_least32_t)x & UINT_LEAST32_C(0xFFFFffff);
+			x = (uint_least64_t)x_hi * (uint_least64_t)x_lo;
+			x += sbox[(x_hi & UINT_LEAST32_C(0x1FF)) + 0];
+			x ^= sbox[(x_lo & UINT_LEAST32_C(0x1FF)) + 512];
+		}
+	}
+
+#define BLAMKA_G(A, B, C, D)\
+	A = fBlaMka(A, B);\
+	D = rotr64(D ^ A, 32);\
+	C = fBlaMka(C, D);\
+	B = rotr64(B ^ C, 24);\
+	A = fBlaMka(A, B);\
+	D = rotr64(D ^ A, 16);\
+	C = fBlaMka(C, D);\
+	B = rotr64(B ^ C, 63)
+
+#define BLAMKA_ROUND(W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, WA, WB, WC, WD, WE, WF)\
+	BLAMKA_G(W0, W4, W8, WC);\
+	BLAMKA_G(W1, W5, W9, WD);\
+	BLAMKA_G(W2, W6, WA, WE);\
+	BLAMKA_G(W3, W7, WB, WF);\
+	BLAMKA_G(W0, W5, WA, WF);\
+	BLAMKA_G(W1, W6, WB, WC);\
+	BLAMKA_G(W2, W7, W8, WD);\
+	BLAMKA_G(W3, W4, W9, WE)
+
+#define BLAMKA_ROUND_(ARR, OFF, W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, WA, WB, WC, WD, WE, WF)\
+	BLAMKA_ROUND(ARR[OFF + W0], ARR[OFF + W1], ARR[OFF + W2], ARR[OFF + W3],\
+	             ARR[OFF + W4], ARR[OFF + W5], ARR[OFF + W6], ARR[OFF + W7],\
+	             ARR[OFF + W8], ARR[OFF + W9], ARR[OFF + WA], ARR[OFF + WB],\
+	             ARR[OFF + WC], ARR[OFF + WD], ARR[OFF + WE], ARR[OFF + WF])
+
+	for (i = 0; i < 8; i++) { /* first across each of the 8 rows of 16 words */
+		BLAMKA_ROUND_(tmpblock.w, i * 16,
+		              0, 1, 2, 3,
+		              4, 5, 6, 7,
+		              8, 9, 10, 11,
+		              12, 13, 14, 15);
+	}
+	for (i = 0; i < 8; i++) { /* then down each of the 8 pairs of columns */
+		BLAMKA_ROUND_(tmpblock.w, i * 2,
+		              0, 1, 16, 17,
+		              32, 33, 48, 49,
+		              64, 65, 80, 81,
+		              96, 97, 112, 113);
+	}
+
+	for (i = 0; i < ELEMSOF(refblock->w); i++)
+		block->w[i] ^= tmpblock.w[i];
+
+	block->w[0] += x;
+	block->w[ELEMSOF(block->w) - 1] += x;
+	block->w[0] &= UINT_LEAST64_C(0xFFFFffffFFFFffff);
+	block->w[ELEMSOF(block->w) - 1] &= UINT_LEAST64_C(0xFFFFffffFFFFffff);
+}
+
+
+static void
+generate_sbox(uint_least64_t *sbox, struct block *memory)
+{
+	void *next, *prev = memory;
+	size_t i;
+
+	for (i = 0; i < 8; i++) {
+		next = &sbox[i * 128];
+		fill_block(next, &zerob, prev, 0, NULL);
+		fill_block(next, &zerob, next, 0, NULL);
+		prev = next;
+	}
+}
+
+
+static void
+next_address_block(struct block *addrb, struct block *inputb)
+{
+	inputb->w[6] += 1;
+	fill_block(addrb, &zerob, inputb, 0, NULL);
+	fill_block(addrb, &zerob, addrb, 0, NULL);
+}
+
+
+static uint_least32_t
+get_rindex(uint_least32_t seglen, uint_least32_t lanelen, uint_least32_t pass,
+           uint_least32_t slice, uint_least32_t index, uint_least64_t prand, int same_lane)
+{
+	uint_least32_t size, startpos;
+	uint_least64_t relpos;
+
+	/* `size` is the number of blocks that may be referenced */
+	if (!pass) {
+		if (!slice)
+			size = index - 1;
+		else if (same_lane)
+			size = slice * seglen + index - 1;
+		else
+			size = slice * seglen - !index;
+	} else {
+		if (same_lane)
+			size = lanelen - seglen + index - 1;
+		else
+			size = lanelen - seglen - !index;
+	}
+
+	prand &= UINT_LEAST64_C(0xFFffFFff);
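+	/* Squaring the low 32 bits of prand and rescaling the result to
+	   the window size computed above biases the reference towards
+	   recently written blocks. */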
+	relpos = (prand * prand) >> 32;
+	relpos = ((uint_least64_t)size * relpos) >> 32;
+	relpos = (uint_least64_t)size - 1 - relpos;
+
+	/* On passes after the first, the window starts just after the
+	   current slice and wraps around the lane (hence the modulo). */
+	startpos = (pass && slice != 3) ? (slice + 1) * seglen : 0;
+
+	return (startpos + (uint_least32_t)relpos) % lanelen;
+}
+
+
+static void
+fill_segment(struct block *memory, const uint_least64_t *sbox, struct libar2_argon2_parameters *params,
+             uint_least32_t seglen, uint_least32_t lanelen, uint_least32_t blocks,
+             uint_least32_t pass, uint_least32_t lane, uint_least32_t slice)
+{
+	int data_independent;
+	struct block inputb, addrb;
+	uint_least32_t off, prevoff, rlane, rindex;
+	uint_least32_t index = 0, i;
+	uint_least64_t prand;
+
+	/* Argon2i is always data-independent; Argon2id is data-independent
+	   during the first half of the first pass */
+	data_independent =
+		(params->type == LIBAR2_ARGON2I) ||
+		(params->type == LIBAR2_ARGON2ID && !pass && slice < 2);
+
+	if (data_independent) {
+		memset(&inputb.w[6], 0, sizeof(*inputb.w) * (ELEMSOF(inputb.w) - 6));
+		inputb.w[0] = pass;
+		inputb.w[1] = lane;
+		inputb.w[2] = slice;
+		inputb.w[3] = blocks;
+		inputb.w[4] = params->t_cost;
+		inputb.w[5] = (uint_least32_t)params->type;
+	}
+
+	if (!pass && !slice) {
+		if (data_independent) {
+			next_address_block(&addrb, &inputb);
+		}
+		index = 2; /* the first two blocks of each lane come from the initial hash */
+	}
+
+	off = lane * lanelen + slice * seglen + index;
+	prevoff = off - 1 + (off % lanelen ? 0 : lanelen);
+
+	for (; index < seglen; index++, off++, prevoff++) {
+		if (off % lanelen == 1)
+			prevoff = off - 1;
+		if (data_independent) {
+			i = index % ELEMSOF(addrb.w);
+			if (!i) {
+				next_address_block(&addrb, &inputb);
+			}
+			prand = addrb.w[i];
+		} else {
+			prand = memory[prevoff].w[0];
+		}
+
+		rlane = (!pass && !slice) ? lane : (uint_least32_t)(prand >> 32) % params->lanes;
+		rindex = get_rindex(seglen, lanelen, pass, slice, index, prand, rlane == lane);
+
+		fill_block(&memory[off], &memory[prevoff], &memory[rlane * lanelen + rindex],
+		           params->version > LIBAR2_ARGON2_VERSION_10 && pass, sbox);
+	}
+}
+
+
+static void
+threaded_fill_segment(void *data)
+{
+	struct threaded_fill_segments_params *tparams = data;
+	fill_segment(tparams->memory, tparams->sbox, tparams->params,
+	             tparams->seglen, tparams->lanelen, tparams->blocks,
+	             tparams->pass, tparams->lane, tparams->slice);
+}
+
+
+static void
+initial_hash(unsigned char hash[static 64], void *msg, size_t msglen,
+             struct libar2_argon2_parameters *params, struct libar2_context *ctx)
+{
+#define SEGMENT(DATA, LEN, OFF) &((const unsigned char *)(DATA))[(OFF)], (LEN) - (OFF)
+
+	struct libblake_blake2b_state state;
+	unsigned char block[128 + 3]; /* one BLAKE2b block, plus room for a length field to spill past it */
+	size_t n = 0, off;
+
+	libblake_blake2b_init(&state, &b2params, NULL);
+
+	n += store32(&block[n], params->lanes);
+	n += store32(&block[n], (uint_least32_t)params->hashlen);
+	n += store32(&block[n], params->m_cost);
+	n += store32(&block[n], params->t_cost);
+	n += store32(&block[n], (uint_least32_t)params->version);
+	n += store32(&block[n], (uint_least32_t)params->type);
+	n += store32(&block[n], (uint_least32_t)msglen);
+	if (msglen) {
+		n += off = storemem(&block[n], msg, msglen, 128 - n);
+		if (n == 128) {
+			libblake_blake2b_force_update(&state, block, n);
+			n = 0;
+			if (off < msglen) {
+				off += libblake_blake2b_force_update(&state, SEGMENT(msg, msglen, off));
+				memcpy(block, SEGMENT(msg, msglen, off));
+				n = msglen - off;
+			}
+		}
+		if (ctx->autoerase_message)
+			ERASE(msg, msglen);
+	}
+
+	n += store32(&block[n], (uint_least32_t)params->saltlen);
+	if (n >= 128) {
+		n -= libblake_blake2b_force_update(&state, block, n);
+		memcpy(block, &block[128], n); /* overlap is impossible */
+	}
+	if (params->saltlen) {
+		if (!n)
+			off = 0;
+		else
+			n += off = storemem(&block[n], params->salt, params->saltlen, 128 - n);
+		if (n == 128) {
+			libblake_blake2b_force_update(&state, block, n);
+			n = 0;
+		}
+		if (n == 0
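+		    /* i.e. everything buffered so far has been flushed */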
&& off < params->saltlen) { + off += libblake_blake2b_force_update(&state, SEGMENT(params->salt, params->saltlen, off)); + memcpy(block, SEGMENT(params->salt, params->saltlen, off)); + n = params->saltlen - off; + } + if (ctx->autoerase_salt) + ERASE(params->salt, params->saltlen); + } + + n += store32(&block[n], (uint_least32_t)params->keylen); + if (n >= 128) { + n -= libblake_blake2b_force_update(&state, block, n); + memcpy(block, &block[128], n); /* overlap is impossible */ + } + if (params->keylen) { + if (!n) + off = 0; + else + n += off = storemem(&block[n], params->key, params->keylen, 128 - n); + if (n == 128) { + libblake_blake2b_force_update(&state, block, n); + n = 0; + } + if (n == 0 && off < params->keylen) { + off += libblake_blake2b_force_update(&state, SEGMENT(params->key, params->keylen, off)); + memcpy(block, SEGMENT(params->key, params->keylen, off)); + n = params->keylen - off; + } + if (ctx->autoerase_secret) + ERASE(params->key, params->keylen); + } + + n += store32(&block[n], (uint_least32_t)params->adlen); + if (n > 128 || (n == 128 && params->adlen)) { + n -= libblake_blake2b_force_update(&state, block, n); + memcpy(block, &block[128], n); /* overlap is impossible */ + } + if (params->adlen) { + if (!n) + off = 0; + else + n += off = storemem(&block[n], params->ad, params->adlen, 128 - n); + if (off < params->adlen) { + if (n == 128) { + libblake_blake2b_force_update(&state, block, n); + n = 0; + } + if (n == 0) { + off += libblake_blake2b_update(&state, SEGMENT(params->ad, params->adlen, off)); + if (params->adlen - off > 128) + off += libblake_blake2b_force_update(&state, SEGMENT(params->ad, params->adlen, off)); + memcpy(block, SEGMENT(params->ad, params->adlen, off)); + n = params->adlen - off; + } + } + if (ctx->autoerase_associated_data) + ERASE(params->ad, params->adlen); + } + + libblake_blake2b_digest(&state, block, n, 0, 64, hash); + + ERASE_ARRAY(block); + ERASE_STRUCT(state); + +#undef SEGMENT +} + + +static void /* this is not BLAKE2Xb, but something Argon2-specific */ +argon2_blake2b_exthash(void *hash_, size_t hashlen, void *msg_, size_t msglen) +{ + struct libblake_blake2b_params params; + struct libblake_blake2b_state state; + unsigned char *msg = msg_; + unsigned char block[128]; + unsigned char *hash = hash_; + size_t n, off; + + params = b2params; + params.digest_len = (uint_least8_t)MIN(hashlen, (size_t)params.digest_len); + + libblake_blake2b_init(&state, ¶ms, NULL); + n = store32(block, (uint_least32_t)hashlen); + n += off = storemem(&block[n], msg, msglen, 128 - n); + if (off == msglen) { + libblake_blake2b_digest(&state, block, n, 0, params.digest_len, hash); + } else { + libblake_blake2b_force_update(&state, block, 128); + libblake_blake2b_digest(&state, &msg[off], msglen - off, 0, params.digest_len, hash); + } + + if (hashlen > 64) { + hashlen -= 32; + params.digest_len = 64; + while (hashlen > 64) { + libblake_blake2b_init(&state, ¶ms, NULL); + libblake_blake2b_digest(&state, hash, 64, 0, 64, &hash[32]); + hash += 32; + hashlen -= 32; + } + params.digest_len = (uint_least8_t)hashlen; + libblake_blake2b_init(&state, ¶ms, NULL); + libblake_blake2b_digest(&state, hash, 64, 0, hashlen, &hash[32]); + } + + ERASE_STRUCT(state); + ERASE_ARRAY(block); +} + + +int +libar2_hash(void *hash, void *msg, size_t msglen, struct libar2_argon2_parameters *params, struct libar2_context *ctx) +{ + unsigned char block[1024 + 128], hash0[256]; + uint_least32_t blocks, seglen, lanelen; + struct block *memory; + size_t i, p, s, nthreads, ts[16], ti, tn; + 
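+	/* one parameter slot per pool thread; only allocated when a
+	   thread pool is actually available (see the slice loop below) */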
struct threaded_fill_segments_params *tparams = NULL;
+	uint_least64_t *sbox = NULL; /* This is 8K large (assuming support for uint64_t), so we allocate it dynamically */
+
+	if (libar2_validate_params(params, NULL) || msglen >> 31 > 1) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	blocks = MAX(params->m_cost, 8 * params->lanes);
+	seglen = blocks / (4 * params->lanes);
+	blocks -= blocks % (4 * params->lanes);
+	lanelen = seglen * 4;
+
+	memory = ctx->allocate(blocks, sizeof(struct block), MAX(ALIGNOF(struct block), CACHE_LINE_SIZE), ctx);
+	if (!memory)
+		return -1;
+
+	if (params->type == LIBAR2_ARGON2DS) {
+		sbox = ctx->allocate(1024, sizeof(*sbox), ALIGNOF(uint_least64_t), ctx);
+		if (!sbox) {
+			ctx->deallocate(memory, ctx);
+			return -1;
+		}
+	}
+
+	initial_hash(hash0, msg, msglen, params, ctx);
+	for (i = 0; i < params->lanes; i++) {
+		store32(&hash0[64], 0);
+		store32(&hash0[68], (uint_least32_t)i);
+		argon2_blake2b_exthash(block, 1024, hash0, 72);
+		load_block(&memory[i * lanelen + 0], block); /* TODO this is a copy function on LE-machines */
+
+		store32(&hash0[64], 1);
+		argon2_blake2b_exthash(block, 1024, hash0, 72);
+		load_block(&memory[i * lanelen + 1], block);
+	}
+
+	ERASE_ARRAY(hash0);
+
+	if (ctx->init_thread_pool(params->lanes, &nthreads, ctx))
+		goto fail;
+	if (nthreads == 1) {
+		nthreads = 0;
+		if (ctx->destroy_thread_pool(ctx))
+			goto fail;
+	}
+
+	if (!nthreads) {
+		for (p = 0; p < params->t_cost; p++) {
+			if (sbox)
+				generate_sbox(sbox, memory);
+			for (s = 0; s < 4; s++) {
+				for (i = 0; i < params->lanes; i++) {
+					fill_segment(memory, sbox, params, seglen, lanelen, blocks,
+					             (uint_least32_t)p, (uint_least32_t)i, (uint_least32_t)s);
+				}
+			}
+		}
+
+	} else {
+		tparams = ctx->allocate(nthreads, sizeof(*tparams), ALIGNOF(struct threaded_fill_segments_params), ctx);
+		if (!tparams) {
+			ctx->destroy_thread_pool(ctx);
+			goto fail;
+		}
+		for (i = 0; i < nthreads; i++) {
+			tparams[i].memory = memory;
+			tparams[i].sbox = sbox;
+			tparams[i].params = params;
+			tparams[i].seglen = seglen;
+			tparams[i].lanelen = lanelen;
+			tparams[i].blocks = blocks;
+		}
+
+		for (p = 0; p < params->t_cost; p++) {
+			if (sbox)
+				generate_sbox(sbox, memory);
+			for (s = 0; s < 4; s++) {
+				ti = tn = 0;
+				for (i = 0; i < params->lanes; i++) {
+					if (ti == tn) {
+						tn = ctx->get_ready_threads(ts, ELEMSOF(ts), ctx);
+						if (!tn)
+							goto fail;
+						ti = 0; /* restart at the beginning of the new batch */
+					}
+					/* index by pool-thread ID, so each thread keeps
+					   exclusive use of its slot while it is running */
+					tparams[ts[ti]].pass = (uint_least32_t)p;
+					tparams[ts[ti]].lane = (uint_least32_t)i;
+					tparams[ts[ti]].slice = (uint_least32_t)s;
+					if (ctx->run_thread(ts[ti], threaded_fill_segment, &tparams[ts[ti]], ctx))
+						goto fail;
+					ti++;
+				}
+				if (ctx->join_thread_pool(ctx))
+					goto fail;
+			}
+		}
+
+		if (ctx->destroy_thread_pool(ctx))
+			goto fail;
+		ctx->deallocate(tparams, ctx);
+		tparams = NULL;
+	}
+
+	for (i = 1; i < params->lanes; i++)
+		memxor(&memory[lanelen - 1], &memory[i * lanelen + lanelen - 1], sizeof(*memory));
+	store_block(block, &memory[lanelen - 1]);
+	argon2_blake2b_exthash(hash, params->hashlen, block, 1024);
+
+	ERASE_ARRAY(block);
+	if (sbox)
+		ctx->deallocate(sbox, ctx);
+	ctx->deallocate(memory, ctx);
+	return 0;
+
+fail:
+	if (tparams)
+		ctx->deallocate(tparams, ctx);
+	if (sbox)
+		ctx->deallocate(sbox, ctx);
+	ctx->deallocate(memory, ctx);
+	return -1;
+}
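
Usage note (not part of the patch): the sketch below shows one way to drive
this entry point single-threaded. The callback prototypes are inferred from
the call sites above; the authoritative declarations are in libar2.h, and
LIBAR2_ARGON2_VERSION_13 is assumed to be defined alongside
LIBAR2_ARGON2_VERSION_10.

	#include <libar2.h>
	#include <stdlib.h>
	#include <string.h>

	/* calloc ignores the requested alignment; acceptable for a sketch,
	   since the cache-line alignment is only a performance hint */
	static void *
	my_allocate(size_t num, size_t size, size_t alignment, struct libar2_context *ctx)
	{
		(void) alignment; (void) ctx;
		return calloc(num, size);
	}

	static void
	my_deallocate(void *ptr, struct libar2_context *ctx)
	{
		(void) ctx;
		free(ptr);
	}

	/* reporting 0 threads makes libar2_hash take its serial path, so
	   the remaining thread-pool callbacks are never invoked */
	static int
	my_init_thread_pool(size_t desired, size_t *nthreadsp, struct libar2_context *ctx)
	{
		(void) desired; (void) ctx;
		*nthreadsp = 0;
		return 0;
	}

	int
	main(void)
	{
		unsigned char hash[32];
		char msg[] = "correct horse battery staple";
		unsigned char salt[16] = "0123456789abcdef";
		struct libar2_context ctx = {
			.allocate = my_allocate,
			.deallocate = my_deallocate,
			.init_thread_pool = my_init_thread_pool
		};
		struct libar2_argon2_parameters params = {
			.type = LIBAR2_ARGON2ID,
			.version = LIBAR2_ARGON2_VERSION_13,
			.t_cost = 3,
			.m_cost = 1 << 16, /* in KiB-sized blocks, i.e. 64 MiB */
			.lanes = 1,
			.salt = salt,
			.saltlen = sizeof(salt),
			.hashlen = sizeof(hash)
		};
		return libar2_hash(hash, msg, strlen(msg), &params, &ctx) ? 1 : 0;
	}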