/* See LICENSE file for copyright and license details. */
#include "common.h"


/**
 * Store the low-order bytes of an integer in little-endian order
 *
 * @param  out    Output buffer; `out[0]` receives the least significant byte
 * @param  value  The integer to encode
 * @param  bytes  Number of bytes to write: 0 writes nothing, 1 through 7
 *                write that many bytes, and any other value writes 8 bytes
 */
static void
encode_uint64_le(unsigned char *out, uint_least64_t value, size_t bytes)
{
	/*
	 * Notes from earlier optimisation experiments:
	 *
	 * - Adding LIKELY to mark the default case as the expected
	 *   one does not affect the output
	 *
	 * - Changing the default case, on amd64, to
	 *       *(uint64_t *)out = (uint64_t)value;
	 *       break;
	 *   halved the performance
	 */
	switch (bytes) {
	default:
		out[7] = (unsigned char)((value >> 56) & 0xFF);
		/* fall through */
	case 7:
		out[6] = (unsigned char)((value >> 48) & 0xFF);
		/* fall through */
	case 6:
		out[5] = (unsigned char)((value >> 40) & 0xFF);
		/* fall through */
	case 5:
		out[4] = (unsigned char)((value >> 32) & 0xFF);
		/* fall through */
	case 4:
		out[3] = (unsigned char)((value >> 24) & 0xFF);
		/* fall through */
	case 3:
		out[2] = (unsigned char)((value >> 16) & 0xFF);
		/* fall through */
	case 2:
		out[1] = (unsigned char)((value >> 8) & 0xFF);
		/* fall through */
	case 1:
		out[0] = (unsigned char)(value & 0xFF);
		/* fall through */
	case 0:
		break;
	}
}


/**
 * Write the leading `output_len` bytes of the little-endian
 * serialisation of `state->h[0]`, `state->h[1]`, ... to `output`
 *
 * `output_len` is not checked against the size of `state->h`;
 * the caller is responsible for keeping it within range
 *
 * @param  state       State whose `h` words are serialised
 * @param  output_len  Number of bytes to write to `output`
 * @param  output      Output buffer, at least `output_len` bytes large
 */
void
libblake_internal_blake2b_output_digest(struct libblake_blake2b_state *state, size_t output_len, unsigned char *output)
{
	size_t off = 0, word = 0;

#ifdef LITTLE_ENDIAN
	/* The test holds only when uint_least64_t wraps around at 2^64,
	 * that is, when it is exactly 64 bits wide; combined with
	 * LITTLE_ENDIAN (defined outside this file, presumably only for
	 * little-endian hosts) the in-memory words can be copied verbatim */
	if ((uint_least64_t)(UINT_LEAST64_C(0xFFFFffffFFFFffff) + 1) == 0) {
		/* Measured as a 37.5x performance improvement: the
		 * compiler is smart enough to optimise
		 * `encode_uint64_le(&output[i], state->h[j], 8);`
		 * to a movq (on amd64), but it is not smart enough
		 * to optimise the rest */
		memcpy(output, state->h, output_len);
		return;
	}
#endif

	/* Encode every word except the last one with a constant width of 8,
	 * then let the last call handle the remaining 0 to 8 bytes; this is
	 * estimated to have a similar performance benefit on big-endian
	 * machines as the shortcut above has on little-endian machines */
	while (off + 8 < output_len) {
		encode_uint64_le(&output[off], state->h[word], 8);
		off += 8;
		word += 1;
	}
	encode_uint64_le(&output[off], state->h[word], output_len - off);

	/*
	 * Unoptimised reference form of the code above:
	 *
	 * for (i = 0, j = 0; i < output_len; i += 8, j += 1)
	 *     encode_uint64_le(&output[i], state->h[j], output_len - i);
	 */
}