aboutsummaryrefslogtreecommitdiffstats
path: root/libblake_internal_blake2b_output_digest.c
diff options
context:
space:
mode:
authorMattias Andrée <maandree@kth.se>2022-01-19 20:28:55 +0100
committerMattias Andrée <maandree@kth.se>2022-01-19 20:28:55 +0100
commit5d77a0178349ecac6536e0374cf689500efa22bc (patch)
treef6fcb38cd39e8f4240537233a08fdbb5c0284798 /libblake_internal_blake2b_output_digest.c
parentImprove portability (diff)
downloadlibblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.gz
libblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.bz2
libblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.xz
Optimisation for amd64
Increased major number as the ABI was broken by insertion of padding into the BLAKE2 parameter structures (except for BLAKE2Xs) Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to '')
-rw-r--r--libblake_internal_blake2b_output_digest.c36
1 files changed, 34 insertions, 2 deletions
diff --git a/libblake_internal_blake2b_output_digest.c b/libblake_internal_blake2b_output_digest.c
index bc5b407..8700c8c 100644
--- a/libblake_internal_blake2b_output_digest.c
+++ b/libblake_internal_blake2b_output_digest.c
@@ -4,8 +4,18 @@
static void
encode_uint64_le(unsigned char *out, uint_least64_t value, size_t bytes)
{
+ /* Adding LIKELY to indicate that the default case is the
+ * expected one does not affect the output */
switch (bytes) {
default:
+ /*
+ * The following optimisations have been tested:
+ *
+ * 1) Changing the default case, on amd64, to
+ * *(uint64_t *)out = (uint64_t)value;
+ * break;
+ * result: halved performance
+ */
out[7] = (unsigned char)((value >> 56) & 255);
/* fall through */
case 7:
@@ -39,6 +49,28 @@ libblake_internal_blake2b_output_digest(struct libblake_blake2b_state *state, si
{
size_t i, j;
- for (i = 0, j = 0; i < output_len; i += 8, j += 1)
- encode_uint64_le(&output[i], state->h[j], output_len - i);
+#ifdef LITTLE_ENDIAN
+ if ((uint_least64_t)(UINT_LEAST64_C(0xFFFFffffFFFFffff) + 1) == 0) {
+ /* 37.5x performance improvement;
+ * even though the compiler is smart enough to optimise
+ * `encode_uint64_le(&output[i], state->h[j], 8);` to a
+ * movq (on amd64), it is not smart enough to optimise
+ * the rest */
+ memcpy(output, state->h, output_len);
+ return;
+ }
+#endif
+
+ /* Estimated to have similar performance benefit as above
+ * on big-endian machines */
+ for (i = 0, j = 0; i + 8 < output_len; i += 8, j += 1)
+ encode_uint64_le(&output[i], state->h[j], 8);
+ encode_uint64_le(&output[i], state->h[j], output_len - i);
+
+ /*
+ * Unoptimised code:
+ *
+ * for (i = 0, j = 0; i < output_len; i += 8, j += 1)
+ * encode_uint64_le(&output[i], state->h[j], output_len - i);
+ */
}