From 5d77a0178349ecac6536e0374cf689500efa22bc Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Wed, 19 Jan 2022 20:28:55 +0100 Subject: Optimisation for amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increased major number as the ABI was broken by insertion of padding into the BLAKE2 parameter structures (except for BLAKE2Xs) Signed-off-by: Mattias Andrée --- .gitignore | 2 + Makefile | 2 +- common.h | 31 ++++++++- libblake.h | 3 + libblake_blake2b_digest.c | 7 +- libblake_blake2b_force_update.c | 24 ++++++- libblake_blake2b_init.c | 110 ++++++++++++++++++----------- libblake_blake2b_update.c | 3 +- libblake_blake2s_digest.c | 7 +- libblake_blake2s_force_update.c | 18 +++++ libblake_blake2s_init.c | 78 ++++++++++++++------- libblake_blake2s_update.c | 1 + libblake_internal_blake2b_output_digest.c | 36 +++++++++- libblake_internal_blake2s_output_digest.c | 39 ++++++++++- libblake_internal_blake2xb_init0.c | 112 +++++++++++++++++++----------- libblake_internal_blake2xs_init0.c | 76 +++++++++++++------- 16 files changed, 406 insertions(+), 143 deletions(-) diff --git a/.gitignore b/.gitignore index c2c489c..4161a51 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *\#* *~ +a.out *.o *.a *.lo @@ -12,4 +13,5 @@ *.gcov *.gcno *.gcda +*.s /test diff --git a/Makefile b/Makefile index d96ac42..da9b9a7 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ OS = linux include mk/$(OS).mk -LIB_MAJOR = 1 +LIB_MAJOR = 2 LIB_MINOR = 0 LIB_VERSION = $(LIB_MAJOR).$(LIB_MINOR) LIB_NAME = blake diff --git a/common.h b/common.h index b1920dc..3f9dce0 100644 --- a/common.h +++ b/common.h @@ -24,9 +24,38 @@ #endif #if defined(__GNUC__) -# define HIDDEN __attribute__((__visibility__("hidden"))) +# define HIDDEN __attribute__((visibility("hidden"))) +# define LIKELY(X) __builtin_expect(!!(X), 1) +# define UNLIKELY(X) __builtin_expect(!!(X), 0) +# define ASSUMING_CONSTANT(X) (__builtin_constant_p(X) && (X)) #else # define HIDDEN +# define LIKELY(X) X +# define UNLIKELY(X) X +# define ASSUMING_CONSTANT(X) 0 +# if defined(__has_builtin) +# if __has_builtin(__builtin_expect) +# undef LIKELY +# undef UNLIKELY +# define LIKELY(X) __builtin_expect(!!(X), 1) +# define UNLIKELY(X) __builtin_expect(!!(X), 0) +# endif +# if __has_builtin(__builtin_constant_p) +# undef ASSUMING_CONSTANT +# define ASSUMING_CONSTANT(X) (__builtin_constant_p(X) && (X)) +# endif +# endif +#endif +#if defined(__has_builtin) +# define HAS_BUILTIN(X) __has_builtin(X) +#else +# define HAS_BUILTIN(X) 0 +#endif + +#if defined(__x86_64__) || defined(__i386__) +# define LITTLE_ENDIAN +#else +# error Endian is unknown #endif #define A 10 diff --git a/libblake.h b/libblake.h index 70b5a09..48c2ca1 100644 --- a/libblake.h +++ b/libblake.h @@ -86,6 +86,7 @@ struct libblake_blake2s_params { uint_least64_t node_offset; /* (48-bits) normally 0 */ uint_least8_t node_depth; /* normally 0 */ uint_least8_t inner_len; /* normally 0 */ + uint_least8_t _padding[2]; /* to keep .salt and .pepper aligned as uint_least32_t */ uint_least8_t salt[8]; uint_least8_t pepper[8]; }; @@ -99,6 +100,7 @@ struct libblake_blake2b_params { uint_least64_t node_offset; /* normally 0 */ uint_least8_t node_depth; /* normally 0 */ uint_least8_t inner_len; /* normally 0 */ + uint_least8_t _padding[6]; /* to keep .salt and .pepper aligned as uint_least64_t */ uint_least8_t salt[16]; uint_least8_t pepper[16]; }; @@ -127,6 +129,7 @@ struct libblake_blake2xb_params { uint_least32_t xof_len; /* max if not known in advance */ uint_least8_t node_depth; /* normally 0 */ uint_least8_t inner_len; /* normally 0 */ + uint_least8_t _padding[2]; /* to keep .salt and .pepper aligned as uint_least32_t */ uint_least8_t salt[16]; uint_least8_t pepper[16]; }; diff --git a/libblake_blake2b_digest.c b/libblake_blake2b_digest.c index 08b2d75..4338387 100644 --- a/libblake_blake2b_digest.c +++ b/libblake_blake2b_digest.c @@ -13,13 +13,16 @@ libblake_blake2b_digest(struct libblake_blake2b_state *state, void *data_, size_ len -= r; state->f[0] = UINT_LEAST64_C(0xFFFFffffFFFFffff); - if (last_node) + if (UNLIKELY(last_node)) state->f[1] = UINT_LEAST64_C(0xFFFFffffFFFFffff); memset(&data[len], 0, 128 - len); + /* This runs so seldom that we are not bothering with trying to optimise + * this since similar code in libblake_blake2b_force_update.c could not + * be optimised */ state->t[0] = (state->t[0] + len) & UINT_LEAST64_C(0xFFFFffffFFFFffff); - if (state->t[0] < len) + if (UNLIKELY(state->t[0] < len)) state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff); libblake_internal_blake2b_compress(state, data); diff --git a/libblake_blake2b_force_update.c b/libblake_blake2b_force_update.c index 60b8fab..2446e16 100644 --- a/libblake_blake2b_force_update.c +++ b/libblake_blake2b_force_update.c @@ -8,8 +8,30 @@ libblake_blake2b_force_update(struct libblake_blake2b_state *state, const void * size_t off = 0; for (; len - off >= 128; off += 128) { + /* The following optimisations have been tested: + * + * 1) + * `*(__uint128_t *)state->t += 128;` + * result: slower + * + * 2) + * addq, adcq using `__asm__ __volatile__` + * result: slower (as 1) + * + * 3) + * using `__builtin_add_overflow` + * result: no difference + * + * These testes where preformed on amd64 with a compile-time + * assumption that `UINT_LEAST64_C(0xFFFFffffFFFFffff) + 1 == 0`, + * which the compiler accepted and those included the attempted + * optimisations. + * + * UNLIKELY does not seem to make any difference, but it + * does change the output, theoretically of the better. + */ state->t[0] = (state->t[0] + 128) & UINT_LEAST64_C(0xFFFFffffFFFFffff); - if (state->t[0] < 128) + if (UNLIKELY(state->t[0] < 128)) state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff); libblake_internal_blake2b_compress(state, &data[off]); diff --git a/libblake_blake2b_init.c b/libblake_blake2b_init.c index b520a87..f07d1ef 100644 --- a/libblake_blake2b_init.c +++ b/libblake_blake2b_init.c @@ -1,6 +1,25 @@ /* See LICENSE file for copyright and license details. */ #include "common.h" +#if defined(LITTLE_ENDIAN) +# define le64(X) X +#else +static uint_least64_t +le64(uint_least64_t h) +{ + unsigned char r[8]; + r[0] = (unsigned char)((h >> 0) & 255); + r[1] = (unsigned char)((h >> 8) & 255); + r[2] = (unsigned char)((h >> 16) & 255); + r[3] = (unsigned char)((h >> 24) & 255); + r[4] = (unsigned char)((h >> 32) & 255); + r[5] = (unsigned char)((h >> 40) & 255); + r[6] = (unsigned char)((h >> 48) & 255); + r[7] = (unsigned char)((h >> 56) & 255); + return *(uint_least64_t *)r; +} +#endif + void libblake_blake2b_init(struct libblake_blake2b_state *state, const struct libblake_blake2b_params *params, const unsigned char *key) { @@ -18,46 +37,57 @@ libblake_blake2b_init(struct libblake_blake2b_state *state, const struct libblak state->f[0] = 0; state->f[1] = 0; - state->h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0; - state->h[0] ^= ((uint_least64_t)params->key_len & 255) << 8; - state->h[0] ^= ((uint_least64_t)params->fanout & 255) << 16; - state->h[0] ^= ((uint_least64_t)params->depth & 255) << 24; - state->h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32; - state->h[1] ^= params->node_offset & UINT_LEAST64_C(0xFFFFffffFFFFffff); - state->h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0; - state->h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8; - state->h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0; - state->h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8; - state->h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16; - state->h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24; - state->h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32; - state->h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40; - state->h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48; - state->h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56; - state->h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0; - state->h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8; - state->h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16; - state->h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24; - state->h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32; - state->h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40; - state->h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48; - state->h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56; - state->h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0; - state->h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8; - state->h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16; - state->h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24; - state->h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32; - state->h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40; - state->h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48; - state->h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56; - state->h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0; - state->h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8; - state->h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16; - state->h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24; - state->h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32; - state->h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40; - state->h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48; - state->h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56; + if (offsetof(struct libblake_blake2b_params, inner_len) == 17) { + state->h[0] ^= le64(((uint_least64_t *)params)[0]); + state->h[1] ^= le64(((uint_least64_t *)params)[1]); + state->h[2] ^= le64(((uint_least64_t)params->node_depth << 0) | + ((uint_least64_t)params->inner_len << 8)); + state->h[4] ^= le64(*(uint_least64_t *)¶ms->salt[0]); + state->h[5] ^= le64(*(uint_least64_t *)¶ms->salt[8]); + state->h[6] ^= le64(*(uint_least64_t *)¶ms->pepper[0]); + state->h[7] ^= le64(*(uint_least64_t *)¶ms->pepper[8]); + } else { + state->h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0; + state->h[0] ^= ((uint_least64_t)params->key_len & 255) << 8; + state->h[0] ^= ((uint_least64_t)params->fanout & 255) << 16; + state->h[0] ^= ((uint_least64_t)params->depth & 255) << 24; + state->h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32; + state->h[1] ^= params->node_offset & UINT_LEAST64_C(0xFFFFffffFFFFffff); + state->h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0; + state->h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8; + state->h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0; + state->h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8; + state->h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16; + state->h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24; + state->h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32; + state->h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40; + state->h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48; + state->h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56; + state->h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0; + state->h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8; + state->h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16; + state->h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24; + state->h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32; + state->h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40; + state->h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48; + state->h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56; + state->h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0; + state->h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8; + state->h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16; + state->h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24; + state->h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32; + state->h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40; + state->h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48; + state->h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56; + state->h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0; + state->h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8; + state->h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16; + state->h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24; + state->h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32; + state->h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40; + state->h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48; + state->h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56; + } if (params->key_len) { state->t[0] = 128; diff --git a/libblake_blake2b_update.c b/libblake_blake2b_update.c index cbd38f2..2f825a3 100644 --- a/libblake_blake2b_update.c +++ b/libblake_blake2b_update.c @@ -8,8 +8,9 @@ libblake_blake2b_update(struct libblake_blake2b_state *state, const void *data_, size_t off = 0; for (; len - off > 128; off += 128) { + /* See libblake_blake2b_force_update.c for optimisations notes */ state->t[0] = (state->t[0] + 128) & UINT_LEAST64_C(0xFFFFffffFFFFffff); - if (state->t[0] < 128) + if (UNLIKELY(state->t[0] < 128)) state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff); libblake_internal_blake2b_compress(state, &data[off]); diff --git a/libblake_blake2s_digest.c b/libblake_blake2s_digest.c index 2ee45ed..d0c1ac8 100644 --- a/libblake_blake2s_digest.c +++ b/libblake_blake2s_digest.c @@ -13,13 +13,16 @@ libblake_blake2s_digest(struct libblake_blake2s_state *state, void *data_, size_ len -= r; state->f[0] = UINT_LEAST32_C(0xFFFFffff); - if (last_node) + if (UNLIKELY(last_node)) state->f[1] = UINT_LEAST32_C(0xFFFFffff); memset(&data[len], 0, 64 - len); + /* This runs so seldom that we are not bothering with trying to optimise + * this since similar code in libblake_blake2s_force_update.c could not + * be optimised */ state->t[0] = (state->t[0] + len) & UINT_LEAST32_C(0xFFFFffff); - if (state->t[0] < len) + if (UNLIKELY(state->t[0] < len)) state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff); libblake_internal_blake2s_compress(state, data); diff --git a/libblake_blake2s_force_update.c b/libblake_blake2s_force_update.c index 925d381..5330ab2 100644 --- a/libblake_blake2s_force_update.c +++ b/libblake_blake2s_force_update.c @@ -8,6 +8,24 @@ libblake_blake2s_force_update(struct libblake_blake2s_state *state, const void * size_t off = 0; for (; len - off >= 64; off += 64) { + /* The following optimisations have been tested: + * + * 1) + * `*(uint64_t *)state->t += 64;` + * result: slower + * + * 2) + * using `__builtin_add_overflow` + * result: no difference + * + * These testes where preformed on amd64 with a compile-time + * assumption that `UINT_LEAST32_C(0xFFFFffff) + 1 == 0`, + * which the compiler accepted and those included the attempted + * optimisations. + * + * UNLIKELY does not seem to make any difference, but it + * does change the output, theoretically of the better. + */ state->t[0] = (state->t[0] + 64) & UINT_LEAST32_C(0xFFFFffff); if (state->t[0] < 64) state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff); diff --git a/libblake_blake2s_init.c b/libblake_blake2s_init.c index c4b126c..fae7e0c 100644 --- a/libblake_blake2s_init.c +++ b/libblake_blake2s_init.c @@ -1,6 +1,21 @@ /* See LICENSE file for copyright and license details. */ #include "common.h" +#if defined(LITTLE_ENDIAN) +# define le32(X) X +#else +static uint_least32_t +le32(uint_least32_t h) +{ + unsigned char r[4]; + r[0] = (unsigned char)((h >> 0) & 255); + r[1] = (unsigned char)((h >> 8) & 255); + r[2] = (unsigned char)((h >> 16) & 255); + r[3] = (unsigned char)((h >> 24) & 255); + return *(uint_least32_t *)r; +} +#endif + void libblake_blake2s_init(struct libblake_blake2s_state *state, const struct libblake_blake2s_params *params, const unsigned char *key) { @@ -18,31 +33,44 @@ libblake_blake2s_init(struct libblake_blake2s_state *state, const struct libblak state->f[0] = 0; state->f[1] = 0; - state->h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0; - state->h[0] ^= ((uint_least32_t)params->key_len & 255) << 8; - state->h[0] ^= ((uint_least32_t)params->fanout & 255) << 16; - state->h[0] ^= ((uint_least32_t)params->depth & 255) << 24; - state->h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff); - state->h[2] ^= (uint_least32_t)((params->node_offset >> 0) & UINT_LEAST64_C(0xFFFFffff)); - state->h[3] ^= (uint_least32_t)((params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) << 0; - state->h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16; - state->h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24; - state->h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0; - state->h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8; - state->h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16; - state->h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24; - state->h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0; - state->h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8; - state->h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16; - state->h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24; - state->h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0; - state->h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8; - state->h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16; - state->h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24; - state->h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0; - state->h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8; - state->h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16; - state->h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24; + if (offsetof(struct libblake_blake2s_params, inner_len) == 17) { + state->h[0] ^= le32(((uint_least32_t *)params)[0]); + state->h[1] ^= le32(((uint_least32_t *)params)[1]); + state->h[2] ^= le32((uint_least32_t)(params->node_offset >> 0)); + state->h[3] ^= le32(((uint_least32_t)(params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) | + ((uint_least32_t)params->node_depth << 16) | + ((uint_least32_t)params->inner_len << 24)); + state->h[4] ^= le32(*(uint_least32_t *)¶ms->salt[0]); + state->h[5] ^= le32(*(uint_least32_t *)¶ms->salt[4]); + state->h[6] ^= le32(*(uint_least32_t *)¶ms->pepper[0]); + state->h[7] ^= le32(*(uint_least32_t *)¶ms->pepper[4]); + } else { + state->h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0; + state->h[0] ^= ((uint_least32_t)params->key_len & 255) << 8; + state->h[0] ^= ((uint_least32_t)params->fanout & 255) << 16; + state->h[0] ^= ((uint_least32_t)params->depth & 255) << 24; + state->h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff); + state->h[2] ^= (uint_least32_t)((params->node_offset >> 0) & UINT_LEAST64_C(0xFFFFffff)); + state->h[3] ^= (uint_least32_t)((params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) << 0; + state->h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16; + state->h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24; + state->h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0; + state->h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8; + state->h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16; + state->h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24; + state->h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0; + state->h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8; + state->h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16; + state->h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24; + state->h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0; + state->h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8; + state->h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16; + state->h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24; + state->h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0; + state->h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8; + state->h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16; + state->h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24; + } if (params->key_len) { state->t[0] = 32; diff --git a/libblake_blake2s_update.c b/libblake_blake2s_update.c index 598260f..b6d18c2 100644 --- a/libblake_blake2s_update.c +++ b/libblake_blake2s_update.c @@ -8,6 +8,7 @@ libblake_blake2s_update(struct libblake_blake2s_state *state, const void *data_, size_t off = 0; for (; len - off > 64; off += 64) { + /* See libblake_blake2s_force_update.c for optimisations notes */ state->t[0] = (state->t[0] + 64) & UINT_LEAST32_C(0xFFFFffff); if (state->t[0] < 64) state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff); diff --git a/libblake_internal_blake2b_output_digest.c b/libblake_internal_blake2b_output_digest.c index bc5b407..8700c8c 100644 --- a/libblake_internal_blake2b_output_digest.c +++ b/libblake_internal_blake2b_output_digest.c @@ -4,8 +4,18 @@ static void encode_uint64_le(unsigned char *out, uint_least64_t value, size_t bytes) { + /* Adding LIKELY to indicate that the default case is the + * expected does not affact the output */ switch (bytes) { default: + /* + * The following optimisation have been tested: + * + * 1) Changing the default case, on amd64, to + * *(uint64_t *)out = (uint64_t)value; + * break; + * result: halved performance + */ out[7] = (unsigned char)((value >> 56) & 255); /* fall through */ case 7: @@ -39,6 +49,28 @@ libblake_internal_blake2b_output_digest(struct libblake_blake2b_state *state, si { size_t i, j; - for (i = 0, j = 0; i < output_len; i += 8, j += 1) - encode_uint64_le(&output[i], state->h[j], output_len - i); +#ifdef LITTLE_ENDIAN + if ((uint_least64_t)(UINT_LEAST64_C(0xFFFFffffFFFFffff) + 1) == 0) { + /* 37.5x performance improvement; + * even though the compiler is smart enough to optimise + * `encode_uint64_le(&output[i], state->h[j], 8);` to a + * movq (on amd64), it is note smart enough to optimise + * the rest */ + memcpy(output, state->h, output_len); + return; + } +#endif + + /* Estimated to have similar performance benefit as above + * on big-endian machines */ + for (i = 0, j = 0; i + 8 < output_len; i += 8, j += 1) + encode_uint64_le(&output[i], state->h[j], 8); + encode_uint64_le(&output[i], state->h[j], output_len - i); + + /* + * Unoptimised code: + * + * for (i = 0, j = 0; i < output_len; i += 8, j += 1) + * encode_uint64_le(&output[i], state->h[j], output_len - i); + */ } diff --git a/libblake_internal_blake2s_output_digest.c b/libblake_internal_blake2s_output_digest.c index d7b891c..6b4a5da 100644 --- a/libblake_internal_blake2s_output_digest.c +++ b/libblake_internal_blake2s_output_digest.c @@ -4,8 +4,18 @@ static void encode_uint32_le(unsigned char *out, uint_least32_t value, size_t bytes) { + /* Adding LIKELY to indicate that the default case is the + * expected does not affact the output */ switch (bytes) { default: + /* + * The following optimisation have been tested: + * + * 1) Changing the default case, on amd64, to + * *(uint32_t *)out = (uint32_t)value; + * break; + * result: halved performance + */ out[3] = (unsigned char)((value >> 24) & 255); /* fall through */ case 3: @@ -27,6 +37,31 @@ libblake_internal_blake2s_output_digest(struct libblake_blake2s_state *state, si { size_t i, j; - for (i = 0, j = 0; i < output_len; i += 4, j += 1) - encode_uint32_le(&output[i], state->h[j], output_len - i); +#ifdef LITTLE_ENDIAN + if ((uint_least32_t)(UINT_LEAST32_C(0xFFFFffff) + 1) == 0) { + /* No noticeable performance benefit on amd64, however + * it signficantly reduces the translation size and + * a 37.5x performance benefit was seen on the 64-bit + * version on amd64; + * even though the compiler is smart enough to optimise + * `encode_uint32_le(&output[i], state->h[j], 4);` to a + * movq (on amd64), it is note smart enough to optimise + * the rest */ + memcpy(output, state->h, output_len); + return; + } +#endif + + /* Estimated to have similar performance benefit as above + * on big-endian machines */ + for (i = 0, j = 0; i + 4 < output_len; i += 4, j += 1) + encode_uint32_le(&output[i], state->h[j], 4); + encode_uint32_le(&output[i], state->h[j], output_len - i); + + /* + * Unoptimised code: + * + * for (i = 0, j = 0; i < output_len; i += 4, j += 1) + * encode_uint32_le(&output[i], state->h[j], output_len - i); + */ } diff --git a/libblake_internal_blake2xb_init0.c b/libblake_internal_blake2xb_init0.c index d6063dc..c045cec 100644 --- a/libblake_internal_blake2xb_init0.c +++ b/libblake_internal_blake2xb_init0.c @@ -1,6 +1,25 @@ /* See LICENSE file for copyright and license details. */ #include "common.h" +#if defined(LITTLE_ENDIAN) +# define le64(X) X +#else +static uint_least64_t +le64(uint_least64_t h) +{ + unsigned char r[8]; + r[0] = (unsigned char)((h >> 0) & 255); + r[1] = (unsigned char)((h >> 8) & 255); + r[2] = (unsigned char)((h >> 16) & 255); + r[3] = (unsigned char)((h >> 24) & 255); + r[4] = (unsigned char)((h >> 32) & 255); + r[5] = (unsigned char)((h >> 40) & 255); + r[6] = (unsigned char)((h >> 48) & 255); + r[7] = (unsigned char)((h >> 56) & 255); + return *(uint_least64_t *)r; +} +#endif + void libblake_internal_blake2xb_init0(struct libblake_blake2xb_state *state, const struct libblake_blake2xb_params *params) { @@ -18,45 +37,56 @@ libblake_internal_blake2xb_init0(struct libblake_blake2xb_state *state, const st state->b2b.f[0] = 0; state->b2b.f[1] = 0; - state->b2b.h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0; - state->b2b.h[0] ^= ((uint_least64_t)params->key_len & 255) << 8; - state->b2b.h[0] ^= ((uint_least64_t)params->fanout & 255) << 16; - state->b2b.h[0] ^= ((uint_least64_t)params->depth & 255) << 24; - state->b2b.h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32; - state->b2b.h[1] ^= (uint_least64_t)(params->node_offset & UINT_LEAST32_C(0xFFFFffff)) << 0; - state->b2b.h[1] ^= (uint_least64_t)(params->xof_len & UINT_LEAST32_C(0xFFFFffff)) << 32; - state->b2b.h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0; - state->b2b.h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8; - state->b2b.h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0; - state->b2b.h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8; - state->b2b.h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16; - state->b2b.h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24; - state->b2b.h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32; - state->b2b.h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40; - state->b2b.h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48; - state->b2b.h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56; - state->b2b.h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0; - state->b2b.h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8; - state->b2b.h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16; - state->b2b.h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24; - state->b2b.h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32; - state->b2b.h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40; - state->b2b.h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48; - state->b2b.h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56; - state->b2b.h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0; - state->b2b.h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8; - state->b2b.h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16; - state->b2b.h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24; - state->b2b.h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32; - state->b2b.h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40; - state->b2b.h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48; - state->b2b.h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56; - state->b2b.h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0; - state->b2b.h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8; - state->b2b.h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16; - state->b2b.h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24; - state->b2b.h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32; - state->b2b.h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40; - state->b2b.h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48; - state->b2b.h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56; + if (offsetof(struct libblake_blake2xb_params, inner_len) == 17) { + state->b2b.h[0] ^= le64(((uint_least64_t *)params)[0]); + state->b2b.h[1] ^= le64(((uint_least64_t *)params)[1]); + state->b2b.h[2] ^= le64(((uint_least64_t)params->node_depth << 0) | + ((uint_least64_t)params->inner_len << 8)); + state->b2b.h[4] ^= le64(*(uint_least64_t *)¶ms->salt[0]); + state->b2b.h[5] ^= le64(*(uint_least64_t *)¶ms->salt[8]); + state->b2b.h[6] ^= le64(*(uint_least64_t *)¶ms->pepper[0]); + state->b2b.h[7] ^= le64(*(uint_least64_t *)¶ms->pepper[8]); + } else { + state->b2b.h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0; + state->b2b.h[0] ^= ((uint_least64_t)params->key_len & 255) << 8; + state->b2b.h[0] ^= ((uint_least64_t)params->fanout & 255) << 16; + state->b2b.h[0] ^= ((uint_least64_t)params->depth & 255) << 24; + state->b2b.h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32; + state->b2b.h[1] ^= (uint_least64_t)(params->node_offset & UINT_LEAST32_C(0xFFFFffff)) << 0; + state->b2b.h[1] ^= (uint_least64_t)(params->xof_len & UINT_LEAST32_C(0xFFFFffff)) << 32; + state->b2b.h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0; + state->b2b.h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8; + state->b2b.h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0; + state->b2b.h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8; + state->b2b.h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16; + state->b2b.h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24; + state->b2b.h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32; + state->b2b.h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40; + state->b2b.h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48; + state->b2b.h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56; + state->b2b.h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0; + state->b2b.h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8; + state->b2b.h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16; + state->b2b.h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24; + state->b2b.h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32; + state->b2b.h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40; + state->b2b.h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48; + state->b2b.h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56; + state->b2b.h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0; + state->b2b.h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8; + state->b2b.h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16; + state->b2b.h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24; + state->b2b.h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32; + state->b2b.h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40; + state->b2b.h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48; + state->b2b.h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56; + state->b2b.h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0; + state->b2b.h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8; + state->b2b.h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16; + state->b2b.h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24; + state->b2b.h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32; + state->b2b.h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40; + state->b2b.h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48; + state->b2b.h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56; + } } diff --git a/libblake_internal_blake2xs_init0.c b/libblake_internal_blake2xs_init0.c index 92cb7bf..f20bb9c 100644 --- a/libblake_internal_blake2xs_init0.c +++ b/libblake_internal_blake2xs_init0.c @@ -1,6 +1,21 @@ /* See LICENSE file for copyright and license details. */ #include "common.h" +#if defined(LITTLE_ENDIAN) +# define le32(X) X +#else +static uint_least32_t +le32(uint_least32_t h) +{ + unsigned char r[4]; + r[0] = (unsigned char)((h >> 0) & 255); + r[1] = (unsigned char)((h >> 8) & 255); + r[2] = (unsigned char)((h >> 16) & 255); + r[3] = (unsigned char)((h >> 24) & 255); + return *(uint_least32_t *)r; +} +#endif + void libblake_internal_blake2xs_init0(struct libblake_blake2xs_state *state, const struct libblake_blake2xs_params *params) { @@ -18,29 +33,40 @@ libblake_internal_blake2xs_init0(struct libblake_blake2xs_state *state, const st state->b2s.f[0] = 0; state->b2s.f[1] = 0; - state->b2s.h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0; - state->b2s.h[0] ^= ((uint_least32_t)params->key_len & 255) << 8; - state->b2s.h[0] ^= ((uint_least32_t)params->fanout & 255) << 16; - state->b2s.h[0] ^= ((uint_least32_t)params->depth & 255) << 24; - state->b2s.h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff); - state->b2s.h[2] ^= params->node_offset & UINT_LEAST32_C(0xFFFFffff); - state->b2s.h[3] ^= (uint_least32_t)(params->xof_len & UINT_LEAST16_C(0xFFFF)) << 0; - state->b2s.h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16; - state->b2s.h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24; - state->b2s.h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0; - state->b2s.h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8; - state->b2s.h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16; - state->b2s.h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24; - state->b2s.h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0; - state->b2s.h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8; - state->b2s.h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16; - state->b2s.h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24; - state->b2s.h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0; - state->b2s.h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8; - state->b2s.h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16; - state->b2s.h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24; - state->b2s.h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0; - state->b2s.h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8; - state->b2s.h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16; - state->b2s.h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24; + if (sizeof(*params) == sizeof(state->b2s.h)) { + state->b2s.h[0] ^= le32(((uint_least32_t *)params)[0]); + state->b2s.h[1] ^= le32(((uint_least32_t *)params)[1]); + state->b2s.h[2] ^= le32(((uint_least32_t *)params)[2]); + state->b2s.h[3] ^= le32(((uint_least32_t *)params)[3]); + state->b2s.h[4] ^= le32(((uint_least32_t *)params)[4]); + state->b2s.h[5] ^= le32(((uint_least32_t *)params)[5]); + state->b2s.h[6] ^= le32(((uint_least32_t *)params)[6]); + state->b2s.h[7] ^= le32(((uint_least32_t *)params)[7]); + } else { + state->b2s.h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0; + state->b2s.h[0] ^= ((uint_least32_t)params->key_len & 255) << 8; + state->b2s.h[0] ^= ((uint_least32_t)params->fanout & 255) << 16; + state->b2s.h[0] ^= ((uint_least32_t)params->depth & 255) << 24; + state->b2s.h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff); + state->b2s.h[2] ^= params->node_offset & UINT_LEAST32_C(0xFFFFffff); + state->b2s.h[3] ^= (uint_least32_t)(params->xof_len & UINT_LEAST16_C(0xFFFF)) << 0; + state->b2s.h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16; + state->b2s.h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24; + state->b2s.h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0; + state->b2s.h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8; + state->b2s.h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16; + state->b2s.h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24; + state->b2s.h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0; + state->b2s.h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8; + state->b2s.h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16; + state->b2s.h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24; + state->b2s.h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0; + state->b2s.h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8; + state->b2s.h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16; + state->b2s.h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24; + state->b2s.h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0; + state->b2s.h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8; + state->b2s.h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16; + state->b2s.h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24; + } } -- cgit v1.2.3-70-g09d2