diff options
| author | Mattias Andrée <maandree@kth.se> | 2022-01-19 20:28:55 +0100 | 
|---|---|---|
| committer | Mattias Andrée <maandree@kth.se> | 2022-01-19 20:28:55 +0100 | 
| commit | 5d77a0178349ecac6536e0374cf689500efa22bc (patch) | |
| tree | f6fcb38cd39e8f4240537233a08fdbb5c0284798 | |
| parent | Improve portability (diff) | |
| download | libblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.gz libblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.bz2 libblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.xz | |
Optimisation for amd64
Increased major number as the ABI was broken
by insertion of padding into the BLAKE2
parameter structures (except for BLAKE2Xs)
Signed-off-by: Mattias Andrée <maandree@kth.se>
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | common.h | 31 | ||||
| -rw-r--r-- | libblake.h | 3 | ||||
| -rw-r--r-- | libblake_blake2b_digest.c | 7 | ||||
| -rw-r--r-- | libblake_blake2b_force_update.c | 24 | ||||
| -rw-r--r-- | libblake_blake2b_init.c | 110 | ||||
| -rw-r--r-- | libblake_blake2b_update.c | 3 | ||||
| -rw-r--r-- | libblake_blake2s_digest.c | 7 | ||||
| -rw-r--r-- | libblake_blake2s_force_update.c | 18 | ||||
| -rw-r--r-- | libblake_blake2s_init.c | 78 | ||||
| -rw-r--r-- | libblake_blake2s_update.c | 1 | ||||
| -rw-r--r-- | libblake_internal_blake2b_output_digest.c | 36 | ||||
| -rw-r--r-- | libblake_internal_blake2s_output_digest.c | 39 | ||||
| -rw-r--r-- | libblake_internal_blake2xb_init0.c | 112 | ||||
| -rw-r--r-- | libblake_internal_blake2xs_init0.c | 76 | 
16 files changed, 406 insertions, 143 deletions
| @@ -1,5 +1,6 @@  *\#*  *~ +a.out  *.o  *.a  *.lo @@ -12,4 +13,5 @@  *.gcov  *.gcno  *.gcda +*.s  /test @@ -10,7 +10,7 @@ OS = linux  include mk/$(OS).mk -LIB_MAJOR = 1 +LIB_MAJOR = 2  LIB_MINOR = 0  LIB_VERSION = $(LIB_MAJOR).$(LIB_MINOR)  LIB_NAME = blake @@ -24,9 +24,38 @@  #endif  #if defined(__GNUC__) -# define HIDDEN __attribute__((__visibility__("hidden"))) +# define HIDDEN __attribute__((visibility("hidden"))) +# define LIKELY(X) __builtin_expect(!!(X), 1) +# define UNLIKELY(X) __builtin_expect(!!(X), 0) +# define ASSUMING_CONSTANT(X) (__builtin_constant_p(X) && (X))  #else  # define HIDDEN +# define LIKELY(X) X +# define UNLIKELY(X) X +# define ASSUMING_CONSTANT(X) 0 +# if defined(__has_builtin) +#  if __has_builtin(__builtin_expect) +#   undef LIKELY +#   undef UNLIKELY +#   define LIKELY(X) __builtin_expect(!!(X), 1) +#   define UNLIKELY(X) __builtin_expect(!!(X), 0) +#  endif +#  if __has_builtin(__builtin_constant_p) +#   undef ASSUMING_CONSTANT +#   define ASSUMING_CONSTANT(X) (__builtin_constant_p(X) && (X)) +#  endif +# endif +#endif +#if defined(__has_builtin) +# define HAS_BUILTIN(X) __has_builtin(X) +#else +# define HAS_BUILTIN(X) 0 +#endif + +#if defined(__x86_64__) || defined(__i386__) +# define LITTLE_ENDIAN +#else +# error Endian is unknown  #endif  #define A 10 @@ -86,6 +86,7 @@ struct libblake_blake2s_params {  	uint_least64_t node_offset; /* (48-bits) normally 0 */  	uint_least8_t node_depth; /* normally 0 */  	uint_least8_t inner_len; /* normally 0 */ +	uint_least8_t _padding[2]; /* to keep .salt and .pepper aligned as uint_least32_t */  	uint_least8_t salt[8];  	uint_least8_t pepper[8];  }; @@ -99,6 +100,7 @@ struct libblake_blake2b_params {  	uint_least64_t node_offset; /* normally 0 */  	uint_least8_t node_depth; /* normally 0 */  	uint_least8_t inner_len; /* normally 0 */ +	uint_least8_t _padding[6]; /* to keep .salt and .pepper aligned as uint_least64_t */  	uint_least8_t salt[16];  	uint_least8_t pepper[16];  }; @@ -127,6 +129,7 @@ struct libblake_blake2xb_params {  	uint_least32_t xof_len; /* max if not known in advance */  	uint_least8_t node_depth; /* normally 0 */  	uint_least8_t inner_len; /* normally 0 */ +	uint_least8_t _padding[2]; /* to keep .salt and .pepper aligned as uint_least32_t */  	uint_least8_t salt[16];  	uint_least8_t pepper[16];  }; diff --git a/libblake_blake2b_digest.c b/libblake_blake2b_digest.c index 08b2d75..4338387 100644 --- a/libblake_blake2b_digest.c +++ b/libblake_blake2b_digest.c @@ -13,13 +13,16 @@ libblake_blake2b_digest(struct libblake_blake2b_state *state, void *data_, size_  	len -= r;  	state->f[0] = UINT_LEAST64_C(0xFFFFffffFFFFffff); -	if (last_node) +	if (UNLIKELY(last_node))  		state->f[1] = UINT_LEAST64_C(0xFFFFffffFFFFffff);  	memset(&data[len], 0, 128 - len); +	/* This runs so seldom that we are not bothering with trying to optimise +	 * this since similar code in libblake_blake2b_force_update.c could not +	 * be optimised */  	state->t[0] = (state->t[0] + len) & UINT_LEAST64_C(0xFFFFffffFFFFffff); -	if (state->t[0] < len) +	if (UNLIKELY(state->t[0] < len))  		state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff);  	libblake_internal_blake2b_compress(state, data); diff --git a/libblake_blake2b_force_update.c b/libblake_blake2b_force_update.c index 60b8fab..2446e16 100644 --- a/libblake_blake2b_force_update.c +++ b/libblake_blake2b_force_update.c @@ -8,8 +8,30 @@ libblake_blake2b_force_update(struct libblake_blake2b_state *state, const void *  	size_t off = 0;  	for (; len - off >= 128; off += 128) { +		/* The following optimisations have been tested: +		 *  +		 * 1) +		 *     `*(__uint128_t *)state->t += 128;` +		 *     result: slower +		 *  +		 * 2) +		 *     addq, adcq using `__asm__ __volatile__` +		 *     result: slower (as 1) +		 *  +		 * 3) +		 *     using `__builtin_add_overflow` +		 *     result: no difference +		 *  +		 * These testes where preformed on amd64 with a compile-time +		 * assumption that `UINT_LEAST64_C(0xFFFFffffFFFFffff) + 1 == 0`, +		 * which the compiler accepted and those included the attempted +		 * optimisations. +		 *  +		 * UNLIKELY does not seem to make any difference, but it +		 * does change the output, theoretically of the better. +		 */  		state->t[0] = (state->t[0] + 128) & UINT_LEAST64_C(0xFFFFffffFFFFffff); -		if (state->t[0] < 128) +		if (UNLIKELY(state->t[0] < 128))  			state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff);  		libblake_internal_blake2b_compress(state, &data[off]); diff --git a/libblake_blake2b_init.c b/libblake_blake2b_init.c index b520a87..f07d1ef 100644 --- a/libblake_blake2b_init.c +++ b/libblake_blake2b_init.c @@ -1,6 +1,25 @@  /* See LICENSE file for copyright and license details. */  #include "common.h" +#if defined(LITTLE_ENDIAN) +# define le64(X) X +#else +static uint_least64_t +le64(uint_least64_t h) +{ +	unsigned char r[8]; +	r[0] = (unsigned char)((h >>  0) & 255); +	r[1] = (unsigned char)((h >>  8) & 255); +	r[2] = (unsigned char)((h >> 16) & 255); +	r[3] = (unsigned char)((h >> 24) & 255); +	r[4] = (unsigned char)((h >> 32) & 255); +	r[5] = (unsigned char)((h >> 40) & 255); +	r[6] = (unsigned char)((h >> 48) & 255); +	r[7] = (unsigned char)((h >> 56) & 255); +	return *(uint_least64_t *)r; +} +#endif +  void  libblake_blake2b_init(struct libblake_blake2b_state *state, const struct libblake_blake2b_params *params, const unsigned char *key)  { @@ -18,46 +37,57 @@ libblake_blake2b_init(struct libblake_blake2b_state *state, const struct libblak  	state->f[0] = 0;  	state->f[1] = 0; -	state->h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0; -	state->h[0] ^= ((uint_least64_t)params->key_len & 255) << 8; -	state->h[0] ^= ((uint_least64_t)params->fanout & 255) << 16; -	state->h[0] ^= ((uint_least64_t)params->depth & 255) << 24; -	state->h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32; -	state->h[1] ^= params->node_offset & UINT_LEAST64_C(0xFFFFffffFFFFffff); -	state->h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0; -	state->h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8; -	state->h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0; -	state->h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8; -	state->h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16; -	state->h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24; -	state->h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32; -	state->h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40; -	state->h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48; -	state->h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56; -	state->h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0; -	state->h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8; -	state->h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16; -	state->h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24; -	state->h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32; -	state->h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40; -	state->h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48; -	state->h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56; -	state->h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0; -	state->h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8; -	state->h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16; -	state->h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24; -	state->h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32; -	state->h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40; -	state->h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48; -	state->h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56; -	state->h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0; -	state->h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8; -	state->h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16; -	state->h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24; -	state->h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32; -	state->h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40; -	state->h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48; -	state->h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56; +	if (offsetof(struct libblake_blake2b_params, inner_len) == 17) { +		state->h[0] ^= le64(((uint_least64_t *)params)[0]); +		state->h[1] ^= le64(((uint_least64_t *)params)[1]); +		state->h[2] ^= le64(((uint_least64_t)params->node_depth << 0) | +		                    ((uint_least64_t)params->inner_len << 8)); +		state->h[4] ^= le64(*(uint_least64_t *)¶ms->salt[0]); +		state->h[5] ^= le64(*(uint_least64_t *)¶ms->salt[8]); +		state->h[6] ^= le64(*(uint_least64_t *)¶ms->pepper[0]); +		state->h[7] ^= le64(*(uint_least64_t *)¶ms->pepper[8]); +	} else { +		state->h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0; +		state->h[0] ^= ((uint_least64_t)params->key_len & 255) << 8; +		state->h[0] ^= ((uint_least64_t)params->fanout & 255) << 16; +		state->h[0] ^= ((uint_least64_t)params->depth & 255) << 24; +		state->h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32; +		state->h[1] ^= params->node_offset & UINT_LEAST64_C(0xFFFFffffFFFFffff); +		state->h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0; +		state->h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8; +		state->h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0; +		state->h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8; +		state->h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16; +		state->h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24; +		state->h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32; +		state->h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40; +		state->h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48; +		state->h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56; +		state->h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0; +		state->h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8; +		state->h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16; +		state->h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24; +		state->h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32; +		state->h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40; +		state->h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48; +		state->h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56; +		state->h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0; +		state->h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8; +		state->h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16; +		state->h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24; +		state->h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32; +		state->h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40; +		state->h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48; +		state->h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56; +		state->h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0; +		state->h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8; +		state->h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16; +		state->h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24; +		state->h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32; +		state->h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40; +		state->h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48; +		state->h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56; +	}  	if (params->key_len) {  		state->t[0] = 128; diff --git a/libblake_blake2b_update.c b/libblake_blake2b_update.c index cbd38f2..2f825a3 100644 --- a/libblake_blake2b_update.c +++ b/libblake_blake2b_update.c @@ -8,8 +8,9 @@ libblake_blake2b_update(struct libblake_blake2b_state *state, const void *data_,  	size_t off = 0;  	for (; len - off > 128; off += 128) { +		/* See libblake_blake2b_force_update.c for optimisations notes */  		state->t[0] = (state->t[0] + 128) & UINT_LEAST64_C(0xFFFFffffFFFFffff); -		if (state->t[0] < 128) +		if (UNLIKELY(state->t[0] < 128))  			state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff);  		libblake_internal_blake2b_compress(state, &data[off]); diff --git a/libblake_blake2s_digest.c b/libblake_blake2s_digest.c index 2ee45ed..d0c1ac8 100644 --- a/libblake_blake2s_digest.c +++ b/libblake_blake2s_digest.c @@ -13,13 +13,16 @@ libblake_blake2s_digest(struct libblake_blake2s_state *state, void *data_, size_  	len -= r;  	state->f[0] = UINT_LEAST32_C(0xFFFFffff); -	if (last_node) +	if (UNLIKELY(last_node))  		state->f[1] = UINT_LEAST32_C(0xFFFFffff);  	memset(&data[len], 0, 64 - len); +	/* This runs so seldom that we are not bothering with trying to optimise +	 * this since similar code in libblake_blake2s_force_update.c could not +	 * be optimised */  	state->t[0] = (state->t[0] + len) & UINT_LEAST32_C(0xFFFFffff); -	if (state->t[0] < len) +	if (UNLIKELY(state->t[0] < len))  		state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff);  	libblake_internal_blake2s_compress(state, data); diff --git a/libblake_blake2s_force_update.c b/libblake_blake2s_force_update.c index 925d381..5330ab2 100644 --- a/libblake_blake2s_force_update.c +++ b/libblake_blake2s_force_update.c @@ -8,6 +8,24 @@ libblake_blake2s_force_update(struct libblake_blake2s_state *state, const void *  	size_t off = 0;  	for (; len - off >= 64; off += 64) { +		/* The following optimisations have been tested: +		 *  +		 * 1) +		 *     `*(uint64_t *)state->t += 64;` +		 *     result: slower +		 *  +		 * 2) +		 *     using `__builtin_add_overflow` +		 *     result: no difference +		 *  +		 * These testes where preformed on amd64 with a compile-time +		 * assumption that `UINT_LEAST32_C(0xFFFFffff) + 1 == 0`, +		 * which the compiler accepted and those included the attempted +		 * optimisations. +		 *  +		 * UNLIKELY does not seem to make any difference, but it +		 * does change the output, theoretically of the better. +		 */  		state->t[0] = (state->t[0] + 64) & UINT_LEAST32_C(0xFFFFffff);  		if (state->t[0] < 64)  			state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff); diff --git a/libblake_blake2s_init.c b/libblake_blake2s_init.c index c4b126c..fae7e0c 100644 --- a/libblake_blake2s_init.c +++ b/libblake_blake2s_init.c @@ -1,6 +1,21 @@  /* See LICENSE file for copyright and license details. */  #include "common.h" +#if defined(LITTLE_ENDIAN) +# define le32(X) X +#else +static uint_least32_t +le32(uint_least32_t h) +{ +	unsigned char r[4]; +	r[0] = (unsigned char)((h >>  0) & 255); +	r[1] = (unsigned char)((h >>  8) & 255); +	r[2] = (unsigned char)((h >> 16) & 255); +	r[3] = (unsigned char)((h >> 24) & 255); +	return *(uint_least32_t *)r; +} +#endif +  void  libblake_blake2s_init(struct libblake_blake2s_state *state, const struct libblake_blake2s_params *params, const unsigned char *key)  { @@ -18,31 +33,44 @@ libblake_blake2s_init(struct libblake_blake2s_state *state, const struct libblak  	state->f[0] = 0;  	state->f[1] = 0; -	state->h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0; -	state->h[0] ^= ((uint_least32_t)params->key_len & 255) << 8; -	state->h[0] ^= ((uint_least32_t)params->fanout & 255) << 16; -	state->h[0] ^= ((uint_least32_t)params->depth & 255) << 24; -	state->h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff); -	state->h[2] ^= (uint_least32_t)((params->node_offset >> 0) & UINT_LEAST64_C(0xFFFFffff)); -	state->h[3] ^= (uint_least32_t)((params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) << 0; -	state->h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16; -	state->h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24; -	state->h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0; -	state->h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8; -	state->h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16; -	state->h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24; -	state->h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0; -	state->h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8; -	state->h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16; -	state->h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24; -	state->h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0; -	state->h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8; -	state->h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16; -	state->h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24; -	state->h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0; -	state->h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8; -	state->h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16; -	state->h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24; +	if (offsetof(struct libblake_blake2s_params, inner_len) == 17) { +		state->h[0] ^= le32(((uint_least32_t *)params)[0]); +		state->h[1] ^= le32(((uint_least32_t *)params)[1]); +		state->h[2] ^= le32((uint_least32_t)(params->node_offset >> 0)); +		state->h[3] ^= le32(((uint_least32_t)(params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) | +		                    ((uint_least32_t)params->node_depth << 16) | +		                    ((uint_least32_t)params->inner_len << 24)); +		state->h[4] ^= le32(*(uint_least32_t *)¶ms->salt[0]); +		state->h[5] ^= le32(*(uint_least32_t *)¶ms->salt[4]); +		state->h[6] ^= le32(*(uint_least32_t *)¶ms->pepper[0]); +		state->h[7] ^= le32(*(uint_least32_t *)¶ms->pepper[4]); +	} else { +		state->h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0; +		state->h[0] ^= ((uint_least32_t)params->key_len & 255) << 8; +		state->h[0] ^= ((uint_least32_t)params->fanout & 255) << 16; +		state->h[0] ^= ((uint_least32_t)params->depth & 255) << 24; +		state->h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff); +		state->h[2] ^= (uint_least32_t)((params->node_offset >> 0) & UINT_LEAST64_C(0xFFFFffff)); +		state->h[3] ^= (uint_least32_t)((params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) << 0; +		state->h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16; +		state->h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24; +		state->h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0; +		state->h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8; +		state->h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16; +		state->h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24; +		state->h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0; +		state->h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8; +		state->h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16; +		state->h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24; +		state->h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0; +		state->h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8; +		state->h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16; +		state->h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24; +		state->h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0; +		state->h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8; +		state->h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16; +		state->h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24; +	}  	if (params->key_len) {  		state->t[0] = 32; diff --git a/libblake_blake2s_update.c b/libblake_blake2s_update.c index 598260f..b6d18c2 100644 --- a/libblake_blake2s_update.c +++ b/libblake_blake2s_update.c @@ -8,6 +8,7 @@ libblake_blake2s_update(struct libblake_blake2s_state *state, const void *data_,  	size_t off = 0;  	for (; len - off > 64; off += 64) { +		/* See libblake_blake2s_force_update.c for optimisations notes */  		state->t[0] = (state->t[0] + 64) & UINT_LEAST32_C(0xFFFFffff);  		if (state->t[0] < 64)  			state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff); diff --git a/libblake_internal_blake2b_output_digest.c b/libblake_internal_blake2b_output_digest.c index bc5b407..8700c8c 100644 --- a/libblake_internal_blake2b_output_digest.c +++ b/libblake_internal_blake2b_output_digest.c @@ -4,8 +4,18 @@  static void  encode_uint64_le(unsigned char *out, uint_least64_t value, size_t bytes)  { +	/* Adding LIKELY to indicate that the default case is the +	 * expected does not affact the output */  	switch (bytes) {  	default: +		/* +		 * The following optimisation have been tested: +		 *  +		 * 1) Changing the default case, on amd64, to +		 *    *(uint64_t *)out = (uint64_t)value; +		 *    break; +		 *    result: halved performance +		 */  		out[7] = (unsigned char)((value >> 56) & 255);  		/* fall through */  	case 7: @@ -39,6 +49,28 @@ libblake_internal_blake2b_output_digest(struct libblake_blake2b_state *state, si  {  	size_t i, j; -	for (i = 0, j = 0; i < output_len; i += 8, j += 1) -		encode_uint64_le(&output[i], state->h[j], output_len - i); +#ifdef LITTLE_ENDIAN +	if ((uint_least64_t)(UINT_LEAST64_C(0xFFFFffffFFFFffff) + 1) == 0) { +		/* 37.5x performance improvement; +		 * even though the compiler is smart enough to optimise +		 * `encode_uint64_le(&output[i], state->h[j], 8);` to a +		 * movq (on amd64), it is note smart enough to optimise +		 * the rest */ +		memcpy(output, state->h, output_len); +		return; +	} +#endif + +	/* Estimated to have similar performance benefit as above +	 * on big-endian machines */ +	for (i = 0, j = 0; i + 8 < output_len; i += 8, j += 1) +		encode_uint64_le(&output[i], state->h[j], 8); +	encode_uint64_le(&output[i], state->h[j], output_len - i); + +	/* +	 * Unoptimised code: +	 *  +	 * for (i = 0, j = 0; i < output_len; i += 8, j += 1) +	 *          encode_uint64_le(&output[i], state->h[j], output_len - i); +	 */  } diff --git a/libblake_internal_blake2s_output_digest.c b/libblake_internal_blake2s_output_digest.c index d7b891c..6b4a5da 100644 --- a/libblake_internal_blake2s_output_digest.c +++ b/libblake_internal_blake2s_output_digest.c @@ -4,8 +4,18 @@  static void  encode_uint32_le(unsigned char *out, uint_least32_t value, size_t bytes)  { +	/* Adding LIKELY to indicate that the default case is the +	 * expected does not affact the output */  	switch (bytes) {  	default: +		/* +		 * The following optimisation have been tested: +		 *  +		 * 1) Changing the default case, on amd64, to +		 *    *(uint32_t *)out = (uint32_t)value; +		 *    break; +		 *    result: halved performance +		 */  		out[3] = (unsigned char)((value >> 24) & 255);  		/* fall through */  	case 3: @@ -27,6 +37,31 @@ libblake_internal_blake2s_output_digest(struct libblake_blake2s_state *state, si  {  	size_t i, j; -	for (i = 0, j = 0; i < output_len; i += 4, j += 1) -		encode_uint32_le(&output[i], state->h[j], output_len - i); +#ifdef LITTLE_ENDIAN +	if ((uint_least32_t)(UINT_LEAST32_C(0xFFFFffff) + 1) == 0) { +		/* No noticeable performance benefit on amd64, however +		 * it signficantly reduces the translation size and +		 * a 37.5x performance benefit was seen on the 64-bit +		 * version on amd64; +		 * even though the compiler is smart enough to optimise +		 * `encode_uint32_le(&output[i], state->h[j], 4);` to a +		 * movq (on amd64), it is note smart enough to optimise +		 * the rest */ +		memcpy(output, state->h, output_len); +		return; +	} +#endif + +	/* Estimated to have similar performance benefit as above +	 * on big-endian machines */ +	for (i = 0, j = 0; i + 4 < output_len; i += 4, j += 1) +		encode_uint32_le(&output[i], state->h[j], 4); +	encode_uint32_le(&output[i], state->h[j], output_len - i); + +	/* +	 * Unoptimised code: +	 *  +	 * for (i = 0, j = 0; i < output_len; i += 4, j += 1) +	 *         encode_uint32_le(&output[i], state->h[j], output_len - i); +	 */  } diff --git a/libblake_internal_blake2xb_init0.c b/libblake_internal_blake2xb_init0.c index d6063dc..c045cec 100644 --- a/libblake_internal_blake2xb_init0.c +++ b/libblake_internal_blake2xb_init0.c @@ -1,6 +1,25 @@  /* See LICENSE file for copyright and license details. */  #include "common.h" +#if defined(LITTLE_ENDIAN) +# define le64(X) X +#else +static uint_least64_t +le64(uint_least64_t h) +{ +	unsigned char r[8]; +	r[0] = (unsigned char)((h >>  0) & 255); +	r[1] = (unsigned char)((h >>  8) & 255); +	r[2] = (unsigned char)((h >> 16) & 255); +	r[3] = (unsigned char)((h >> 24) & 255); +	r[4] = (unsigned char)((h >> 32) & 255); +	r[5] = (unsigned char)((h >> 40) & 255); +	r[6] = (unsigned char)((h >> 48) & 255); +	r[7] = (unsigned char)((h >> 56) & 255); +	return *(uint_least64_t *)r; +} +#endif +  void  libblake_internal_blake2xb_init0(struct libblake_blake2xb_state *state, const struct libblake_blake2xb_params *params)  { @@ -18,45 +37,56 @@ libblake_internal_blake2xb_init0(struct libblake_blake2xb_state *state, const st  	state->b2b.f[0] = 0;  	state->b2b.f[1] = 0; -	state->b2b.h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0; -	state->b2b.h[0] ^= ((uint_least64_t)params->key_len & 255) << 8; -	state->b2b.h[0] ^= ((uint_least64_t)params->fanout & 255) << 16; -	state->b2b.h[0] ^= ((uint_least64_t)params->depth & 255) << 24; -	state->b2b.h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32; -	state->b2b.h[1] ^= (uint_least64_t)(params->node_offset & UINT_LEAST32_C(0xFFFFffff)) << 0; -	state->b2b.h[1] ^= (uint_least64_t)(params->xof_len & UINT_LEAST32_C(0xFFFFffff)) << 32; -	state->b2b.h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0; -	state->b2b.h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8; -	state->b2b.h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0; -	state->b2b.h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8; -	state->b2b.h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16; -	state->b2b.h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24; -	state->b2b.h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32; -	state->b2b.h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40; -	state->b2b.h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48; -	state->b2b.h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56; -	state->b2b.h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0; -	state->b2b.h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8; -	state->b2b.h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16; -	state->b2b.h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24; -	state->b2b.h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32; -	state->b2b.h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40; -	state->b2b.h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48; -	state->b2b.h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56; -	state->b2b.h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0; -	state->b2b.h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8; -	state->b2b.h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16; -	state->b2b.h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24; -	state->b2b.h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32; -	state->b2b.h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40; -	state->b2b.h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48; -	state->b2b.h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56; -	state->b2b.h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0; -	state->b2b.h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8; -	state->b2b.h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16; -	state->b2b.h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24; -	state->b2b.h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32; -	state->b2b.h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40; -	state->b2b.h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48; -	state->b2b.h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56; +	if (offsetof(struct libblake_blake2xb_params, inner_len) == 17) { +		state->b2b.h[0] ^= le64(((uint_least64_t *)params)[0]); +		state->b2b.h[1] ^= le64(((uint_least64_t *)params)[1]); +		state->b2b.h[2] ^= le64(((uint_least64_t)params->node_depth << 0) | +		                        ((uint_least64_t)params->inner_len << 8)); +		state->b2b.h[4] ^= le64(*(uint_least64_t *)¶ms->salt[0]); +		state->b2b.h[5] ^= le64(*(uint_least64_t *)¶ms->salt[8]); +		state->b2b.h[6] ^= le64(*(uint_least64_t *)¶ms->pepper[0]); +		state->b2b.h[7] ^= le64(*(uint_least64_t *)¶ms->pepper[8]); +	} else { +		state->b2b.h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0; +		state->b2b.h[0] ^= ((uint_least64_t)params->key_len & 255) << 8; +		state->b2b.h[0] ^= ((uint_least64_t)params->fanout & 255) << 16; +		state->b2b.h[0] ^= ((uint_least64_t)params->depth & 255) << 24; +		state->b2b.h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32; +		state->b2b.h[1] ^= (uint_least64_t)(params->node_offset & UINT_LEAST32_C(0xFFFFffff)) << 0; +		state->b2b.h[1] ^= (uint_least64_t)(params->xof_len & UINT_LEAST32_C(0xFFFFffff)) << 32; +		state->b2b.h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0; +		state->b2b.h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8; +		state->b2b.h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0; +		state->b2b.h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8; +		state->b2b.h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16; +		state->b2b.h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24; +		state->b2b.h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32; +		state->b2b.h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40; +		state->b2b.h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48; +		state->b2b.h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56; +		state->b2b.h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0; +		state->b2b.h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8; +		state->b2b.h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16; +		state->b2b.h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24; +		state->b2b.h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32; +		state->b2b.h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40; +		state->b2b.h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48; +		state->b2b.h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56; +		state->b2b.h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0; +		state->b2b.h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8; +		state->b2b.h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16; +		state->b2b.h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24; +		state->b2b.h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32; +		state->b2b.h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40; +		state->b2b.h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48; +		state->b2b.h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56; +		state->b2b.h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0; +		state->b2b.h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8; +		state->b2b.h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16; +		state->b2b.h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24; +		state->b2b.h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32; +		state->b2b.h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40; +		state->b2b.h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48; +		state->b2b.h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56; +	}  } diff --git a/libblake_internal_blake2xs_init0.c b/libblake_internal_blake2xs_init0.c index 92cb7bf..f20bb9c 100644 --- a/libblake_internal_blake2xs_init0.c +++ b/libblake_internal_blake2xs_init0.c @@ -1,6 +1,21 @@  /* See LICENSE file for copyright and license details. */  #include "common.h" +#if defined(LITTLE_ENDIAN) +# define le32(X) X +#else +static uint_least32_t +le32(uint_least32_t h) +{ +	unsigned char r[4]; +	r[0] = (unsigned char)((h >>  0) & 255); +	r[1] = (unsigned char)((h >>  8) & 255); +	r[2] = (unsigned char)((h >> 16) & 255); +	r[3] = (unsigned char)((h >> 24) & 255); +	return *(uint_least32_t *)r; +} +#endif +  void  libblake_internal_blake2xs_init0(struct libblake_blake2xs_state *state, const struct libblake_blake2xs_params *params)  { @@ -18,29 +33,40 @@ libblake_internal_blake2xs_init0(struct libblake_blake2xs_state *state, const st  	state->b2s.f[0] = 0;  	state->b2s.f[1] = 0; -	state->b2s.h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0; -	state->b2s.h[0] ^= ((uint_least32_t)params->key_len & 255) << 8; -	state->b2s.h[0] ^= ((uint_least32_t)params->fanout & 255) << 16; -	state->b2s.h[0] ^= ((uint_least32_t)params->depth & 255) << 24; -	state->b2s.h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff); -	state->b2s.h[2] ^= params->node_offset & UINT_LEAST32_C(0xFFFFffff); -	state->b2s.h[3] ^= (uint_least32_t)(params->xof_len & UINT_LEAST16_C(0xFFFF)) << 0; -	state->b2s.h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16; -	state->b2s.h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24; -	state->b2s.h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0; -	state->b2s.h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8; -	state->b2s.h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16; -	state->b2s.h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24; -	state->b2s.h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0; -	state->b2s.h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8; -	state->b2s.h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16; -	state->b2s.h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24; -	state->b2s.h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0; -	state->b2s.h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8; -	state->b2s.h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16; -	state->b2s.h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24; -	state->b2s.h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0; -	state->b2s.h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8; -	state->b2s.h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16; -	state->b2s.h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24; +	if (sizeof(*params) == sizeof(state->b2s.h)) { +		state->b2s.h[0] ^= le32(((uint_least32_t *)params)[0]); +		state->b2s.h[1] ^= le32(((uint_least32_t *)params)[1]); +		state->b2s.h[2] ^= le32(((uint_least32_t *)params)[2]); +		state->b2s.h[3] ^= le32(((uint_least32_t *)params)[3]); +		state->b2s.h[4] ^= le32(((uint_least32_t *)params)[4]); +		state->b2s.h[5] ^= le32(((uint_least32_t *)params)[5]); +		state->b2s.h[6] ^= le32(((uint_least32_t *)params)[6]); +		state->b2s.h[7] ^= le32(((uint_least32_t *)params)[7]); +	} else { +		state->b2s.h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0; +		state->b2s.h[0] ^= ((uint_least32_t)params->key_len & 255) << 8; +		state->b2s.h[0] ^= ((uint_least32_t)params->fanout & 255) << 16; +		state->b2s.h[0] ^= ((uint_least32_t)params->depth & 255) << 24; +		state->b2s.h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff); +		state->b2s.h[2] ^= params->node_offset & UINT_LEAST32_C(0xFFFFffff); +		state->b2s.h[3] ^= (uint_least32_t)(params->xof_len & UINT_LEAST16_C(0xFFFF)) << 0; +		state->b2s.h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16; +		state->b2s.h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24; +		state->b2s.h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0; +		state->b2s.h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8; +		state->b2s.h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16; +		state->b2s.h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24; +		state->b2s.h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0; +		state->b2s.h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8; +		state->b2s.h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16; +		state->b2s.h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24; +		state->b2s.h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0; +		state->b2s.h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8; +		state->b2s.h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16; +		state->b2s.h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24; +		state->b2s.h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0; +		state->b2s.h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8; +		state->b2s.h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16; +		state->b2s.h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24; +	}  } | 
