aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--Makefile2
-rw-r--r--common.h31
-rw-r--r--libblake.h3
-rw-r--r--libblake_blake2b_digest.c7
-rw-r--r--libblake_blake2b_force_update.c24
-rw-r--r--libblake_blake2b_init.c110
-rw-r--r--libblake_blake2b_update.c3
-rw-r--r--libblake_blake2s_digest.c7
-rw-r--r--libblake_blake2s_force_update.c18
-rw-r--r--libblake_blake2s_init.c78
-rw-r--r--libblake_blake2s_update.c1
-rw-r--r--libblake_internal_blake2b_output_digest.c36
-rw-r--r--libblake_internal_blake2s_output_digest.c39
-rw-r--r--libblake_internal_blake2xb_init0.c112
-rw-r--r--libblake_internal_blake2xs_init0.c76
16 files changed, 406 insertions, 143 deletions
diff --git a/.gitignore b/.gitignore
index c2c489c..4161a51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
*\#*
*~
+a.out
*.o
*.a
*.lo
@@ -12,4 +13,5 @@
*.gcov
*.gcno
*.gcda
+*.s
/test
diff --git a/Makefile b/Makefile
index d96ac42..da9b9a7 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ OS = linux
include mk/$(OS).mk
-LIB_MAJOR = 1
+LIB_MAJOR = 2
LIB_MINOR = 0
LIB_VERSION = $(LIB_MAJOR).$(LIB_MINOR)
LIB_NAME = blake
diff --git a/common.h b/common.h
index b1920dc..3f9dce0 100644
--- a/common.h
+++ b/common.h
@@ -24,9 +24,38 @@
#endif
#if defined(__GNUC__)
-# define HIDDEN __attribute__((__visibility__("hidden")))
+# define HIDDEN __attribute__((visibility("hidden")))
+# define LIKELY(X) __builtin_expect(!!(X), 1)
+# define UNLIKELY(X) __builtin_expect(!!(X), 0)
+# define ASSUMING_CONSTANT(X) (__builtin_constant_p(X) && (X))
#else
# define HIDDEN
+# define LIKELY(X) X
+# define UNLIKELY(X) X
+# define ASSUMING_CONSTANT(X) 0
+# if defined(__has_builtin)
+# if __has_builtin(__builtin_expect)
+# undef LIKELY
+# undef UNLIKELY
+# define LIKELY(X) __builtin_expect(!!(X), 1)
+# define UNLIKELY(X) __builtin_expect(!!(X), 0)
+# endif
+# if __has_builtin(__builtin_constant_p)
+# undef ASSUMING_CONSTANT
+# define ASSUMING_CONSTANT(X) (__builtin_constant_p(X) && (X))
+# endif
+# endif
+#endif
+#if defined(__has_builtin)
+# define HAS_BUILTIN(X) __has_builtin(X)
+#else
+# define HAS_BUILTIN(X) 0
+#endif
+
+#if defined(__x86_64__) || defined(__i386__)
+# define LITTLE_ENDIAN
+#else
+# error Endian is unknown
#endif
#define A 10
diff --git a/libblake.h b/libblake.h
index 70b5a09..48c2ca1 100644
--- a/libblake.h
+++ b/libblake.h
@@ -86,6 +86,7 @@ struct libblake_blake2s_params {
uint_least64_t node_offset; /* (48-bits) normally 0 */
uint_least8_t node_depth; /* normally 0 */
uint_least8_t inner_len; /* normally 0 */
+ uint_least8_t _padding[2]; /* to keep .salt and .pepper aligned as uint_least32_t */
uint_least8_t salt[8];
uint_least8_t pepper[8];
};
@@ -99,6 +100,7 @@ struct libblake_blake2b_params {
uint_least64_t node_offset; /* normally 0 */
uint_least8_t node_depth; /* normally 0 */
uint_least8_t inner_len; /* normally 0 */
+ uint_least8_t _padding[6]; /* to keep .salt and .pepper aligned as uint_least64_t */
uint_least8_t salt[16];
uint_least8_t pepper[16];
};
@@ -127,6 +129,7 @@ struct libblake_blake2xb_params {
uint_least32_t xof_len; /* max if not known in advance */
uint_least8_t node_depth; /* normally 0 */
uint_least8_t inner_len; /* normally 0 */
+ uint_least8_t _padding[2]; /* to keep .salt and .pepper aligned as uint_least32_t */
uint_least8_t salt[16];
uint_least8_t pepper[16];
};
diff --git a/libblake_blake2b_digest.c b/libblake_blake2b_digest.c
index 08b2d75..4338387 100644
--- a/libblake_blake2b_digest.c
+++ b/libblake_blake2b_digest.c
@@ -13,13 +13,16 @@ libblake_blake2b_digest(struct libblake_blake2b_state *state, void *data_, size_
len -= r;
state->f[0] = UINT_LEAST64_C(0xFFFFffffFFFFffff);
- if (last_node)
+ if (UNLIKELY(last_node))
state->f[1] = UINT_LEAST64_C(0xFFFFffffFFFFffff);
memset(&data[len], 0, 128 - len);
+ /* This runs so seldom that we are not bothering with trying to optimise
+ * this since similar code in libblake_blake2b_force_update.c could not
+ * be optimised */
state->t[0] = (state->t[0] + len) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
- if (state->t[0] < len)
+ if (UNLIKELY(state->t[0] < len))
state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
libblake_internal_blake2b_compress(state, data);
diff --git a/libblake_blake2b_force_update.c b/libblake_blake2b_force_update.c
index 60b8fab..2446e16 100644
--- a/libblake_blake2b_force_update.c
+++ b/libblake_blake2b_force_update.c
@@ -8,8 +8,30 @@ libblake_blake2b_force_update(struct libblake_blake2b_state *state, const void *
size_t off = 0;
for (; len - off >= 128; off += 128) {
+ /* The following optimisations have been tested:
+ *
+ * 1)
+ * `*(__uint128_t *)state->t += 128;`
+ * result: slower
+ *
+ * 2)
+ * addq, adcq using `__asm__ __volatile__`
+ * result: slower (as 1)
+ *
+ * 3)
+ * using `__builtin_add_overflow`
+ * result: no difference
+ *
+ * These testes where preformed on amd64 with a compile-time
+ * assumption that `UINT_LEAST64_C(0xFFFFffffFFFFffff) + 1 == 0`,
+ * which the compiler accepted and those included the attempted
+ * optimisations.
+ *
+ * UNLIKELY does not seem to make any difference, but it
+ * does change the output, theoretically of the better.
+ */
state->t[0] = (state->t[0] + 128) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
- if (state->t[0] < 128)
+ if (UNLIKELY(state->t[0] < 128))
state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
libblake_internal_blake2b_compress(state, &data[off]);
diff --git a/libblake_blake2b_init.c b/libblake_blake2b_init.c
index b520a87..f07d1ef 100644
--- a/libblake_blake2b_init.c
+++ b/libblake_blake2b_init.c
@@ -1,6 +1,25 @@
/* See LICENSE file for copyright and license details. */
#include "common.h"
+#if defined(LITTLE_ENDIAN)
+# define le64(X) X
+#else
+static uint_least64_t
+le64(uint_least64_t h)
+{
+ unsigned char r[8];
+ r[0] = (unsigned char)((h >> 0) & 255);
+ r[1] = (unsigned char)((h >> 8) & 255);
+ r[2] = (unsigned char)((h >> 16) & 255);
+ r[3] = (unsigned char)((h >> 24) & 255);
+ r[4] = (unsigned char)((h >> 32) & 255);
+ r[5] = (unsigned char)((h >> 40) & 255);
+ r[6] = (unsigned char)((h >> 48) & 255);
+ r[7] = (unsigned char)((h >> 56) & 255);
+ return *(uint_least64_t *)r;
+}
+#endif
+
void
libblake_blake2b_init(struct libblake_blake2b_state *state, const struct libblake_blake2b_params *params, const unsigned char *key)
{
@@ -18,46 +37,57 @@ libblake_blake2b_init(struct libblake_blake2b_state *state, const struct libblak
state->f[0] = 0;
state->f[1] = 0;
- state->h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0;
- state->h[0] ^= ((uint_least64_t)params->key_len & 255) << 8;
- state->h[0] ^= ((uint_least64_t)params->fanout & 255) << 16;
- state->h[0] ^= ((uint_least64_t)params->depth & 255) << 24;
- state->h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32;
- state->h[1] ^= params->node_offset & UINT_LEAST64_C(0xFFFFffffFFFFffff);
- state->h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0;
- state->h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8;
- state->h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0;
- state->h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8;
- state->h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16;
- state->h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24;
- state->h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32;
- state->h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40;
- state->h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48;
- state->h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56;
- state->h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0;
- state->h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8;
- state->h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16;
- state->h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24;
- state->h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32;
- state->h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40;
- state->h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48;
- state->h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56;
- state->h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0;
- state->h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8;
- state->h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16;
- state->h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24;
- state->h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32;
- state->h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40;
- state->h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48;
- state->h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56;
- state->h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0;
- state->h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8;
- state->h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16;
- state->h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24;
- state->h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32;
- state->h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40;
- state->h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48;
- state->h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56;
+ if (offsetof(struct libblake_blake2b_params, inner_len) == 17) {
+ state->h[0] ^= le64(((uint_least64_t *)params)[0]);
+ state->h[1] ^= le64(((uint_least64_t *)params)[1]);
+ state->h[2] ^= le64(((uint_least64_t)params->node_depth << 0) |
+ ((uint_least64_t)params->inner_len << 8));
+ state->h[4] ^= le64(*(uint_least64_t *)&params->salt[0]);
+ state->h[5] ^= le64(*(uint_least64_t *)&params->salt[8]);
+ state->h[6] ^= le64(*(uint_least64_t *)&params->pepper[0]);
+ state->h[7] ^= le64(*(uint_least64_t *)&params->pepper[8]);
+ } else {
+ state->h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0;
+ state->h[0] ^= ((uint_least64_t)params->key_len & 255) << 8;
+ state->h[0] ^= ((uint_least64_t)params->fanout & 255) << 16;
+ state->h[0] ^= ((uint_least64_t)params->depth & 255) << 24;
+ state->h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32;
+ state->h[1] ^= params->node_offset & UINT_LEAST64_C(0xFFFFffffFFFFffff);
+ state->h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0;
+ state->h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8;
+ state->h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0;
+ state->h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8;
+ state->h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16;
+ state->h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24;
+ state->h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32;
+ state->h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40;
+ state->h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48;
+ state->h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56;
+ state->h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0;
+ state->h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8;
+ state->h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16;
+ state->h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24;
+ state->h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32;
+ state->h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40;
+ state->h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48;
+ state->h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56;
+ state->h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0;
+ state->h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8;
+ state->h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16;
+ state->h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24;
+ state->h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32;
+ state->h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40;
+ state->h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48;
+ state->h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56;
+ state->h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0;
+ state->h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8;
+ state->h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16;
+ state->h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24;
+ state->h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32;
+ state->h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40;
+ state->h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48;
+ state->h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56;
+ }
if (params->key_len) {
state->t[0] = 128;
diff --git a/libblake_blake2b_update.c b/libblake_blake2b_update.c
index cbd38f2..2f825a3 100644
--- a/libblake_blake2b_update.c
+++ b/libblake_blake2b_update.c
@@ -8,8 +8,9 @@ libblake_blake2b_update(struct libblake_blake2b_state *state, const void *data_,
size_t off = 0;
for (; len - off > 128; off += 128) {
+ /* See libblake_blake2b_force_update.c for optimisations notes */
state->t[0] = (state->t[0] + 128) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
- if (state->t[0] < 128)
+ if (UNLIKELY(state->t[0] < 128))
state->t[1] = (state->t[1] + 1) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
libblake_internal_blake2b_compress(state, &data[off]);
diff --git a/libblake_blake2s_digest.c b/libblake_blake2s_digest.c
index 2ee45ed..d0c1ac8 100644
--- a/libblake_blake2s_digest.c
+++ b/libblake_blake2s_digest.c
@@ -13,13 +13,16 @@ libblake_blake2s_digest(struct libblake_blake2s_state *state, void *data_, size_
len -= r;
state->f[0] = UINT_LEAST32_C(0xFFFFffff);
- if (last_node)
+ if (UNLIKELY(last_node))
state->f[1] = UINT_LEAST32_C(0xFFFFffff);
memset(&data[len], 0, 64 - len);
+ /* This runs so seldom that we are not bothering with trying to optimise
+ * this since similar code in libblake_blake2s_force_update.c could not
+ * be optimised */
state->t[0] = (state->t[0] + len) & UINT_LEAST32_C(0xFFFFffff);
- if (state->t[0] < len)
+ if (UNLIKELY(state->t[0] < len))
state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff);
libblake_internal_blake2s_compress(state, data);
diff --git a/libblake_blake2s_force_update.c b/libblake_blake2s_force_update.c
index 925d381..5330ab2 100644
--- a/libblake_blake2s_force_update.c
+++ b/libblake_blake2s_force_update.c
@@ -8,6 +8,24 @@ libblake_blake2s_force_update(struct libblake_blake2s_state *state, const void *
size_t off = 0;
for (; len - off >= 64; off += 64) {
+ /* The following optimisations have been tested:
+ *
+ * 1)
+ * `*(uint64_t *)state->t += 64;`
+ * result: slower
+ *
+ * 2)
+ * using `__builtin_add_overflow`
+ * result: no difference
+ *
+ * These testes where preformed on amd64 with a compile-time
+ * assumption that `UINT_LEAST32_C(0xFFFFffff) + 1 == 0`,
+ * which the compiler accepted and those included the attempted
+ * optimisations.
+ *
+ * UNLIKELY does not seem to make any difference, but it
+ * does change the output, theoretically of the better.
+ */
state->t[0] = (state->t[0] + 64) & UINT_LEAST32_C(0xFFFFffff);
if (state->t[0] < 64)
state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff);
diff --git a/libblake_blake2s_init.c b/libblake_blake2s_init.c
index c4b126c..fae7e0c 100644
--- a/libblake_blake2s_init.c
+++ b/libblake_blake2s_init.c
@@ -1,6 +1,21 @@
/* See LICENSE file for copyright and license details. */
#include "common.h"
+#if defined(LITTLE_ENDIAN)
+# define le32(X) X
+#else
+static uint_least32_t
+le32(uint_least32_t h)
+{
+ unsigned char r[4];
+ r[0] = (unsigned char)((h >> 0) & 255);
+ r[1] = (unsigned char)((h >> 8) & 255);
+ r[2] = (unsigned char)((h >> 16) & 255);
+ r[3] = (unsigned char)((h >> 24) & 255);
+ return *(uint_least32_t *)r;
+}
+#endif
+
void
libblake_blake2s_init(struct libblake_blake2s_state *state, const struct libblake_blake2s_params *params, const unsigned char *key)
{
@@ -18,31 +33,44 @@ libblake_blake2s_init(struct libblake_blake2s_state *state, const struct libblak
state->f[0] = 0;
state->f[1] = 0;
- state->h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0;
- state->h[0] ^= ((uint_least32_t)params->key_len & 255) << 8;
- state->h[0] ^= ((uint_least32_t)params->fanout & 255) << 16;
- state->h[0] ^= ((uint_least32_t)params->depth & 255) << 24;
- state->h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff);
- state->h[2] ^= (uint_least32_t)((params->node_offset >> 0) & UINT_LEAST64_C(0xFFFFffff));
- state->h[3] ^= (uint_least32_t)((params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) << 0;
- state->h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16;
- state->h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24;
- state->h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0;
- state->h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8;
- state->h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16;
- state->h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24;
- state->h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0;
- state->h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8;
- state->h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16;
- state->h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24;
- state->h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0;
- state->h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8;
- state->h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16;
- state->h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24;
- state->h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0;
- state->h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8;
- state->h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16;
- state->h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24;
+ if (offsetof(struct libblake_blake2s_params, inner_len) == 17) {
+ state->h[0] ^= le32(((uint_least32_t *)params)[0]);
+ state->h[1] ^= le32(((uint_least32_t *)params)[1]);
+ state->h[2] ^= le32((uint_least32_t)(params->node_offset >> 0));
+ state->h[3] ^= le32(((uint_least32_t)(params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) |
+ ((uint_least32_t)params->node_depth << 16) |
+ ((uint_least32_t)params->inner_len << 24));
+ state->h[4] ^= le32(*(uint_least32_t *)&params->salt[0]);
+ state->h[5] ^= le32(*(uint_least32_t *)&params->salt[4]);
+ state->h[6] ^= le32(*(uint_least32_t *)&params->pepper[0]);
+ state->h[7] ^= le32(*(uint_least32_t *)&params->pepper[4]);
+ } else {
+ state->h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0;
+ state->h[0] ^= ((uint_least32_t)params->key_len & 255) << 8;
+ state->h[0] ^= ((uint_least32_t)params->fanout & 255) << 16;
+ state->h[0] ^= ((uint_least32_t)params->depth & 255) << 24;
+ state->h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff);
+ state->h[2] ^= (uint_least32_t)((params->node_offset >> 0) & UINT_LEAST64_C(0xFFFFffff));
+ state->h[3] ^= (uint_least32_t)((params->node_offset >> 32) & UINT_LEAST64_C(0xFFFF)) << 0;
+ state->h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16;
+ state->h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24;
+ state->h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0;
+ state->h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8;
+ state->h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16;
+ state->h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24;
+ state->h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0;
+ state->h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8;
+ state->h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16;
+ state->h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24;
+ state->h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0;
+ state->h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8;
+ state->h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16;
+ state->h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24;
+ state->h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0;
+ state->h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8;
+ state->h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16;
+ state->h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24;
+ }
if (params->key_len) {
state->t[0] = 32;
diff --git a/libblake_blake2s_update.c b/libblake_blake2s_update.c
index 598260f..b6d18c2 100644
--- a/libblake_blake2s_update.c
+++ b/libblake_blake2s_update.c
@@ -8,6 +8,7 @@ libblake_blake2s_update(struct libblake_blake2s_state *state, const void *data_,
size_t off = 0;
for (; len - off > 64; off += 64) {
+ /* See libblake_blake2s_force_update.c for optimisations notes */
state->t[0] = (state->t[0] + 64) & UINT_LEAST32_C(0xFFFFffff);
if (state->t[0] < 64)
state->t[1] = (state->t[1] + 1) & UINT_LEAST32_C(0xFFFFffff);
diff --git a/libblake_internal_blake2b_output_digest.c b/libblake_internal_blake2b_output_digest.c
index bc5b407..8700c8c 100644
--- a/libblake_internal_blake2b_output_digest.c
+++ b/libblake_internal_blake2b_output_digest.c
@@ -4,8 +4,18 @@
static void
encode_uint64_le(unsigned char *out, uint_least64_t value, size_t bytes)
{
+ /* Adding LIKELY to indicate that the default case is the
+ * expected does not affact the output */
switch (bytes) {
default:
+ /*
+ * The following optimisation have been tested:
+ *
+ * 1) Changing the default case, on amd64, to
+ * *(uint64_t *)out = (uint64_t)value;
+ * break;
+ * result: halved performance
+ */
out[7] = (unsigned char)((value >> 56) & 255);
/* fall through */
case 7:
@@ -39,6 +49,28 @@ libblake_internal_blake2b_output_digest(struct libblake_blake2b_state *state, si
{
size_t i, j;
- for (i = 0, j = 0; i < output_len; i += 8, j += 1)
- encode_uint64_le(&output[i], state->h[j], output_len - i);
+#ifdef LITTLE_ENDIAN
+ if ((uint_least64_t)(UINT_LEAST64_C(0xFFFFffffFFFFffff) + 1) == 0) {
+ /* 37.5x performance improvement;
+ * even though the compiler is smart enough to optimise
+ * `encode_uint64_le(&output[i], state->h[j], 8);` to a
+ * movq (on amd64), it is note smart enough to optimise
+ * the rest */
+ memcpy(output, state->h, output_len);
+ return;
+ }
+#endif
+
+ /* Estimated to have similar performance benefit as above
+ * on big-endian machines */
+ for (i = 0, j = 0; i + 8 < output_len; i += 8, j += 1)
+ encode_uint64_le(&output[i], state->h[j], 8);
+ encode_uint64_le(&output[i], state->h[j], output_len - i);
+
+ /*
+ * Unoptimised code:
+ *
+ * for (i = 0, j = 0; i < output_len; i += 8, j += 1)
+ * encode_uint64_le(&output[i], state->h[j], output_len - i);
+ */
}
diff --git a/libblake_internal_blake2s_output_digest.c b/libblake_internal_blake2s_output_digest.c
index d7b891c..6b4a5da 100644
--- a/libblake_internal_blake2s_output_digest.c
+++ b/libblake_internal_blake2s_output_digest.c
@@ -4,8 +4,18 @@
static void
encode_uint32_le(unsigned char *out, uint_least32_t value, size_t bytes)
{
+ /* Adding LIKELY to indicate that the default case is the
+ * expected does not affact the output */
switch (bytes) {
default:
+ /*
+ * The following optimisation have been tested:
+ *
+ * 1) Changing the default case, on amd64, to
+ * *(uint32_t *)out = (uint32_t)value;
+ * break;
+ * result: halved performance
+ */
out[3] = (unsigned char)((value >> 24) & 255);
/* fall through */
case 3:
@@ -27,6 +37,31 @@ libblake_internal_blake2s_output_digest(struct libblake_blake2s_state *state, si
{
size_t i, j;
- for (i = 0, j = 0; i < output_len; i += 4, j += 1)
- encode_uint32_le(&output[i], state->h[j], output_len - i);
+#ifdef LITTLE_ENDIAN
+ if ((uint_least32_t)(UINT_LEAST32_C(0xFFFFffff) + 1) == 0) {
+ /* No noticeable performance benefit on amd64, however
+ * it signficantly reduces the translation size and
+ * a 37.5x performance benefit was seen on the 64-bit
+ * version on amd64;
+ * even though the compiler is smart enough to optimise
+ * `encode_uint32_le(&output[i], state->h[j], 4);` to a
+ * movq (on amd64), it is note smart enough to optimise
+ * the rest */
+ memcpy(output, state->h, output_len);
+ return;
+ }
+#endif
+
+ /* Estimated to have similar performance benefit as above
+ * on big-endian machines */
+ for (i = 0, j = 0; i + 4 < output_len; i += 4, j += 1)
+ encode_uint32_le(&output[i], state->h[j], 4);
+ encode_uint32_le(&output[i], state->h[j], output_len - i);
+
+ /*
+ * Unoptimised code:
+ *
+ * for (i = 0, j = 0; i < output_len; i += 4, j += 1)
+ * encode_uint32_le(&output[i], state->h[j], output_len - i);
+ */
}
diff --git a/libblake_internal_blake2xb_init0.c b/libblake_internal_blake2xb_init0.c
index d6063dc..c045cec 100644
--- a/libblake_internal_blake2xb_init0.c
+++ b/libblake_internal_blake2xb_init0.c
@@ -1,6 +1,25 @@
/* See LICENSE file for copyright and license details. */
#include "common.h"
+#if defined(LITTLE_ENDIAN)
+# define le64(X) X
+#else
+static uint_least64_t
+le64(uint_least64_t h)
+{
+ unsigned char r[8];
+ r[0] = (unsigned char)((h >> 0) & 255);
+ r[1] = (unsigned char)((h >> 8) & 255);
+ r[2] = (unsigned char)((h >> 16) & 255);
+ r[3] = (unsigned char)((h >> 24) & 255);
+ r[4] = (unsigned char)((h >> 32) & 255);
+ r[5] = (unsigned char)((h >> 40) & 255);
+ r[6] = (unsigned char)((h >> 48) & 255);
+ r[7] = (unsigned char)((h >> 56) & 255);
+ return *(uint_least64_t *)r;
+}
+#endif
+
void
libblake_internal_blake2xb_init0(struct libblake_blake2xb_state *state, const struct libblake_blake2xb_params *params)
{
@@ -18,45 +37,56 @@ libblake_internal_blake2xb_init0(struct libblake_blake2xb_state *state, const st
state->b2b.f[0] = 0;
state->b2b.f[1] = 0;
- state->b2b.h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0;
- state->b2b.h[0] ^= ((uint_least64_t)params->key_len & 255) << 8;
- state->b2b.h[0] ^= ((uint_least64_t)params->fanout & 255) << 16;
- state->b2b.h[0] ^= ((uint_least64_t)params->depth & 255) << 24;
- state->b2b.h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32;
- state->b2b.h[1] ^= (uint_least64_t)(params->node_offset & UINT_LEAST32_C(0xFFFFffff)) << 0;
- state->b2b.h[1] ^= (uint_least64_t)(params->xof_len & UINT_LEAST32_C(0xFFFFffff)) << 32;
- state->b2b.h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0;
- state->b2b.h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8;
- state->b2b.h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0;
- state->b2b.h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8;
- state->b2b.h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16;
- state->b2b.h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24;
- state->b2b.h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32;
- state->b2b.h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40;
- state->b2b.h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48;
- state->b2b.h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56;
- state->b2b.h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0;
- state->b2b.h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8;
- state->b2b.h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16;
- state->b2b.h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24;
- state->b2b.h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32;
- state->b2b.h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40;
- state->b2b.h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48;
- state->b2b.h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56;
- state->b2b.h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0;
- state->b2b.h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8;
- state->b2b.h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16;
- state->b2b.h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24;
- state->b2b.h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32;
- state->b2b.h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40;
- state->b2b.h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48;
- state->b2b.h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56;
- state->b2b.h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0;
- state->b2b.h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8;
- state->b2b.h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16;
- state->b2b.h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24;
- state->b2b.h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32;
- state->b2b.h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40;
- state->b2b.h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48;
- state->b2b.h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56;
+ if (offsetof(struct libblake_blake2xb_params, inner_len) == 17) {
+ state->b2b.h[0] ^= le64(((uint_least64_t *)params)[0]);
+ state->b2b.h[1] ^= le64(((uint_least64_t *)params)[1]);
+ state->b2b.h[2] ^= le64(((uint_least64_t)params->node_depth << 0) |
+ ((uint_least64_t)params->inner_len << 8));
+ state->b2b.h[4] ^= le64(*(uint_least64_t *)&params->salt[0]);
+ state->b2b.h[5] ^= le64(*(uint_least64_t *)&params->salt[8]);
+ state->b2b.h[6] ^= le64(*(uint_least64_t *)&params->pepper[0]);
+ state->b2b.h[7] ^= le64(*(uint_least64_t *)&params->pepper[8]);
+ } else {
+ state->b2b.h[0] ^= ((uint_least64_t)params->digest_len & 255) << 0;
+ state->b2b.h[0] ^= ((uint_least64_t)params->key_len & 255) << 8;
+ state->b2b.h[0] ^= ((uint_least64_t)params->fanout & 255) << 16;
+ state->b2b.h[0] ^= ((uint_least64_t)params->depth & 255) << 24;
+ state->b2b.h[0] ^= (uint_least64_t)(params->leaf_len & UINT_LEAST32_C(0xFFFFffff)) << 32;
+ state->b2b.h[1] ^= (uint_least64_t)(params->node_offset & UINT_LEAST32_C(0xFFFFffff)) << 0;
+ state->b2b.h[1] ^= (uint_least64_t)(params->xof_len & UINT_LEAST32_C(0xFFFFffff)) << 32;
+ state->b2b.h[2] ^= ((uint_least64_t)params->node_depth & 255) << 0;
+ state->b2b.h[2] ^= ((uint_least64_t)params->inner_len & 255) << 8;
+ state->b2b.h[4] ^= ((uint_least64_t)params->salt[0] & 255) << 0;
+ state->b2b.h[4] ^= ((uint_least64_t)params->salt[1] & 255) << 8;
+ state->b2b.h[4] ^= ((uint_least64_t)params->salt[2] & 255) << 16;
+ state->b2b.h[4] ^= ((uint_least64_t)params->salt[3] & 255) << 24;
+ state->b2b.h[4] ^= ((uint_least64_t)params->salt[4] & 255) << 32;
+ state->b2b.h[4] ^= ((uint_least64_t)params->salt[5] & 255) << 40;
+ state->b2b.h[4] ^= ((uint_least64_t)params->salt[6] & 255) << 48;
+ state->b2b.h[4] ^= ((uint_least64_t)params->salt[7] & 255) << 56;
+ state->b2b.h[5] ^= ((uint_least64_t)params->salt[8] & 255) << 0;
+ state->b2b.h[5] ^= ((uint_least64_t)params->salt[9] & 255) << 8;
+ state->b2b.h[5] ^= ((uint_least64_t)params->salt[A] & 255) << 16;
+ state->b2b.h[5] ^= ((uint_least64_t)params->salt[B] & 255) << 24;
+ state->b2b.h[5] ^= ((uint_least64_t)params->salt[C] & 255) << 32;
+ state->b2b.h[5] ^= ((uint_least64_t)params->salt[D] & 255) << 40;
+ state->b2b.h[5] ^= ((uint_least64_t)params->salt[E] & 255) << 48;
+ state->b2b.h[5] ^= ((uint_least64_t)params->salt[F] & 255) << 56;
+ state->b2b.h[6] ^= ((uint_least64_t)params->pepper[0] & 255) << 0;
+ state->b2b.h[6] ^= ((uint_least64_t)params->pepper[1] & 255) << 8;
+ state->b2b.h[6] ^= ((uint_least64_t)params->pepper[2] & 255) << 16;
+ state->b2b.h[6] ^= ((uint_least64_t)params->pepper[3] & 255) << 24;
+ state->b2b.h[6] ^= ((uint_least64_t)params->pepper[4] & 255) << 32;
+ state->b2b.h[6] ^= ((uint_least64_t)params->pepper[5] & 255) << 40;
+ state->b2b.h[6] ^= ((uint_least64_t)params->pepper[6] & 255) << 48;
+ state->b2b.h[6] ^= ((uint_least64_t)params->pepper[7] & 255) << 56;
+ state->b2b.h[7] ^= ((uint_least64_t)params->pepper[8] & 255) << 0;
+ state->b2b.h[7] ^= ((uint_least64_t)params->pepper[9] & 255) << 8;
+ state->b2b.h[7] ^= ((uint_least64_t)params->pepper[A] & 255) << 16;
+ state->b2b.h[7] ^= ((uint_least64_t)params->pepper[B] & 255) << 24;
+ state->b2b.h[7] ^= ((uint_least64_t)params->pepper[C] & 255) << 32;
+ state->b2b.h[7] ^= ((uint_least64_t)params->pepper[D] & 255) << 40;
+ state->b2b.h[7] ^= ((uint_least64_t)params->pepper[E] & 255) << 48;
+ state->b2b.h[7] ^= ((uint_least64_t)params->pepper[F] & 255) << 56;
+ }
}
diff --git a/libblake_internal_blake2xs_init0.c b/libblake_internal_blake2xs_init0.c
index 92cb7bf..f20bb9c 100644
--- a/libblake_internal_blake2xs_init0.c
+++ b/libblake_internal_blake2xs_init0.c
@@ -1,6 +1,21 @@
/* See LICENSE file for copyright and license details. */
#include "common.h"
+#if defined(LITTLE_ENDIAN)
+# define le32(X) X
+#else
+static uint_least32_t
+le32(uint_least32_t h)
+{
+ unsigned char r[4];
+ r[0] = (unsigned char)((h >> 0) & 255);
+ r[1] = (unsigned char)((h >> 8) & 255);
+ r[2] = (unsigned char)((h >> 16) & 255);
+ r[3] = (unsigned char)((h >> 24) & 255);
+ return *(uint_least32_t *)r;
+}
+#endif
+
void
libblake_internal_blake2xs_init0(struct libblake_blake2xs_state *state, const struct libblake_blake2xs_params *params)
{
@@ -18,29 +33,40 @@ libblake_internal_blake2xs_init0(struct libblake_blake2xs_state *state, const st
state->b2s.f[0] = 0;
state->b2s.f[1] = 0;
- state->b2s.h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0;
- state->b2s.h[0] ^= ((uint_least32_t)params->key_len & 255) << 8;
- state->b2s.h[0] ^= ((uint_least32_t)params->fanout & 255) << 16;
- state->b2s.h[0] ^= ((uint_least32_t)params->depth & 255) << 24;
- state->b2s.h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff);
- state->b2s.h[2] ^= params->node_offset & UINT_LEAST32_C(0xFFFFffff);
- state->b2s.h[3] ^= (uint_least32_t)(params->xof_len & UINT_LEAST16_C(0xFFFF)) << 0;
- state->b2s.h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16;
- state->b2s.h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24;
- state->b2s.h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0;
- state->b2s.h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8;
- state->b2s.h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16;
- state->b2s.h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24;
- state->b2s.h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0;
- state->b2s.h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8;
- state->b2s.h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16;
- state->b2s.h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24;
- state->b2s.h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0;
- state->b2s.h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8;
- state->b2s.h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16;
- state->b2s.h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24;
- state->b2s.h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0;
- state->b2s.h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8;
- state->b2s.h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16;
- state->b2s.h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24;
+ if (sizeof(*params) == sizeof(state->b2s.h)) {
+ state->b2s.h[0] ^= le32(((uint_least32_t *)params)[0]);
+ state->b2s.h[1] ^= le32(((uint_least32_t *)params)[1]);
+ state->b2s.h[2] ^= le32(((uint_least32_t *)params)[2]);
+ state->b2s.h[3] ^= le32(((uint_least32_t *)params)[3]);
+ state->b2s.h[4] ^= le32(((uint_least32_t *)params)[4]);
+ state->b2s.h[5] ^= le32(((uint_least32_t *)params)[5]);
+ state->b2s.h[6] ^= le32(((uint_least32_t *)params)[6]);
+ state->b2s.h[7] ^= le32(((uint_least32_t *)params)[7]);
+ } else {
+ state->b2s.h[0] ^= ((uint_least32_t)params->digest_len & 255) << 0;
+ state->b2s.h[0] ^= ((uint_least32_t)params->key_len & 255) << 8;
+ state->b2s.h[0] ^= ((uint_least32_t)params->fanout & 255) << 16;
+ state->b2s.h[0] ^= ((uint_least32_t)params->depth & 255) << 24;
+ state->b2s.h[1] ^= params->leaf_len & UINT_LEAST32_C(0xFFFFffff);
+ state->b2s.h[2] ^= params->node_offset & UINT_LEAST32_C(0xFFFFffff);
+ state->b2s.h[3] ^= (uint_least32_t)(params->xof_len & UINT_LEAST16_C(0xFFFF)) << 0;
+ state->b2s.h[3] ^= ((uint_least32_t)params->node_depth & 255) << 16;
+ state->b2s.h[3] ^= ((uint_least32_t)params->inner_len & 255) << 24;
+ state->b2s.h[4] ^= ((uint_least32_t)params->salt[0] & 255) << 0;
+ state->b2s.h[4] ^= ((uint_least32_t)params->salt[1] & 255) << 8;
+ state->b2s.h[4] ^= ((uint_least32_t)params->salt[2] & 255) << 16;
+ state->b2s.h[4] ^= ((uint_least32_t)params->salt[3] & 255) << 24;
+ state->b2s.h[5] ^= ((uint_least32_t)params->salt[4] & 255) << 0;
+ state->b2s.h[5] ^= ((uint_least32_t)params->salt[5] & 255) << 8;
+ state->b2s.h[5] ^= ((uint_least32_t)params->salt[6] & 255) << 16;
+ state->b2s.h[5] ^= ((uint_least32_t)params->salt[7] & 255) << 24;
+ state->b2s.h[6] ^= ((uint_least32_t)params->pepper[0] & 255) << 0;
+ state->b2s.h[6] ^= ((uint_least32_t)params->pepper[1] & 255) << 8;
+ state->b2s.h[6] ^= ((uint_least32_t)params->pepper[2] & 255) << 16;
+ state->b2s.h[6] ^= ((uint_least32_t)params->pepper[3] & 255) << 24;
+ state->b2s.h[7] ^= ((uint_least32_t)params->pepper[4] & 255) << 0;
+ state->b2s.h[7] ^= ((uint_least32_t)params->pepper[5] & 255) << 8;
+ state->b2s.h[7] ^= ((uint_least32_t)params->pepper[6] & 255) << 16;
+ state->b2s.h[7] ^= ((uint_least32_t)params->pepper[7] & 255) << 24;
+ }
}