aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Andrée <maandree@kth.se>2022-01-21 18:29:26 +0100
committerMattias Andrée <maandree@kth.se>2022-01-21 18:29:26 +0100
commit839a3d17d257e73be9bc99dfa90e56c0824050ba (patch)
tree6bb010351447edbb0ae8d910948b01837d2de9e5
parentFix memory corruption bug in test.c and simplify message byte-length calculation (diff)
downloadlibblake-839a3d17d257e73be9bc99dfa90e56c0824050ba.tar.gz
libblake-839a3d17d257e73be9bc99dfa90e56c0824050ba.tar.bz2
libblake-839a3d17d257e73be9bc99dfa90e56c0824050ba.tar.xz
Initial work on optimising compression function; mm128 version is slower, mm256 version is barely faster
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r--Makefile15
-rw-r--r--common.h2
-rw-r--r--config.mk9
-rw-r--r--libblake.h17
-rw-r--r--libblake_init.c27
-rw-r--r--libblake_internal_blake2b_compress.c4
-rw-r--r--libblake_internal_blake2b_compress_mm128.c153
-rw-r--r--libblake_internal_blake2b_compress_mm256.c101
-rw-r--r--libblake_internal_blake2s_compress.c2
-rw-r--r--test.c2
10 files changed, 330 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index da9b9a7..94909b2 100644
--- a/Makefile
+++ b/Makefile
@@ -67,14 +67,15 @@ OBJ_BLAKE2 =\
libblake_blake2xs_update.o\
libblake_internal_blake2b_compress.o\
libblake_internal_blake2s_compress.o\
- libblake_internal_blake2s_output_digest.o\
libblake_internal_blake2b_output_digest.o\
+ libblake_internal_blake2s_output_digest.o\
libblake_internal_blake2xb_init0.o\
libblake_internal_blake2xs_init0.o
OBJ =\
libblake_encode_hex.o\
libblake_decode_hex.o\
+ libblake_init.o\
$(OBJ_BLAKE)\
$(OBJ_BLAKE2)
@@ -96,6 +97,18 @@ test.o: $(HDR)
.c.lo:
$(CC) -fPIC -c -o $@ $< $(CFLAGS) $(CPPFLAGS)
+libblake_internal_blake2b_compress_mm128.o: libblake_internal_blake2b_compress_mm128.c $(HDR)
+ $(CC) -c -o $@ $(@:.o=.c) $(CFLAGS) $(CPPFLAGS) $(CFLAGS_MM128)
+
+libblake_internal_blake2b_compress_mm128.lo: libblake_internal_blake2b_compress_mm128.c $(HDR)
+ $(CC) -c -o $@ $(@:.lo=.c) $(CFLAGS) $(CPPFLAGS) $(CFLAGS_MM128)
+
+libblake_internal_blake2b_compress_mm256.o: libblake_internal_blake2b_compress_mm256.c $(HDR)
+ $(CC) -c -o $@ $(@:.o=.c) $(CFLAGS) $(CPPFLAGS) $(CFLAGS_MM256)
+
+libblake_internal_blake2b_compress_mm256.lo: libblake_internal_blake2b_compress_mm256.c $(HDR)
+ $(CC) -c -o $@ $(@:.lo=.c) $(CFLAGS) $(CPPFLAGS) $(CFLAGS_MM256)
+
test: test.o libblake.a
$(CC) -o $@ test.o libblake.a $(LDFLAGS)
diff --git a/common.h b/common.h
index 63d2bb4..7eab3b9 100644
--- a/common.h
+++ b/common.h
@@ -77,6 +77,8 @@ HIDDEN void libblake_internal_blakeb_digest(struct libblake_blakeb_state *state,
HIDDEN void libblake_internal_blake2s_compress(struct libblake_blake2s_state *state, const unsigned char *data);
HIDDEN void libblake_internal_blake2b_compress(struct libblake_blake2b_state *state, const unsigned char *data);
+/* HIDDEN void libblake_internal_blake2b_compress_mm128_init(void); */
+/* HIDDEN void libblake_internal_blake2b_compress_mm256_init(void); */
HIDDEN void libblake_internal_blake2xs_init0(struct libblake_blake2xs_state *state, const struct libblake_blake2xs_params *params);
HIDDEN void libblake_internal_blake2xb_init0(struct libblake_blake2xb_state *state, const struct libblake_blake2xb_params *params);
diff --git a/config.mk b/config.mk
index bfe4895..36843a0 100644
--- a/config.mk
+++ b/config.mk
@@ -1,8 +1,15 @@
PREFIX = /usr
MANPREFIX = $(PREFIX)/share/man
-CC = c99
+CC = cc -std=c11
CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
CFLAGS = -Wall -O3
LDFLAGS = -s
+
+# These optimisations may not only break compatibility with
+# processors that the software was not compiled on, but they
+# will in fact also degrade performance. Therefore they are
+# only used for specific translation units.
+CFLAGS_MM128 = -msse4.1 -mavx2
+CFLAGS_MM256 = -msse4.1 -mavx2
diff --git a/libblake.h b/libblake.h
index 65f31cf..ec67df8 100644
--- a/libblake.h
+++ b/libblake.h
@@ -27,6 +27,21 @@
# define LIBBLAKE_CONST__
#endif
+#if defined(__STDC_VERSION__)
+# if __STDC_VERSION__ >= 201112L
+# define LIBBLAKE_ALIGNED__(BYTES) _Alignas(BYTES)
+# endif
+#endif
+#ifndef LIBBLAKE_ALIGNED__
+# if defined(__GNUC__)
+# define LIBBLAKE_ALIGNED__(BYTES) __attribute__((__aligned__(BYTES)))
+# else
+# define LIBBLAKE_ALIGNED__(BYTES)
+# endif
+#endif
+
+
+LIBBLAKE_PUBLIC__ void libblake_init(void);
LIBBLAKE_PUBLIC__ void libblake_encode_hex(const void *data, size_t n, char out[/* static n * 2 + 1 */], int uppercase);
LIBBLAKE_PUBLIC__ size_t libblake_decode_hex(const char *data, size_t n, void *out, int *validp);
@@ -140,12 +155,14 @@ struct libblake_blake2xb_params {
};
struct libblake_blake2s_state {
+ LIBBLAKE_ALIGNED__(32)
uint_least32_t h[8];
uint_least32_t t[2];
uint_least32_t f[2];
};
struct libblake_blake2b_state {
+ LIBBLAKE_ALIGNED__(32)
uint_least64_t h[8];
uint_least64_t t[2];
uint_least64_t f[2];
diff --git a/libblake_init.c b/libblake_init.c
new file mode 100644
index 0000000..ebf0e34
--- /dev/null
+++ b/libblake_init.c
@@ -0,0 +1,27 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+#include <stdatomic.h>
+
+#if defined(__GNUC__)
+__attribute__((__constructor__)) /* ignored if statically linked, so this function shall
+                                  * be called by the application; we just use the constructor (init)
+                                  * attribute in case that is forgotten, as it will only
+                                  * improve performance, and the library will function
+                                  * perfectly fine even if it's not called */
+#endif
+void
+libblake_init(void)
+{
+ static volatile int initialised = 0;
+ static volatile atomic_flag spinlock = ATOMIC_FLAG_INIT;
+
+ while (atomic_flag_test_and_set(&spinlock));
+
+ if (!initialised) {
+ /* libblake_internal_blake2b_compress_mm128_init(); */
+ /* libblake_internal_blake2b_compress_mm256_init(); */
+ initialised = 1;
+ }
+
+ atomic_flag_clear(&spinlock);
+}
diff --git a/libblake_internal_blake2b_compress.c b/libblake_internal_blake2b_compress.c
index e844180..d04a469 100644
--- a/libblake_internal_blake2b_compress.c
+++ b/libblake_internal_blake2b_compress.c
@@ -1,9 +1,12 @@
/* See LICENSE file for copyright and license details. */
#include "common.h"
+/* This code performs suboptimally if compiled with -mavx2 */
+
static uint_least64_t
decode_uint64_le(const unsigned char *data)
{
+ /* This is perfectly optimised by the compiler */
return (((uint_least64_t)(data[0] & 255)) << 0) |
(((uint_least64_t)(data[1] & 255)) << 8) |
(((uint_least64_t)(data[2] & 255)) << 16) |
@@ -17,6 +20,7 @@ decode_uint64_le(const unsigned char *data)
static uint_least64_t
rotate_right(uint_least64_t x, int n)
{
+ /* This is perfectly optimised by the compiler */
return ((x >> n) | (x << (64 - n))) & UINT_LEAST64_C(0xFFFFffffFFFFffff);
}
diff --git a/libblake_internal_blake2b_compress_mm128.c b/libblake_internal_blake2b_compress_mm128.c
new file mode 100644
index 0000000..eec7eae
--- /dev/null
+++ b/libblake_internal_blake2b_compress_mm128.c
@@ -0,0 +1,153 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+#include <immintrin.h>
+
+static __m128i ror24, ror16;
+
+static __m128i
+load_m128i(size_t a, size_t b, __m128i vec[])
+{
+ if (a & 1) {
+ if (b & 1)
+ return _mm_unpackhi_epi64(vec[a / 2], vec[b / 2]);
+ else
+ return _mm_shuffle_epi32(_mm_blend_epi32(vec[a / 2], vec[b / 2], 0x3), _MM_SHUFFLE(1, 0, 3, 2));
+ } else {
+ if (a + 1 == b)
+ return vec[a / 2];
+ else if (b & 1)
+ return _mm_blend_epi32(vec[b / 2], vec[a / 2], 0x3);
+ else
+ return _mm_unpacklo_epi64(vec[a / 2], vec[b / 2]);
+ }
+}
+
+static __m128i
+load_high_and_low(__m128i hi, __m128i lo)
+{
+ return _mm_shuffle_epi32(_mm_blend_epi32(hi, lo, 0x3), _MM_SHUFFLE(1, 0, 3, 2));
+}
+
+static void
+store_high_low_and_low_high(__m128i *hip, __m128i *lop, __m128i val1, __m128i val2)
+{
+ *hip = load_high_and_low(val1, val2);
+ *lop = load_high_and_low(val2, val1);
+}
+
+void
+libblake_internal_blake2b_compress_mm128_init(void)
+{
+#define X(A, B, C, D, E, F, G, H, P) (A + P), (B + P), (C + P), (D + P), (E + P), (F + P), (G + P), (H + P)
+ ror24 = _mm_setr_epi8(X(3, 4, 5, 6, 7, 0, 1, 2, 0),
+ X(3, 4, 5, 6, 7, 0, 1, 2, 8));
+ ror16 = _mm_setr_epi8(X(2, 3, 4, 5, 6, 7, 0, 1, 0),
+ X(2, 3, 4, 5, 6, 7, 0, 1, 8));
+#undef X
+}
+
+void
+libblake_internal_blake2b_compress(struct libblake_blake2b_state *state, const unsigned char *data)
+{
+ static const uint_least64_t _Alignas(__m128i) initvec[] = {
+ UINT_LEAST64_C(0x6A09E667F3BCC908), UINT_LEAST64_C(0xBB67AE8584CAA73B),
+ UINT_LEAST64_C(0x3C6EF372FE94F82B), UINT_LEAST64_C(0xA54FF53A5F1D36F1),
+ UINT_LEAST64_C(0x510E527FADE682D1), UINT_LEAST64_C(0x9B05688C2B3E6C1F),
+ UINT_LEAST64_C(0x1F83D9ABFB41BD6B), UINT_LEAST64_C(0x5BE0CD19137E2179),
+ };
+ __m128i v[8], mj, mk, t, f, h[4], m[8], x, y;
+
+ t = _mm_load_si128((const __m128i *)state->t);
+ f = _mm_load_si128((const __m128i *)state->f);
+ v[0] = h[0] = _mm_load_si128((const __m128i *)&state->h[0]);
+ v[1] = h[1] = _mm_load_si128((const __m128i *)&state->h[2]);
+ v[2] = h[2] = _mm_load_si128((const __m128i *)&state->h[4]);
+ v[3] = h[3] = _mm_load_si128((const __m128i *)&state->h[6]);
+ v[4] = _mm_load_si128((const __m128i *)&initvec[0]);
+ v[5] = _mm_load_si128((const __m128i *)&initvec[2]);
+ v[6] = _mm_load_si128((const __m128i *)&initvec[4]);
+ v[7] = _mm_load_si128((const __m128i *)&initvec[6]);
+ v[6] = _mm_xor_si128(v[6], t);
+ v[7] = _mm_xor_si128(v[7], f);
+
+ if (LIKELY((uintptr_t)data % 16 == 0)) {
+ m[0] = _mm_load_si128((const __m128i *)&data[0 * 16]);
+ m[1] = _mm_load_si128((const __m128i *)&data[1 * 16]);
+ m[2] = _mm_load_si128((const __m128i *)&data[2 * 16]);
+ m[3] = _mm_load_si128((const __m128i *)&data[3 * 16]);
+ m[4] = _mm_load_si128((const __m128i *)&data[4 * 16]);
+ m[5] = _mm_load_si128((const __m128i *)&data[5 * 16]);
+ m[6] = _mm_load_si128((const __m128i *)&data[6 * 16]);
+ m[7] = _mm_load_si128((const __m128i *)&data[7 * 16]);
+ } else {
+ m[0] = _mm_loadu_si128((const __m128i *)&data[0 * 16]);
+ m[1] = _mm_loadu_si128((const __m128i *)&data[1 * 16]);
+ m[2] = _mm_loadu_si128((const __m128i *)&data[2 * 16]);
+ m[3] = _mm_loadu_si128((const __m128i *)&data[3 * 16]);
+ m[4] = _mm_loadu_si128((const __m128i *)&data[4 * 16]);
+ m[5] = _mm_loadu_si128((const __m128i *)&data[5 * 16]);
+ m[6] = _mm_loadu_si128((const __m128i *)&data[6 * 16]);
+ m[7] = _mm_loadu_si128((const __m128i *)&data[7 * 16]);
+ }
+
+#define G2B(j1, k1, j2, k2, a, b, c, d, shift)\
+ mj = load_m128i(j1, j2, m);\
+ mk = load_m128i(k1, k2, m);\
+ v[a] = _mm_add_epi64(v[a], v[b]);\
+ v[a] = _mm_add_epi64(v[a], mj);\
+ v[d] = _mm_xor_si128(v[d], v[a]);\
+ v[d] = _mm_shuffle_epi32(v[d], _MM_SHUFFLE(2, 3, 0, 1));\
+ v[c] = _mm_add_epi64(v[c], v[d]);\
+ v[b] = _mm_xor_si128(v[b], v[c]);\
+ v[b] = _mm_shuffle_epi8(v[b], ror24);\
+ v[a] = _mm_add_epi64(v[a], v[b]);\
+ v[a] = _mm_add_epi64(v[a], mk);\
+ v[d] = _mm_xor_si128(v[d], v[a]);\
+ v[d] = _mm_shuffle_epi8(v[d], ror16);\
+ v[c] = _mm_add_epi64(v[c], v[d]);\
+ v[b] = _mm_xor_si128(v[b], v[c]);\
+ v[b] = _mm_xor_si128(_mm_srli_epi64(v[b], 63),\
+ _mm_add_epi64(v[b], v[b]))
+
+#define ROUND2B(S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF)\
+ G2B(S0, S1, S2, S3, 0, 2, 4, 6, 0);\
+ G2B(S4, S5, S6, S7, 1, 3, 5, 7, 0);\
+ x = v[2];\
+ y = v[3];\
+ v[2] = load_high_and_low(x, y);\
+ v[3] = load_high_and_low(y, x);\
+ x = v[6];\
+ y = v[7];\
+ v[6] = load_high_and_low(y, x);\
+ v[7] = load_high_and_low(x, y);\
+ G2B(S8, S9, SA, SB, 0, 2, 5, 6, 1);\
+ G2B(SC, SD, SE, SF, 1, 3, 4, 7, 2);\
+ x = v[2];\
+ y = v[3];\
+ store_high_low_and_low_high(&v[2], &v[3], y, x);\
+ x = v[6];\
+ y = v[7];\
+ store_high_low_and_low_high(&v[7], &v[6], y, x)
+
+ ROUND2B(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F);
+ ROUND2B(E, A, 4, 8, 9, F, D, 6, 1, C, 0, 2, B, 7, 5, 3);
+ ROUND2B(B, 8, C, 0, 5, 2, F, D, A, E, 3, 6, 7, 1, 9, 4);
+ ROUND2B(7, 9, 3, 1, D, C, B, E, 2, 6, 5, A, 4, 0, F, 8);
+ ROUND2B(9, 0, 5, 7, 2, 4, A, F, E, 1, B, C, 6, 8, 3, D);
+ ROUND2B(2, C, 6, A, 0, B, 8, 3, 4, D, 7, 5, F, E, 1, 9);
+ ROUND2B(C, 5, 1, F, E, D, 4, A, 0, 7, 6, 3, 9, 2, 8, B);
+ ROUND2B(D, B, 7, E, C, 1, 3, 9, 5, 0, F, 4, 8, 6, 2, A);
+ ROUND2B(6, F, E, 9, B, 3, 0, 8, C, 2, D, 7, 1, 4, A, 5);
+ ROUND2B(A, 2, 8, 4, 7, 6, 1, 5, F, B, 9, E, 3, C, D, 0);
+ ROUND2B(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F);
+ ROUND2B(E, A, 4, 8, 9, F, D, 6, 1, C, 0, 2, B, 7, 5, 3);
+
+ v[0] = _mm_xor_si128(_mm_xor_si128(v[0], v[4]), h[0]);
+ v[1] = _mm_xor_si128(_mm_xor_si128(v[1], v[5]), h[1]);
+ v[2] = _mm_xor_si128(_mm_xor_si128(v[2], v[6]), h[2]);
+ v[3] = _mm_xor_si128(_mm_xor_si128(v[3], v[7]), h[3]);
+ _mm_store_si128((__m128i *)&state->h[0], v[0]);
+ _mm_store_si128((__m128i *)&state->h[2], v[1]);
+ _mm_store_si128((__m128i *)&state->h[4], v[2]);
+ _mm_store_si128((__m128i *)&state->h[6], v[3]);
+}
diff --git a/libblake_internal_blake2b_compress_mm256.c b/libblake_internal_blake2b_compress_mm256.c
new file mode 100644
index 0000000..f8e6a09
--- /dev/null
+++ b/libblake_internal_blake2b_compress_mm256.c
@@ -0,0 +1,101 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+#include <immintrin.h>
+
+static __m256i ror24, ror16;
+
+static __m256i
+load_m256i(size_t a, size_t b, size_t c, size_t d, const uint_least64_t vec[])
+{
+ return _mm256_set_epi64x((int_least64_t)vec[d], (int_least64_t)vec[c],
+ (int_least64_t)vec[b], (int_least64_t)vec[a]);
+}
+
+void
+libblake_internal_blake2b_compress_mm256_init(void)
+{
+#define X(A, B, C, D, E, F, G, H, P) (A + P), (B + P), (C + P), (D + P), (E + P), (F + P), (G + P), (H + P)
+ ror24 = _mm256_setr_epi8(X(3, 4, 5, 6, 7, 0, 1, 2, 0),
+ X(3, 4, 5, 6, 7, 0, 1, 2, 8),
+ X(3, 4, 5, 6, 7, 0, 1, 2, 16),
+ X(3, 4, 5, 6, 7, 0, 1, 2, 24));
+ ror16 = _mm256_setr_epi8(X(2, 3, 4, 5, 6, 7, 0, 1, 0),
+ X(2, 3, 4, 5, 6, 7, 0, 1, 8),
+ X(2, 3, 4, 5, 6, 7, 0, 1, 16),
+ X(2, 3, 4, 5, 6, 7, 0, 1, 24));
+#undef X
+}
+
+void
+libblake_internal_blake2b_compress(struct libblake_blake2b_state *state, const unsigned char *data)
+{
+ static const uint_least64_t _Alignas(__m256i) initvec[] = {
+ UINT_LEAST64_C(0x6A09E667F3BCC908), UINT_LEAST64_C(0xBB67AE8584CAA73B),
+ UINT_LEAST64_C(0x3C6EF372FE94F82B), UINT_LEAST64_C(0xA54FF53A5F1D36F1),
+ UINT_LEAST64_C(0x510E527FADE682D1), UINT_LEAST64_C(0x9B05688C2B3E6C1F),
+ UINT_LEAST64_C(0x1F83D9ABFB41BD6B), UINT_LEAST64_C(0x5BE0CD19137E2179),
+ };
+ __m256i v[4], mj, mk, tf, h[2];
+
+ tf = _mm256_load_si256((const __m256i *)state->t);
+ v[0] = h[0] = _mm256_load_si256((const __m256i *)&state->h[0]);
+ v[1] = h[1] = _mm256_load_si256((const __m256i *)&state->h[4]);
+ v[2] = _mm256_load_si256((const __m256i *)&initvec[0]);
+ v[3] = _mm256_load_si256((const __m256i *)&initvec[4]);
+ v[3] = _mm256_xor_si256(v[3], tf);
+
+#define G2B(j1, k1, j2, k2, j3, k3, j4, k4, shift)\
+ do {\
+ mj = load_m256i(j1, j2, j3, j4, (const void *)data);\
+ mk = load_m256i(k1, k2, k3, k4, (const void *)data);\
+ if (shift) {\
+ v[1] = _mm256_permute4x64_epi64(v[1], _MM_SHUFFLE(0, 3, 2, 1));\
+ v[2] = _mm256_permute4x64_epi64(v[2], _MM_SHUFFLE(1, 0, 3, 2));\
+ v[3] = _mm256_permute4x64_epi64(v[3], _MM_SHUFFLE(2, 1, 0, 3));\
+ }\
+ v[0] = _mm256_add_epi64(v[0], v[1]);\
+ v[0] = _mm256_add_epi64(v[0], mj);\
+ v[3] = _mm256_xor_si256(v[3], v[0]);\
+ v[3] = _mm256_shuffle_epi32(v[3], _MM_SHUFFLE(2, 3, 0, 1));\
+ v[2] = _mm256_add_epi64(v[2], v[3]);\
+ v[1] = _mm256_xor_si256(v[1], v[2]);\
+ v[1] = _mm256_shuffle_epi8(v[1], ror24);\
+ v[0] = _mm256_add_epi64(v[0], v[1]);\
+ v[0] = _mm256_add_epi64(v[0], mk);\
+ v[3] = _mm256_xor_si256(v[3], v[0]);\
+ v[3] = _mm256_shuffle_epi8(v[3], ror16);\
+ v[2] = _mm256_add_epi64(v[2], v[3]);\
+ v[1] = _mm256_xor_si256(v[1], v[2]);\
+ v[1] = _mm256_xor_si256(_mm256_srli_epi64(v[1], 63),\
+ _mm256_add_epi64(v[1], v[1]));\
+ if (shift) {\
+ v[1] = _mm256_permute4x64_epi64(v[1], _MM_SHUFFLE(2, 1, 0, 3));\
+ v[2] = _mm256_permute4x64_epi64(v[2], _MM_SHUFFLE(1, 0, 3, 2));\
+ v[3] = _mm256_permute4x64_epi64(v[3], _MM_SHUFFLE(0, 3, 2, 1));\
+ }\
+ } while (0)
+
+#define ROUND2B(S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF)\
+ G2B(S0, S1, S2, S3, S4, S5, S6, S7, 0);\
+ G2B(S8, S9, SA, SB, SC, SD, SE, SF, 1)
+
+ ROUND2B(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F);
+ ROUND2B(E, A, 4, 8, 9, F, D, 6, 1, C, 0, 2, B, 7, 5, 3);
+ ROUND2B(B, 8, C, 0, 5, 2, F, D, A, E, 3, 6, 7, 1, 9, 4);
+ ROUND2B(7, 9, 3, 1, D, C, B, E, 2, 6, 5, A, 4, 0, F, 8);
+ ROUND2B(9, 0, 5, 7, 2, 4, A, F, E, 1, B, C, 6, 8, 3, D);
+ ROUND2B(2, C, 6, A, 0, B, 8, 3, 4, D, 7, 5, F, E, 1, 9);
+ ROUND2B(C, 5, 1, F, E, D, 4, A, 0, 7, 6, 3, 9, 2, 8, B);
+ ROUND2B(D, B, 7, E, C, 1, 3, 9, 5, 0, F, 4, 8, 6, 2, A);
+ ROUND2B(6, F, E, 9, B, 3, 0, 8, C, 2, D, 7, 1, 4, A, 5);
+ ROUND2B(A, 2, 8, 4, 7, 6, 1, 5, F, B, 9, E, 3, C, D, 0);
+ ROUND2B(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F);
+ ROUND2B(E, A, 4, 8, 9, F, D, 6, 1, C, 0, 2, B, 7, 5, 3);
+
+ v[0] = _mm256_xor_si256(v[0], v[2]);
+ v[1] = _mm256_xor_si256(v[1], v[3]);
+ v[0] = _mm256_xor_si256(v[0], h[0]);
+ v[1] = _mm256_xor_si256(v[1], h[1]);
+ _mm256_store_si256((__m256i *)&state->h[0], v[0]);
+ _mm256_store_si256((__m256i *)&state->h[4], v[1]);
+}
diff --git a/libblake_internal_blake2s_compress.c b/libblake_internal_blake2s_compress.c
index 37a61eb..c41876a 100644
--- a/libblake_internal_blake2s_compress.c
+++ b/libblake_internal_blake2s_compress.c
@@ -4,6 +4,7 @@
static uint_least32_t
decode_uint32_le(const unsigned char *data)
{
+ /* This is perfectly optimised by the compiler */
return (((uint_least32_t)(data[0] & 255)) << 0) |
(((uint_least32_t)(data[1] & 255)) << 8) |
(((uint_least32_t)(data[2] & 255)) << 16) |
@@ -13,6 +14,7 @@ decode_uint32_le(const unsigned char *data)
static uint_least32_t
rotate_right(uint_least32_t x, int n)
{
+ /* This is perfectly optimised by the compiler */
return ((x >> n) | (x << (32 - n))) & UINT_LEAST32_C(0xFFFFffff);
}
diff --git a/test.c b/test.c
index a6e6475..0d97df0 100644
--- a/test.c
+++ b/test.c
@@ -490,6 +490,8 @@ main(void)
{
int failed = 0;
+ libblake_init();
+
CHECK_HEX(1, 00, 12, 32, 00, 45, 67, 82, 9A, B0, CD, FE, FF, 80, 08, CC, 28);
CHECK_HEX(0, 00, 12, 32, 00, 45, 67, 82, 9a, b0, cd, fe, ff, 80, 08, cc, 28);