diff options
author | Mattias Andrée <maandree@kth.se> | 2022-01-21 18:29:26 +0100 |
---|---|---|
committer | Mattias Andrée <maandree@kth.se> | 2022-01-21 18:29:26 +0100 |
commit | 839a3d17d257e73be9bc99dfa90e56c0824050ba (patch) | |
tree | 6bb010351447edbb0ae8d910948b01837d2de9e5 | |
parent | Fix memory corruption bug in test.c and simplify message byte-length calculation (diff) | |
download | libblake-839a3d17d257e73be9bc99dfa90e56c0824050ba.tar.gz libblake-839a3d17d257e73be9bc99dfa90e56c0824050ba.tar.bz2 libblake-839a3d17d257e73be9bc99dfa90e56c0824050ba.tar.xz |
Initial work on optimising compression function; mm128 version is slower, mm256 version is barely faster
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r-- | Makefile | 15 | ||||
-rw-r--r-- | common.h | 2 | ||||
-rw-r--r-- | config.mk | 9 | ||||
-rw-r--r-- | libblake.h | 17 | ||||
-rw-r--r-- | libblake_init.c | 27 | ||||
-rw-r--r-- | libblake_internal_blake2b_compress.c | 4 | ||||
-rw-r--r-- | libblake_internal_blake2b_compress_mm128.c | 153 | ||||
-rw-r--r-- | libblake_internal_blake2b_compress_mm256.c | 101 | ||||
-rw-r--r-- | libblake_internal_blake2s_compress.c | 2 | ||||
-rw-r--r-- | test.c | 2 |
10 files changed, 330 insertions, 2 deletions
@@ -67,14 +67,15 @@ OBJ_BLAKE2 =\ libblake_blake2xs_update.o\ libblake_internal_blake2b_compress.o\ libblake_internal_blake2s_compress.o\ - libblake_internal_blake2s_output_digest.o\ libblake_internal_blake2b_output_digest.o\ + libblake_internal_blake2s_output_digest.o\ libblake_internal_blake2xb_init0.o\ libblake_internal_blake2xs_init0.o OBJ =\ libblake_encode_hex.o\ libblake_decode_hex.o\ + libblake_init.o\ $(OBJ_BLAKE)\ $(OBJ_BLAKE2) @@ -96,6 +97,18 @@ test.o: $(HDR) .c.lo: $(CC) -fPIC -c -o $@ $< $(CFLAGS) $(CPPFLAGS) +libblake_internal_blake2b_compress_mm128.o: libblake_internal_blake2b_compress_mm128.c $(HDR) + $(CC) -c -o $@ $(@:.o=.c) $(CFLAGS) $(CPPFLAGS) $(CFLAGS_MM128) + +libblake_internal_blake2b_compress_mm128.lo: libblake_internal_blake2b_compress_mm128.c $(HDR) + $(CC) -c -o $@ $(@:.lo=.c) $(CFLAGS) $(CPPFLAGS) $(CFLAGS_MM128) + +libblake_internal_blake2b_compress_mm256.o: libblake_internal_blake2b_compress_mm256.c $(HDR) + $(CC) -c -o $@ $(@:.o=.c) $(CFLAGS) $(CPPFLAGS) $(CFLAGS_MM256) + +libblake_internal_blake2b_compress_mm256.lo: libblake_internal_blake2b_compress_mm256.c $(HDR) + $(CC) -c -o $@ $(@:.lo=.c) $(CFLAGS) $(CPPFLAGS) $(CFLAGS_MM256) + test: test.o libblake.a $(CC) -o $@ test.o libblake.a $(LDFLAGS) @@ -77,6 +77,8 @@ HIDDEN void libblake_internal_blakeb_digest(struct libblake_blakeb_state *state, HIDDEN void libblake_internal_blake2s_compress(struct libblake_blake2s_state *state, const unsigned char *data); HIDDEN void libblake_internal_blake2b_compress(struct libblake_blake2b_state *state, const unsigned char *data); +/* HIDDEN void libblake_internal_blake2b_compress_mm128_init(void); */ +/* HIDDEN void libblake_internal_blake2b_compress_mm256_init(void); */ HIDDEN void libblake_internal_blake2xs_init0(struct libblake_blake2xs_state *state, const struct libblake_blake2xs_params *params); HIDDEN void libblake_internal_blake2xb_init0(struct libblake_blake2xb_state *state, const struct libblake_blake2xb_params *params); @@ -1,8 +1,15 @@ PREFIX = /usr MANPREFIX = $(PREFIX)/share/man -CC = c99 +CC = cc -std=c11 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -D_GNU_SOURCE CFLAGS = -Wall -O3 LDFLAGS = -s + +# These optimisations may not only break compatibility with +# processors that the software was not compiled on, but they +# will infact also degrade performance. Therefore they are +# only only used for specific translation units. +CFLAGS_MM128 = -msse4.1 -mavx2 +CFLAGS_MM256 = -msse4.1 -mavx2 @@ -27,6 +27,21 @@ # define LIBBLAKE_CONST__ #endif +#if defined(__STDC_VERSION__) +# if __STDC_VERSION__ >= 201112L +# define LIBBLAKE_ALIGNED__(BYTES) _Alignas(BYTES) +# endif +#endif +#ifndef LIBBLAKE_ALIGNED__ +# if defined(__GNUC__) +# define LIBBLAKE_ALIGNED__(BYTES) __attribute__((__aligned__(BYTES))) +# else +# define LIBBLAKE_ALIGNED__(BYTES) +# endif +#endif + + +LIBBLAKE_PUBLIC__ void libblake_init(void); LIBBLAKE_PUBLIC__ void libblake_encode_hex(const void *data, size_t n, char out[/* static n * 2 + 1 */], int uppercase); LIBBLAKE_PUBLIC__ size_t libblake_decode_hex(const char *data, size_t n, void *out, int *validp); @@ -140,12 +155,14 @@ struct libblake_blake2xb_params { }; struct libblake_blake2s_state { + LIBBLAKE_ALIGNED__(32) uint_least32_t h[8]; uint_least32_t t[2]; uint_least32_t f[2]; }; struct libblake_blake2b_state { + LIBBLAKE_ALIGNED__(32) uint_least64_t h[8]; uint_least64_t t[2]; uint_least64_t f[2]; diff --git a/libblake_init.c b/libblake_init.c new file mode 100644 index 0000000..ebf0e34 --- /dev/null +++ b/libblake_init.c @@ -0,0 +1,27 @@ +/* See LICENSE file for copyright and license details. */ +#include "common.h" +#include <stdatomic.h> + +#if defined(__GNUC__) +__attribute__((__constructor__)) /* ignored if statically linked, so this function shall + * by the application, we just use the constructor (init) + * attribute incase that is forgotten, as it will only + * improve performance, but the library with function + * perfectly fine even if it's not called */ +#endif +void +libblake_init(void) +{ + static volatile int initialised = 0; + static volatile atomic_flag spinlock = ATOMIC_FLAG_INIT; + + while (atomic_flag_test_and_set(&spinlock)); + + if (!initialised) { + /* libblake_internal_blake2b_compress_mm128_init(); */ + /* libblake_internal_blake2b_compress_mm256_init(); */ + initialised = 1; + } + + atomic_flag_clear(&spinlock); +} diff --git a/libblake_internal_blake2b_compress.c b/libblake_internal_blake2b_compress.c index e844180..d04a469 100644 --- a/libblake_internal_blake2b_compress.c +++ b/libblake_internal_blake2b_compress.c @@ -1,9 +1,12 @@ /* See LICENSE file for copyright and license details. */ #include "common.h" +/* This code performs suboptimally if compiled with -mavx2 */ + static uint_least64_t decode_uint64_le(const unsigned char *data) { + /* This is perfectly optimised by the compiler */ return (((uint_least64_t)(data[0] & 255)) << 0) | (((uint_least64_t)(data[1] & 255)) << 8) | (((uint_least64_t)(data[2] & 255)) << 16) | @@ -17,6 +20,7 @@ decode_uint64_le(const unsigned char *data) static uint_least64_t rotate_right(uint_least64_t x, int n) { + /* This is perfectly optimised by the compiler */ return ((x >> n) | (x << (64 - n))) & UINT_LEAST64_C(0xFFFFffffFFFFffff); } diff --git a/libblake_internal_blake2b_compress_mm128.c b/libblake_internal_blake2b_compress_mm128.c new file mode 100644 index 0000000..eec7eae --- /dev/null +++ b/libblake_internal_blake2b_compress_mm128.c @@ -0,0 +1,153 @@ +/* See LICENSE file for copyright and license details. */ +#include "common.h" +#include <immintrin.h> + +static __m128i ror24, ror16; + +static __m128i +load_m128i(size_t a, size_t b, __m128i vec[]) +{ + if (a & 1) { + if (b & 1) + return _mm_unpackhi_epi64(vec[a / 2], vec[b / 2]); + else + return _mm_shuffle_epi32(_mm_blend_epi32(vec[a / 2], vec[b / 2], 0x3), _MM_SHUFFLE(1, 0, 3, 2)); + } else { + if (a + 1 == b) + return vec[a / 2]; + else if (b & 1) + return _mm_blend_epi32(vec[b / 2], vec[a / 2], 0x3); + else + return _mm_unpacklo_epi64(vec[a / 2], vec[b / 2]); + } +} + +static __m128i +load_high_and_low(__m128i hi, __m128i lo) +{ + return _mm_shuffle_epi32(_mm_blend_epi32(hi, lo, 0x3), _MM_SHUFFLE(1, 0, 3, 2)); +} + +static void +store_high_low_and_low_high(__m128i *hip, __m128i *lop, __m128i val1, __m128i val2) +{ + *hip = load_high_and_low(val1, val2); + *lop = load_high_and_low(val2, val1); +} + +void +libblake_internal_blake2b_compress_mm128_init(void) +{ +#define X(A, B, C, D, E, F, G, H, P) (A + P), (B + P), (C + P), (D + P), (E + P), (F + P), (G + P), (H + P) + ror24 = _mm_setr_epi8(X(3, 4, 5, 6, 7, 0, 1, 2, 0), + X(3, 4, 5, 6, 7, 0, 1, 2, 8)); + ror16 = _mm_setr_epi8(X(2, 3, 4, 5, 6, 7, 0, 1, 0), + X(2, 3, 4, 5, 6, 7, 0, 1, 8)); +#undef X +} + +void +libblake_internal_blake2b_compress(struct libblake_blake2b_state *state, const unsigned char *data) +{ + static const uint_least64_t _Alignas(__m128i) initvec[] = { + UINT_LEAST64_C(0x6A09E667F3BCC908), UINT_LEAST64_C(0xBB67AE8584CAA73B), + UINT_LEAST64_C(0x3C6EF372FE94F82B), UINT_LEAST64_C(0xA54FF53A5F1D36F1), + UINT_LEAST64_C(0x510E527FADE682D1), UINT_LEAST64_C(0x9B05688C2B3E6C1F), + UINT_LEAST64_C(0x1F83D9ABFB41BD6B), UINT_LEAST64_C(0x5BE0CD19137E2179), + }; + __m128i v[8], mj, mk, t, f, h[4], m[8], x, y; + + t = _mm_load_si128((const __m128i *)state->t); + f = _mm_load_si128((const __m128i *)state->f); + v[0] = h[0] = _mm_load_si128((const __m128i *)&state->h[0]); + v[1] = h[1] = _mm_load_si128((const __m128i *)&state->h[2]); + v[2] = h[2] = _mm_load_si128((const __m128i *)&state->h[4]); + v[3] = h[3] = _mm_load_si128((const __m128i *)&state->h[6]); + v[4] = _mm_load_si128((const __m128i *)&initvec[0]); + v[5] = _mm_load_si128((const __m128i *)&initvec[2]); + v[6] = _mm_load_si128((const __m128i *)&initvec[4]); + v[7] = _mm_load_si128((const __m128i *)&initvec[6]); + v[6] = _mm_xor_si128(v[6], t); + v[7] = _mm_xor_si128(v[7], f); + + if (LIKELY((uintptr_t)data % 16 == 0)) { + m[0] = _mm_load_si128((const __m128i *)&data[0 * 16]); + m[1] = _mm_load_si128((const __m128i *)&data[1 * 16]); + m[2] = _mm_load_si128((const __m128i *)&data[2 * 16]); + m[3] = _mm_load_si128((const __m128i *)&data[3 * 16]); + m[4] = _mm_load_si128((const __m128i *)&data[4 * 16]); + m[5] = _mm_load_si128((const __m128i *)&data[5 * 16]); + m[6] = _mm_load_si128((const __m128i *)&data[6 * 16]); + m[7] = _mm_load_si128((const __m128i *)&data[7 * 16]); + } else { + m[0] = _mm_loadu_si128((const __m128i *)&data[0 * 16]); + m[1] = _mm_loadu_si128((const __m128i *)&data[1 * 16]); + m[2] = _mm_loadu_si128((const __m128i *)&data[2 * 16]); + m[3] = _mm_loadu_si128((const __m128i *)&data[3 * 16]); + m[4] = _mm_loadu_si128((const __m128i *)&data[4 * 16]); + m[5] = _mm_loadu_si128((const __m128i *)&data[5 * 16]); + m[6] = _mm_loadu_si128((const __m128i *)&data[6 * 16]); + m[7] = _mm_loadu_si128((const __m128i *)&data[7 * 16]); + } + +#define G2B(j1, k1, j2, k2, a, b, c, d, shift)\ + mj = load_m128i(j1, j2, m);\ + mk = load_m128i(k1, k2, m);\ + v[a] = _mm_add_epi64(v[a], v[b]);\ + v[a] = _mm_add_epi64(v[a], mj);\ + v[d] = _mm_xor_si128(v[d], v[a]);\ + v[d] = _mm_shuffle_epi32(v[d], _MM_SHUFFLE(2, 3, 0, 1));\ + v[c] = _mm_add_epi64(v[c], v[d]);\ + v[b] = _mm_xor_si128(v[b], v[c]);\ + v[b] = _mm_shuffle_epi8(v[b], ror24);\ + v[a] = _mm_add_epi64(v[a], v[b]);\ + v[a] = _mm_add_epi64(v[a], mk);\ + v[d] = _mm_xor_si128(v[d], v[a]);\ + v[d] = _mm_shuffle_epi8(v[d], ror16);\ + v[c] = _mm_add_epi64(v[c], v[d]);\ + v[b] = _mm_xor_si128(v[b], v[c]);\ + v[b] = _mm_xor_si128(_mm_srli_epi64(v[b], 63),\ + _mm_add_epi64(v[b], v[b])) + +#define ROUND2B(S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF)\ + G2B(S0, S1, S2, S3, 0, 2, 4, 6, 0);\ + G2B(S4, S5, S6, S7, 1, 3, 5, 7, 0);\ + x = v[2];\ + y = v[3];\ + v[2] = load_high_and_low(x, y);\ + v[3] = load_high_and_low(y, x);\ + x = v[6];\ + y = v[7];\ + v[6] = load_high_and_low(y, x);\ + v[7] = load_high_and_low(x, y);\ + G2B(S8, S9, SA, SB, 0, 2, 5, 6, 1);\ + G2B(SC, SD, SE, SF, 1, 3, 4, 7, 2);\ + x = v[2];\ + y = v[3];\ + store_high_low_and_low_high(&v[2], &v[3], y, x);\ + x = v[6];\ + y = v[7];\ + store_high_low_and_low_high(&v[7], &v[6], y, x) + + ROUND2B(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F); + ROUND2B(E, A, 4, 8, 9, F, D, 6, 1, C, 0, 2, B, 7, 5, 3); + ROUND2B(B, 8, C, 0, 5, 2, F, D, A, E, 3, 6, 7, 1, 9, 4); + ROUND2B(7, 9, 3, 1, D, C, B, E, 2, 6, 5, A, 4, 0, F, 8); + ROUND2B(9, 0, 5, 7, 2, 4, A, F, E, 1, B, C, 6, 8, 3, D); + ROUND2B(2, C, 6, A, 0, B, 8, 3, 4, D, 7, 5, F, E, 1, 9); + ROUND2B(C, 5, 1, F, E, D, 4, A, 0, 7, 6, 3, 9, 2, 8, B); + ROUND2B(D, B, 7, E, C, 1, 3, 9, 5, 0, F, 4, 8, 6, 2, A); + ROUND2B(6, F, E, 9, B, 3, 0, 8, C, 2, D, 7, 1, 4, A, 5); + ROUND2B(A, 2, 8, 4, 7, 6, 1, 5, F, B, 9, E, 3, C, D, 0); + ROUND2B(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F); + ROUND2B(E, A, 4, 8, 9, F, D, 6, 1, C, 0, 2, B, 7, 5, 3); + + v[0] = _mm_xor_si128(_mm_xor_si128(v[0], v[4]), h[0]); + v[1] = _mm_xor_si128(_mm_xor_si128(v[1], v[5]), h[1]); + v[2] = _mm_xor_si128(_mm_xor_si128(v[2], v[6]), h[2]); + v[3] = _mm_xor_si128(_mm_xor_si128(v[3], v[7]), h[3]); + _mm_store_si128((__m128i *)&state->h[0], v[0]); + _mm_store_si128((__m128i *)&state->h[2], v[1]); + _mm_store_si128((__m128i *)&state->h[4], v[2]); + _mm_store_si128((__m128i *)&state->h[6], v[3]); +} diff --git a/libblake_internal_blake2b_compress_mm256.c b/libblake_internal_blake2b_compress_mm256.c new file mode 100644 index 0000000..f8e6a09 --- /dev/null +++ b/libblake_internal_blake2b_compress_mm256.c @@ -0,0 +1,101 @@ +/* See LICENSE file for copyright and license details. */ +#include "common.h" +#include <immintrin.h> + +static __m256i ror24, ror16; + +static __m256i +load_m256i(size_t a, size_t b, size_t c, size_t d, const uint_least64_t vec[]) +{ + return _mm256_set_epi64x((int_least64_t)vec[d], (int_least64_t)vec[c], + (int_least64_t)vec[b], (int_least64_t)vec[a]); +} + +void +libblake_internal_blake2b_compress_mm256_init(void) +{ +#define X(A, B, C, D, E, F, G, H, P) (A + P), (B + P), (C + P), (D + P), (E + P), (F + P), (G + P), (H + P) + ror24 = _mm256_setr_epi8(X(3, 4, 5, 6, 7, 0, 1, 2, 0), + X(3, 4, 5, 6, 7, 0, 1, 2, 8), + X(3, 4, 5, 6, 7, 0, 1, 2, 16), + X(3, 4, 5, 6, 7, 0, 1, 2, 24)); + ror16 = _mm256_setr_epi8(X(2, 3, 4, 5, 6, 7, 0, 1, 0), + X(2, 3, 4, 5, 6, 7, 0, 1, 8), + X(2, 3, 4, 5, 6, 7, 0, 1, 16), + X(2, 3, 4, 5, 6, 7, 0, 1, 24)); +#undef X +} + +void +libblake_internal_blake2b_compress(struct libblake_blake2b_state *state, const unsigned char *data) +{ + static const uint_least64_t _Alignas(__m256i) initvec[] = { + UINT_LEAST64_C(0x6A09E667F3BCC908), UINT_LEAST64_C(0xBB67AE8584CAA73B), + UINT_LEAST64_C(0x3C6EF372FE94F82B), UINT_LEAST64_C(0xA54FF53A5F1D36F1), + UINT_LEAST64_C(0x510E527FADE682D1), UINT_LEAST64_C(0x9B05688C2B3E6C1F), + UINT_LEAST64_C(0x1F83D9ABFB41BD6B), UINT_LEAST64_C(0x5BE0CD19137E2179), + }; + __m256i v[4], mj, mk, tf, h[2]; + + tf = _mm256_load_si256((const __m256i *)state->t); + v[0] = h[0] = _mm256_load_si256((const __m256i *)&state->h[0]); + v[1] = h[1] = _mm256_load_si256((const __m256i *)&state->h[4]); + v[2] = _mm256_load_si256((const __m256i *)&initvec[0]); + v[3] = _mm256_load_si256((const __m256i *)&initvec[4]); + v[3] = _mm256_xor_si256(v[3], tf); + +#define G2B(j1, k1, j2, k2, j3, k3, j4, k4, shift)\ + do {\ + mj = load_m256i(j1, j2, j3, j4, (const void *)data);\ + mk = load_m256i(k1, k2, k3, k4, (const void *)data);\ + if (shift) {\ + v[1] = _mm256_permute4x64_epi64(v[1], _MM_SHUFFLE(0, 3, 2, 1));\ + v[2] = _mm256_permute4x64_epi64(v[2], _MM_SHUFFLE(1, 0, 3, 2));\ + v[3] = _mm256_permute4x64_epi64(v[3], _MM_SHUFFLE(2, 1, 0, 3));\ + }\ + v[0] = _mm256_add_epi64(v[0], v[1]);\ + v[0] = _mm256_add_epi64(v[0], mj);\ + v[3] = _mm256_xor_si256(v[3], v[0]);\ + v[3] = _mm256_shuffle_epi32(v[3], _MM_SHUFFLE(2, 3, 0, 1));\ + v[2] = _mm256_add_epi64(v[2], v[3]);\ + v[1] = _mm256_xor_si256(v[1], v[2]);\ + v[1] = _mm256_shuffle_epi8(v[1], ror24);\ + v[0] = _mm256_add_epi64(v[0], v[1]);\ + v[0] = _mm256_add_epi64(v[0], mk);\ + v[3] = _mm256_xor_si256(v[3], v[0]);\ + v[3] = _mm256_shuffle_epi8(v[3], ror16);\ + v[2] = _mm256_add_epi64(v[2], v[3]);\ + v[1] = _mm256_xor_si256(v[1], v[2]);\ + v[1] = _mm256_xor_si256(_mm256_srli_epi64(v[1], 63),\ + _mm256_add_epi64(v[1], v[1]));\ + if (shift) {\ + v[1] = _mm256_permute4x64_epi64(v[1], _MM_SHUFFLE(2, 1, 0, 3));\ + v[2] = _mm256_permute4x64_epi64(v[2], _MM_SHUFFLE(1, 0, 3, 2));\ + v[3] = _mm256_permute4x64_epi64(v[3], _MM_SHUFFLE(0, 3, 2, 1));\ + }\ + } while (0) + +#define ROUND2B(S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, SA, SB, SC, SD, SE, SF)\ + G2B(S0, S1, S2, S3, S4, S5, S6, S7, 0);\ + G2B(S8, S9, SA, SB, SC, SD, SE, SF, 1) + + ROUND2B(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F); + ROUND2B(E, A, 4, 8, 9, F, D, 6, 1, C, 0, 2, B, 7, 5, 3); + ROUND2B(B, 8, C, 0, 5, 2, F, D, A, E, 3, 6, 7, 1, 9, 4); + ROUND2B(7, 9, 3, 1, D, C, B, E, 2, 6, 5, A, 4, 0, F, 8); + ROUND2B(9, 0, 5, 7, 2, 4, A, F, E, 1, B, C, 6, 8, 3, D); + ROUND2B(2, C, 6, A, 0, B, 8, 3, 4, D, 7, 5, F, E, 1, 9); + ROUND2B(C, 5, 1, F, E, D, 4, A, 0, 7, 6, 3, 9, 2, 8, B); + ROUND2B(D, B, 7, E, C, 1, 3, 9, 5, 0, F, 4, 8, 6, 2, A); + ROUND2B(6, F, E, 9, B, 3, 0, 8, C, 2, D, 7, 1, 4, A, 5); + ROUND2B(A, 2, 8, 4, 7, 6, 1, 5, F, B, 9, E, 3, C, D, 0); + ROUND2B(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F); + ROUND2B(E, A, 4, 8, 9, F, D, 6, 1, C, 0, 2, B, 7, 5, 3); + + v[0] = _mm256_xor_si256(v[0], v[2]); + v[1] = _mm256_xor_si256(v[1], v[3]); + v[0] = _mm256_xor_si256(v[0], h[0]); + v[1] = _mm256_xor_si256(v[1], h[1]); + _mm256_store_si256((__m256i *)&state->h[0], v[0]); + _mm256_store_si256((__m256i *)&state->h[4], v[1]); +} diff --git a/libblake_internal_blake2s_compress.c b/libblake_internal_blake2s_compress.c index 37a61eb..c41876a 100644 --- a/libblake_internal_blake2s_compress.c +++ b/libblake_internal_blake2s_compress.c @@ -4,6 +4,7 @@ static uint_least32_t decode_uint32_le(const unsigned char *data) { + /* This is perfectly optimised by the compiler */ return (((uint_least32_t)(data[0] & 255)) << 0) | (((uint_least32_t)(data[1] & 255)) << 8) | (((uint_least32_t)(data[2] & 255)) << 16) | @@ -13,6 +14,7 @@ decode_uint32_le(const unsigned char *data) static uint_least32_t rotate_right(uint_least32_t x, int n) { + /* This is perfectly optimised by the compiler */ return ((x >> n) | (x << (32 - n))) & UINT_LEAST32_C(0xFFFFffff); } @@ -490,6 +490,8 @@ main(void) { int failed = 0; + libblake_init(); + CHECK_HEX(1, 00, 12, 32, 00, 45, 67, 82, 9A, B0, CD, FE, FF, 80, 08, CC, 28); CHECK_HEX(0, 00, 12, 32, 00, 45, 67, 82, 9a, b0, cd, fe, ff, 80, 08, cc, 28); |