From 7897170e09aa19122053ff24797b4d7c23f47cbc Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Sun, 15 Sep 2024 11:57:21 +0200 Subject: Optimisation for w=8,16,32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mattias Andrée --- TODO | 4 - common.h | 27 +++ digest.c | 433 +++++++++++++++++++++++++++++++----- extra/libkeccak_state_marshal.c | 4 +- extra/libkeccak_state_unmarshal.c | 4 +- extra/libkeccak_state_wipe_sponge.c | 2 +- libkeccak.h | 23 +- libkeccak/extra.h | 2 +- libkeccak_state_initialise.c | 4 +- 9 files changed, 434 insertions(+), 69 deletions(-) diff --git a/TODO b/TODO index ec8eda0..a498582 100644 --- a/TODO +++ b/TODO @@ -13,7 +13,3 @@ Add libkeccak_cshakesum_fd (TODO in libkeccak/cshake.h) Add KMAC and KMACXOF Add TupleHash and TupleHashXOF Add ParallelHash and ParallelHashXOF - -Optimise for wordsize 32 -Optimise for wordsize 16 -Optimise for wordsize 8 diff --git a/common.h b/common.h index 96df9e7..44a0b5d 100644 --- a/common.h +++ b/common.h @@ -37,6 +37,33 @@ */ #define COMMA , +/** + * X-macro-enabled listing of all integers in [0, 0] + * + * @param X(int) The macro to expand 1 time + * @param D Code to insert between each expansion of `X` + */ +#define LIST_1(X, D)\ + X(0) + +/** + * X-macro-enabled listing of all integers in [0, 1] + * + * @param X(int) The macro to expand 2 times + * @param D Code to insert between each expansion of `X` + */ +#define LIST_2(X, D)\ + X(0) D X(1) + +/** + * X-macro-enabled listing of all integers in [0, 3] + * + * @param X(int) The macro to expand 4 times + * @param D Code to insert between each expansion of `X` + */ +#define LIST_4(X, D)\ + X(0) D X(1) D X(2) D X(3) + /** * X-macro-enabled listing of all intergers in [0, 4] * diff --git a/digest.c b/digest.c index 7ee744a..384d9df 100644 --- a/digest.c +++ b/digest.c @@ -12,26 +12,63 @@ static const long int LANE_TRANSPOSE_MAP[] = { LIST_25(X, COMMA) }; /** - * Keccak-f round constants - */
-static const uint_fast64_t RC[] = { - 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, 0x8000000080008000ULL, - 0x000000000000808BULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL, - 0x000000000000008AULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL, - 0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, 0x8000000000008003ULL, - 0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800AULL, 0x800000008000000AULL, - 0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL + * 64-bit Keccak-f round constants + */ +static const uint64_t rc64[] = { + UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082), UINT64_C(0x800000000000808A), UINT64_C(0x8000000080008000), + UINT64_C(0x000000000000808B), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009), + UINT64_C(0x000000000000008A), UINT64_C(0x0000000000000088), UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000A), + UINT64_C(0x000000008000808B), UINT64_C(0x800000000000008B), UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003), + UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080), UINT64_C(0x000000000000800A), UINT64_C(0x800000008000000A), + UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008) +}; + + +/** + * 32-bit Keccak-f round constants + */ +static const uint32_t rc32[] = { + UINT32_C(0x00000001), UINT32_C(0x00008082), UINT32_C(0x0000808A), UINT32_C(0x80008000), + UINT32_C(0x0000808B), UINT32_C(0x80000001), UINT32_C(0x80008081), UINT32_C(0x00008009), + UINT32_C(0x0000008A), UINT32_C(0x00000088), UINT32_C(0x80008009), UINT32_C(0x8000000A), + UINT32_C(0x8000808B), UINT32_C(0x0000008B), UINT32_C(0x00008089), UINT32_C(0x00008003), + UINT32_C(0x00008002), UINT32_C(0x00000080), UINT32_C(0x0000800A), UINT32_C(0x8000000A), + UINT32_C(0x80008081), UINT32_C(0x00008080) +}; + + 
+/** + * 16-bit Keccak-f round constants + */ +static const uint16_t rc16[] = { + UINT16_C(0x0001), UINT16_C(0x8082), UINT16_C(0x808A), UINT16_C(0x8000), + UINT16_C(0x808B), UINT16_C(0x0001), UINT16_C(0x8081), UINT16_C(0x8009), + UINT16_C(0x008A), UINT16_C(0x0088), UINT16_C(0x8009), UINT16_C(0x000A), + UINT16_C(0x808B), UINT16_C(0x008B), UINT16_C(0x8089), UINT16_C(0x8003), + UINT16_C(0x8002), UINT16_C(0x0080), UINT16_C(0x800A), UINT16_C(0x000A) +}; + + +/** + * 8-bit Keccak-f round constants + */ +static const uint8_t rc8[] = { + UINT8_C(0x01), UINT8_C(0x82), UINT8_C(0x8A), UINT8_C(0x00), + UINT8_C(0x8B), UINT8_C(0x01), UINT8_C(0x81), UINT8_C(0x09), + UINT8_C(0x8A), UINT8_C(0x88), UINT8_C(0x09), UINT8_C(0x0A), + UINT8_C(0x8B), UINT8_C(0x8B), UINT8_C(0x89), UINT8_C(0x03), + UINT8_C(0x02), UINT8_C(0x80) }; /** * Rotate a word * - * @param x:int_fast64_t The value to rotate - * @param n:long Rotation steps, may be zero mod `w` - * @param w:long `state->w` - * @param wmod:int_fast64_t `state->wmod` - * @return :int_fast64_t The value rotated + * @param x:uint_fast64_t The value to rotate + * @param n:long int Rotation steps, may be zero mod `w` + * @param w:long int `state->w` + * @param wmod:uint_fast64_t `state->wmod` + * @return :uint_fast64_t The value rotated */ #define rotate(x, n, w, wmod) ((((x) >> ((w) - ((n) % (w)))) | ((x) << ((n) % (w)))) & (wmod)) @@ -39,11 +76,41 @@ static const uint_fast64_t RC[] = { /** * Rotate a 64-bit word * - * @param x:int_fast64_t The value to rotate - * @param n:long Rotation steps, may not be zero - * @return :int_fast64_t The value rotated + * @param x:uint64_t The value to rotate + * @param n:long int Rotation steps, may not be zero + * @return :uint64_t The value rotated + */ +#define rotate64(x, n) ((uint64_t)(((uint64_t)(x) >> (64L - (n))) | ((uint64_t)(x) << (n)))) + + +/** + * Rotate a 32-bit word + * + * @param x:uint32_t The value to rotate + * @param n:long int Rotation steps, may not be zero + * @return :uint32_t The 
value rotated + */ +#define rotate32(x, n) ((uint32_t)(((uint32_t)(x) >> (32L - (n))) | ((uint32_t)(x) << (n)))) + + +/** + * Rotate a 16-bit word + * + * @param x:uint16_t The value to rotate + * @param n:long int Rotation steps, may not be zero + * @return :uint16_t The value rotated */ -#define rotate64(x, n) ((int_fast64_t)(((uint64_t)(x) >> (64L - (n))) | ((uint64_t)(x) << (n)))) +#define rotate16(x, n) ((uint16_t)(((uint16_t)(x) >> (16L - (n))) | ((uint16_t)(x) << (n)))) + + +/** + * Rotate an 8-bit word + * + * @param x:uint8_t The value to rotate + * @param n:long int Rotation steps in [0, 7]; zero is allowed since the operand is promoted to `int` before shifting + * @return :uint8_t The value rotated + */ +#define rotate8(x, n) ((uint8_t)(((uint8_t)(x) >> (8L - (n))) | ((uint8_t)(x) << (n)))) /** @@ -54,13 +121,13 @@ static const uint_fast64_t RC[] = { */ LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __hot__))) static void -libkeccak_f_round(register struct libkeccak_state *state, register int_fast64_t rc) +libkeccak_f_round(register struct libkeccak_state *state, register uint64_t rc) { - int_fast64_t *restrict A = state->S; - int_fast64_t B[25]; - int_fast64_t C[5]; - int_fast64_t da, db, dc, dd, de; - int_fast64_t wmod = state->wmod; + uint64_t *restrict A = state->S.w64; + uint_fast64_t B[25]; + uint_fast64_t C[5]; + uint_fast64_t da, db, dc, dd, de; + uint_fast64_t wmod = state->wmod; long int w = state->w; /* θ step (step 1 of 3).
*/ @@ -102,12 +169,10 @@ libkeccak_f_round(register struct libkeccak_state *state, register int_fast64_t */ LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __hot__))) static void -libkeccak_f_round64(register struct libkeccak_state *state, register int_fast64_t rc) +libkeccak_f_round64(register struct libkeccak_state *state, register uint64_t rc) { - int_fast64_t *restrict A = state->S; - int_fast64_t B[25]; - int_fast64_t C[5]; - int_fast64_t da, db, dc, dd, de; + uint64_t *restrict A = state->S.w64; + uint64_t B[25], C[5], da, db, dc, dd, de; /* θ step (step 1 of 3). */ #define X(N) C[N] = A[N * 5] ^ A[N * 5 + 1] ^ A[N * 5 + 2] ^ A[N * 5 + 3] ^ A[N * 5 + 4] @@ -140,6 +205,138 @@ libkeccak_f_round64(register struct libkeccak_state *state, register int_fast64_ } +/** + * 32-bit word version of `libkeccak_f_round` + * + * @param state The hashing state + * @param rc The round constant for this round + */ +LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __hot__))) +static void +libkeccak_f_round32(register struct libkeccak_state *state, register uint32_t rc) +{ + uint32_t *restrict A = state->S.w32; + uint32_t B[25], C[5], da, db, dc, dd, de; + + /* θ step (step 1 of 3). */ +#define X(N) C[N] = A[N * 5] ^ A[N * 5 + 1] ^ A[N * 5 + 2] ^ A[N * 5 + 3] ^ A[N * 5 + 4] + LIST_5(X, ;); +#undef X + + /* θ step (step 2 of 3). */ + da = C[4] ^ rotate32(C[1], 1); + dd = C[2] ^ rotate32(C[4], 1); + db = C[0] ^ rotate32(C[2], 1); + de = C[3] ^ rotate32(C[0], 1); + dc = C[1] ^ rotate32(C[3], 1); + + /* ρ and π steps, with last two part of θ.
*/ +#define X(bi, ai, dv, r) B[bi] = rotate32(A[ai] ^ dv, (r & 31)) + B[0] = A[0] ^ da; X( 1, 15, dd, 28); X( 2, 5, db, 1); X( 3, 20, de, 27); X( 4, 10, dc, 62); + X( 5, 6, db, 44); X( 6, 21, de, 20); X( 7, 11, dc, 6); X( 8, 1, da, 36); X( 9, 16, dd, 55); + X(10, 12, dc, 43); X(11, 2, da, 3); X(12, 17, dd, 25); X(13, 7, db, 10); X(14, 22, de, 39); + X(15, 18, dd, 21); X(16, 8, db, 45); X(17, 23, de, 8); X(18, 13, dc, 15); X(19, 3, da, 41); + X(20, 24, de, 14); X(21, 14, dc, 61); X(22, 4, da, 18); X(23, 19, dd, 56); X(24, 9, db, 2); +#undef X + + /* ξ step. */ +#define X(N) A[N] = (uint32_t)(B[N] ^ ((~(B[(N + 5) % 25])) & B[(N + 10) % 25])) + LIST_25(X, ;); +#undef X + + /* ι step. */ + A[0] ^= rc; +} + + +/** + * 16-bit word version of `libkeccak_f_round` + * + * @param state The hashing state + * @param rc The round constant for this round + */ +LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __hot__))) +static void +libkeccak_f_round16(register struct libkeccak_state *state, register uint16_t rc) +{ + uint16_t *restrict A = state->S.w16; + uint16_t B[25], C[5], da, db, dc, dd, de; + + /* θ step (step 1 of 3). */ +#define X(N) C[N] = A[N * 5] ^ A[N * 5 + 1] ^ A[N * 5 + 2] ^ A[N * 5 + 3] ^ A[N * 5 + 4] + LIST_5(X, ;); +#undef X + + /* θ step (step 2 of 3). */ + da = C[4] ^ rotate16(C[1], 1); + dd = C[2] ^ rotate16(C[4], 1); + db = C[0] ^ rotate16(C[2], 1); + de = C[3] ^ rotate16(C[0], 1); + dc = C[1] ^ rotate16(C[3], 1); + + /* ρ and π steps, with last two part of θ.
*/ +#define X(bi, ai, dv, r) B[bi] = rotate16(A[ai] ^ dv, (r & 15)) + B[0] = A[0] ^ da; X( 1, 15, dd, 28); X( 2, 5, db, 1); X( 3, 20, de, 27); X( 4, 10, dc, 62); + X( 5, 6, db, 44); X( 6, 21, de, 20); X( 7, 11, dc, 6); X( 8, 1, da, 36); X( 9, 16, dd, 55); + X(10, 12, dc, 43); X(11, 2, da, 3); X(12, 17, dd, 25); X(13, 7, db, 10); X(14, 22, de, 39); + X(15, 18, dd, 21); X(16, 8, db, 45); X(17, 23, de, 8); X(18, 13, dc, 15); X(19, 3, da, 41); + X(20, 24, de, 14); X(21, 14, dc, 61); X(22, 4, da, 18); X(23, 19, dd, 56); X(24, 9, db, 2); +#undef X + + /* ξ step. */ +#define X(N) A[N] = (uint16_t)(B[N] ^ ((~(B[(N + 5) % 25])) & B[(N + 10) % 25])) + LIST_25(X, ;); +#undef X + + /* ι step. */ + A[0] ^= rc; +} + + +/** + * 8-bit word version of `libkeccak_f_round` + * + * @param state The hashing state + * @param rc The round constant for this round + */ +LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __hot__))) +static void +libkeccak_f_round8(register struct libkeccak_state *state, register uint8_t rc) +{ + uint8_t *restrict A = state->S.w8; + uint8_t B[25], C[5], da, db, dc, dd, de; + + /* θ step (step 1 of 3). */ +#define X(N) C[N] = A[N * 5] ^ A[N * 5 + 1] ^ A[N * 5 + 2] ^ A[N * 5 + 3] ^ A[N * 5 + 4] + LIST_5(X, ;); +#undef X + + /* θ step (step 2 of 3). */ + da = C[4] ^ rotate8(C[1], 1); + dd = C[2] ^ rotate8(C[4], 1); + db = C[0] ^ rotate8(C[2], 1); + de = C[3] ^ rotate8(C[0], 1); + dc = C[1] ^ rotate8(C[3], 1); + + /* ρ and π steps, with last two part of θ.
*/ +#define X(bi, ai, dv, r) B[bi] = rotate8(A[ai] ^ dv, (r & 7)) + B[0] = A[0] ^ da; X( 1, 15, dd, 28); X( 2, 5, db, 1); X( 3, 20, de, 27); X( 4, 10, dc, 62); + X( 5, 6, db, 44); X( 6, 21, de, 20); X( 7, 11, dc, 6); X( 8, 1, da, 36); X( 9, 16, dd, 55); + X(10, 12, dc, 43); X(11, 2, da, 3); X(12, 17, dd, 25); X(13, 7, db, 10); X(14, 22, de, 39); + X(15, 18, dd, 21); X(16, 8, db, 45); X(17, 23, de, 8); X(18, 13, dc, 15); X(19, 3, da, 41); + X(20, 24, de, 14); X(21, 14, dc, 61); X(22, 4, da, 18); X(23, 19, dd, 56); X(24, 9, db, 2); +#undef X + + /* ξ step. */ +#define X(N) A[N] = (uint8_t)(B[N] ^ ((~(B[(N + 5) % 25])) & B[(N + 10) % 25])) + LIST_25(X, ;); +#undef X + + /* ι step. */ + A[0] ^= rc; +} + + /** * Convert a chunk of bytes to a lane * @@ -151,14 +348,34 @@ libkeccak_f(register struct libkeccak_state *state) { register long int i = 0; register long int nr = state->nr; - register long int wmod = state->wmod; + register uint_fast64_t wmod = state->wmod; + if (nr == 24) { for (; i < nr; i++) - libkeccak_f_round64(state, (int_fast64_t)(RC[i])); - } else { + libkeccak_f_round64(state, rc64[i]); + return; + } + + if (nr == 22) { for (; i < nr; i++) - libkeccak_f_round(state, (int_fast64_t)(RC[i] & (uint_fast64_t)wmod)); + libkeccak_f_round32(state, rc32[i]); + return; } + + if (nr == 20) { + for (; i < nr; i++) + libkeccak_f_round16(state, rc16[i]); + return; + } + + if (nr == 18) { + for (; i < nr; i++) + libkeccak_f_round8(state, rc8[i]); + return; + } + + for (; i < nr; i++) + libkeccak_f_round(state, rc64[i] & wmod); } @@ -173,18 +390,18 @@ libkeccak_f(register struct libkeccak_state *state) * @return The lane */ LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __pure__, __warn_unused_result__, __gnu_inline__))) -static inline int_fast64_t +static inline uint64_t libkeccak_to_lane(register const unsigned char *restrict message, register size_t msglen, register long int rr, register long int ww, size_t off) { register long int n = (long)((msglen < 
(size_t)rr ? msglen : (size_t)rr) - off); - int_fast64_t rc = 0; + uint_fast64_t rc = 0; message += off; while (ww--) { rc <<= 8; - rc |= __builtin_expect(ww < n, 1) ? (int_fast64_t)(unsigned char)(message[ww]) : 0L; + rc |= __builtin_expect(ww < n, 1) ? (uint_fast64_t)(unsigned char)message[ww] : 0L; } - return rc; + return (uint64_t)rc; } @@ -198,13 +415,13 @@ libkeccak_to_lane(register const unsigned char *restrict message, register size_ * @return The lane */ LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __pure__, __hot__, __warn_unused_result__, __gnu_inline__))) -static inline int_fast64_t +static inline uint64_t libkeccak_to_lane64(register const unsigned char *message, register size_t msglen, register long int rr, size_t off) { register long int n = (long)((msglen < (size_t)rr ? msglen : (size_t)rr) - off); - int_fast64_t rc = 0; + uint64_t rc = 0; message += off; -#define X(N) if (__builtin_expect(N < n, 1)) rc |= (int_fast64_t)(unsigned char)(message[N]) << (N * 8);\ +#define X(N) if (__builtin_expect(N < n, 1)) rc |= (uint64_t)message[N] << (N * 8);\ else return rc LIST_8(X, ;); #undef X @@ -212,6 +429,78 @@ libkeccak_to_lane64(register const unsigned char *message, register size_t msgle } +/** + * 32-bit lane version of `libkeccak_to_lane` + * + * @param message The message + * @param msglen The length of the message + * @param rr Bitrate in bytes + * @param off The offset in the message + * @return The lane + */ +LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __pure__, __hot__, __warn_unused_result__, __gnu_inline__))) +static inline uint32_t +libkeccak_to_lane32(register const unsigned char *message, register size_t msglen, register long int rr, size_t off) +{ + register long int n = (long)((msglen < (size_t)rr ? 
msglen : (size_t)rr) - off); + uint32_t rc = 0; + message += off; +#define X(N) if (__builtin_expect(N < n, 1)) rc |= (uint32_t)message[N] << (N * 8);\ + else return rc + LIST_4(X, ;); +#undef X + return rc; +} + + +/** + * 16-bit lane version of `libkeccak_to_lane` + * + * @param message The message + * @param msglen The length of the message + * @param rr Bitrate in bytes + * @param off The offset in the message + * @return The lane + */ +LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __pure__, __hot__, __warn_unused_result__, __gnu_inline__))) +static inline uint16_t +libkeccak_to_lane16(register const unsigned char *message, register size_t msglen, register long int rr, size_t off) +{ + register long int n = (long)((msglen < (size_t)rr ? msglen : (size_t)rr) - off); + uint16_t rc = 0; + message += off; +#define X(N) if (__builtin_expect(N < n, 1)) rc |= (uint16_t)message[N] << (N * 8);\ + else return rc + LIST_2(X, ;); +#undef X + return rc; +} + + +/** + * 8-bit lane version of `libkeccak_to_lane` + * + * @param message The message + * @param msglen The length of the message + * @param rr Bitrate in bytes + * @param off The offset in the message + * @return The lane + */ +LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__, __pure__, __hot__, __warn_unused_result__, __gnu_inline__))) +static inline uint8_t +libkeccak_to_lane8(register const unsigned char *message, register size_t msglen, register long int rr, size_t off) +{ + register long int n = (long)((msglen < (size_t)rr ? 
msglen : (size_t)rr) - off); + uint8_t rc = 0; + message += off; +#define X(N) if (__builtin_expect(N < n, 1)) rc |= (uint8_t)(unsigned char)message[N] << (N * 8);\ + else return (uint8_t)rc + LIST_1(X, ;); +#undef X + return (uint8_t)rc; +} + + /** * Right-pad message with a 10*1-pad * @@ -263,24 +552,62 @@ libkeccak_absorption_phase(register struct libkeccak_state *restrict state, register long int rr = state->r >> 3; register long int ww = state->w >> 3; register long int n = (long)len / rr; - if (__builtin_expect(ww >= 8, 1)) { /* ww > 8 is impossible, it is just for optimisation possibilities. */ + + if (__builtin_expect(ww == 8, 1)) { while (n--) { -#define X(N) state->S[N] ^= libkeccak_to_lane64(message, len, rr, (size_t)(LANE_TRANSPOSE_MAP[N] * 8)) +#define X(N) state->S.w64[N] ^= libkeccak_to_lane64(message, len, rr, (size_t)(LANE_TRANSPOSE_MAP[N] * 8)) LIST_25(X, ;); #undef X libkeccak_f(state); message += (size_t)rr; len -= (size_t)rr; } - } else { + return; + } + + if (__builtin_expect(ww == 4, 1)) { + while (n--) { +#define X(N) state->S.w32[N] ^= libkeccak_to_lane32(message, len, rr, (size_t)(LANE_TRANSPOSE_MAP[N] * 4)) + LIST_25(X, ;); +#undef X + libkeccak_f(state); + message += (size_t)rr; + len -= (size_t)rr; + } + return; + } + + if (__builtin_expect(ww == 2, 1)) { + while (n--) { +#define X(N) state->S.w16[N] ^= libkeccak_to_lane16(message, len, rr, (size_t)(LANE_TRANSPOSE_MAP[N] * 2)) + LIST_25(X, ;); +#undef X + libkeccak_f(state); + message += (size_t)rr; + len -= (size_t)rr; + } + return; + } + + if (__builtin_expect(ww == 1, 1)) { while (n--) { -#define X(N) state->S[N] ^= libkeccak_to_lane(message, len, rr, ww, (size_t)(LANE_TRANSPOSE_MAP[N] * ww)) +#define X(N) state->S.w8[N] ^= libkeccak_to_lane8(message, len, rr, (size_t)(LANE_TRANSPOSE_MAP[N] * 1)) LIST_25(X, ;); #undef X libkeccak_f(state); message += (size_t)rr; len -= (size_t)rr; } + return; + } + + while (n--) { +#define X(N) state->S.w64[N] ^= libkeccak_to_lane(message, len, rr, 
ww, (size_t)(LANE_TRANSPOSE_MAP[N] * ww)) + LIST_25(X, ;); +#undef X + libkeccak_f(state); + message += (size_t)rr; + len -= (size_t)rr; } } @@ -299,16 +626,20 @@ static void libkeccak_squeezing_phase(register struct libkeccak_state *restrict state, long int rr, long int nn, long int ww, register unsigned char *restrict hashsum) { - register int_fast64_t v; + register uint64_t v; register long int ni = rr / ww + !!(rr % ww); auto long int olen = state->n; auto long int i, j = 0; register long int k; while (olen > 0) { for (i = 0; i < ni && j < nn; i++) { - v = state->S[LANE_TRANSPOSE_MAP[i]]; + if (__builtin_expect(ww == 8, 1)) v = state->S.w64[LANE_TRANSPOSE_MAP[i]]; else + if (__builtin_expect(ww == 4, 1)) v = state->S.w32[LANE_TRANSPOSE_MAP[i]]; else + if (__builtin_expect(ww == 2, 1)) v = state->S.w16[LANE_TRANSPOSE_MAP[i]]; else + if (__builtin_expect(ww == 1, 1)) v = state->S.w8[LANE_TRANSPOSE_MAP[i]]; else + v = state->S.w64[LANE_TRANSPOSE_MAP[i]]; for (k = 0; k++ < ww && j++ < nn; v >>= 8) - *hashsum++ = (unsigned char)v; + *hashsum++ = (unsigned char)(v & 0xFFU); } olen -= state->r; if (olen > 0) @@ -657,13 +988,7 @@ libkeccak_simple_squeeze(register struct libkeccak_state *state, register long i * @param state The hashing state * @param times The number of digests */ -void -libkeccak_fast_squeeze(register struct libkeccak_state *state, register long int times) -{ - times *= (state->n - 1) / state->r + 1; - while (times--) - libkeccak_f(state); -} +extern inline void libkeccak_fast_squeeze(register struct libkeccak_state *state, register long int times); /** diff --git a/extra/libkeccak_state_marshal.c b/extra/libkeccak_state_marshal.c index 7541164..f4fd4b5 100644 --- a/extra/libkeccak_state_marshal.c +++ b/extra/libkeccak_state_marshal.c @@ -28,7 +28,7 @@ libkeccak_state_marshal(const struct libkeccak_state *restrict state, void *rest if (!data) { return 7 * sizeof(long int) + - 1 * sizeof(int64_t) + + 1 * sizeof(uint64_t) + sizeof(state->S) + 2 * 
sizeof(size_t) + state->mptr; @@ -42,7 +42,7 @@ libkeccak_state_marshal(const struct libkeccak_state *restrict state, void *rest set(wmod); set(l); set(nr); - __builtin_memcpy(data, state->S, sizeof(state->S)); + __builtin_memcpy(data, &state->S, sizeof(state->S)); data += sizeof(state->S); set(mptr); set(mlen); diff --git a/extra/libkeccak_state_unmarshal.c b/extra/libkeccak_state_unmarshal.c index 4714566..0938ba1 100644 --- a/extra/libkeccak_state_unmarshal.c +++ b/extra/libkeccak_state_unmarshal.c @@ -30,7 +30,7 @@ libkeccak_state_unmarshal(struct libkeccak_state *restrict state, const void *re if (!state) { data += 7U * sizeof(long int); - data += 1U * sizeof(int64_t); + data += 1U * sizeof(uint64_t); data += sizeof(state->S); mptr = *(const size_t *)data; data += 2U * sizeof(size_t); @@ -46,7 +46,7 @@ libkeccak_state_unmarshal(struct libkeccak_state *restrict state, const void *re get(wmod); get(l); get(nr); - memcpy(state->S, data, sizeof(state->S)); + memcpy(&state->S, data, sizeof(state->S)); data += sizeof(state->S); get(mptr); get(mlen); diff --git a/extra/libkeccak_state_wipe_sponge.c b/extra/libkeccak_state_wipe_sponge.c index 91e845a..69aa46a 100644 --- a/extra/libkeccak_state_wipe_sponge.c +++ b/extra/libkeccak_state_wipe_sponge.c @@ -10,7 +10,7 @@ void libkeccak_state_wipe_sponge(volatile struct libkeccak_state *state) { - volatile int64_t *restrict S = state->S; + volatile uint64_t *restrict S = state->S.w64; size_t i; for (i = 0; i < 25; i++) diff --git a/libkeccak.h b/libkeccak.h index 670c69e..814883b 100644 --- a/libkeccak.h +++ b/libkeccak.h @@ -14,6 +14,10 @@ # pragma clang diagnostic ignored "-Wdocumentation" # pragma clang diagnostic ignored "-Wunknown-attributes" #endif +#if defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Winline" +#endif /** @@ -59,7 +63,12 @@ struct libkeccak_state { /** * The lanes (state/sponge) */ - int64_t S[25]; + union { + uint64_t w64[25]; + uint32_t w32[25]; + uint16_t w16[25]; 
+ uint8_t w8[25]; + } S; /** * The bitrate @@ -89,7 +98,7 @@ struct libkeccak_state { /** * The word mask */ - int64_t wmod; + uint64_t wmod; /** * ℓ, the binary logarithm of the word size @@ -255,7 +264,12 @@ void libkeccak_simple_squeeze(register struct libkeccak_state *, register long i * @param times The number of digests */ LIBKECCAK_GCC_ONLY(__attribute__((__nonnull__, __nothrow__))) -void libkeccak_fast_squeeze(register struct libkeccak_state *, register long int); +inline void +libkeccak_fast_squeeze(register struct libkeccak_state *state, register long int times) +{ + times *= (state->n - 1) / state->r + 1; + libkeccak_simple_squeeze(state, times); +} /** * Squeeze out another digest @@ -311,6 +325,9 @@ libkeccak_state_destroy(volatile struct libkeccak_state *state) +#if defined(__GNUC__) +# pragma GCC diagnostic pop +#endif #if defined(__clang__) # pragma clang diagnostic pop #endif diff --git a/libkeccak/extra.h b/libkeccak/extra.h index 60a2601..b9734b3 100644 --- a/libkeccak/extra.h +++ b/libkeccak/extra.h @@ -11,7 +11,7 @@ inline void libkeccak_state_reset(struct libkeccak_state *state) { state->mptr = 0; - memset(state->S, 0, sizeof(state->S)); + memset(&state->S, 0, sizeof(state->S)); } diff --git a/libkeccak_state_initialise.c b/libkeccak_state_initialise.c index 7644ff7..545d3df 100644 --- a/libkeccak_state_initialise.c +++ b/libkeccak_state_initialise.c @@ -34,9 +34,9 @@ libkeccak_state_initialise(struct libkeccak_state *restrict state, const struct } state->nr = 12 + (state->l << 1); - state->wmod = (state->w == 64) ? ~0LL : (int64_t)((1ULL << state->w) - 1); + state->wmod = (state->w == 64) ? ~0ULL : (uint64_t)((1ULL << state->w) - 1); for (x = 0; x < 25; x++) - state->S[x] = 0; + state->S.w64[x] = 0; state->mptr = 0; state->mlen = (size_t)(state->r * state->b) >> 2; -- cgit v1.2.3-70-g09d2