From d52f6ca14e59d95a5aa7377c22e1e37f486e84b5 Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Tue, 15 Oct 2024 21:38:21 +0200 Subject: Add optimised SHA-256 implementation using SHA-256 instrinsics for ARMv8 (almost 70 times faster) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mattias Andrée --- Makefile | 7 +- config-armv8.mk | 14 ++++ process.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 207 insertions(+), 16 deletions(-) create mode 100644 config-armv8.mk diff --git a/Makefile b/Makefile index 3eee714..ed4dfd2 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,9 @@ CONFIGFILE = config-x86.mk # enables optimisations on x86 CPU's that have the # required features. # +# Additionally config-arm.mk which uses optimisations +# for ARMv8 is available. +# # config-portable.mk is available for exotic CPU's # and compiler that do not support the features required # for the optimisations. @@ -20,7 +23,7 @@ include mk/$(OS).mk LIB_MAJOR = 1 -LIB_MINOR = 0 +LIB_MINOR = 1 LIB_VERSION = $(LIB_MAJOR).$(LIB_MINOR) @@ -99,7 +102,7 @@ libsha2.a: $(OBJ) $(AR) -s $@ check: test - ./test + $(CHECK_PREFIX) ./test $(CHECK_FLAGS) install: mkdir -p -- "$(DESTDIR)$(PREFIX)/lib" diff --git a/config-armv8.mk b/config-armv8.mk new file mode 100644 index 0000000..c98fd0e --- /dev/null +++ b/config-armv8.mk @@ -0,0 +1,14 @@ +PREFIX = /usr +MANPREFIX = $(PREFIX)/share/man + +CC = cc -std=c11 + +CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 +CFLAGS = -Wall -O3 -march=armv8-a+crypto +LDFLAGS = -s + +# You can add -DALLOCA_LIMIT=# to CPPFLAGS, where # is a size_t +# value, to put a limit on how large allocation the library is +# allowed to make with alloca(3). For buffers that can have any +# size this limit will be used if it wants to allocate a larger +# buffer. Choose 0 to use malloc(3) instead of alloca(3). diff --git a/process.c b/process.c index 6f1451e..1a83046 100644 --- a/process.c +++ b/process.c @@ -3,14 +3,25 @@ #include #if defined(__SSE4_1__) && defined(__SSSE3__) && defined(__SSE2__) && defined(__SHA__) -# define HAVE_X86_SHA_INTRINSICS +# define HAVE_X86_SHA256_INTRINSICS +#endif + +#if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM) +# if defined(__ARM_NEON) && (defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)) +# define HAVE_ARM_SHA256_INTRINSICS +# endif #endif -#ifdef HAVE_X86_SHA_INTRINSICS +#ifdef HAVE_X86_SHA256_INTRINSICS # include #endif +#ifdef HAVE_ARM_SHA256_INTRINSICS +# include +# include +#endif + /** * Unified implementation (what can unified without performance impact) @@ -68,7 +79,7 @@ h[i] = TRUNC(h[i] + work_h[i]); -#ifdef HAVE_X86_SHA_INTRINSICS +#ifdef HAVE_X86_SHA256_INTRINSICS static size_t process_x86_sha256(struct libsha2_state *restrict state, const unsigned char *restrict data, size_t len) @@ -290,6 +301,168 @@ out: #endif +#ifdef HAVE_ARM_SHA256_INTRINSICS + +static size_t +process_arm_sha256(struct libsha2_state *restrict state, const unsigned char *restrict data, size_t len) +{ + static const uint32_t rc[] = { + UINT32_C(0x428A2F98), UINT32_C(0x71374491), UINT32_C(0xB5C0FBCF), UINT32_C(0xE9B5DBA5), + UINT32_C(0x3956C25B), UINT32_C(0x59F111F1), UINT32_C(0x923F82A4), UINT32_C(0xAB1C5ED5), + UINT32_C(0xD807AA98), UINT32_C(0x12835B01), UINT32_C(0x243185BE), UINT32_C(0x550C7DC3), + UINT32_C(0x72BE5D74), UINT32_C(0x80DEB1FE), UINT32_C(0x9BDC06A7), UINT32_C(0xC19BF174), + UINT32_C(0xE49B69C1), UINT32_C(0xEFBE4786), UINT32_C(0x0FC19DC6), UINT32_C(0x240CA1CC), + UINT32_C(0x2DE92C6F), UINT32_C(0x4A7484AA), UINT32_C(0x5CB0A9DC), UINT32_C(0x76F988DA), + UINT32_C(0x983E5152), UINT32_C(0xA831C66D), UINT32_C(0xB00327C8), UINT32_C(0xBF597FC7), + UINT32_C(0xC6E00BF3), UINT32_C(0xD5A79147), UINT32_C(0x06CA6351), UINT32_C(0x14292967), + UINT32_C(0x27B70A85), UINT32_C(0x2E1B2138), UINT32_C(0x4D2C6DFC), UINT32_C(0x53380D13), + UINT32_C(0x650A7354), UINT32_C(0x766A0ABB), UINT32_C(0x81C2C92E), UINT32_C(0x92722C85), + UINT32_C(0xA2BFE8A1), UINT32_C(0xA81A664B), UINT32_C(0xC24B8B70), UINT32_C(0xC76C51A3), + UINT32_C(0xD192E819), UINT32_C(0xD6990624), UINT32_C(0xF40E3585), UINT32_C(0x106AA070), + UINT32_C(0x19A4C116), UINT32_C(0x1E376C08), UINT32_C(0x2748774C), UINT32_C(0x34B0BCB5), + UINT32_C(0x391C0CB3), UINT32_C(0x4ED8AA4A), UINT32_C(0x5B9CCA4F), UINT32_C(0x682E6FF3), + UINT32_C(0x748F82EE), UINT32_C(0x78A5636F), UINT32_C(0x84C87814), UINT32_C(0x8CC70208), + UINT32_C(0x90BEFFFA), UINT32_C(0xA4506CEB), UINT32_C(0xBEF9A3F7), UINT32_C(0xC67178F2) + }; + + uint32x4_t abcd, efgh, abcd_orig, efgh_orig; + uint32x4_t msg0, msg1, msg2, msg3, tmp0, tmp1, tmp2; + const unsigned char *restrict chunk; + size_t off = 0; + + abcd_orig = vld1q_u32(&state->h.b32[0]); + efgh_orig = vld1q_u32(&state->h.b32[4]); + + for (; len - off >= state->chunk_size; off += state->chunk_size) { + abcd = abcd_orig; + efgh = efgh_orig; + + chunk = &data[off]; + msg0 = vld1q_u32((const uint32_t *)&chunk[0]); + msg1 = vld1q_u32((const uint32_t *)&chunk[16]); + msg2 = vld1q_u32((const uint32_t *)&chunk[32]); + msg3 = vld1q_u32((const uint32_t *)&chunk[48]); + msg0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg0))); + msg1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg1))); + msg2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg2))); + msg3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg3))); + + tmp0 = vaddq_u32(msg0, vld1q_u32(&rc[0 * 4])); + + msg0 = vsha256su0q_u32(msg0, msg1); + tmp2 = abcd; + tmp1 = vaddq_u32(msg1, vld1q_u32(&rc[1 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp0); + efgh = vsha256h2q_u32(efgh, tmp2, tmp0); + msg0 = vsha256su1q_u32(msg0, msg2, msg3); + + msg1 = vsha256su0q_u32(msg1, msg2); + tmp2 = abcd; + tmp0 = vaddq_u32(msg2, vld1q_u32(&rc[2 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp1); + efgh = vsha256h2q_u32(efgh, tmp2, tmp1); + msg1 = vsha256su1q_u32(msg1, msg3, msg0); + + msg2 = vsha256su0q_u32(msg2, msg3); + tmp2 = abcd; + tmp1 = vaddq_u32(msg3, vld1q_u32(&rc[3 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp0); + efgh = vsha256h2q_u32(efgh, tmp2, tmp0); + msg2 = vsha256su1q_u32(msg2, msg0, msg1); + + msg3 = vsha256su0q_u32(msg3, msg0); + tmp2 = abcd; + tmp0 = vaddq_u32(msg0, vld1q_u32(&rc[4 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp1); + efgh = vsha256h2q_u32(efgh, tmp2, tmp1); + msg3 = vsha256su1q_u32(msg3, msg1, msg2); + + msg0 = vsha256su0q_u32(msg0, msg1); + tmp2 = abcd; + tmp1 = vaddq_u32(msg1, vld1q_u32(&rc[5 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp0); + efgh = vsha256h2q_u32(efgh, tmp2, tmp0); + msg0 = vsha256su1q_u32(msg0, msg2, msg3); + + msg1 = vsha256su0q_u32(msg1, msg2); + tmp2 = abcd; + tmp0 = vaddq_u32(msg2, vld1q_u32(&rc[6 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp1); + efgh = vsha256h2q_u32(efgh, tmp2, tmp1); + msg1 = vsha256su1q_u32(msg1, msg3, msg0); + + msg2 = vsha256su0q_u32(msg2, msg3); + tmp2 = abcd; + tmp1 = vaddq_u32(msg3, vld1q_u32(&rc[7 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp0); + efgh = vsha256h2q_u32(efgh, tmp2, tmp0); + msg2 = vsha256su1q_u32(msg2, msg0, msg1); + + msg3 = vsha256su0q_u32(msg3, msg0); + tmp2 = abcd; + tmp0 = vaddq_u32(msg0, vld1q_u32(&rc[8 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp1); + efgh = vsha256h2q_u32(efgh, tmp2, tmp1); + msg3 = vsha256su1q_u32(msg3, msg1, msg2); + + msg0 = vsha256su0q_u32(msg0, msg1); + tmp2 = abcd; + tmp1 = vaddq_u32(msg1, vld1q_u32(&rc[9 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp0); + efgh = vsha256h2q_u32(efgh, tmp2, tmp0); + msg0 = vsha256su1q_u32(msg0, msg2, msg3); + + msg1 = vsha256su0q_u32(msg1, msg2); + tmp2 = abcd; + tmp0 = vaddq_u32(msg2, vld1q_u32(&rc[10 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp1); + efgh = vsha256h2q_u32(efgh, tmp2, tmp1); + msg1 = vsha256su1q_u32(msg1, msg3, msg0); + + msg2 = vsha256su0q_u32(msg2, msg3); + tmp2 = abcd; + tmp1 = vaddq_u32(msg3, vld1q_u32(&rc[11 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp0); + efgh = vsha256h2q_u32(efgh, tmp2, tmp0); + msg2 = vsha256su1q_u32(msg2, msg0, msg1); + + msg3 = vsha256su0q_u32(msg3, msg0); + tmp2 = abcd; + tmp0 = vaddq_u32(msg0, vld1q_u32(&rc[12 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp1); + efgh = vsha256h2q_u32(efgh, tmp2, tmp1); + msg3 = vsha256su1q_u32(msg3, msg1, msg2); + + tmp2 = abcd; + tmp1 = vaddq_u32(msg1, vld1q_u32(&rc[13 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp0); + efgh = vsha256h2q_u32(efgh, tmp2, tmp0); + + tmp2 = abcd; + tmp0 = vaddq_u32(msg2, vld1q_u32(&rc[14 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp1); + efgh = vsha256h2q_u32(efgh, tmp2, tmp1); + + tmp2 = abcd; + tmp1 = vaddq_u32(msg3, vld1q_u32(&rc[15 * 4])); + abcd = vsha256hq_u32(abcd, efgh, tmp0); + efgh = vsha256h2q_u32(efgh, tmp2, tmp0); + + tmp2 = abcd; + abcd = vsha256hq_u32(abcd, efgh, tmp1); + efgh = vsha256h2q_u32(efgh, tmp2, tmp1); + + abcd_orig = vaddq_u32(abcd_orig, abcd); + efgh_orig = vaddq_u32(efgh_orig, efgh); + } + + vst1q_u32(&state->h.b32[0], abcd_orig); + vst1q_u32(&state->h.b32[4], efgh_orig); + + return off; +} + +#endif size_t libsha2_process(struct libsha2_state *restrict state, const unsigned char *restrict data, size_t len) @@ -298,6 +471,10 @@ libsha2_process(struct libsha2_state *restrict state, const unsigned char *restr size_t off = 0; if (state->algorithm <= LIBSHA2_256) { +#if defined(HAVE_ARM_SHA256_INTRINSICS) + return process_arm_sha256(state, data, len); +#else +# define ROTR(X, N) TRUNC32(((X) >> (N)) | ((X) << (32 - (N)))) uint_least32_t s0, s1; size_t i, j; @@ -305,36 +482,33 @@ libsha2_process(struct libsha2_state *restrict state, const unsigned char *restr # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wmemset-elt-size" #endif -#define ROTR(X, N) TRUNC32(((X) >> (N)) | ((X) << (32 - (N)))) - -#ifdef HAVE_X86_SHA_INTRINSICS +# ifdef HAVE_X86_SHA256_INTRINSICS if (have_sha_intrinsics()) return process_x86_sha256(state, data, len); -#endif - +# endif for (; len - off >= state->chunk_size; off += state->chunk_size) { chunk = &data[off]; SHA2_IMPLEMENTATION(chunk, 7, 18, 3, 17, 19, 10, 6, 11, 25, 2, 13, 22, uint_least32_t, 4, TRUNC32, state->k.b32, state->w.b32, state->h.b32, state->work_h.b32); } - -#undef ROTR -#if defined(__GNUC__) -# pragma GCC diagnostic pop +# if defined(__GNUC__) +# pragma GCC diagnostic pop +# endif +# undef ROTR #endif } else { +#define ROTR(X, N) TRUNC64(((X) >> (N)) | ((X) << (64 - (N)))) uint_least64_t s0, s1; size_t i, j; -#define ROTR(X, N) TRUNC64(((X) >> (N)) | ((X) << (64 - (N)))) + /* TODO Add optimisation using ARMv8.2 SHA-512 intrinsics (when I've access to a machine supporting it) */ for (; len - off >= state->chunk_size; off += state->chunk_size) { chunk = &data[off]; SHA2_IMPLEMENTATION(chunk, 1, 8, 7, 19, 61, 6, 14, 18, 41, 28, 34, 39, uint_least64_t, 8, TRUNC64, state->k.b64, state->w.b64, state->h.b64, state->work_h.b64); } - #undef ROTR } -- cgit v1.2.3-70-g09d2