From c5b86d52256d535149caefbe1031807b28c8face Mon Sep 17 00:00:00 2001
From: Mattias Andrée <maandree@kth.se>
Date: Fri, 8 Jul 2022 00:47:04 +0200
Subject: Add code using SHA intrinsics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Mattias Andrée <maandree@kth.se>
---
 common.h  |   8 +-
 config.mk |   2 +-
 digest.c  |   4 +-
 process.c | 294 +++++++++++++++++++++++++++++++++++++++++++++++++++-----------
 update.c  |  14 ++-
 5 files changed, 254 insertions(+), 68 deletions(-)

diff --git a/common.h b/common.h
index 3b1cf00..43735fc 100644
--- a/common.h
+++ b/common.h
@@ -27,10 +27,12 @@
 /**
  * Process a chunk using SHA-1 or SHA-0
  * 
- * @param  state  The hashing state
- * @param  chunk  The data to process
+ * @param   state  The hashing state
+ * @param   data   The data to process
+ * @param   len    The number of available bytes
+ * @return         The number of processed bytes
  */
 #if defined(__GNUC__)
 __attribute__((__nonnull__, __nothrow__))
 #endif
-void libsha1_process(struct libsha1_state *restrict, const unsigned char *restrict);
+size_t libsha1_process(struct libsha1_state *restrict, const unsigned char *restrict, size_t);
diff --git a/config.mk b/config.mk
index 66efe55..9ec13d4 100644
--- a/config.mk
+++ b/config.mk
@@ -4,7 +4,7 @@ MANPREFIX = $(PREFIX)/share/man
 CC = c99
 
 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700
-CFLAGS   = -Wall -O3
+CFLAGS   = -Wall -O3 -msse4 -msha
 LDFLAGS  = -s
 
 # You can add -DALLOCA_LIMIT=# to CPPFLAGS, where # is a size_t
diff --git a/digest.c b/digest.c
index 0e9d117..0ad096c 100644
--- a/digest.c
+++ b/digest.c
@@ -29,7 +29,7 @@ libsha1_digest(struct libsha1_state *restrict state, const void *message_, size_
 	if (off > sizeof(state->chunk) - (size_t)8) {
 		memset(state->chunk + off, 0, sizeof(state->chunk) - off);
 		off = 0;
-		libsha1_process(state, state->chunk);
+		libsha1_process(state, state->chunk, sizeof(state->chunk));
 	}
 
 	memset(state->chunk + off, 0, sizeof(state->chunk) - 8 - off);
@@ -41,7 +41,7 @@ libsha1_digest(struct libsha1_state *restrict state, const void *message_, size_
 	state->chunk[sizeof(state->chunk) - 3] = (unsigned char)(state->message_size >> 16);
 	state->chunk[sizeof(state->chunk) - 2] = (unsigned char)(state->message_size >>  8);
 	state->chunk[sizeof(state->chunk) - 1] = (unsigned char)(state->message_size >>  0);
-	libsha1_process(state, state->chunk);
+	libsha1_process(state, state->chunk, sizeof(state->chunk));
 
 	n = libsha1_algorithm_output_size(state->algorithm);
 	for (i = 0, n /= 4; i < n; i++) {
diff --git a/process.c b/process.c
index d9273f8..5a4d9ac 100644
--- a/process.c
+++ b/process.c
@@ -1,6 +1,15 @@
 /* See LICENSE file for copyright and license details. */
 #include "common.h"
 
+#if defined(__SSE4_1__) && defined(__SSSE3__) && defined(__SSE2__) && defined(__SHA__)
+# define HAVE_X86_SHA_INTRINSICS
+#endif
+
+
+#ifdef HAVE_X86_SHA_INTRINSICS
+# include <immintrin.h>
+#endif
+
 
 static inline uint_least32_t
 rorl(uint_least32_t n, int k)
@@ -8,9 +17,8 @@ rorl(uint_least32_t n, int k)
 	return TRUNC32((n << k) | (n >> (32 - k)));
 }
 
-
-void
-libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict chunk)
+static size_t
+process_portable(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
 {
 #define F0(B, C, D) (D ^ (B & (C ^ D)))
 #define F1(B, C, D) (B ^ C ^ D)
@@ -24,58 +32,66 @@ libsha1_process(struct libsha1_state *restrict state, const unsigned char *restr
 
 	uint_least32_t a, b, c, d, e;
 	int i;
+	size_t off = 0;
 
-	for (i = 0; i < 16; i++) {
-		state->w[i]  = (uint_least32_t)chunk[4 * i + 0] << 24;
-		state->w[i] |= (uint_least32_t)chunk[4 * i + 1] << 16;
-		state->w[i] |= (uint_least32_t)chunk[4 * i + 2] <<  8;
-		state->w[i] |= (uint_least32_t)chunk[4 * i + 3];
-	}
-	if (state->algorithm == LIBSHA1_1) {
-		for (; i < 80; i++)
-			state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1);
-	} else {
-		for (; i < 80; i++)
-			state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
-	}
-	a = state->h[0];
-	b = state->h[1];
-	c = state->h[2];
-	d = state->h[3];
-	e = state->h[4];
-	for (i = 0; i < 20;) {
-		G0(a, b, c, d, e, i++);
-		G0(e, a, b, c, d, i++);
-		G0(d, e, a, b, c, i++);
-		G0(c, d, e, a, b, i++);
-		G0(b, c, d, e, a, i++);
-	}
-	while (i < 40) {
-		G1(a, b, c, d, e, i++);
-		G1(e, a, b, c, d, i++);
-		G1(d, e, a, b, c, i++);
-		G1(c, d, e, a, b, i++);
-		G1(b, c, d, e, a, i++);
-	}
-	while (i < 60) {
-		G2(a, b, c, d, e, i++);
-		G2(e, a, b, c, d, i++);
-		G2(d, e, a, b, c, i++);
-		G2(c, d, e, a, b, i++);
-		G2(b, c, d, e, a, i++);
-	}
-	while (i < 80) {
-		G3(a, b, c, d, e, i++);
-		G3(e, a, b, c, d, i++);
-		G3(d, e, a, b, c, i++);
-		G3(c, d, e, a, b, i++);
-		G3(b, c, d, e, a, i++);
+	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
+		for (i = 0; i < 16; i++) {
+			state->w[i]  = (uint_least32_t)data[off + 4 * i + 0] << 24;
+			state->w[i] |= (uint_least32_t)data[off + 4 * i + 1] << 16;
+			state->w[i] |= (uint_least32_t)data[off + 4 * i + 2] <<  8;
+			state->w[i] |= (uint_least32_t)data[off + 4 * i + 3];
+		}
+		if (state->algorithm == LIBSHA1_1) {
+			for (; i < 80; i++)
+				state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1);
+		} else {
+			for (; i < 80; i++)
+				state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
+		}
+
+		a = state->h[0];
+		b = state->h[1];
+		c = state->h[2];
+		d = state->h[3];
+		e = state->h[4];
+
+		for (i = 0; i < 20;) {
+			G0(a, b, c, d, e, i++);
+			G0(e, a, b, c, d, i++);
+			G0(d, e, a, b, c, i++);
+			G0(c, d, e, a, b, i++);
+			G0(b, c, d, e, a, i++);
+		}
+		while (i < 40) {
+			G1(a, b, c, d, e, i++);
+			G1(e, a, b, c, d, i++);
+			G1(d, e, a, b, c, i++);
+			G1(c, d, e, a, b, i++);
+			G1(b, c, d, e, a, i++);
+		}
+		while (i < 60) {
+			G2(a, b, c, d, e, i++);
+			G2(e, a, b, c, d, i++);
+			G2(d, e, a, b, c, i++);
+			G2(c, d, e, a, b, i++);
+			G2(b, c, d, e, a, i++);
+		}
+		while (i < 80) {
+			G3(a, b, c, d, e, i++);
+			G3(e, a, b, c, d, i++);
+			G3(d, e, a, b, c, i++);
+			G3(c, d, e, a, b, i++);
+			G3(b, c, d, e, a, i++);
+		}
+
+		state->h[0] = TRUNC32(state->h[0] + a);
+		state->h[1] = TRUNC32(state->h[1] + b);
+		state->h[2] = TRUNC32(state->h[2] + c);
+		state->h[3] = TRUNC32(state->h[3] + d);
+		state->h[4] = TRUNC32(state->h[4] + e);
 	}
-	state->h[0] = TRUNC32(state->h[0] + a);
-	state->h[1] = TRUNC32(state->h[1] + b);
-	state->h[2] = TRUNC32(state->h[2] + c);
-	state->h[3] = TRUNC32(state->h[3] + d);
-	state->h[4] = TRUNC32(state->h[4] + e);
+
+	return off;
 
 #undef F0
 #undef F1
@@ -87,3 +103,175 @@ libsha1_process(struct libsha1_state *restrict state, const unsigned char *restr
 #undef G2
 #undef G3
 }
+
+#ifdef HAVE_X86_SHA_INTRINSICS
+
+static size_t
+process_x86_sha(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
+{
+	const __m128i SHUFFLE_MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090A0B0C0D0E0FULL);
+	register __m128i abcd, e000, temp, w[4];
+	__m128i abcd_orig, e000_orig;
+	size_t off = 0;
+
+	abcd_orig = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state->h[0]), 32 - 5);
+	e000_orig = _mm_set_epi32(state->h[4], 0, 0, 0);
+
+	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
+		w[0] = _mm_loadu_si128((const __m128i *)&data[0]);
+		w[0] = _mm_shuffle_epi8(w[0], SHUFFLE_MASK);
+		e000 = _mm_add_epi32(e000_orig, w[0]);
+		temp = abcd_orig;
+		abcd = _mm_sha1rnds4_epu32(abcd_orig, e000, 0);
+
+		w[1] = _mm_loadu_si128((const __m128i *)&data[16]);
+		w[1] = _mm_shuffle_epi8(w[1], SHUFFLE_MASK);
+		temp = _mm_sha1nexte_epu32(temp, w[1]);
+		e000 = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
+		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
+
+		w[2] = _mm_loadu_si128((const __m128i *)&data[32]);
+		w[2] = _mm_shuffle_epi8(w[2], SHUFFLE_MASK);
+		e000 = _mm_sha1nexte_epu32(e000, w[2]);
+		temp = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
+		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
+		w[0] = _mm_xor_si128(w[0], w[2]);
+
+		w[3] = _mm_loadu_si128((const __m128i *)&data[48]);
+		w[3] = _mm_shuffle_epi8(w[3], SHUFFLE_MASK);
+		temp = _mm_sha1nexte_epu32(temp, w[3]);
+		e000 = abcd;
+		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
+		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
+		w[1] = _mm_xor_si128(w[1], w[3]);
+
+		e000 = _mm_sha1nexte_epu32(e000, w[0]);
+		temp = abcd;
+		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
+		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
+		w[2] = _mm_xor_si128(w[2], w[0]);
+
+		temp = _mm_sha1nexte_epu32(temp, w[1]);
+		e000 = abcd;
+		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
+		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
+		w[3] = _mm_xor_si128(w[3], w[1]);
+
+		e000 = _mm_sha1nexte_epu32(e000, w[2]);
+		temp = abcd;
+		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
+		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
+		w[0] = _mm_xor_si128(w[0], w[2]);
+
+		temp = _mm_sha1nexte_epu32(temp, w[3]);
+		e000 = abcd;
+		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
+		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
+		w[1] = _mm_xor_si128(w[1], w[3]);
+
+		e000 = _mm_sha1nexte_epu32(e000, w[0]);
+		temp = abcd;
+		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
+		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
+		w[2] = _mm_xor_si128(w[2], w[0]);
+
+		temp = _mm_sha1nexte_epu32(temp, w[1]);
+		e000 = abcd;
+		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
+		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
+		w[3] = _mm_xor_si128(w[3], w[1]);
+
+		e000 = _mm_sha1nexte_epu32(e000, w[2]);
+		temp = abcd;
+		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
+		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
+		w[0] = _mm_xor_si128(w[0], w[2]);
+
+		temp = _mm_sha1nexte_epu32(temp, w[3]);
+		e000 = abcd;
+		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
+		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
+		w[1] = _mm_xor_si128(w[1], w[3]);
+
+		e000 = _mm_sha1nexte_epu32(e000, w[0]);
+		temp = abcd;
+		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
+		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
+		w[2] = _mm_xor_si128(w[2], w[0]);
+
+		temp = _mm_sha1nexte_epu32(temp, w[1]);
+		e000 = abcd;
+		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
+		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
+		w[3] = _mm_xor_si128(w[3], w[1]);
+
+		e000 = _mm_sha1nexte_epu32(e000, w[2]);
+		temp = abcd;
+		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
+		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
+		w[0] = _mm_xor_si128(w[0], w[2]);
+
+		temp = _mm_sha1nexte_epu32(temp, w[3]);
+		e000 = abcd;
+		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
+		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
+		w[1] = _mm_xor_si128(w[1], w[3]);
+
+		e000 = _mm_sha1nexte_epu32(e000, w[0]);
+		temp = abcd;
+		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);
+		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
+		w[2] = _mm_xor_si128(w[2], w[0]);
+
+		temp = _mm_sha1nexte_epu32(temp, w[1]);
+		e000 = abcd;
+		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
+		w[3] = _mm_xor_si128(w[3], w[1]);
+
+		e000 = _mm_sha1nexte_epu32(e000, w[2]);
+		temp = abcd;
+		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
+		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);
+
+		temp = _mm_sha1nexte_epu32(temp, w[3]);
+		e000 = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
+
+		e000_orig = _mm_sha1nexte_epu32(e000, e000_orig);
+		abcd_orig = _mm_add_epi32(abcd, abcd_orig);
+	}
+
+	_mm_storeu_si128((__m128i *)&state->h[0], _mm_shuffle_epi32(abcd_orig, 32 - 5));
+	state->h[4] = _mm_extract_epi32(e000_orig, 3);
+
+	return off;
+}
+
+#endif
+
+size_t
+libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
+{
+#ifdef HAVE_X86_SHA_INTRINSICS
+	if (state->algorithm == LIBSHA1_1)
+		return process_x86_sha(state, data, len);
+#endif
+	return process_portable(state, data, len);
+}
diff --git a/update.c b/update.c
index 6ae2928..441ecd4 100644
--- a/update.c
+++ b/update.c
@@ -5,7 +5,7 @@
 void
 libsha1_update(struct libsha1_state *restrict state, const void *restrict message_, size_t msglen)
 {
-	const char *restrict message = message_;
+	const unsigned char *restrict message = message_;
 	size_t n, off;
 
 	off = (state->message_size / 8) % sizeof(state->chunk);
@@ -16,17 +16,13 @@ libsha1_update(struct libsha1_state *restrict state, const void *restrict messag
 		n = msglen < sizeof(state->chunk) - off ? msglen : sizeof(state->chunk) - off;
 		memcpy(&state->chunk[off], message, n);
 		if (off + n == sizeof(state->chunk))
-			libsha1_process(state, state->chunk);
+			libsha1_process(state, state->chunk, sizeof(state->chunk));
 		message += n;
 		msglen -= n;
 	}
 
-	while (msglen >= sizeof(state->chunk)) {
-		libsha1_process(state, (const unsigned char *)message);
-		message += sizeof(state->chunk);
-		msglen -= sizeof(state->chunk);
-	}
+	off = libsha1_process(state, message, msglen);
 
-	if (msglen)
-		memcpy(state->chunk, message, msglen);
+	if (msglen > off)
+		memcpy(state->chunk, &message[off], msglen - off);
 }
-- 
cgit v1.3.1