Diffstat (limited to 'process.c')
 process.c | 294
 1 file changed, 241 insertions(+), 53 deletions(-)
diff --git a/process.c b/process.c
index d9273f8..5a4d9ac 100644
--- a/process.c
+++ b/process.c
@@ -1,6 +1,15 @@
/* See LICENSE file for copyright and license details. */
#include "common.h"
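+/*
+ * Besides the SHA instructions themselves, the code below needs
+ * SSE2 (loads, adds, xors), SSSE3 (_mm_shuffle_epi8) and SSE4.1
+ * (_mm_extract_epi32), hence the four feature-test macros.
+ */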
+#if defined(__SSE4_1__) && defined(__SSSE3__) && defined(__SSE2__) && defined(__SHA__)
+# define HAVE_X86_SHA_INTRINSICS
+#endif
+
+
+#ifdef HAVE_X86_SHA_INTRINSICS
+# include <immintrin.h>
+#endif
+
static inline uint_least32_t
rorl(uint_least32_t n, int k)
@@ -8,9 +17,8 @@ rorl(uint_least32_t n, int k)
return TRUNC32((n << k) | (n >> (32 - k)));
}
-
-void
-libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict chunk)
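+/*
+ * Portable fallback: consume as many whole 64-byte chunks from
+ * `data` as possible and return the number of bytes processed;
+ * the caller buffers any trailing partial chunk.
+ */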
+static size_t
+process_portable(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
#define F0(B, C, D) (D ^ (B & (C ^ D)))
#define F1(B, C, D) (B ^ C ^ D)
@@ -24,58 +32,66 @@ libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict chunk)
uint_least32_t a, b, c, d, e;
int i;
+ size_t off = 0;
- for (i = 0; i < 16; i++) {
- state->w[i] = (uint_least32_t)chunk[4 * i + 0] << 24;
- state->w[i] |= (uint_least32_t)chunk[4 * i + 1] << 16;
- state->w[i] |= (uint_least32_t)chunk[4 * i + 2] << 8;
- state->w[i] |= (uint_least32_t)chunk[4 * i + 3];
- }
- if (state->algorithm == LIBSHA1_1) {
- for (; i < 80; i++)
- state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1);
- } else {
- for (; i < 80; i++)
- state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
- }
- a = state->h[0];
- b = state->h[1];
- c = state->h[2];
- d = state->h[3];
- e = state->h[4];
- for (i = 0; i < 20;) {
- G0(a, b, c, d, e, i++);
- G0(e, a, b, c, d, i++);
- G0(d, e, a, b, c, i++);
- G0(c, d, e, a, b, i++);
- G0(b, c, d, e, a, i++);
- }
- while (i < 40) {
- G1(a, b, c, d, e, i++);
- G1(e, a, b, c, d, i++);
- G1(d, e, a, b, c, i++);
- G1(c, d, e, a, b, i++);
- G1(b, c, d, e, a, i++);
- }
- while (i < 60) {
- G2(a, b, c, d, e, i++);
- G2(e, a, b, c, d, i++);
- G2(d, e, a, b, c, i++);
- G2(c, d, e, a, b, i++);
- G2(b, c, d, e, a, i++);
- }
- while (i < 80) {
- G3(a, b, c, d, e, i++);
- G3(e, a, b, c, d, i++);
- G3(d, e, a, b, c, i++);
- G3(c, d, e, a, b, i++);
- G3(b, c, d, e, a, i++);
+ for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
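+ /* Load the 16 big-endian 32-bit message words of this chunk. */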
+ for (i = 0; i < 16; i++) {
+ state->w[i] = (uint_least32_t)data[off + 4 * i + 0] << 24;
+ state->w[i] |= (uint_least32_t)data[off + 4 * i + 1] << 16;
+ state->w[i] |= (uint_least32_t)data[off + 4 * i + 2] << 8;
+ state->w[i] |= (uint_least32_t)data[off + 4 * i + 3];
+ }
+ if (state->algorithm == LIBSHA1_1) {
+ for (; i < 80; i++)
+ state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1);
+ } else {
+ for (; i < 80; i++)
+ state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
+ }
+
+ a = state->h[0];
+ b = state->h[1];
+ c = state->h[2];
+ d = state->h[3];
+ e = state->h[4];
+
+ for (i = 0; i < 20;) {
+ G0(a, b, c, d, e, i++);
+ G0(e, a, b, c, d, i++);
+ G0(d, e, a, b, c, i++);
+ G0(c, d, e, a, b, i++);
+ G0(b, c, d, e, a, i++);
+ }
+ while (i < 40) {
+ G1(a, b, c, d, e, i++);
+ G1(e, a, b, c, d, i++);
+ G1(d, e, a, b, c, i++);
+ G1(c, d, e, a, b, i++);
+ G1(b, c, d, e, a, i++);
+ }
+ while (i < 60) {
+ G2(a, b, c, d, e, i++);
+ G2(e, a, b, c, d, i++);
+ G2(d, e, a, b, c, i++);
+ G2(c, d, e, a, b, i++);
+ G2(b, c, d, e, a, i++);
+ }
+ while (i < 80) {
+ G3(a, b, c, d, e, i++);
+ G3(e, a, b, c, d, i++);
+ G3(d, e, a, b, c, i++);
+ G3(c, d, e, a, b, i++);
+ G3(b, c, d, e, a, i++);
+ }
+
+ state->h[0] = TRUNC32(state->h[0] + a);
+ state->h[1] = TRUNC32(state->h[1] + b);
+ state->h[2] = TRUNC32(state->h[2] + c);
+ state->h[3] = TRUNC32(state->h[3] + d);
+ state->h[4] = TRUNC32(state->h[4] + e);
}
- state->h[0] = TRUNC32(state->h[0] + a);
- state->h[1] = TRUNC32(state->h[1] + b);
- state->h[2] = TRUNC32(state->h[2] + c);
- state->h[3] = TRUNC32(state->h[3] + d);
- state->h[4] = TRUNC32(state->h[4] + e);
+
+ return off;
#undef F0
#undef F1
@@ -87,3 +103,175 @@ libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict chunk)
#undef G2
#undef G3
}
+
+#ifdef HAVE_X86_SHA_INTRINSICS
+
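+/*
+ * Same contract as process_portable(), but using the x86 SHA
+ * extensions.  _mm_sha1msg2_epu32 bakes in SHA-1's one-bit
+ * message-schedule rotation, so this path can only compute
+ * SHA-1 proper, never SHA-0.
+ */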
+static size_t
+process_x86_sha(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
+{
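+ /*
+ * SHUFFLE_MASK reverses all 16 bytes of a lane: it byte-swaps
+ * the four big-endian message words and leaves W0 in the most
+ * significant dword, as the SHA instructions expect.
+ */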
+ const __m128i SHUFFLE_MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090A0B0C0D0E0FULL);
+ register __m128i abcd, e000, temp, w[4];
+ __m128i abcd_orig, e000_orig;
+ size_t off = 0;
+
+ abcd_orig = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state->h[0]), 0x1B); /* reverse dword order: h[0] in the top lane */
+ e000_orig = _mm_set_epi32((int)state->h[4], 0, 0, 0);
+
+ for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
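+ /*
+ * The 80 rounds are done as 20 four-round _mm_sha1rnds4_epu32
+ * steps; the immediate picks the round function and constant
+ * (0 = Ch, 1 = Parity, 2 = Maj, 3 = Parity).  _mm_sha1msg1_epu32
+ * and _mm_sha1msg2_epu32 expand the message schedule four words
+ * at a time.
+ */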
+ w[0] = _mm_loadu_si128((const __m128i *)&data[off + 0]);
+ w[0] = _mm_shuffle_epi8(w[0], SHUFFLE_MASK);
+ e000 = _mm_add_epi32(e000_orig, w[0]);
+ temp = abcd_orig;
+ abcd = _mm_sha1rnds4_epu32(abcd_orig, e000, 0);
+
+ w[1] = _mm_loadu_si128((const __m128i *)&data[off + 16]);
+ w[1] = _mm_shuffle_epi8(w[1], SHUFFLE_MASK);
+ temp = _mm_sha1nexte_epu32(temp, w[1]);
+ e000 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
+ w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
+
+ w[2] = _mm_loadu_si128((const __m128i *)&data[off + 32]);
+ w[2] = _mm_shuffle_epi8(w[2], SHUFFLE_MASK);
+ e000 = _mm_sha1nexte_epu32(e000, w[2]);
+ temp = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
+ w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
+ w[0] = _mm_xor_si128(w[0], w[2]);
+
+ w[3] = _mm_loadu_si128((const __m128i *)&data[off + 48]);
+ w[3] = _mm_shuffle_epi8(w[3], SHUFFLE_MASK);
+ temp = _mm_sha1nexte_epu32(temp, w[3]);
+ e000 = abcd;
+ w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
+ w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
+ w[1] = _mm_xor_si128(w[1], w[3]);
+
+ e000 = _mm_sha1nexte_epu32(e000, w[0]);
+ temp = abcd;
+ w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
+ w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
+ w[2] = _mm_xor_si128(w[2], w[0]);
+
+ temp = _mm_sha1nexte_epu32(temp, w[1]);
+ e000 = abcd;
+ w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
+ w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
+ w[3] = _mm_xor_si128(w[3], w[1]);
+
+ e000 = _mm_sha1nexte_epu32(e000, w[2]);
+ temp = abcd;
+ w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
+ w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
+ w[0] = _mm_xor_si128(w[0], w[2]);
+
+ temp = _mm_sha1nexte_epu32(temp, w[3]);
+ e000 = abcd;
+ w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
+ w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
+ w[1] = _mm_xor_si128(w[1], w[3]);
+
+ e000 = _mm_sha1nexte_epu32(e000, w[0]);
+ temp = abcd;
+ w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
+ w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
+ w[2] = _mm_xor_si128(w[2], w[0]);
+
+ temp = _mm_sha1nexte_epu32(temp, w[1]);
+ e000 = abcd;
+ w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
+ w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
+ w[3] = _mm_xor_si128(w[3], w[1]);
+
+ e000 = _mm_sha1nexte_epu32(e000, w[2]);
+ temp = abcd;
+ w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
+ w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
+ w[0] = _mm_xor_si128(w[0], w[2]);
+
+ temp = _mm_sha1nexte_epu32(temp, w[3]);
+ e000 = abcd;
+ w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
+ w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
+ w[1] = _mm_xor_si128(w[1], w[3]);
+
+ e000 = _mm_sha1nexte_epu32(e000, w[0]);
+ temp = abcd;
+ w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
+ w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
+ w[2] = _mm_xor_si128(w[2], w[0]);
+
+ temp = _mm_sha1nexte_epu32(temp, w[1]);
+ e000 = abcd;
+ w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
+ w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
+ w[3] = _mm_xor_si128(w[3], w[1]);
+
+ e000 = _mm_sha1nexte_epu32(e000, w[2]);
+ temp = abcd;
+ w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
+ w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
+ w[0] = _mm_xor_si128(w[0], w[2]);
+
+ temp = _mm_sha1nexte_epu32(temp, w[3]);
+ e000 = abcd;
+ w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
+ w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
+ w[1] = _mm_xor_si128(w[1], w[3]);
+
+ e000 = _mm_sha1nexte_epu32(e000, w[0]);
+ temp = abcd;
+ w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);
+ w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
+ w[2] = _mm_xor_si128(w[2], w[0]);
+
+ temp = _mm_sha1nexte_epu32(temp, w[1]);
+ e000 = abcd;
+ w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
+ w[3] = _mm_xor_si128(w[3], w[1]);
+
+ e000 = _mm_sha1nexte_epu32(e000, w[2]);
+ temp = abcd;
+ w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
+ abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);
+
+ temp = _mm_sha1nexte_epu32(temp, w[3]);
+ e000 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
+
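+ /* Fold this chunk into the running state; the vector analogue
+ * of the TRUNC32(state->h[i] + x) updates in the portable code. */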
+ e000_orig = _mm_sha1nexte_epu32(e000, e000_orig);
+ abcd_orig = _mm_add_epi32(abcd, abcd_orig);
+ }
+
+ _mm_storeu_si128((__m128i *)&state->h[0], _mm_shuffle_epi32(abcd_orig, 0x1B));
+ state->h[4] = TRUNC32((uint_least32_t)_mm_extract_epi32(e000_orig, 3));
+
+ return off;
+}
+
+#endif
+
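+/*
+ * Consume as many whole chunks from `data` as possible and return
+ * the number of bytes used (always a multiple of 64).  SHA-0
+ * (LIBSHA1_0) differs in the message schedule, so it always takes
+ * the portable path.
+ */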
+size_t
+libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
+{
+#ifdef HAVE_X86_SHA_INTRINSICS
+ if (state->algorithm == LIBSHA1_1)
+ return process_x86_sha(state, data, len);
+#endif
+ return process_portable(state, data, len);
+}
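
With the new return-value contract, the caller rather than this file decides what happens to a trailing partial chunk. A minimal caller sketch follows, assuming a hypothetical update_bytes() helper; the library's real update path presumably also tracks the total message length for final padding, which is omitted here (state->chunk is the buffer field already used via sizeof above):

#include <string.h>
#include "common.h"

/* Hypothetical caller: hash whole chunks straight out of the input
 * buffer, then stash the sub-chunk tail in state->chunk. */
static void
update_bytes(struct libsha1_state *restrict state,
             const unsigned char *restrict data, size_t len)
{
	size_t off = libsha1_process(state, data, len);
	memcpy(state->chunk, &data[off], len - off); /* tail < 64 bytes */
}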