/* See LICENSE file for copyright and license details. */
#include "common.h"

/*
 * The accelerated implementation is only compiled when the compiler has
 * been told that the target supports every instruction set it uses.
 */
#if defined(__SSE4_1__) && defined(__SSSE3__) && defined(__SSE2__) && defined(__SHA__)
# define HAVE_X86_SHA_INTRINSICS
#endif

#ifdef HAVE_X86_SHA_INTRINSICS
# include <immintrin.h>
#endif


/**
 * Rotate the low 32 bits of a word to the left
 * 
 * NB! Despite the name, the rotation is performed towards
 * the most significant bit
 * 
 * @param   n  The word to rotate
 * @param   k  The number of bit positions to rotate; every caller
 *             in this file passes 1, 5 or 30, and the expression
 *             shifts by `32 - k`, so 0 must not be used
 * @return     `n` rotated left by `k` bits, passed through `TRUNC32`
 */
static inline uint_least32_t
rorl(uint_least32_t n, int k)
{
	return TRUNC32((n << k) | (n >> (32 - k)));
}


/**
 * Process all complete chunks in a buffer, using only portable C
 * 
 * `state->w` is used as scratch space for the message schedule,
 * and `state->h` is updated once per processed chunk
 * 
 * @param   state  The hashing state
 * @param   data   The input buffer
 * @param   len    The number of bytes available in `data`
 * @return         The number of bytes processed, always a multiple of
 *                 `sizeof(state->chunk)`; any trailing partial chunk
 *                 is left untouched
 */
static size_t
process_portable(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
	/* Round functions, one per group of 20 rounds */
#define F0(B, C, D) (D ^ (B & (C ^ D)))
#define F1(B, C, D) (B ^ C ^ D)
#define F2(B, C, D) ((B & C) | (D & (B | C)))
#define F3(B, C, D) (B ^ C ^ D)
	/*
	 * One round: instead of shifting the five working variables, the
	 * callers rotate the argument order, so only E and B are written
	 */
#define G_(A, B, C, D, E, I, F, X) \
	(E = TRUNC32(E + rorl(A, 5) + F(B, C, D) + state->w[I] + (uint_least32_t)X##UL), \
	 B = rorl(B, 30))
#define G0(A, B, C, D, E, I) G_(A, B, C, D, E, I, F0, 0x5A827999)
#define G1(A, B, C, D, E, I) G_(A, B, C, D, E, I, F1, 0x6ED9EBA1)
#define G2(A, B, C, D, E, I) G_(A, B, C, D, E, I, F2, 0x8F1BBCDC)
#define G3(A, B, C, D, E, I) G_(A, B, C, D, E, I, F3, 0xCA62C1D6)

	uint_least32_t a, b, c, d, e;
	int i;
	size_t off = 0;

	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
		/* The first 16 schedule words are the chunk read as big-endian 32-bit words */
		for (i = 0; i < 16; i++) {
			state->w[i]  = (uint_least32_t)data[off + 4 * i + 0] << 24;
			state->w[i] |= (uint_least32_t)data[off + 4 * i + 1] << 16;
			state->w[i] |= (uint_least32_t)data[off + 4 * i + 2] <<  8;
			state->w[i] |= (uint_least32_t)data[off + 4 * i + 3];
		}

		/*
		 * Expand the schedule to 80 words; the one-bit rotation is only
		 * applied for LIBSHA1_1 (the other algorithm is presumably SHA-0)
		 */
		if (state->algorithm == LIBSHA1_1) {
			for (; i < 80; i++)
				state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1);
		} else {
			for (; i < 80; i++)
				state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
		}

		a = state->h[0];
		b = state->h[1];
		c = state->h[2];
		d = state->h[3];
		e = state->h[4];

		/* 80 rounds, unrolled five at a time so the argument rotation wraps around */
		for (i = 0; i < 20;) {
			G0(a, b, c, d, e, i++);
			G0(e, a, b, c, d, i++);
			G0(d, e, a, b, c, i++);
			G0(c, d, e, a, b, i++);
			G0(b, c, d, e, a, i++);
		}
		while (i < 40) {
			G1(a, b, c, d, e, i++);
			G1(e, a, b, c, d, i++);
			G1(d, e, a, b, c, i++);
			G1(c, d, e, a, b, i++);
			G1(b, c, d, e, a, i++);
		}
		while (i < 60) {
			G2(a, b, c, d, e, i++);
			G2(e, a, b, c, d, i++);
			G2(d, e, a, b, c, i++);
			G2(c, d, e, a, b, i++);
			G2(b, c, d, e, a, i++);
		}
		while (i < 80) {
			G3(a, b, c, d, e, i++);
			G3(e, a, b, c, d, i++);
			G3(d, e, a, b, c, i++);
			G3(c, d, e, a, b, i++);
			G3(b, c, d, e, a, i++);
		}

		/* Fold the result of this chunk into the running hash */
		state->h[0] = TRUNC32(state->h[0] + a);
		state->h[1] = TRUNC32(state->h[1] + b);
		state->h[2] = TRUNC32(state->h[2] + c);
		state->h[3] = TRUNC32(state->h[3] + d);
		state->h[4] = TRUNC32(state->h[4] + e);
	}

	return off;

#undef F0
#undef F1
#undef F2
#undef F3
#undef G_
#undef G0
#undef G1
#undef G2
#undef G3
}


#ifdef HAVE_X86_SHA_INTRINSICS

/**
 * Process all complete chunks in a buffer, using the x86 SHA extensions
 * 
 * Only called for LIBSHA1_1 (see `libsha1_process`). `state->h[0..3]`
 * is loaded and stored as one 128-bit vector, so this code relies on
 * `uint_least32_t` being exactly 32 bits wide, which is presumed to
 * hold on every target where this function is compiled
 * 
 * @param   state  The hashing state; only `state->h` is read and written
 * @param   data   The input buffer
 * @param   len    The number of bytes available in `data`
 * @return         The number of bytes processed, always a multiple of
 *                 `sizeof(state->chunk)`; any trailing partial chunk
 *                 is left untouched
 */
static size_t
process_x86_sha(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
	/* Byte-reverses the whole 128-bit vector: big-endian words in, reversed word order out */
	const __m128i SHUFFLE_MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090A0B0C0D0E0FULL);
	/* No `register` here: subscripting a `register` array is undefined in ISO C */
	__m128i abcd, e000, temp, w[4];
	__m128i abcd_orig, e000_orig;
	const unsigned char *chunk;
	size_t off = 0;

	/* 0x1B reverses the order of the four 32-bit words, putting h[0] in the top lane */
	abcd_orig = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state->h[0]), 0x1B);
	e000_orig = _mm_set_epi32(state->h[4], 0, 0, 0);

	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
		/* The message words must be read from the current chunk, not from the start of the buffer */
		chunk = &data[off];

		/* Rounds 0-3 */
		w[0] = _mm_loadu_si128((const __m128i *)&chunk[0]);
		w[0] = _mm_shuffle_epi8(w[0], SHUFFLE_MASK);
		e000 = _mm_add_epi32(e000_orig, w[0]);
		temp = abcd_orig;
		abcd = _mm_sha1rnds4_epu32(abcd_orig, e000, 0);

		/* Rounds 4-7 */
		w[1] = _mm_loadu_si128((const __m128i *)&chunk[16]);
		w[1] = _mm_shuffle_epi8(w[1], SHUFFLE_MASK);
		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);

		/* Rounds 8-11 */
		w[2] = _mm_loadu_si128((const __m128i *)&chunk[32]);
		w[2] = _mm_shuffle_epi8(w[2], SHUFFLE_MASK);
		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
		w[0] = _mm_xor_si128(w[0], w[2]);

		/* Rounds 12-15 */
		w[3] = _mm_loadu_si128((const __m128i *)&chunk[48]);
		w[3] = _mm_shuffle_epi8(w[3], SHUFFLE_MASK);
		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
		w[1] = _mm_xor_si128(w[1], w[3]);

		/* Rounds 16-19 */
		e000 = _mm_sha1nexte_epu32(e000, w[0]);
		temp = abcd;
		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
		w[2] = _mm_xor_si128(w[2], w[0]);

		/* Rounds 20-23 */
		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
		w[3] = _mm_xor_si128(w[3], w[1]);

		/* Rounds 24-27 */
		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
		w[0] = _mm_xor_si128(w[0], w[2]);

		/* Rounds 28-31 */
		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
		w[1] = _mm_xor_si128(w[1], w[3]);

		/* Rounds 32-35 */
		e000 = _mm_sha1nexte_epu32(e000, w[0]);
		temp = abcd;
		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
		w[2] = _mm_xor_si128(w[2], w[0]);

		/* Rounds 36-39 */
		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
		w[3] = _mm_xor_si128(w[3], w[1]);

		/* Rounds 40-43 */
		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
		w[0] = _mm_xor_si128(w[0], w[2]);

		/* Rounds 44-47 */
		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
		w[1] = _mm_xor_si128(w[1], w[3]);

		/* Rounds 48-51 */
		e000 = _mm_sha1nexte_epu32(e000, w[0]);
		temp = abcd;
		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
		w[2] = _mm_xor_si128(w[2], w[0]);

		/* Rounds 52-55 */
		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
		w[3] = _mm_xor_si128(w[3], w[1]);

		/* Rounds 56-59 */
		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
		w[0] = _mm_xor_si128(w[0], w[2]);

		/* Rounds 60-63 */
		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
		w[1] = _mm_xor_si128(w[1], w[3]);

		/* Rounds 64-67 */
		e000 = _mm_sha1nexte_epu32(e000, w[0]);
		temp = abcd;
		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);
		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
		w[2] = _mm_xor_si128(w[2], w[0]);

		/* Rounds 68-71 */
		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
		w[3] = _mm_xor_si128(w[3], w[1]);

		/* Rounds 72-75 */
		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);

		/* Rounds 76-79 */
		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);

		/* Fold the result of this chunk into the running hash */
		e000_orig = _mm_sha1nexte_epu32(e000, e000_orig);
		abcd_orig = _mm_add_epi32(abcd, abcd_orig);
	}

	/* Undo the word reversal before writing h[0..3] back; h[4] lives in the top lane */
	_mm_storeu_si128((__m128i *)&state->h[0], _mm_shuffle_epi32(abcd_orig, 0x1B));
	state->h[4] = _mm_extract_epi32(e000_orig, 3);

	return off;
}

#endif


/**
 * Process all complete chunks in a buffer
 * 
 * Uses the x86 SHA extensions when they were enabled at compile time
 * and the selected algorithm is LIBSHA1_1, and the portable
 * implementation otherwise
 * 
 * @param   state  The hashing state
 * @param   data   The input buffer
 * @param   len    The number of bytes available in `data`
 * @return         The number of bytes processed, always a multiple of
 *                 `sizeof(state->chunk)`; any trailing partial chunk
 *                 is left untouched
 */
size_t
libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
#ifdef HAVE_X86_SHA_INTRINSICS
	if (state->algorithm == LIBSHA1_1)
		return process_x86_sha(state, data, len);
#endif
	return process_portable(state, data, len);
}