aboutsummaryrefslogblamecommitdiffstats
path: root/process.c
blob: 5a4d9ac2bad9f16ac9182ba37568f084e4e7af7d (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12


                                                         








                                                                                      
 

                             
 
                                                   

 

                                                                                                      
 








                                                                                                                                    
 
                                     
              
                       
 






















































                                                                                                                               
         

                   




         
         




         











































































































































































                                                                                                     
/* See LICENSE file for copyright and license details. */
#include "common.h"

#if defined(__SSE4_1__) && defined(__SSSE3__) && defined(__SSE2__) && defined(__SHA__)
# define HAVE_X86_SHA_INTRINSICS
#endif


#ifdef HAVE_X86_SHA_INTRINSICS
# include <immintrin.h>
#endif


static inline uint_least32_t
rorl(uint_least32_t n, int k)
{
	return TRUNC32((n << k) | (n >> (32 - k)));
}

static size_t
process_portable(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
#define F0(B, C, D) (D ^ (B & (C ^ D)))
#define F1(B, C, D) (B ^ C ^ D)
#define F2(B, C, D) ((B & C) | (D & (B | C)))
#define F3(B, C, D) (B ^ C ^ D)
#define G_(A, B, C, D, E, I, F, X) (E = TRUNC32(E + rorl(A, 5) + F(B, C, D) + state->w[I] + (uint_least32_t)X##UL), B = rorl(B, 30))
#define G0(A, B, C, D, E, I) G_(A, B, C, D, E, I, F0, 0x5A827999)
#define G1(A, B, C, D, E, I) G_(A, B, C, D, E, I, F1, 0x6ED9EBA1)
#define G2(A, B, C, D, E, I) G_(A, B, C, D, E, I, F2, 0x8F1BBCDC)
#define G3(A, B, C, D, E, I) G_(A, B, C, D, E, I, F3, 0xCA62C1D6)

	uint_least32_t a, b, c, d, e;
	int i;
	size_t off = 0;

	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
		for (i = 0; i < 16; i++) {
			state->w[i]  = (uint_least32_t)data[off + 4 * i + 0] << 24;
			state->w[i] |= (uint_least32_t)data[off + 4 * i + 1] << 16;
			state->w[i] |= (uint_least32_t)data[off + 4 * i + 2] <<  8;
			state->w[i] |= (uint_least32_t)data[off + 4 * i + 3];
		}
		if (state->algorithm == LIBSHA1_1) {
			for (; i < 80; i++)
				state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1);
		} else {
			for (; i < 80; i++)
				state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
		}

		a = state->h[0];
		b = state->h[1];
		c = state->h[2];
		d = state->h[3];
		e = state->h[4];

		for (i = 0; i < 20;) {
			G0(a, b, c, d, e, i++);
			G0(e, a, b, c, d, i++);
			G0(d, e, a, b, c, i++);
			G0(c, d, e, a, b, i++);
			G0(b, c, d, e, a, i++);
		}
		while (i < 40) {
			G1(a, b, c, d, e, i++);
			G1(e, a, b, c, d, i++);
			G1(d, e, a, b, c, i++);
			G1(c, d, e, a, b, i++);
			G1(b, c, d, e, a, i++);
		}
		while (i < 60) {
			G2(a, b, c, d, e, i++);
			G2(e, a, b, c, d, i++);
			G2(d, e, a, b, c, i++);
			G2(c, d, e, a, b, i++);
			G2(b, c, d, e, a, i++);
		}
		while (i < 80) {
			G3(a, b, c, d, e, i++);
			G3(e, a, b, c, d, i++);
			G3(d, e, a, b, c, i++);
			G3(c, d, e, a, b, i++);
			G3(b, c, d, e, a, i++);
		}

		state->h[0] = TRUNC32(state->h[0] + a);
		state->h[1] = TRUNC32(state->h[1] + b);
		state->h[2] = TRUNC32(state->h[2] + c);
		state->h[3] = TRUNC32(state->h[3] + d);
		state->h[4] = TRUNC32(state->h[4] + e);
	}

	return off;

#undef F0
#undef F1
#undef F2
#undef F3
#undef G_
#undef G0
#undef G1
#undef G2
#undef G3
}

#ifdef HAVE_X86_SHA_INTRINSICS

static size_t
process_x86_sha(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
	const __m128i SHUFFLE_MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090A0B0C0D0E0FULL);
	register __m128i abcd, e000, temp, w[4];
	__m128i abcd_orig, e000_orig;
	size_t off = 0;

	abcd_orig = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state->h[0]), 32 - 5);
	e000_orig = _mm_set_epi32(state->h[4], 0, 0, 0);

	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
		w[0] = _mm_loadu_si128((const __m128i *)&data[0]);
		w[0] = _mm_shuffle_epi8(w[0], SHUFFLE_MASK);
		e000 = _mm_add_epi32(e000_orig, w[0]);
		temp = abcd_orig;
		abcd = _mm_sha1rnds4_epu32(abcd_orig, e000, 0);

		w[1] = _mm_loadu_si128((const __m128i *)&data[16]);
		w[1] = _mm_shuffle_epi8(w[1], SHUFFLE_MASK);
		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);

		w[2] = _mm_loadu_si128((const __m128i *)&data[32]);
		w[2] = _mm_shuffle_epi8(w[2], SHUFFLE_MASK);
		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
		w[0] = _mm_xor_si128(w[0], w[2]);

		w[3] = _mm_loadu_si128((const __m128i *)&data[48]);
		w[3] = _mm_shuffle_epi8(w[3], SHUFFLE_MASK);
		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
		w[1] = _mm_xor_si128(w[1], w[3]);

		e000 = _mm_sha1nexte_epu32(e000, w[0]);
		temp = abcd;
		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
		w[2] = _mm_xor_si128(w[2], w[0]);

		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
		w[3] = _mm_xor_si128(w[3], w[1]);

		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
		w[0] = _mm_xor_si128(w[0], w[2]);

		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
		w[1] = _mm_xor_si128(w[1], w[3]);

		e000 = _mm_sha1nexte_epu32(e000, w[0]);
		temp = abcd;
		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
		w[2] = _mm_xor_si128(w[2], w[0]);

		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
		w[3] = _mm_xor_si128(w[3], w[1]);

		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
		w[0] = _mm_xor_si128(w[0], w[2]);

		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
		w[1] = _mm_xor_si128(w[1], w[3]);

		e000 = _mm_sha1nexte_epu32(e000, w[0]);
		temp = abcd;
		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
		w[2] = _mm_xor_si128(w[2], w[0]);

		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
		w[0] = _mm_sha1msg1_epu32(w[0], w[1]);
		w[3] = _mm_xor_si128(w[3], w[1]);

		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		w[1] = _mm_sha1msg1_epu32(w[1], w[2]);
		w[0] = _mm_xor_si128(w[0], w[2]);

		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		w[0] = _mm_sha1msg2_epu32(w[0], w[3]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
		w[2] = _mm_sha1msg1_epu32(w[2], w[3]);
		w[1] = _mm_xor_si128(w[1], w[3]);

		e000 = _mm_sha1nexte_epu32(e000, w[0]);
		temp = abcd;
		w[1] = _mm_sha1msg2_epu32(w[1], w[0]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);
		w[3] = _mm_sha1msg1_epu32(w[3], w[0]);
		w[2] = _mm_xor_si128(w[2], w[0]);

		temp = _mm_sha1nexte_epu32(temp, w[1]);
		e000 = abcd;
		w[2] = _mm_sha1msg2_epu32(w[2], w[1]);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
		w[3] = _mm_xor_si128(w[3], w[1]);

		e000 = _mm_sha1nexte_epu32(e000, w[2]);
		temp = abcd;
		w[3] = _mm_sha1msg2_epu32(w[3], w[2]);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);

		temp = _mm_sha1nexte_epu32(temp, w[3]);
		e000 = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);

		e000_orig = _mm_sha1nexte_epu32(e000, e000_orig);
		abcd_orig = _mm_add_epi32(abcd, abcd_orig);
	}

	_mm_storeu_si128((__m128i *)&state->h[0], _mm_shuffle_epi32(abcd_orig, 32 - 5));
	state->h[4] = _mm_extract_epi32(e000_orig, 3);

	return off;
}

#endif

size_t
libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
#ifdef HAVE_X86_SHA_INTRINSICS
	if (state->algorithm == LIBSHA1_1)
		return process_x86_sha(state, data, len);
#endif
	return process_portable(state, data, len);
}