/* See LICENSE file for copyright and license details. */
#include "common.h"

/*
 * Define HAVE_X86_SHA_INTRINSICS when the compiler predefines all of
 * __SSE4_1__, __SSSE3__, __SSE2__ and __SHA__. This is a compile-time
 * decision (no run-time CPU detection happens here); the macro gates
 * the <immintrin.h> include and the intrinsics-based implementation
 * further down in this file.
 */
#if defined(__SSE4_1__) && defined(__SSSE3__) && defined(__SSE2__) && defined(__SHA__)
# define HAVE_X86_SHA_INTRINSICS
#endif


#ifdef HAVE_X86_SHA_INTRINSICS
# include <immintrin.h>
#endif


/**
 * Rotate the low 32 bits of `n` to the left by `k` bit positions
 * (despite the "ror" in the name, bits move towards the high end)
 *
 * @param   n  Value to rotate; expected to fit in 32 bits
 * @param   k  Rotation distance; must be in [1, 31] because
 *             both `k` and `32 - k` are used as shift counts
 * @return     The rotated value, reduced with TRUNC32
 */
static inline uint_least32_t
rorl(uint_least32_t n, int k)
{
	uint_least32_t shifted_up = n << k;
	uint_least32_t wrapped_around = n >> (32 - k);

	return TRUNC32(shifted_up | wrapped_around);
}

/**
 * Portable (plain C) block processing
 *
 * Consumes as many whole chunks from `data` as `len` allows and
 * folds each one into the running hash words `state->h[0..4]`.
 * `state->w[0..79]` is overwritten as scratch space for the
 * message schedule of the chunk being processed.
 *
 * Each iteration reads 64 input bytes (16 big-endian 32-bit words)
 * and advances by `sizeof(state->chunk)`; the two are presumably
 * equal -- confirm against the definition of `struct libsha1_state`.
 *
 * @param   state  Hash state; `algorithm` selects whether the
 *                 message schedule uses the 1-bit rotation
 *                 (`LIBSHA1_1`) or omits it (any other value)
 * @param   data   Input bytes
 * @param   len    Number of bytes available in `data`
 * @return         Number of bytes consumed from `data`; always a
 *                 multiple of `sizeof(state->chunk)` and never
 *                 greater than `len`
 */
static size_t
process_portable(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
	/* Round functions for rounds 0-19, 20-39, 40-59 and 60-79 respectively */
#define F0(B, C, D) (D ^ (B & (C ^ D)))
#define F1(B, C, D) (B ^ C ^ D)
#define F2(B, C, D) ((B & C) | (D & (B | C)))
#define F3(B, C, D) (B ^ C ^ D)
	/* One round: updates E and B in place; I is expanded exactly once, so `i++` is safe as an argument */
#define G_(A, B, C, D, E, I, F, X) (E = TRUNC32(E + rorl(A, 5) + F(B, C, D) + state->w[I] + (uint_least32_t)X##UL), B = rorl(B, 30))
#define G0(A, B, C, D, E, I) G_(A, B, C, D, E, I, F0, 0x5A827999)
#define G1(A, B, C, D, E, I) G_(A, B, C, D, E, I, F1, 0x6ED9EBA1)
#define G2(A, B, C, D, E, I) G_(A, B, C, D, E, I, F2, 0x8F1BBCDC)
#define G3(A, B, C, D, E, I) G_(A, B, C, D, E, I, F3, 0xCA62C1D6)

	uint_least32_t a, b, c, d, e;
	const unsigned char *restrict chunk;
	int i;
	size_t off = 0;

	/*
	 * `off <= len` holds on every iteration, so `len - off` cannot
	 * wrap around, whereas `off + sizeof(state->chunk)` could when
	 * `len` is within one chunk of SIZE_MAX
	 */
	for (; len - off >= sizeof(state->chunk); off += sizeof(state->chunk)) {
		chunk = &data[off];

		/* Load the chunk as 16 big-endian 32-bit words */
		for (i = 0; i < 16; i++) {
			state->w[i]  = (uint_least32_t)chunk[4 * i + 0] << 24;
			state->w[i] |= (uint_least32_t)chunk[4 * i + 1] << 16;
			state->w[i] |= (uint_least32_t)chunk[4 * i + 2] <<  8;
			state->w[i] |= (uint_least32_t)chunk[4 * i + 3];
		}

		/* Expand to 80 words; only LIBSHA1_1 rotates the expanded words by one bit */
		if (state->algorithm == LIBSHA1_1) {
			for (; i < 80; i++)
				state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1);
		} else {
			for (; i < 80; i++)
				state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
		}

		a = state->h[0];
		b = state->h[1];
		c = state->h[2];
		d = state->h[3];
		e = state->h[4];

		/*
		 * Instead of shifting the five working variables after every
		 * round, the argument order is rotated; after five rounds the
		 * variables are back in their original roles
		 */
		for (i = 0; i < 20;) {
			G0(a, b, c, d, e, i++);
			G0(e, a, b, c, d, i++);
			G0(d, e, a, b, c, i++);
			G0(c, d, e, a, b, i++);
			G0(b, c, d, e, a, i++);
		}
		while (i < 40) {
			G1(a, b, c, d, e, i++);
			G1(e, a, b, c, d, i++);
			G1(d, e, a, b, c, i++);
			G1(c, d, e, a, b, i++);
			G1(b, c, d, e, a, i++);
		}
		while (i < 60) {
			G2(a, b, c, d, e, i++);
			G2(e, a, b, c, d, i++);
			G2(d, e, a, b, c, i++);
			G2(c, d, e, a, b, i++);
			G2(b, c, d, e, a, i++);
		}
		while (i < 80) {
			G3(a, b, c, d, e, i++);
			G3(e, a, b, c, d, i++);
			G3(d, e, a, b, c, i++);
			G3(c, d, e, a, b, i++);
			G3(b, c, d, e, a, i++);
		}

		/* Add this chunk's result into the running hash, modulo 2^32 */
		state->h[0] = TRUNC32(state->h[0] + a);
		state->h[1] = TRUNC32(state->h[1] + b);
		state->h[2] = TRUNC32(state->h[2] + c);
		state->h[3] = TRUNC32(state->h[3] + d);
		state->h[4] = TRUNC32(state->h[4] + e);
	}

	return off;

#undef F0
#undef F1
#undef F2
#undef F3
#undef G_
#undef G0
#undef G1
#undef G2
#undef G3
}

#ifdef HAVE_X86_SHA_INTRINSICS

/**
 * Block processing using the x86 SHA extension intrinsics
 *
 * Consumes as many whole chunks from `data` as `len` allows and
 * folds each one into the running hash words `state->h[0..4]`.
 * The hash words are loaded into vector registers once before the
 * loop and written back once after it.
 *
 * Each iteration reads 64 input bytes and advances by
 * `sizeof(state->chunk)`; the two are presumably equal -- confirm
 * against the definition of `struct libsha1_state`. The 128-bit
 * load/store of `state->h[0..3]` likewise assumes those four
 * elements are contiguous 32-bit words.
 *
 * The caller in this file only dispatches here when
 * `state->algorithm == LIBSHA1_1`.
 *
 * @param   state  Hash state whose `h[0..4]` is updated
 * @param   data   Input bytes
 * @param   len    Number of bytes available in `data`
 * @return         Number of bytes consumed from `data`; always a
 *                 multiple of `sizeof(state->chunk)` and never
 *                 greater than `len`
 */
static size_t
process_x86_sha(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
	/* Byte-shuffle control that reverses all 16 bytes of a vector */
	const __m128i SHUFFLE_MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090A0B0C0D0E0FULL);
	register __m128i abcd, e000, temp, msg0, msg1, msg2, msg3;
	__m128i abcd_orig, e000_orig;
	const unsigned char *chunk;
	size_t off = 0;

	/* 32 - 5 = 0x1B reverses the order of the four 32-bit lanes */
	abcd_orig = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state->h[0]), 32 - 5);
	e000_orig = _mm_set_epi32((int)state->h[4], 0, 0, 0);

	/*
	 * `off <= len` holds on every iteration, so `len - off` cannot
	 * wrap around, whereas `off + sizeof(state->chunk)` could when
	 * `len` is within one chunk of SIZE_MAX
	 */
	for (; len - off >= sizeof(state->chunk); off += sizeof(state->chunk)) {
		/* Address the current chunk; loading from &data[0] would rehash the first chunk every time */
		chunk = &data[off];

		/* Rounds 0-3 */
		msg0 = _mm_loadu_si128((const __m128i *)&chunk[0]);
		msg0 = _mm_shuffle_epi8(msg0, SHUFFLE_MASK);
		e000 = _mm_add_epi32(e000_orig, msg0);
		temp = abcd_orig;
		abcd = _mm_sha1rnds4_epu32(abcd_orig, e000, 0);

		/* Rounds 4-7 */
		msg1 = _mm_loadu_si128((const __m128i *)&chunk[16]);
		msg1 = _mm_shuffle_epi8(msg1, SHUFFLE_MASK);
		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);

		/* Rounds 8-11 */
		msg2 = _mm_loadu_si128((const __m128i *)&chunk[32]);
		msg2 = _mm_shuffle_epi8(msg2, SHUFFLE_MASK);
		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		/* Rounds 12-15 */
		msg3 = _mm_loadu_si128((const __m128i *)&chunk[48]);
		msg3 = _mm_shuffle_epi8(msg3, SHUFFLE_MASK);
		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		/* Rounds 16-19 */
		e000 = _mm_sha1nexte_epu32(e000, msg0);
		temp = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		/* Rounds 20-23 */
		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		/* Rounds 24-27 */
		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		/* Rounds 28-31 */
		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		/* Rounds 32-35 */
		e000 = _mm_sha1nexte_epu32(e000, msg0);
		temp = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		/* Rounds 36-39 */
		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		/* Rounds 40-43 */
		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		/* Rounds 44-47 */
		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		/* Rounds 48-51 */
		e000 = _mm_sha1nexte_epu32(e000, msg0);
		temp = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		/* Rounds 52-55 */
		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		/* Rounds 56-59 */
		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		/* Rounds 60-63 */
		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		/* Rounds 64-67 */
		e000 = _mm_sha1nexte_epu32(e000, msg0);
		temp = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		/* Rounds 68-71 */
		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
		msg3 = _mm_xor_si128(msg3, msg1);

		/* Rounds 72-75 */
		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);

		/* Rounds 76-79 */
		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);

		/* Fold this chunk's result into the running hash */
		e000_orig = _mm_sha1nexte_epu32(e000, e000_orig);
		abcd_orig = _mm_add_epi32(abcd, abcd_orig);
	}

	/* Undo the lane reversal and write the five hash words back */
	abcd_orig = _mm_shuffle_epi32(abcd_orig, 32 - 5);
	state->h[4] = (uint_least32_t)_mm_extract_epi32(e000_orig, 3);
	_mm_storeu_si128((__m128i *)&state->h[0], abcd_orig);

	return off;
}

#endif

/**
 * Process as many whole chunks of `data` as possible, updating `state`
 *
 * When the build has HAVE_X86_SHA_INTRINSICS defined and
 * `state->algorithm` is `LIBSHA1_1`, the intrinsics-based
 * implementation is used; in every other case the portable
 * implementation is used.
 *
 * @param   state  Hash state to update
 * @param   data   Input bytes
 * @param   len    Number of bytes available in `data`
 * @return         The value returned by the selected implementation
 *                 (the number of bytes it consumed)
 */
size_t
libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
#ifdef HAVE_X86_SHA_INTRINSICS
	if (state->algorithm != LIBSHA1_1)
		return process_portable(state, data, len);
	return process_x86_sha(state, data, len);
#else
	return process_portable(state, data, len);
#endif
}