diff options
| author | Mattias Andrée <maandree@kth.se> | 2022-07-08 00:47:04 +0200 | 
|---|---|---|
| committer | Mattias Andrée <maandree@kth.se> | 2022-07-08 00:47:04 +0200 | 
| commit | c5b86d52256d535149caefbe1031807b28c8face (patch) | |
| tree | 0f5b52e3196ca435365244b7cc3b97f9fb9750c5 | |
| parent | m (diff) | |
| download | libsha1-c5b86d52256d535149caefbe1031807b28c8face.tar.gz libsha1-c5b86d52256d535149caefbe1031807b28c8face.tar.bz2 libsha1-c5b86d52256d535149caefbe1031807b28c8face.tar.xz | |
Add code using SHA intrinsics
Signed-off-by: Mattias Andrée <maandree@kth.se>
| -rw-r--r-- | common.h | 8 | ||||
| -rw-r--r-- | config.mk | 2 | ||||
| -rw-r--r-- | digest.c | 4 | ||||
| -rw-r--r-- | process.c | 294 | ||||
| -rw-r--r-- | update.c | 14 | 
5 files changed, 254 insertions, 68 deletions
| @@ -27,10 +27,12 @@  /**   * Process a chunk using SHA-1 or SHA-0   *  - * @param  state  The hashing state - * @param  chunk  The data to process + * @param   state  The hashing state + * @param   data   The data to process + * @param   len    The number of available bytes + * @return         The number of processed bytes   */  #if defined(__GNUC__)  __attribute__((__nonnull__, __nothrow__))  #endif -void libsha1_process(struct libsha1_state *restrict, const unsigned char *restrict); +size_t libsha1_process(struct libsha1_state *restrict, const unsigned char *restrict, size_t); @@ -4,7 +4,7 @@ MANPREFIX = $(PREFIX)/share/man  CC = c99  CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -CFLAGS   = -Wall -O3 +CFLAGS   = -Wall -O3 -msse4 -msha  LDFLAGS  = -s  # You can add -DALLOCA_LIMIT=# to CPPFLAGS, where # is a size_t @@ -29,7 +29,7 @@ libsha1_digest(struct libsha1_state *restrict state, const void *message_, size_  	if (off > sizeof(state->chunk) - (size_t)8) {  		memset(state->chunk + off, 0, sizeof(state->chunk) - off);  		off = 0; -		libsha1_process(state, state->chunk); +		libsha1_process(state, state->chunk, sizeof(state->chunk));  	}  	memset(state->chunk + off, 0, sizeof(state->chunk) - 8 - off); @@ -41,7 +41,7 @@ libsha1_digest(struct libsha1_state *restrict state, const void *message_, size_  	state->chunk[sizeof(state->chunk) - 3] = (unsigned char)(state->message_size >> 16);  	state->chunk[sizeof(state->chunk) - 2] = (unsigned char)(state->message_size >>  8);  	state->chunk[sizeof(state->chunk) - 1] = (unsigned char)(state->message_size >>  0); -	libsha1_process(state, state->chunk); +	libsha1_process(state, state->chunk, sizeof(state->chunk));  	n = libsha1_algorithm_output_size(state->algorithm);  	for (i = 0, n /= 4; i < n; i++) { @@ -1,6 +1,15 @@  /* See LICENSE file for copyright and license details. */  #include "common.h" +#if defined(__SSE4_1__) && defined(__SSSE3__) && defined(__SSE2__) && defined(__SHA__) +# define HAVE_X86_SHA_INTRINSICS +#endif + + +#ifdef HAVE_X86_SHA_INTRINSICS +# include <immintrin.h> +#endif +  static inline uint_least32_t  rorl(uint_least32_t n, int k) @@ -8,9 +17,8 @@ rorl(uint_least32_t n, int k)  	return TRUNC32((n << k) | (n >> (32 - k)));  } - -void -libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict chunk) +static size_t +process_portable(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)  {  #define F0(B, C, D) (D ^ (B & (C ^ D)))  #define F1(B, C, D) (B ^ C ^ D) @@ -24,58 +32,66 @@ libsha1_process(struct libsha1_state *restrict state, const unsigned char *restr  	uint_least32_t a, b, c, d, e;  	int i; +	size_t off = 0; -	for (i = 0; i < 16; i++) { -		state->w[i]  = (uint_least32_t)chunk[4 * i + 0] << 24; -		state->w[i] |= (uint_least32_t)chunk[4 * i + 1] << 16; -		state->w[i] |= (uint_least32_t)chunk[4 * i + 2] <<  8; -		state->w[i] |= (uint_least32_t)chunk[4 * i + 3]; -	} -	if (state->algorithm == LIBSHA1_1) { -		for (; i < 80; i++) -			state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1); -	} else { -		for (; i < 80; i++) -			state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16]; -	} -	a = state->h[0]; -	b = state->h[1]; -	c = state->h[2]; -	d = state->h[3]; -	e = state->h[4]; -	for (i = 0; i < 20;) { -		G0(a, b, c, d, e, i++); -		G0(e, a, b, c, d, i++); -		G0(d, e, a, b, c, i++); -		G0(c, d, e, a, b, i++); -		G0(b, c, d, e, a, i++); -	} -	while (i < 40) { -		G1(a, b, c, d, e, i++); -		G1(e, a, b, c, d, i++); -		G1(d, e, a, b, c, i++); -		G1(c, d, e, a, b, i++); -		G1(b, c, d, e, a, i++); -	} -	while (i < 60) { -		G2(a, b, c, d, e, i++); -		G2(e, a, b, c, d, i++); -		G2(d, e, a, b, c, i++); -		G2(c, d, e, a, b, i++); -		G2(b, c, d, e, a, i++); -	} -	while (i < 80) { -		G3(a, b, c, d, e, i++); -		G3(e, a, b, c, d, i++); -		G3(d, e, a, b, c, i++); -		G3(c, d, e, a, b, i++); -		G3(b, c, d, e, a, i++); +	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) { +		for (i = 0; i < 16; i++) { +			state->w[i]  = (uint_least32_t)data[off + 4 * i + 0] << 24; +			state->w[i] |= (uint_least32_t)data[off + 4 * i + 1] << 16; +			state->w[i] |= (uint_least32_t)data[off + 4 * i + 2] <<  8; +			state->w[i] |= (uint_least32_t)data[off + 4 * i + 3]; +		} +		if (state->algorithm == LIBSHA1_1) { +			for (; i < 80; i++) +				state->w[i] = rorl(state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16], 1); +		} else { +			for (; i < 80; i++) +				state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16]; +		} + +		a = state->h[0]; +		b = state->h[1]; +		c = state->h[2]; +		d = state->h[3]; +		e = state->h[4]; + +		for (i = 0; i < 20;) { +			G0(a, b, c, d, e, i++); +			G0(e, a, b, c, d, i++); +			G0(d, e, a, b, c, i++); +			G0(c, d, e, a, b, i++); +			G0(b, c, d, e, a, i++); +		} +		while (i < 40) { +			G1(a, b, c, d, e, i++); +			G1(e, a, b, c, d, i++); +			G1(d, e, a, b, c, i++); +			G1(c, d, e, a, b, i++); +			G1(b, c, d, e, a, i++); +		} +		while (i < 60) { +			G2(a, b, c, d, e, i++); +			G2(e, a, b, c, d, i++); +			G2(d, e, a, b, c, i++); +			G2(c, d, e, a, b, i++); +			G2(b, c, d, e, a, i++); +		} +		while (i < 80) { +			G3(a, b, c, d, e, i++); +			G3(e, a, b, c, d, i++); +			G3(d, e, a, b, c, i++); +			G3(c, d, e, a, b, i++); +			G3(b, c, d, e, a, i++); +		} + +		state->h[0] = TRUNC32(state->h[0] + a); +		state->h[1] = TRUNC32(state->h[1] + b); +		state->h[2] = TRUNC32(state->h[2] + c); +		state->h[3] = TRUNC32(state->h[3] + d); +		state->h[4] = TRUNC32(state->h[4] + e);  	} -	state->h[0] = TRUNC32(state->h[0] + a); -	state->h[1] = TRUNC32(state->h[1] + b); -	state->h[2] = TRUNC32(state->h[2] + c); -	state->h[3] = TRUNC32(state->h[3] + d); -	state->h[4] = TRUNC32(state->h[4] + e); + +	return off;  #undef F0  #undef F1 @@ -87,3 +103,175 @@ libsha1_process(struct libsha1_state *restrict state, const unsigned char *restr  #undef G2  #undef G3  } + +#ifdef HAVE_X86_SHA_INTRINSICS + +static size_t +process_x86_sha(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len) +{ +	const __m128i SHUFFLE_MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090A0B0C0D0E0FULL); +	register __m128i abcd, e000, temp, w[4]; +	__m128i abcd_orig, e000_orig; +	size_t off = 0; + +	abcd_orig = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state->h[0]), 32 - 5); +	e000_orig = _mm_set_epi32(state->h[4], 0, 0, 0); + +	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) { +		w[0] = _mm_loadu_si128((const __m128i *)&data[0]); +		w[0] = _mm_shuffle_epi8(w[0], SHUFFLE_MASK); +		e000 = _mm_add_epi32(e000_orig, w[0]); +		temp = abcd_orig; +		abcd = _mm_sha1rnds4_epu32(abcd_orig, e000, 0); + +		w[1] = _mm_loadu_si128((const __m128i *)&data[16]); +		w[1] = _mm_shuffle_epi8(w[1], SHUFFLE_MASK); +		temp = _mm_sha1nexte_epu32(temp, w[1]); +		e000 = abcd; +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0); +		w[0] = _mm_sha1msg1_epu32(w[0], w[1]); + +		w[2] = _mm_loadu_si128((const __m128i *)&data[32]); +		w[2] = _mm_shuffle_epi8(w[2], SHUFFLE_MASK); +		e000 = _mm_sha1nexte_epu32(e000, w[2]); +		temp = abcd; +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0); +		w[1] = _mm_sha1msg1_epu32(w[1], w[2]); +		w[0] = _mm_xor_si128(w[0], w[2]); + +		w[3] = _mm_loadu_si128((const __m128i *)&data[48]); +		w[3] = _mm_shuffle_epi8(w[3], SHUFFLE_MASK); +		temp = _mm_sha1nexte_epu32(temp, w[3]); +		e000 = abcd; +		w[0] = _mm_sha1msg2_epu32(w[0], w[3]); +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0); +		w[2] = _mm_sha1msg1_epu32(w[2], w[3]); +		w[1] = _mm_xor_si128(w[1], w[3]); + +		e000 = _mm_sha1nexte_epu32(e000, w[0]); +		temp = abcd; +		w[1] = _mm_sha1msg2_epu32(w[1], w[0]); +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0); +		w[3] = _mm_sha1msg1_epu32(w[3], w[0]); +		w[2] = _mm_xor_si128(w[2], w[0]); + +		temp = _mm_sha1nexte_epu32(temp, w[1]); +		e000 = abcd; +		w[2] = _mm_sha1msg2_epu32(w[2], w[1]); +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1); +		w[0] = _mm_sha1msg1_epu32(w[0], w[1]); +		w[3] = _mm_xor_si128(w[3], w[1]); + +		e000 = _mm_sha1nexte_epu32(e000, w[2]); +		temp = abcd; +		w[3] = _mm_sha1msg2_epu32(w[3], w[2]); +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1); +		w[1] = _mm_sha1msg1_epu32(w[1], w[2]); +		w[0] = _mm_xor_si128(w[0], w[2]); + +		temp = _mm_sha1nexte_epu32(temp, w[3]); +		e000 = abcd; +		w[0] = _mm_sha1msg2_epu32(w[0], w[3]); +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1); +		w[2] = _mm_sha1msg1_epu32(w[2], w[3]); +		w[1] = _mm_xor_si128(w[1], w[3]); + +		e000 = _mm_sha1nexte_epu32(e000, w[0]); +		temp = abcd; +		w[1] = _mm_sha1msg2_epu32(w[1], w[0]); +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1); +		w[3] = _mm_sha1msg1_epu32(w[3], w[0]); +		w[2] = _mm_xor_si128(w[2], w[0]); + +		temp = _mm_sha1nexte_epu32(temp, w[1]); +		e000 = abcd; +		w[2] = _mm_sha1msg2_epu32(w[2], w[1]); +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1); +		w[0] = _mm_sha1msg1_epu32(w[0], w[1]); +		w[3] = _mm_xor_si128(w[3], w[1]); + +		e000 = _mm_sha1nexte_epu32(e000, w[2]); +		temp = abcd; +		w[3] = _mm_sha1msg2_epu32(w[3], w[2]); +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2); +		w[1] = _mm_sha1msg1_epu32(w[1], w[2]); +		w[0] = _mm_xor_si128(w[0], w[2]); + +		temp = _mm_sha1nexte_epu32(temp, w[3]); +		e000 = abcd; +		w[0] = _mm_sha1msg2_epu32(w[0], w[3]); +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2); +		w[2] = _mm_sha1msg1_epu32(w[2], w[3]); +		w[1] = _mm_xor_si128(w[1], w[3]); + +		e000 = _mm_sha1nexte_epu32(e000, w[0]); +		temp = abcd; +		w[1] = _mm_sha1msg2_epu32(w[1], w[0]); +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2); +		w[3] = _mm_sha1msg1_epu32(w[3], w[0]); +		w[2] = _mm_xor_si128(w[2], w[0]); + +		temp = _mm_sha1nexte_epu32(temp, w[1]); +		e000 = abcd; +		w[2] = _mm_sha1msg2_epu32(w[2], w[1]); +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2); +		w[0] = _mm_sha1msg1_epu32(w[0], w[1]); +		w[3] = _mm_xor_si128(w[3], w[1]); + +		e000 = _mm_sha1nexte_epu32(e000, w[2]); +		temp = abcd; +		w[3] = _mm_sha1msg2_epu32(w[3], w[2]); +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2); +		w[1] = _mm_sha1msg1_epu32(w[1], w[2]); +		w[0] = _mm_xor_si128(w[0], w[2]); + +		temp = _mm_sha1nexte_epu32(temp, w[3]); +		e000 = abcd; +		w[0] = _mm_sha1msg2_epu32(w[0], w[3]); +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3); +		w[2] = _mm_sha1msg1_epu32(w[2], w[3]); +		w[1] = _mm_xor_si128(w[1], w[3]); + +		e000 = _mm_sha1nexte_epu32(e000, w[0]); +		temp = abcd; +		w[1] = _mm_sha1msg2_epu32(w[1], w[0]); +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3); +		w[3] = _mm_sha1msg1_epu32(w[3], w[0]); +		w[2] = _mm_xor_si128(w[2], w[0]); + +		temp = _mm_sha1nexte_epu32(temp, w[1]); +		e000 = abcd; +		w[2] = _mm_sha1msg2_epu32(w[2], w[1]); +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3); +		w[3] = _mm_xor_si128(w[3], w[1]); + +		e000 = _mm_sha1nexte_epu32(e000, w[2]); +		temp = abcd; +		w[3] = _mm_sha1msg2_epu32(w[3], w[2]); +		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3); + +		temp = _mm_sha1nexte_epu32(temp, w[3]); +		e000 = abcd; +		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3); + +		e000_orig = _mm_sha1nexte_epu32(e000, e000_orig); +		abcd_orig = _mm_add_epi32(abcd, abcd_orig); +	} + +	_mm_storeu_si128((__m128i *)&state->h[0], _mm_shuffle_epi32(abcd_orig, 32 - 5)); +	state->h[4] = _mm_extract_epi32(e000_orig, 3); + +	return off; +} + +#endif + +size_t +libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len) +{ +#ifdef HAVE_X86_SHA_INTRINSICS +	if (state->algorithm == LIBSHA1_1) +		return process_x86_sha(state, data, len); +#endif +	return process_portable(state, data, len); +} @@ -5,7 +5,7 @@  void  libsha1_update(struct libsha1_state *restrict state, const void *restrict message_, size_t msglen)  { -	const char *restrict message = message_; +	const unsigned char *restrict message = message_;  	size_t n, off;  	off = (state->message_size / 8) % sizeof(state->chunk); @@ -16,17 +16,13 @@ libsha1_update(struct libsha1_state *restrict state, const void *restrict messag  		n = msglen < sizeof(state->chunk) - off ? msglen : sizeof(state->chunk) - off;  		memcpy(&state->chunk[off], message, n);  		if (off + n == sizeof(state->chunk)) -			libsha1_process(state, state->chunk); +			libsha1_process(state, state->chunk, sizeof(state->chunk));  		message += n;  		msglen -= n;  	} -	while (msglen >= sizeof(state->chunk)) { -		libsha1_process(state, (const unsigned char *)message); -		message += sizeof(state->chunk); -		msglen -= sizeof(state->chunk); -	} +	off = libsha1_process(state, message, msglen); -	if (msglen) -		memcpy(state->chunk, message, msglen); +	if (msglen > off) +		memcpy(state->chunk, &message[off], msglen - off);  } | 
