/* See LICENSE file for copyright and license details. */
#include "common.h"
#include <stdatomic.h>

#if defined(__SSE4_1__) && defined(__SSSE3__) && defined(__SSE2__) && defined(__SHA__)
# define HAVE_X86_SHA_INTRINSICS
#endif

#if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM)
# if defined(__ARM_NEON) && (defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO))
#  define HAVE_ARM_SHA_INTRINSICS
# endif
#endif

#ifdef HAVE_X86_SHA_INTRINSICS
# include <immintrin.h>
#endif

#ifdef HAVE_ARM_SHA_INTRINSICS
# include <arm_acle.h>
# include <arm_neon.h>
#endif


static inline uint_least32_t
rorl(uint_least32_t n, int k)
{
	return TRUNC32((n << k) | (n >> (32 - k)));
}


static size_t
process_portable(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
#define F0(B, C, D) (D ^ (B & (C ^ D)))
#define F1(B, C, D) (B ^ C ^ D)
#define F2(B, C, D) ((B & C) | (D & (B | C)))
#define F3(B, C, D) (B ^ C ^ D)
#define G_(A, B, C, D, E, I, F, X) (E = TRUNC32(E + rorl(A, 5) + F(B, C, D) + state->w[I] + (uint_least32_t)X##UL), B = rorl(B, 30))
#define G0(A, B, C, D, E, I) G_(A, B, C, D, E, I, F0, 0x5A827999)
#define G1(A, B, C, D, E, I) G_(A, B, C, D, E, I, F1, 0x6ED9EBA1)
#define G2(A, B, C, D, E, I) G_(A, B, C, D, E, I, F2, 0x8F1BBCDC)
#define G3(A, B, C, D, E, I) G_(A, B, C, D, E, I, F3, 0xCA62C1D6)

	uint_least32_t a, b, c, d, e;
	const unsigned char *restrict chunk;
	int i;
	size_t off = 0;

	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk)) {
		chunk = &data[off];

		/* load the 16 big-endian message words */
		for (i = 0; i < 16; i++) {
			state->w[i]  = (uint_least32_t)chunk[4 * i + 0] << 24;
			state->w[i] |= (uint_least32_t)chunk[4 * i + 1] << 16;
			state->w[i] |= (uint_least32_t)chunk[4 * i + 2] <<  8;
			state->w[i] |= (uint_least32_t)chunk[4 * i + 3];
		}

		/* extend to 80 words; SHA-1 adds the one-bit rotation that SHA-0 lacks */
		if (state->algorithm == LIBSHA1_1) {
			for (; i < 80; i++) {
				state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
				state->w[i] = rorl(state->w[i], 1);
			}
		} else {
			for (; i < 80; i++)
				state->w[i] = state->w[i - 3] ^ state->w[i - 8] ^ state->w[i - 14] ^ state->w[i - 16];
		}

		a = state->h[0];
		b = state->h[1];
		c = state->h[2];
		d = state->h[3];
		e = state->h[4];

		/* 80 rounds, in four groups of 20 */
		for (i = 0; i < 20;) {
			G0(a, b, c, d, e, i++);
			G0(e, a, b, c, d, i++);
			G0(d, e, a, b, c, i++);
			G0(c, d, e, a, b, i++);
			G0(b, c, d, e, a, i++);
		}
		while (i < 40) {
			G1(a, b, c, d, e, i++);
			G1(e, a, b, c, d, i++);
			G1(d, e, a, b, c, i++);
			G1(c, d, e, a, b, i++);
			G1(b, c, d, e, a, i++);
		}
		while (i < 60) {
			G2(a, b, c, d, e, i++);
			G2(e, a, b, c, d, i++);
			G2(d, e, a, b, c, i++);
			G2(c, d, e, a, b, i++);
			G2(b, c, d, e, a, i++);
		}
		while (i < 80) {
			G3(a, b, c, d, e, i++);
			G3(e, a, b, c, d, i++);
			G3(d, e, a, b, c, i++);
			G3(c, d, e, a, b, i++);
			G3(b, c, d, e, a, i++);
		}

		state->h[0] = TRUNC32(state->h[0] + a);
		state->h[1] = TRUNC32(state->h[1] + b);
		state->h[2] = TRUNC32(state->h[2] + c);
		state->h[3] = TRUNC32(state->h[3] + d);
		state->h[4] = TRUNC32(state->h[4] + e);
	}

	return off;

#undef F0
#undef F1
#undef F2
#undef F3
#undef G_
#undef G0
#undef G1
#undef G2
#undef G3
}
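#ifdef LIBSHA1_EXAMPLE_SELFTEST /* hypothetical guard, not defined by the build */
/*
 * Illustrative sketch, not part of the library: it exercises the portable path
 * on the FIPS 180 test vector "abc".  The 64-byte block below is "abc" already
 * padded (0x80 terminator, zero fill, 64-bit bit length 24), so one call to
 * process_portable() produces the whole digest.  It assumes that setting
 * .algorithm and .h[] is all a direct call to process_portable() needs; the
 * expected output is a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d.
 */
#include <stdio.h>

static void
libsha1_example_selftest(void)
{
	static const unsigned char block[64] = { 'a', 'b', 'c', 0x80, [63] = 0x18 };
	struct libsha1_state st = {0};
	size_t i;

	st.algorithm = LIBSHA1_1;
	st.h[0] = UINT32_C(0x67452301); /* standard SHA-1 initial values */
	st.h[1] = UINT32_C(0xEFCDAB89);
	st.h[2] = UINT32_C(0x98BADCFE);
	st.h[3] = UINT32_C(0x10325476);
	st.h[4] = UINT32_C(0xC3D2E1F0);

	process_portable(&st, block, sizeof(block));

	for (i = 0; i < 5; i++)
		printf("%08lx%s", (unsigned long)st.h[i], i < 4 ? " " : "\n");
}
#endif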
#ifdef HAVE_X86_SHA_INTRINSICS

static size_t
process_x86_sha(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
	/* reverses all 16 bytes of a loaded block: fixes both the byte order of the
	 * big-endian message words and the lane order the SHA instructions expect */
	const __m128i SHUFFLE_MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090A0B0C0D0E0FULL);
	register __m128i abcd, e000, temp, msg0, msg1, msg2, msg3;
	__m128i abcd_orig, e000_orig;
	size_t off = 0;

	abcd_orig = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state->h[0]), 033 /* 0b00'01'10'11 */);
	e000_orig = _mm_set_epi32((int)state->h[4], 0, 0, 0);

	/* one 64-byte chunk per iteration; each _mm_sha1rnds4_epu32() performs four
	 * rounds and its immediate selects the round group (constant and function) */
	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk), data = &data[sizeof(state->chunk)]) {
		msg0 = _mm_loadu_si128((const __m128i *)&data[0]);
		msg0 = _mm_shuffle_epi8(msg0, SHUFFLE_MASK);
		e000 = _mm_add_epi32(e000_orig, msg0);
		temp = abcd_orig;
		abcd = _mm_sha1rnds4_epu32(abcd_orig, e000, 0);

		msg1 = _mm_loadu_si128((const __m128i *)&data[16]);
		msg1 = _mm_shuffle_epi8(msg1, SHUFFLE_MASK);
		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);

		msg2 = _mm_loadu_si128((const __m128i *)&data[32]);
		msg2 = _mm_shuffle_epi8(msg2, SHUFFLE_MASK);
		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		msg3 = _mm_loadu_si128((const __m128i *)&data[48]);
		msg3 = _mm_shuffle_epi8(msg3, SHUFFLE_MASK);
		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 0);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		e000 = _mm_sha1nexte_epu32(e000, msg0);
		temp = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 0);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		e000 = _mm_sha1nexte_epu32(e000, msg0);
		temp = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 1);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 1);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		e000 = _mm_sha1nexte_epu32(e000, msg0);
		temp = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 2);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 2);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		e000 = _mm_sha1nexte_epu32(e000, msg0);
		temp = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		temp = _mm_sha1nexte_epu32(temp, msg1);
		e000 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);
		msg3 = _mm_xor_si128(msg3, msg1);

		e000 = _mm_sha1nexte_epu32(e000, msg2);
		temp = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
		abcd = _mm_sha1rnds4_epu32(abcd, e000, 3);

		temp = _mm_sha1nexte_epu32(temp, msg3);
		e000 = abcd;
		abcd = _mm_sha1rnds4_epu32(abcd, temp, 3);

		e000_orig = _mm_sha1nexte_epu32(e000, e000_orig);
		abcd_orig = _mm_add_epi32(abcd, abcd_orig);
	}

	_mm_storeu_si128((__m128i *)&state->h[0], _mm_shuffle_epi32(abcd_orig, 033));
	state->h[4] = (uint_least32_t)_mm_extract_epi32(e000_orig, 3);
	return off;
}

# if defined(__GNUC__)
__attribute__((__constructor__))
# endif
static int
have_sha_intrinsics(void)
{
	/* cached CPUID check: leaf 7 EBX bit 29 is the SHA extension,
	 * leaf 1 holds the SSE prerequisite bits */
	static volatile int ret = -1;
	static volatile atomic_flag spinlock = ATOMIC_FLAG_INIT;
	int a, b, c, d;

	if (ret != -1)
		return ret;
	while (atomic_flag_test_and_set(&spinlock));
	if (ret != -1)
		goto out;

	a = 7;
	c = 0;
	__asm__ volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(a), "c"(c));
	if (!(b & (1 << 29))) {
		ret = 0;
		goto out;
	}
	a = 1;
	__asm__ volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(a), "c"(c));
	if (!(c & (1 << 19)) || !(c & (1 << 0)) || !(d & (1 << 26))) {
		ret = 0;
		goto out;
	}
	ret = 1;

out:
	atomic_flag_clear(&spinlock);
	return ret;
}

#endif
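#if 0
/*
 * Standalone illustration (never compiled): the same CPUID query that
 * have_sha_intrinsics() begins with, reduced to a minimal program.  It assumes
 * the CPU supports CPUID leaf 7 (a production check would first verify the
 * maximum leaf via leaf 0); the SHA extensions are reported in EBX bit 29.
 */
#include <stdio.h>

int
main(void)
{
	int a = 7, b, c = 0, d;

	__asm__ volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(a), "c"(c));
	printf("SHA extensions: %s\n", (b & (1 << 29)) ? "yes" : "no");
	return 0;
}
#endif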
#ifdef HAVE_ARM_SHA_INTRINSICS

static size_t
process_arm_sha(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
	uint32x4_t abcd, tmp0, tmp1, msg0, msg1, msg2, msg3;
	uint32x4_t abcd_orig, rc0, rc1, rc2, rc3;
	uint32_t e0, e1, e_orig;
	size_t off = 0;

	abcd_orig = vld1q_u32(&state->h[0]);
	e_orig = state->h[4];
	rc0 = vdupq_n_u32(UINT32_C(0x5A827999));
	rc1 = vdupq_n_u32(UINT32_C(0x6ED9EBA1));
	rc2 = vdupq_n_u32(UINT32_C(0x8F1BBCDC));
	rc3 = vdupq_n_u32(UINT32_C(0xCA62C1D6));

	/* vsha1cq/vsha1pq/vsha1mq each perform four rounds with the choice, parity
	 * and majority functions; vsha1h computes the rotated e for the next group */
	for (; len >= off + sizeof(state->chunk); off += sizeof(state->chunk), data = &data[sizeof(state->chunk)]) {
		msg0 = vld1q_u32((const uint32_t *)&data[0]);
		msg1 = vld1q_u32((const uint32_t *)&data[16]);
		msg2 = vld1q_u32((const uint32_t *)&data[32]);
		msg3 = vld1q_u32((const uint32_t *)&data[48]);
		/* byte-swap the big-endian message words */
		msg0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg0)));
		msg1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg1)));
		msg2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg2)));
		msg3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg3)));

		tmp0 = vaddq_u32(msg0, rc0);
		tmp1 = vaddq_u32(msg1, rc0);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd_orig, 0));
		abcd = vsha1cq_u32(abcd_orig, e_orig, tmp0);
		tmp0 = vaddq_u32(msg2, rc0);
		msg0 = vsha1su0q_u32(msg0, msg1, msg2);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1cq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg3, rc0);
		msg0 = vsha1su1q_u32(msg0, msg3);
		msg1 = vsha1su0q_u32(msg1, msg2, msg3);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1cq_u32(abcd, e0, tmp0);
		tmp0 = vaddq_u32(msg0, rc0);
		msg1 = vsha1su1q_u32(msg1, msg0);
		msg2 = vsha1su0q_u32(msg2, msg3, msg0);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1cq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg1, rc1);
		msg2 = vsha1su1q_u32(msg2, msg1);
		msg3 = vsha1su0q_u32(msg3, msg0, msg1);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1cq_u32(abcd, e0, tmp0);
		tmp0 = vaddq_u32(msg2, rc1);
		msg3 = vsha1su1q_u32(msg3, msg2);
		msg0 = vsha1su0q_u32(msg0, msg1, msg2);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg3, rc1);
		msg0 = vsha1su1q_u32(msg0, msg3);
		msg1 = vsha1su0q_u32(msg1, msg2, msg3);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e0, tmp0);
		tmp0 = vaddq_u32(msg0, rc1);
		msg1 = vsha1su1q_u32(msg1, msg0);
		msg2 = vsha1su0q_u32(msg2, msg3, msg0);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg1, rc1);
		msg2 = vsha1su1q_u32(msg2, msg1);
		msg3 = vsha1su0q_u32(msg3, msg0, msg1);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e0, tmp0);
		tmp0 = vaddq_u32(msg2, rc2);
		msg3 = vsha1su1q_u32(msg3, msg2);
		msg0 = vsha1su0q_u32(msg0, msg1, msg2);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg3, rc2);
		msg0 = vsha1su1q_u32(msg0, msg3);
		msg1 = vsha1su0q_u32(msg1, msg2, msg3);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1mq_u32(abcd, e0, tmp0);
		tmp0 = vaddq_u32(msg0, rc2);
		msg1 = vsha1su1q_u32(msg1, msg0);
		msg2 = vsha1su0q_u32(msg2, msg3, msg0);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1mq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg1, rc2);
		msg2 = vsha1su1q_u32(msg2, msg1);
		msg3 = vsha1su0q_u32(msg3, msg0, msg1);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1mq_u32(abcd, e0, tmp0);
		tmp0 = vaddq_u32(msg2, rc2);
		msg3 = vsha1su1q_u32(msg3, msg2);
		msg0 = vsha1su0q_u32(msg0, msg1, msg2);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1mq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg3, rc3);
		msg0 = vsha1su1q_u32(msg0, msg3);
		msg1 = vsha1su0q_u32(msg1, msg2, msg3);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1mq_u32(abcd, e0, tmp0);
		tmp0 = vaddq_u32(msg0, rc3);
		msg1 = vsha1su1q_u32(msg1, msg0);
		msg2 = vsha1su0q_u32(msg2, msg3, msg0);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg1, rc3);
		msg2 = vsha1su1q_u32(msg2, msg1);
		msg3 = vsha1su0q_u32(msg3, msg0, msg1);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e0, tmp0);
		tmp0 = vaddq_u32(msg2, rc3);
		msg3 = vsha1su1q_u32(msg3, msg2);
		msg0 = vsha1su0q_u32(msg0, msg1, msg2);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e1, tmp1);
		tmp1 = vaddq_u32(msg3, rc3);
		msg0 = vsha1su1q_u32(msg0, msg3);

		e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e0, tmp0);

		e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
		abcd = vsha1pq_u32(abcd, e1, tmp1);

		e_orig += e0;
		abcd_orig = vaddq_u32(abcd_orig, abcd);
	}

	vst1q_u32(&state->h[0], abcd_orig);
	state->h[4] = e_orig;
	return off;
}

#endif

size_t
libsha1_process(struct libsha1_state *restrict state, const unsigned char *restrict data, size_t len)
{
#ifdef HAVE_X86_SHA_INTRINSICS
	if (state->algorithm == LIBSHA1_1 && have_sha_intrinsics())
		return process_x86_sha(state, data, len);
#endif
#ifdef HAVE_ARM_SHA_INTRINSICS
	if (state->algorithm == LIBSHA1_1)
		return process_arm_sha(state, data, len);
#endif
	return process_portable(state, data, len);
}
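#ifdef LIBSHA1_EXAMPLE_ARM_HWCAP /* hypothetical guard, not defined by the build */
/*
 * Hedged sketch, not part of the library: unlike the x86 path, the ARM path is
 * selected purely at compile time, since HAVE_ARM_SHA_INTRINSICS is only defined
 * when the compiler already targets the crypto extension.  If a runtime check
 * were wanted, one way on AArch64 Linux is getauxval(); HWCAP_SHA1 comes from
 * the kernel's <asm/hwcap.h>.  This is an illustration, not library behaviour.
 */
#include <sys/auxv.h>
#include <asm/hwcap.h>

static int
libsha1_example_have_arm_sha1(void)
{
	return (getauxval(AT_HWCAP) & HWCAP_SHA1) != 0;
}
#endif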