diff options
author | Mattias Andrée <maandree@kth.se> | 2022-01-19 20:28:55 +0100 |
---|---|---|
committer | Mattias Andrée <maandree@kth.se> | 2022-01-19 20:28:55 +0100 |
commit | 5d77a0178349ecac6536e0374cf689500efa22bc (patch) | |
tree | f6fcb38cd39e8f4240537233a08fdbb5c0284798 /libblake_internal_blake2s_output_digest.c | |
parent | Improve portability (diff) | |
download | libblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.gz libblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.bz2 libblake-5d77a0178349ecac6536e0374cf689500efa22bc.tar.xz |
Optimisation for amd64
Increased major number as the ABI was broken
by insertion of padding into the BLAKE2
parameter structures (except for BLAKE2Xs)
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to 'libblake_internal_blake2s_output_digest.c')
-rw-r--r-- | libblake_internal_blake2s_output_digest.c | 39 |
1 file changed, 37 insertions, 2 deletions
diff --git a/libblake_internal_blake2s_output_digest.c b/libblake_internal_blake2s_output_digest.c index d7b891c..6b4a5da 100644 --- a/libblake_internal_blake2s_output_digest.c +++ b/libblake_internal_blake2s_output_digest.c @@ -4,8 +4,18 @@ static void encode_uint32_le(unsigned char *out, uint_least32_t value, size_t bytes) { + /* Adding LIKELY to indicate that the default case is the + * expected does not affect the output */ switch (bytes) { default: + /* + * The following optimisation has been tested: + * + * 1) Changing the default case, on amd64, to + * *(uint32_t *)out = (uint32_t)value; + * break; + * result: halved performance + */ out[3] = (unsigned char)((value >> 24) & 255); /* fall through */ case 3: @@ -27,6 +37,31 @@ libblake_internal_blake2s_output_digest(struct libblake_blake2s_state *state, si { size_t i, j; - for (i = 0, j = 0; i < output_len; i += 4, j += 1) - encode_uint32_le(&output[i], state->h[j], output_len - i); +#ifdef LITTLE_ENDIAN + if ((uint_least32_t)(UINT_LEAST32_C(0xFFFFffff) + 1) == 0) { + /* No noticeable performance benefit on amd64, however + * it significantly reduces the translation size and + * a 37.5x performance benefit was seen on the 64-bit + * version on amd64; + * even though the compiler is smart enough to optimise + * `encode_uint32_le(&output[i], state->h[j], 4);` to a + * movq (on amd64), it is not smart enough to optimise + * the rest */ + memcpy(output, state->h, output_len); + return; + } +#endif + + /* Estimated to have similar performance benefit as above + * on big-endian machines */ + for (i = 0, j = 0; i + 4 < output_len; i += 4, j += 1) + encode_uint32_le(&output[i], state->h[j], 4); + encode_uint32_le(&output[i], state->h[j], output_len - i); + + /* + * Unoptimised code: + * + * for (i = 0, j = 0; i < output_len; i += 4, j += 1) + * encode_uint32_le(&output[i], state->h[j], output_len - i); + */ } |