aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Andrée <maandree@kth.se>2022-02-17 11:08:42 +0100
committerMattias Andrée <maandree@kth.se>2022-02-17 11:11:09 +0100
commit89a3b78d6818a7e862610a7619bde1a4daa95305 (patch)
tree521ed8f8f341d9e39540b16fe7835e3fd697ce58
parentFix previous commit (diff)
downloadlibar2-89a3b78d6818a7e862610a7619bde1a4daa95305.tar.gz
libar2-89a3b78d6818a7e862610a7619bde1a4daa95305.tar.bz2
libar2-89a3b78d6818a7e862610a7619bde1a4daa95305.tar.xz
Add optimisations (no yet used)
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r--common.h12
-rw-r--r--libar2_hash.c157
-rw-r--r--test.c2
3 files changed, 158 insertions, 13 deletions
diff --git a/common.h b/common.h
index 958c7b0..edfaed7 100644
--- a/common.h
+++ b/common.h
@@ -29,7 +29,8 @@
#if defined(__GNUC__)
-# define LIBAR2_WEAKLY_LINKED__ __attribute__((weak))
+# define LIBAR2_WEAKLY_LINKED__ __attribute__((__weak__))
+# define LIBAR2_TARGET__(TARGETS) __attribute__((__target__(TARGETS)))
#endif
@@ -86,6 +87,15 @@
#endif
+#if defined(__x86_64__) && defined(LIBAR2_TARGET__)
+# define MAX_SIMD_ALIGNMENT 64
+# define SIMD_ALIGNED _Alignas(MAX_SIMD_ALIGNMENT)
+#else
+# define MAX_SIMD_ALIGNMENT 1
+# define SIMD_ALIGNED /* use the types native alignment */
+#endif
+
+
#define ELEMSOF(ARR) (sizeof(ARR) / sizeof(*(ARR)))
#define MAX(A, B) ((A) > (B) ? (A) : (B))
diff --git a/libar2_hash.c b/libar2_hash.c
index 49bf61a..b52133b 100644
--- a/libar2_hash.c
+++ b/libar2_hash.c
@@ -2,6 +2,10 @@
#define WARN_UNKNOWN_ENDIAN
#include "common.h"
+#if defined(__x86_64__)
+# include <immintrin.h>
+#endif
+
struct threaded_fill_segments_params {
struct block *memory;
@@ -31,16 +35,136 @@ static const struct libblake_blake2b_params b2params = {
static const struct block zerob; /* implicitly zeroed via `static` */
+#if defined(__x86_64__) && defined(LIBAR2_TARGET__)
+
+/* Relative, approximate, execution times for blockxor:
+ * Over large memory Repeatedly over one block
+ * plain old xor: 1 1
+ * _mm_xor_si128: 1.055 1.031 (removed becaused it's slow)
+ * _mm256_xor_si256: 0.828 0.514
+ * _mm512_xor_si512: I don't have the means to test this
+ *
+ * No difference noted between having a function pointer
+ * to the plain-old-xor version and just implementing it
+ * with a regular function.
+ *
+ * Similar performance is observed for blockxor3 and blockcpy
+ */
+
+LIBAR2_TARGET__("avx2")
+static void
+blockxor_avx2(struct block *a_, const struct block *b_)
+{
+ __m256i *a = (__m256i *)a_;
+ const __m256i *b = (const __m256i *)b_;
+ size_t i;
+ for (i = 0; i < sizeof(*a_) / (256 / 8); i++)
+ a[i] = _mm256_xor_si256(a[i], b[i]);
+
+}
+
+LIBAR2_TARGET__("avx512f")
+static void
+blockxor_avx512f(struct block *a_, const struct block *b_)
+{
+ __m512i *a = (__m512i *)a_;
+ const __m512i *b = (const __m512i *)b_;
+ size_t i;
+ for (i = 0; i < sizeof(*a_) / (512 / 8); i++)
+ a[i] = _mm512_xor_si512(a[i], b[i]);
+}
+
+static void
+blockxor_vanilla(struct block *a, const struct block *b)
+{
+ size_t i;
+ for (i = 0; i < ELEMSOF(a->w); i++)
+ a->w[i] ^= b->w[i];
+}
+
+static void (*blockxor)(struct block *a, const struct block *b) = &blockxor_vanilla; /* TODO set at init */
+
+LIBAR2_TARGET__("avx2")
+static void
+blockxor3_avx2(struct block *a_, const struct block *b_, const struct block *c_)
+{
+ __m256i *a = (__m256i *)a_;
+ const __m256i *b = (const __m256i *)b_;
+ const __m256i *c = (const __m256i *)c_;
+ size_t i;
+ for (i = 0; i < sizeof(*a_) / (256 / 8); i++)
+ a[i] = _mm256_xor_si256(b[i], c[i]);
+
+}
+
+LIBAR2_TARGET__("avx512f")
+static void
+blockxor3_avx512f(struct block *a_, const struct block *b_, const struct block *c_)
+{
+ __m512i *a = (__m512i *)a_;
+ const __m512i *b = (const __m512i *)b_;
+ const __m512i *c = (const __m512i *)c_;
+ size_t i;
+ for (i = 0; i < sizeof(*a_) / (512 / 8); i++)
+ a[i] = _mm512_xor_si512(b[i], c[i]);
+}
+
static void
-memxor(void *a_, const void *b_, size_t n) /* TODO using _mm_xor_si128 may improve performance */
+blockxor3_vanilla(struct block *a, const struct block *b, const struct block *c)
{
- unsigned char *a = a_;
- const unsigned char *b = b_;
size_t i;
- for (i = 0; i < n; i++)
- a[i] ^= b[i];
+ for (i = 0; i < ELEMSOF(a->w); i++)
+ a->w[i] = b->w[i] ^ c->w[i];
}
+#define DEFINED_BLOCKXOR3
+static void (*blockxor3)(struct block *a, const struct block *b, const struct block *c) = &blockxor3_vanilla; /* TODO set at init */
+
+LIBAR2_TARGET__("avx2")
+static void
+blockcpy_avx2(struct block *a_, const struct block *b_)
+{
+ __m256i *a = (__m256i *)a_;
+ const __m256i *b = (const __m256i *)b_;
+ size_t i;
+ for (i = 0; i < sizeof(*a_) / (256 / 8); i++)
+ a[i] = _mm256_load_si256(&b[i]);
+}
+
+LIBAR2_TARGET__("avx512f")
+static void
+blockcpy_avx512f(struct block *a_, const struct block *b_)
+{
+ __m512i *a = (__m512i *)a_;
+ const __m512i *b = (const __m512i *)b_;
+ size_t i;
+ for (i = 0; i < sizeof(*a_) / (512 / 8); i++)
+ a[i] = _mm512_load_si512(&b[i]);
+}
+
+static void
+blockcpy_vanilla(struct block *a, const struct block *b)
+{
+ size_t i;
+ for (i = 0; i < ELEMSOF(a->w); i++)
+ a->w[i] = b->w[i];
+}
+
+#define DEFINED_BLOCKCPY
+static void (*blockcpy)(struct block *a, const struct block *b) = &blockcpy_vanilla; /* TODO set at init */
+
+#else
+
+static void
+blockxor(struct block *a, const struct block *b)
+{
+ size_t i;
+ for (i = 0; i < ELEMSOF(a->w); i++)
+ a->w[i] ^= b->w[i];
+}
+
+#endif
+
static size_t
store32(unsigned char *out, uint_least32_t value)
@@ -132,9 +256,18 @@ fill_block(struct block *block, const struct block *prevblock, const struct bloc
{
uint_least64_t x = 0;
uint_least32_t x_hi, x_lo;
- struct block tmpblock;
size_t i;
+ SIMD_ALIGNED struct block tmpblock;
+#if defined(DEFINED_BLOCKXOR3) && defined(DEFINED_BLOCKCPY)
+ if (with_xor) {
+ blockxor3(&tmpblock, refblock, prevblock);
+ blockxor(block, &tmpblock);
+ } else {
+ blockxor3(&tmpblock, refblock, prevblock);
+ blockcpy(block, &tmpblock);
+ }
+#else
if (with_xor) {
for (i = 0; i < ELEMSOF(refblock->w); i++)
block->w[i] ^= tmpblock.w[i] = refblock->w[i] ^ prevblock->w[i];
@@ -142,6 +275,7 @@ fill_block(struct block *block, const struct block *prevblock, const struct bloc
for (i = 0; i < ELEMSOF(refblock->w); i++)
block->w[i] = tmpblock.w[i] = refblock->w[i] ^ prevblock->w[i];
}
+#endif
if (sbox) {
x = tmpblock.w[0] ^ tmpblock.w[ELEMSOF(tmpblock.w) - 1];
@@ -195,8 +329,7 @@ fill_block(struct block *block, const struct block *prevblock, const struct bloc
96, 97, 112, 113);
}
- for (i = 0; i < ELEMSOF(refblock->w); i++)
- block->w[i] ^= tmpblock.w[i];
+ blockxor(block, &tmpblock);
block->w[0] += x;
block->w[ELEMSOF(block->w) - 1] += x;
@@ -499,6 +632,7 @@ libar2_hash(void *hash, void *msg, size_t msglen, struct libar2_argon2_parameter
size_t i, p, s, nthreads, ts[16], ti, tn, bufsize;
struct threaded_fill_segments_params *tparams = NULL;
uint_least64_t *sbox = NULL; /* This is 8K large (assuming support for uint64_t), so we allocate it dynamically */
+ size_t alignment;
if (libar2_validate_params(params, NULL) || msglen >> 31 > 1) {
errno = EINVAL;
@@ -510,15 +644,16 @@ libar2_hash(void *hash, void *msg, size_t msglen, struct libar2_argon2_parameter
blocks -= blocks % (4 * params->lanes);
lanelen = seglen * 4;
+ alignment = MAX(MAX(ALIGNOF(struct block), CACHE_LINE_SIZE), MAX_SIMD_ALIGNMENT);
#ifdef USING_LITTLE_ENDIAN
/* We are allocating one extra block, this gives use 1024 extra bytes,
* but we only need 128, to ensure that `argon2_blake2b_exthash` does
* not write on unallocated memory. Preferable we would just request
* 128 bytes bytes, but this would require an undesirable API/ABI
* change. */
- memory = ctx->allocate(blocks + 1, sizeof(struct block), MAX(MAX(ALIGNOF(struct block), CACHE_LINE_SIZE), 16), ctx);
+ memory = ctx->allocate(blocks + 1, sizeof(struct block), alignment, ctx);
#else
- memory = ctx->allocate(blocks, sizeof(struct block), MAX(MAX(ALIGNOF(struct block), CACHE_LINE_SIZE), 16), ctx);
+ memory = ctx->allocate(blocks, sizeof(struct block), alignment, ctx);
#endif
if (!memory)
return -1;
@@ -619,7 +754,7 @@ libar2_hash(void *hash, void *msg, size_t msglen, struct libar2_argon2_parameter
}
for (i = 1; i < params->lanes; i++)
- memxor(&memory[lanelen - 1], &memory[i * lanelen + lanelen - 1], sizeof(*memory));
+ blockxor(&memory[lanelen - 1], &memory[i * lanelen + lanelen - 1]);
#ifdef USING_LITTLE_ENDIAN
argon2_blake2b_exthash(hash, params->hashlen, &memory[lanelen - 1], 1024);
#else
diff --git a/test.c b/test.c
index c6f5db5..af9538b 100644
--- a/test.c
+++ b/test.c
@@ -1243,7 +1243,7 @@ main(void)
double ddur;
int r;
memset(&params, 0, sizeof(params));
- params.m_cost = (uint_least32_t)1 << 18;
+ params.m_cost = 8;
params.t_cost = 1;
params.lanes = 1;
params.saltlen = 8;