From 93bf9e5b4bf63708c732f5bf07619d2e59c81ec4 Mon Sep 17 00:00:00 2001
From: Mattias Andrée
Date: Thu, 5 May 2016 02:41:50 +0200
Subject: Optimise zadd on x86-64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Mattias Andrée
---
 src/zadd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 122 insertions(+), 4 deletions(-)

diff --git a/src/zadd.c b/src/zadd.c
index a78a918..8efdf19 100644
--- a/src/zadd.c
+++ b/src/zadd.c
@@ -2,20 +2,99 @@
 #include "internals.h"
 
+#if defined(__x86_64__)
+/*
+ * Inline-assembly building blocks for the x86-64 fast path. ASM3 and
+ * ASM2 expect the locals `carry`, `i`, `ac` and `bc` (plus `cc` for ASM3)
+ * to be in scope. The code reads and writes memory through the pointer
+ * operands, so the "memory" clobber is required to keep the compiler from
+ * caching or reordering word accesses across the statement ("cc": flags).
+ */
+# define ASM3(code) \
+	__asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i) : "memory", "cc")
+
+# define ASM2(code) \
+	__asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i) : "memory", "cc")
+
+/* Add the word at off(%rbx), plus CF, into the word at off(%rax). */
+# define ADD2(off) \
+	"\n movq "#off"(%%rbx), %%rdx" \
+	"\n adcq %%rdx, "#off"(%%rax)"
+
+/* Store the sum of the words at off(%rbx) and off(%rcx), plus CF, at off(%rax). */
+# define ADD3(off) \
+	"\n movq "#off"(%%rbx), %%rdx" \
+	"\n adcq "#off"(%%rcx), %%rdx" \
+	"\n movq %%rdx, "#off"(%%rax)"
+
+/*
+ * Load CF from %rdx (non-zero means carry in), run `interior`, and
+ * store the resulting CF back into %rdx as 0 or 1.
+ */
+# define WRAP_CARRY(interior) \
+	"\n clc" \
+	"\n cmpq $0, %%rdx" \
+	"\n je 1f" \
+	"\n stc" \
+	"\n 1:" \
+	interior \
+	"\n movq $1, %%rdx" \
+	"\n jc 1f" \
+	"\n movq $0, %%rdx" \
+	"\n 1:"
+#endif
+
+
+/*
+ * a = b + c over the first `n` words: sum them with carry, then, while a
+ * carry remains, add 1 to the next word of `a` (presumably the caller has
+ * reserved room for it). Finally raise `a->used` if more words were touched.
+ */
 static inline void
 zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
 {
-	zahl_char_t carry = 0, tcarry;
+	zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars;
 	size_t i;
 
+#if defined(__x86_64__)
+	/* Four words per statement; i is advanced first, hence the negative offsets. */
+	for (i = 0; (i += 4) <= n;)
+		ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8)));
+	if (i > n) {
+		/* Rewind to the start of the tail and handle the remaining n % 4 words. */
+		i -= 4;
+		switch (n & 3) {
+		case 3:
+			ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16)));
+			break;
+		case 2:
+			ASM3(WRAP_CARRY(ADD3(0) ADD3(8)));
+			break;
+		case 1:
+			ASM3(WRAP_CARRY(ADD3(0)));
+			break;
+		default:
+			break;
+		}
+	}
+	i = n;
+
+	while (carry) {
+		carry = libzahl_add_overflow(ac + i, ac[i], 1);
+		i++;
+	}
+#else
+	zahl_char_t tcarry;
+
 	for (i = 0; i < n; i++) {
-		tcarry = libzahl_add_overflow(a->chars + i, b->chars[i], c->chars[i]);
-		carry = tcarry | (zahl_char_t)libzahl_add_overflow(a->chars + i, a->chars[i], carry);
+		tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]);
+		carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry);
 	}
 
 	while (carry) {
-		carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1);
+		carry = libzahl_add_overflow(ac + i, ac[i], 1);
 		i++;
 	}
+#endif
 
 	if (a->used < i)
 		a->used = i;
@@ -24,7 +103,46 @@ zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
+/*
+ * a += b over the first `n` words, with the same carry propagation and
+ * `a->used` update as zadd_impl_4; ADD2 adds into `a` in place.
+ */
 static inline void
 zadd_impl_3(z_t a, z_t b, size_t n)
 {
+#if defined(__x86_64__)
+	zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars;
+	size_t i;
+
+	/* Four words per statement; i is advanced first, hence the negative offsets. */
+	for (i = 0; (i += 4) <= n;)
+		ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8)));
+	if (i > n) {
+		/* Rewind to the start of the tail and handle the remaining n % 4 words. */
+		i -= 4;
+		switch (n & 3) {
+		case 3:
+			ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16)));
+			break;
+		case 2:
+			ASM2(WRAP_CARRY(ADD2(0) ADD2(8)));
+			break;
+		case 1:
+			ASM2(WRAP_CARRY(ADD2(0)));
+			break;
+		default:
+			break;
+		}
+	}
+	i = n;
+
+	while (carry) {
+		carry = libzahl_add_overflow(ac + i, ac[i], 1);
+		i++;
+	}
+
+	if (a->used < i)
+		a->used = i;
+#else
 	zadd_impl_4(a, a, b, n);
+#endif
 }
 
 static inline void
-- 
cgit v1.2.3-70-g09d2