diff options
| author | Mattias Andrée <maandree@kth.se> | 2016-05-05 02:41:50 +0200 |
|---|---|---|
| committer | Mattias Andrée <maandree@kth.se> | 2016-05-05 02:41:50 +0200 |
| commit | 93bf9e5b4bf63708c732f5bf07619d2e59c81ec4 (patch) | |
| tree | 00208f7a9663052dd9fc9daa1a315929d4ef98f9 /src | |
| parent | Optimise zadd (diff) | |
| download | libzahl-93bf9e5b4bf63708c732f5bf07619d2e59c81ec4.tar.gz libzahl-93bf9e5b4bf63708c732f5bf07619d2e59c81ec4.tar.bz2 libzahl-93bf9e5b4bf63708c732f5bf07619d2e59c81ec4.tar.xz | |
Optimise zadd on x86-64
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to 'src')
| -rw-r--r-- | src/zadd.c | 100 |
1 files changed, 96 insertions, 4 deletions
@@ -2,20 +2,79 @@ #include "internals.h" +#if defined(__x86_64__) +# define ASM3(code) \ + __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i)) + +# define ASM2(code) \ + __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i)) + +# define ADD2(off) \ + "\n movq "#off"(%%rbx), %%rdx" \ + "\n adcq %%rdx, "#off"(%%rax)" + +# define ADD3(off) \ + "\n movq "#off"(%%rbx), %%rdx" \ + "\n adcq "#off"(%%rcx), %%rdx" \ + "\n movq %%rdx, "#off"(%%rax)" + +# define WRAP_CARRY(interior) \ + "\n clc" \ + "\n cmpq $0, %%rdx" \ + "\n je 1f" \ + "\n stc" \ + "\n 1:" \ + interior \ + "\n movq $1, %%rdx" \ + "\n jc 1f" \ + "\n movq $0, %%rdx" \ + "\n 1:" +#endif + + static inline void zadd_impl_4(z_t a, z_t b, z_t c, size_t n) { - zahl_char_t carry = 0, tcarry; + zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars; size_t i; +#if defined(__x86_64__) + for (i = 0; (i += 4) <= n;) + ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8))); + if (i > n) { + i -= 4; + switch (n & 3) { + case 3: + ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16))); + break; + case 2: + ASM3(WRAP_CARRY(ADD3(0) ADD3(8))); + break; + case 1: + ASM3(WRAP_CARRY(ADD3(0))); + break; + default: + break; + } + } + i = n; + + while (carry) { + carry = libzahl_add_overflow(ac + i, ac[i], 1); + i++; + } +#else + zahl_char_t tcarry; + for (i = 0; i < n; i++) { - tcarry = libzahl_add_overflow(a->chars + i, b->chars[i], c->chars[i]); - carry = tcarry | (zahl_char_t)libzahl_add_overflow(a->chars + i, a->chars[i], carry); + tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]); + carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry); } while (carry) { - carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1); + carry = libzahl_add_overflow(ac + i, ac[i], 1); i++; } +#endif if (a->used < i) a->used = i; @@ -24,7 +83,40 @@ zadd_impl_4(z_t a, z_t b, z_t c, size_t n) static inline void zadd_impl_3(z_t a, z_t b, size_t n) { +#if defined(__x86_64__) + zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars; + size_t i; + + for (i = 0; (i += 4) <= n;) + ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8))); + if (i > n) { + i -= 4; + switch (n & 3) { + case 3: + ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16))); + break; + case 2: + ASM2(WRAP_CARRY(ADD2(0) ADD2(8))); + break; + case 1: + ASM2(WRAP_CARRY(ADD2(0))); + break; + default: + break; + } + } + i = n; + + while (carry) { + carry = libzahl_add_overflow(ac + i, ac[i], 1); + i++; + } + + if (a->used < i) + a->used = i; +#else zadd_impl_4(a, a, b, n); +#endif } static inline void |
