diff options
| author | Mattias Andrée <maandree@kth.se> | 2016-05-05 21:11:43 +0200 |
|---|---|---|
| committer | Mattias Andrée <maandree@kth.se> | 2016-05-05 21:13:16 +0200 |
| commit | 9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb (patch) | |
| tree | 84890cd284c538938614c90989ebc4a60991fcca /src | |
| parent | Optimise and use __ around all compiler extensions (diff) | |
| download | libzahl-9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb.tar.gz libzahl-9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb.tar.bz2 libzahl-9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb.tar.xz | |
Optimisations
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to '')
| -rw-r--r-- | src/zadd.c | 132 |
1 file changed, 55 insertions, 77 deletions
```diff
@@ -4,116 +4,94 @@
 #if defined(__x86_64__)
 # define ASM3(code) \
-	__asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i))
+	__asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), [c]"+r"(cc))
 # define ASM2(code) \
-	__asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i))
+	__asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc))
-# define ADD2(off) \
-	"\n movq "#off"(%%rbx), %%rdx" \
-	"\n adcq %%rdx, "#off"(%%rax)"
+# define ADD2(off) \
+	"\n movq "#off"(%[b]), %[x]" \
+	"\n adcq %[x], "#off"(%[a])"
-# define ADD3(off) \
-	"\n movq "#off"(%%rbx), %%rdx" \
-	"\n adcq "#off"(%%rcx), %%rdx" \
-	"\n movq %%rdx, "#off"(%%rax)"
+# define ADD3(off) \
+	"\n movq "#off"(%[b]), %[x]" \
+	"\n adcq "#off"(%[c]), %[x]" \
+	"\n movq %[x], "#off"(%[a])"
 # define WRAP_CARRY(interior) \
-	"\n clc" \
-	"\n cmpq $0, %%rdx" \
-	"\n je 1f" \
-	"\n stc" \
-	"\n 1:" \
+	"\n addq $-1, %[x]" \
 	interior \
-	"\n movq $1, %%rdx" \
+	"\n movq $1, %[x]" \
 	"\n jc 1f" \
-	"\n movq $0, %%rdx" \
+	"\n movq $0, %[x]" \
 	"\n 1:"
+
+# define ASM_ADD(N) \
+	do { \
+		register zahl_char_t carry = 0; \
+		size_t i; \
+		for (i = 0; (INC(4)), (i += 4) <= n;) \
+			ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16) ADD##N(-8))); \
+		switch (n & 3) { \
+		case 3: \
+			ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16))); \
+			break; \
+		case 2: \
+			ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24))); \
+			break; \
+		case 1: \
+			ASM##N(WRAP_CARRY(ADD##N(-32))); \
+			break; \
+		default: \
+			break; \
+		} \
+		i = n; \
+		while (carry) { \
+			carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1); \
+			i++; \
+		} \
+		if (a->used < i) \
+			a->used = i; \
+	} while (0)
 #endif
 static inline void
 zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
 {
-	zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars;
-	size_t i;
-
-#if defined(__x86_64__)
-	for (i = 0; (i += 4) <= n;)
-		ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8)));
-	if (i > n) {
-		i -= 4;
-		switch (n & 3) {
-		case 3:
-			ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16)));
-			break;
-		case 2:
-			ASM3(WRAP_CARRY(ADD3(0) ADD3(8)));
-			break;
-		case 1:
-			ASM3(WRAP_CARRY(ADD3(0)));
-			break;
-		default:
-			break;
-		}
-	}
-	i = n;
-
-	while (carry) {
-		carry = libzahl_add_overflow(ac + i, ac[i], 1);
-		i++;
-	}
+#ifdef ASM_ADD
+	register zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+# define INC(P) (ac += (P), bc += (P), cc += (P))
+	ASM_ADD(3);
+# undef INC
 #else
-	zahl_char_t tcarry;
+	zahl_char_t carry = 0, tcarry;
+	zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+	size_t i;
 	for (i = 0; i < n; i++) {
 		tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]);
 		carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry);
 	}
+
 	while (carry) {
 		carry = libzahl_add_overflow(ac + i, ac[i], 1);
 		i++;
 	}
-#endif
 	if (a->used < i)
 		a->used = i;
+#endif
 }
 static inline void
 zadd_impl_3(z_t a, z_t b, size_t n)
 {
-#if defined(__x86_64__)
-	zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars;
-	size_t i;
-
-	for (i = 0; (i += 4) <= n;)
-		ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8)));
-	if (i > n) {
-		i -= 4;
-		switch (n & 3) {
-		case 3:
-			ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16)));
-			break;
-		case 2:
-			ASM2(WRAP_CARRY(ADD2(0) ADD2(8)));
-			break;
-		case 1:
-			ASM2(WRAP_CARRY(ADD2(0)));
-			break;
-		default:
-			break;
-		}
-	}
-	i = n;
-
-	while (carry) {
-		carry = libzahl_add_overflow(ac + i, ac[i], 1);
-		i++;
-	}
-
-	if (a->used < i)
-		a->used = i;
+#ifdef ASM_ADD
+	register zahl_char_t *ac = a->chars, *bc = b->chars;
+# define INC(P) (ac += (P), bc += (P))
+	ASM_ADD(2);
+# undef INC
 #else
 	zadd_impl_4(a, a, b, n);
 #endif
```
