From 40b860777616071997ec035783eeea402ffb1ae2 Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Tue, 3 May 2016 14:03:33 +0200 Subject: Optimise libzahl_memcpy and libzahl_memset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mattias Andrée --- STATUS | 2 +- TODO | 3 ++- zahl-internals.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/STATUS b/STATUS index 36d9717..8cae48a 100644 --- a/STATUS +++ b/STATUS @@ -6,7 +6,7 @@ left column. Double-parenthesis means there may be a better way to do it. Inside square-brackets, there are some comments on multi-bit comparisons. -zset .................... fastest [until ~750, then gmp, also tomsfastmath after ~2750] +zset .................... fastest [always with gcc, unless ~250 with clang] zseti ................... tomsfastmath is faster [always] zsetu ................... tomsfastmath is faster [always] zneg(a, b) .............. 
fastest [until ~300, then gmp] diff --git a/TODO b/TODO index 56d8dbe..0327bca 100644 --- a/TODO +++ b/TODO @@ -5,9 +5,10 @@ Add zsets_radix Add zstr_radix Test big endian -Test always having used > 0 for zero +Test always having .used > 0 for zero Test negative/non-negative instead of sign Test long .sign +Test always having .chars % 4 == 0 Test optimisation of zmul: bc = [(Hb * Hc) << (m2 << 1)] diff --git a/zahl-internals.h b/zahl-internals.h index e9232dd..fc6768a 100644 --- a/zahl-internals.h +++ b/zahl-internals.h @@ -109,18 +109,62 @@ struct zahl { void libzahl_realloc(struct zahl *, size_t); -ZAHL_O2 ZAHL_INLINE void +ZAHL_INLINE void /* Copies n elements from s to d (d[k] = s[k] for k < n); both pointers are restrict-qualified, so the ranges must not overlap. n == 0 copies nothing. */ libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n) { size_t i; - for (i = 0; i < n; i++) - d[i] = s[i]; + if (n <= 4) { /* Short input: copy up to four elements with straight-line checks, no loop. */ + if (n >= 1) + d[0] = s[0]; + if (n >= 2) + d[1] = s[1]; + if (n >= 3) + d[2] = s[2]; + if (n >= 4) + d[3] = s[3]; + } else { + for (i = 0; (i += 4) <= n;) { /* i is advanced by 4 inside the condition, so each pass copies elements i-4 .. i-1; the loop stops at the first i that exceeds n. */ + d[i - 1] = s[i - 1]; + d[i - 2] = s[i - 2]; + d[i - 3] = s[i - 3]; + d[i - 4] = s[i - 4]; + } + if (i > n) { /* Holds whenever the loop above has exited, since exit requires (i += 4) <= n to fail. */ + i -= 4; /* Step back to the first element the unrolled loop did not copy. */ + if (i < n) + d[i] = s[i], i++; + if (i < n) + d[i] = s[i], i++; + if (i < n) + d[i] = s[i], i++; + if (i < n) /* NOTE(review): at most 3 elements can remain after the loop, so this fourth check looks unreachable; confirm before removing. */ + d[i] = s[i], i++; + } + } } -ZAHL_O2 ZAHL_INLINE void +ZAHL_INLINE void /* Stores v into a[0] .. a[n - 1]; n == 0 stores nothing. */ libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n) { size_t i; - for (i = 0; i < n; i++) - a[i] = v; + if (n <= 4) { /* Short input: up to four stores with straight-line checks, no loop. */ + if (n >= 1) + a[0] = v; + if (n >= 2) + a[1] = v; + if (n >= 3) + a[2] = v; + if (n >= 4) + a[3] = v; + } else { + for (i = 0; (i += 4) <= n;) { /* i is advanced by 4 inside the condition, so each pass fills elements i-4 .. i-1; the loop stops at the first i that exceeds n. */ + a[i - 1] = v; + a[i - 2] = v; + a[i - 3] = v; + a[i - 4] = v; + } + if (i > n) /* Holds whenever the loop above has exited; the inner loop fills the remaining elements (fewer than four). */ + for (i -= 4; i < n; i++) + a[i] = v; + } } -- cgit v1.2.3-70-g09d2