about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Mattias Andrée <maandree@kth.se> 2016-05-03 14:03:33 +0200
committer: Mattias Andrée <maandree@kth.se> 2016-05-03 14:03:33 +0200
commit: 40b860777616071997ec035783eeea402ffb1ae2 (patch)
tree: 5647de0c6647c0c7ee6c7b4e25a6c9a75089b077
parent: Optimise zswap (diff)
download: libzahl-40b860777616071997ec035783eeea402ffb1ae2.tar.gz
libzahl-40b860777616071997ec035783eeea402ffb1ae2.tar.bz2
libzahl-40b860777616071997ec035783eeea402ffb1ae2.tar.xz
Optimise libzahl_memcpy and libzahl_memset
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r-- STATUS 2
-rw-r--r-- TODO 3
-rw-r--r-- zahl-internals.h 56
3 files changed, 53 insertions, 8 deletions
diff --git a/STATUS b/STATUS
index 36d9717..8cae48a 100644
--- a/STATUS
+++ b/STATUS
@@ -6,7 +6,7 @@ left column. Double-parenthesis means there may be a better way
to do it. Inside square-brackets, there are some comments on
multi-bit comparisons.
-zset .................... fastest [until ~750, then gmp, also tomsfastmath after ~2750]
+zset .................... fastest [always with gcc, unless ~250 with clang]
zseti ................... tomsfastmath is faster [always]
zsetu ................... tomsfastmath is faster [always]
zneg(a, b) .............. fastest [until ~300, then gmp]
diff --git a/TODO b/TODO
index 56d8dbe..0327bca 100644
--- a/TODO
+++ b/TODO
@@ -5,9 +5,10 @@ Add zsets_radix
Add zstr_radix
Test big endian
-Test always having used > 0 for zero
+Test always having .used > 0 for zero
Test negative/non-negative instead of sign
Test long .sign
+Test always having .chars % 4 == 0
Test optimisation of zmul:
bc = [(Hb * Hc) << (m2 << 1)]
diff --git a/zahl-internals.h b/zahl-internals.h
index e9232dd..fc6768a 100644
--- a/zahl-internals.h
+++ b/zahl-internals.h
@@ -109,18 +109,62 @@ struct zahl {
void libzahl_realloc(struct zahl *, size_t);
-ZAHL_O2 ZAHL_INLINE void
+ZAHL_INLINE void
libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n)
{
size_t i;
- for (i = 0; i < n; i++)
- d[i] = s[i];
+ if (n <= 4) {
+ if (n >= 1)
+ d[0] = s[0];
+ if (n >= 2)
+ d[1] = s[1];
+ if (n >= 3)
+ d[2] = s[2];
+ if (n >= 4)
+ d[3] = s[3];
+ } else {
+ for (i = 0; (i += 4) <= n;) {
+ d[i - 1] = s[i - 1];
+ d[i - 2] = s[i - 2];
+ d[i - 3] = s[i - 3];
+ d[i - 4] = s[i - 4];
+ }
+ if (i > n) {
+ i -= 4;
+ if (i < n)
+ d[i] = s[i], i++;
+ if (i < n)
+ d[i] = s[i], i++;
+ if (i < n)
+ d[i] = s[i], i++;
+ if (i < n)
+ d[i] = s[i], i++;
+ }
+ }
}
-ZAHL_O2 ZAHL_INLINE void
+ZAHL_INLINE void
libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n)
{
size_t i;
- for (i = 0; i < n; i++)
- a[i] = v;
+ if (n <= 4) {
+ if (n >= 1)
+ a[0] = v;
+ if (n >= 2)
+ a[1] = v;
+ if (n >= 3)
+ a[2] = v;
+ if (n >= 4)
+ a[3] = v;
+ } else {
+ for (i = 0; (i += 4) <= n;) {
+ a[i - 1] = v;
+ a[i - 2] = v;
+ a[i - 3] = v;
+ a[i - 4] = v;
+ }
+ if (i > n)
+ for (i -= 4; i < n; i++)
+ a[i] = v;
+ }
}