diff options
| author | Mattias Andrée <maandree@kth.se> | 2016-05-03 14:03:33 +0200 |
|---|---|---|
| committer | Mattias Andrée <maandree@kth.se> | 2016-05-03 14:03:33 +0200 |
| commit | 40b860777616071997ec035783eeea402ffb1ae2 (patch) | |
| tree | 5647de0c6647c0c7ee6c7b4e25a6c9a75089b077 | |
| parent | Optimise zswap (diff) | |
| download | libzahl-40b860777616071997ec035783eeea402ffb1ae2.tar.gz libzahl-40b860777616071997ec035783eeea402ffb1ae2.tar.bz2 libzahl-40b860777616071997ec035783eeea402ffb1ae2.tar.xz | |
Optimise libzahl_memcpy and libzahl_memset
Signed-off-by: Mattias Andrée <maandree@kth.se>
| -rw-r--r-- | STATUS | 2 |
| -rw-r--r-- | TODO | 3 |
| -rw-r--r-- | zahl-internals.h | 56 |
3 files changed, 53 insertions, 8 deletions
```diff
@@ -6,7 +6,7 @@ left column. Double-parenthesis means there may be a better way to do it.
 Inside square-brackets, there are some comments on multi-bit comparisons.
-zset .................... fastest [until ~750, then gmp, also tomsfastmath after ~2750]
+zset .................... fastest [always with gcc, unless ~250 with clang]
 zseti ................... tomsfastmath is faster [always]
 zsetu ................... tomsfastmath is faster [always]
 zneg(a, b) .............. fastest [until ~300, then gmp]
@@ -5,9 +5,10 @@ Add zsets_radix
 Add zstr_radix
 Test big endian
-Test always having used > 0 for zero
+Test always having .used > 0 for zero
 Test negative/non-negative instead of sign
 Test long .sign
+Test always having .chars % 4 == 0
 Test optimisation of zmul:
 bc = [(Hb * Hc) << (m2 << 1)]
diff --git a/zahl-internals.h b/zahl-internals.h
index e9232dd..fc6768a 100644
--- a/zahl-internals.h
+++ b/zahl-internals.h
@@ -109,18 +109,62 @@ struct zahl {

 void libzahl_realloc(struct zahl *, size_t);

-ZAHL_O2 ZAHL_INLINE void
+ZAHL_INLINE void
 libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n)
 {
 	size_t i;
-	for (i = 0; i < n; i++)
-		d[i] = s[i];
+	if (n <= 4) {
+		if (n >= 1)
+			d[0] = s[0];
+		if (n >= 2)
+			d[1] = s[1];
+		if (n >= 3)
+			d[2] = s[2];
+		if (n >= 4)
+			d[3] = s[3];
+	} else {
+		for (i = 0; (i += 4) <= n;) {
+			d[i - 1] = s[i - 1];
+			d[i - 2] = s[i - 2];
+			d[i - 3] = s[i - 3];
+			d[i - 4] = s[i - 4];
+		}
+		if (i > n) {
+			i -= 4;
+			if (i < n)
+				d[i] = s[i], i++;
+			if (i < n)
+				d[i] = s[i], i++;
+			if (i < n)
+				d[i] = s[i], i++;
+			if (i < n)
+				d[i] = s[i], i++;
+		}
+	}
 }

-ZAHL_O2 ZAHL_INLINE void
+ZAHL_INLINE void
 libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n)
 {
 	size_t i;
-	for (i = 0; i < n; i++)
-		a[i] = v;
+	if (n <= 4) {
+		if (n >= 1)
+			a[0] = v;
+		if (n >= 2)
+			a[1] = v;
+		if (n >= 3)
+			a[2] = v;
+		if (n >= 4)
+			a[3] = v;
+	} else {
+		for (i = 0; (i += 4) <= n;) {
+			a[i - 1] = v;
+			a[i - 2] = v;
+			a[i - 3] = v;
+			a[i - 4] = v;
+		}
+		if (i > n)
+			for (i -= 4; i < n; i++)
+				a[i] = v;
+	}
 }
```
