diff options
| author | Mattias Andrée <m@maandree.se> | 2026-01-24 14:11:01 +0100 |
|---|---|---|
| committer | Mattias Andrée <m@maandree.se> | 2026-01-24 14:11:01 +0100 |
| commit | c8b7fdc7294329dc5eaf9f089f83184ece7d098c (patch) | |
| tree | 1059fd3ab69fb9219ef5a6be3f53cd8208015c40 /libcharconv_latin.c | |
| download | charconv-c8b7fdc7294329dc5eaf9f089f83184ece7d098c.tar.gz charconv-c8b7fdc7294329dc5eaf9f089f83184ece7d098c.tar.bz2 charconv-c8b7fdc7294329dc5eaf9f089f83184ece7d098c.tar.xz | |
First commit
Signed-off-by: Mattias Andrée <m@maandree.se>
Diffstat (limited to 'libcharconv_latin.c')
| -rw-r--r-- | libcharconv_latin.c | 410 |
1 files changed, 410 insertions, 0 deletions
diff --git a/libcharconv_latin.c b/libcharconv_latin.c new file mode 100644 index 0000000..d682b17 --- /dev/null +++ b/libcharconv_latin.c @@ -0,0 +1,410 @@ +/* See LICENSE file for copyright and license details. */ +#include "libcharconv.h" + + +static size_t +decode_utf8(const char *s, size_t slen, uint_least32_t *cp) +{ + uint_least32_t min, max; + size_t i, n; + + if (slen < 1u) + return 0u; + + if (!(s[0] & 0x80)) { + *cp = (uint_least32_t)s[0]; + return 1u; + } else if ((s[0] & 0xE0) == 0xC0) { + *cp = (uint_least32_t)s[0] & 0x3Fu; + n = 2u; + min = UINT32_C(0x80); + max = UINT32_C(0x800); + } else if ((s[0] & 0xF0) == 0xE0) { + *cp = (uint_least32_t)s[0] & 0x1Fu; + n = 3u; + min = UINT32_C(0x800); + max = UINT32_C(0x10000); + } else if ((s[0] & 0xF8) == 0xF0) { + *cp = (uint_least32_t)s[0] & 0x0Fu; + n = 4u; + min = UINT32_C(0x10000); + max = UINT32_C(0x110000); + } else { + return 0u; + } + + if (slen < n) + return n; + + for (i = 1u; i < n; i++) { + *cp <<= 6; + *cp |= (uint_least32_t)s[i] & 0x3Fu; + } + + if (min > *cp || *cp >= max) + return 0u; + + return n; +} + + +enum libcharconv_result +libcharconv_latin(const char *s, size_t slen, size_t *n, uint_least32_t *cp, size_t *ncp) +{ + enum libcharconv_result ret = LIBCHARCONV_CONVERTED; + uint_least32_t c; + char c1, c2, c3; + size_t clen; + + *n = 0; + for (; slen; s++) { + clen = decode_utf8(s, slen, &c); + if (clen > slen) + return LIBCHARCONV_INDETERMINATE; + if (!clen) { + *n += 1u; + slen -= 1u; + continue; + } + slen -= clen; + + if (UINT32_C(0x2680) <= c && c <= UINT32_C(0x2685)) { + /* dice */ + c -= (uint_least32_t)UINT32_C(0x2680) - (uint_least32_t)'1'; + goto conv; + + } else if (UINT32_C(0x1F1E6) <= c && c <= UINT32_C(0x1F1FF)) { + /* region indicators */ + c -= (uint_least32_t)UINT32_C(0x1F1E6) - (uint_least32_t)'A'; + goto conv; + + } else if (UINT32_C(0xE0020) <= c && c <= UINT32_C(0xE007E)) { + /* tags */ + c -= (uint_least32_t)UINT32_C(0xE0000); + goto conv; + + } else if (UINT32_C(0x10800) <= c && c <= UINT32_C(0x1083F)) { + /* cypriot */ + c -= UINT32_C(0x10800); + c1 = "_jklmnprstwxz"[c / 5]; + c2 = "aeiou"[c % 5]; + if (c1 == '_') { + c = (uint_least32_t)c2; + goto conv; + } + if (c1 == 'j' && c2 != 'a' && c2 != 'o') + goto no_match; + if (c1 == 'w' && c2 == 'u') + goto no_match; + if (c1 == 'x' && c2 != 'a' && c2 != 'e') + goto no_match; + if (c1 == 'z' && c2 != 'a' && c2 != 'o') + goto no_match; + goto conv2; + + } else if (UINT32_C(0x1D400) <= c && c <= UINT32_C(0x1D419)) { + /* bold (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D400) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D41A) <= c && c <= UINT32_C(0x1D433)) { + /* bold (small) */ + c -= (uint_least32_t)UINT32_C(0x1D41A) - (uint_least32_t)'a'; + goto conv; + } else if (UINT32_C(0x1D7CE) <= c && c <= UINT32_C(0x1D7D7)) { + /* bold (digit) */ + c -= (uint_least32_t)UINT32_C(0x1D7CE) - (uint_least32_t)'0'; + goto conv; + + } else if (UINT32_C(0x1D434) <= c && c <= UINT32_C(0x1D44D)) { + /* italic (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D434) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D44E) <= c && c <= UINT32_C(0x1D467)) { + /* italic (small) */ + c -= (uint_least32_t)UINT32_C(0x1D44E) - (uint_least32_t)'a'; + goto conv; + } else if (c == UINT32_C(0x210E)) { + /* italic (small h) */ + c = (uint_least32_t)'h'; + goto conv; + + } else if (UINT32_C(0x1D468) <= c && c <= UINT32_C(0x1D481)) { + /* bold italic (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D468) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D482) <= c && c <= UINT32_C(0x1D49B)) { + /* bold italic (small) */ + c -= (uint_least32_t)UINT32_C(0x1D482) - (uint_least32_t)'a'; + goto conv; + + } else if (UINT32_C(0x1D670) <= c && c <= UINT32_C(0x1D689)) { + /* monospace (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D670) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D68A) <= c && c <= UINT32_C(0x1D6A3)) { + /* monospace (small) */ + c -= (uint_least32_t)UINT32_C(0x1D68A) - (uint_least32_t)'a'; + goto conv; + } else if (UINT32_C(0x1D7F6) <= c && c <= UINT32_C(0x1D7FF)) { + /* monospace (digit) */ + c -= (uint_least32_t)UINT32_C(0x1D7F6) - (uint_least32_t)'0'; + goto conv; + + } else if (UINT32_C(0x1FBF0) <= c && c <= UINT32_C(0x1FBF9)) { + /* segmented */ + c -= (uint_least32_t)UINT32_C(0x1FBF0) - (uint_least32_t)'0'; + goto conv; + + } else if (UINT32_C(0x1D5A0) <= c && c <= UINT32_C(0x1D5B9)) { + /* sans-serif (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D5A0) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D5BA) <= c && c <= UINT32_C(0x1D5D3)) { + /* sans-serif (small) */ + c -= (uint_least32_t)UINT32_C(0x1D5BA) - (uint_least32_t)'a'; + goto conv; + } else if (UINT32_C(0x1D7E2) <= c && c <= UINT32_C(0x1D7EB)) { + /* sans-serif (digit) */ + c -= (uint_least32_t)UINT32_C(0x1D7E2) - (uint_least32_t)'0'; + goto conv; + + } else if (UINT32_C(0x1D5D4) <= c && c <= UINT32_C(0x1D5ED)) { + /* sans-serif bold (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D5D4) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D5EE) <= c && c <= UINT32_C(0x1D607)) { + /* sans-serif bold (small) */ + c -= (uint_least32_t)UINT32_C(0x1D5EE) - (uint_least32_t)'a'; + goto conv; + } else if (UINT32_C(0x1D7EC) <= c && c <= UINT32_C(0x1D7F5)) { + /* sans-serif bold (digit) */ + c -= (uint_least32_t)UINT32_C(0x1D7EC) - (uint_least32_t)'0'; + goto conv; + + } else if (UINT32_C(0x1D608) <= c && c <= UINT32_C(0x1D621)) { + /* sans-serif italic (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D608) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D622) <= c && c <= UINT32_C(0x1D63B)) { + /* sans-serif italic (small) */ + c -= (uint_least32_t)UINT32_C(0x1D622) - (uint_least32_t)'a'; + goto conv; + + } else if (UINT32_C(0x1D63C) <= c && c <= UINT32_C(0x1D655)) { + /* sans-serif bold italic (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D63C) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D656) <= c && c <= UINT32_C(0x1D66F)) { + /* sans-serif bold italic (small) */ + c -= (uint_least32_t)UINT32_C(0x1D656) - (uint_least32_t)'a'; + goto conv; + + } else if (UINT32_C(0x1D538) <= c && c <= UINT32_C(0x1D551)) { + /* double-struck (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D538) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D552) <= c && c <= UINT32_C(0x1D56B)) { + /* double-struck (small) */ + c -= (uint_least32_t)UINT32_C(0x1D552) - (uint_least32_t)'a'; + goto conv; + } else if (UINT32_C(0x1D7D8) <= c && c <= UINT32_C(0x1D7E1)) { + /* double-struck (digit) */ + c -= (uint_least32_t)UINT32_C(0x1D7D8) - (uint_least32_t)'0'; + goto conv; + + } else if (UINT32_C(0x1D504) <= c && c <= UINT32_C(0x1D51D)) { + /* fraktur (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D504) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D51E) <= c && c <= UINT32_C(0x1D537)) { + /* fraktur (small) */ + c -= (uint_least32_t)UINT32_C(0x1D51E) - (uint_least32_t)'a'; + goto conv; + + } else if (UINT32_C(0x1D56C) <= c && c <= UINT32_C(0x1D585)) { + /* bold fraktur (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D56C) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D586) <= c && c <= UINT32_C(0x1D59F)) { + /* bold fraktur (small) */ + c -= (uint_least32_t)UINT32_C(0x1D586) - (uint_least32_t)'a'; + goto conv; + + } else if (UINT32_C(0x1D49C) <= c && c <= UINT32_C(0x1D4B5)) { + /* script (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D49C) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D4B6) <= c && c <= UINT32_C(0x1D4CF)) { + /* script (small) */ + c -= (uint_least32_t)UINT32_C(0x1D4B6) - (uint_least32_t)'a'; + goto conv; + + } else if (UINT32_C(0x1D4D0) <= c && c <= UINT32_C(0x1D4E9)) { + /* bold script (captial) */ + c -= (uint_least32_t)UINT32_C(0x1D4D0) - (uint_least32_t)'A'; + goto conv; + } else if (UINT32_C(0x1D4EA) <= c && c <= UINT32_C(0x1D503)) { + /* bold script (small) */ + c -= (uint_least32_t)UINT32_C(0x1D4EA) - (uint_least32_t)'a'; + goto conv; + + } else { + switch (c) { + /* shogi */ + case UINT32_C(0x2616): c = (uint_least32_t)'w'; goto conv; + case UINT32_C(0x2617): c = (uint_least32_t)'b'; goto conv; + case UINT32_C(0x26C9): c = (uint_least32_t)'W'; goto conv; + case UINT32_C(0x26CA): c = (uint_least32_t)'B'; goto conv; + + /* go (white) */ + case UINT32_C(0x25CB): c = (uint_least32_t)'0'; goto conv; + case UINT32_C(0x2686): c = (uint_least32_t)'1'; goto conv; + case UINT32_C(0x2687): c = (uint_least32_t)'2'; goto conv; + + /* go (black) */ + case UINT32_C(0x25CF): c = (uint_least32_t)'0'; goto conv; + case UINT32_C(0x2688): c = (uint_least32_t)'1'; goto conv; + case UINT32_C(0x2689): c = (uint_least32_t)'2'; goto conv; + + /* draughts */ + case UINT32_C(0x26C0): c = (uint_least32_t)'m'; goto conv; + case UINT32_C(0x26C1): c = (uint_least32_t)'k'; goto conv; + case UINT32_C(0x26C2): c = (uint_least32_t)'M'; goto conv; + case UINT32_C(0x26C3): c = (uint_least32_t)'K'; goto conv; + + /* gender symbols */ + case UINT32_C(0x2640): c = (uint_least32_t)'f'; goto conv; + case UINT32_C(0x2642): c = (uint_least32_t)'m'; goto conv; + case UINT32_C(0x263F): c = (uint_least32_t)'i'; goto conv; + + /* double-struck */ + case UINT32_C(0x2102): c = (uint_least32_t)'C'; goto conv; + case UINT32_C(0x210D): c = (uint_least32_t)'H'; goto conv; + case UINT32_C(0x2115): c = (uint_least32_t)'N'; goto conv; + case UINT32_C(0x2119): c = (uint_least32_t)'P'; goto conv; + case UINT32_C(0x211A): c = (uint_least32_t)'Q'; goto conv; + case UINT32_C(0x211D): c = (uint_least32_t)'R'; goto conv; + case UINT32_C(0x2124): c = (uint_least32_t)'Z'; goto conv; + + /* double-struck italic */ + case UINT32_C(0x2145): c = (uint_least32_t)'D'; goto conv; + case UINT32_C(0x2146): c = (uint_least32_t)'d'; goto conv; + case UINT32_C(0x2147): c = (uint_least32_t)'e'; goto conv; + case UINT32_C(0x2148): c = (uint_least32_t)'i'; goto conv; + case UINT32_C(0x2149): c = (uint_least32_t)'j'; goto conv; + + /* fraktur */ + case UINT32_C(0x212D): c = (uint_least32_t)'C'; goto conv; + case UINT32_C(0x210C): c = (uint_least32_t)'H'; goto conv; + case UINT32_C(0x2111): c = (uint_least32_t)'I'; goto conv; + case UINT32_C(0x211C): c = (uint_least32_t)'R'; goto conv; + case UINT32_C(0x2128): c = (uint_least32_t)'Z'; goto conv; + + /* script */ + case UINT32_C(0x212C): c = (uint_least32_t)'B'; goto conv; + case UINT32_C(0x2130): c = (uint_least32_t)'E'; goto conv; + case UINT32_C(0x2131): c = (uint_least32_t)'F'; goto conv; + case UINT32_C(0x210B): c = (uint_least32_t)'H'; goto conv; + case UINT32_C(0x2110): c = (uint_least32_t)'I'; goto conv; + case UINT32_C(0x2112): c = (uint_least32_t)'L'; goto conv; + case UINT32_C(0x2133): c = (uint_least32_t)'M'; goto conv; + case UINT32_C(0x211B): c = (uint_least32_t)'R'; goto conv; + case UINT32_C(0x212F): c = (uint_least32_t)'e'; goto conv; + case UINT32_C(0x210A): c = (uint_least32_t)'g'; goto conv; + case UINT32_C(0x2134): c = (uint_least32_t)'o'; goto conv; + + /* buhid */ + case UINT32_C(0x1740): c = (uint_least32_t)'a'; goto conv; + case UINT32_C(0x1741): c = (uint_least32_t)'i'; goto conv; + case UINT32_C(0x1742): c = (uint_least32_t)'u'; goto conv; + case UINT32_C(0x1752): c2 = 'i'; goto budih_combining; + case UINT32_C(0x1753): c2 = 'u'; goto budih_combining; + budih_combining: + c1 = '^'; + goto conv2; + case UINT32_C(0x174A): c1 = 'b'; goto budih; + case UINT32_C(0x1747): c1 = 'd'; goto budih; + case UINT32_C(0x1744): c1 = 'g'; goto budih; + case UINT32_C(0x1751): c1 = 'h'; goto budih; + case UINT32_C(0x1743): c1 = 'k'; goto budih; + case UINT32_C(0x174E): c1 = 'l'; goto budih; + case UINT32_C(0x174B): c1 = 'm'; goto budih; + case UINT32_C(0x1748): c1 = 'n'; goto budih; + case UINT32_C(0x1749): c1 = 'p'; goto budih; + case UINT32_C(0x174D): c1 = 'r'; goto budih; + case UINT32_C(0x1750): c1 = 's'; goto budih; + case UINT32_C(0x1746): c1 = 't'; goto budih; + case UINT32_C(0x174C): c1 = 'y'; goto budih; + case UINT32_C(0x174F): c1 = 'w'; goto budih; + case UINT32_C(0x1745): c1 = '-'; goto budih; + budih: + if (*n) + goto no_conv; + c2 = 'a'; + s = &s[clen]; + *n += clen; + if (!slen) { + ret = LIBCHARCONV_CONVERT_IF_END; + goto budih_conv; + } + clen = decode_utf8(s, slen, &c); + if (clen > slen) + return LIBCHARCONV_INDETERMINATE; + if (!clen) + goto budih_conv; + switch (c) { + case UINT32_C(0x1752): c2 = 'i'; *n += clen; break; + case UINT32_C(0x1753): c2 = 'u'; *n += clen; break; + default: + break; + } + budih_conv: + if (c1 == '-') { + c3 = c2; + c2 = 'g'; + c1 = 'n'; + goto conv3_prechecked; + } + goto conv2_prechecked; + + default: + no_match: + *n += clen; + break; + } + } + } +no_conv: + return LIBCHARCONV_NO_CONVERT; + +conv: + if (*n) + goto no_conv; + *n += clen; + if (*ncp) + *cp = c; + *ncp = 1u; + return ret; + +conv2: + if (*n) + goto no_conv; + *n += clen; +conv2_prechecked: + if (*ncp >= 1u) + cp[0] = (uint_least32_t)c1; + if (*ncp >= 2u) + cp[1] = (uint_least32_t)c2; + *ncp = 2u; + return ret; + +conv3_prechecked: + if (*ncp >= 1u) + cp[0] = (uint_least32_t)c1; + if (*ncp >= 2u) + cp[1] = (uint_least32_t)c2; + if (*ncp >= 3u) + cp[2] = (uint_least32_t)c3; + *ncp = 3u; + return ret; +} |
