aboutsummaryrefslogtreecommitdiffstats
path: root/libcharconv_latin.c
diff options
context:
space:
mode:
Diffstat (limited to 'libcharconv_latin.c')
-rw-r--r--libcharconv_latin.c410
1 files changed, 410 insertions, 0 deletions
diff --git a/libcharconv_latin.c b/libcharconv_latin.c
new file mode 100644
index 0000000..d682b17
--- /dev/null
+++ b/libcharconv_latin.c
@@ -0,0 +1,410 @@
+/* See LICENSE file for copyright and license details. */
+#include "libcharconv.h"
+
+
/**
 * Decode a single UTF-8 encoded code point from the start of a buffer
 *
 * @param   s     The bytes to decode
 * @param   slen  The number of bytes available in `s`
 * @param   cp    Output parameter for the decoded code point; only
 *                written when a complete, valid sequence was decoded
 * @return        0 if `slen` is 0 or if `s` does not begin with a valid
 *                UTF-8 sequence (bad lead byte, bad continuation byte,
 *                overlong encoding, surrogate, or value above U+10FFFF);
 *                otherwise the length of the sequence in bytes, which is
 *                greater than `slen` if the sequence is cut short by the
 *                end of the buffer (in which case `*cp` is not written)
 */
static size_t
decode_utf8(const char *s, size_t slen, uint_least32_t *cp)
{
	/* Inspect bytes as unsigned so the result does not depend on whether `char` is signed */
	const unsigned char *b = (const unsigned char *)s;
	uint_least32_t value, min, max;
	size_t i, n, avail;

	if (slen < 1u)
		return 0u;

	if (b[0] < 0x80u) {
		*cp = (uint_least32_t)b[0];
		return 1u;
	} else if ((b[0] & 0xE0u) == 0xC0u) {
		value = (uint_least32_t)(b[0] & 0x1Fu);
		n = 2u;
		min = UINT32_C(0x80);
		max = UINT32_C(0x800);
	} else if ((b[0] & 0xF0u) == 0xE0u) {
		value = (uint_least32_t)(b[0] & 0x0Fu);
		n = 3u;
		min = UINT32_C(0x800);
		max = UINT32_C(0x10000);
	} else if ((b[0] & 0xF8u) == 0xF0u) {
		value = (uint_least32_t)(b[0] & 0x07u);
		n = 4u;
		min = UINT32_C(0x10000);
		max = UINT32_C(0x110000);
	} else {
		/* stray continuation byte or a byte that never occurs in UTF-8 */
		return 0u;
	}

	/*
	 * Every trailing byte must look like 10xxxxxx. Check the ones that
	 * are available even if the sequence is truncated, so that input
	 * which is already known to be invalid is not reported as merely
	 * incomplete.
	 */
	avail = slen < n ? slen : n;
	for (i = 1u; i < avail; i++) {
		if ((b[i] & 0xC0u) != 0x80u)
			return 0u;
	}

	/* Valid so far, but more bytes are needed */
	if (slen < n)
		return n;

	for (i = 1u; i < n; i++)
		value = (value << 6) | (uint_least32_t)(b[i] & 0x3Fu);

	/* Reject overlong encodings and values beyond U+10FFFF */
	if (value < min || value >= max)
		return 0u;

	/* Reject UTF-16 surrogates, which are not valid in UTF-8 */
	if (UINT32_C(0xD800) <= value && value <= UINT32_C(0xDFFF))
		return 0u;

	*cp = value;
	return n;
}
+
+
+enum libcharconv_result
+libcharconv_latin(const char *s, size_t slen, size_t *n, uint_least32_t *cp, size_t *ncp)
+{
+ enum libcharconv_result ret = LIBCHARCONV_CONVERTED;
+ uint_least32_t c;
+ char c1, c2, c3;
+ size_t clen;
+
+ *n = 0;
+ for (; slen; s++) {
+ clen = decode_utf8(s, slen, &c);
+ if (clen > slen)
+ return LIBCHARCONV_INDETERMINATE;
+ if (!clen) {
+ *n += 1u;
+ slen -= 1u;
+ continue;
+ }
+ slen -= clen;
+
+ if (UINT32_C(0x2680) <= c && c <= UINT32_C(0x2685)) {
+ /* dice */
+ c -= (uint_least32_t)UINT32_C(0x2680) - (uint_least32_t)'1';
+ goto conv;
+
+ } else if (UINT32_C(0x1F1E6) <= c && c <= UINT32_C(0x1F1FF)) {
+ /* region indicators */
+ c -= (uint_least32_t)UINT32_C(0x1F1E6) - (uint_least32_t)'A';
+ goto conv;
+
+ } else if (UINT32_C(0xE0020) <= c && c <= UINT32_C(0xE007E)) {
+ /* tags */
+ c -= (uint_least32_t)UINT32_C(0xE0000);
+ goto conv;
+
+ } else if (UINT32_C(0x10800) <= c && c <= UINT32_C(0x1083F)) {
+ /* cypriot */
+ c -= UINT32_C(0x10800);
+ c1 = "_jklmnprstwxz"[c / 5];
+ c2 = "aeiou"[c % 5];
+ if (c1 == '_') {
+ c = (uint_least32_t)c2;
+ goto conv;
+ }
+ if (c1 == 'j' && c2 != 'a' && c2 != 'o')
+ goto no_match;
+ if (c1 == 'w' && c2 == 'u')
+ goto no_match;
+ if (c1 == 'x' && c2 != 'a' && c2 != 'e')
+ goto no_match;
+ if (c1 == 'z' && c2 != 'a' && c2 != 'o')
+ goto no_match;
+ goto conv2;
+
+ } else if (UINT32_C(0x1D400) <= c && c <= UINT32_C(0x1D419)) {
+ /* bold (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D400) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D41A) <= c && c <= UINT32_C(0x1D433)) {
+ /* bold (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D41A) - (uint_least32_t)'a';
+ goto conv;
+ } else if (UINT32_C(0x1D7CE) <= c && c <= UINT32_C(0x1D7D7)) {
+ /* bold (digit) */
+ c -= (uint_least32_t)UINT32_C(0x1D7CE) - (uint_least32_t)'0';
+ goto conv;
+
+ } else if (UINT32_C(0x1D434) <= c && c <= UINT32_C(0x1D44D)) {
+ /* italic (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D434) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D44E) <= c && c <= UINT32_C(0x1D467)) {
+ /* italic (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D44E) - (uint_least32_t)'a';
+ goto conv;
+ } else if (c == UINT32_C(0x210E)) {
+ /* italic (small h) */
+ c = (uint_least32_t)'h';
+ goto conv;
+
+ } else if (UINT32_C(0x1D468) <= c && c <= UINT32_C(0x1D481)) {
+ /* bold italic (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D468) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D482) <= c && c <= UINT32_C(0x1D49B)) {
+ /* bold italic (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D482) - (uint_least32_t)'a';
+ goto conv;
+
+ } else if (UINT32_C(0x1D670) <= c && c <= UINT32_C(0x1D689)) {
+ /* monospace (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D670) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D68A) <= c && c <= UINT32_C(0x1D6A3)) {
+ /* monospace (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D68A) - (uint_least32_t)'a';
+ goto conv;
+ } else if (UINT32_C(0x1D7F6) <= c && c <= UINT32_C(0x1D7FF)) {
+ /* monospace (digit) */
+ c -= (uint_least32_t)UINT32_C(0x1D7F6) - (uint_least32_t)'0';
+ goto conv;
+
+ } else if (UINT32_C(0x1FBF0) <= c && c <= UINT32_C(0x1FBF9)) {
+ /* segmented */
+ c -= (uint_least32_t)UINT32_C(0x1FBF0) - (uint_least32_t)'0';
+ goto conv;
+
+ } else if (UINT32_C(0x1D5A0) <= c && c <= UINT32_C(0x1D5B9)) {
+ /* sans-serif (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D5A0) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D5BA) <= c && c <= UINT32_C(0x1D5D3)) {
+ /* sans-serif (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D5BA) - (uint_least32_t)'a';
+ goto conv;
+ } else if (UINT32_C(0x1D7E2) <= c && c <= UINT32_C(0x1D7EB)) {
+ /* sans-serif (digit) */
+ c -= (uint_least32_t)UINT32_C(0x1D7E2) - (uint_least32_t)'0';
+ goto conv;
+
+ } else if (UINT32_C(0x1D5D4) <= c && c <= UINT32_C(0x1D5ED)) {
+ /* sans-serif bold (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D5D4) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D5EE) <= c && c <= UINT32_C(0x1D607)) {
+ /* sans-serif bold (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D5EE) - (uint_least32_t)'a';
+ goto conv;
+ } else if (UINT32_C(0x1D7EC) <= c && c <= UINT32_C(0x1D7F5)) {
+ /* sans-serif bold (digit) */
+ c -= (uint_least32_t)UINT32_C(0x1D7EC) - (uint_least32_t)'0';
+ goto conv;
+
+ } else if (UINT32_C(0x1D608) <= c && c <= UINT32_C(0x1D621)) {
+ /* sans-serif italic (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D608) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D622) <= c && c <= UINT32_C(0x1D63B)) {
+ /* sans-serif italic (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D622) - (uint_least32_t)'a';
+ goto conv;
+
+ } else if (UINT32_C(0x1D63C) <= c && c <= UINT32_C(0x1D655)) {
+ /* sans-serif bold italic (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D63C) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D656) <= c && c <= UINT32_C(0x1D66F)) {
+ /* sans-serif bold italic (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D656) - (uint_least32_t)'a';
+ goto conv;
+
+ } else if (UINT32_C(0x1D538) <= c && c <= UINT32_C(0x1D551)) {
+ /* double-struck (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D538) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D552) <= c && c <= UINT32_C(0x1D56B)) {
+ /* double-struck (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D552) - (uint_least32_t)'a';
+ goto conv;
+ } else if (UINT32_C(0x1D7D8) <= c && c <= UINT32_C(0x1D7E1)) {
+ /* double-struck (digit) */
+ c -= (uint_least32_t)UINT32_C(0x1D7D8) - (uint_least32_t)'0';
+ goto conv;
+
+ } else if (UINT32_C(0x1D504) <= c && c <= UINT32_C(0x1D51D)) {
+ /* fraktur (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D504) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D51E) <= c && c <= UINT32_C(0x1D537)) {
+ /* fraktur (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D51E) - (uint_least32_t)'a';
+ goto conv;
+
+ } else if (UINT32_C(0x1D56C) <= c && c <= UINT32_C(0x1D585)) {
+ /* bold fraktur (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D56C) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D586) <= c && c <= UINT32_C(0x1D59F)) {
+ /* bold fraktur (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D586) - (uint_least32_t)'a';
+ goto conv;
+
+ } else if (UINT32_C(0x1D49C) <= c && c <= UINT32_C(0x1D4B5)) {
+ /* script (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D49C) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D4B6) <= c && c <= UINT32_C(0x1D4CF)) {
+ /* script (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D4B6) - (uint_least32_t)'a';
+ goto conv;
+
+ } else if (UINT32_C(0x1D4D0) <= c && c <= UINT32_C(0x1D4E9)) {
+ /* bold script (captial) */
+ c -= (uint_least32_t)UINT32_C(0x1D4D0) - (uint_least32_t)'A';
+ goto conv;
+ } else if (UINT32_C(0x1D4EA) <= c && c <= UINT32_C(0x1D503)) {
+ /* bold script (small) */
+ c -= (uint_least32_t)UINT32_C(0x1D4EA) - (uint_least32_t)'a';
+ goto conv;
+
+ } else {
+ switch (c) {
+ /* shogi */
+ case UINT32_C(0x2616): c = (uint_least32_t)'w'; goto conv;
+ case UINT32_C(0x2617): c = (uint_least32_t)'b'; goto conv;
+ case UINT32_C(0x26C9): c = (uint_least32_t)'W'; goto conv;
+ case UINT32_C(0x26CA): c = (uint_least32_t)'B'; goto conv;
+
+ /* go (white) */
+ case UINT32_C(0x25CB): c = (uint_least32_t)'0'; goto conv;
+ case UINT32_C(0x2686): c = (uint_least32_t)'1'; goto conv;
+ case UINT32_C(0x2687): c = (uint_least32_t)'2'; goto conv;
+
+ /* go (black) */
+ case UINT32_C(0x25CF): c = (uint_least32_t)'0'; goto conv;
+ case UINT32_C(0x2688): c = (uint_least32_t)'1'; goto conv;
+ case UINT32_C(0x2689): c = (uint_least32_t)'2'; goto conv;
+
+ /* draughts */
+ case UINT32_C(0x26C0): c = (uint_least32_t)'m'; goto conv;
+ case UINT32_C(0x26C1): c = (uint_least32_t)'k'; goto conv;
+ case UINT32_C(0x26C2): c = (uint_least32_t)'M'; goto conv;
+ case UINT32_C(0x26C3): c = (uint_least32_t)'K'; goto conv;
+
+ /* gender symbols */
+ case UINT32_C(0x2640): c = (uint_least32_t)'f'; goto conv;
+ case UINT32_C(0x2642): c = (uint_least32_t)'m'; goto conv;
+ case UINT32_C(0x263F): c = (uint_least32_t)'i'; goto conv;
+
+ /* double-struck */
+ case UINT32_C(0x2102): c = (uint_least32_t)'C'; goto conv;
+ case UINT32_C(0x210D): c = (uint_least32_t)'H'; goto conv;
+ case UINT32_C(0x2115): c = (uint_least32_t)'N'; goto conv;
+ case UINT32_C(0x2119): c = (uint_least32_t)'P'; goto conv;
+ case UINT32_C(0x211A): c = (uint_least32_t)'Q'; goto conv;
+ case UINT32_C(0x211D): c = (uint_least32_t)'R'; goto conv;
+ case UINT32_C(0x2124): c = (uint_least32_t)'Z'; goto conv;
+
+ /* double-struck italic */
+ case UINT32_C(0x2145): c = (uint_least32_t)'D'; goto conv;
+ case UINT32_C(0x2146): c = (uint_least32_t)'d'; goto conv;
+ case UINT32_C(0x2147): c = (uint_least32_t)'e'; goto conv;
+ case UINT32_C(0x2148): c = (uint_least32_t)'i'; goto conv;
+ case UINT32_C(0x2149): c = (uint_least32_t)'j'; goto conv;
+
+ /* fraktur */
+ case UINT32_C(0x212D): c = (uint_least32_t)'C'; goto conv;
+ case UINT32_C(0x210C): c = (uint_least32_t)'H'; goto conv;
+ case UINT32_C(0x2111): c = (uint_least32_t)'I'; goto conv;
+ case UINT32_C(0x211C): c = (uint_least32_t)'R'; goto conv;
+ case UINT32_C(0x2128): c = (uint_least32_t)'Z'; goto conv;
+
+ /* script */
+ case UINT32_C(0x212C): c = (uint_least32_t)'B'; goto conv;
+ case UINT32_C(0x2130): c = (uint_least32_t)'E'; goto conv;
+ case UINT32_C(0x2131): c = (uint_least32_t)'F'; goto conv;
+ case UINT32_C(0x210B): c = (uint_least32_t)'H'; goto conv;
+ case UINT32_C(0x2110): c = (uint_least32_t)'I'; goto conv;
+ case UINT32_C(0x2112): c = (uint_least32_t)'L'; goto conv;
+ case UINT32_C(0x2133): c = (uint_least32_t)'M'; goto conv;
+ case UINT32_C(0x211B): c = (uint_least32_t)'R'; goto conv;
+ case UINT32_C(0x212F): c = (uint_least32_t)'e'; goto conv;
+ case UINT32_C(0x210A): c = (uint_least32_t)'g'; goto conv;
+ case UINT32_C(0x2134): c = (uint_least32_t)'o'; goto conv;
+
+ /* buhid */
+ case UINT32_C(0x1740): c = (uint_least32_t)'a'; goto conv;
+ case UINT32_C(0x1741): c = (uint_least32_t)'i'; goto conv;
+ case UINT32_C(0x1742): c = (uint_least32_t)'u'; goto conv;
+ case UINT32_C(0x1752): c2 = 'i'; goto budih_combining;
+ case UINT32_C(0x1753): c2 = 'u'; goto budih_combining;
+ budih_combining:
+ c1 = '^';
+ goto conv2;
+ case UINT32_C(0x174A): c1 = 'b'; goto budih;
+ case UINT32_C(0x1747): c1 = 'd'; goto budih;
+ case UINT32_C(0x1744): c1 = 'g'; goto budih;
+ case UINT32_C(0x1751): c1 = 'h'; goto budih;
+ case UINT32_C(0x1743): c1 = 'k'; goto budih;
+ case UINT32_C(0x174E): c1 = 'l'; goto budih;
+ case UINT32_C(0x174B): c1 = 'm'; goto budih;
+ case UINT32_C(0x1748): c1 = 'n'; goto budih;
+ case UINT32_C(0x1749): c1 = 'p'; goto budih;
+ case UINT32_C(0x174D): c1 = 'r'; goto budih;
+ case UINT32_C(0x1750): c1 = 's'; goto budih;
+ case UINT32_C(0x1746): c1 = 't'; goto budih;
+ case UINT32_C(0x174C): c1 = 'y'; goto budih;
+ case UINT32_C(0x174F): c1 = 'w'; goto budih;
+ case UINT32_C(0x1745): c1 = '-'; goto budih;
+ budih:
+ if (*n)
+ goto no_conv;
+ c2 = 'a';
+ s = &s[clen];
+ *n += clen;
+ if (!slen) {
+ ret = LIBCHARCONV_CONVERT_IF_END;
+ goto budih_conv;
+ }
+ clen = decode_utf8(s, slen, &c);
+ if (clen > slen)
+ return LIBCHARCONV_INDETERMINATE;
+ if (!clen)
+ goto budih_conv;
+ switch (c) {
+ case UINT32_C(0x1752): c2 = 'i'; *n += clen; break;
+ case UINT32_C(0x1753): c2 = 'u'; *n += clen; break;
+ default:
+ break;
+ }
+ budih_conv:
+ if (c1 == '-') {
+ c3 = c2;
+ c2 = 'g';
+ c1 = 'n';
+ goto conv3_prechecked;
+ }
+ goto conv2_prechecked;
+
+ default:
+ no_match:
+ *n += clen;
+ break;
+ }
+ }
+ }
+no_conv:
+ return LIBCHARCONV_NO_CONVERT;
+
+conv:
+ if (*n)
+ goto no_conv;
+ *n += clen;
+ if (*ncp)
+ *cp = c;
+ *ncp = 1u;
+ return ret;
+
+conv2:
+ if (*n)
+ goto no_conv;
+ *n += clen;
+conv2_prechecked:
+ if (*ncp >= 1u)
+ cp[0] = (uint_least32_t)c1;
+ if (*ncp >= 2u)
+ cp[1] = (uint_least32_t)c2;
+ *ncp = 2u;
+ return ret;
+
+conv3_prechecked:
+ if (*ncp >= 1u)
+ cp[0] = (uint_least32_t)c1;
+ if (*ncp >= 2u)
+ cp[1] = (uint_least32_t)c2;
+ if (*ncp >= 3u)
+ cp[2] = (uint_least32_t)c3;
+ *ncp = 3u;
+ return ret;
+}