diff options
Diffstat (limited to 'libterminput_utf8_decode__.c')
-rw-r--r-- | libterminput_utf8_decode__.c | 79 |
1 files changed, 79 insertions, 0 deletions
/* See LICENSE file for copyright and license details. */
/*
 * Project header. When it is not on the include path (stand-alone build of
 * this file) the only outside name the function needs is size_t.
 */
#if defined(__has_include)
# if __has_include("common.h")
#  include "common.h"
# else
#  include <stddef.h>
# endif
#else
# include "common.h"
#endif


/**
 * Decode one UTF-8 encoded codepoint (classic 1- to 6-byte forms)
 *
 * The string is read until the sequence is complete or the first
 * offending byte is found, so it must be terminated (e.g. by a NUL
 * byte) or be known to hold a complete sequence; no length is taken.
 *
 * @param   s   The text to decode from
 * @param   ip  Offset in `s` of the first byte of the sequence. On
 *              success it is advanced past the sequence. On failure
 *              it is left at the offending byte: the lead byte itself
 *              if it is not a valid lead byte, otherwise the first
 *              byte that is not a continuation byte. For an overlong
 *              encoding the whole sequence is consumed.
 * @return      The decoded codepoint, or 0 if the sequence is invalid
 *              (bad lead byte, bad continuation byte, or encoded with
 *              more bytes than necessary). A NUL byte also decodes
 *              to 0. Surrogates and values above 0x10FFFF are not
 *              rejected.
 */
unsigned long long int
libterminput_utf8_decode__(const char *s, size_t *ip)
{
	/* Smallest codepoint that requires an encoding of the indexed length */
	static const unsigned long long int min_cp[] = {
		0ULL, 0ULL,
		1ULL << (7 + 0 * 6),
		1ULL << (5 + 1 * 6),
		1ULL << (4 + 2 * 6),
		1ULL << (3 + 3 * 6),
		1ULL << (2 + 4 * 6)
	};
	unsigned long long int cp;
	unsigned char byte = (unsigned char)s[*ip];
	size_t len, left;

	/* Parse the first byte to get the encoding length */
	if (byte < 0x80U) {
		(*ip)++;
		return (unsigned long long int)byte;
	} else if ((byte & 0xE0U) == 0xC0U) {
		len = 2U;
	} else if ((byte & 0xF0U) == 0xE0U) {
		len = 3U;
	} else if ((byte & 0xF8U) == 0xF0U) {
		len = 4U;
	} else if ((byte & 0xFCU) == 0xF8U) {
		len = 5U;
	} else if ((byte & 0xFEU) == 0xFCU) {
		len = 6U;
	} else {
		/* Stray continuation byte (10xxxxxx) or 0xFE/0xFF: not a lead
		 * byte. Must be rejected here; otherwise `len` would be used
		 * without ever having been set. */
		return 0;
	}

	/* The lead byte has `len` marker bits followed by a 0 bit; what
	 * remains are the highest bits of the codepoint */
	cp = (unsigned long long int)(byte & (0xFFU >> (len + 1U)));
	(*ip)++;

	/* Parse continuation bytes: each must be marked 10xxxxxx and
	 * contributes its 6 low bits */
	for (left = len - 1U; left; left--) {
		byte = (unsigned char)s[*ip];
		if ((byte & 0xC0U) != 0x80U)
			return 0;
		cp = (cp << 6) | (unsigned long long int)(byte ^ 0x80U);
		(*ip)++;
	}

	/* Check that the codepoint is encoded with the minimum possible length */
	if (cp < min_cp[len])
		return 0;

	/* (Let's ignore the 0x10FFFF upper bound.) */

	return cp;
}