aboutsummaryrefslogtreecommitdiffstats
path: root/libterminput_utf8_decode__.c
diff options
context:
space:
mode:
Diffstat (limited to 'libterminput_utf8_decode__.c')
-rw-r--r--libterminput_utf8_decode__.c79
1 files changed, 79 insertions, 0 deletions
diff --git a/libterminput_utf8_decode__.c b/libterminput_utf8_decode__.c
new file mode 100644
index 0000000..e4d0e75
--- /dev/null
+++ b/libterminput_utf8_decode__.c
@@ -0,0 +1,79 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+
+
/**
 * Decode a single UTF-8 encoded codepoint
 *
 * Sequences of up to 6 bytes (the original, pre-RFC 3629 form of
 * UTF-8) are accepted; the 0x10FFFF upper bound and the surrogate
 * range are deliberately not checked.
 *
 * @param   s   The byte string to read from; the caller must make sure
 *              that reading stops inside the buffer, e.g. by terminating
 *              it with a byte that is not a continuation byte (such as NUL)
 * @param   ip  Pointer to the read offset in `s`; advanced past every
 *              byte that was accepted. On failure it is left at the
 *              byte that was rejected (for an invalid lead byte that
 *              means it is not advanced at all)
 * @return      The decoded codepoint, or 0 if the input is not a valid,
 *              minimally encoded sequence (note that a NUL byte also
 *              decodes to 0)
 */
unsigned long long int
libterminput_utf8_decode__(const char *s, size_t *ip)
{
	/* Lowest codepoint that requires an encoding of `len` bytes, indexed by `len` */
	static const unsigned long long int min_cp[] = {
		0, 0,
		1ULL << (7 + 0 * 6),
		1ULL << (5 + 1 * 6),
		1ULL << (4 + 2 * 6),
		1ULL << (3 + 3 * 6),
		1ULL << (2 + 4 * 6)
	};
	unsigned long long int cp;
	unsigned char byte = (unsigned char)s[*ip];
	size_t len, remaining;

	/* Parse the first byte, to get the highest codepoint bits and the encoding length */
	if ((byte & 0x80U) == 0) {
		++*ip;
		return (unsigned long long int)byte;
	} else if ((byte & 0xE0U) == 0xC0U) {
		cp = (unsigned long long int)(byte ^ 0xC0U);
		len = 2U;
	} else if ((byte & 0xF0U) == 0xE0U) {
		cp = (unsigned long long int)(byte ^ 0xE0U);
		len = 3U;
	} else if ((byte & 0xF8U) == 0xF0U) {
		cp = (unsigned long long int)(byte ^ 0xF0U);
		len = 4U;
	} else if ((byte & 0xFCU) == 0xF8U) {
		cp = (unsigned long long int)(byte ^ 0xF8U);
		len = 5U;
	} else if ((byte & 0xFEU) == 0xFCU) {
		cp = (unsigned long long int)(byte ^ 0xFCU);
		len = 6U;
	} else {
		/* Stray continuation byte (0x80-0xBF) or 0xFE/0xFF: not a valid
		 * lead byte, and must not be parsed as part of a sequence */
		return 0;
	}
	++*ip;

	/* Parse continuation bytes; check marked as continuation then get codepoint bits */
	for (remaining = len - 1U; remaining; remaining--) {
		byte = (unsigned char)s[*ip];
		if ((byte & 0xC0U) != 0x80U)
			return 0;
		cp <<= 6;
		cp |= (unsigned long long int)(byte ^ 0x80U);
		++*ip;
	}

	/* Check that encoded codepoint is encoded with the minimum possible length */
	if (cp < min_cp[len])
		return 0;

	/* (Let's ignore the 0x10FFFF upper bound.) */

	return cp;
}