/* See LICENSE file for copyright and license details. */
#include "common.h"
#ifndef TEST


/**
 * Check whether a byte buffer contains only well-formed UTF-8 sequences.
 *
 * The check uses the original UTF-8 layout with sequences of up to 6 bytes
 * (code points of up to 31 bits). Overlong encodings are rejected. Surrogate
 * code points and code points above U+10FFFF are NOT rejected. NUL bytes are
 * treated like any other single-byte character.
 *
 * @param   string              The buffer to inspect; exactly `n` bytes are read
 * @param   n                   The number of bytes in `string`
 * @param   allow_modified_nul  If non-zero, the overlong two-byte encoding of
 *                              NUL (0xC0 0x80, "Modified UTF-8") is accepted
 * @return                      1 if the buffer is valid, 0 otherwise
 */
int
libsimple_memisutf8(const char *string, size_t n, int allow_modified_nul)
{
	/* Indexed by sequence length in bytes; entry 0 is unused. */
	static const long int BYTES_TO_MIN_BITS[] = {0, 0,  8, 12, 17, 22, 27};
	static const long int BYTES_TO_MAX_BITS[] = {0, 7, 11, 16, 21, 26, 31};
	long int bytes = 0, read_bytes = 0, bits = 0, c;
	long int character = 0; /* initialised only to silence a false compiler warning;
	                         * it is always set by the lead byte before being used */
	size_t i;

	/*
	 *                                                         min bits  max bits
	 * 0.......                                                    0         7
	 * 110..... 10......                                           8        11
	 * 1110.... 10...... 10......                                 12        16
	 * 11110... 10...... 10...... 10......                        17        21
	 * 111110.. 10...... 10...... 10...... 10......               22        26
	 * 1111110. 10...... 10...... 10...... 10...... 10......      27        31
	 */

	for (i = 0; i < n; i++) {
		/*
		 * Go through `unsigned char`: where `char` is signed, bytes >= 0x80
		 * would otherwise become negative, and left-shifting a negative
		 * value (done below) is undefined behaviour.
		 */
		c = (long int)(unsigned char)string[i];

		if (!read_bytes) {
			/* First byte of the character. */

			if (!(c & 0x80))
				/* Single-byte character. */
				continue;

			if ((c & 0xC0) == 0x80)
				/* Single-byte character marked as multibyte, or
				 * a non-first byte in a multibyte character. */
				return 0;

			/* Multibyte character: the number of leading 1-bits
			 * is the length of the sequence. */
			while (c & 0x80) {
				bytes++;
				c <<= 1;
			}

			if (bytes > 6)
				/* 31-bit characters can be encoded with 6 bytes,
				 * and UTF-8 does not cover higher code points. */
				return 0;

			read_bytes = 1;
			/* The length marker has been shifted out of the low octet;
			 * shifting back leaves only the payload bits of the lead byte. */
			character = (c & 0xFF) >> bytes;
		} else {
			/* Not first byte of the character. */

			if ((c & 0xC0) != 0x80)
				/* Beginning of new character before a
				 * multibyte character has ended. */
				return 0;

			/* Each continuation byte carries 6 payload bits. */
			character = (character << 6) | (c & 0x3F);

			if (++read_bytes < bytes)
				/* Not at last byte yet. */
				continue;

			/* Check that the character is not unnecessarily long. */
			while (character) {
				character >>= 1;
				bits++;
			}
			/* 0xC0 0x80 has zero significant bits; when permitted,
			 * pretend it just reaches the two-byte minimum. */
			if (!bits && bytes == 2 && allow_modified_nul)
				bits = 8;
			if (bits < BYTES_TO_MIN_BITS[bytes] || BYTES_TO_MAX_BITS[bytes] < bits)
				return 0;

			read_bytes = bytes = bits = 0;
		}
	}

	/* Make sure we did not stop at the middle of a multibyte character. */
	return !read_bytes;
}


#else
#include "test.h"

int
main(void)
{
/*
 * Check STRING three times: as is, and followed by an extra 0xFF or 0x00
 * byte that lies outside the specified length and must not affect the result.
 * Uses the loop variable `i` as the `allow_modified_nul` argument.
 */
#define ASSERT(STRING, GOOD)\
	do {\
		assert(libsimple_memisutf8(STRING, sizeof(STRING) - 1, i) == (GOOD));\
		assert(libsimple_memisutf8(STRING "\xFF", sizeof(STRING) - 1, i) == (GOOD));\
		assert(libsimple_memisutf8(STRING "\x00", sizeof(STRING) - 1, i) == (GOOD));\
	} while (0)

	int i;

	/* i = 0: strict, i = 1: Modified UTF-8 NUL allowed. */
	for (i = 0; i < 2; i++) {
		ASSERT("", 1);
		ASSERT("a", 1);
		ASSERT("abc", 1);
		ASSERT("123", 1);
		/* Non-ASCII text is spelled with escapes so that the test
		 * does not depend on the encoding of this source file. */
		ASSERT("\xC3\xA5\xC3\xA4\xC3\xB6", 1); /* U+00E5 U+00E4 U+00F6: two-byte sequences */
		ASSERT("\xF0\x9D\x96\x86\xF0\x9D\x96\x87\xF0\x9D\x96\x88", 1); /* U+1D586..U+1D588: four-byte sequences */
		ASSERT("\x1b", 1);
		ASSERT("\n\r\t\f", 1);
		ASSERT("\xFF", 0);
		ASSERT("\x01", 1);
		ASSERT("\x7F", 1);
		ASSERT("\x80", 0);
		ASSERT("\xC0", 0);
		ASSERT("\xC0\x80", i);
		ASSERT("\xC0\x81", 0);
		ASSERT("\xCF", 0);
		ASSERT("\xEF", 0);
		ASSERT("\xEF\x8F", 0);
		ASSERT("\xF7", 0);
		ASSERT("\xF7\x8F", 0);
		ASSERT("\xF7\x8F\x8F", 0);
		ASSERT("\xFA", 0);
		ASSERT("\xFA\x8F", 0);
		ASSERT("\xFA\x8F\x8F", 0);
		ASSERT("\xFA\x8F\x8F\x8F", 0);
		ASSERT("\xFD", 0);
		ASSERT("\xFD\x8F", 0);
		ASSERT("\xFD\x8F\x8F", 0);
		ASSERT("\xFD\x8F\x8F\x8F", 0);
		ASSERT("\xFD\x8F\x8F\x8F\x8F", 0);
		ASSERT("\xFE", 0);
		ASSERT("\xFE\x8F", 0);
		ASSERT("\xFE\x8F\x8F", 0);
		ASSERT("\xFE\x8F\x8F\x8F", 0);
		ASSERT("\xFE\x8F\x8F\x8F\x8F", 0);
		ASSERT("\xFE\x8F\x8F\x8F\x8F\x8F", 0);
		ASSERT("\xFF", 0);
		ASSERT("\xFF\x8F", 0);
		ASSERT("\xFF\x8F\x8F", 0);
		ASSERT("\xFF\x8F\x8F\x8F", 0);
		ASSERT("\xFF\x8F\x8F\x8F\x8F", 0);
		ASSERT("\xFF\x8F\x8F\x8F\x8F\x8F", 0);
		ASSERT("\xFF\x8F\x8F\x8F\x8F\x8F\x8F", 0);
		ASSERT("\xC1\x80", 0);
		ASSERT("\xC2\x80", 1);
		ASSERT("\xE1\x80\x80\x80", 0);
		ASSERT("\xE1\x80\xC0\x80", 0);
		ASSERT("\xE1\x80\x00\x80", 0);
		ASSERT("\xF1\x80\x80\x80", 1);
		ASSERT("\xFF\x80\x80\x80\x80\x80\x80\x80", 0);
		assert(libsimple_memisutf8("\0abc", sizeof("\0abc") - 1, i) == 1);
		assert(libsimple_memisutf8("\0abc\x80", sizeof("\0abc\x80") - 1, i) == 0);
	}

	return 0;
}

#endif