diff options
Diffstat (limited to '')
-rw-r--r-- | doc/info/mds.texinfo | 12 | ||||
-rw-r--r-- | src/libmdsserver/util.c | 77 | ||||
-rw-r--r-- | src/libmdsserver/util.h | 11 |
3 files changed, 99 insertions, 1 deletions
/**
 * Check whether a NUL-terminated string is encoded in UTF-8
 * 
 * The string is rejected if any character does not use the
 * shortest possible byte-combination. However, if
 * `allow_modified_nul` is set, the two-byte sequence `192 128`
 * is accepted in place of `0` for a NUL-character (remember
 * that `0` terminates the string, but `192 128` does not).
 * Sequences of up to six bytes (31-bit code points) are accepted.
 * 
 * @param   string              The string
 * @param   allow_modified_nul  Whether Modified UTF-8 is allowed, which allows a two-byte encoding for NUL
 * @return                      Zero if good, -1 on encoding error
 */
int verify_utf8(const char* string, int allow_modified_nul)
{
  /*                                                       min bits  max bits
     0.......                                                 0         7
     110..... 10......                                        8        11
     1110.... 10...... 10......                              12        16
     11110... 10...... 10...... 10......                     17        21
     111110.. 10...... 10...... 10...... 10......            22        26
     1111110. 10...... 10...... 10...... 10...... 10......   27        31
   */
  static const int BYTES_TO_MIN_BITS[] = {0, 0,  8, 12, 17, 22, 27};
  static const int BYTES_TO_MAX_BITS[] = {0, 7, 11, 16, 21, 26, 31};
  
  /* Read the bytes as unsigned so that bytes >= 0x80 never become
     negative; left-shifting a negative value is undefined behaviour. */
  const unsigned char* input = (const unsigned char*)string;
  unsigned long character = 0;
  int bytes = 0, read_bytes = 0, bits;
  unsigned c;
  
  for (; (c = *input) != 0; input++)
    {
      if (read_bytes == 0)
	{
	  /* First byte of the character. */
	  
	  if ((c & 0x80) == 0x00)
	    /* Single-byte character. */
	    continue;
	  
	  if ((c & 0xC0) == 0x80)
	    /* Single-byte character marked as multibyte, or
	       a non-first byte in a multibyte character. */
	    return -1;
	  
	  /* Multibyte character: the number of leading
	     one-bits is the length of the sequence. */
	  for (bytes = 0; c & 0x80; c = (c << 1) & 0xFF)
	    bytes++;
	  
	  if (bytes > 6)
	    /* 31-bit characters can be encoded with 6-bytes,
	       and UTF-8 does not cover higher code points. */
	    return -1;
	  
	  /* `c` has been shifted left `bytes` times while counting,
	     shift it back to get the payload bits of the first byte. */
	  character = (unsigned long)(c >> bytes);
	  read_bytes = 1;
	}
      else
	{
	  /* Not first byte of the character. */
	  
	  if ((c & 0xC0) != 0x80)
	    /* Beginning of new character before a
	       multibyte character has ended. */
	    return -1;
	  
	  /* Each continuation byte carries six payload bits. */
	  character = (character << 6) | (unsigned long)(c & 0x3F);
	  
	  if (++read_bytes < bytes)
	    /* Not at last byte yet. */
	    continue;
	  
	  /* Check that the character is not unnecessarily long. */
	  for (bits = 0; character; character >>= 1)
	    bits++;
	  if ((bits == 0) && (bytes == 2) && allow_modified_nul)
	    /* Modified UTF-8: `192 128` is a permitted encoding of NUL. */
	    bits = 8;
	  if ((bits < BYTES_TO_MIN_BITS[bytes]) || (BYTES_TO_MAX_BITS[bytes] < bits))
	    return -1;
	  
	  read_bytes = bytes = 0;
	}
    }
  
  /* Make sure we did not stop at the middle of a multibyte character. */
  return read_bytes == 0 ? 0 : -1;
}
`NULL` on error */ char* full_read(int fd, size_t* length); @@ -151,6 +151,15 @@ int startswith_n(const char* haystack, const char* needle, size_t haystack_n, si */ pid_t uninterruptable_waitpid(pid_t pid, int* restrict status, int options); +/** + * Check whether a NUL-terminated string is encoded in UTF-8 + * + * @param string The string + * @param allow_modified_nul Whether Modified UTF-8 is allowed, which allows a two-byte encoding for NUL + * @return Zero if good, -1 on encoding error + */ +int verify_utf8(const char* string, int allow_modified_nul) __attribute__((pure)); + #endif |