aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Mattias Andrée <maandree@operamail.com> 2014-09-20 12:38:17 +0200
committer Mattias Andrée <maandree@operamail.com> 2014-09-20 12:38:17 +0200
commit bc836c1a57a408431071cf4242158c3ff1c0fd10 (patch)
tree 6e70fd85c2bcb775cae2db59c0604a3e1d49a62e /src
parent typo (diff)
download mds-bc836c1a57a408431071cf4242158c3ff1c0fd10.tar.gz
mds-bc836c1a57a408431071cf4242158c3ff1c0fd10.tar.bz2
mds-bc836c1a57a408431071cf4242158c3ff1c0fd10.tar.xz
add verify_utf8
Signed-off-by: Mattias Andrée <maandree@operamail.com>
Diffstat (limited to 'src')
-rw-r--r-- src/libmdsserver/util.c 77
-rw-r--r-- src/libmdsserver/util.h 11
2 files changed, 87 insertions, 1 deletions
diff --git a/src/libmdsserver/util.c b/src/libmdsserver/util.c
index 204d155..b1dff3c 100644
--- a/src/libmdsserver/util.c
+++ b/src/libmdsserver/util.c
@@ -30,6 +30,7 @@
#include <ctype.h>
#include <time.h>
#include <sys/wait.h>
+#include <stdint.h>
@@ -373,3 +374,79 @@ pid_t uninterruptable_waitpid(pid_t pid, int* restrict status, int options)
return rc;
}
+
+/**
+ * Check whether a NUL-terminated string is encoded in UTF-8
+ *
+ * Sequences of up to six bytes (31-bit code points) are accepted, but
+ * overlong encodings are rejected; surrogates and code points above
+ * U+10FFFF are not treated as errors
+ *
+ * @param   string              The string
+ * @param   allow_modified_nul  Whether Modified UTF-8 is allowed, which allows a two-byte encoding for NUL
+ * @return                      Zero if good, -1 on encoding error
+ */
+int verify_utf8(const char* string, int allow_modified_nul)
+{
+  static const long BYTES_TO_MIN_BITS[] = {0, 0,  8, 12, 17, 22, 27};
+  static const long BYTES_TO_MAX_BITS[] = {0, 7, 11, 16, 21, 26, 31};
+  long bytes = 0, read_bytes = 0, bits = 0, c, character = 0;
+  /*                                                      min bits  max bits
+    0.......                                                 0         7
+    110..... 10......                                        8        11
+    1110.... 10...... 10......                              12        16
+    11110... 10...... 10...... 10......                     17        21
+    111110.. 10...... 10...... 10...... 10......            22        26
+    1111110. 10...... 10...... 10...... 10...... 10......   27        31
+   */
+
+  /* Read each byte as unsigned, so bytes >= 0x80 never become negative. */
+  while ((c = (long)(unsigned char)(*string++)))
+    if (read_bytes == 0)
+      {
+        /* First byte of the character. */
+        if ((c & 0x80) == 0x00)
+          /* Single-byte character. */
+          continue;
+
+        if ((c & 0xC0) == 0x80)
+          /* Single-byte character marked as multibyte, or
+             a non-first byte in a multibyte character. */
+          return -1;
+
+        /* Multibyte character: count the leading 1-bits. */
+        while ((c & 0x80))
+          bytes++, c <<= 1;
+        if (bytes > 6)
+          /* 31-bit characters can be encoded with 6-bytes,
+             and UTF-8 does not cover higher code points. */
+          return -1;
+        read_bytes = 1;
+        character = (c & 0xFF) >> bytes; /* Undo the shift, keep the payload. */
+      }
+    else
+      {
+        /* Not first byte of the character. */
+        if ((c & 0xC0) != 0x80)
+          /* Beginning of new character before a
+             multibyte character has ended. */
+          return -1;
+
+        character = (character << 6) | (c & 0x3F);
+        if (++read_bytes < bytes)
+          /* Not at last byte yet. */
+          continue;
+
+        /* Check that the character is not unnecessarily long. */
+        while (character)
+          character >>= 1, bits++;
+        bits = ((bits == 0) && (bytes == 2) && allow_modified_nul) ? 8 : bits;
+        if ((bits < BYTES_TO_MIN_BITS[bytes]) || (BYTES_TO_MAX_BITS[bytes] < bits))
+          return -1;
+        read_bytes = bytes = bits = 0;
+      }
+
+  /* Make sure we did not stop at the middle of a multibyte character. */
+  return read_bytes == 0 ? 0 : -1;
+}
+
diff --git a/src/libmdsserver/util.h b/src/libmdsserver/util.h
index 23804bc..514b86d 100644
--- a/src/libmdsserver/util.h
+++ b/src/libmdsserver/util.h
@@ -124,7 +124,7 @@ int full_write(int fd, const char* buffer, size_t length);
*
* @param fd The file descriptor
* @param length Output parameter for the length of the file, may be `NULL`
- * @return The content of the file, you will need to free it. `NULL` on error.
+ * @return The content of the file, you will need to free it. `NULL` on error
*/
char* full_read(int fd, size_t* length);
@@ -151,6 +151,15 @@ int startswith_n(const char* haystack, const char* needle, size_t haystack_n, si
*/
pid_t uninterruptable_waitpid(pid_t pid, int* restrict status, int options);
+/**
+ * Check whether a NUL-terminated string is encoded in UTF-8
+ *
+ * @param   string              The NUL-terminated string to check
+ * @param   allow_modified_nul  Non-zero to accept Modified UTF-8's two-byte encoding (0xC0 0x80) of NUL
+ * @return                      Zero if good, -1 on encoding error
+ */
+int verify_utf8(const char* string, int allow_modified_nul) __attribute__((pure));
+
#endif