3 files changed, 99 insertions, 1 deletions
diff --git a/doc/info/mds.texinfo b/doc/info/mds.texinfo
index e191a73..6648760 100644
--- a/doc/info/mds.texinfo
+++ b/doc/info/mds.texinfo
@@ -3990,6 +3990,18 @@ Wrapper around @code{waitpid} that never returns on an
 interruption unless it is interrupted one hundred times
 within the same clock second. The parameters and return
 value are exactly those of @code{waitpid}.
+
+@item @code{verify_utf8}[(@code{const char* string, int allow_modified_nul}) @arrow{} @code{int}]
+Checks whether a NUL-terminated string's encoding
+matches UTF-8. This function will reject the string
+if it does not use the shortest possible
+byte-combination for each character. However, if
+@code{allow_modified_nul} is set, it will allow
+@code{192 128} in place of @code{0} for a
+NUL-character.@footnote{Remember that @code{0} is used
+to terminate the string, but @code{192 128} is not.}
+This function returns zero if the @code{string} is
+properly formatted, and @code{-1} otherwise.
 @end table
 
 
diff --git a/src/libmdsserver/util.c b/src/libmdsserver/util.c
index 204d155..b1dff3c 100644
--- a/src/libmdsserver/util.c
+++ b/src/libmdsserver/util.c
@@ -30,6 +30,7 @@
 #include <ctype.h>
 #include <time.h>
 #include <sys/wait.h>
+#include <stdint.h>
 
 
 
@@ -373,3 +374,79 @@ pid_t uninterruptable_waitpid(pid_t pid, int* restrict status, int options)
   return rc;
 }
 
+
+/**
+ * Check whether a NUL-terminated string is encoded in UTF-8,
+ * rejecting overlong encodings but accepting sequences of up to six bytes
+ * 
+ * @param   string              The string
+ * @param   allow_modified_nul  Whether Modified UTF-8 is allowed, which allows a two-byte encoding for NUL
+ * @return                      Zero if good, -1 on encoding error
+ */
+int verify_utf8(const char* string, int allow_modified_nul)
+{
+  static const long BYTES_TO_MIN_BITS[] = {0, 0,  8, 12, 17, 22, 27};
+  static const long BYTES_TO_MAX_BITS[] = {0, 7, 11, 16, 21, 26, 31};
+  long bytes = 0, read_bytes = 0, bits = 0, c, character = 0;
+  
+  /*                                                      min bits  max bits
+    0.......                                                 0         7
+    110..... 10......                                        8        11
+    1110.... 10...... 10......                              12        16
+    11110... 10...... 10...... 10......                     17        21
+    111110.. 10...... 10...... 10...... 10......            22        26
+    1111110. 10...... 10...... 10...... 10...... 10......   27        31
+   */
+  
+  /* Read each byte as unsigned so that bytes above 127 do not turn negative. */
+  while ((c = (long)(unsigned char)(*string++)))
+    if (read_bytes == 0)
+      {
+	/* First byte of the character. */
+	if ((c & 0x80) == 0x00)
+	  /* Single-byte character. */
+	  continue;
+	
+	if ((c & 0xC0) == 0x80)
+	  /* Single-byte character marked as multibyte, or
+	     a non-first byte in a multibyte character. */
+	  return -1;
+	
+	/* Multibyte character, count its leading one-bits. */
+	while ((c & 0x80))
+	  bytes++, c <<= 1;
+	if (bytes > 6)
+	  /* 31-bit characters can be encoded with 6-bytes,
+	     and UTF-8 does not cover higher code points. */
+	  return -1;
+	read_bytes = 1;
+	/* Shift the payload bits back to where they were before counting. */
+	character = (c & 0xFF) >> bytes;
+      }
+    else
+      {
+	/* Not first byte of the character. */
+	if ((c & 0xC0) != 0x80)
+	  /* Beginning of new character before a
+	     multibyte character has ended. */
+	  return -1;
+	
+	character = (character << 6) | (c & 0x3F);
+	if (++read_bytes < bytes)
+	  /* Not at last byte yet. */
+	  continue;
+	
+	/* Check that the character is not unnecessarily long. */
+	while (character)
+	  character >>= 1, bits++;
+	bits = ((bits == 0) && (bytes == 2) && allow_modified_nul) ? 8 : bits;
+	if ((bits < BYTES_TO_MIN_BITS[bytes]) || (BYTES_TO_MAX_BITS[bytes] < bits))
+	  return -1;
+	
+	read_bytes = bytes = bits = 0;
+      }
+  
+  /* Make sure we did not stop at the middle of a multibyte character. */
+  return read_bytes == 0 ? 0 : -1;
+}
+
diff --git a/src/libmdsserver/util.h b/src/libmdsserver/util.h
index 23804bc..514b86d 100644
--- a/src/libmdsserver/util.h
+++ b/src/libmdsserver/util.h
@@ -124,7 +124,7 @@ int full_write(int fd, const char* buffer, size_t length);
  * 
  * @param   fd      The file descriptor
  * @param   length  Output parameter for the length of the file, may be `NULL`
- * @return          The content of the file, you will need to free it. `NULL` on error.
+ * @return          The content of the file, you will need to free it. `NULL` on error
  */
 char* full_read(int fd, size_t* length);
 
@@ -151,6 +151,15 @@ int startswith_n(const char* haystack, const char* needle, size_t haystack_n, si
  */
 pid_t uninterruptable_waitpid(pid_t pid, int* restrict status, int options);
 
+/**
+ * Check whether a NUL-terminated string is encoded in UTF-8
+ * 
+ * @param   string              The string, it is scanned up to its terminating NUL byte
+ * @param   allow_modified_nul  Whether Modified UTF-8 is allowed, that is, whether the bytes 192 128 are accepted as an encoding of NUL
+ * @return                      Zero if good, -1 on encoding error
+ */
+int verify_utf8(const char* string, int allow_modified_nul) __attribute__((pure));
+
 
 #endif