From e19f3d5e8c0e9576a3fda2b77ad69be135bef047 Mon Sep 17 00:00:00 2001
From: Mattias Andrée <maandree@kth.se>
Date: Mon, 5 Nov 2018 18:54:48 +0100
Subject: Rename isutf8 to strisutf8 and add man page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Mattias Andrée <maandree@kth.se>
---
 Makefile                  |   2 +-
 isutf8.c                  | 136 ----------------------------------------------
 libsimple/str.h           |   6 +-
 man/libsimple_inchrset.3  |   2 +-
 man/libsimple_strisutf8.3 |  74 +++++++++++++++++++++++++
 strisutf8.c               | 136 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 215 insertions(+), 141 deletions(-)
 delete mode 100644 isutf8.c
 create mode 100644 man/libsimple_strisutf8.3
 create mode 100644 strisutf8.c

diff --git a/Makefile b/Makefile
index 3469282..dbd993c 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,6 @@ OBJ =\
 	envposix_memalignzn.o\
 	envputenvf.o\
 	envreallocn.o\
-	isutf8.o\
 	memcasechr.o\
 	memcasecmp.o\
 	memcaseends.o\
@@ -94,6 +93,7 @@ OBJ =\
 	strchrnul.o\
 	strends.o\
 	streqlen.o\
+	strisutf8.o\
 	strncasechr.o\
 	strncasechrnul.o\
 	strncaseends.o\
diff --git a/isutf8.c b/isutf8.c
deleted file mode 100644
index 232be5d..0000000
--- a/isutf8.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include "libsimple.h"
-#ifndef TEST
-
-
-int
-libsimple_isutf8(const char *string, int allow_modified_nul)
-{
-	static long BYTES_TO_MIN_BITS[] = {0, 0,  8, 12, 17, 22, 27};
-        static long BYTES_TO_MAX_BITS[] = {0, 7, 11, 16, 21, 26, 31};
-        long int bytes = 0, read_bytes = 0, bits = 0, c, character;
-
-        /*                                                      min bits  max bits
-          0.......                                                 0         7
-          110..... 10......                                        8        11
-          1110.... 10...... 10......                              12        16
-          11110... 10...... 10...... 10......                     17        21
-          111110.. 10...... 10...... 10...... 10......            22        26
-          1111110. 10...... 10...... 10...... 10...... 10......   27        31
-        */
-
-	while ((c = (long int)(*string++))) {
-                if (!read_bytes) {
-                        /* First byte of the character. */
-
-                        if (!(c & 0x80))
-                                /* Single-byte character. */
-                                continue;
-
-                        if ((c & 0xC0) == 0x80)
-                                /* Single-byte character marked as multibyte, or
-                                   a non-first byte in a multibyte character. */
-                                return 0;
-
-                        /* Multibyte character. */
-                        while ((c & 0x80))
-                                bytes++, c <<= 1;
-                        read_bytes = 1;
-			character = (c & 0xFF) >> bytes;
-                        if (bytes > 6)
-                                /* 31-bit characters can be encoded with 6-bytes,
-                                   and UTF-8 does not cover higher code points. */
-                                return 0;
-                } else {
-                        /* Not first byte of the character. */
-
-                        if ((c & 0xC0) != 0x80)
-                                /* Beginning of new character before a
-                                   multibyte character has ended. */
-                                return 0;
-
-                        character = (character << 6) | (c & 0x7F);
-
-                        if (++read_bytes < bytes)
-                                /* Not at last byte yet. */
-                                continue;
-
-                        /* Check that the character is not unnecessarily long. */
-                        while (character)
-                                character >>= 1, bits++;
-                        bits = (!bits && bytes == 2 && allow_modified_nul) ? 8 : bits;
-                        if (bits < BYTES_TO_MIN_BITS[bytes] || BYTES_TO_MAX_BITS[bytes] < bits)
-                                return 0;
-
-                        read_bytes = bytes = bits = 0;
-                }
-        }
-
-        /* Make sure we did not stop at the middle of a multibyte character. */
-        return !read_bytes;
-}
-
-
-#else
-#include "test.h"
-
-int
-main(void)
-{
-	int i;
-	for (i = 0; i < 2; i++) {
-		assert(libsimple_isutf8("", i) == 1);
-		assert(libsimple_isutf8("a", i) == 1);
-		assert(libsimple_isutf8("abc", i) == 1);
-		assert(libsimple_isutf8("123", i) == 1);
-		assert(libsimple_isutf8("åäö", i) == 1);
-		assert(libsimple_isutf8("𝖆𝖇𝖈", i) == 1);
-		assert(libsimple_isutf8("\x1b", i) == 1);
-		assert(libsimple_isutf8("\n\r\t\f", i) == 1);
-		assert(libsimple_isutf8("\xFF", i) == 0);
-		assert(libsimple_isutf8("\x01", i) == 1);
-		assert(libsimple_isutf8("\x7F", i) == 1);
-		assert(libsimple_isutf8("\x80", i) == 0);
-		assert(libsimple_isutf8("\xC0", i) == 0);
-		assert(libsimple_isutf8("\xC0\x80", i) == i);
-		assert(libsimple_isutf8("\xC0\x81", i) == 0);
-		assert(libsimple_isutf8("\xCF", i) == 0);
-		assert(libsimple_isutf8("\xEF", i) == 0);
-		assert(libsimple_isutf8("\xEF\x8F", i) == 0);
-		assert(libsimple_isutf8("\xF7", i) == 0);
-		assert(libsimple_isutf8("\xF7\x8F", i) == 0);
-		assert(libsimple_isutf8("\xF7\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFA", i) == 0);
-		assert(libsimple_isutf8("\xFA\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFA\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFA\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFD", i) == 0);
-		assert(libsimple_isutf8("\xFD\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFD\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFD\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFD\x8F\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFE", i) == 0);
-		assert(libsimple_isutf8("\xFE\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFE\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFE\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFE\x8F\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFE\x8F\x8F\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFF", i) == 0);
-		assert(libsimple_isutf8("\xFF\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFF\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFF\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFF\x8F\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFF\x8F\x8F\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xFF\x8F\x8F\x8F\x8F\x8F\x8F", i) == 0);
-		assert(libsimple_isutf8("\xC1\x80", i) == 0);
-		assert(libsimple_isutf8("\xC2\x80", i) == 1);
-		assert(libsimple_isutf8("\xE1\x80\x80\x80", i) == 0);
-		assert(libsimple_isutf8("\xE1\x80\xC0\x80", i) == 0);
-		assert(libsimple_isutf8("\xE1\x80\x00\x80", i) == 0);
-		assert(libsimple_isutf8("\xF1\x80\x80\x80", i) == 1);
-		assert(libsimple_isutf8("\xFF\x80\x80\x80\x80\x80\x80\x80", i) == 0);
-	}
-	return 0;
-}
-
-#endif
diff --git a/libsimple/str.h b/libsimple/str.h
index 3fd1e21..b990fe9 100644
--- a/libsimple/str.h
+++ b/libsimple/str.h
@@ -383,7 +383,7 @@ static inline int libsimple_inchrcaseset(int __c, const char *__s)
  * @return                      1 if good, 0 on encoding error
  */
 _LIBSIMPLE_GCC_ONLY(__attribute__((__pure__, __nonnull__, __warn_unused_result__)))
-int libsimple_isutf8(const char *, int);
-#ifndef isutf8
-# define isutf8 libsimple_isutf8
+int libsimple_strisutf8(const char *, int);
+#ifndef strisutf8
+# define strisutf8 libsimple_strisutf8
 #endif
diff --git a/man/libsimple_inchrset.3 b/man/libsimple_inchrset.3
index 09b1ca1..f07263c 100644
--- a/man/libsimple_inchrset.3
+++ b/man/libsimple_inchrset.3
@@ -1,4 +1,4 @@
-.TH LIBSIMPLE_STRCHRNUL 3 2018-11-05 libsimple
+.TH LIBSIMPLE_INCHRSET 3 2018-11-05 libsimple
 .SH NAME
 libsimple_inchrset, libsimple_inchrcaseset \- the whether a character belongs to a set
 .SH SYNOPSIS
diff --git a/man/libsimple_strisutf8.3 b/man/libsimple_strisutf8.3
new file mode 100644
index 0000000..24dcd96
--- /dev/null
+++ b/man/libsimple_strisutf8.3
@@ -0,0 +1,74 @@
+.TH LIBSIMPLE_STRISUTF8 3 2018-11-05 libsimple
+.SH NAME
+libsimple_strisutf8 \- check if a string is encoded in UTF-8
+.SH SYNOPSIS
+.nf
+#include <libsimple.h>
+
+int libsimple_strisutf8(const char *\fIstring\fP, int \fIallow_modified_nul\fP);
+
+#ifndef strisutf8
+# define strisutf8 libsimple_strisutf8
+#endif
+.fi
+.PP
+Link with
+.IR \-lsimple .
+.SH DESCRIPTION
+The
+.BR libsimple_strisutf8 ()
+function checks if
+.I string
+is in valid UTF-8. If
+.I allow_modified_nul
+is non-zero, NUL encoded with 2 bytes is accepted.
+.SH RETURN VALUE
+The
+.BR libsimple_strisutf8 ()
+function returns 1 if the
+.I string
+is in valid UTF-8 (Modified UTF-8 if
+.I allow_modified_nul
+is non-zero); otherwise 0 is returned.
+.SH ERRORS
+The
+.BR libsimple_strisutf8 ()
+function cannot fail.
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lb lb lb
+l l l.
+Interface	Attribute	Value
+T{
+.BR libsimple_strisutf8 (),
+.br
+.BR strisutf8 ()
+T}	Thread safety	MT-Safe
+T{
+.BR libsimple_strisutf8 (),
+.br
+.BR strisutf8 ()
+T}	Async-signal safety	AS-Safe
+T{
+.BR libsimple_strisutf8 (),
+.br
+.BR strisutf8 ()
+T}	Async-cancel safety	AC-Safe
+.TE
+.SH EXAMPLES
+None.
+.SH APPLICATION USAGE
+None.
+.SH RATIONALE
+None.
+.SH FUTURE DIRECTIONS
+None.
+.SH NOTES
+None.
+.SH BUGS
+None.
+.SH SEE ALSO
+None.
diff --git a/strisutf8.c b/strisutf8.c
new file mode 100644
index 0000000..ac4d0cb
--- /dev/null
+++ b/strisutf8.c
@@ -0,0 +1,136 @@
+/* See LICENSE file for copyright and license details. */
+#include "libsimple.h"
+#ifndef TEST
+
+
+int
+libsimple_strisutf8(const char *string, int allow_modified_nul)
+{
+	static long BYTES_TO_MIN_BITS[] = {0, 0,  8, 12, 17, 22, 27}; /* indexed by sequence length in bytes */
+        static long BYTES_TO_MAX_BITS[] = {0, 7, 11, 16, 21, 26, 31};
+        long int bytes = 0, read_bytes = 0, bits = 0, c, character;
+        /* Validate the NUL-terminated string one byte at a time; returns 1 if valid, 0 if not. */
+        /*                                                      min bits  max bits
+          0.......                                                 0         7
+          110..... 10......                                        8        11
+          1110.... 10...... 10......                              12        16
+          11110... 10...... 10...... 10......                     17        21
+          111110.. 10...... 10...... 10...... 10......            22        26
+          1111110. 10...... 10...... 10...... 10...... 10......   27        31
+        */
+        /* NOTE(review): if char is signed, c < 0 for bytes >= 0x80 and "c <<= 1" below is undefined behaviour; confirm, consider (unsigned char). */
+	while ((c = (long int)(*string++))) {
+                if (!read_bytes) {
+                        /* First byte of the character. */
+
+                        if (!(c & 0x80))
+                                /* Single-byte character. */
+                                continue;
+
+                        if ((c & 0xC0) == 0x80)
+                                /* Continuation byte (10xxxxxx) found where the
+                                   first byte of a character was expected. */
+                                return 0;
+
+                        /* Multibyte character: count the leading 1-bits to get its length. */
+                        while ((c & 0x80))
+                                bytes++, c <<= 1;
+                        read_bytes = 1;
+			character = (c & 0xFF) >> bytes; /* payload bits of the first byte */
+                        if (bytes > 6)
+                                /* 31-bit characters can be encoded with 6-bytes,
+                                   and UTF-8 does not cover higher code points. */
+                                return 0;
+                } else {
+                        /* Not first byte of the character. */
+
+                        if ((c & 0xC0) != 0x80)
+                                /* Beginning of new character before a
+                                   multibyte character has ended. */
+                                return 0;
+
+                        character = (character << 6) | (c & 0x7F); /* bit 6 is clear here, so this keeps the 6 payload bits */
+
+                        if (++read_bytes < bytes)
+                                /* Not at last byte yet. */
+                                continue;
+
+                        /* Check that the character is not unnecessarily long: count its significant bits. */
+                        while (character)
+                                character >>= 1, bits++;
+                        bits = (!bits && bytes == 2 && allow_modified_nul) ? 8 : bits; /* all-zero 2-byte form counts as 8 bits so it passes when allowed */
+                        if (bits < BYTES_TO_MIN_BITS[bytes] || BYTES_TO_MAX_BITS[bytes] < bits)
+                                return 0;
+
+                        read_bytes = bytes = bits = 0; /* reset state for the next character */
+                }
+        }
+
+        /* Make sure we did not stop at the middle of a multibyte character. */
+        return !read_bytes;
+}
+
+
+#else
+#include "test.h"
+
+int
+main(void)
+{
+	int i;
+	for (i = 0; i < 2; i++) { /* run every case with allow_modified_nul off (0) and on (1) */
+		assert(libsimple_strisutf8("", i) == 1);
+		assert(libsimple_strisutf8("a", i) == 1);
+		assert(libsimple_strisutf8("abc", i) == 1);
+		assert(libsimple_strisutf8("123", i) == 1);
+		assert(libsimple_strisutf8("åäö", i) == 1);
+		assert(libsimple_strisutf8("𝖆𝖇𝖈", i) == 1);
+		assert(libsimple_strisutf8("\x1b", i) == 1);
+		assert(libsimple_strisutf8("\n\r\t\f", i) == 1);
+		assert(libsimple_strisutf8("\xFF", i) == 0);
+		assert(libsimple_strisutf8("\x01", i) == 1);
+		assert(libsimple_strisutf8("\x7F", i) == 1);
+		assert(libsimple_strisutf8("\x80", i) == 0); /* lone continuation byte */
+		assert(libsimple_strisutf8("\xC0", i) == 0); /* truncated 2-byte sequence */
+		assert(libsimple_strisutf8("\xC0\x80", i) == i); /* 2-byte NUL: valid only when allowed */
+		assert(libsimple_strisutf8("\xC0\x81", i) == 0); /* overlong encoding of U+0001 */
+		assert(libsimple_strisutf8("\xCF", i) == 0);
+		assert(libsimple_strisutf8("\xEF", i) == 0);
+		assert(libsimple_strisutf8("\xEF\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xF7", i) == 0);
+		assert(libsimple_strisutf8("\xF7\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xF7\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFA", i) == 0);
+		assert(libsimple_strisutf8("\xFA\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFA\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFA\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFD", i) == 0);
+		assert(libsimple_strisutf8("\xFD\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFD\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFD\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFD\x8F\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFE", i) == 0); /* 0xFE announces 7 bytes: rejected at the first byte */
+		assert(libsimple_strisutf8("\xFE\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFE\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFF", i) == 0);
+		assert(libsimple_strisutf8("\xFF\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFF\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F\x8F\x8F", i) == 0);
+		assert(libsimple_strisutf8("\xC1\x80", i) == 0); /* overlong encoding of U+0040 */
+		assert(libsimple_strisutf8("\xC2\x80", i) == 1); /* U+0080 in its 2-byte form */
+		assert(libsimple_strisutf8("\xE1\x80\x80\x80", i) == 0); /* valid 3-byte character, then a stray continuation byte */
+		assert(libsimple_strisutf8("\xE1\x80\xC0\x80", i) == 0); /* third byte is not a continuation byte */
+		assert(libsimple_strisutf8("\xE1\x80\x00\x80", i) == 0); /* string ends at the embedded NUL, mid-character */
+		assert(libsimple_strisutf8("\xF1\x80\x80\x80", i) == 1); /* valid 4-byte character */
+		assert(libsimple_strisutf8("\xFF\x80\x80\x80\x80\x80\x80\x80", i) == 0);
+	}
+	return 0;
+}
+
+#endif
-- 
cgit v1.3.1