aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mattias Andrée <maandree@kth.se> 2018-11-16 20:11:21 +0100
committer Mattias Andrée <maandree@kth.se> 2018-11-16 20:11:21 +0100
commit ce1d224de8a64c53d18316edd9938bb127542e54 (patch)
tree 9ad5e40b4089da0f9f485b901031065219f75701
parent Add TWOS_COMPLEMENT, ONES_COMPLEMENT, and SIGN_MAGNITUDE (diff)
download libsimple-ce1d224de8a64c53d18316edd9938bb127542e54.tar.gz
libsimple-ce1d224de8a64c53d18316edd9938bb127542e54.tar.bz2
libsimple-ce1d224de8a64c53d18316edd9938bb127542e54.tar.xz
Add strnisutf8 and memisutf8
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to '')
-rw-r--r-- Makefile 2
-rw-r--r-- libsimple/mem.h 16
-rw-r--r-- libsimple/str.h 3
-rw-r--r-- libsimple/strn.h 20
-rw-r--r-- man0/libsimple.h.0 9
l--------- man3/libsimple_memisutf8.3 1
-rw-r--r-- man3/libsimple_strisutf8.3 87
l--------- man3/libsimple_strnisutf8.3 1
l--------- man3/memisutf8.3libsimple 1
l--------- man3/strnisutf8.3libsimple 1
-rw-r--r-- memisutf8.c (renamed from strisutf8.c) 120
11 files changed, 190 insertions, 71 deletions
diff --git a/Makefile b/Makefile
index ec24fef..f35fc4e 100644
--- a/Makefile
+++ b/Makefile
@@ -68,6 +68,7 @@ OBJ =\
memelem.o\
memends.o\
memeqlen.o\
+ memisutf8.o\
memmem.o\
mempsetelem.o\
memrcasechr.o\
@@ -93,7 +94,6 @@ OBJ =\
strchrnul.o\
strends.o\
streqlen.o\
- strisutf8.o\
strncasechr.o\
strncasechrnul.o\
strncaseends.o\
diff --git a/libsimple/mem.h b/libsimple/mem.h
index b50e714..456256d 100644
--- a/libsimple/mem.h
+++ b/libsimple/mem.h
@@ -493,3 +493,19 @@ size_t libsimple_memrcaseeqlen(const void *, size_t, const void *, size_t);
#ifndef memrcaseeqlen
# define memrcaseeqlen libsimple_memrcaseeqlen
#endif
+
+
+/**
+ * Check whether a string is encoded in UTF-8
+ *
+ * @param string The string
+ * @param n The length of the string
+ * @param allow_modified_nul Whether Modified UTF-8 is allowed, which
+ * allows a two-byte encoding for NUL
+ * @return 1 if good, 0 on encoding error
+ */
+_LIBSIMPLE_GCC_ONLY(__attribute__((__pure__, __nonnull__, __warn_unused_result__)))
+int libsimple_memisutf8(const char *, size_t, int);
+#ifndef memisutf8
+# define memisutf8 libsimple_memisutf8
+#endif
diff --git a/libsimple/str.h b/libsimple/str.h
index b990fe9..9f782cd 100644
--- a/libsimple/str.h
+++ b/libsimple/str.h
@@ -383,7 +383,8 @@ static inline int libsimple_inchrcaseset(int __c, const char *__s)
* @return 1 if good, 0 on encoding error
*/
_LIBSIMPLE_GCC_ONLY(__attribute__((__pure__, __nonnull__, __warn_unused_result__)))
-int libsimple_strisutf8(const char *, int);
+static inline int libsimple_strisutf8(const char *__string, int __allow_modified_nul)
+{ return libsimple_memisutf8(__string, strlen(__string) ,__allow_modified_nul); }
#ifndef strisutf8
# define strisutf8 libsimple_strisutf8
#endif
diff --git a/libsimple/strn.h b/libsimple/strn.h
index 73947ea..cf25a20 100644
--- a/libsimple/strn.h
+++ b/libsimple/strn.h
@@ -426,3 +426,23 @@ static inline size_t libsimple_strrncaseeqlen(const char *__a, const char *__b,
#ifndef strrncaseeqlen
# define strrncaseeqlen libsimple_strrncaseeqlen
#endif
+
+
+/**
+ * Check whether a string, that may or may not be NUL-terminated,
+ * is encoded in UTF-8
+ *
+ * @param string The string
+ * @param n The maximum length of `string`, its
+ * length is `strlen(string)` if there is a
+ * NUL byte at any position lower than `n`
+ * @param allow_modified_nul Whether Modified UTF-8 is allowed, which
+ * allows a two-byte encoding for NUL
+ * @return 1 if good, 0 on encoding error
+ */
+_LIBSIMPLE_GCC_ONLY(__attribute__((__pure__, __nonnull__, __warn_unused_result__)))
+static inline int libsimple_strnisutf8(const char *__string, size_t __n, int __allow_modified_nul)
+{ return libsimple_memisutf8(__string, strnlen(__string, __n) ,__allow_modified_nul); }
+#ifndef strnisutf8
+# define strnisutf8 libsimple_strnisutf8
+#endif
diff --git a/man0/libsimple.h.0 b/man0/libsimple.h.0
index 47ca55b..e5e51d1 100644
--- a/man0/libsimple.h.0
+++ b/man0/libsimple.h.0
@@ -1004,8 +1004,15 @@ that support
.RE
.TP
-.BR libsimple_strisutf8 (3)
+.BR libsimple_strisutf8 (3),
+.RS 0
+.BR libsimple_strnisutf8 (3),
+.br
+.BR libsimple_memisutf8 (3)
+.RE
+.RS
Check if a string is valid UTF-8.
+.RE
.TP
.BR libsimple_strnchr (3),
diff --git a/man3/libsimple_memisutf8.3 b/man3/libsimple_memisutf8.3
new file mode 120000
index 0000000..0f01521
--- /dev/null
+++ b/man3/libsimple_memisutf8.3
@@ -0,0 +1 @@
+libsimple_strisutf8.3 \ No newline at end of file
diff --git a/man3/libsimple_strisutf8.3 b/man3/libsimple_strisutf8.3
index 24dcd96..ff8f609 100644
--- a/man3/libsimple_strisutf8.3
+++ b/man3/libsimple_strisutf8.3
@@ -1,39 +1,88 @@
-.TH LIBSIMPLE_strisutf8 3 2018-11-05 libsimple
+.TH LIBSIMPLE_STRISUTF8 3 2018-11-16 libsimple
.SH NAME
-libsimple_strisutf8 \- check if a string is encoded in UTF-8
+libsimple_strisutf8, libsimple_strnisutf8, libsimple_memisutf8 \- check if a string is encoded in UTF-8
.SH SYNOPSIS
.nf
#include <libsimple.h>
-int libsimple_strisutf8(const char *\fIstring\fP, int \fIallow_modified_nul\fP);
+static inline int libsimple_strisutf8(const char *\fIstring\fP, int \fIallow_modified_nul\fP);
+static inline int libsimple_strnisutf8(const char *\fIstring\fP, size_t \fIn\fP, int \fIallow_modified_nul\fP);
+int libsimple_memisutf8(const char *\fIstring\fP, size_t \fIn\fP, int \fIallow_modified_nul\fP);
#ifndef strisutf8
# define strisutf8 libsimple_strisutf8
#endif
+#ifndef strnisutf8
+# define strnisutf8 libsimple_strnisutf8
+#endif
+#ifndef memisutf8
+# define memisutf8 libsimple_memisutf8
+#endif
.fi
.PP
Link with
.IR \-lsimple .
.SH DESCRIPTION
The
-.BR libsimple_strisutf8 ()
-function checks if
+.BR libsimple_strisutf8 (),
+.BR libsimple_strnisutf8 (),
+and
+.BR libsimple_memisutf8 ()
+functions check if
.I string
is in valid UTF-8. If
.I allow_modified_nul
is non-zero, NUL encoded with 2 bytes is accepted.
-.SH RETURN VALUE
+.PP
The
.BR libsimple_strisutf8 ()
-returns 1 if the
+function reads
+.I string
+until the first NUL byte.
+.PP
+The
+.BR libsimple_strnisutf8 ()
+function reads
+.I string
+until the first NUL byte or the first
+.I n
+bytes, whichever is shorter.
+.PP
+The
+.BR libsimple_memisutf8 ()
+function reads the first
+.I n
+bytes from string
+.IR string ,
+allowing the checked text to contain NUL bytes.
+Note that unlike other
+.BR mem *
+functions, the
+.I string
+parameter for the
+.BR libsimple_memisutf8 ()
+function is a
+.B const char *
+rather than a
+.BR "const void *" .
+.SH RETURN VALUE
+The
+.BR libsimple_strisutf8 (),
+.BR libsimple_strnisutf8 (),
+and
+.BR libsimple_memisutf8 ()
+functions return 1 if the
.I string
is in valid UTF-8 (Modified UTF-8 if
.I allow_modified_nul
is non-zero); otherwise 0 is returned.
.SH ERRORS
The
-.BR libsimple_strisutf8 ()
-function cannot fail.
+.BR libsimple_strisutf8 (),
+.BR libsimple_strnisutf8 (),
+and
+.BR libsimple_memisutf8 ()
+functions cannot fail.
.SH ATTRIBUTES
For an explanation of the terms used in this section, see
.BR attributes (7).
@@ -43,19 +92,25 @@ lb lb lb
l l l.
Interface Attribute Value
T{
-.BR libsimple_inchrset (),
+.BR libsimple_strisutf8 (),
+.br
+.BR libsimple_strnisutf8 (),
.br
-.BR libsimple_inchrcaseset ()
+.BR libsimple_memisutf8 ()
T} Thread safety MT-Safe
T{
-.BR libsimple_inchrset (),
+.BR libsimple_strisutf8 (),
.br
-.BR libsimple_strchrnul ()
+.BR libsimple_strnisutf8 (),
+.br
+.BR libsimple_memisutf8 ()
T} Async-signal safety AS-Safe
T{
-.BR libsimple_inchrset (),
+.BR libsimple_strisutf8 (),
+.br
+.BR libsimple_strnisutf8 (),
.br
-.BR libsimple_strchrnul ()
+.BR libsimple_memisutf8 ()
T} Async-cancel safety AC-Safe
.TE
.SH EXAMPLES
@@ -70,5 +125,5 @@ None.
None.
.SH BUGS
None.
-.SH SEE ALSO
+.SH SEE ALSO
None.
diff --git a/man3/libsimple_strnisutf8.3 b/man3/libsimple_strnisutf8.3
new file mode 120000
index 0000000..0f01521
--- /dev/null
+++ b/man3/libsimple_strnisutf8.3
@@ -0,0 +1 @@
+libsimple_strisutf8.3 \ No newline at end of file
diff --git a/man3/memisutf8.3libsimple b/man3/memisutf8.3libsimple
new file mode 120000
index 0000000..c1865bf
--- /dev/null
+++ b/man3/memisutf8.3libsimple
@@ -0,0 +1 @@
+libsimple_memisutf8.3 \ No newline at end of file
diff --git a/man3/strnisutf8.3libsimple b/man3/strnisutf8.3libsimple
new file mode 120000
index 0000000..279f507
--- /dev/null
+++ b/man3/strnisutf8.3libsimple
@@ -0,0 +1 @@
+libsimple_strnisutf8.3 \ No newline at end of file
diff --git a/strisutf8.c b/memisutf8.c
index ac4d0cb..96a8b6b 100644
--- a/strisutf8.c
+++ b/memisutf8.c
@@ -4,11 +4,12 @@
int
-libsimple_strisutf8(const char *string, int allow_modified_nul)
+libsimple_memisutf8(const char *string, size_t n, int allow_modified_nul)
{
static long BYTES_TO_MIN_BITS[] = {0, 0, 8, 12, 17, 22, 27};
static long BYTES_TO_MAX_BITS[] = {0, 7, 11, 16, 21, 26, 31};
long int bytes = 0, read_bytes = 0, bits = 0, c, character;
+ size_t i;
/* min bits max bits
0....... 0 7
@@ -19,7 +20,9 @@ libsimple_strisutf8(const char *string, int allow_modified_nul)
1111110. 10...... 10...... 10...... 10...... 10...... 27 31
*/
- while ((c = (long int)(*string++))) {
+ for (i = 0; i < n; i++) {
+ c = (long int)string[i];
+
if (!read_bytes) {
/* First byte of the character. */
@@ -77,58 +80,71 @@ libsimple_strisutf8(const char *string, int allow_modified_nul)
int
main(void)
{
+#define ASSERT(STRING, GOOD)\
+ do {\
+ assert(libsimple_memisutf8(STRING, sizeof(STRING) - 1, i) == (GOOD));\
+ assert(libsimple_memisutf8(STRING "\xFF", sizeof(STRING) - 1, i) == (GOOD));\
+ assert(libsimple_memisutf8(STRING "\x00", sizeof(STRING) - 1, i) == (GOOD));\
+ assert(libsimple_strisutf8(STRING, i) == (GOOD));\
+ assert(libsimple_strnisutf8(STRING, sizeof(STRING) - 1, i) == (GOOD));\
+ assert(libsimple_strnisutf8(STRING "\xFF", sizeof(STRING) - 1, i) == (GOOD));\
+ assert(libsimple_strnisutf8(STRING "\x00", sizeof(STRING) - 1, i) == (GOOD));\
+ } while (0)
+
int i;
for (i = 0; i < 2; i++) {
- assert(libsimple_strisutf8("", i) == 1);
- assert(libsimple_strisutf8("a", i) == 1);
- assert(libsimple_strisutf8("abc", i) == 1);
- assert(libsimple_strisutf8("123", i) == 1);
- assert(libsimple_strisutf8("åäö", i) == 1);
- assert(libsimple_strisutf8("𝖆𝖇𝖈", i) == 1);
- assert(libsimple_strisutf8("\x1b", i) == 1);
- assert(libsimple_strisutf8("\n\r\t\f", i) == 1);
- assert(libsimple_strisutf8("\xFF", i) == 0);
- assert(libsimple_strisutf8("\x01", i) == 1);
- assert(libsimple_strisutf8("\x7F", i) == 1);
- assert(libsimple_strisutf8("\x80", i) == 0);
- assert(libsimple_strisutf8("\xC0", i) == 0);
- assert(libsimple_strisutf8("\xC0\x80", i) == i);
- assert(libsimple_strisutf8("\xC0\x81", i) == 0);
- assert(libsimple_strisutf8("\xCF", i) == 0);
- assert(libsimple_strisutf8("\xEF", i) == 0);
- assert(libsimple_strisutf8("\xEF\x8F", i) == 0);
- assert(libsimple_strisutf8("\xF7", i) == 0);
- assert(libsimple_strisutf8("\xF7\x8F", i) == 0);
- assert(libsimple_strisutf8("\xF7\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFA", i) == 0);
- assert(libsimple_strisutf8("\xFA\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFA\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFA\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFD", i) == 0);
- assert(libsimple_strisutf8("\xFD\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFD\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFD\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFD\x8F\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFE", i) == 0);
- assert(libsimple_strisutf8("\xFE\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFE\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFF", i) == 0);
- assert(libsimple_strisutf8("\xFF\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFF\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F\x8F\x8F", i) == 0);
- assert(libsimple_strisutf8("\xC1\x80", i) == 0);
- assert(libsimple_strisutf8("\xC2\x80", i) == 1);
- assert(libsimple_strisutf8("\xE1\x80\x80\x80", i) == 0);
- assert(libsimple_strisutf8("\xE1\x80\xC0\x80", i) == 0);
- assert(libsimple_strisutf8("\xE1\x80\x00\x80", i) == 0);
- assert(libsimple_strisutf8("\xF1\x80\x80\x80", i) == 1);
- assert(libsimple_strisutf8("\xFF\x80\x80\x80\x80\x80\x80\x80", i) == 0);
+ ASSERT("", 1);
+ ASSERT("a", 1);
+ ASSERT("abc", 1);
+ ASSERT("123", 1);
+ ASSERT("åäö", 1);
+ ASSERT("𝖆𝖇𝖈", 1);
+ ASSERT("\x1b", 1);
+ ASSERT("\n\r\t\f", 1);
+ ASSERT("\xFF", 0);
+ ASSERT("\x01", 1);
+ ASSERT("\x7F", 1);
+ ASSERT("\x80", 0);
+ ASSERT("\xC0", 0);
+ ASSERT("\xC0\x80", i);
+ ASSERT("\xC0\x81", 0);
+ ASSERT("\xCF", 0);
+ ASSERT("\xEF", 0);
+ ASSERT("\xEF\x8F", 0);
+ ASSERT("\xF7", 0);
+ ASSERT("\xF7\x8F", 0);
+ ASSERT("\xF7\x8F\x8F", 0);
+ ASSERT("\xFA", 0);
+ ASSERT("\xFA\x8F", 0);
+ ASSERT("\xFA\x8F\x8F", 0);
+ ASSERT("\xFA\x8F\x8F\x8F", 0);
+ ASSERT("\xFD", 0);
+ ASSERT("\xFD\x8F", 0);
+ ASSERT("\xFD\x8F\x8F", 0);
+ ASSERT("\xFD\x8F\x8F\x8F", 0);
+ ASSERT("\xFD\x8F\x8F\x8F\x8F", 0);
+ ASSERT("\xFE", 0);
+ ASSERT("\xFE\x8F", 0);
+ ASSERT("\xFE\x8F\x8F", 0);
+ ASSERT("\xFE\x8F\x8F\x8F", 0);
+ ASSERT("\xFE\x8F\x8F\x8F\x8F", 0);
+ ASSERT("\xFE\x8F\x8F\x8F\x8F\x8F", 0);
+ ASSERT("\xFF", 0);
+ ASSERT("\xFF\x8F", 0);
+ ASSERT("\xFF\x8F\x8F", 0);
+ ASSERT("\xFF\x8F\x8F\x8F", 0);
+ ASSERT("\xFF\x8F\x8F\x8F\x8F", 0);
+ ASSERT("\xFF\x8F\x8F\x8F\x8F\x8F", 0);
+ ASSERT("\xFF\x8F\x8F\x8F\x8F\x8F\x8F", 0);
+ ASSERT("\xC1\x80", 0);
+ ASSERT("\xC2\x80", 1);
+ ASSERT("\xE1\x80\x80\x80", 0);
+ ASSERT("\xE1\x80\xC0\x80", 0);
+ ASSERT("\xE1\x80\x00\x80", 0);
+ ASSERT("\xF1\x80\x80\x80", 1);
+ ASSERT("\xFF\x80\x80\x80\x80\x80\x80\x80", 0);
+ assert(libsimple_memisutf8("\0abc", sizeof("\0abc") - 1, i) == 1);
+ assert(libsimple_memisutf8("\0abc\x80", sizeof("\0abc\x80") - 1, i) == 0);
}
return 0;
}