diff options
author | Mattias Andrée <maandree@kth.se> | 2018-11-16 20:11:21 +0100 |
---|---|---|
committer | Mattias Andrée <maandree@kth.se> | 2018-11-16 20:11:21 +0100 |
commit | ce1d224de8a64c53d18316edd9938bb127542e54 (patch) | |
tree | 9ad5e40b4089da0f9f485b901031065219f75701 | |
parent | Add TWOS_COMPLEMENT, ONES_COMPLEMENT, and SIGN_MAGNITUDE (diff) | |
download | libsimple-ce1d224de8a64c53d18316edd9938bb127542e54.tar.gz libsimple-ce1d224de8a64c53d18316edd9938bb127542e54.tar.bz2 libsimple-ce1d224de8a64c53d18316edd9938bb127542e54.tar.xz |
Add strnisutf8 and memisutf8
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | libsimple/mem.h | 16 | ||||
-rw-r--r-- | libsimple/str.h | 3 | ||||
-rw-r--r-- | libsimple/strn.h | 20 | ||||
-rw-r--r-- | man0/libsimple.h.0 | 9 | ||||
l--------- | man3/libsimple_memisutf8.3 | 1 | ||||
-rw-r--r-- | man3/libsimple_strisutf8.3 | 87 | ||||
l--------- | man3/libsimple_strnisutf8.3 | 1 | ||||
l--------- | man3/memisutf8.3libsimple | 1 | ||||
l--------- | man3/strnisutf8.3libsimple | 1 | ||||
-rw-r--r-- | memisutf8.c (renamed from strisutf8.c) | 120 |
11 files changed, 190 insertions, 71 deletions
@@ -68,6 +68,7 @@ OBJ =\ memelem.o\ memends.o\ memeqlen.o\ + memisutf8.o\ memmem.o\ mempsetelem.o\ memrcasechr.o\ @@ -93,7 +94,6 @@ OBJ =\ strchrnul.o\ strends.o\ streqlen.o\ - strisutf8.o\ strncasechr.o\ strncasechrnul.o\ strncaseends.o\ diff --git a/libsimple/mem.h b/libsimple/mem.h index b50e714..456256d 100644 --- a/libsimple/mem.h +++ b/libsimple/mem.h @@ -493,3 +493,19 @@ size_t libsimple_memrcaseeqlen(const void *, size_t, const void *, size_t); #ifndef memrcaseeqlen # define memrcaseeqlen libsimple_memrcaseeqlen #endif + + +/** + * Check whether a string is encoded in UTF-8 + * + * @param string The string + * @param n The length of the string + * @param allow_modified_nul Whether Modified UTF-8 is allowed, which + * allows a two-byte encoding for NUL + * @return 1 if good, 0 on encoding error + */ +_LIBSIMPLE_GCC_ONLY(__attribute__((__pure__, __nonnull__, __warn_unused_result__))) +int libsimple_memisutf8(const char *, size_t, int); +#ifndef memisutf8 +# define memisutf8 libsimple_memisutf8 +#endif diff --git a/libsimple/str.h b/libsimple/str.h index b990fe9..9f782cd 100644 --- a/libsimple/str.h +++ b/libsimple/str.h @@ -383,7 +383,8 @@ static inline int libsimple_inchrcaseset(int __c, const char *__s) * @return 1 if good, 0 on encoding error */ _LIBSIMPLE_GCC_ONLY(__attribute__((__pure__, __nonnull__, __warn_unused_result__))) -int libsimple_strisutf8(const char *, int); +static inline int libsimple_strisutf8(const char *__string, int __allow_modified_nul) +{ return libsimple_memisutf8(__string, strlen(__string) ,__allow_modified_nul); } #ifndef strisutf8 # define strisutf8 libsimple_strisutf8 #endif diff --git a/libsimple/strn.h b/libsimple/strn.h index 73947ea..cf25a20 100644 --- a/libsimple/strn.h +++ b/libsimple/strn.h @@ -426,3 +426,23 @@ static inline size_t libsimple_strrncaseeqlen(const char *__a, const char *__b, #ifndef strrncaseeqlen # define strrncaseeqlen libsimple_strrncaseeqlen #endif + + +/** + * Check whether a string, that may be or may not be NUL-terminated, + * is encoded in UTF-8 + * + * @param string The string + * @param n The maximum length of `string`, its + * length is `strlen(string)` if there is a + * NUL byte at any position lower than `n` + * @param allow_modified_nul Whether Modified UTF-8 is allowed, which + * allows a two-byte encoding for NUL + * @return 1 if good, 0 on encoding error + */ +_LIBSIMPLE_GCC_ONLY(__attribute__((__pure__, __nonnull__, __warn_unused_result__))) +static inline int libsimple_strnisutf8(const char *__string, size_t __n, int __allow_modified_nul) +{ return libsimple_memisutf8(__string, strnlen(__string, __n) ,__allow_modified_nul); } +#ifndef strnisutf8 +# define strnisutf8 libsimple_strnisutf8 +#endif diff --git a/man0/libsimple.h.0 b/man0/libsimple.h.0 index 47ca55b..e5e51d1 100644 --- a/man0/libsimple.h.0 +++ b/man0/libsimple.h.0 @@ -1004,8 +1004,15 @@ that support .RE .TP -.BR libsimple_strisutf8 (3) +.BR libsimple_strisutf8 (3), +.RS 0 +.BR libsimple_strnisutf8 (3), +.br +.BR libsimple_memisutf8 (3) +.RE +.RS Check if a string is valid UTF-8. +.RE .TP .BR libsimple_strnchr (3), diff --git a/man3/libsimple_memisutf8.3 b/man3/libsimple_memisutf8.3 new file mode 120000 index 0000000..0f01521 --- /dev/null +++ b/man3/libsimple_memisutf8.3 @@ -0,0 +1 @@ +libsimple_strisutf8.3
\ No newline at end of file diff --git a/man3/libsimple_strisutf8.3 b/man3/libsimple_strisutf8.3 index 24dcd96..ff8f609 100644 --- a/man3/libsimple_strisutf8.3 +++ b/man3/libsimple_strisutf8.3 @@ -1,39 +1,88 @@ -.TH LIBSIMPLE_strisutf8 3 2018-11-05 libsimple +.TH LIBSIMPLE_STRISUTF8 3 2018-11-16 libsimple .SH NAME -libsimple_strisutf8 \- check if a string is encoded in UTF-8 +libsimple_strisutf8, libsimple_strnisutf8, libsimple_memisutf8 \- check if a string is encoded in UTF-8 .SH SYNOPSIS .nf #include <libsimple.h> -int libsimple_strisutf8(const char *\fIstring\fP, int \fIallow_modified_nul\fP); +static inline int libsimple_strisutf8(const char *\fIstring\fP, int \fIallow_modified_nul\fP); +static inline int libsimple_strnisutf8(const char *\fIstring\fP, size_t \fIn\fP, int \fIallow_modified_nul\fP); +int libsimple_memisutf8(const char *\fIstring\fP, size_t \fIn\fP, int \fIallow_modified_nul\fP); #ifndef strisutf8 # define strisutf8 libsimple_strisutf8 #endif +#ifndef strnisutf8 +# define strnisutf8 libsimple_strnisutf8 +#endif +#ifndef memisutf8 +# define memisutf8 libsimple_memisutf8 +#endif .fi .PP Link with .IR \-lsimple . .SH DESCRIPTION The -.BR libsimple_strisutf8 () -function checks if +.BR libsimple_strisutf8 (), +.BR libsimple_strnisutf8 (), +and +.BR libsimple_memisutf8 () +functions checks if .I string is in valid UTF-8. If .I allow_modified_nul is non-zero, NUL encoded with 2 bytes is accepted. -.SH RETURN VALUE +.PP The .BR libsimple_strisutf8 () -returns 1 if the +function reads +.I string +until the first NUL byte. +.PP +The +.BR libsimple_strnisutf8 () +function reads +.I string +until the first NUL byte or the first +.I n +bytes, whichever is shorter. +.PP +The +.BR libsimple_memisutf8 () +function reads the first +.I n +bytes from string +.IR string , +allowing the checked text to contain NUL bytes. +Note that unlike other +.BR mem * +functions, the +.I string +parameter for the +.BR libsimple_memisutf8 () +function is a +.B const char * +rather than a +.BR "const void *" . +.SH RETURN VALUE +The +.BR libsimple_strisutf8 (), +.BR libsimple_strnisutf8 (), +and +.BR libsimple_memisutf8 () +function returns 1 if the .I string is in valid UTF-8 (Modified UTF-8 if .I allow_modified_nul is non-zero); otherwise 0 is returned. .SH ERRORS The -.BR libsimple_strisutf8 () -function cannot fail. +.BR libsimple_strisutf8 (), +.BR libsimple_strnisutf8 (), +and +.BR libsimple_memisutf8 () +functions cannot fail. .SH ATTRIBUTES For an explanation of the terms used in this section, see .BR attributes (7). @@ -43,19 +92,25 @@ lb lb lb l l l. Interface Attribute Value T{ -.BR libsimple_inchrset (), +.BR libsimple_strisutf8 (), +.br +.BR libsimple_strnisutf8 (), .br -.BR libsimple_inchrcaseset () +.BR libsimple_memisutf8 () T} Thread safety MT-Safe T{ -.BR libsimple_inchrset (), +.BR libsimple_strisutf8 (), .br -.BR libsimple_strchrnul () +.BR libsimple_strnisutf8 (), +.br +.BR libsimple_memisutf8 () T} Async-signal safety AS-Safe T{ -.BR libsimple_inchrset (), +.BR libsimple_strisutf8 (), +.br +.BR libsimple_strnisutf8 (), .br -.BR libsimple_strchrnul () +.BR libsimple_memisutf8 () T} Async-cancel safety AC-Safe .TE .SH EXAMPLES @@ -70,5 +125,5 @@ None. None. .SH BUGS None. -.SH SEE ALSO +.SH SEE ALSO, None. diff --git a/man3/libsimple_strnisutf8.3 b/man3/libsimple_strnisutf8.3 new file mode 120000 index 0000000..0f01521 --- /dev/null +++ b/man3/libsimple_strnisutf8.3 @@ -0,0 +1 @@ +libsimple_strisutf8.3
\ No newline at end of file diff --git a/man3/memisutf8.3libsimple b/man3/memisutf8.3libsimple new file mode 120000 index 0000000..c1865bf --- /dev/null +++ b/man3/memisutf8.3libsimple @@ -0,0 +1 @@ +libsimple_memisutf8.3
\ No newline at end of file diff --git a/man3/strnisutf8.3libsimple b/man3/strnisutf8.3libsimple new file mode 120000 index 0000000..279f507 --- /dev/null +++ b/man3/strnisutf8.3libsimple @@ -0,0 +1 @@ +libsimple_strnisutf8.3
\ No newline at end of file diff --git a/strisutf8.c b/memisutf8.c index ac4d0cb..96a8b6b 100644 --- a/strisutf8.c +++ b/memisutf8.c @@ -4,11 +4,12 @@ int -libsimple_strisutf8(const char *string, int allow_modified_nul) +libsimple_memisutf8(const char *string, size_t n, int allow_modified_nul) { static long BYTES_TO_MIN_BITS[] = {0, 0, 8, 12, 17, 22, 27}; static long BYTES_TO_MAX_BITS[] = {0, 7, 11, 16, 21, 26, 31}; long int bytes = 0, read_bytes = 0, bits = 0, c, character; + size_t i; /* min bits max bits 0....... 0 7 @@ -19,7 +20,9 @@ libsimple_strisutf8(const char *string, int allow_modified_nul) 1111110. 10...... 10...... 10...... 10...... 10...... 27 31 */ - while ((c = (long int)(*string++))) { + for (i = 0; i < n; i++) { + c = (long int)string[i]; + if (!read_bytes) { /* First byte of the character. */ @@ -77,58 +80,71 @@ libsimple_strisutf8(const char *string, int allow_modified_nul) int main(void) { +#define ASSERT(STRING, GOOD)\ + do {\ + assert(libsimple_memisutf8(STRING, sizeof(STRING) - 1, i) == (GOOD));\ + assert(libsimple_memisutf8(STRING "\xFF", sizeof(STRING) - 1, i) == (GOOD));\ + assert(libsimple_memisutf8(STRING "\x00", sizeof(STRING) - 1, i) == (GOOD));\ + assert(libsimple_strisutf8(STRING, i) == (GOOD));\ + assert(libsimple_strnisutf8(STRING, sizeof(STRING) - 1, i) == (GOOD));\ + assert(libsimple_strnisutf8(STRING "\xFF", sizeof(STRING) - 1, i) == (GOOD));\ + assert(libsimple_strnisutf8(STRING "\x00", sizeof(STRING) - 1, i) == (GOOD));\ + } while (0) + int i; for (i = 0; i < 2; i++) { - assert(libsimple_strisutf8("", i) == 1); - assert(libsimple_strisutf8("a", i) == 1); - assert(libsimple_strisutf8("abc", i) == 1); - assert(libsimple_strisutf8("123", i) == 1); - assert(libsimple_strisutf8("åäö", i) == 1); - assert(libsimple_strisutf8("𝖆𝖇𝖈", i) == 1); - assert(libsimple_strisutf8("\x1b", i) == 1); - assert(libsimple_strisutf8("\n\r\t\f", i) == 1); - assert(libsimple_strisutf8("\xFF", i) == 0); - assert(libsimple_strisutf8("\x01", i) == 1); - assert(libsimple_strisutf8("\x7F", i) == 1); - assert(libsimple_strisutf8("\x80", i) == 0); - assert(libsimple_strisutf8("\xC0", i) == 0); - assert(libsimple_strisutf8("\xC0\x80", i) == i); - assert(libsimple_strisutf8("\xC0\x81", i) == 0); - assert(libsimple_strisutf8("\xCF", i) == 0); - assert(libsimple_strisutf8("\xEF", i) == 0); - assert(libsimple_strisutf8("\xEF\x8F", i) == 0); - assert(libsimple_strisutf8("\xF7", i) == 0); - assert(libsimple_strisutf8("\xF7\x8F", i) == 0); - assert(libsimple_strisutf8("\xF7\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFA", i) == 0); - assert(libsimple_strisutf8("\xFA\x8F", i) == 0); - assert(libsimple_strisutf8("\xFA\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFA\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFD", i) == 0); - assert(libsimple_strisutf8("\xFD\x8F", i) == 0); - assert(libsimple_strisutf8("\xFD\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFD\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFD\x8F\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFE", i) == 0); - assert(libsimple_strisutf8("\xFE\x8F", i) == 0); - assert(libsimple_strisutf8("\xFE\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFE\x8F\x8F\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFF", i) == 0); - assert(libsimple_strisutf8("\xFF\x8F", i) == 0); - assert(libsimple_strisutf8("\xFF\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xFF\x8F\x8F\x8F\x8F\x8F\x8F", i) == 0); - assert(libsimple_strisutf8("\xC1\x80", i) == 0); - assert(libsimple_strisutf8("\xC2\x80", i) == 1); - assert(libsimple_strisutf8("\xE1\x80\x80\x80", i) == 0); - assert(libsimple_strisutf8("\xE1\x80\xC0\x80", i) == 0); - assert(libsimple_strisutf8("\xE1\x80\x00\x80", i) == 0); - assert(libsimple_strisutf8("\xF1\x80\x80\x80", i) == 1); - assert(libsimple_strisutf8("\xFF\x80\x80\x80\x80\x80\x80\x80", i) == 0); + ASSERT("", 1); + ASSERT("a", 1); + ASSERT("abc", 1); + ASSERT("123", 1); + ASSERT("åäö", 1); + ASSERT("𝖆𝖇𝖈", 1); + ASSERT("\x1b", 1); + ASSERT("\n\r\t\f", 1); + ASSERT("\xFF", 0); + ASSERT("\x01", 1); + ASSERT("\x7F", 1); + ASSERT("\x80", 0); + ASSERT("\xC0", 0); + ASSERT("\xC0\x80", i); + ASSERT("\xC0\x81", 0); + ASSERT("\xCF", 0); + ASSERT("\xEF", 0); + ASSERT("\xEF\x8F", 0); + ASSERT("\xF7", 0); + ASSERT("\xF7\x8F", 0); + ASSERT("\xF7\x8F\x8F", 0); + ASSERT("\xFA", 0); + ASSERT("\xFA\x8F", 0); + ASSERT("\xFA\x8F\x8F", 0); + ASSERT("\xFA\x8F\x8F\x8F", 0); + ASSERT("\xFD", 0); + ASSERT("\xFD\x8F", 0); + ASSERT("\xFD\x8F\x8F", 0); + ASSERT("\xFD\x8F\x8F\x8F", 0); + ASSERT("\xFD\x8F\x8F\x8F\x8F", 0); + ASSERT("\xFE", 0); + ASSERT("\xFE\x8F", 0); + ASSERT("\xFE\x8F\x8F", 0); + ASSERT("\xFE\x8F\x8F\x8F", 0); + ASSERT("\xFE\x8F\x8F\x8F\x8F", 0); + ASSERT("\xFE\x8F\x8F\x8F\x8F\x8F", 0); + ASSERT("\xFF", 0); + ASSERT("\xFF\x8F", 0); + ASSERT("\xFF\x8F\x8F", 0); + ASSERT("\xFF\x8F\x8F\x8F", 0); + ASSERT("\xFF\x8F\x8F\x8F\x8F", 0); + ASSERT("\xFF\x8F\x8F\x8F\x8F\x8F", 0); + ASSERT("\xFF\x8F\x8F\x8F\x8F\x8F\x8F", 0); + ASSERT("\xC1\x80", 0); + ASSERT("\xC2\x80", 1); + ASSERT("\xE1\x80\x80\x80", 0); + ASSERT("\xE1\x80\xC0\x80", 0); + ASSERT("\xE1\x80\x00\x80", 0); + ASSERT("\xF1\x80\x80\x80", 1); + ASSERT("\xFF\x80\x80\x80\x80\x80\x80\x80", 0); + assert(libsimple_memisutf8("\0abc", sizeof("\0abc") - 1, i) == 1); + assert(libsimple_memisutf8("\0abc\x80", sizeof("\0abc\x80") - 1, i) == 0); } return 0; } |