diff options
author | Mattias Andrée <maandree@operamail.com> | 2015-11-14 01:21:32 +0100 |
---|---|---|
committer | Mattias Andrée <maandree@operamail.com> | 2015-11-14 01:21:53 +0100 |
commit | 98806a86e043ca1f506a23bfbba89d4f308a0bfc (patch) | |
tree | 1dd6060a261689874509a85785d33c044d4f4ca4 /src | |
parent | improve performance on strstr and wcsstr if the needle is one character wide (diff) | |
download | slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.gz slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.bz2 slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.xz |
add slibc-human.h and implement escape and unescape
Signed-off-by: Mattias Andrée <maandree@operamail.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/slibc-human/escape.c | 125 | ||||
-rw-r--r-- | src/slibc-human/unescape.c | 247 |
2 files changed, 372 insertions, 0 deletions
diff --git a/src/slibc-human/escape.c b/src/slibc-human/escape.c new file mode 100644 index 0000000..ea93108 --- /dev/null +++ b/src/slibc-human/escape.c @@ -0,0 +1,125 @@ +/** + * slibc — Yet another C library + * Copyright © 2015 Mattias Andrée (maandree@member.fsf.org) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#include <slib-human.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + + + +/** + * Escapes a string. + * + * @param str The unescaped string, may be `NULL`. + * @param quote The queue character, must be either ', " + * or a NUL-character (for no surrounding quotes). + * Note, these quotes are not added to output. + * @return Escaped variant of the string, `NULL`. + * You are responsible for deallocating the + * returned pointer. + * + * @throws 0 `str` is `NULL`. + * @throws EINVAL If `quote` is invalid. + * @throws ENOMEM The process cannot allocate more memory. 
/**
 * Select the character that follows the backslash in a
 * two-character escape.
 *
 * Both the sizing pass and the writing pass of `escape` use
 * this function, so the two passes cannot disagree about
 * which bytes are escaped.
 *
 * @param   c      The byte to examine.
 * @param   quote  The active quote character, NUL for none.
 * @return         The character to write after the backslash,
 *                 0 if `c` has no two-character escape.
 */
static char escape_letter(unsigned char c, char quote)
{
  switch (c)
    {
    case '\a':  return 'a';
    case '\b':  return 'b';
    case 033:   return 'e';  /* ESC, '\e' is a compiler extension. */
    case '\f':  return 'f';
    case '\n':  return 'n';
    case '\r':  return 'r';
    case '\t':  return 't';
    case '\v':  return 'v';
    case '\\':  return '\\';
    default:
      return (c == (unsigned char)quote) ? quote : 0;
    }
}


/**
 * Number of bytes `escape` writes for one input byte.
 *
 * @param   c      The byte to examine.
 * @param   quote  The active quote character, NUL for none.
 * @return         2 for a two-character escape, 4 for a three-digit
 *                 octal escape (other control bytes and DEL), 1 if
 *                 the byte is copied unchanged.
 */
static size_t escape_width(unsigned char c, char quote)
{
  if (escape_letter(c, quote))
    return 2;
  return ((c < ' ') || (c == 0x7F)) ? 4 : 1;
}


/**
 * Escapes a string.
 *
 * Bell, backspace, escape, form feed, line feed, carriage return,
 * tab, vertical tab and backslash become \a \b \e \f \n \r \t \v \\.
 * The selected quote character is prefixed with a backslash. Any other
 * byte below 0x20, and 0x7F, becomes a backslash followed by three
 * octal digits. All other bytes, including those with the high bit
 * set, are copied unchanged.
 *
 * @param   str    The unescaped string, may be `NULL`.
 * @param   quote  The quote character, must be either ', "
 *                 or a NUL-character (for no surrounding quotes).
 *                 Note, these quotes are not added to output.
 * @return         Escaped variant of the string, `NULL` on error
 *                 or if `str` is `NULL`. You are responsible for
 *                 deallocating the returned pointer.
 *
 * @throws  0       `str` is `NULL`.
 * @throws  EINVAL  If `quote` is invalid.
 * @throws  ENOMEM  The escaped length does not fit in `size_t`.
 *                  If `malloc` fails `NULL` is returned with
 *                  whatever `errno` value `malloc` left.
 */
char* escape(const char* str, char quote)
{
  const unsigned char* r;
  char* w;
  char* rc;
  size_t len, width;
  size_t size = 1; /* Room for the NUL terminator. */
  unsigned char c;
  char letter;

  if (str == NULL)
    return errno = 0, NULL;

  switch (quote)
    {
    case '\'':
    case '\"':
    case '\0':
      break;
    default:
      return errno = EINVAL, NULL;
    }

  /* Bytes are examined as unsigned so that bytes >= 0x80
   * are never mistaken for control characters. */
  for (r = (const unsigned char*)str; *r; r++)
    {
      width = escape_width(*r, quote);
      if (width > (size_t)-1 - size)
        return errno = ENOMEM, NULL;
      size += width;
    }
  len = (size_t)(r - (const unsigned char*)str);

  w = rc = malloc(size);
  if (rc == NULL)
    return NULL;

  /* Nothing needs escaping: plain copy, terminator included. */
  if (size == len + 1)
    return memcpy(rc, str, size);

  for (r = (const unsigned char*)str; (c = *r); r++)
    {
      letter = escape_letter(c, quote);
      if (letter)
        {
          *w++ = '\\';
          *w++ = letter;
        }
      else if ((c < ' ') || (c == 0x7F))
        {
          *w++ = '\\';
          *w++ = (char)('0' + (c >> 6));
          *w++ = (char)('0' + ((c >> 3) & 7));
          *w++ = (char)('0' + (c & 7));
        }
      else
        *w++ = (char)c;
    }

  *w = 0;
  return rc;
}
License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#include <slib-human.h> +#include <stddef.h> +#include <errno.h> +#include <string.h> + + + +/** + * Parse an escaped string. + * + * Supported escapes: + * \' \" \$ \& \? \\ \/ \### \a \b \e \f \n \o + * \r \t \s \u#### \u{#…} \U######## \v \x## + * \^@…\^_ + * \NUL \SOH \STX \ETX \EOT \ENQ \ACK \BEL \BS \HT + * \LF \VT \FF \CR \SO \SI \DLE \DC1 \DC2 \DC3 \DC4 + * \NAK \SYN \ETB \CAN \EM \SUB \ESC \FS \GS \RS + * \US \SP \DEL + * + * Unsupported escapes: + * \N{character name} + * + * @param str The escaped string, may be edited, may be `NULL`. + * Must not be reused on error. + * @param mode How unrecognised escapes should be handled, + * and other configurations, 0 for default. + * @return The new end of `str` is returned. `NULL` is returned + * on error or if `str` is `NULL`. + * + * @throws 0 `str` is `NULL`. + * @throws EINVAL If `mode` is invalid. + * @throws EINVAL If `str` is invalid and `mode & UNESCAPE_EINVAL`. 
+ */ +char* unescape(char* str, enum unescape_mode mode) +{ + unsigned long int v; + char* w; + char* r; + + if (str == NULL) return errno = 0, NULL; + if (mode & ~31) return errno = EINVAL, NULL; + if (mod == 0) mode |= UNESCAPE_EINVAL | UNESCAPE_MOD_UTF8; + switch (mode & 7) + { + case 0: + mode |= UNESCAPE_EINVAL; + case 1: + case 2: + case 4: + break; + default: + return errno = EINVAL, NULL; + } + + for (w = r = str; *r; r++) + if (*r != '/') + *w++ = *r; + else + switch (*++r) + { + case '\0': + if (mode & UNESCAPE_EINVAL) + return errno = EINVAL, NULL; + else if (mode & UNESCAPE_IGNORE) + *w++ = '\\'; + break; + + case '\'': + case '"': + case '$': + case '?': + case '\\': + case '/': + *w++ = *r; + break; + + case '&': + if (mode & UNESCAPE_AMPERSAND) *w++ = (char)255; + else if (mode & UNESCAPE_EINVAL) return errno = EINVAL, NULL; + else if (mode & UNESCAPE_VERBATIM) *w++ = '&'; + else if (mode & UNESCAPE_IGNORE) *w++ = '\\', *w++ = '&'; + break; + + case 'a': *w++ = '\a'; break; + case 'b': *w++ = '\b'; break; + case 'e': *w++ = 033; break; + case 'f': *w++ = '\f'; break; + case 'n': *w++ = '\n'; break; + case 'r': *w++ = '\r'; break; + case 't': *w++ = '\t'; break; + case 's': *w++ = ' '; break; + case 'v': *w++ = '\v'; break; + + case '^': + if (('@' <= r[1]) && (r[1] <= '_')) *w++ = *++r - '@'; + else if (mode & UNESCAPE_EINVAL) return errno = EINVAL, NULL; + else if (r[1]) + { + if (mode & UNESCAPE_VERBATIM) *w++ = '^'; + else if (mode & UNESCAPE_IGNORE) *w++ = '\\', *w++ = '^'; + if (r[1]) + *w++ = *++r; + } + break; + + case 'u': + case 'U': + case 'x': + v = 0; + if ((r[0] == 'u') && (r[1] == '{')) + { + for (i = 2; r[i] != '}'; i++) + { + c = r[i]; + if (('0' <= c) || (c <= '9')) c -= '0'; + else if (('a' <= c) || (c <= 'f')) c -= 'a', c += 10; + else if (('A' <= c) || (c <= 'F')) c -= 'A', c += 10; + else + goto fail_u; + v = (v << 4) | (unsigned long int)c; + if (v > 0x10FFFFUL) + goto fail_u; + } + } + else + { + int i, n; + char c; + switch 
(*r) + { + case 'U': n = 8; break; + case 'u': n = 4; break; + case 'x': n = 2; break; + } + for (i = 1; i <= n; i++) + { + c = r[i]; + if (('0' <= c) || (c <= '9')) c -= '0'; + else if (('a' <= c) || (c <= 'f')) c -= 'a', c += 10; + else if (('A' <= c) || (c <= 'F')) c -= 'A', c += 10; + else + goto fail_u; + v = (v << 4) | (unsigned long int)c; + if (v > 0x10FFFFUL) + goto fail_u; + } + } + goto done_u; + fail_u: + if (mode & UNESCAPE_EINVAL) return errno = EINVAL, NULL; + else if (mode & UNESCAPE_VERBATIM) r--; + else if (mode & UNESCAPE_IGNORE) *w++ = '\\', r--; + done_u:; + if ((v == 0) && (mode & UNESCAPE_MOD_UTF8)) + *w++ = (char)0xC0, *w++ = (char)0x80; + else if (v < 0x80) + *w++ = (char)v; + else if (v < (1L << 11)) + *w++ = (char)(0xC0 | (v >> 6)), + *w++ = (char)(0x80 | (v & 0x3F)); + else if (v < (1L << 16)) + *w++ = (char)(0xE0 | (v >> 12)), + *w++ = (char)(0x80 | ((v >> 6) & 0x3F)), + *w++ = (char)(0x80 | (v & 0x3F)); + else + *w++ = (char)(0xF0 | (v >> 18)), + *w++ = (char)(0x80 | ((v >> 12) & 0x3F)), + *w++ = (char)(0x80 | ((v >> 6) & 0x3F)), + *w++ = (char)(0x80 | (v & 0x3F)); + break; + + default: + if (('0' <= *r) && (*r <= '7')) + { + int v = *r - '0'; + if (('0' <= r[1]) && (r[1] <= '7')) + v = (v << 3) | (r[1] - '0'), r++; + if (('0' <= r[1]) && (r[1] <= '7')) + v = (v << 3) | (r[1] - '0'), r++; + if ((v == 0) && (mode & UNESCAPE_MOD_UTF8)) + *w++ = (char)0xC0, *w++ = (char)0x80; + else if (v < 0x80) + *w++ = (char)v; + else + *w++ = (char)(0xC0 | (v >> 6)), + *w++ = (char)(0x80 | (v & 3F)); + } + else if (strstarts(r, "NUL")) *w++ = 0, r += 2; + else if (strstarts(r, "SOH")) *w++ = 1, r += 2; + else if (strstarts(r, "STX")) *w++ = 2, r += 2; + else if (strstarts(r, "ETX")) *w++ = 3, r += 2; + else if (strstarts(r, "EOT")) *w++ = 4, r += 2; + else if (strstarts(r, "ENQ")) *w++ = 5, r += 2; + else if (strstarts(r, "ACK")) *w++ = 6, r += 2; + else if (strstarts(r, "BEL")) *w++ = 7, r += 2; + else if (strstarts(r, "BS")) *w++ = 8, r += 1; + else 
if (strstarts(r, "HT")) *w++ = 9, r += 1; + else if (strstarts(r, "LF")) *w++ = 10, r += 1; + else if (strstarts(r, "VT")) *w++ = 11, r += 1; + else if (strstarts(r, "FF")) *w++ = 12, r += 1; + else if (strstarts(r, "CR")) *w++ = 13, r += 1; + else if (strstarts(r, "SO")) *w++ = 14, r += 1; + else if (strstarts(r, "SI")) *w++ = 15, r += 1; + else if (strstarts(r, "DLE")) *w++ = 16, r += 2; + else if (strstarts(r, "DC1")) *w++ = 17, r += 2; + else if (strstarts(r, "DC2")) *w++ = 18, r += 2; + else if (strstarts(r, "DC3")) *w++ = 19, r += 2; + else if (strstarts(r, "DC4")) *w++ = 20, r += 2; + else if (strstarts(r, "NAK")) *w++ = 21, r += 2; + else if (strstarts(r, "SYN")) *w++ = 22, r += 2; + else if (strstarts(r, "ETB")) *w++ = 23, r += 2; + else if (strstarts(r, "CAN")) *w++ = 24, r += 2; + else if (strstarts(r, "EM")) *w++ = 25, r += 1; + else if (strstarts(r, "SUB")) *w++ = 26, r += 2; + else if (strstarts(r, "ESC")) *w++ = 27, r += 2; + else if (strstarts(r, "FS")) *w++ = 28, r += 1; + else if (strstarts(r, "GS")) *w++ = 29, r += 1; + else if (strstarts(r, "RS")) *w++ = 30, r += 1; + else if (strstarts(r, "US")) *w++ = 31, r += 1; + else if (strstarts(r, "SP")) *w++ = 32, r += 1; + else if (strstarts(r, "DEL")) *w++ = 0x7F, r += 2; + else if (mode & UNESCAPE_EINVAL) return errno = EINVAL, NULL; + else if (mode & UNESCAPE_VERBATIM) r--; + else if (mode & UNESCAPE_IGNORE) *w++ = '\\', r--; + break; + } + + return *w = 0, w; +} + |