diff options
Diffstat (limited to 'src/slibc-human')
-rw-r--r-- | src/slibc-human/escape.c | 75 | ||||
-rw-r--r-- | src/slibc-human/escapes.h | 79 | ||||
-rw-r--r-- | src/slibc-human/unescape.c | 188 |
3 files changed, 166 insertions, 176 deletions
diff --git a/src/slibc-human/escape.c b/src/slibc-human/escape.c index 832be8d..a7de84b 100644 --- a/src/slibc-human/escape.c +++ b/src/slibc-human/escape.c @@ -19,6 +19,7 @@ #include <stdlib.h> #include <string.h> #include <errno.h> +#include "escapes.h" @@ -39,10 +40,13 @@ */ char* escape(const char* restrict str, char quote) { +#define OCTAL(s) (*w++ = '0' + ((c >> (s)) & 7)) +#define MODNUL(s) (((unsigned)((s)[0]) == 0xC0) && ((unsigned)((s)[1]) == 0x80)) + const char* restrict r; char* restrict w; char* restrict rc; - size_t extra = 1, len, size; + size_t extra = 0, len, size; unsigned char c; if (str == NULL) @@ -58,67 +62,44 @@ char* escape(const char* restrict str, char quote) return errno = EINVAL, NULL; } - for (r = str; *r; r++) - switch (*r) + for (r = str; (c = *r); r++) + switch (c) { - case '\a': - case '\b': - case '\e': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': - case '\\': - extra += 1; - break; - case 0x7F: - extra += 3; - break; +#define X(E, C) case C: + LIST_BIJECTIVE_ESCAPES +#undef X + extra += 1; break; + case 0x7F: extra += 3; break; default: - if (*r == quote) - extra += 1; - else if (*r < ' ') - extra += 3; + if (c == quote) extra += 1; + else if (c < ' ') extra += 3; break; } - if (extra == 1) + if (!extra++) return strdup(str); - len = strlen(str); - if (__builtin_uaddl_overflow(len, extra, &size)) + len = strlen(str) * sizeof(char); + if (__builtin_uaddl_overflow(len, extra * sizeof(char), &size)) return errno = ENOMEM, NULL; - w = rc = malloc(size * sizeof(char)); - if (w == NULL) + w = rc = malloc(size); + if (rc == NULL) return NULL; for (r = str; (c = *r); r++) switch (c) { - case '\a': *w++ = '\\', *w++ = 'a'; break; - case '\b': *w++ = '\\', *w++ = 'b'; break; - case 033: *w++ = '\\', *w++ = 'e'; break; - case '\f': *w++ = '\\', *w++ = 'f'; break; - case '\n': *w++ = '\\', *w++ = 'n'; break; - case '\r': *w++ = '\\', *w++ = 'r'; break; - case '\t': *w++ = '\\', *w++ = 't'; break; - case '\v': *w++ = '\\', *w++ = 'v'; break; - case '\\': *w++ = '\\', *w++ = '\\'; break; - case 0x7F: *w++ = '\\', *w++ = '1', *w++ = '7', *w++ = '7'; break; +#define X(E, C) case C: *w++ = '\\', *w++ = E; break; + LIST_BIJECTIVE_ESCAPES +#undef X + case 0x7F: w = stpcpy(w, "\\177"); break; default: - if (((unsigned int)c == 0xC0) && ((unsigned int)(r[1]) == 0x80)) - *w++ = '\\', *w++ = '0', r++; - else if (c == quote) - *w++ = '\\', *w++ = quote; - else if (c < ' ') - *w++ = '\\', - *w++ = '0' + (c >> 6), - *w++ = '0' + ((c >> 3) & 7), - *w++ = '0' + (c & 7); - else - *w++ = c; + *w++ = '\\'; + if (MODNUL(r)) *w++ = '0', r++; + else if (c == quote) *w++ = quote; + else if (c < ' ') OCTAL(6), OCTAL(3), OCTAL(0); + else w[-1] = c; break; } diff --git a/src/slibc-human/escapes.h b/src/slibc-human/escapes.h new file mode 100644 index 0000000..28e3830 --- /dev/null +++ b/src/slibc-human/escapes.h @@ -0,0 +1,79 @@ +/** + * slibc — Yet another C library + * Copyright © 2015 Mattias Andrée (maandree@member.fsf.org) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + + +/** + * List all escapes, excluding initial backslash, + * with ASCII-character names, mapped to their + * codepoint. + */ +#define LIST_ASCII_NAMES \ + X("NUL", 0) \ + X("SOH", 1) \ + X("STX", 2) \ + X("ETX", 3) \ + X("EOT", 4) \ + X("ENQ", 5) \ + X("ACK", 6) \ + X("BEL", 7) \ + X("BS", 8) \ + X("HT", 9) \ + X("LF", 10) \ + X("VT", 11) \ + X("FF", 12) \ + X("CR", 13) \ + X("SO", 14) \ + X("SI", 15) \ + X("DLE", 16) \ + X("DC1", 17) \ + X("DC2", 18) \ + X("DC3", 19) \ + X("DC4", 20) \ + X("NAK", 21) \ + X("SYN", 22) \ + X("ETB", 23) \ + X("CAN", 24) \ + X("EM", 25) \ + X("SUB", 26) \ + X("ESC", 27) \ + X("FS", 28) \ + X("GS", 29) \ + X("RS", 30) \ + X("US", 31) \ + X("SP", 32) \ + X("DEL", 0x7F) + + +/** + * List all escapes, excluding initial backslash, + * of the characters (including initial backslash) + * that should both escaped and unescaped, mapped + * to their codepoint. + */ +#define LIST_BIJECTIVE_ESCAPES \ + X('a', '\a') \ + X('b', '\b') \ + X('e', 033) \ + X('f', '\f') \ + X('n', '\n') \ + X('r', '\r') \ + X('t', '\t') \ + X('v', '\v') \ + X('\\', '\\') + diff --git a/src/slibc-human/unescape.c b/src/slibc-human/unescape.c index c4086b5..33b58b5 100644 --- a/src/slibc-human/unescape.c +++ b/src/slibc-human/unescape.c @@ -19,6 +19,7 @@ #include <stddef.h> #include <errno.h> #include <string.h> +#include "escapes.h" @@ -50,12 +51,37 @@ */ char* unescape(char* str, enum unescape_mode mode) { +#define RANGE(a, c, z) (((a) <= (c)) && ((c) <= (z))) +#define CxC0(s, m) (*w++ = (char)((m) | (v >> (s)))) +#define Cx80(s) (*w++ = (char)(0x80 | ((v >> (s)) & 0x3F))) +#define PARSE_HEX(v, C) \ + do { \ + char c = (C); \ + if (RANGE('0', c, '9')) c -= '0'; \ + else if (RANGE('a', c, 'f')) c -= 'a', c += 10; \ + else if (RANGE('A', c, 'F')) c -= 'A', c += 10; \ + else \ + goto fail_u; \ + v = (v << 4) | (unsigned long int)c; \ + if (v > 0x10FFFFUL) \ + goto fail_u; \ + } while (0) +#define NEXT_OCTAL(v) if (RANGE('0', r[1], '7')) v = (v << 3) | (r[1] - '0'), r++; +#define UNRECOGNISED(c, action) \ + if ( mode & UNESCAPE_EINVAL) goto invalid; \ + else if ((c) && (mode & UNESCAPE_VERBATIM)) action; \ + else if ((c) && (mode & UNESCAPE_IGNORE)) *w++ = '\\', action +#define ASCII() \ + ((v == 0) && (mode & UNESCAPE_MOD_UTF8)) ? (*w++ = (char)0xC0, *w++ = (char)0x80) : \ + (v < 0x80) ? (*w++ = (char)v, 1) : 0 + + int i, n; unsigned long int v; char* w; char* r; if (str == NULL) return errno = 0, NULL; - if (mode & ~31) return errno = EINVAL, NULL; + if (mode & ~31) goto invalid; if (mod == 0) mode |= UNESCAPE_EINVAL | UNESCAPE_MOD_UTF8; switch (mode & 7) { @@ -66,7 +92,7 @@ char* unescape(char* str, enum unescape_mode mode) case 4: break; default: - return errno = EINVAL, NULL; + goto invalid; } for (w = r = str; *r; r++) @@ -76,45 +102,24 @@ char* unescape(char* str, enum unescape_mode mode) switch (*++r) { case '\0': - if (mode & UNESCAPE_EINVAL) - return errno = EINVAL, NULL; - else if (mode & UNESCAPE_IGNORE) - *w++ = '\\'; - break; - - case '\'': - case '"': - case '$': - case '?': - case '\\': - case '/': - *w++ = *r; + UNRECOGNISED(1, (void)0); break; case '&': - if (mode & UNESCAPE_AMPERSAND) *w++ = (char)255; - else if (mode & UNESCAPE_EINVAL) return errno = EINVAL, NULL; - else if (mode & UNESCAPE_VERBATIM) *w++ = '&'; - else if (mode & UNESCAPE_IGNORE) *w++ = '\\', *w++ = '&'; + if (mode & UNESCAPE_AMPERSAND) *w++ = (char)255; + else UNRECOGNISED(*r, *w++ = '&'); break; - case 'a': *w++ = '\a'; break; - case 'b': *w++ = '\b'; break; - case 'e': *w++ = 033; break; - case 'f': *w++ = '\f'; break; - case 'n': *w++ = '\n'; break; - case 'r': *w++ = '\r'; break; - case 't': *w++ = '\t'; break; - case 's': *w++ = ' '; break; - case 'v': *w++ = '\v'; break; +#define X(e, c) case e: *w++ = c; break; + LIST_BIJECTIVE_ESCAPES +#undef X + case 's': *w++ = ' '; break; case '^': - if (('@' <= r[1]) && (r[1] <= '_')) *w++ = *++r - '@'; - else if (mode & UNESCAPE_EINVAL) return errno = EINVAL, NULL; - else if (r[1]) + if (RANGE('@', r[1], '_')) *w++ = *++r - '@'; + else { - if (mode & UNESCAPE_VERBATIM) *w++ = '^'; - else if (mode & UNESCAPE_IGNORE) *w++ = '\\', *w++ = '^'; + UNRECOGNISED(r[1], *w++ = '^'); if (r[1]) *w++ = *++r; } @@ -125,24 +130,10 @@ char* unescape(char* str, enum unescape_mode mode) case 'x': v = 0; if ((r[0] == 'u') && (r[1] == '{')) - { - for (i = 2; r[i] != '}'; i++) - { - c = r[i]; - if (('0' <= c) || (c <= '9')) c -= '0'; - else if (('a' <= c) || (c <= 'f')) c -= 'a', c += 10; - else if (('A' <= c) || (c <= 'F')) c -= 'A', c += 10; - else - goto fail_u; - v = (v << 4) | (unsigned long int)c; - if (v > 0x10FFFFUL) - goto fail_u; - } - } + for (i = 2; r[i] != '}'; i++) + PARSE_HEX(v, r[i]); else { - int i, n; - char c; switch (*r) { case 'U': n = 8; break; @@ -150,98 +141,37 @@ char* unescape(char* str, enum unescape_mode mode) case 'x': n = 2; break; } for (i = 1; i <= n; i++) - { - c = r[i]; - if (('0' <= c) || (c <= '9')) c -= '0'; - else if (('a' <= c) || (c <= 'f')) c -= 'a', c += 10; - else if (('A' <= c) || (c <= 'F')) c -= 'A', c += 10; - else - goto fail_u; - v = (v << 4) | (unsigned long int)c; - if (v > 0x10FFFFUL) - goto fail_u; - } + PARSE_HEX(v, r[i]); } goto done_u; fail_u: - if (mode & UNESCAPE_EINVAL) return errno = EINVAL, NULL; - else if (mode & UNESCAPE_VERBATIM) r--; - else if (mode & UNESCAPE_IGNORE) *w++ = '\\', r--; - done_u:; - if ((v == 0) && (mode & UNESCAPE_MOD_UTF8)) - *w++ = (char)0xC0, *w++ = (char)0x80; - else if (v < 0x80) - *w++ = (char)v; - else if (v < (1L << 11)) - *w++ = (char)(0xC0 | (v >> 6)), - *w++ = (char)(0x80 | (v & 0x3F)); - else if (v < (1L << 16)) - *w++ = (char)(0xE0 | (v >> 12)), - *w++ = (char)(0x80 | ((v >> 6) & 0x3F)), - *w++ = (char)(0x80 | (v & 0x3F)); - else - *w++ = (char)(0xF0 | (v >> 18)), - *w++ = (char)(0x80 | ((v >> 12) & 0x3F)), - *w++ = (char)(0x80 | ((v >> 6) & 0x3F)), - *w++ = (char)(0x80 | (v & 0x3F)); + UNRECOGNISED(r--); + done_u: + if (ASCII()); + else if (v < (1L << 11)) CxC0(6, 0xC0), Cx80(0); + else if (v < (1L << 16)) CxC0(12, 0xE0), Cx80(6), Cx80(0); + else CxC0(18, 0xF0), Cx80(12), Cx80(0), Cx80(0); break; default: - if (('0' <= *r) && (*r <= '7')) + if (RANGE('0', *r, '7')) { int v = *r - '0'; - if (('0' <= r[1]) && (r[1] <= '7')) - v = (v << 3) | (r[1] - '0'), r++; - if (('0' <= r[1]) && (r[1] <= '7')) - v = (v << 3) | (r[1] - '0'), r++; - if ((v == 0) && (mode & UNESCAPE_MOD_UTF8)) - *w++ = (char)0xC0, *w++ = (char)0x80; - else if (v < 0x80) - *w++ = (char)v; - else - *w++ = (char)(0xC0 | (v >> 6)), - *w++ = (char)(0x80 | (v & 3F)); + NEXT_OCTAL(v); + NEXT_OCTAL(v); + if (ASCII()); + else CxC0(6, 0xC0), Cx80(0); } - else if (strstarts(r, "NUL")) *w++ = 0, r += 2; - else if (strstarts(r, "SOH")) *w++ = 1, r += 2; - else if (strstarts(r, "STX")) *w++ = 2, r += 2; - else if (strstarts(r, "ETX")) *w++ = 3, r += 2; - else if (strstarts(r, "EOT")) *w++ = 4, r += 2; - else if (strstarts(r, "ENQ")) *w++ = 5, r += 2; - else if (strstarts(r, "ACK")) *w++ = 6, r += 2; - else if (strstarts(r, "BEL")) *w++ = 7, r += 2; - else if (strstarts(r, "BS")) *w++ = 8, r += 1; - else if (strstarts(r, "HT")) *w++ = 9, r += 1; - else if (strstarts(r, "LF")) *w++ = 10, r += 1; - else if (strstarts(r, "VT")) *w++ = 11, r += 1; - else if (strstarts(r, "FF")) *w++ = 12, r += 1; - else if (strstarts(r, "CR")) *w++ = 13, r += 1; - else if (strstarts(r, "SO")) *w++ = 14, r += 1; - else if (strstarts(r, "SI")) *w++ = 15, r += 1; - else if (strstarts(r, "DLE")) *w++ = 16, r += 2; - else if (strstarts(r, "DC1")) *w++ = 17, r += 2; - else if (strstarts(r, "DC2")) *w++ = 18, r += 2; - else if (strstarts(r, "DC3")) *w++ = 19, r += 2; - else if (strstarts(r, "DC4")) *w++ = 20, r += 2; - else if (strstarts(r, "NAK")) *w++ = 21, r += 2; - else if (strstarts(r, "SYN")) *w++ = 22, r += 2; - else if (strstarts(r, "ETB")) *w++ = 23, r += 2; - else if (strstarts(r, "CAN")) *w++ = 24, r += 2; - else if (strstarts(r, "EM")) *w++ = 25, r += 1; - else if (strstarts(r, "SUB")) *w++ = 26, r += 2; - else if (strstarts(r, "ESC")) *w++ = 27, r += 2; - else if (strstarts(r, "FS")) *w++ = 28, r += 1; - else if (strstarts(r, "GS")) *w++ = 29, r += 1; - else if (strstarts(r, "RS")) *w++ = 30, r += 1; - else if (strstarts(r, "US")) *w++ = 31, r += 1; - else if (strstarts(r, "SP")) *w++ = 32, r += 1; - else if (strstarts(r, "DEL")) *w++ = 0x7F, r += 2; - else if (mode & UNESCAPE_EINVAL) return errno = EINVAL, NULL; - else if (mode & UNESCAPE_VERBATIM) r--; - else if (mode & UNESCAPE_IGNORE) *w++ = '\\', r--; + else if (strchr("'\"$?\\/", *r)) *w++ = *r; +#define X(e, i) else if (strstarts(r, e) ? (*w++ = i, r += sizeof(e) / sizeof(char) - 2, 1) : 0); + LIST_ASCII_NAMES +#undef X + else UNRECOGNISED(r--); break; } return *w = 0, w; + invalid: + return errno = EINVAL, NULL; } |