add slibc-human.h and implement escape and unescape

Signed-off-by: Mattias Andrée <maandree@operamail.com>
author: Mattias Andrée <maandree@operamail.com> 2015-11-14 01:21:32 +0100
committer: Mattias Andrée <maandree@operamail.com> 2015-11-14 01:21:53 +0100
commit: 98806a86e043ca1f506a23bfbba89d4f308a0bfc (patch)
tree: 1dd6060a261689874509a85785d33c044d4f4ca4
parent: improve performance on strstr and wcsstr if the needle is one character wide (diff)
download: slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.gz
slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.bz2
slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.xz
3 files changed, 598 insertions, 0 deletions
diff --git a/include/slibc-human.h b/include/slibc-human.h
new file mode 100644
index 0000000..13d299a
--- /dev/null
+++ b/include/slibc-human.h
@@ -0,0 +1,226 @@
+/**
+ * slibc — Yet another C library
+ * Copyright © 2015  Mattias Andrée (maandree@member.fsf.org)
+ * 
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _SLIBC_HUMAN_H
+#define _SLIBC_HUMAN_H
+#include <slibc/version.h>
+#include <slibc/features.h>
+#ifndef __PORTABLE
+
+
+
+#define __NEED_mode_t
+#define __NEED_intmax_t
+#define __NEED_uintmax_t
+
+#include <bits/types.h>
+
+
+
+/**
+ * Ways to handle unrecognised escapes,
+ * and other configurations.
+ */
+enum unescape_mode
+  {
+    /**
+     * For any unrecognised character '#',
+     * '\#' results in an EINVAL-error.
+     * 
+     * Cannot be used together with
+     * `UNESCAPE_VERBATIM` or `UNESCAPE_IGNORE`.
+     */
+    UNESCAPE_EINVAL = 1,
+    
+    /**
+     * For any unrecognised character '#',
+     * '\#' results in '#'.
+     * 
+     * Cannot be used together with
+     * `UNESCAPE_EINVAL` or `UNESCAPE_IGNORE`.
+     */
+    UNESCAPE_VERBATIM = 2,
+    
+    /**
+     * For any unrecognised character '#',
+     * '\#' results in '\#'.
+     * 
+     * Cannot be used together with
+     * `UNESCAPE_EINVAL` or `UNESCAPE_VERBATIM`.
+     */
+    UNESCAPE_IGNORE = 4,
+    
+    
+    /**
+     * '\&' resolves to the byte 255 (0xFF).
+     * 
+     * If not used, '\&' is handled as an
+     * unsupported escape.
+     */
+    UNESCAPE_AMPERSAND = 8,
+    
+    /**
+     * '\0' resolves to the byte sequence
+     * 192 128 (0xC0 0x80).
+     * 
+     * If not used, '\0' resolves to a
+     * single NUL byte (0x00).
+     */
+    UNESCAPE_MOD_UTF8 = 16,
+  };
+
+/** Options accepted by `humansize`. */
+enum humansize_mode
+  {
+    /**
+     * 'k' is 1000.
+     */
+    HUMANSIZE_SI = 1,
+    
+    /**
+     * 'K' is 1024.
+     */
+    HUMANSIZE_IEC = 2,
+    
+    /**
+     * 'Ki' is 1024.
+     */
+    HUMANSIZE_IEC_EXPLICIT = 4,
+    
+    
+    /**
+     * 'B' is only included if there is no prefix.
+     */
+    HUMANSIZE_PREFIX_ONLY = 8,
+    
+    
+    /**
+     * Print size exactly if `detail` is 0,
+     * otherwise use the highest `detail` prefixes.
+     * 
+     * For example `detail == 0` may yield '3TB 2MB 1KB',
+     * and `detail == 3` may yield '3TB 2MB' for the same size.
+     */
+    HUMANSIZE_EXACT = 16,
+    
+    /**
+     * Similar to `HUMANSIZE_EXACT` with `detail == 1`,
+     * but the value will include `detail` digits.
+     * `detail` < 0 is allowed.
+     */
+    HUMANSIZE_ROUND = 32,
+  };
+
+/** Options accepted by `machinesize`. */
+enum machinesize_mode
+  {
+    /**
+     * 'k' and 'K' is 1000.
+     * 
+     * If `MACHINESIZE_IEC` is also used,
+     * 1000-base is used if 'B' is explicitly
+     * included, otherwise 1024-base is used.
+     */
+    MACHINESIZE_SI = 1,
+    
+    /**
+     * 'k' and 'K' is 1024.
+     * 
+     * If `MACHINESIZE_SI` is also used,
+     * 1000-base is used if 'B' is explicitly
+     * included, otherwise 1024-base is used.
+     */
+    MACHINESIZE_IEC = 2,
+  };
+
+
+/* NOTE(review): the functions down to `machinefloat` are undocumented here; confirm their contracts. */
+char* humanmode(const char* buffer, mode_t mode);
+
+mode_t machinemode(const char* str, mode_t mode, mode_t mask);
+
+
+char* humansize(const char* buffer, size_t size, enum humansize_mode mode, int detail);
+
+int machinesize(size_t* restrict size, char* string, enum machinesize_mode mode);
+
+
+int humandur(intmax_t sec, long int nsec, const char* comma, const char* format);
+
+int machinedur(intmax_t* restrict sec, long int* nsec, const char* restrict str,
+	       const char* restrict space, const char* restrict comma);
+
+
+int machineint(intmax_t* restrict r, const char* restrict str);
+
+int machineuint(uintmax_t* restrict r, const char* restrict str);
+
+int machinefloat(long double* restrict r, const char* restrict str,
+		 const char* restrict space, const char* restrict comma);
+
+
+/**
+ * Parse an escaped string.
+ * 
+ * Supported escapes:
+ *   \' \" \$ \& \? \\ \/ \### \a \b \e \f \n
+ *   \r \t \s \u#### \u{#…} \U######## \v \x##
+ *   \^@…\^_
+ *   \NUL \SOH \STX \ETX \EOT \ENQ \ACK \BEL \BS \HT
+ *   \LF \VT \FF \CR \SO \SI \DLE \DC1 \DC2 \DC3 \DC4
+ *   \NAK \SYN \ETB \CAN \EM \SUB \ESC \FS \GS \RS
+ *   \US \SP \DEL
+ * 
+ * Unsupported escapes:
+ *   \N{character name}
+ * 
+ * @param   str   The escaped string, may be edited, may be `NULL`.
+ *                Must not be reused on error.
+ * @param   mode  How unrecognised escapes should be handled,
+ *                and other configurations, 0 for default.
+ * @return        The new end of `str` is returned. `NULL` is returned
+ *                on error or if `str` is `NULL`.
+ * 
+ * @throws  0       `str` is `NULL`.
+ * @throws  EINVAL  If `mode` is invalid.
+ * @throws  EINVAL  If `str` is invalid and `mode & UNESCAPE_EINVAL`.
+ */
+char* unescape(char* str, enum unescape_mode mode);
+
+/**
+ * Escapes a string.
+ * 
+ * @param   str    The unescaped string, may be `NULL`.
+ * @param   quote  The quote character, must be either ', "
+ *                 or a NUL-character (for no surrounding quotes).
+ *                 Note, these quotes are not added to output.
+ * @return         Escaped variant of the string, `NULL` on error
+ *                 or if `str` is `NULL`. You are responsible for
+ *                 deallocating the returned pointer.
+ * 
+ * @throws  0       `str` is `NULL`.
+ * @throws  EINVAL  If `quote` is invalid.
+ * @throws  ENOMEM  The process cannot allocate more memory.
+ */
+char* escape(const char* str, char quote)
+  __GCC_ONLY(__attribute__((__malloc__, __warn_unused_result__)));
+
+
+
+#endif
+#endif
+
diff --git a/src/slibc-human/escape.c b/src/slibc-human/escape.c
new file mode 100644
index 0000000..ea93108
--- /dev/null
+++ b/src/slibc-human/escape.c
@@ -0,0 +1,125 @@
+/**
+ * slibc — Yet another C library
+ * Copyright © 2015  Mattias Andrée (maandree@member.fsf.org)
+ * 
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <slibc-human.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+
+
+/**
+ * Escapes a string.
+ * 
+ * @param   str    The unescaped string, may be `NULL`.
+ * @param   quote  The quote character, must be either ', "
+ *                 or a NUL-character (for no surrounding quotes).
+ *                 Note, these quotes are not added to output.
+ * @return         Escaped variant of the string, `NULL` on error
+ *                 or if `str` is `NULL`. You are responsible for
+ *                 deallocating the returned pointer.
+ * 
+ * @throws  0       `str` is `NULL`.
+ * @throws  EINVAL  If `quote` is invalid.
+ * @throws  ENOMEM  The process cannot allocate more memory.
+ */
+char* escape(const char* str, char quote)
+{
+  const unsigned char* r;
+  char* w;
+  char* rc;
+  size_t extra = 1, len;
+  unsigned char c;
+  
+  if (str == NULL)
+    return errno = 0, NULL;
+  
+  if ((quote != '\'') && (quote != '\"') && (quote != '\0'))
+    return errno = EINVAL, NULL;
+  
+  /* First pass: count how many bytes the output needs beyond
+   * `strlen(str)`, starting at 1 for the NUL-termination. Bytes are
+   * read as `unsigned char` so that bytes above 0x7F are not
+   * mistaken for control characters where `char` is signed. */
+  for (r = (const unsigned char*)str; (c = *r); r++)
+    switch (c)
+      {
+      case '\a':
+      case '\b':
+      case 033: /* ESC */
+      case '\f':
+      case '\n':
+      case '\r':
+      case '\t':
+      case '\v':
+      case '\\':
+	extra += 1;
+	break;
+      case 0x7F:
+	extra += 3;
+	break;
+      default:
+	if (c == (unsigned char)quote)
+	  extra += 1;
+	else if (c < ' ')
+	  extra += 3;
+	break;
+      }
+  
+  if (extra == 1)
+    return strdup(str);
+  
+  len = strlen(str);
+  if (extra > (size_t)-1 - len)
+    return errno = ENOMEM, NULL;
+  
+  w = rc = malloc((len + extra) * sizeof(char));
+  if (w == NULL)
+    return NULL;
+  
+  /* Second pass: write the escaped string. */
+  for (r = (const unsigned char*)str; (c = *r); r++)
+    switch (c)
+      {
+      case '\a':  *w++ = '\\', *w++ = 'a';   break;
+      case '\b':  *w++ = '\\', *w++ = 'b';   break;
+      case 033:   *w++ = '\\', *w++ = 'e';   break;
+      case '\f':  *w++ = '\\', *w++ = 'f';   break;
+      case '\n':  *w++ = '\\', *w++ = 'n';   break;
+      case '\r':  *w++ = '\\', *w++ = 'r';   break;
+      case '\t':  *w++ = '\\', *w++ = 't';   break;
+      case '\v':  *w++ = '\\', *w++ = 'v';   break;
+      case '\\':  *w++ = '\\', *w++ = '\\';  break;
+      case 0x7F:  *w++ = '\\', *w++ = '1', *w++ = '7', *w++ = '7';  break;
+      default:
+	if (c == (unsigned char)quote)
+	  *w++ = '\\', *w++ = quote;
+	else if (c < ' ')
+	  /* Always three octal digits, so that a digit following
+	   * the escape cannot be read as part of it. */
+	  *w++ = '\\',
+	    *w++ = (char)('0' + (c >> 6)),
+	    *w++ = (char)('0' + ((c >> 3) & 7)),
+	    *w++ = (char)('0' + (c & 7));
+	else
+	  *w++ = (char)c;
+	break;
+      }
+  
+  return *w = 0, rc;
+}
+
diff --git a/src/slibc-human/unescape.c b/src/slibc-human/unescape.c
new file mode 100644
index 0000000..c4086b5
--- /dev/null
+++ b/src/slibc-human/unescape.c
@@ -0,0 +1,247 @@
+/**
+ * slibc — Yet another C library
+ * Copyright © 2015  Mattias Andrée (maandree@member.fsf.org)
+ * 
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <slibc-human.h>
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+
+
+
+/**
+ * Get the value of a hexadecimal digit.
+ * 
+ * @param   c  The character to interpret.
+ * @return     The value of the digit, -1 if `c` is not a hexadecimal digit.
+ */
+static int hex_value(char c)
+{
+  if (('0' <= c) && (c <= '9'))  return c - '0';
+  if (('a' <= c) && (c <= 'f'))  return c - 'a' + 10;
+  if (('A' <= c) && (c <= 'F'))  return c - 'A' + 10;
+  return -1;
+}
+
+
+/**
+ * Store a code point as UTF-8.
+ * 
+ * @param   w     Where to store the encoding, at most 4 bytes are written.
+ * @param   v     The code point, must not exceed 0x10FFFF.
+ * @param   mode  If `UNESCAPE_MOD_UTF8` is set, 0 is stored as 0xC0 0x80.
+ * @return        The position just after the stored bytes.
+ */
+static char* put_utf8(char* w, unsigned long int v, enum unescape_mode mode)
+{
+  if ((v == 0) && (mode & UNESCAPE_MOD_UTF8))
+    *w++ = (char)0xC0, *w++ = (char)0x80;
+  else if (v < 0x80)
+    *w++ = (char)v;
+  else if (v < (1UL << 11))
+    *w++ = (char)(0xC0 | (v >> 6)),
+      *w++ = (char)(0x80 | (v & 0x3F));
+  else if (v < (1UL << 16))
+    *w++ = (char)(0xE0 | (v >> 12)),
+      *w++ = (char)(0x80 | ((v >> 6) & 0x3F)),
+      *w++ = (char)(0x80 | (v & 0x3F));
+  else
+    *w++ = (char)(0xF0 | (v >> 18)),
+      *w++ = (char)(0x80 | ((v >> 12) & 0x3F)),
+      *w++ = (char)(0x80 | ((v >> 6) & 0x3F)),
+      *w++ = (char)(0x80 | (v & 0x3F));
+  return w;
+}
+
+
+/**
+ * Recognise the name of a control character at the start of a string.
+ * 
+ * @param   s     The text directly after the backslash.
+ * @param   code  Output parameter for the named character.
+ * @return        The length of the name, 0 if no name is recognised.
+ */
+static int control_name(const char* s, char* code)
+{
+  /* A name that begins with another name ("SOH", "SO") must precede it. */
+  static const struct { char name[4]; char code; } names[] = {
+    {"NUL",  0}, {"SOH",  1}, {"STX",  2}, {"ETX",  3}, {"EOT",  4},
+    {"ENQ",  5}, {"ACK",  6}, {"BEL",  7}, {"BS",   8}, {"HT",   9},
+    {"LF",  10}, {"VT",  11}, {"FF",  12}, {"CR",  13}, {"SO",  14},
+    {"SI",  15}, {"DLE", 16}, {"DC1", 17}, {"DC2", 18}, {"DC3", 19},
+    {"DC4", 20}, {"NAK", 21}, {"SYN", 22}, {"ETB", 23}, {"CAN", 24},
+    {"EM",  25}, {"SUB", 26}, {"ESC", 27}, {"FS",  28}, {"GS",  29},
+    {"RS",  30}, {"US",  31}, {"SP",  32}, {"DEL", 0x7F},
+  };
+  size_t i, n;
+  
+  for (i = 0; i < sizeof(names) / sizeof(*names); i++)
+    {
+      n = strlen(names[i].name);
+      if (strncmp(s, names[i].name, n) == 0)
+	return *code = names[i].code, (int)n;
+    }
+  return 0;
+}
+
+
+/**
+ * Parse an escaped string.
+ * 
+ * Supported escapes:
+ *   \' \" \$ \& \? \\ \/ \### \a \b \e \f \n
+ *   \r \t \s \u#### \u{#…} \U######## \v \x##
+ *   \^@…\^_
+ *   \NUL \SOH \STX \ETX \EOT \ENQ \ACK \BEL \BS \HT
+ *   \LF \VT \FF \CR \SO \SI \DLE \DC1 \DC2 \DC3 \DC4
+ *   \NAK \SYN \ETB \CAN \EM \SUB \ESC \FS \GS \RS
+ *   \US \SP \DEL
+ * 
+ * Unsupported escapes:
+ *   \N{character name}
+ * 
+ * The string is rewritten in place; this works because no
+ * escape resolves to more bytes than the escape itself occupies.
+ * 
+ * @param   str   The escaped string, may be edited, may be `NULL`.
+ *                Must not be reused on error.
+ * @param   mode  How unrecognised escapes should be handled,
+ *                and other configurations, 0 for default.
+ * @return        The new end of `str` is returned. `NULL` is returned
+ *                on error or if `str` is `NULL`.
+ * 
+ * @throws  0       `str` is `NULL`.
+ * @throws  EINVAL  If `mode` is invalid.
+ * @throws  EINVAL  If `str` is invalid and `mode & UNESCAPE_EINVAL`.
+ */
+char* unescape(char* str, enum unescape_mode mode)
+{
+  unsigned long int v;
+  char* w;
+  char* r;
+  char c;
+  int i, n, d;
+  
+  if (str == NULL)  return errno = 0, NULL;
+  if (mode & ~31)   return errno = EINVAL, NULL;
+  if (mode == 0)    mode |= UNESCAPE_EINVAL | UNESCAPE_MOD_UTF8;
+  switch (mode & 7)
+    {
+    case 0:
+      mode |= UNESCAPE_EINVAL;
+      /* fall through */
+    case 1:
+    case 2:
+    case 4:
+      break;
+    default:
+      return errno = EINVAL, NULL;
+    }
+  
+  for (w = r = str; *r; r++)
+    if (*r != '\\')
+      *w++ = *r;
+    else
+      switch (*++r)
+	{
+	case '\0':
+	  /* A lone backslash at the end of the string. */
+	  if (mode & UNESCAPE_EINVAL)
+	    return errno = EINVAL, NULL;
+	  else if (mode & UNESCAPE_IGNORE)
+	    *w++ = '\\';
+	  r--; /* Make the loop's `r++` stop at the terminator. */
+	  break;
+	  
+	case '\'':
+	case '"':
+	case '$':
+	case '?':
+	case '\\':
+	case '/':
+	  *w++ = *r;
+	  break;
+	  
+	case '&':
+	  if      (mode & UNESCAPE_AMPERSAND)  *w++ = (char)255;
+	  else if (mode & UNESCAPE_EINVAL)     return errno = EINVAL, NULL;
+	  else if (mode & UNESCAPE_VERBATIM)   *w++ = '&';
+	  else if (mode & UNESCAPE_IGNORE)     *w++ = '\\', *w++ = '&';
+	  break;
+	  
+	case 'a':  *w++ = '\a';  break;
+	case 'b':  *w++ = '\b';  break;
+	case 'e':  *w++ = 033;   break;
+	case 'f':  *w++ = '\f';  break;
+	case 'n':  *w++ = '\n';  break;
+	case 'r':  *w++ = '\r';  break;
+	case 't':  *w++ = '\t';  break;
+	case 's':  *w++ = ' ';   break;
+	case 'v':  *w++ = '\v';  break;
+	  
+	case '^':
+	  if (('@' <= r[1]) && (r[1] <= '_'))      *w++ = (char)(*++r - '@');
+	  else if (mode & UNESCAPE_EINVAL)         return errno = EINVAL, NULL;
+	  else if (mode & UNESCAPE_VERBATIM)       *w++ = '^';
+	  else if (mode & UNESCAPE_IGNORE)         *w++ = '\\', *w++ = '^';
+	  break;
+	  
+	case 'u':
+	case 'U':
+	case 'x':
+	  v = 0;
+	  if ((r[0] == 'u') && (r[1] == '{'))
+	    {
+	      /* \u{#…}: digits closed by '}'. Breaking when out of range keeps `v` from overflowing. */
+	      for (i = 2; (d = hex_value(r[i])) >= 0; i++)
+		if ((v = (v << 4) | (unsigned long int)d) > 0x10FFFFUL)
+		  break;
+	      n = ((i > 2) && (r[i] == '}')) ? i : -1; /* Offset of '}', -1 if invalid. */
+	    }
+	  else
+	    {
+	      /* \x##, \u#### or \U########: exactly `n` digits. */
+	      n = (*r == 'U') ? 8 : (*r == 'u') ? 4 : 2;
+	      for (i = 1; (i <= n) && ((d = hex_value(r[i])) >= 0); i++)
+		v = (v << 4) | (unsigned long int)d;
+	      if ((i <= n) || (v > 0x10FFFFUL))
+		n = -1;
+	    }
+	  if (n >= 0)                          w = put_utf8(w, v, mode), r += n;
+	  else if (mode & UNESCAPE_EINVAL)     return errno = EINVAL, NULL;
+	  else if (mode & UNESCAPE_VERBATIM)   r--;
+	  else if (mode & UNESCAPE_IGNORE)     *w++ = '\\', r--;
+	  break;
+	  
+	default:
+	  if (('0' <= *r) && (*r <= '7'))
+	    {
+	      /* \###: one to three octal digits. */
+	      v = (unsigned long int)(*r - '0');
+	      for (i = 0; (i < 2) && ('0' <= r[1]) && (r[1] <= '7'); i++)
+		v = (v << 3) | (unsigned long int)(*++r - '0');
+	      w = put_utf8(w, v, mode);
+	    }
+	  else if ((n = control_name(r, &c)))  *w++ = c, r += n - 1;
+	  else if (mode & UNESCAPE_EINVAL)     return errno = EINVAL, NULL;
+	  else if (mode & UNESCAPE_VERBATIM)   r--; /* The loop then copies `*r` as is. */
+	  else if (mode & UNESCAPE_IGNORE)     *w++ = '\\', r--;
+	  break;
+	}
+  
+  return *w = 0, w;
+}
+
author	Mattias Andrée <maandree@operamail.com>	2015-11-14 01:21:32 +0100
committer	Mattias Andrée <maandree@operamail.com>	2015-11-14 01:21:53 +0100
commit	98806a86e043ca1f506a23bfbba89d4f308a0bfc (patch)
tree	1dd6060a261689874509a85785d33c044d4f4ca4
parent	improve performance on strstr and wcsstr if the needle is one character wide (diff)
download	slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.gz slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.bz2 slibc-98806a86e043ca1f506a23bfbba89d4f308a0bfc.tar.xz