aboutsummaryrefslogtreecommitdiffstats
path: root/libcharconv_control_characters.c
diff options
context:
space:
mode:
authorMattias Andrée <m@maandree.se>2026-01-25 18:37:59 +0100
committerMattias Andrée <m@maandree.se>2026-01-25 18:37:59 +0100
commit87f373ca116abbb5a7ba8e6ad7111082e8dfb128 (patch)
tree0943f33e352ae98297cfd55cc058acd7b0c9609d /libcharconv_control_characters.c
parentAdd symbols (diff)
downloadcharconv-87f373ca116abbb5a7ba8e6ad7111082e8dfb128.tar.gz
charconv-87f373ca116abbb5a7ba8e6ad7111082e8dfb128.tar.bz2
charconv-87f373ca116abbb5a7ba8e6ad7111082e8dfb128.tar.xz
Add control characters
Signed-off-by: Mattias Andrée <m@maandree.se>
Diffstat (limited to 'libcharconv_control_characters.c')
-rw-r--r--libcharconv_control_characters.c169
1 files changed, 169 insertions, 0 deletions
diff --git a/libcharconv_control_characters.c b/libcharconv_control_characters.c
new file mode 100644
index 0000000..55e4bbf
--- /dev/null
+++ b/libcharconv_control_characters.c
@@ -0,0 +1,169 @@
+/* See LICENSE file for copyright and license details. */
+#include "lib-common.h"
+#include <string.h>
+
+
+/*
+ * Lookup table pairing each ASCII mnemonic with the code point it denotes.
+ *
+ * It covers the C0 and C1 control codes, DELETE, Unicode format
+ * characters, the space characters and SOFT HYPHEN. Each trailing
+ * comment gives the name of the character.
+ *
+ * Entry order is not significant for lookup: the scanner in this file
+ * examines every entry and keeps the longest mnemonic that matches.
+ *
+ * The table is read-only, hence `const`.
+ */
+static const struct {
+	uint_least32_t cp; /* code point produced when the mnemonic is matched */
+	const char *s; /* mnemonic compared against the input bytes */
+} symbols[] = {
+	{UINT32_C(0x00), "NUL"}, /* NULL */
+	{UINT32_C(0x01), "SOH"}, /* START OF HEADING */
+	{UINT32_C(0x02), "STX"}, /* START OF TEXT */
+	{UINT32_C(0x03), "ETX"}, /* END OF TEXT */
+	{UINT32_C(0x04), "EOT"}, /* END OF TRANSMISSION */
+	{UINT32_C(0x05), "ENQ"}, /* ENQUIRY */
+	{UINT32_C(0x06), "ACK"}, /* ACKNOWLEDGE */
+	{UINT32_C(0x07), "BEL"}, /* BELL */
+	{UINT32_C(0x08), "BS"}, /* BACKSPACE */
+	{UINT32_C(0x09), "HT"}, /* CHARACTER TABULATION */
+	{UINT32_C(0x0A), "LF"}, /* LINE FEED */
+	{UINT32_C(0x0B), "VT"}, /* LINE TABULATION */
+	{UINT32_C(0x0C), "FF"}, /* FORM FEED */
+	{UINT32_C(0x0D), "CR"}, /* CARRIAGE RETURN */
+	{UINT32_C(0x0E), "SO"}, /* SHIFT OUT */
+	{UINT32_C(0x0F), "SI"}, /* SHIFT IN */
+	{UINT32_C(0x10), "DLE"}, /* DATA LINK ESCAPE */
+	{UINT32_C(0x11), "DC1"}, /* DEVICE CONTROL ONE */
+	{UINT32_C(0x12), "DC2"}, /* DEVICE CONTROL TWO */
+	{UINT32_C(0x13), "DC3"}, /* DEVICE CONTROL THREE */
+	{UINT32_C(0x14), "DC4"}, /* DEVICE CONTROL FOUR */
+	{UINT32_C(0x15), "NAK"}, /* NEGATIVE ACKNOWLEDGE */
+	{UINT32_C(0x16), "SYN"}, /* SYNCHRONOUS IDLE */
+	{UINT32_C(0x17), "ETB"}, /* END OF TRANSMISSION BLOCK */
+	{UINT32_C(0x18), "CAN"}, /* CANCEL */
+	{UINT32_C(0x19), "EM"}, /* END OF MEDIUM */
+	{UINT32_C(0x1A), "SUB"}, /* SUBSTITUTE */
+	{UINT32_C(0x1B), "ESC"}, /* ESCAPE */
+	{UINT32_C(0x1C), "FS"}, /* INFORMATION SEPARATOR FOUR */
+	{UINT32_C(0x1D), "GS"}, /* INFORMATION SEPARATOR THREE */
+	{UINT32_C(0x1E), "RS"}, /* INFORMATION SEPARATOR TWO */
+	{UINT32_C(0x1F), "US"}, /* INFORMATION SEPARATOR ONE */
+	{UINT32_C(0x7F), "DEL"}, /* DELETE */
+	{UINT32_C(0x0080), "PAD"}, /* Padding Character */
+	{UINT32_C(0x0081), "HOP"}, /* High Octet Preset */
+	{UINT32_C(0x0082), "BPH"}, /* BREAK PERMITTED HERE */
+	{UINT32_C(0x0083), "NBH"}, /* NO BREAK HERE */
+	{UINT32_C(0x0084), "IND"}, /* INDEX */
+	{UINT32_C(0x0085), "NEL"}, /* NEXT LINE */
+	{UINT32_C(0x0086), "SSA"}, /* START OF SELECTED AREA */
+	{UINT32_C(0x0087), "ESA"}, /* END OF SELECTED AREA */
+	{UINT32_C(0x0088), "HTS"}, /* CHARACTER TABULATION SET */
+	{UINT32_C(0x0089), "HTJ"}, /* CHARACTER TABULATION WITH JUSTIFICATION */
+	{UINT32_C(0x008A), "LTS"}, /* LINE TABULATION SET; NOTE(review): ECMA-48 calls this VTS, confirm LTS is intended */
+	{UINT32_C(0x008B), "PLD"}, /* PARTIAL LINE FORWARD */
+	{UINT32_C(0x008C), "PLU"}, /* PARTIAL LINE BACKWARD */
+	{UINT32_C(0x008D), "RI"}, /* REVERSE LINE FEED */
+	{UINT32_C(0x008E), "SS2"}, /* SINGLE SHIFT TWO */
+	{UINT32_C(0x008F), "SS3"}, /* SINGLE SHIFT THREE */
+	{UINT32_C(0x0090), "DCS"}, /* DEVICE CONTROL STRING */
+	{UINT32_C(0x0091), "PU1"}, /* PRIVATE USE ONE */
+	{UINT32_C(0x0092), "PU2"}, /* PRIVATE USE TWO */
+	{UINT32_C(0x0093), "STS"}, /* SET TRANSMIT STATE */
+	{UINT32_C(0x0094), "CCH"}, /* CANCEL CHARACTER */
+	{UINT32_C(0x0095), "MW"}, /* MESSAGE WAITING */
+	{UINT32_C(0x0096), "SPA"}, /* START OF GUARDED AREA */
+	{UINT32_C(0x0097), "EPA"}, /* END OF GUARDED AREA */
+	{UINT32_C(0x0098), "SOS"}, /* START OF STRING */
+	{UINT32_C(0x0099), "SGCI"}, /* Single Graphic Character Introducer; NOTE(review): the Unicode alias is SGC, confirm SGCI is intended */
+	{UINT32_C(0x009A), "SCI"}, /* SINGLE CHARACTER INTRODUCER */
+	{UINT32_C(0x009B), "CSI"}, /* CONTROL SEQUENCE INTRODUCER */
+	{UINT32_C(0x009C), "ST"}, /* STRING TERMINATOR */
+	{UINT32_C(0x009D), "OSC"}, /* OPERATING SYSTEM COMMAND */
+	{UINT32_C(0x009E), "PM"}, /* PRIVACY MESSAGE */
+	{UINT32_C(0x009F), "APC"}, /* APPLICATION PROGRAM COMMAND */
+	{UINT32_C(0x200B), "ZWS"}, /* ZERO WIDTH SPACE */
+	{UINT32_C(0x200C), "ZWNJ"}, /* ZERO WIDTH NON-JOINER */
+	{UINT32_C(0x200D), "ZWJ"}, /* ZERO WIDTH JOINER */
+	{UINT32_C(0x200E), "LTRM"}, /* LEFT-TO-RIGHT MARK */
+	{UINT32_C(0x200F), "RTLM"}, /* RIGHT-TO-LEFT MARK */
+	{UINT32_C(0x202A), "LTRE"}, /* LEFT-TO-RIGHT EMBEDDING */
+	{UINT32_C(0x202B), "RTLE"}, /* RIGHT-TO-LEFT EMBEDDING */
+	{UINT32_C(0x202C), "PDF"}, /* POP DIRECTIONAL FORMATTING */
+	{UINT32_C(0x202D), "LTRO"}, /* LEFT-TO-RIGHT OVERRIDE */
+	{UINT32_C(0x202E), "RTLO"}, /* RIGHT-TO-LEFT OVERRIDE */
+	{UINT32_C(0x2060), "WJ"}, /* WORD JOINER */
+	{UINT32_C(0x2066), "LTRI"}, /* LEFT-TO-RIGHT ISOLATE */
+	{UINT32_C(0x2067), "RTLI"}, /* RIGHT-TO-LEFT ISOLATE */
+	{UINT32_C(0x2068), "FSI"}, /* FIRST STRONG ISOLATE */
+	{UINT32_C(0x2069), "PDI"}, /* POP DIRECTIONAL ISOLATE */
+	{UINT32_C(0x206A), "ISS"}, /* INHIBIT SYMMETRIC SWAPPING */
+	{UINT32_C(0x206B), "ASS"}, /* ACTIVATE SYMMETRIC SWAPPING */
+	{UINT32_C(0x206C), "IAFS"}, /* INHIBIT ARABIC FORM SHAPING */
+	{UINT32_C(0x206D), "AAFS"}, /* ACTIVATE ARABIC FORM SHAPING */
+	{UINT32_C(0x206E), "NADS"}, /* NATIONAL DIGIT SHAPES */
+	{UINT32_C(0x206F), "NODS"}, /* NOMINAL DIGIT SHAPES */
+	{UINT32_C(0xFFF9), "IAA"}, /* INTERLINEAR ANNOTATION ANCHOR */
+	{UINT32_C(0xFFFA), "IAS"}, /* INTERLINEAR ANNOTATION SEPARATOR */
+	{UINT32_C(0xFFFB), "IAT"}, /* INTERLINEAR ANNOTATION TERMINATOR */
+	{UINT32_C(0x1BCA0), "SFLO"}, /* SHORTHAND FORMAT LETTER OVERLAP */
+	{UINT32_C(0x1BCA1), "SFCO"}, /* SHORTHAND FORMAT CONTINUING OVERLAP */
+	{UINT32_C(0x1BCA2), "SFDS"}, /* SHORTHAND FORMAT DOWN STEP */
+	{UINT32_C(0x1BCA3), "SFUS"}, /* SHORTHAND FORMAT UP STEP */
+	{UINT32_C(0xE0001), "LTAG"}, /* LANGUAGE TAG */
+	{UINT32_C(0xE007F), "CTAG"}, /* CANCEL TAG */
+
+	{UINT32_C(0x20), "SP"}, /* SPACE */
+	{UINT32_C(0x00A0), "NBSP"}, /* NO-BREAK SPACE */
+	{UINT32_C(0x2000), "NQ"}, /* EN QUAD */
+	{UINT32_C(0x2001), "MQ"}, /* EM QUAD */
+	{UINT32_C(0x2002), "NSP"}, /* EN SPACE */
+	{UINT32_C(0x2003), "MSP"}, /* EM SPACE */
+	{UINT32_C(0x2004), "3MSP"}, /* THREE-PER-EM SPACE */
+	{UINT32_C(0x2005), "4MSP"}, /* FOUR-PER-EM SPACE */
+	{UINT32_C(0x2006), "6MSP"}, /* SIX-PER-EM SPACE */
+	{UINT32_C(0x2007), "FSP"}, /* FIGURE SPACE */
+	{UINT32_C(0x2008), "PSP"}, /* PUNCTUATION SPACE */
+	{UINT32_C(0x2009), "TSP"}, /* THIN SPACE */
+	{UINT32_C(0x200A), "HSP"}, /* HAIR SPACE */
+	{UINT32_C(0x2028), "LS"}, /* LINE SEPARATOR */
+	{UINT32_C(0x2029), "PS"}, /* PARAGRAPH SEPARATOR */
+	{UINT32_C(0x202F), "NNBSP"}, /* NARROW NO-BREAK SPACE */
+	{UINT32_C(0x205F), "MMSP"}, /* MEDIUM MATHEMATICAL SPACE */
+
+	{UINT32_C(0x00AD), "SHY"} /* SOFT HYPHEN */
+};
+
+
+/**
+ * Recognise a mnemonic from `symbols` at the very beginning of the
+ * input and translate it to the code point it denotes
+ *
+ * Only the first `slen` bytes of `s` are examined, so `s` does not
+ * have to be NUL-terminated. If several mnemonics match, the longest
+ * one wins.
+ *
+ * @param   s     Input bytes
+ * @param   slen  Number of bytes available in `s`
+ * @param   n     Output. Set to the length of the matched mnemonic on
+ *                a match, to 1 if the input is non-empty and its
+ *                beginning neither matches nor is a prefix of any
+ *                mnemonic, and to 0 otherwise
+ * @param   cp    Output. Receives the code point on a match, but only
+ *                if `*ncp` was non-zero on entry
+ * @param   ncp   Read on entry (presumably the capacity of `cp`;
+ *                TODO confirm against the header); set to 1 on a
+ *                match, otherwise left unchanged
+ * @return        LIBCHARCONV_CONVERTED if a mnemonic matched and no
+ *                longer mnemonic could still be completed by more input;
+ *                LIBCHARCONV_CONVERT_IF_END if a mnemonic matched but
+ *                the whole input is also a proper prefix of a longer one;
+ *                LIBCHARCONV_INDETERMINATE if nothing matched but the
+ *                whole input is a proper prefix of some mnemonic;
+ *                LIBCHARCONV_NO_CONVERT otherwise
+ */
+enum libcharconv_result
+libcharconv_control_characters(const char *s, size_t slen, size_t *n, uint_least32_t *cp, size_t *ncp)
+{
+	size_t i, len, cmp_len, best = 0u, best_len = 0u;
+	int indeterminate = 0;
+
+	*n = 0u;
+	if (!slen)
+		return LIBCHARCONV_NO_CONVERT;
+
+	/* A single pass over the table suffices: only a match at the
+	 * first byte can ever be converted, and every other outcome
+	 * depends solely on what this pass finds */
+	for (i = 0u; i < sizeof(symbols) / sizeof(*symbols); i++) {
+		len = strlen(symbols[i].s);
+		cmp_len = len < slen ? len : slen;
+		if (strncmp(s, symbols[i].s, cmp_len))
+			continue;
+		if (len > slen) {
+			/* The input ran out inside this mnemonic;
+			 * more input could still complete it */
+			indeterminate = 1;
+		} else if (len > best_len) {
+			best = i;
+			best_len = len;
+		}
+	}
+
+	if (!best_len) {
+		if (indeterminate)
+			return LIBCHARCONV_INDETERMINATE;
+		/* The first byte starts no mnemonic: report it as unconvertible */
+		*n = 1u;
+		return LIBCHARCONV_NO_CONVERT;
+	}
+
+	if (*ncp)
+		*cp = symbols[best].cp;
+	*n = best_len;
+	*ncp = 1u;
+	return indeterminate ? LIBCHARCONV_CONVERT_IF_END : LIBCHARCONV_CONVERTED;
+}