aboutsummaryrefslogtreecommitdiffstats
path: root/libcharconv_control_characters.c
blob: c45f03e70dcf39973f8b31c13e969c53cce67e63 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
/* See LICENSE file for copyright and license details. */
#include "lib-common.h"
#include <string.h>


/*
 * Mnemonics accepted by libcharconv_control_characters() and the code
 * point each one stands for.
 *
 * The lookup picks the longest mnemonic that matches, so the order of
 * the entries is irrelevant and one mnemonic may be a prefix of another
 * (for example "SO" and "SOH"). The table is read-only.
 */
static const struct {
	uint_least32_t cp; /* code point the mnemonic converts to */
	const char *s;     /* mnemonic as it appears in the input */
} symbols[] = {
	/* Control and format characters */
	{UINT32_C(0x00),    "NUL"},   /* NULL */
	{UINT32_C(0x01),    "SOH"},   /* START OF HEADING */
	{UINT32_C(0x02),    "STX"},   /* START OF TEXT */
	{UINT32_C(0x03),    "ETX"},   /* END OF TEXT */
	{UINT32_C(0x04),    "EOT"},   /* END OF TRANSMISSION */
	{UINT32_C(0x05),    "ENQ"},   /* ENQUIRY */
	{UINT32_C(0x06),    "ACK"},   /* ACKNOWLEDGE */
	{UINT32_C(0x07),    "BEL"},   /* BELL */
	{UINT32_C(0x08),    "BS"},    /* BACKSPACE */
	{UINT32_C(0x09),    "HT"},    /* CHARACTER TABULATION */
	{UINT32_C(0x0A),    "LF"},    /* LINE FEED */
	{UINT32_C(0x0B),    "VT"},    /* LINE TABULATION */
	{UINT32_C(0x0C),    "FF"},    /* FORM FEED */
	{UINT32_C(0x0D),    "CR"},    /* CARRIAGE RETURN */
	{UINT32_C(0x0E),    "SO"},    /* SHIFT OUT */
	{UINT32_C(0x0E),    "SS"},    /* SHIFT OUT (nonstandard spelling, kept so existing input still converts) */
	{UINT32_C(0x0F),    "SI"},    /* SHIFT IN */
	{UINT32_C(0x10),    "DLE"},   /* DATA LINK ESCAPE */
	{UINT32_C(0x11),    "DC1"},   /* DEVICE CONTROL ONE */
	{UINT32_C(0x12),    "DC2"},   /* DEVICE CONTROL TWO */
	{UINT32_C(0x13),    "DC3"},   /* DEVICE CONTROL THREE */
	{UINT32_C(0x14),    "DC4"},   /* DEVICE CONTROL FOUR */
	{UINT32_C(0x15),    "NAK"},   /* NEGATIVE ACKNOWLEDGE */
	{UINT32_C(0x16),    "SYN"},   /* SYNCHRONOUS IDLE */
	{UINT32_C(0x17),    "ETB"},   /* END OF TRANSMISSION BLOCK */
	{UINT32_C(0x18),    "CAN"},   /* CANCEL */
	{UINT32_C(0x19),    "EM"},    /* END OF MEDIUM */
	{UINT32_C(0x1A),    "SUB"},   /* SUBSTITUTE */
	{UINT32_C(0x1B),    "ESC"},   /* ESCAPE */
	{UINT32_C(0x1C),    "FS"},    /* INFORMATION SEPARATOR FOUR */
	{UINT32_C(0x1D),    "GS"},    /* INFORMATION SEPARATOR THREE */
	{UINT32_C(0x1E),    "RS"},    /* INFORMATION SEPARATOR TWO */
	{UINT32_C(0x1F),    "US"},    /* INFORMATION SEPARATOR ONE */
	{UINT32_C(0x7F),    "DEL"},   /* DELETE */
	{UINT32_C(0x0080),  "PAD"},   /* Padding Character */
	{UINT32_C(0x0081),  "HOP"},   /* High Octet Preset */
	{UINT32_C(0x0082),  "BPH"},   /* BREAK PERMITTED HERE */
	{UINT32_C(0x0083),  "NBP"},   /* NO BREAK HERE */
	{UINT32_C(0x0084),  "IND"},   /* INDEX */
	{UINT32_C(0x0085),  "NEL"},   /* NEXT LINE */
	{UINT32_C(0x0086),  "SSA"},   /* START OF SELECTED AREA */
	{UINT32_C(0x0087),  "ESA"},   /* END OF SELECTED AREA */
	{UINT32_C(0x0088),  "HTS"},   /* CHARACTER TABULATION SET */
	{UINT32_C(0x0089),  "HTJ"},   /* CHARACTER TABULATION WITH JUSTIFICATION */
	{UINT32_C(0x008A),  "LTS"},   /* LINE TABULATION SET */
	{UINT32_C(0x008B),  "PLD"},   /* PARTIAL LINE FORWARD */
	{UINT32_C(0x008C),  "PLU"},   /* PARTIAL LINE BACKWARD */
	{UINT32_C(0x008D),  "RI"},    /* REVERSE LINE FEED */
	{UINT32_C(0x008E),  "SS2"},   /* SINGLE SHIFT TWO */
	{UINT32_C(0x008F),  "SS3"},   /* SINGLE SHIFT THREE */
	{UINT32_C(0x0090),  "DCS"},   /* DEVICE CONTROL STRING */
	{UINT32_C(0x0091),  "PU1"},   /* PRIVATE USE ONE */
	{UINT32_C(0x0092),  "PU2"},   /* PRIVATE USE TWO */
	{UINT32_C(0x0093),  "STS"},   /* SET TRANSMIT STATE */
	{UINT32_C(0x0094),  "CCH"},   /* CANCEL CHARACTER */
	{UINT32_C(0x0095),  "MW"},    /* MESSAGE WAITING */
	{UINT32_C(0x0096),  "SPA"},   /* START OF GUARDED AREA */
	{UINT32_C(0x0097),  "EPA"},   /* END OF GUARDED AREA */
	{UINT32_C(0x0098),  "SOS"},   /* START OF STRING */
	{UINT32_C(0x0099),  "SGCI"},  /* Single Graphic Character Introducer */
	{UINT32_C(0x009A),  "SSI"},   /* SINGLE CHARACTER INTRODUCER */
	{UINT32_C(0x009B),  "CSI"},   /* CONTROL SEQUENCE INTRODUCER */
	{UINT32_C(0x009C),  "ST"},    /* STRING TERMINATOR */
	{UINT32_C(0x009D),  "OSC"},   /* OPERATING SYSTEM COMMAND */
	{UINT32_C(0x009E),  "PM"},    /* PRIVACY MESSAGE */
	{UINT32_C(0x009F),  "APC"},   /* APPLICATION PROGRAM COMMAND */
	{UINT32_C(0x200B),  "ZWS"},   /* ZERO WIDTH SPACE */
	{UINT32_C(0x200C),  "ZWNJ"},  /* ZERO WIDTH NON-JOINER */
	{UINT32_C(0x200D),  "ZWJ"},   /* ZERO WIDTH JOINER */
	{UINT32_C(0x200E),  "LTRM"},  /* LEFT-TO-RIGHT MARK */
	{UINT32_C(0x200F),  "RTLM"},  /* RIGHT-TO-LEFT MARK */
	{UINT32_C(0x202A),  "LTRE"},  /* LEFT-TO-RIGHT EMBEDDING */
	{UINT32_C(0x202B),  "RTLE"},  /* RIGHT-TO-LEFT EMBEDDING */
	{UINT32_C(0x202C),  "PDF"},   /* POP DIRECTIONAL FORMATTING */
	{UINT32_C(0x202D),  "LTRO"},  /* LEFT-TO-RIGHT OVERRIDE */
	{UINT32_C(0x202E),  "RTLO"},  /* RIGHT-TO-LEFT OVERRIDE */
	{UINT32_C(0x2060),  "WJ"},    /* WORD JOINER */
	{UINT32_C(0x2066),  "LTRI"},  /* LEFT-TO-RIGHT ISOLATE */
	{UINT32_C(0x2067),  "RTLI"},  /* RIGHT-TO-LEFT ISOLATE */
	{UINT32_C(0x2068),  "FSI"},   /* FIRST STRONG ISOLATE */
	{UINT32_C(0x2069),  "PDI"},   /* POP DIRECTIONAL ISOLATE */
	{UINT32_C(0x206A),  "ISS"},   /* INHIBIT SYMMETRIC SWAPPING */
	{UINT32_C(0x206B),  "ASS"},   /* ACTIVATE SYMMETRIC SWAPPING */
	{UINT32_C(0x206C),  "IAFS"},  /* INHIBIT ARABIC FORM SHAPING */
	{UINT32_C(0x206D),  "AAFS"},  /* ACTIVATE ARABIC FORM SHAPING */
	{UINT32_C(0x206E),  "NADS"},  /* NATIONAL DIGIT SHAPES */
	{UINT32_C(0x206F),  "NODS"},  /* NOMINAL DIGIT SHAPES */
	{UINT32_C(0xFFF9),  "IAA"},   /* INTERLINEAR ANNOTATION ANCHOR */
	{UINT32_C(0xFFFA),  "IAS"},   /* INTERLINEAR ANNOTATION SEPARATOR */
	{UINT32_C(0xFFFB),  "IAT"},   /* INTERLINEAR ANNOTATION TERMINATOR */
	{UINT32_C(0x1BCA0), "SFLO"},  /* SHORTHAND FORMAT LETTER OVERLAP */
	{UINT32_C(0x1BCA1), "SFCO"},  /* SHORTHAND FORMAT CONTINUING OVERLAP */
	{UINT32_C(0x1BCA2), "SFDS"},  /* SHORTHAND FORMAT DOWN STEP */
	{UINT32_C(0x1BCA3), "SFUS"},  /* SHORTHAND FORMAT UP STEP */
	{UINT32_C(0xE0001), "LTAG"},  /* LANGUAGE TAG */
	{UINT32_C(0xE007F), "CTAG"},  /* CANCEL TAG */

	/* Spaces and separators */
	{UINT32_C(0x20),    "SP"},    /* SPACE */
	{UINT32_C(0x00A0),  "NBSP"},  /* NO-BREAK SPACE */
	{UINT32_C(0x2000),  "NQ"},    /* EN QUAD */
	{UINT32_C(0x2001),  "MQ"},    /* EM QUAD */
	{UINT32_C(0x2002),  "NSP"},   /* EN SPACE */
	{UINT32_C(0x2003),  "MSP"},   /* EM SPACE */
	{UINT32_C(0x2004),  "3MSP"},  /* THREE-PER-EM SPACE */
	{UINT32_C(0x2005),  "4MSP"},  /* FOUR-PER-EM SPACE */
	{UINT32_C(0x2006),  "6MSP"},  /* SIX-PER-EM SPACE */
	{UINT32_C(0x2007),  "FSP"},   /* FIGURE SPACE */
	{UINT32_C(0x2008),  "PSP"},   /* PUNCTUATION SPACE */
	{UINT32_C(0x2009),  "TSP"},   /* THIN SPACE */
	{UINT32_C(0x200A),  "HSP"},   /* HAIR SPACE */
	{UINT32_C(0x2028),  "LS"},    /* LINE SEPARATOR */
	{UINT32_C(0x2029),  "PS"},    /* PARAGRAPH SEPARATOR */
	{UINT32_C(0x202F),  "NNBSP"}, /* NARROW NO-BREAK SPACE */
	{UINT32_C(0x205F),  "MMSP"},  /* MEDIUM MATHEMATICAL SPACE */

	/* Hyphenation */
	{UINT32_C(0x00AD),  "SHY"}    /* SOFT HYPHEN */
};


/**
 * Convert a textual representation of a control, format or spacing
 * character found at the start of `s` into the code point it denotes.
 *
 * Two kinds of representation are recognised:
 *  - the three-byte sequences E2 90 80 through E2 90 A1 (the UTF-8
 *    encodings of U+2400 through U+2421); the low six bits of the last
 *    byte give the code point, except that 0x21 is mapped to U+007F;
 *  - the mnemonics listed in `symbols`; when several match, the longest
 *    one is used.
 *
 * @param  s     Input bytes; at most `slen` bytes are read, so the input
 *               does not have to be NUL-terminated
 * @param  slen  Number of bytes available in `s`
 * @param  n     Output: on conversion, the number of bytes the
 *               representation occupies; on LIBCHARCONV_NO_CONVERT, the
 *               number of leading bytes that were rejected; 0 on
 *               LIBCHARCONV_INDETERMINATE
 * @param  cp    Output: the code point; written only if `*ncp` is
 *               non-zero on entry
 * @param  ncp   Input: non-zero if `*cp` may be written;
 *               output: set to 1 on conversion, otherwise left untouched
 * @return       LIBCHARCONV_CONVERTED if a representation starts at `s[0]`;
 *               LIBCHARCONV_CONVERT_IF_END if a mnemonic matched at `s[0]`
 *               but the whole input is also the beginning of a longer
 *               mnemonic; LIBCHARCONV_INDETERMINATE if the whole input is
 *               only the beginning of a representation;
 *               LIBCHARCONV_NO_CONVERT otherwise
 */
enum libcharconv_result
libcharconv_control_characters(const char *s, size_t slen, size_t *n, uint_least32_t *cp, size_t *ncp)
{
	size_t i, len, found, found_len;
	int indeterminate;
	*n = 0;
	/*
	 * NOTE(review): every outcome at an offset other than 0 ends in
	 * LIBCHARCONV_NO_CONVERT, so *n never grows beyond 1 here; whether
	 * the scan was meant to skip further ahead should be confirmed
	 * against the callers before changing it.
	 */
	for (; slen; s++, slen--, ++*n) {
		if (s[0] == '\xE2') {
			/* Possible start of E2 90 80..A1 */
			if (slen == 1u)
				goto incomplete;
			if (s[1] != '\x90')
				goto search;
			if (slen == 2u)
				goto incomplete;
			i = ((const unsigned char *)s)[2];
			if (0x80u > i || i > 0xA1u)
				goto search;
			i &= 0x3Fu;
			/* E2 90 A1 stands for DELETE rather than U+0021 */
			i = i == 0x21u ? 0x7Fu : i;
			goto conv_repr;
		}
	search:
		indeterminate = 0;
		found = SIZE_MAX;
		found_len = 0u;
		for (i = 0u; i < sizeof(symbols) / sizeof(*symbols); i++) {
			len = strlen(symbols[i].s);
			/* Compare only as much as both the mnemonic and the input have */
			if (strncmp(s, symbols[i].s, len < slen ? len : slen))
				continue;
			if (slen < len) {
				/* The input ends in the middle of this mnemonic */
				indeterminate = 1;
				continue;
			}
			/* Prefer the longest complete match, e.g. "SOH" over "SO" */
			if (len > found_len) {
				found = i;
				found_len = len;
			}
		}
		if (found_len)
			goto conv;
		if (*n)
			goto no_conv;
		if (indeterminate)
			return LIBCHARCONV_INDETERMINATE;
	}
no_conv:
	return LIBCHARCONV_NO_CONVERT;

incomplete:
	/*
	 * Truncated E2 90 xx sequence. As with truncated mnemonics, bytes
	 * that have already been rejected are reported first; only input
	 * that is cut short at offset 0 is indeterminate.
	 */
	if (*n)
		goto no_conv;
	return LIBCHARCONV_INDETERMINATE;

conv_repr:
	if (*n)
		goto no_conv;
	if (*ncp)
		*cp = (uint_least32_t)i;
	*n += 3u;
	*ncp = 1u;
	return LIBCHARCONV_CONVERTED;

conv:
	if (*n)
		goto no_conv;
	if (*ncp)
		*cp = symbols[found].cp;
	*n += found_len;
	*ncp = 1u;
	return indeterminate ? LIBCHARCONV_CONVERT_IF_END : LIBCHARCONV_CONVERTED;
}