Validate UTF-8 encoding and check for long lines

Signed-off-by: Mattias Andrée <maandree@kth.se>
author: Mattias Andrée <maandree@kth.se> 2022-01-01 12:37:03 +0100
committer: Mattias Andrée <maandree@kth.se> 2022-01-01 12:37:03 +0100
commit: e6e6d879ee6444b39c144118ad1d0ec2804dd41b (patch)
tree: b6c23893cbb526746071f8c21fe0adb478729a0e /text.c
parent: Change [warn] to [warning] (diff)
download: makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.gz
makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.bz2
makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.xz
1 files changed, 143 insertions, 0 deletions
diff --git a/text.c b/text.c
new file mode 100644
index 0000000..e5498eb
--- /dev/null
+++ b/text.c
@@ -0,0 +1,143 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+
+
+struct line *
+load_text_file(int fd, const char *fname, int nest_level, size_t *nlinesp)
+{
+	struct line *lines;
+	char *buf = NULL, *p;
+	size_t size = 0;
+	size_t len = 0;
+	size_t i;
+	ssize_t r;
+
+	/* getline(3) may seem like the best way to read line by line,
+	 * however, it may terminate before the end of the line is
+	 * reached, which we would have to deal with, additionally,
+	 * we want to check for null bytes. Therefore we will keep
+	 * this simple and use read(3) and scan manually; and as a
+	 * bonus we can leave the file descriptor open, and let the
+	 * caller than created it close it.
+	 */
+
+	i = 0;
+	*nlinesp = 0;
+	for (;;) {
+		if (len == size)
+			buf = erealloc(buf, size += 2048);
+		r = read(fd, &buf[len], size - len);
+		if (r > 0)
+			len += (size_t)r;
+		else if (!r)
+			break;
+		else if (errno == EINTR)
+			continue;
+		else
+			eprintf("read %s:", fname);
+
+		for (; i < len; i++) {
+			if (buf[i] == '\n') {
+				*nlinesp += 1;
+				buf[i] = '\0';
+			} else if (buf[i] == '\0') {
+				/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+				warnf_undefined(WC_TEXT, "%s:%zu: file contains a NUL byte, this is disallowed, because "
+				                         "input files are text files, and causes undefined behaviour",
+				                fname, *nlinesp + 1);
+				/* make(1) should probably just abort */
+				printinfof(WC_TEXT, "this implementation will replace it with a <space>");
+				buf[i] = ' ';
+			}
+		}
+	}
+
+	if (len && buf[len - 1] != '\0') { /* LF has been converted to NUL above */
+		/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+		warnf_undefined(WC_TEXT, "%s:%zu: is non-empty but does not end with a <newline>, which is "
+		                         "required because input files are text files, and omission of it "
+		                         "causes undefined behaviour",
+		                fname, *nlinesp + 1);
+		/* make(1) should probably just abort */
+		printinfof(WC_TEXT, "this implementation will add the missing <newline>");
+		buf = erealloc(buf, len + 1);
+		buf[len++] = '\0';
+		*nlinesp += 1;
+	}
+
+	lines = *nlinesp ? ecalloc(*nlinesp, sizeof(*lines)) : NULL;
+	for (p = buf, i = 0; i < *nlinesp; i++) {
+		lines[i].lineno = i + 1;
+		lines[i].path = fname;
+		lines[i].len = strlen(p);
+		lines[i].data = ememdup(p, lines[i].len + 1);
+		lines[i].eof = i + 1 == *nlinesp;
+		lines[i].nest_level = nest_level;
+
+		if (lines[i].len + 1 > 2048) {
+			/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+			warnf_undefined(WC_TEXT, "%s:%zu: line is, including the <newline> character, longer than "
+			                         "2048 bytes which causes undefined behaviour as input files are "
+			                         "text files and POSIX only guarantees support for lines up to 2048 "
+			                         "bytes long including the <newline> character in text files",
+			                fname, *nlinesp + 1);
+		}
+		p += lines[i].len + 1;
+	}
+
+	free(buf);
+	return lines;
+}
+
+
+void
+check_utf8_encoding(struct line *line)
+{
+	size_t off, r;
+	uint_least32_t codepoint;
+#if GRAPHEME_INVALID_CODEPOINT == 0xFFFD
+	unsigned char invalid_codepoint_encoding[] = {0xEF, 0xBF, 0xBD};
+#endif
+
+	for (off = 0; off < line->len; off += r) {
+		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
+
+		if (codepoint == GRAPHEME_INVALID_CODEPOINT &&
+		    (r != ELEMSOF(invalid_codepoint_encoding) ||
+		     memcmp(&line->data[off], invalid_codepoint_encoding, r))) {
+
+			warnf_unspecified(WC_ENCODING, "%s:%zu: line contains invalid UTF-8", line->path, line->lineno);
+			printinfof(WC_ENCODING, "this implementation will replace it the "
+			                        "Unicode replacement character (U+FFFD)");
+
+			line->data = erealloc(line->data, line->len - r + ELEMSOF(invalid_codepoint_encoding));
+			memmove(&line->data[off + ELEMSOF(invalid_codepoint_encoding)],
+			        &line->data[off + r],
+			        line->len - off - r);
+			memcpy(&line->data[off], invalid_codepoint_encoding, ELEMSOF(invalid_codepoint_encoding));
+			line->len = line->len - r + ELEMSOF(invalid_codepoint_encoding);
+		}
+	}
+}
+
+
+void
+check_column_count(struct line *line)
+{
+	size_t columns = 0;
+	size_t off, r;
+	uint_least32_t codepoint;
+
+	if (line->len <= style.max_line_length) /* Column count cannot be more than byte count */
+		return;
+
+	for (off = 0; off < line->len; off += r) {
+		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
+		columns += (size_t)abs(wcwidth((wchar_t)codepoint));
+	}
+
+	if (columns > style.max_line_length) {
+		warnf_style(WC_LONG_LINE, "%s:%zu: line is longer than %zu columns",
+		            line->path, line->lineno, columns);
+	}
+}
author	Mattias Andrée <maandree@kth.se>	2022-01-01 12:37:03 +0100
committer	Mattias Andrée <maandree@kth.se>	2022-01-01 12:37:03 +0100
commit	e6e6d879ee6444b39c144118ad1d0ec2804dd41b (patch)
tree	b6c23893cbb526746071f8c21fe0adb478729a0e /text.c
parent	Change [warn] to [warning] (diff)
download	makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.gz makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.bz2 makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.xz