Validate UTF-8 encoding and check for long lines

Signed-off-by: Mattias Andrée <maandree@kth.se>
author: Mattias Andrée <maandree@kth.se> 2022-01-01 12:37:03 +0100
committer: Mattias Andrée <maandree@kth.se> 2022-01-01 12:37:03 +0100
commit: e6e6d879ee6444b39c144118ad1d0ec2804dd41b (patch)
tree: b6c23893cbb526746071f8c21fe0adb478729a0e
parent: Change [warn] to [warning] (diff)
download: makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.gz
makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.bz2
makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.xz
7 files changed, 199 insertions, 97 deletions
diff --git a/LICENSE b/LICENSE
index c44b2d9..c90d3cc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 ISC License
 
-© 2021 Mattias Andrée <maandree@kth.se>
+© 2021, 2022 Mattias Andrée <maandree@kth.se>
 
 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted, provided that the above
diff --git a/Makefile b/Makefile
index 2c7f513..02aefdc 100644
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,7 @@ include $(CONFIGFILE)
 
 OBJ =\
 	mklint.o\
+	text.o\
 	ui.o
 
 HDR =\
diff --git a/common.h b/common.h
index b2d0a4e..8dcbdc6 100644
--- a/common.h
+++ b/common.h
@@ -1,6 +1,9 @@
 /* See LICENSE file for copyright and license details. */
+#include <locale.h>
+
 #include <libsimple.h>
 #include <libsimple-arg.h>
+#include <grapheme.h>
 
 #define EXIT_STYLE         1
 #define EXIT_WARNING       2
@@ -15,12 +18,15 @@
 	X(WC_MAKEFILE, "makefile", INFORM)\
 	X(WC_EXTRA_MAKEFILE, "extra-makefile", WARN)\
 	X(WC_CMDLINE, "cmdline", WARN)\
-	X(WC_TEXT, "text", WARN)
+	X(WC_TEXT, "text", WARN)\
+	X(WC_ENCODING, "encoding", WARN)\
+	X(WC_LONG_LINE, "long-line", WARN_STYLE)
 
 
 enum action {
 	IGNORE,
 	INFORM,
+	WARN_STYLE,
 	WARN
 };
 
@@ -41,16 +47,30 @@ struct line {
 	size_t len;
 	const char *path;
 	size_t lineno;
+	int eof;
+	int nest_level;
+};
+
+struct style {
+	size_t max_line_length;
 };
 
 
 extern int exit_status;
+extern struct style style;
+
+
+/* text.c */
+struct line *load_text_file(int fd, const char *fname, int nest_level, size_t *nlinesp);
+void check_utf8_encoding(struct line *line);
+void check_column_count(struct line *line);
 
 
 /* ui.c */
 extern struct warning_class_data warning_classes[];
 void xprintwarningf(enum warning_class class, int severity, const char *fmt, ...);
-#define printinfof(CLASS, ...) xprintwarningf(CLASS, EXIT_STYLE, __VA_ARGS__)
+#define printinfof(CLASS, ...) xprintwarningf(CLASS, 0, __VA_ARGS__)
+#define warnf_style(CLASS, ...) xprintwarningf(CLASS, EXIT_STYLE, __VA_ARGS__)
 #define warnf_warning(CLASS, ...) xprintwarningf(CLASS, EXIT_WARNING, __VA_ARGS__)
 #define warnf_unspecified(CLASS, ...) xprintwarningf(CLASS, EXIT_UNSPECIFIED, __VA_ARGS__)
 #define warnf_undefined(CLASS, ...) xprintwarningf(CLASS, EXIT_UNDEFINED, __VA_ARGS__)
diff --git a/config.mk b/config.mk
index d354a28..bf92326 100644
--- a/config.mk
+++ b/config.mk
@@ -5,4 +5,4 @@ CC = cc
 
 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
 CFLAGS   = -std=c99 -Wall -g
-LDFLAGS  = -lsimple
+LDFLAGS  = -lsimple -lgrapheme
diff --git a/mklint.c b/mklint.c
index 1c17eea..7356860 100644
--- a/mklint.c
+++ b/mklint.c
@@ -4,9 +4,13 @@
 NUSAGE(EXIT_ERROR, "[-f makefile]");
 
 
-
 int exit_status = 0;
 
+struct style style = {
+	.max_line_length = 120
+};
+
+
 static const char *const default_makefiles[] = {
 	"makefile",
 	"Makefile"
@@ -69,86 +73,27 @@ cmdline_opt_f(const char *arg, const char **makefile_pathp)
 
 
 static struct line *
-load_file(int fd, const char *fname, size_t *nlinesp)
+load_makefile(const char *path, size_t *nlinesp)
 {
 	struct line *lines;
-	char *buf = NULL, *p;
-	size_t size = 0;
-	size_t len = 0;
-	size_t i;
-	ssize_t r;
-
-	/* getline(3) may seem like the best way to read line by line,
-	 * however, it may terminate before the end of the line is
-	 * reached, which we would have to deal with, additionally,
-	 * we want to check for null bytes. Therefore we will keep
-	 * this simple and use read(3) and scan manually; and as a
-	 * bonus we can leave the file descriptor open, and let the
-	 * caller than created it close it.
-	 */
-
-	i = 0;
-	*nlinesp = 0;
-	for (;;) {
-		if (len == size)
-			buf = erealloc(buf, size += 2048);
-		r = read(fd, &buf[len], size - len);
-		if (r > 0)
-			len += (size_t)r;
-		else if (!r)
-			break;
-		else if (errno == EINTR)
-			continue;
-		else
-			eprintf("read %s:", fname);
-
-		for (; i < len; i++) {
-			if (buf[i] == '\n') {
-				*nlinesp += 1;
-				buf[i] = '\0';
-			} else if (buf[i] == '\0') {
-				/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
-				warnf_undefined(WC_TEXT, "%s:%zu: file contains a NUL byte, this is disallowed, because "
-				                         "input files are text files, and causes undefined behaviour",
-				                fname, *nlinesp + 1);
-				/* make(1) should probably just abort */
-				printinfof(WC_TEXT, "this implementation will replace it with a <space>");
-				buf[i] = ' ';
-			}
-		}
-	}
-
-	if (len && buf[len - 1] != '\0') { /* LF has been converted to NUL above */
-		/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
-		warnf_undefined(WC_TEXT, "%s:%zu: is non-empty but does not end with a <newline>, which is "
-		                         "required because input files are text files, and omission of it "
-		                         "causes undefined behaviour",
-		                fname, *nlinesp + 1);
-		/* make(1) should probably just abort */
-		printinfof(WC_TEXT, "this implementation will add the missing <newline>");
-		buf = erealloc(buf, len + 1);
-		buf[len++] = '\0';
-		*nlinesp += 1;
-	}
+	int fd;
 
-	lines = *nlinesp ? ecalloc(*nlinesp, sizeof(*lines)) : NULL;
-	for (p = buf, i = 0; i < *nlinesp; i++) {
-		lines[i].lineno = i + 1;
-		lines[i].path = fname;
-		lines[i].len = strlen(p);
-		lines[i].data = ememdup(p, lines[i].len + 1);
-		if (lines[i].len + 1 > 2048) {
-			/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
-			warnf_undefined(WC_TEXT, "%s:%zu: line is, including the <newline> character, longer than "
-			                         "2048 bytes which causes undefined behaviour as input files are "
-			                         "text files and POSIX only guarantees support for lines up to 2048 "
-			                         "bytes long including the <newline> character in text files",
-			                fname, *nlinesp + 1);
-		}
-		p += lines[i].len + 1;
+	if (!path) {
+		fd = open_default_makefile(&path);
+	} else if (!strcmp(path, "-")) {
+		/* “A pathname of '-' shall denote the standard input” */
+		fd = dup(STDIN_FILENO);
+		if (fd < 0)
+			eprintf("dup <stdin>:");
+		path = "<stdin>";
+	} else {
+		fd = open(path, O_RDONLY);
+		if (fd < 0)
+			eprintf("open %s O_RDONLY:", path);
 	}
 
-	free(buf);
+	lines = load_text_file(fd, path, 0, nlinesp);
+	close(fd);
 	return lines;
 }
 
@@ -157,9 +102,9 @@ int
 main(int argc, char *argv[])
 {
 	const char *path = NULL;
-	int fd;
 	struct line *lines;
 	size_t nlines;
+	size_t i;
 
 	libsimple_default_failure_exit = EXIT_ERROR;
 
@@ -176,22 +121,14 @@ main(int argc, char *argv[])
 	if (argc)
 		usage();
 
-	if (!path) {
-		fd = open_default_makefile(&path);
-	} else if (!strcmp(path, "-")) {
-		/* “A pathname of '-' shall denote the standard input” */
-		fd = dup(STDIN_FILENO);
-		if (fd < 0)
-			eprintf("dup <stdin>:");
-		path = "<stdin>";
-	} else {
-		fd = open(path, O_RDONLY);
-		if (fd < 0)
-			eprintf("open %s O_RDONLY:", path);
-	}
+	setlocale(LC_ALL, ""); /* Required by wcwidth(3) */
 
-	lines = load_file(fd, path, &nlines);
-	close(fd);
+	lines = load_makefile(path, &nlines);
+
+	for (i = 0; i < nlines; i++) {
+		check_utf8_encoding(&lines[i]);
+		check_column_count(&lines[i]);
+	}
 
 	free(lines);
 	return exit_status;
diff --git a/text.c b/text.c
new file mode 100644
index 0000000..e5498eb
--- /dev/null
+++ b/text.c
@@ -0,0 +1,143 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+
+
+/*
+ * Read all of `fd` and split it into lines, warning about NUL bytes,
+ * a missing final <newline>, and lines longer than 2048 bytes.
+ * `fname` is used in messages and stored (not copied) in each line,
+ * `nest_level` is stored in each line, the line count is stored in
+ * `*nlinesp`; returns the line array, or NULL if there are no lines.
+ */
+struct line *
+load_text_file(int fd, const char *fname, int nest_level, size_t *nlinesp)
+{
+	struct line *lines;
+	char *buf = NULL, *p;
+	size_t size = 0, len = 0, i = 0;
+	ssize_t r;
+
+	/* getline(3) may terminate before the end of the line is reached,
+	 * which we would have to deal with, and we want to check for null
+	 * bytes, so we keep this simple and use read(3) and scan manually;
+	 * as a bonus we can leave the file descriptor open, and let the
+	 * caller that created it close it. */
+	*nlinesp = 0;
+	for (;;) {
+		if (len == size)
+			buf = erealloc(buf, size += 2048);
+		r = read(fd, &buf[len], size - len);
+		if (r > 0)
+			len += (size_t)r;
+		else if (!r)
+			break;
+		else if (errno == EINTR)
+			continue;
+		else
+			eprintf("read %s:", fname);
+
+		for (; i < len; i++) { /* `i` persists, so only newly read bytes are scanned */
+			if (buf[i] == '\n') {
+				*nlinesp += 1;
+				buf[i] = '\0';
+			} else if (buf[i] == '\0') {
+				/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+				warnf_undefined(WC_TEXT, "%s:%zu: file contains a NUL byte, this is disallowed, because "
+				                         "input files are text files, and causes undefined behaviour",
+				                fname, *nlinesp + 1);
+				/* make(1) should probably just abort */
+				printinfof(WC_TEXT, "this implementation will replace it with a <space>");
+				buf[i] = ' ';
+			}
+		}
+	}
+
+	if (len && buf[len - 1] != '\0') { /* LF has been converted to NUL above */
+		/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+		warnf_undefined(WC_TEXT, "%s:%zu: is non-empty but does not end with a <newline>, which is "
+		                         "required because input files are text files, and omission of it "
+		                         "causes undefined behaviour",
+		                fname, *nlinesp + 1);
+		/* make(1) should probably just abort */
+		printinfof(WC_TEXT, "this implementation will add the missing <newline>");
+		buf = erealloc(buf, len + 1);
+		buf[len++] = '\0';
+		*nlinesp += 1;
+	}
+
+	lines = *nlinesp ? ecalloc(*nlinesp, sizeof(*lines)) : NULL;
+	for (p = buf, i = 0; i < *nlinesp; i++) { /* every line in `buf` is NUL-terminated at this point */
+		lines[i].lineno = i + 1;
+		lines[i].path = fname;
+		lines[i].len = strlen(p);
+		lines[i].data = ememdup(p, lines[i].len + 1);
+		lines[i].eof = i + 1 == *nlinesp;
+		lines[i].nest_level = nest_level;
+
+		if (lines[i].len + 1 > 2048) {
+			/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+			warnf_undefined(WC_TEXT, "%s:%zu: line is, including the <newline> character, longer than "
+			                         "2048 bytes which causes undefined behaviour as input files are "
+			                         "text files and POSIX only guarantees support for lines up to 2048 "
+			                         "bytes long including the <newline> character in text files",
+			                fname, lines[i].lineno); /* the offending line, not one past the last line */
+		}
+		p += lines[i].len + 1;
+	}
+
+	free(buf);
+	return lines;
+}
+
+
+/* Warn about invalid UTF-8 in `line` and replace each invalid sequence with U+FFFD;
+ * `line->data` is expected to hold `line->len` bytes plus a NUL byte, and is kept that way */
+void
+check_utf8_encoding(struct line *line)
+{
+	size_t off, r, newlen;
+	uint_least32_t codepoint;
+#if GRAPHEME_INVALID_CODEPOINT == 0xFFFD
+	unsigned char invalid_codepoint_encoding[] = {0xEF, 0xBF, 0xBD};
+#endif
+
+	for (off = 0; off < line->len; off += r) {
+		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
+		r = r < line->len - off ? r : line->len - off; /* never step past the end of the line */
+		if (codepoint == GRAPHEME_INVALID_CODEPOINT && /* a well-formed U+FFFD in the input is fine */
+		    (r != ELEMSOF(invalid_codepoint_encoding) || memcmp(&line->data[off], invalid_codepoint_encoding, r))) {
+			warnf_unspecified(WC_ENCODING, "%s:%zu: line contains invalid UTF-8", line->path, line->lineno);
+			printinfof(WC_ENCODING, "this implementation will replace it with the Unicode replacement character (U+FFFD)");
+			newlen = line->len - r + ELEMSOF(invalid_codepoint_encoding);
+			if (newlen > line->len) /* grow before moving the tail; never shrink before it */
+				line->data = erealloc(line->data, newlen + 1);
+			memmove(&line->data[off + ELEMSOF(invalid_codepoint_encoding)],
+			        &line->data[off + r], line->len - off - r + 1); /* + 1 moves the NUL too */
+			memcpy(&line->data[off], invalid_codepoint_encoding, ELEMSOF(invalid_codepoint_encoding));
+			line->len = newlen;
+			r = ELEMSOF(invalid_codepoint_encoding); /* resume after the replacement, not inside it */
+		}
+	}
+}
+
+
+/* Issue a style warning if `line` is wider than `style.max_line_length` columns, as measured with wcwidth(3) */
+void
+check_column_count(struct line *line)
+{
+	size_t columns = 0, off, r;
+	uint_least32_t codepoint;
+
+	if (line->len <= style.max_line_length) /* Column count cannot be more than byte count */
+		return;
+
+	for (off = 0; off < line->len; off += r) {
+		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
+		columns += (size_t)abs(wcwidth((wchar_t)codepoint)); /* wcwidth(3) gives -1 for non-printable; count as 1 */
+	}
+
+	if (columns > style.max_line_length) {
+		warnf_style(WC_LONG_LINE, "%s:%zu: line is longer than %zu columns",
+		            line->path, line->lineno, style.max_line_length); /* report the limit, not the measured width */
+	}
+}
diff --git a/ui.c b/ui.c
index 5fdc10e..610729b 100644
--- a/ui.c
+++ b/ui.c
@@ -15,7 +15,8 @@ vxprintwarningf(enum warning_class class, int severity, const char *fmt, va_list
 {
 	if (warning_classes[class].action != IGNORE) {
 		fprintf(stderr, "%s: [%s] ", argv0,
-		        warning_classes[class].action == INFORM ? "info" : "warning");
+		        warning_classes[class].action == INFORM ? "info" :
+		        warning_classes[class].action == WARN_STYLE ? "style" : "warning");
 		vfprintf(stderr, fmt, ap);
 		fprintf(stderr, " (-w%s)\n", warning_classes[class].name);
 		if (warning_classes[class].action != INFORM)
author	Mattias Andrée <maandree@kth.se>	2022-01-01 12:37:03 +0100
committer	Mattias Andrée <maandree@kth.se>	2022-01-01 12:37:03 +0100
commit	e6e6d879ee6444b39c144118ad1d0ec2804dd41b (patch)
tree	b6c23893cbb526746071f8c21fe0adb478729a0e
parent	Change [warn] to [warning] (diff)
download	makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.gz makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.bz2 makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.xz