summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Andrée <maandree@kth.se>2022-01-01 12:37:03 +0100
committerMattias Andrée <maandree@kth.se>2022-01-01 12:37:03 +0100
commite6e6d879ee6444b39c144118ad1d0ec2804dd41b (patch)
treeb6c23893cbb526746071f8c21fe0adb478729a0e
parentChange [warn] to [warning] (diff)
downloadmakel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.gz
makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.bz2
makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.xz
Validate UTF-8 encoding and check for long lines
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r--LICENSE2
-rw-r--r--Makefile1
-rw-r--r--common.h24
-rw-r--r--config.mk2
-rw-r--r--mklint.c121
-rw-r--r--text.c143
-rw-r--r--ui.c3
7 files changed, 199 insertions, 97 deletions
diff --git a/LICENSE b/LICENSE
index c44b2d9..c90d3cc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
ISC License
-© 2021 Mattias Andrée <maandree@kth.se>
+© 2021, 2022 Mattias Andrée <maandree@kth.se>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
diff --git a/Makefile b/Makefile
index 2c7f513..02aefdc 100644
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,7 @@ include $(CONFIGFILE)
OBJ =\
mklint.o\
+ text.o\
ui.o
HDR =\
diff --git a/common.h b/common.h
index b2d0a4e..8dcbdc6 100644
--- a/common.h
+++ b/common.h
@@ -1,6 +1,9 @@
/* See LICENSE file for copyright and license details. */
+#include <locale.h>
+
#include <libsimple.h>
#include <libsimple-arg.h>
+#include <grapheme.h>
#define EXIT_STYLE 1
#define EXIT_WARNING 2
@@ -15,12 +18,15 @@
X(WC_MAKEFILE, "makefile", INFORM)\
X(WC_EXTRA_MAKEFILE, "extra-makefile", WARN)\
X(WC_CMDLINE, "cmdline", WARN)\
- X(WC_TEXT, "text", WARN)
+ X(WC_TEXT, "text", WARN)\
+ X(WC_ENCODING, "encoding", WARN)\
+ X(WC_LONG_LINE, "long-line", WARN_STYLE)
enum action { /* How messages of a warning class are reported; see the label selection in ui.c */
IGNORE, /* Nothing is printed for the class */
INFORM, /* Printed with the "info" label */
+ WARN_STYLE, /* Printed with the "style" label */
WARN /* Printed with the "warning" label */
};
@@ -41,16 +47,30 @@ struct line {
size_t len;
const char *path;
size_t lineno;
+ int eof;
+ int nest_level;
+};
+
+struct style { /* Configurable limits used by the style checks */
+ size_t max_line_length; /* Lines wider than this many columns are reported by check_column_count() */
+};
extern int exit_status;
+extern struct style style;
+
+
+/* text.c */
+struct line *load_text_file(int fd, const char *fname, int nest_level, size_t *nlinesp);
+void check_utf8_encoding(struct line *line);
+void check_column_count(struct line *line);
/* ui.c */
extern struct warning_class_data warning_classes[];
void xprintwarningf(enum warning_class class, int severity, const char *fmt, ...);
-#define printinfof(CLASS, ...) xprintwarningf(CLASS, EXIT_STYLE, __VA_ARGS__)
+#define printinfof(CLASS, ...) xprintwarningf(CLASS, 0, __VA_ARGS__)
+#define warnf_style(CLASS, ...) xprintwarningf(CLASS, EXIT_STYLE, __VA_ARGS__)
#define warnf_warning(CLASS, ...) xprintwarningf(CLASS, EXIT_WARNING, __VA_ARGS__)
#define warnf_unspecified(CLASS, ...) xprintwarningf(CLASS, EXIT_UNSPECIFIED, __VA_ARGS__)
#define warnf_undefined(CLASS, ...) xprintwarningf(CLASS, EXIT_UNDEFINED, __VA_ARGS__)
diff --git a/config.mk b/config.mk
index d354a28..bf92326 100644
--- a/config.mk
+++ b/config.mk
@@ -5,4 +5,4 @@ CC = cc
CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
CFLAGS = -std=c99 -Wall -g
-LDFLAGS = -lsimple
+LDFLAGS = -lsimple -lgrapheme
diff --git a/mklint.c b/mklint.c
index 1c17eea..7356860 100644
--- a/mklint.c
+++ b/mklint.c
@@ -4,9 +4,13 @@
NUSAGE(EXIT_ERROR, "[-f makefile]");
-
int exit_status = 0;
+struct style style = { /* Global style settings, declared extern in common.h */
+ .max_line_length = 120 /* Default limit, in columns, for the long-line check */
+};
+
+
static const char *const default_makefiles[] = {
"makefile",
"Makefile"
@@ -69,86 +73,27 @@ cmdline_opt_f(const char *arg, const char **makefile_pathp)
static struct line *
-load_file(int fd, const char *fname, size_t *nlinesp)
+load_makefile(const char *path, size_t *nlinesp)
{
struct line *lines;
- char *buf = NULL, *p;
- size_t size = 0;
- size_t len = 0;
- size_t i;
- ssize_t r;
-
- /* getline(3) may seem like the best way to read line by line,
- * however, it may terminate before the end of the line is
- * reached, which we would have to deal with, additionally,
- * we want to check for null bytes. Therefore we will keep
- * this simple and use read(3) and scan manually; and as a
- * bonus we can leave the file descriptor open, and let the
- * caller than created it close it.
- */
-
- i = 0;
- *nlinesp = 0;
- for (;;) {
- if (len == size)
- buf = erealloc(buf, size += 2048);
- r = read(fd, &buf[len], size - len);
- if (r > 0)
- len += (size_t)r;
- else if (!r)
- break;
- else if (errno == EINTR)
- continue;
- else
- eprintf("read %s:", fname);
-
- for (; i < len; i++) {
- if (buf[i] == '\n') {
- *nlinesp += 1;
- buf[i] = '\0';
- } else if (buf[i] == '\0') {
- /* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
- warnf_undefined(WC_TEXT, "%s:%zu: file contains a NUL byte, this is disallowed, because "
- "input files are text files, and causes undefined behaviour",
- fname, *nlinesp + 1);
- /* make(1) should probably just abort */
- printinfof(WC_TEXT, "this implementation will replace it with a <space>");
- buf[i] = ' ';
- }
- }
- }
-
- if (len && buf[len - 1] != '\0') { /* LF has been converted to NUL above */
- /* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
- warnf_undefined(WC_TEXT, "%s:%zu: is non-empty but does not end with a <newline>, which is "
- "required because input files are text files, and omission of it "
- "causes undefined behaviour",
- fname, *nlinesp + 1);
- /* make(1) should probably just abort */
- printinfof(WC_TEXT, "this implementation will add the missing <newline>");
- buf = erealloc(buf, len + 1);
- buf[len++] = '\0';
- *nlinesp += 1;
- }
+ int fd;
- lines = *nlinesp ? ecalloc(*nlinesp, sizeof(*lines)) : NULL;
- for (p = buf, i = 0; i < *nlinesp; i++) {
- lines[i].lineno = i + 1;
- lines[i].path = fname;
- lines[i].len = strlen(p);
- lines[i].data = ememdup(p, lines[i].len + 1);
- if (lines[i].len + 1 > 2048) {
- /* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
- warnf_undefined(WC_TEXT, "%s:%zu: line is, including the <newline> character, longer than "
- "2048 bytes which causes undefined behaviour as input files are "
- "text files and POSIX only guarantees support for lines up to 2048 "
- "bytes long including the <newline> character in text files",
- fname, *nlinesp + 1);
- }
- p += lines[i].len + 1;
+ if (!path) {
+ fd = open_default_makefile(&path);
+ } else if (!strcmp(path, "-")) {
+ /* “A pathname of '-' shall denote the standard input” */
+ fd = dup(STDIN_FILENO);
+ if (fd < 0)
+ eprintf("dup <stdin>:");
+ path = "<stdin>";
+ } else {
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ eprintf("open %s O_RDONLY:", path);
}
- free(buf);
+ lines = load_text_file(fd, path, 0, nlinesp);
+ close(fd);
return lines;
}
@@ -157,9 +102,9 @@ int
main(int argc, char *argv[])
{
const char *path = NULL;
- int fd;
struct line *lines;
size_t nlines;
+ size_t i;
libsimple_default_failure_exit = EXIT_ERROR;
@@ -176,22 +121,14 @@ main(int argc, char *argv[])
if (argc)
usage();
- if (!path) {
- fd = open_default_makefile(&path);
- } else if (!strcmp(path, "-")) {
- /* “A pathname of '-' shall denote the standard input” */
- fd = dup(STDIN_FILENO);
- if (fd < 0)
- eprintf("dup <stdin>:");
- path = "<stdin>";
- } else {
- fd = open(path, O_RDONLY);
- if (fd < 0)
- eprintf("open %s O_RDONLY:", path);
- }
+ setlocale(LC_ALL, ""); /* Required by wcwidth(3) */
- lines = load_file(fd, path, &nlines);
- close(fd);
+ lines = load_makefile(path, &nlines);
+
+ for (i = 0; i < nlines; i++) {
+ check_utf8_encoding(&lines[i]);
+ check_column_count(&lines[i]);
+ }
free(lines);
return exit_status;
diff --git a/text.c b/text.c
new file mode 100644
index 0000000..e5498eb
--- /dev/null
+++ b/text.c
@@ -0,0 +1,143 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+
+
+/**
+ * Read all of a text file and split it into lines
+ *
+ * Violations of the POSIX text file requirements (NUL bytes,
+ * missing final <newline>, lines longer than 2048 bytes) are
+ * reported as warnings; NUL bytes are replaced with <space> and
+ * a missing final <newline> is added
+ *
+ * @param   fd          File descriptor to read from; it is left open
+ * @param   fname       File name, used in messages and stored (not
+ *                      copied) in each returned line's `.path`
+ * @param   nest_level  Value stored in each returned line's `.nest_level`
+ * @param   nlinesp     Output parameter for the number of lines
+ * @return              Array of `*nlinesp` lines (`NULL` if there are
+ *                      none); each `.data` is separately allocated and
+ *                      NUL-terminated, without the <newline>
+ */
+struct line *
+load_text_file(int fd, const char *fname, int nest_level, size_t *nlinesp)
+{
+	struct line *lines;
+	char *buf = NULL, *p;
+	size_t size = 0;
+	size_t len = 0;
+	size_t i;
+	ssize_t r;
+
+	/* getline(3) may seem like the best way to read line by line,
+	 * however, it may terminate before the end of the line is
+	 * reached, which we would have to deal with, additionally,
+	 * we want to check for null bytes. Therefore we will keep
+	 * this simple and use read(3) and scan manually; and as a
+	 * bonus we can leave the file descriptor open, and let the
+	 * caller that created it close it.
+	 */
+
+	i = 0;
+	*nlinesp = 0;
+	for (;;) {
+		if (len == size)
+			buf = erealloc(buf, size += 2048);
+		r = read(fd, &buf[len], size - len);
+		if (r > 0)
+			len += (size_t)r;
+		else if (!r)
+			break;
+		else if (errno == EINTR)
+			continue;
+		else
+			eprintf("read %s:", fname);
+
+		/* Scan only the bytes that have not been scanned yet */
+		for (; i < len; i++) {
+			if (buf[i] == '\n') {
+				*nlinesp += 1;
+				buf[i] = '\0';
+			} else if (buf[i] == '\0') {
+				/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+				warnf_undefined(WC_TEXT, "%s:%zu: file contains a NUL byte, this is disallowed, because "
+				                         "input files are text files, and causes undefined behaviour",
+				                fname, *nlinesp + 1);
+				/* make(1) should probably just abort */
+				printinfof(WC_TEXT, "this implementation will replace it with a <space>");
+				buf[i] = ' ';
+			}
+		}
+	}
+
+	if (len && buf[len - 1] != '\0') { /* LF has been converted to NUL above */
+		/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+		warnf_undefined(WC_TEXT, "%s:%zu: is non-empty but does not end with a <newline>, which is "
+		                         "required because input files are text files, and omission of it "
+		                         "causes undefined behaviour",
+		                fname, *nlinesp + 1);
+		/* make(1) should probably just abort */
+		printinfof(WC_TEXT, "this implementation will add the missing <newline>");
+		buf = erealloc(buf, len + 1);
+		buf[len++] = '\0';
+		*nlinesp += 1;
+	}
+
+	lines = *nlinesp ? ecalloc(*nlinesp, sizeof(*lines)) : NULL;
+	for (p = buf, i = 0; i < *nlinesp; i++) {
+		lines[i].lineno = i + 1;
+		lines[i].path = fname;
+		lines[i].len = strlen(p);
+		lines[i].data = ememdup(p, lines[i].len + 1);
+		lines[i].eof = i + 1 == *nlinesp;
+		lines[i].nest_level = nest_level;
+
+		if (lines[i].len + 1 > 2048) {
+			/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+			/* Report the offending line's own number, not the total line count */
+			warnf_undefined(WC_TEXT, "%s:%zu: line is, including the <newline> character, longer than "
+			                         "2048 bytes which causes undefined behaviour as input files are "
+			                         "text files and POSIX only guarantees support for lines up to 2048 "
+			                         "bytes long including the <newline> character in text files",
+			                fname, lines[i].lineno);
+		}
+		p += lines[i].len + 1;
+	}
+
+	free(buf);
+	return lines;
+}
+
+
+/**
+ * Check that a line is valid UTF-8, and replace each invalid
+ * sequence with the UTF-8 encoding of U+FFFD
+ *
+ * A warning is printed for every invalid sequence found;
+ * `line->data` may be reallocated, `line->len` is updated, and
+ * `line->data` stays NUL-terminated
+ *
+ * @param  line  The line to check and, if necessary, repair
+ */
+void
+check_utf8_encoding(struct line *line)
+{
+	size_t off, r, newlen;
+	uint_least32_t codepoint;
+#if GRAPHEME_INVALID_CODEPOINT == 0xFFFD
+	unsigned char invalid_codepoint_encoding[] = {0xEF, 0xBF, 0xBD};
+#endif
+
+	for (off = 0; off < line->len; off += r) {
+		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
+
+		if (codepoint != GRAPHEME_INVALID_CODEPOINT)
+			continue;
+
+		/* A sequence truncated by the end of the line may be reported as
+		 * longer than what remains; never touch bytes beyond the line */
+		if (r > line->len - off)
+			r = line->len - off;
+
+		/* A correctly encoded U+FFFD already in the input is not an error */
+		if (r == ELEMSOF(invalid_codepoint_encoding) &&
+		    !memcmp(&line->data[off], invalid_codepoint_encoding, r))
+			continue;
+
+		warnf_unspecified(WC_ENCODING, "%s:%zu: line contains invalid UTF-8", line->path, line->lineno);
+		printinfof(WC_ENCODING, "this implementation will replace it with the "
+		                        "Unicode replacement character (U+FFFD)");
+
+		newlen = line->len - r + ELEMSOF(invalid_codepoint_encoding);
+		/* Grow before moving; when the replacement is not longer than the
+		 * invalid sequence the existing allocation is already big enough,
+		 * and shrinking first would discard the tail that is to be moved */
+		if (newlen > line->len)
+			line->data = erealloc(line->data, newlen + 1);
+		/* Move the rest of the line together with its terminating NUL byte */
+		memmove(&line->data[off + ELEMSOF(invalid_codepoint_encoding)],
+		        &line->data[off + r],
+		        line->len - off - r + 1);
+		memcpy(&line->data[off], invalid_codepoint_encoding, ELEMSOF(invalid_codepoint_encoding));
+		line->len = newlen;
+
+		/* Continue after the inserted replacement, otherwise its own
+		 * continuation bytes would be rescanned and flagged as invalid */
+		r = ELEMSOF(invalid_codepoint_encoding);
+	}
+}
+
+
+/**
+ * Warn if a line is wider than `style.max_line_length` columns
+ *
+ * The width is the sum of the absolute value of wcwidth(3) over
+ * the line's codepoints, so a codepoint for which wcwidth(3)
+ * returns -1 counts as one column
+ *
+ * @param  line  The line to measure; it is not modified
+ */
+void
+check_column_count(struct line *line)
+{
+	size_t columns = 0;
+	size_t off, r;
+	uint_least32_t codepoint;
+
+	if (line->len <= style.max_line_length) /* Column count cannot be more than byte count */
+		return;
+
+	for (off = 0; off < line->len; off += r) {
+		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
+		columns += (size_t)abs(wcwidth((wchar_t)codepoint));
+	}
+
+	if (columns > style.max_line_length) {
+		/* The message states the limit that was exceeded, not the measured width */
+		warnf_style(WC_LONG_LINE, "%s:%zu: line is longer than %zu columns",
+		            line->path, line->lineno, style.max_line_length);
+	}
+}
diff --git a/ui.c b/ui.c
index 5fdc10e..610729b 100644
--- a/ui.c
+++ b/ui.c
@@ -15,7 +15,8 @@ vxprintwarningf(enum warning_class class, int severity, const char *fmt, va_list
{
if (warning_classes[class].action != IGNORE) {
fprintf(stderr, "%s: [%s] ", argv0,
- warning_classes[class].action == INFORM ? "info" : "warning");
+ warning_classes[class].action == INFORM ? "info" :
+ warning_classes[class].action == WARN_STYLE ? "style" : "warning");
vfprintf(stderr, fmt, ap);
fprintf(stderr, " (-w%s)\n", warning_classes[class].name);
if (warning_classes[class].action != INFORM)