summaryrefslogtreecommitdiffstats
path: root/text.c
diff options
context:
space:
mode:
authorMattias Andrée <maandree@kth.se>2022-01-01 12:37:03 +0100
committerMattias Andrée <maandree@kth.se>2022-01-01 12:37:03 +0100
commite6e6d879ee6444b39c144118ad1d0ec2804dd41b (patch)
treeb6c23893cbb526746071f8c21fe0adb478729a0e /text.c
parentChange [warn] to [warning] (diff)
downloadmakel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.gz
makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.bz2
makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.xz
Validate UTF-8 encoding and check for long lines
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to 'text.c')
-rw-r--r--text.c143
1 files changed, 143 insertions, 0 deletions
diff --git a/text.c b/text.c
new file mode 100644
index 0000000..e5498eb
--- /dev/null
+++ b/text.c
@@ -0,0 +1,143 @@
+/* See LICENSE file for copyright and license details. */
+#include "common.h"
+
+
+/**
+ * Read everything from a file descriptor and split it into lines
+ *
+ * Warnings are issued if the input contains NUL bytes (each one is
+ * replaced with a <space>), if non-empty input does not end with a
+ * <newline> (the last line is kept as if it did), and for each line
+ * that is, including its <newline>, longer than 2048 bytes
+ *
+ * The file descriptor is only read from; it is left open for the
+ * caller to close
+ *
+ * @param   fd          File descriptor to read from
+ * @param   fname       Name of the file; used in messages and stored
+ *                      as is (the pointer, not a copy) in each line's `.path`
+ * @param   nest_level  Value stored in each line's `.nest_level`
+ * @param   nlinesp     Output parameter for the number of lines
+ * @return              Array of `*nlinesp` lines, each holding its own
+ *                      NUL-terminated copy of the line's text, without
+ *                      the <newline>, in `.data`; `NULL` if there are
+ *                      no lines
+ */
+struct line *
+load_text_file(int fd, const char *fname, int nest_level, size_t *nlinesp)
+{
+	struct line *lines;
+	char *buf = NULL, *p;
+	size_t size = 0;
+	size_t len = 0;
+	size_t i;
+	ssize_t r;
+
+	/* getline(3) may seem like the best way to read line by line,
+	 * however, it may terminate before the end of the line is
+	 * reached, which we would have to deal with, additionally,
+	 * we want to check for null bytes. Therefore we will keep
+	 * this simple and use read(2) and scan manually; and as a
+	 * bonus we can leave the file descriptor open, and let the
+	 * caller that created it close it.
+	 */
+
+	i = 0; /* index of the first byte that has not yet been scanned */
+	*nlinesp = 0;
+	for (;;) {
+		if (len == size)
+			buf = erealloc(buf, size += 2048);
+		r = read(fd, &buf[len], size - len);
+		if (r > 0)
+			len += (size_t)r;
+		else if (!r)
+			break;
+		else if (errno == EINTR)
+			continue;
+		else
+			eprintf("read %s:", fname); /* NOTE(review): presumably does not return -- confirm */
+
+		/* Scan only the newly read bytes: count lines, and turn each LF
+		 * into a NUL so that every line becomes a C string in `buf` */
+		for (; i < len; i++) {
+			if (buf[i] == '\n') {
+				*nlinesp += 1;
+				buf[i] = '\0';
+			} else if (buf[i] == '\0') {
+				/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+				warnf_undefined(WC_TEXT, "%s:%zu: file contains a NUL byte, this is disallowed, because "
+				                         "input files are text files, and causes undefined behaviour",
+				                fname, *nlinesp + 1);
+				/* make(1) should probably just abort */
+				printinfof(WC_TEXT, "this implementation will replace it with a <space>");
+				buf[i] = ' ';
+			}
+		}
+	}
+
+	if (len && buf[len - 1] != '\0') { /* LF has been converted to NUL above */
+		/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+		warnf_undefined(WC_TEXT, "%s:%zu: is non-empty but does not end with a <newline>, which is "
+		                         "required because input files are text files, and omission of it "
+		                         "causes undefined behaviour",
+		                fname, *nlinesp + 1);
+		/* make(1) should probably just abort */
+		printinfof(WC_TEXT, "this implementation will add the missing <newline>");
+		buf = erealloc(buf, len + 1);
+		buf[len++] = '\0';
+		*nlinesp += 1;
+	}
+
+	lines = *nlinesp ? ecalloc(*nlinesp, sizeof(*lines)) : NULL;
+	for (p = buf, i = 0; i < *nlinesp; i++) {
+		lines[i].lineno = i + 1;
+		lines[i].path = fname;
+		lines[i].len = strlen(p);
+		lines[i].data = ememdup(p, lines[i].len + 1);
+		lines[i].eof = i + 1 == *nlinesp;
+		lines[i].nest_level = nest_level;
+
+		if (lines[i].len + 1 > 2048) {
+			/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
+			/* Report the number of the offending line, not the total number of lines */
+			warnf_undefined(WC_TEXT, "%s:%zu: line is, including the <newline> character, longer than "
+			                         "2048 bytes which causes undefined behaviour as input files are "
+			                         "text files and POSIX only guarantees support for lines up to 2048 "
+			                         "bytes long including the <newline> character in text files",
+			                fname, lines[i].lineno);
+		}
+		p += lines[i].len + 1;
+	}
+
+	free(buf);
+	return lines;
+}
+
+
+/**
+ * Check that a line is valid UTF-8, and replace each invalid
+ * byte sequence with the Unicode replacement character (U+FFFD)
+ *
+ * A warning is issued for each invalid sequence. A correctly
+ * encoded U+FFFD that is already in the line is left alone
+ * without any warning.
+ *
+ * @param  line  The line to check; `.data` may be reallocated and
+ *               `.len` is updated when a replacement is made, and
+ *               `.data` is then NUL-terminated at the new length
+ */
+void
+check_utf8_encoding(struct line *line)
+{
+	size_t off, r, newlen;
+	uint_least32_t codepoint;
+#if GRAPHEME_INVALID_CODEPOINT == 0xFFFD
+	unsigned char invalid_codepoint_encoding[] = {0xEF, 0xBF, 0xBD};
+#endif
+
+	for (off = 0; off < line->len; off += r) {
+		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
+
+		/* NOTE(review): libgrapheme documents that a sequence cut short by the
+		 * end of the input yields the length it would have needed, which is
+		 * larger than what remains -- confirm against the version in use.
+		 * Clamp so that the size arithmetic below cannot wrap around. */
+		if (r > line->len - off)
+			r = line->len - off;
+
+		if (codepoint == GRAPHEME_INVALID_CODEPOINT &&
+		    (r != ELEMSOF(invalid_codepoint_encoding) ||
+		     memcmp(&line->data[off], invalid_codepoint_encoding, r))) {
+
+			warnf_unspecified(WC_ENCODING, "%s:%zu: line contains invalid UTF-8", line->path, line->lineno);
+			printinfof(WC_ENCODING, "this implementation will replace it the "
+			                        "Unicode replacement character (U+FFFD)");
+
+			newlen = line->len - r + ELEMSOF(invalid_codepoint_encoding);
+
+			/* Make room for the larger of the old and the new text, plus
+			 * the terminating NUL. The buffer must not be shrunk before
+			 * the tail has been moved, or the move would read past its end. */
+			line->data = erealloc(line->data, (newlen > line->len ? newlen : line->len) + 1);
+			memmove(&line->data[off + ELEMSOF(invalid_codepoint_encoding)],
+			        &line->data[off + r],
+			        line->len - off - r);
+			memcpy(&line->data[off], invalid_codepoint_encoding, ELEMSOF(invalid_codepoint_encoding));
+			line->data[newlen] = '\0';
+			line->len = newlen;
+
+			/* Continue after the replacement character: stepping by the old
+			 * length would either land inside it, and it would be "repaired"
+			 * over and over, or skip the byte that follows it */
+			r = ELEMSOF(invalid_codepoint_encoding);
+		}
+	}
+}
+
+
+/**
+ * Warn if a line is wider than `style.max_line_length` columns
+ *
+ * The width of the line is the sum of the widths wcwidth(3)
+ * reports for its codepoints; a codepoint for which wcwidth(3)
+ * returns a negative value is counted as one column
+ *
+ * @param  line  The line to check, it is not modified
+ */
+void
+check_column_count(struct line *line)
+{
+	size_t columns = 0;
+	size_t off, r;
+	uint_least32_t codepoint;
+
+	if (line->len <= style.max_line_length) /* Column count cannot be more than byte count */
+		return;
+
+	for (off = 0; off < line->len; off += r) {
+		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
+		columns += (size_t)abs(wcwidth((wchar_t)codepoint));
+	}
+
+	if (columns > style.max_line_length) {
+		/* The message states the limit that was exceeded, so print the
+		 * limit, not the measured width. The cast keeps the argument in
+		 * agreement with %zu whatever integer type the field has. */
+		warnf_style(WC_LONG_LINE, "%s:%zu: line is longer than %zu columns",
+		            line->path, line->lineno, (size_t)style.max_line_length);
+	}
+}