diff options
author | Mattias Andrée <maandree@kth.se> | 2022-01-01 12:37:03 +0100 |
---|---|---|
committer | Mattias Andrée <maandree@kth.se> | 2022-01-01 12:37:03 +0100 |
commit | e6e6d879ee6444b39c144118ad1d0ec2804dd41b (patch) | |
tree | b6c23893cbb526746071f8c21fe0adb478729a0e /text.c | |
parent | Change [warn] to [warning] (diff) | |
download | makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.gz makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.bz2 makel-e6e6d879ee6444b39c144118ad1d0ec2804dd41b.tar.xz |
Validate UTF-8 encoding and check for long lines
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to 'text.c')
-rw-r--r-- | text.c | 143 |
1 files changed, 143 insertions, 0 deletions
@@ -0,0 +1,143 @@ +/* See LICENSE file for copyright and license details. */ +#include "common.h" + + +struct line * +load_text_file(int fd, const char *fname, int nest_level, size_t *nlinesp) +{ + struct line *lines; + char *buf = NULL, *p; + size_t size = 0; + size_t len = 0; + size_t i; + ssize_t r; + + /* getline(3) may seem like the best way to read line by line, + * however, it may terminate before the end of the line is + * reached, which we would have to deal with, additionally, + * we want to check for null bytes. Therefore we will keep + * this simple and use read(3) and scan manually; and as a + * bonus we can leave the file descriptor open, and let the + * caller than created it close it. + */ + + i = 0; + *nlinesp = 0; + for (;;) { + if (len == size) + buf = erealloc(buf, size += 2048); + r = read(fd, &buf[len], size - len); + if (r > 0) + len += (size_t)r; + else if (!r) + break; + else if (errno == EINTR) + continue; + else + eprintf("read %s:", fname); + + for (; i < len; i++) { + if (buf[i] == '\n') { + *nlinesp += 1; + buf[i] = '\0'; + } else if (buf[i] == '\0') { + /* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */ + warnf_undefined(WC_TEXT, "%s:%zu: file contains a NUL byte, this is disallowed, because " + "input files are text files, and causes undefined behaviour", + fname, *nlinesp + 1); + /* make(1) should probably just abort */ + printinfof(WC_TEXT, "this implementation will replace it with a <space>"); + buf[i] = ' '; + } + } + } + + if (len && buf[len - 1] != '\0') { /* LF has been converted to NUL above */ + /* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */ + warnf_undefined(WC_TEXT, "%s:%zu: is non-empty but does not end with a <newline>, which is " + "required because input files are text files, and omission of it " + "causes undefined behaviour", + fname, *nlinesp + 1); + /* make(1) should probably just abort */ + printinfof(WC_TEXT, "this implementation will add the missing <newline>"); + buf = erealloc(buf, len + 1); + buf[len++] = '\0'; + *nlinesp += 1; + } + + lines = *nlinesp ? ecalloc(*nlinesp, sizeof(*lines)) : NULL; + for (p = buf, i = 0; i < *nlinesp; i++) { + lines[i].lineno = i + 1; + lines[i].path = fname; + lines[i].len = strlen(p); + lines[i].data = ememdup(p, lines[i].len + 1); + lines[i].eof = i + 1 == *nlinesp; + lines[i].nest_level = nest_level; + + if (lines[i].len + 1 > 2048) { + /* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */ + warnf_undefined(WC_TEXT, "%s:%zu: line is, including the <newline> character, longer than " + "2048 bytes which causes undefined behaviour as input files are " + "text files and POSIX only guarantees support for lines up to 2048 " + "bytes long including the <newline> character in text files", + fname, *nlinesp + 1); + } + p += lines[i].len + 1; + } + + free(buf); + return lines; +} + + +void +check_utf8_encoding(struct line *line) +{ + size_t off, r; + uint_least32_t codepoint; +#if GRAPHEME_INVALID_CODEPOINT == 0xFFFD + unsigned char invalid_codepoint_encoding[] = {0xEF, 0xBF, 0xBD}; +#endif + + for (off = 0; off < line->len; off += r) { + r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint); + + if (codepoint == GRAPHEME_INVALID_CODEPOINT && + (r != ELEMSOF(invalid_codepoint_encoding) || + memcmp(&line->data[off], invalid_codepoint_encoding, r))) { + + warnf_unspecified(WC_ENCODING, "%s:%zu: line contains invalid UTF-8", line->path, line->lineno); + printinfof(WC_ENCODING, "this implementation will replace it the " + "Unicode replacement character (U+FFFD)"); + + line->data = erealloc(line->data, line->len - r + ELEMSOF(invalid_codepoint_encoding)); + memmove(&line->data[off + ELEMSOF(invalid_codepoint_encoding)], + &line->data[off + r], + line->len - off - r); + memcpy(&line->data[off], invalid_codepoint_encoding, ELEMSOF(invalid_codepoint_encoding)); + line->len = line->len - r + ELEMSOF(invalid_codepoint_encoding); + } + } +} + + +void +check_column_count(struct line *line) +{ + size_t columns = 0; + size_t off, r; + uint_least32_t codepoint; + + if (line->len <= style.max_line_length) /* Column count cannot be more than byte count */ + return; + + for (off = 0; off < line->len; off += r) { + r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint); + columns += (size_t)abs(wcwidth((wchar_t)codepoint)); + } + + if (columns > style.max_line_length) { + warnf_style(WC_LONG_LINE, "%s:%zu: line is longer than %zu columns", + line->path, line->lineno, columns); + } +} |