path: root/text.c



/* See LICENSE file for copyright and license details. */
#include "common.h"


/**
 * Read everything from an open file descriptor and split it into lines
 *
 * Each <newline> terminates one line. NUL bytes found in the input are
 * reported and replaced with <space>, and a missing <newline> at the
 * end of a non-empty file is reported and added. Lines longer than 2048
 * bytes (including the <newline>) are reported but otherwise accepted.
 *
 * @param   fd          File descriptor to read from; it is not closed here
 * @param   fname       File name used in diagnostics; the pointer itself
 *                      (not a copy of the string) is stored in each line
 * @param   nest_level  Value stored in the `nest_level` field of each line
 * @param   nlinesp     Output parameter for the number of lines found
 * @return              Array of `*nlinesp` lines, each holding its own
 *                      NUL-terminated copy of the line's text without the
 *                      <newline>, or `NULL` if the file contains no lines
 */
struct line *
load_text_file(int fd, const char *fname, int nest_level, size_t *nlinesp)
{
	struct line *lines;
	char *buf = NULL, *p;
	size_t size = 0;
	size_t len = 0;
	size_t i;
	ssize_t r;

	/* getline(3) may seem like the best way to read line by line,
	 * however, it may terminate before the end of the line is
	 * reached, which we would have to deal with, additionally,
	 * we want to check for null bytes. Therefore we will keep
	 * this simple and use read(3) and scan manually; and as a
	 * bonus we can leave the file descriptor open, and let the
	 * caller that created it close it.
	 */

	i = 0;
	*nlinesp = 0;
	for (;;) {
		/* Grow the buffer in steps of 2048 bytes whenever it is full */
		if (len == size)
			buf = erealloc(buf, size += 2048);
		r = read(fd, &buf[len], size - len);
		if (r > 0)
			len += (size_t)r;
		else if (!r)
			break;
		else if (errno == EINTR)
			continue;
		else
			eprintf("read %s:", fname);

		/* Scan only the bytes that have not been inspected yet; `i`
		 * is deliberately kept between iterations of the outer loop */
		for (; i < len; i++) {
			if (buf[i] == '\n') {
				*nlinesp += 1;
				buf[i] = '\0';
			} else if (buf[i] == '\0') {
				/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
				warnf_undefined(WC_TEXT, "%s:%zu: file contains a NUL byte, this is disallowed, because "
				                         "input files are text files, and causes undefined behaviour",
				                fname, *nlinesp + 1);
				/* make(1) should probably just abort */
				printinfof(WC_TEXT, "this implementation will replace it with a <space>");
				buf[i] = ' ';
			}
		}
	}

	if (len && buf[len - 1] != '\0') { /* LF has been converted to NUL above */
		/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
		warnf_undefined(WC_TEXT, "%s:%zu: is non-empty but does not end with a <newline>, which is "
		                         "required because input files are text files, and omission of it "
		                         "causes undefined behaviour",
		                fname, *nlinesp + 1);
		/* make(1) should probably just abort */
		printinfof(WC_TEXT, "this implementation will add the missing <newline>");
		buf = erealloc(buf, len + 1);
		buf[len++] = '\0';
		*nlinesp += 1;
	}

	/* `buf` now holds `*nlinesp` consecutive NUL-terminated strings */
	lines = *nlinesp ? ecalloc(*nlinesp, sizeof(*lines)) : NULL;
	for (p = buf, i = 0; i < *nlinesp; i++) {
		lines[i].lineno = i + 1;
		lines[i].path = fname;
		lines[i].len = strlen(p);
		lines[i].data = ememdup(p, lines[i].len + 1);
		lines[i].eof = i + 1 == *nlinesp;
		lines[i].nest_level = nest_level;

		if (lines[i].len + 1 > 2048) {
			/* https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_403 */
			/* Report the number of the offending line, not the line count */
			warnf_undefined(WC_TEXT, "%s:%zu: line is, including the <newline> character, longer than "
			                         "2048 bytes which causes undefined behaviour as input files are "
			                         "text files and POSIX only guarantees support for lines up to 2048 "
			                         "bytes long including the <newline> character in text files",
			                fname, i + 1);
		}
		p += lines[i].len + 1;
	}

	free(buf);
	return lines;
}


/**
 * Check that a line is valid UTF-8, and replace each invalid sequence
 * with the UTF-8 encoding of the Unicode replacement character (U+FFFD)
 *
 * A warning is printed for every invalid sequence found. A U+FFFD that
 * is already properly encoded in the line is left alone, without warning.
 *
 * @param  line  The line to check; `line->data` may be reallocated and
 *               `line->len` is updated when a replacement is made, and
 *               `line->data` is then NUL-terminated at `line->len`
 */
void
check_utf8_encoding(struct line *line)
{
	size_t off, r, remaining, tail, new_len;
	uint_least32_t codepoint;
#if GRAPHEME_INVALID_CODEPOINT == 0xFFFD
	unsigned char invalid_codepoint_encoding[] = {0xEF, 0xBF, 0xBD};
#endif

	for (off = 0; off < line->len; off += r) {
		remaining = line->len - off;
		r = grapheme_decode_utf8(&line->data[off], remaining, &codepoint);

		/* grapheme_decode_utf8 is documented to return a length larger
		 * than the input when the input ends in the middle of a multi-byte
		 * sequence; clamp it so that the size arithmetic below cannot wrap */
		if (r > remaining)
			r = remaining;

		if (codepoint == GRAPHEME_INVALID_CODEPOINT &&
		    (r != ELEMSOF(invalid_codepoint_encoding) ||
		     memcmp(&line->data[off], invalid_codepoint_encoding, r))) {

			warnf_unspecified(WC_ENCODING, "%s:%zu: line contains invalid UTF-8", line->path, line->lineno);
			printinfof(WC_ENCODING, "this implementation will replace it the "
			                        "Unicode replacement character (U+FFFD)");

			tail = remaining - r; /* bytes following the invalid sequence */
			new_len = off + ELEMSOF(invalid_codepoint_encoding) + tail;

			/* Only ever grow the buffer here, and do so before moving the
			 * tail: shrinking it first would make the memmove below read
			 * past the end of the new allocation. One extra byte is
			 * reserved for the NUL terminator. */
			if (new_len >= line->len)
				line->data = erealloc(line->data, new_len + 1);

			memmove(&line->data[off + ELEMSOF(invalid_codepoint_encoding)],
			        &line->data[off + r],
			        tail);
			memcpy(&line->data[off], invalid_codepoint_encoding, ELEMSOF(invalid_codepoint_encoding));
			line->data[new_len] = '\0';
			line->len = new_len;

			/* Continue scanning after the inserted replacement character */
			r = ELEMSOF(invalid_codepoint_encoding);
		}
	}
}


/**
 * Print a style warning if a line is wider than `style.max_line_length`
 * columns
 *
 * The width is the sum of the absolute values of wcwidth(3) over the
 * line's decoded codepoints, so a codepoint for which wcwidth(3)
 * returns -1 is counted as one column.
 *
 * @param  line  The line to measure; it is not modified
 */
void
check_column_count(struct line *line)
{
	size_t columns = 0;
	size_t off, r;
	uint_least32_t codepoint;

	if (line->len <= style.max_line_length) /* Column count cannot be more than byte count */
		return;

	for (off = 0; off < line->len; off += r) {
		r = grapheme_decode_utf8(&line->data[off], line->len - off, &codepoint);
		columns += (size_t)abs(wcwidth((wchar_t)codepoint));
	}

	if (columns > style.max_line_length) {
		/* Report the limit that was exceeded; printing the line's own
		 * width here would claim that the line is longer than itself */
		warnf_style(WC_LONG_LINE, "%s:%zu: line is longer than %zu columns",
		            line->path, line->lineno, (size_t)style.max_line_length);
	}
}