path: root/src/mds-kbdc/raw-data.c


                                 
/**
 * mds — A micro-display server
 * Copyright © 2014, 2015, 2016, 2017  Mattias Andrée (maandree@kth.se)
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
#include "raw-data.h"

#include "globals.h"
#include "string.h"

#include <libmdsserver/macros.h>

#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>


/**
 * Put a `mds_kbdc_source_code_t` into its empty state
 *
 * All buffers are marked as unallocated, the line count
 * is zeroed, and no duplicates are registered.
 *
 * @param  this  The `mds_kbdc_source_code_t*` to initialise
 */
void
mds_kbdc_source_code_initialise(mds_kbdc_source_code_t *restrict this)
{
	/* Counters. */
	this->duplicates = 0;
	this->line_count = 0;

	/* Raw buffers. */
	this->real_content = NULL;
	this->content = NULL;

	/* Line indices into the raw buffers. */
	this->real_lines = NULL;
	this->lines = NULL;
}


/**
 * Release all data in a `mds_kbdc_source_code_t*`
 *
 * If duplicates of the structure have been registered with
 * `mds_kbdc_source_code_dup`, one duplicate is unregistered
 * and nothing is released. The data is only released when
 * no duplicates remain.
 *
 * @param  this  The `mds_kbdc_source_code_t*`, may be `NULL`
 */
void
mds_kbdc_source_code_destroy(mds_kbdc_source_code_t *restrict this)
{
	if (!this)
		return;

	/* Only decrement while there is something to decrement, so that the
	   counter is left at zero (rather than underflowed) after the final
	   release and the structure can safely be filled in again. */
	if (this->duplicates) {
		this->duplicates--;
		return;
	}

	free(this->lines);
	this->lines = NULL;
	free(this->real_lines);
	this->real_lines = NULL;
	free(this->content);
	this->content = NULL;
	free(this->real_content);
	this->real_content = NULL;
}


/**
 * Release all data in a `mds_kbdc_source_code_t*`, and free
 * the structure itself
 *
 * If duplicates of the structure have been registered, one
 * duplicate is unregistered and nothing is released.
 *
 * @param  this  The `mds_kbdc_source_code_t*`, may be `NULL`
 */
void
mds_kbdc_source_code_free(mds_kbdc_source_code_t *restrict this)
{
	if (this == NULL)
		return;

	/* Somebody else still holds a reference. */
	if (this->duplicates != 0) {
		this->duplicates--;
		return;
	}

	/* Last reference: drop the buffers, then the structure. */
	free(this->real_content);
	free(this->content);
	free(this->real_lines);
	free(this->lines);
	free(this);
}

/**
 * Register another holder of a `mds_kbdc_source_code_t*`
 *
 * No data is copied, only the duplicate counter of the
 * structure is incremented.
 *
 * @param   this  The `mds_kbdc_source_code_t*`
 * @return        `this` is returned
 */
mds_kbdc_source_code_t *
mds_kbdc_source_code_dup(mds_kbdc_source_code_t *restrict this)
{
	this->duplicates += 1;
	return this;
}


/**
 * Read the entire content of a file, retrying reads that are interrupted
 *
 * @param   pathname  The file to read
 * @param   size      Output parameter for the size of the read content, in char:s
 * @return            The read content (not NUL-terminated), `NULL` on error
 */
static char *
read_file(const char *restrict pathname, size_t *restrict size)
{
	size_t capacity = 8096;
	size_t used = 0;
	char *restrict content = NULL;
	char *restrict old = NULL;
	ssize_t got;
	int fd = -1;

	/* Get an initial buffer, then open the file to compile. */
	fail_if (xmalloc(content, capacity, char));
	fail_if ((fd = open(pathname, O_RDONLY)) < 0);

	for (;;) {
		/* Keep at least 2048 char:s of free space available for each read. */
		if (capacity - used < 2048)
			fail_if (xxrealloc(old, content, capacity <<= 1, char));

		got = read(fd, content + used, (capacity - used) * sizeof(char));
		if (got == 0) {
			/* End of file. */
			break;
		} else if (got > 0) {
			used += (size_t)got;
		} else if (errno != EINTR) {
			/* Real error; an interrupted read is simply retried. */
			fail_if (got < 0);
		}
	}

	/* Trim the allocation to the amount read. An empty file
	   simply keeps the allocation it already has. */
	if (used)
		fail_if (xxrealloc(old, content, used, char));

	xclose(fd);

	*size = used;
	return content;

fail:
	xperror(*argv);
	free(old);
	free(content);
	if (fd >= 0)
		xclose(fd);
	return NULL;
}


/**
 * Find the end of a function call
 *
 * A function call is a name made up of ASCII letters, digits and
 * underscores, directly followed by a parenthesised argument list.
 *
 * @param   content  The code
 * @param   offset   The index of the first character after the backslash
 *                   that triggered this call
 * @param   size     The length of `content`
 * @return           The index of the character after the bracket that closes
 *                   the function call (may be outside the code by one character),
 *                   or `size` if the call does not end (that is, the code ends
 *                   prematurely), or zero if there is no function call at `offset`
 */
size_t
get_end_of_call(const char *restrict content, size_t offset, size_t size)
{
	size_t pos = offset;
	size_t nested_end = 0;
	int escaped = 0;
	int quoted = 0;
	char cur;

	/* Step over the function name. */
	for (; pos < size; pos++) {
		cur = content[pos];
		if (!(('a' <= cur && cur <= 'z') ||
		      ('A' <= cur && cur <= 'Z') ||
		      ('0' <= cur && cur <= '9') ||
		      cur == '_'))
			break;
	}

	/* A call needs a non-empty name directly followed by an opening bracket. */
	if (pos == offset || pos == size || content[pos] != '(')
		return 0;

	/* Scan for the bracket that closes the call. */
	while (pos < size) {
		cur = content[pos++];

		if (escaped) {
			/* Escapes may be longer than one character,
			   but only the first can affect the parsing. */
			escaped = 0;
			continue;
		}
		if (pos <= nested_end) {
			/* Still inside a nested function call, which
			   has already been scanned by the recursion. */
			continue;
		}
		if (cur == '\\') {
			/* \ can either start a function call or an escape.
			   Registering it as an escape is harmless even if it
			   is a call: only the first character is skipped, and
			   a function call cannot be that short. */
			escaped = 1;
			nested_end = get_end_of_call(content, pos, size);
			continue;
		}
		if (quoted) {
			/* Quotes end with the same symbol as they start with,
			   and quotes automatically escape brackets. */
			quoted = (cur != '"');
			continue;
		}
		if (cur == ')') {
			/* End of the function call. */
			break;
		}
		if (cur == '"') {
			/* " is the quote symbol. */
			quoted = 1;
		}
	}

	return pos;
}


/**
 * Remove comments from the content
 *
 * The content is compacted in place. The LF that ends
 * a comment is kept, so the number of lines is unchanged.
 *
 * @param   content  The code to shrink
 * @param   size     The size of `content`, in char:s
 * @return           The new size of `content`, in char:s; this function cannot fail
 */
static size_t
remove_comments(char *restrict content, size_t size)
{
	size_t src = 0, dst = 0, call_end = 0;
	int in_comment = 0, in_quote = 0, escaped = 0;
	char ch;

	while (src < size) {
		ch = content[src++];

		if (in_comment) {
			/* Drop everything up to, but not including, the end of the line. */
			if (ch != '\n')
				continue;
			in_comment = 0;
		} else if (escaped) {
			/* Escapes may be longer than one character,
			   but only the first can affect the parsing. */
			escaped = 0;
		} else if (src <= call_end) {
			/* Inside a function call, where nested quotes
			   can appear; copied without interpretation. */
		} else if (ch == '\\') {
			/* \ can either start a function call or an escape.
			   Registering it as an escape is harmless even if it
			   is a call: only the first character is skipped, and
			   a function call cannot be that short. */
			escaped = 1;
			call_end = get_end_of_call(content, src, size);
		} else if (in_quote) {
			/* Quotes automatically escape comments. A quote ends
			   at the next quote symbol or at the end of the line
			   (`strchr` also matches a NUL byte here). */
			if (strchr("\"\n", ch))
				in_quote = 0;
		} else if (ch == '#') {
			/* # is the comment symbol, it is itself removed. */
			in_comment = 1;
			continue;
		} else if (ch == '"') {
			/* " is the quote symbol. */
			in_quote = 1;
		}

		/* Anything that reaches this point is kept. */
		content[dst++] = ch;
	}

	return dst;
}


/**
 * Create an array of each line in a text
 *
 * @param   content  The text to split, it must end with an LF.
 *                   LF:s are treated as line endings rather than
 *                   new lines, this means that the final LF will
 *                   not create a new line in the returned array.
 *                   Each LF will be replaced by a NUL-character.
 * @param   length   The length of `content`.
 * @return           An array of each line in `content`. This
 *                   array will be `NULL`-terminated. Its elements
 *                   point into the allocation of `content`. This
 *                   means that each element must not be free:d,
 *                   rather you should simply free this returned
 *                   allocation and the allocation of `content`.
 *                   On error `NULL` is returned, and `content`
 *                   will not have been modified.
 */
static char **
line_split(char *content, size_t length)
{
	char **restrict lines = NULL;
	size_t line_total = 0;
	size_t pos, begin, next;

	/* One line per LF. */
	for (pos = 0; pos < length; pos++)
		line_total += (content[pos] == '\n');

	fail_if (xmalloc(lines, line_total + 1, char*));
	lines[line_total] = NULL;

	/* `begin` is the index at which the current line starts. */
	for (pos = begin = next = 0; pos < length; pos++) {
		if (pos == begin)
			lines[next++] = content + pos;
		if (content[pos] == '\n') {
			content[pos] = '\0';
			begin = pos + 1;
		}
	}

	return lines;

fail:
	xperror(*argv);
	return NULL;
}


/**
 * Translate all tab spaces into blank spaces
 *
 * Tab stops are located at every 8th column. The column is
 * reset at each LF, and every other char advances it by one.
 *
 * @param   content       Input and output parameter for the file's content,
 *                        it may be reallocated
 * @param   content_size  Input and output parameter for the size of the file's content
 * @return                Zero on success, -1 on error; on error neither
 *                        `*content` nor `*content_size` has been modified
 */
static int
expand(char **restrict content, size_t *restrict content_size)
{
	size_t n = *content_size;
	size_t extra = 0, tabs = 0;
	size_t in, out, col, width;
	char *data = *content;
	char *grown;
	char c;

	/* Calculate how much the file grows. The column must be tracked
	   here too, since the width of a tab depends on where it starts. */
	for (in = col = 0; in < n; in++) {
		if (data[in] == '\n') {
			col = 0;
		} else if (data[in] == '\t') {
			width = 8 - (col % 8);
			extra += width - 1;
			col += width;
			tabs++;
		} else {
			col++;
		}
	}

	/* Nothing to do if there are no tabs. Note that `extra` can be
	   zero even if there are tabs: a tab in the last column before
	   a tab stop is replaced by exactly one blank space. */
	if (!tabs)
		return 0;

	/* Extend the allocation, and move the original text to its end
	   so that it can be expanded from the beginning of the buffer
	   without the output overtaking the input. A temporary is used
	   so the caller's buffer stays valid if the reallocation fails. */
	if (extra) {
		grown = realloc(data, (n + extra) * sizeof(char));
		if (!grown)
			return -1;
		data = grown;
		*content = data;
		memmove(data + extra, data, n);
	}

	/* Expand tab spaces. */
	for (in = extra, out = col = 0; in < n + extra; in++) {
		c = data[in];
		if (c == '\t') {
			do
				data[out++] = ' ';
			while (++col % 8);
		} else {
			data[out++] = c;
			col = (c == '\n') ? 0 : col + 1;
		}
	}

	*content_size = n + extra;
	return 0;
}


/**
 * Read lines of a source file
 *
 * Two versions of the file are stored: the real content, where
 * only tab spaces have been expanded and a final LF has been added
 * if missing, and a simplified content where comments have also
 * been removed. Both are split into `NULL`-terminated line arrays.
 *
 * @param   pathname     The pathname of the source file
 * @param   source_code  Output parameter for read data
 * @return               Zero on success, -1 on error
 */
int
read_source_lines(const char *restrict pathname, mds_kbdc_source_code_t *restrict source_code)
{
	char *content = NULL, *real_content = NULL, *old = NULL;
	char **lines = NULL, **real_lines = NULL;
	size_t content_size, real_content_size;
	size_t line_count;

	/* Load the file, and turn tab spaces into blank spaces. */
	fail_if (!(content = read_file(pathname, &content_size)));
	fail_if (expand(&content, &content_size));

	/* The line splitting requires that the content ends with an LF. */
	if (!content_size || content[content_size - 1] != '\n') {
		fail_if (xxrealloc(old, content, content_size + 1, char));
		content[content_size++] = '\n';
	}

	/* Keep a copy of the real content, then simplify
	   the working copy and trim its allocation. */
	fail_if (xmemdup(real_content, content, content_size, char));
	real_content_size = content_size;
	content_size = remove_comments(content, content_size);
	fail_if (xxrealloc(old, content, content_size, char));

	/* Split both versions by line. */
	fail_if (!(lines = line_split(content, content_size)));
	fail_if (!(real_lines = line_split(real_content, real_content_size)));

	/* The line array is `NULL`-terminated, count its elements. */
	for (line_count = 0; lines[line_count]; line_count++)
		;

	/* Hand everything over to the caller. */
	source_code->content = content;
	source_code->real_content = real_content;
	source_code->lines = lines;
	source_code->real_lines = real_lines;
	source_code->line_count = line_count;
	return 0;

fail:
	xperror(*argv);
	free(old);
	free(content);
	free(real_content);
	free(lines);
	free(real_lines);
	return -1;
}


/**
 * Encode a character with `string_encode` and append the
 * result to a buffer
 *
 * No NUL-terminator is written to the buffer.
 *
 * @param   buffer     The buffer where the character should be stored
 * @param   character  The character
 * @return             The end of the character in `buffer`, that is, the
 *                     position just after its last byte; `NULL` on error
 */
static char *
encode_utf8(char *buffer, char32_t character)
{
	char32_t text[2];
	char *restrict encoded;
	size_t i;

	/* `string_encode` operates on strings terminated by -1. */
	text[1] = -1;
	text[0] = character;

	fail_if (!(encoded = string_encode(text)));

	for (i = 0; encoded[i]; i++)
		*buffer++ = encoded[i];

	free(encoded);
	return buffer;
fail:
	return NULL;
}


/**
 * Parse a quoted and escaped string that may not include function calls or variable dereferences
 *
 * Unescaped quote symbols are dropped. A backslash followed by `0`
 * starts an octal escape and a backslash followed by `u` starts a
 * hexadecimal escape; such an escape consists of all directly
 * following digits, and an optional terminating `.` that is
 * consumed. A backslash followed by any other character produces
 * that character verbatim.
 *
 * @param   string  The string
 * @return          The string in machine-readable format, `NULL` on error;
 *                  the caller is responsible for freeing it
 */
char *
parse_raw_string(const char *restrict string)
{
#define r(cond, lower, upper) ((cond) && ((lower) <= c) && (c <= (upper)))
	char *rc, *p;
	int escape = 0;
	char32_t buf = 0;
	char c;

	/* The output cannot be longer than the input: quotes are dropped,
	 * and an escape is never shorter than what it generates. For
	 * example \uA0 is four characters, but it generates 2 bytes in
	 * UTF-8, and there is no code point whose UTF-8 encoding is
	 * longer than its hexadecimal representation. One extra char is
	 * allocated for the NUL-terminator, so that input that is empty
	 * or not surrounded by quotes cannot overflow the buffer. */
	fail_if (xmalloc(p = rc, strlen(string) + 1, char));

	while ((c = *string++)) {
		if (escape > 1) {
			/* Inside a numerical escape: accumulate digits. */
			if (r(escape == 8, '0', '7')) {
				buf = (buf << 3) | (c & 15);
				continue;
			} else if (r(escape == 16, '0', '9')) {
				buf = (buf << 4) | (c & 15);
				continue;
			} else if (r(escape == 16, 'a', 'f') || r(escape == 16, 'A', 'F')) {
				buf = (buf << 4) | ((c & 15) + 9);
				continue;
			}

			/* First non-digit: the escape is complete. */
			escape = 0;
			fail_if (!(p = encode_utf8(p, buf)));

			/* A period explicitly terminates the escape and is consumed.
			   Any other character is not part of the escape, and must be
			   parsed like any other character; otherwise a closing quote
			   would be included in the output, and a backslash starting
			   another escape would be taken literally. */
			if (c == '.')
				continue;
		}

		if (escape == 1) {
			/* The character directly after a backslash. */
			escape = 0;
			buf = 0;
			if (c == '0')
				escape = 8;
			else if (c == 'u')
				escape = 16;
			else
				*p++ = c;
		} else if (c == '\\') {
			escape = 1;
		} else if (c != '\"') {
			*p++ = c;
		}
	}

	/* The string ended in the middle of a numerical
	   escape, do not lose the escaped character. */
	if (escape > 1) {
		fail_if (!(p = encode_utf8(p, buf)));
	}

	*p = '\0';
	return rc;
fail:
	free(rc);
	return NULL;
#undef r
}