aboutsummaryrefslogblamecommitdiffstats
path: root/tokeniser.c
blob: 606726b2c23e78450ff0682aa647eb0037d0b740 (plain) (tree)
1
2
3
4
5
6
7






                                                               

















                                                                                                           





                                    




                                                             
                                                    




































































































































                                                                                                                                    










                                                                              

                                                   













                                                                                         

                                                                                          


















































































                                                                                                      
                                                                                                                 








                                                                               
                                                                                                                  













































                                                                                                                

















                                                                                                                      
                                        





























































                                                                                                                              






                                                                







                                                                                                                             






















































































                                                                                                                         
                                               






















                                                                                       
                                                                                                                 






























































































                                                                                                                   
             













                                                                              
/* See LICENSE file for copyright and license details. */
#include "common.h"


/*
 * Enter a new tokeniser mode by pushing a frame onto ctx->mode_stack.
 *
 * A warning is printed whenever BQ_QUOTE_MODE is entered. If the mode
 * that is current when this function is called is HERE_DOCUMENT_MODE,
 * a new, zero-initialised here-document stack frame is pushed as well,
 * with its `next` pointer aimed at its own (empty) `first` slot.
 *
 * ctx:   parser context whose stacks are modified
 * mode:  the mode to make current
 */
void
push_mode(struct parser_context *ctx, enum tokeniser_mode mode)
{
	struct mode_stack *frame;
	struct here_document_stack *heredocs;

	if (mode == BQ_QUOTE_MODE)
		weprintf("backquote expression found at line %zu, stop it!\n", ctx->tokeniser_line_number);

	/* Tested against the mode we are leaving behind, not the one being entered */
	if (ctx->mode_stack->mode == HERE_DOCUMENT_MODE) {
		heredocs = ecalloc(1, sizeof(*heredocs));
		heredocs->previous = ctx->here_document_stack;
		heredocs->next = &heredocs->first;
		ctx->here_document_stack = heredocs;
	}

	frame = emalloc(sizeof(*frame));
	frame->previous = ctx->mode_stack;
	frame->she_is_comment = 1; /* parse_preparsed() only treats '#' as a comment start while this is set */
	frame->mode = mode;
	ctx->mode_stack = frame;
}


void
pop_mode(struct parser_context *ctx)
{
	struct mode_stack *old_mode_stack;
	struct here_document_stack *old_here_document_stack;
	struct here_document_stack *prev_here_document_stack;

	old_mode_stack = ctx->mode_stack;
	ctx->mode_stack = ctx->mode_stack->previous;
	free(old_mode_stack);

	if (ctx->mode_stack->mode == HERE_DOCUMENT_MODE) {
		if (ctx->here_document_stack->first) {
			if (posix_mode) {
				eprintf("subshell expression closed at line %zu before here-documents, "
				        "this is non-portable\n", ctx->tokeniser_line_number);
			}
			prev_here_document_stack = ctx->here_document_stack->previous;
			*ctx->here_document_stack->next = prev_here_document_stack->first;
			ctx->here_document_stack->next = prev_here_document_stack->next;
			ctx->here_document_stack->previous = prev_here_document_stack->previous;
			ctx->here_document_stack->interpret_when_empty = prev_here_document_stack->interpret_when_empty;
			free(prev_here_document_stack);
		} else {
			old_here_document_stack = ctx->here_document_stack;
			ctx->here_document_stack = old_here_document_stack->previous;
			free(old_here_document_stack);
		}
	}
}


/*
 * Append the text of every part of every argument in `quote` to the
 * terminator word of `here_document` (here_document->argument->next_part),
 * keeping the terminator text NUL-terminated, and free the parts as
 * they are consumed. Finally the argument array of `quote` is freed;
 * `quote` itself is left for the caller to free.
 *
 * Any part that is neither QUOTED nor UNQUOTED is reported through
 * eprintf() as an illegal run-time evaluated expression.
 *
 * here_document:  the here-document whose terminator is extended
 * quote:          parser state holding the contents of a quote expression
 */
static void
append_and_destroy_quote_to_here_document_terminator(struct here_document *here_document, struct parser_state *quote)
{
	struct argument *dest, *piece, *following;
	size_t index;

	dest = here_document->argument->next_part;

	for (index = 0; index < quote->narguments; index++) {
		piece = quote->arguments[index];
		while (piece) {
			/* Remember the successor now; `piece` is freed below */
			following = piece->next_part;

			if (piece->type != QUOTED && piece->type != UNQUOTED) {
				eprintf("use of run-time evaluated expression as right-hand side "
				        "of %s operator (at line %zu) is illegal\n",
				        here_document->redirection->type == HERE_DOCUMENT_INDENTED ? "<<-" : "<<",
				        here_document->argument->line_number);
			}

			/* Grow by the piece plus room for the terminating NUL */
			dest->text = erealloc(dest->text, dest->length + piece->length + 1);
			memcpy(&dest->text[dest->length], piece->text, piece->length);
			dest->length += piece->length;
			dest->text[dest->length] = '\0';

			free(piece->text);
			free(piece);
			piece = following;
		}
	}

	free(quote->arguments);
}

static void
get_here_document_terminator(struct parser_context *ctx)
{
	struct argument *terminator, *next_part;
	struct parser_state *child;

	terminator = ctx->here_document_stack->first->argument->next_part;
	if (!terminator || (terminator->type != QUOTED && terminator->type != UNQUOTED && terminator->type != QUOTE_EXPRESSION)) {
		eprintf("missing right-hand side of %s operator at line %zu\n",
		        ctx->here_document_stack->first->redirection->type == HERE_DOCUMENT_INDENTED ? "<<-" : "<<",
		        ctx->here_document_stack->first->argument->line_number);
	} else if (terminator->type == QUOTE_EXPRESSION) {
		child = terminator->child;
		terminator->type = QUOTED;
		terminator->text = ecalloc(1, 1);
		terminator->length = 0;
		append_and_destroy_quote_to_here_document_terminator(ctx->here_document_stack->first, child);
		free(child);
	}

	while ((next_part = terminator->next_part)) {
		switch (next_part->type) {
		case QUOTED:
			terminator->type = QUOTED;
			/* fall through */
		case UNQUOTED:
			terminator->text = erealloc(terminator->text, terminator->length + next_part->length + 1);
			memcpy(&terminator->text[terminator->length], next_part->text, next_part->length);
			terminator->length += next_part->length;
			terminator->text[terminator->length] = '\0';
			free(next_part->text);
			break;

		case QUOTE_EXPRESSION:
			terminator->type = QUOTED;
			append_and_destroy_quote_to_here_document_terminator(ctx->here_document_stack->first, next_part->child);
			free(next_part->child);
			break;

		case BACKQUOTE_EXPRESSION:
		case ARITHMETIC_EXPRESSION:
		case VARIABLE_SUBSTITUTION:
		case SUBSHELL_SUBSTITUTION:
		case PROCESS_SUBSTITUTION_INPUT:
		case PROCESS_SUBSTITUTION_OUTPUT:
		case PROCESS_SUBSTITUTION_INPUT_OUTPUT:
			eprintf("use of run-time evaluated expression as right-hand side of %s operator (at line %zu) is illegal\n",
			        ctx->here_document_stack->first->redirection->type == HERE_DOCUMENT_INDENTED ? "<<-" : "<<",
			        ctx->here_document_stack->first->argument->line_number);
			return;

		case REDIRECTION:
		case FUNCTION_MARK:
		case SUBSHELL:
		case ARITHMETIC_SUBSHELL:
			/* interpreter shall recognise these as new "arguments" */
			return;

		default:
		case COMMAND: /* used by interpreter */
		case VARIABLE: /* ditto */
			abort();
		}

		if (ctx->parser_state->current_argument_end == next_part)
			ctx->parser_state->current_argument_end = terminator;
		terminator->next_part = next_part->next_part;
		free(next_part);
	}
}


/*
 * Decide whether a non-portable token may be parsed as an extension.
 *
 * token:        the token's spelling, used in the warning message
 * line_number:  line at which the token was found
 *
 * Returns 1 when posix_mode is off. When posix_mode is on, a warning
 * is printed and 0 is returned, telling the caller not to give the
 * token its extended meaning.
 */
int
check_extension(const char *token, size_t line_number)
{
	if (posix_mode) {
		weprintf("the '%s' token (at line %zu) is not portable, not parsing as it\n", token, line_number);
		return 0;
	}

	return 1;
}


/*
 * Tokenise a chunk of code, handing each recognised token to the
 * parser through the push_*() functions and tracking the lexical
 * context (normal code, comment, here-document, the various quote
 * and substitution forms) on ctx->mode_stack.
 *
 * ctx:       parser context; its mode stack, here-document stack and
 *            line counter are updated as tokens are recognised
 * code:      the text to tokenise
 * code_len:  number of bytes available in code
 *
 * Returns the number of bytes consumed. The value is less than
 * code_len when the remaining bytes do not yet form a complete token.
 * NOTE(review): the caller is presumably expected to call again with
 * the unconsumed bytes followed by more input -- confirm against the
 * caller.
 *
 * If all input was consumed and ctx->end_of_file_reached is set,
 * push_end_of_file() is called before returning.
 */
size_t
parse_preparsed(struct parser_context *ctx, char *code, size_t code_len)
{
#define IS_SYMBOL(C) ((C) == '<' || (C) == '>' || (C) == '&' || (C) == '|' ||\
                      (C) == '(' || (C) == ')' || (C) == ';' || (C) == '-')

	size_t bytes_read = 0;
	size_t token_len;
	struct here_document *here_document;
	struct here_document_stack *here_doc_stack;

	/* Each iteration recognises one token of token_len bytes starting at code */
	for (; bytes_read < code_len; bytes_read += token_len, code = &code[token_len]) {
		switch (ctx->mode_stack->mode) {
		case NORMAL_MODE:
			/* she_is_comment tracks whether a '#' at this position starts a comment */
			if (*code == '#' && ctx->mode_stack->she_is_comment) {
				token_len = 1;
				push_mode(ctx, COMMENT_MODE);

			} else if (*code == '\n') {
				token_len = 1;
				ctx->mode_stack->she_is_comment = 1;
				push_whitespace(ctx, 0);
				push_semicolon(ctx, 1);
				ctx->tokeniser_line_number += 1;
				/* Pending here-documents begin on the line after the command */
				if (ctx->here_document_stack->first)
					push_mode(ctx, HERE_DOCUMENT_MODE_INITIALISATION);

			} else if (isspace((unsigned char)*code)) {
				/* <ctype.h> functions require an unsigned char value, hence the casts */
				ctx->mode_stack->she_is_comment = 1;
				push_whitespace(ctx, 0);
				for (token_len = 1; token_len < code_len - bytes_read; token_len += 1)
					if (!isspace((unsigned char)code[token_len]) || code[token_len] == '\n')
						break;

			} else if (*code == ')' && ctx->mode_stack->previous) {
				token_len = 1;
				ctx->mode_stack->she_is_comment = 1;
				pop_mode(ctx);
				push_leave(ctx);

			} else if (IS_SYMBOL(*code)) {
				ctx->mode_stack->she_is_comment = 1;
				for (token_len = 1; token_len < code_len - bytes_read; token_len += 1)
					if (!IS_SYMBOL(code[token_len]))
						goto symbol_end;
				/* The run of symbol bytes may continue in input not yet received */
				if (!ctx->end_of_file_reached)
					goto need_more;
			symbol_end:
				/* push_symbol() reports how many of the bytes it actually used */
				token_len = push_symbol(ctx, code, token_len);

			} else if (*code == '\\') {
				ctx->mode_stack->she_is_comment = 0;
			backslash_mode:
				if (code_len - bytes_read < 2)
					goto need_more;
				token_len = 2;
				push_quoted(ctx, &code[1], 1);

			} else if (*code == '\'') {
				ctx->mode_stack->she_is_comment = 0;
			sqoute_mode:
				for (token_len = 1; token_len < code_len - bytes_read; token_len += 1)
					if (code[token_len] == '\'')
						goto squote_end;
				goto need_more;
			squote_end:
				token_len += 1;
				push_quoted(ctx, &code[1], token_len - 2);

			} else if (*code == '"') {
				ctx->mode_stack->she_is_comment = 0;
			dquote_mode:
				token_len = 1;
				push_mode(ctx, DQ_QUOTE_MODE);
				push_enter(ctx, QUOTE_EXPRESSION);

			} else if (*code == '`') {
				ctx->mode_stack->she_is_comment = 0;
			bquote_mode:
				token_len = 1;
				push_mode(ctx, BQ_QUOTE_MODE);
				push_enter(ctx, BACKQUOTE_EXPRESSION);

			} else if (*code == '$') {
				ctx->mode_stack->she_is_comment = 0;
			dollar_mode:
				if (code_len - bytes_read < 2) {
					if (ctx->end_of_file_reached) {
						token_len = 1;
						push_unquoted(ctx, code, 1);
					} else {
						goto need_more;
					}

				} else if (code[1] == '(') {
					if (code_len - bytes_read < 3) {
						goto need_more;

					} else if (code[2] == '(') {
						token_len = 3;
						push_mode(ctx, RRB_QUOTE_MODE);
						push_enter(ctx, ARITHMETIC_EXPRESSION);

					} else {
						token_len = 2;
						push_mode(ctx, NORMAL_MODE);
						push_enter(ctx, SUBSHELL_SUBSTITUTION);
					}

				} else if (code[1] == '[' && check_extension("$[", ctx->tokeniser_line_number)) {
					token_len = 2;
					push_mode(ctx, SB_QUOTE_MODE);
					push_enter(ctx, ARITHMETIC_EXPRESSION);

				} else if (code[1] == '{') {
					token_len = 2;
					push_mode(ctx, CB_QUOTE_MODE);
					push_enter(ctx, VARIABLE_SUBSTITUTION);

				} else if (code[1] == '\'' && check_extension("$'", ctx->tokeniser_line_number)) {
					for (token_len = 2; token_len < code_len - bytes_read; token_len += 1) {
						if (code[token_len] == '\\') {
							/* The escaped byte must be available before it can be skipped */
							if (token_len + 1 == code_len - bytes_read)
								goto need_more;
							token_len += 1;
						} else if (code[token_len] == '\'') {
							goto dollar_squote_end;
						}
					}
					/* The closing ' has not been received yet */
					goto need_more;
				dollar_squote_end:
					token_len += 1;
					push_escaped(ctx, &code[2], token_len - 3);

				} else {
					token_len = 1;
					push_unquoted(ctx, code, 1);
				}

			} else {
				ctx->mode_stack->she_is_comment = 0;
				for (token_len = 1; token_len < code_len - bytes_read; token_len += 1) {
					if (isspace((unsigned char)code[token_len]) || IS_SYMBOL(code[token_len]) ||
					    code[token_len] == '\''  || code[token_len] == '"'     ||
					    code[token_len] == '\\'  || code[token_len] == '$'     ||
					    code[token_len] == '`')
						break;
				}
				push_unquoted(ctx, code, token_len);
			}
			break;


		case COMMENT_MODE:
			if (*code == '\n') {
				token_len = 0; /* do not consume */
				pop_mode(ctx);
			} else {
				for (token_len = 1; token_len < code_len - bytes_read; token_len += 1)
					if (code[token_len] == '\n')
						break;
			}
			break;


		case HERE_DOCUMENT_MODE_INITIALISATION:
			here_doc_stack = ctx->here_document_stack;
			here_doc_stack->indented = 0;
			if (here_doc_stack->first->redirection->type == HERE_DOCUMENT_INDENTED)
				here_doc_stack->indented = 1;
			get_here_document_terminator(ctx);
			/* A terminator with any quoting disables expansion in the body */
			here_doc_stack->verbatim = 0;
			if (here_doc_stack->first->argument->next_part->type == QUOTED)
				here_doc_stack->verbatim = 1;
			/* Move the terminator text out of the argument; the argument part is reset to an empty quoted string */
			here_doc_stack->first->terminator = here_doc_stack->first->argument->next_part->text;
			here_doc_stack->first->terminator_length = here_doc_stack->first->argument->next_part->length;
			here_doc_stack->first->argument->next_part->text = ecalloc(1, 1);
			here_doc_stack->first->argument->next_part->length = 0;
			here_doc_stack->first->argument->next_part->type = QUOTED;
			here_doc_stack->first->argument_end = here_doc_stack->first->argument->next_part;
			ctx->mode_stack->mode = HERE_DOCUMENT_MODE;
			/* fall through */

		case HERE_DOCUMENT_MODE:
			here_doc_stack = ctx->here_document_stack;
			if (*code == '\t' && here_doc_stack->indented) {
				/* Tab stripped for <<-: consumed without being pushed */
				token_len = 1;
			} else {
				token_len = here_doc_stack->line_offset;
				for (; token_len < code_len - bytes_read; token_len += 1) {
					if (code[token_len] == '\n') {
						goto here_document_line_end;
					} else if (!here_doc_stack->verbatim) {
						if (code[token_len] == '\\') {
							if (token_len + 1 == code_len - bytes_read) {
								goto need_more;
							} else if (code[token_len + 1] == '$' || code[token_len + 1] == '`') {
								here_doc_stack->line_offset = 0;
								push_quoted(ctx, code, token_len);
								push_quoted(ctx, &code[token_len + 1], 1);
								/* Consume the backslash and the escaped byte as well */
								token_len += 2;
								goto next;
							}
							token_len += 1;
						} else if (code[token_len] == '$') {
							here_doc_stack->line_offset = 0;
							push_quoted(ctx, code, token_len);
							/* Consume the text before '$' now, then let the shared '$' handling set token_len */
							bytes_read += token_len;
							code = &code[token_len];
							goto quote_mode_dollar_mode;
						} else if (code[token_len] == '`') {
							here_doc_stack->line_offset = 0;
							push_quoted(ctx, code, token_len);
							push_mode(ctx, BQ_QUOTE_MODE);
							push_enter(ctx, BACKQUOTE_EXPRESSION);
							/* Consume the opening backquote so it is not read as the closing one */
							token_len += 1;
							goto next;
						}
					}
				}
				goto need_more;

			here_document_line_end:
				token_len += 1;
				ctx->tokeniser_line_number += 1;
				here_doc_stack->line_offset = 0;
				here_document = here_doc_stack->first;

				/* Compare the line, without its newline, against the terminator */
				if (token_len - 1 == here_document->terminator_length &&
				    !strncmp(code, here_document->terminator, token_len - 1)) {
					here_document->redirection->type = HERE_STRING;
					here_doc_stack->first = here_document->next;
					free(here_document->terminator);
					free(here_document);
					if (here_doc_stack->first) {
						ctx->mode_stack->mode = HERE_DOCUMENT_MODE_INITIALISATION;
					} else {
						here_doc_stack->next = &here_doc_stack->first;
						pop_mode(ctx);
						if (here_doc_stack->interpret_when_empty) {
							here_doc_stack->interpret_when_empty = 0;
							interpret_and_eliminate(ctx);
						}
					}
				} else {
					push_quoted(ctx, code, token_len);
				}
			}
			break;


		case BQ_QUOTE_MODE:
			if (*code == '\\') {
				if (code_len - bytes_read < 2) {
					goto need_more;
				} else if (code[1] == '\\' || code[1] == '`' || code[1] == '$') {
					token_len = 2;
					push_unquoted(ctx, &code[1], 1);
					if (code[1] == '$') {
						weprintf("meaningless \\ found before $ inside backquote expression at line "
						         "%zu, perhaps you mean to use \\\\$ instead to get a literal $\n",
						         ctx->tokeniser_line_number);
					}
				} else {
					token_len = 2;
					push_unquoted(ctx, code, 2);
				}

			} else if (*code == '`') {
				token_len = 1;
				pop_mode(ctx);
				push_leave(ctx);

			} else if (*code == '\n') {
				token_len = 1;
				ctx->tokeniser_line_number += 1;
				push_unquoted(ctx, code, 1);

			} else {
				for (token_len = 1; token_len < code_len - bytes_read; token_len += 1)
					if (code[token_len] == '\n' || code[token_len] == '\\' || code[token_len] == '`')
						break;
				push_unquoted(ctx, code, token_len);
			}
			break;


		case DQ_QUOTE_MODE:
			if (*code == '"') {
				token_len = 1;
				pop_mode(ctx);
				push_leave(ctx);
			} else {
				goto common_quote_mode;
			}
			break;

		case RRB_QUOTE_MODE:
			if (*code == ')') {
				if (code_len - bytes_read < 2) {
					goto need_more;
				} else if (code[1] == ')') {
					token_len = 2;
					pop_mode(ctx);
					push_leave(ctx);
				} else {
					goto common_quote_mode;
				}
			} else {
				goto common_quote_mode;
			}
			break;

		case RB_QUOTE_MODE:
			if (*code == ')') {
				token_len = 1;
				pop_mode(ctx);
				push_leave(ctx);
			} else {
				goto common_quote_mode;
			}
			break;

		case SB_QUOTE_MODE:
			if (*code == ']') {
				token_len = 1;
				pop_mode(ctx);
				push_leave(ctx);
			} else {
				goto common_quote_mode;
			}
			break;

		/* Shared by the DQ, RRB, RB and SB quote modes; '$' handling is also entered from here-document mode */
		common_quote_mode:
			if (*code == '(' && ctx->mode_stack->mode != DQ_QUOTE_MODE) {
				if (code_len - bytes_read < 2) {
					goto need_more;

				} else if (code[1] == '(') {
					token_len = 2;
					push_mode(ctx, RRB_QUOTE_MODE);
					push_enter(ctx, ARITHMETIC_EXPRESSION);

				} else {
					token_len = 1;
					push_mode(ctx, RB_QUOTE_MODE);
					push_enter(ctx, ARITHMETIC_EXPRESSION);
				}

			} else if (*code == '$') {
			quote_mode_dollar_mode:
				if (code_len - bytes_read < 2) {
					if (ctx->end_of_file_reached) {
						token_len = 1;
						push_unquoted(ctx, code, 1);
					} else {
						goto need_more;
					}

				} else if (code[1] == '(') {
					if (code_len - bytes_read < 3) {
						goto need_more;

					} else if (code[2] == '(') {
						token_len = 3;
						push_mode(ctx, RRB_QUOTE_MODE);
						push_enter(ctx, ARITHMETIC_EXPRESSION);

					} else {
						token_len = 2;
						push_mode(ctx, NORMAL_MODE);
						push_enter(ctx, SUBSHELL_SUBSTITUTION);
					}

				} else if (code[1] == '[' && check_extension("$[", ctx->tokeniser_line_number)) {
					token_len = 2;
					push_mode(ctx, SB_QUOTE_MODE);
					push_enter(ctx, ARITHMETIC_EXPRESSION);

				} else if (code[1] == '{') {
					token_len = 2;
					push_mode(ctx, CB_QUOTE_MODE);
					push_enter(ctx, VARIABLE_SUBSTITUTION);

				} else {
					token_len = 1;
					push_unquoted(ctx, code, 1);
				}

			} else if (*code == '\\') {
				if (code_len - bytes_read < 2) {
					if (ctx->end_of_file_reached) {
						token_len = 1;
						push_unquoted(ctx, code, 1);
					} else {
						goto need_more;
					}

				} else if (code[1] == '$' || code[1] == '`' || code[1] == '"' || code[1] == '\\') {
					/* Consume both the backslash and the byte it escapes */
					token_len = 2;
					push_quoted(ctx, &code[1], 1);

				} else {
					/* Backslash is literal here; the following byte is tokenised on its own */
					token_len = 1;
					push_unquoted(ctx, code, 1);
				}

			} else if (*code == '`') {
				goto bquote_mode;

			} else if (*code == '\n') {
				token_len = 1;
				ctx->tokeniser_line_number += 1;
				push_unquoted(ctx, code, 1);

			} else {
				for (token_len = 1; token_len < code_len - bytes_read; token_len += 1) {
					if (code[token_len] == '"' || code[token_len] == ')'  ||
					    code[token_len] == ']' || code[token_len] == '('  ||
					    code[token_len] == '$' || code[token_len] == '\\' ||
					    code[token_len] == '`' || code[token_len] == '\n')
						break;
				}
				push_unquoted(ctx, code, token_len);
			}
			break;


		case CB_QUOTE_MODE:
			if (*code == '}') {
				token_len = 1;
				pop_mode(ctx);
				push_leave(ctx);

			} else if (*code == '\\') {
				goto backslash_mode;

			} else if (*code == '\'') {
				goto sqoute_mode;

			} else if (*code == '"') {
				goto dquote_mode;

			} else if (*code == '`') {
				goto bquote_mode;

			} else if (*code == '$') {
				goto dollar_mode;

			} else if (*code == '\n') {
				token_len = 1;
				ctx->tokeniser_line_number += 1;
				push_unquoted(ctx, code, 1);

			} else {
				for (token_len = 1; token_len < code_len - bytes_read; token_len += 1) {
					if (code[token_len] == '}'  || code[token_len] == '\\' ||
					    code[token_len] == '\'' || code[token_len] == '"'  ||
					    code[token_len] == '`'  || code[token_len] == '$'  ||
					    code[token_len] == '\n')
						break;
				}
				push_unquoted(ctx, code, token_len);
			}
			break;

		default:
			abort();
		}

	next:
		/* Fold any recorded line continuations into the line counter */
		if (ctx->line_continuations) {
			ctx->tokeniser_line_number += ctx->line_continuations;
			ctx->line_continuations = 0;
		}
	}

	if (bytes_read == code_len && ctx->end_of_file_reached)
		push_end_of_file(ctx);

need_more:
	return bytes_read;

#undef IS_SYMBOL
}