/**
 * mds — A micro-display server
 * Copyright © 2014, 2015, 2016  Mattias Andrée (maandree@member.fsf.org)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
#include "mds-respawn.h"

/*
 * NOTE(review): the header names of the ten directives below are missing
 * from this copy of the file (they look like they were stripped together
 * with their `<...>` delimiters). Restore them from the upstream source
 * before building; they are left untouched here rather than guessed.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/**
 * Version number written first in the state that `marshal_server`
 * produces for a re-exec
 */
#define MDS_RESPAWN_VARS_VERSION 0


/**
 * This variable should be declared by the actual server implementation.
 * It must be configured before `main` is invoked.
 *
 * This tells the server-base how to behave
 */
server_characteristics_t server_characteristics = {
	.require_privileges = 0,
	.require_display = 0,
	.require_respawn_info = 1,
	.sanity_check_argc = 0,
	.fork_for_safety = 0,
	.danger_is_deadly = 0
};


/**
 * Do not respawn crashed servers that did not live this many seconds
 *
 * Can be overridden with `--interval=`, see `parse_cmdline`
 */
static int interval = RESPAWN_TIME_LIMIT_SECONDS;

/**
 * The number of servers managed by this process
 *
 * One for each top-level `{ ... }` group on the command line
 */
static size_t servers = 0;

/**
 * Command line arguments, for each server — concatenated, with NULL-termination
 *
 * That is, the argument vectors of all servers are stored back to back
 * in this single array, each one followed by a `NULL` element
 */
static char** commands_args = NULL;

/**
 * Mapping elements in `commands_args` that are the first
 * argument for each server to run
 *
 * `commands[i]` points into `commands_args` and is the
 * argument vector of server `i`
 */
static char*** commands = NULL;

/**
 * States of managed servers, one element per server
 */
static server_state_t* states = NULL;

/**
 * Whether a revive request has been received but not processed
 *
 * Set by the signal handler `received_revive`, cleared by `master_loop`
 */
static volatile sig_atomic_t reviving = 0;

/**
 * The number of servers that are alive
 */
static size_t live_count = 0;


/**
 * Parse command line arguments
 *
@return Non-zero on error */ int parse_cmdline(void) { /* Parse command line arguments. */ int i; size_t j, args = 0, stack = 0; for (i = 1; i < argc; i++) { char* arg = argv[i]; if (startswith(arg, "--alarm=")) /* Schedule an alarm signal for forced abort. */ alarm((unsigned)min(atou(arg + strlen("--alarm=")), 60)); /* At most 1 minute. */ else if (startswith(arg, "--interval=")) interval = min(atoi(arg + strlen("--interval=")), 60); /* At most 1 minute. */ else if (strequals(arg, "--re-exec")) /* Re-exec state-marshal. */ is_reexec = 1; else if (strequals(arg, "{")) servers += stack++ == 0 ? 1 : 0; else if (strequals(arg, "}")) { exit_if (stack-- == 0, eprint("Terminating non-started command, aborting.");); exit_if (stack == 0 && strequals(argv[i - 1], "{"), eprint("Zero argument command specified, aborting.");); } else if (stack == 0) eprintf("Unrecognised option: %s, did you forget `='?", arg); else args++; } if (is_reexec) { is_respawn = 1; eprint("re-exec performed."); } /* Validate command line arguments. */ exit_if (stack > 0, eprint("Non-terminated command specified, aborting.");); exit_if (servers == 0, eprint("No programs to spawn, aborting.");); /* Allocate arrays. */ fail_if (xmalloc(commands_args, args + servers, char*)); fail_if (xmalloc(commands, servers, char**)); fail_if (xmalloc(states, servers, server_state_t)); /* Fill command arrays. */ for (i = 1, args = j = 0; i < argc; i++) { char* arg = argv[i]; if (strequals(arg, "}")) commands_args[args++] = --stack == 0 ? NULL : arg; else if (stack > 0) commands_args[args++] = arg; else if (strequals(arg, "{") && (stack++ == 0)) commands[j++] = commands_args + args; } return 0; fail: xperror(*argv); return 1; } /** * Spawn a server * * @param index The index of the server */ static void spawn_server(size_t index) { struct timespec started; pid_t pid; /* When did the spawned server start? 
*/ if (monotone(&started) < 0) { xperror(*argv); eprintf("cannot read clock when starting %s, burying.", commands[index][0]); states[index].state = DEAD_AND_BURIED; return; } states[index].started = started; /* Fork process to spawn the server. */ pid = fork(); if (pid == (pid_t)-1) { xperror(*argv); eprintf("cannot fork in order to start %s, burying.", commands[index][0]); states[index].state = DEAD_AND_BURIED; return; } /* In the parent process (respawner): store spawned server information. */ if (pid) { states[index].pid = pid; states[index].state = ALIVE; live_count++; return; } /* In the child process (server): remove the alarm and change execution image to the server.. */ alarm(0); execvp(commands[index][0], commands[index]); xperror(commands[index][0]); _exit(1); } /** * This function is called when a signal that * signals the program to respawn all * `DEAD_AND_BURIED` server is received * * @param signo The signal that has been received */ static void received_revive(int signo) { SIGHANDLER_START; (void) signo; reviving = 1; eprint("revive signal received."); SIGHANDLER_END; } /** * This function will be invoked before `initialise_server` (if not re-exec:ing) * or before `unmarshal_server` (if re-exec:ing) * * @return Non-zero on error */ int preinitialise_server(void) { /* Make the server revive all `DEAD_AND_BURIED` servers on SIGUSR2. */ fail_if (xsigaction(SIGUSR2, received_revive) < 0); return 0; fail: xperror(*argv); return 1; } /** * This function should initialise the server, * and it not invoked after a re-exec. 
* * @return Non-zero on error */ int initialise_server(void) { #if UNBORN != 0 size_t i; #endif memset(states, 0, servers * sizeof(server_state_t)); #if UNBORN != 0 for (i = 0; i < servers; i++) states[i].state = UNBORN; #endif return 0; } /** * This function will be invoked after `initialise_server` (if not re-exec:ing) * or after `unmarshal_server` (if re-exec:ing) * * @return Non-zero on error */ int postinitialise_server(void) { size_t i, j; /* Spawn servers that has not been spawned yet. */ for (i = 0; i < servers; i++) if (states[i].state == UNBORN) spawn_server(i); /* Forever mark newly spawned services (after this point in time) as respawned. */ for (i = j = 0; j < servers; i++) if (commands_args[i] == NULL) j++; else if (strequals(commands_args[i], "--initial-spawn")) fail_if (xstrdup(commands_args[i], "--respawn")); /* Respawn dead and dead and buried servers. */ for (i = 0; i < servers; i++) if ((states[i].state == DEAD) || (states[i].state == DEAD_AND_BURIED)) spawn_server(i); return 0; fail: xperror(*argv); return 1; } /** * Calculate the number of bytes that will be stored by `marshal_server` * * On failure the program should `abort()` or exit by other means. * However it should not be possible for this function to fail. 
* * @return The number of bytes that will be stored by `marshal_server` */ size_t marshal_server_size(void) { size_t rc = sizeof(int) + sizeof(sig_atomic_t); rc += sizeof(time_t) + sizeof(long); rc += servers * sizeof(server_state_t); return rc; } /** * Marshal server implementation specific data into a buffer * * @param state_buf The buffer for the marshalled data * @return Non-zero on error */ int marshal_server(char* state_buf) { size_t i; struct timespec antiepoch; antiepoch.tv_sec = 0; antiepoch.tv_nsec = 0; (void) monotone(&antiepoch); buf_set_next(state_buf, int, MDS_RESPAWN_VARS_VERSION); buf_set_next(state_buf, sig_atomic_t, reviving); buf_set_next(state_buf, time_t, antiepoch.tv_sec); buf_set_next(state_buf, long, antiepoch.tv_nsec); for (i = 0; i < servers; i++) { buf_set_next(state_buf, pid_t, states[i].pid); buf_set_next(state_buf, int, states[i].state); buf_set_next(state_buf, time_t, states[i].started.tv_sec); buf_set_next(state_buf, long, states[i].started.tv_nsec); } free(states); return 0; } /** * Unmarshal server implementation specific data and update the servers state accordingly * * On critical failure the program should `abort()` or exit by other means. * That is, do not let `reexec_failure_recover` run successfully, if it unrecoverable * error has occurred or one severe enough that it is better to simply respawn. 
* * @param state_buf The marshalled data that as not been read already * @return Non-zero on error */ int unmarshal_server(char* state_buf) { size_t i; struct timespec antiepoch; struct timespec epoch; epoch.tv_sec = 0; epoch.tv_nsec = 0; (void) monotone(&epoch); /* buf_get_next(state_buf, int, MDS_RESPAWN_VARS_VERSION); */ buf_next(state_buf, int, 1); buf_get_next(state_buf, sig_atomic_t, reviving); buf_get_next(state_buf, time_t, antiepoch.tv_sec); buf_get_next(state_buf, long, antiepoch.tv_nsec); epoch.tv_sec -= antiepoch.tv_sec; epoch.tv_nsec -= antiepoch.tv_nsec; for (i = 0; i < servers; i++) { buf_get_next(state_buf, pid_t, states[i].pid); buf_get_next(state_buf, int, states[i].state); buf_get_next(state_buf, time_t, states[i].started.tv_sec); buf_get_next(state_buf, long, states[i].started.tv_nsec); if (validate_state(states[i].state) == 0) { states[i].state = CREMATED; eprintf("invalid state unmarshallaed for `%s', cremating.", commands[i][0]); } else if (states[i].state == ALIVE) { live_count++; /* Monotonic time epoch adjusment, the epoch of the monotonic clock is unspecified, so we cannot know whether an exec with cause a time jump. */ states[i].started.tv_sec -= epoch.tv_sec; states[i].started.tv_nsec -= epoch.tv_nsec; if (states[i].started.tv_nsec < 0) { states[i].started.tv_sec -= 1; states[i].started.tv_nsec += 1000000000; } else if (states[i].started.tv_nsec > 0) { states[i].started.tv_sec += 1; states[i].started.tv_nsec -= 1000000000; } } } return 0; } /** * Attempt to recover from a re-exec failure that has been * detected after the server successfully updated it execution image * * @return Non-zero on error */ int __attribute__((cold, const)) reexec_failure_recover(void) { /* Re-exec cannot fail. 
*/ return 0; } /** * Respawn a server that has exited if appropriate * * @param pid The process ID of the server that has exited * @param status The server's death status */ static void joined_with_server(pid_t pid, int status) { struct timespec ended; size_t i; /* Find index of reaped server. */ for (i = 0; i < servers; i++) if (states[i].pid == pid) break; if (i == servers) { eprintf("joined with unknown child process: %i", pid); return; } /* Do nothing if the server is cremated. */ if (states[i].state == CREMATED) { eprintf("cremated child process `%s' exited, ignoring.", commands[i][0]); return; } /* Mark server as dead if it was alive. */ if (states[i].state == ALIVE) live_count--; states[i].state = DEAD; /* Cremate server if it exited normally or was killed nicely. */ if (WIFEXITED(status) ? (WEXITSTATUS(status) == 0) : ((WTERMSIG(status) == SIGTERM) || (WTERMSIG(status) == SIGINT))) { eprintf("child process `%s' exited normally, cremating.", commands[i][0]); states[i].state = CREMATED; return; } /* Print exit status of the reaped server. */ if (WIFEXITED(status)) eprintf("`%s' exited with code %i.", commands[i][0], WEXITSTATUS(status)); else eprintf("`%s' died by signal %i.", commands[i][0], WTERMSIG(status)); /* When did the server exit. */ if (monotone(&ended) < 0) { xperror(*argv); eprintf("`%s' died abnormally, burying because we could not read the time.", commands[i][0]); states[i].state = DEAD_AND_BURIED; return; } /* Bury the server if it died abnormally too fast. */ if (ended.tv_sec - states[i].started.tv_sec < interval) { eprintf("`%s' died abnormally, burying because it died too fast.", commands[i][0]); states[i].state = DEAD_AND_BURIED; return; } /* Respawn server if it died abnormally in a responable time. 
*/ eprintf("`%s' died abnormally, respawning.", commands[i][0]); spawn_server(i); } /** * Perform the server's mission * * @return Non-zero on error */ int master_loop(void) { int status, rc = 0; size_t i; while (!reexecing && !terminating && live_count) { pid_t pid = uninterruptable_waitpid(-1, &status, 0); if (reviving) for (reviving = 0, i = 0; i < servers; i++) if (states[i].state == DEAD_AND_BURIED) spawn_server(i); if (pid == (pid_t)-1) { xperror(*argv); rc = 1; break; } joined_with_server(pid, status); } free(commands_args); free(commands); if (reexecing == 0) free(states); return rc; } /** * This function is called when a signal that * signals that the system to dump state information * and statistics has been received * * @param signo The signal that has been received */ void received_info(int signo) { SIGHANDLER_START; server_state_t state; size_t i, n = servers; char** cmdline; struct timespec now; (void) signo; if (monotone(&now) < 0) iprint("(unable to get current time)"); else iprintf("current time: %ji.%09li", (intmax_t)(now.tv_sec), (long)(now.tv_nsec)); iprintf("do-not-resuscitate period: %i seconds", interval); iprintf("managed servers: %zu", n); iprintf("alive servers: %zu", live_count); iprintf("reviving: %s", reviving ? "yes" : "no"); for (i = 0; i < n; i++) { state = states[i]; cmdline = commands[i]; iprintf("managed server %zu: pid: %li", i, (long)(state.pid)); iprintf("managed server %zu: state: %s", i, state.state == UNBORN ? "not started yet" : state.state == ALIVE ? "up and running" : state.state == DEAD ? "about to be respawn" : state.state == DEAD_AND_BURIED ? "requires SIGUSR2 to respawn" : state.state == CREMATED ? "will never respawn" : "unrecognised state, something is wrong here!"); iprintf("managed server %zu: started: %ji.%09li", i, (intmax_t)(state.started.tv_sec), (long)(state.started.tv_nsec)); iprintf("managed server %zu: cmdline:", i); while (*cmdline) iprintf(" %z", *cmdline++); } SIGHANDLER_END; }