From de94df188f4ca3be13e95b7983c08b8e3f1960e1 Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Sat, 7 Jun 2014 15:49:51 +0200 Subject: finish implementation of mds-respawn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mattias Andrée --- src/mds-respawn.c | 266 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 241 insertions(+), 25 deletions(-) diff --git a/src/mds-respawn.c b/src/mds-respawn.c index 9f9dc78..8d5fb0a 100644 --- a/src/mds-respawn.c +++ b/src/mds-respawn.c @@ -24,6 +24,15 @@ #include #include #include +#include +#include +#include +#include + + + +#define MDS_RESPAWN_VARS_VERSION 0 + /** @@ -45,7 +54,7 @@ server_characteristics_t server_characteristics = /** * Do not respawn crashed servers that did not live this many seconds */ -static int interval = 5; +static int interval = RESPAWN_TIME_LIMIT_SECONDS; /** * The number of servers managed by this process @@ -55,19 +64,29 @@ static size_t servers = 0; /** * Command line arguments, for each server — concatenated, with NULL termination */ -static const char** commands_args = NULL; +static char** commands_args = NULL; /** * Mapping elements in `commands_args` that are the first * argument for each server to run */ -static const char*** commands = NULL; +static char*** commands = NULL; /** * States of managed servers */ static server_state_t* states = NULL; +/** + * Whether a revive request has been received but not processed + */ +static volatile sig_atomic_t reviving = 0; + +/** + * The number of servers that are alive + */ +static size_t live_count = 0; + /** @@ -92,9 +111,9 @@ int parse_cmdline(void) servers += stack++ == 0 ? 1 : 0; else if (strequals(arg, "}")) { - exit_if (stack-- == 0, eprint("Terminating non-started command, aborting");); + exit_if (stack-- == 0, eprint("Terminating non-started command, aborting.");); exit_if (stack == 0 && strequals(argv[i - 1], "{"), - eprint("Zero argument command specified, aborting");); + eprint("Zero argument command specified, aborting.");); } else if (stack == 0) eprintf("Unrecognised option: %s, did you forget `='?", arg); @@ -107,8 +126,8 @@ int parse_cmdline(void) eprint("re-exec performed."); } - exit_if (stack > 0, eprint("Non-terminated command specified, aborting");); - exit_if (servers == 0, eprint("No programs to spawn, aborting");); + exit_if (stack > 0, eprint("Non-terminated command specified, aborting.");); + exit_if (servers == 0, eprint("No programs to spawn, aborting.");); fail_if (xmalloc(commands_args, args + servers, char*)); fail_if (xmalloc(commands, servers, char**)); @@ -131,6 +150,47 @@ int parse_cmdline(void) } +/** + * Spawn a server + * + * @param index The index of the server + */ +static void spawn_server(size_t index) +{ + struct timespec started; + pid_t pid; + + if (monotone(&started) < 0) + { + perror(*argv); + eprintf("cannot read clock when starting %s, burying.", commands[index][0]); + states[index].state = DEAD_AND_BURIED; + return; + } + states[index].started = started; + + pid = fork(); + if (pid == (pid_t)-1) + { + perror(*argv); + eprintf("cannot fork in order to start %s, burying.", commands[index][0]); + states[index].state = DEAD_AND_BURIED; + return; + } + + if (pid) + { + states[index].pid = pid; + states[index].state = ALIVE; + return; + } + + execvp(commands[index][0], commands[index]); + perror(commands[index][0]); + _exit(1); +} + + /** * This function is called when a signal that * signals the program to respawn all @@ -140,8 +200,8 @@ int parse_cmdline(void) */ static void received_revive(int signo) { - /* TODO */ (void) signo; + reviving = 1; eprint("revive signal received."); } @@ -172,12 +232,14 @@ int preinitialise_server(void) */ int initialise_server(void) { +#if UNBORN != 0 size_t i; +#endif + memset(states, 0, servers * sizeof(server_state_t)); +#if UNBORN != 0 for (i = 0; i < servers; i++) - { - states[i].pid = 0; - states[i].state = UNBORN; - } + states[i].state = UNBORN; +#endif return 0; } @@ -192,17 +254,26 @@ int postinitialise_server(void) { size_t i, j; - /* TODO Spawn `UNBORN` servers. */ + for (i = 0; i < servers; i++) + if (states[i].state == UNBORN) + spawn_server(i); for (i = j = 0; j < servers; i++) if (commands_args[i] == NULL) j++; else if (strequals(commands_args[i], "--initial-spawn")) - commands_args[i] = "--respawn"; + if ((commands_args[i] = strdup("--respawn")) == NULL) + goto pfail; - /* TODO Spawn `DEAD` and `DEAD_AND_BURIED` servers. */ + for (i = 0; i < servers; i++) + if ((states[i].state == DEAD) || + (states[i].state == DEAD_AND_BURIED)) + spawn_server(i); return 0; + pfail: + perror(*argv); + return 1; } @@ -216,8 +287,10 @@ int postinitialise_server(void) */ size_t marshal_server_size(void) { - /* TODO */ - return 0; + size_t rc = sizeof(int) + sizeof(sig_atomic_t); + rc += sizeof(time_t) + sizeof(long); + rc += servers * sizeof(server_state_t); + return rc; } @@ -229,8 +302,23 @@ size_t marshal_server_size(void) */ int marshal_server(char* state_buf) { - /* TODO */ - (void) state_buf; + size_t i; + struct timespec antiepoch; + antiepoch.tv_sec = 0; + antiepoch.tv_nsec = 0; + (void) monotone(&antiepoch); + buf_set_next(state_buf, int, MDS_RESPAWN_VARS_VERSION); + buf_set_next(state_buf, sig_atomic_t, reviving); + buf_set_next(state_buf, time_t, antiepoch.tv_sec); + buf_set_next(state_buf, long, antiepoch.tv_nsec); + for (i = 0; i < servers; i++) + { + buf_set_next(state_buf, pid_t, states[i].pid); + buf_set_next(state_buf, int, states[i].state); + buf_set_next(state_buf, time_t, states[i].started.tv_sec); + buf_set_next(state_buf, long, states[i].started.tv_nsec); + } + free(states); return 0; } @@ -247,8 +335,50 @@ int marshal_server(char* state_buf) */ int unmarshal_server(char* state_buf) { - /* TODO */ - (void) state_buf; + size_t i; + struct timespec antiepoch; + struct timespec epoch; + epoch.tv_sec = 0; + epoch.tv_nsec = 0; + (void) monotone(&epoch); + /* buf_get_next(state_buf, int, MDS_RESPAWN_VARS_VERSION); */ + buf_next(state_buf, int, 1); + buf_get_next(state_buf, sig_atomic_t, reviving); + buf_get_next(state_buf, time_t, antiepoch.tv_sec); + buf_get_next(state_buf, long, antiepoch.tv_nsec); + epoch.tv_sec -= antiepoch.tv_sec; + epoch.tv_nsec -= antiepoch.tv_nsec; + for (i = 0; i < servers; i++) + { + buf_get_next(state_buf, pid_t, states[i].pid); + buf_get_next(state_buf, int, states[i].state); + buf_get_next(state_buf, time_t, states[i].started.tv_sec); + buf_get_next(state_buf, long, states[i].started.tv_nsec); + if (validate_state(states[i].state) == 0) + { + states[i].state = CREMATED; + eprintf("invalid state unmarshallaed for `%s', cremating.", commands[i][0]); + } + else if (states[i].state == ALIVE) + { + live_count++; + /* Monotonic time epoch adjusment, the epoch of the monotonic + clock is unspecified, so we cannot know whether an exec + with cause a time jump. */ + states[i].started.tv_sec -= epoch.tv_sec; + states[i].started.tv_nsec -= epoch.tv_nsec; + if (states[i].started.tv_nsec < 0) + { + states[i].started.tv_sec -= 1; + states[i].started.tv_nsec += 1000000000; + } + else if (states[i].started.tv_nsec > 0) + { + states[i].started.tv_sec += 1; + states[i].started.tv_nsec -= 1000000000; + } + } + } return 0; } @@ -259,13 +389,71 @@ int unmarshal_server(char* state_buf) * * @return Non-zero on error */ -int reexec_failure_recover(void) +int __attribute__((cold, const)) reexec_failure_recover(void) { - /* TODO */ + /* Re-exec cannot fail. */ return 0; } +/** + * Respawn a server that has exited if appropriate + * + * @param pid The process ID of the server that has exited + * @param status The server's death status + */ +static void joined_with_server(pid_t pid, int status) +{ + struct timespec ended; + size_t i; + + for (i = 0; i < servers; i++) + if (states[i].pid == pid) + break; + if (i == servers) + { + eprintf("joined with unknown child process: %i", pid); + return; + } + + if (states[i].state == CREMATED) + { + eprintf("cremated child process %s exited, ignoring.", commands[i][0]); + return; + } + + if (states[i].state == ALIVE) + live_count--; + states[i].state = DEAD; + + if (WIFEXITED(status) ? (WEXITSTATUS(status) == 0) : + ((WTERMSIG(status) == SIGTERM) || (WTERMSIG(status) == SIGINT))) + { + eprintf("child process %s exited normally, cremating.", commands[i][0]); + states[i].state = CREMATED; + return; + } + + if (monotone(&ended) < 0) + { + perror(*argv); + eprintf("%s died abnormally, burying because we could not read the time.", commands[i][0]); + states[i].state = DEAD_AND_BURIED; + return; + } + + if (ended.tv_sec - states[i].started.tv_sec < RESPAWN_TIME_LIMIT_SECONDS) + { + eprintf("%s died abnormally, burying because it died too fast.", commands[i][0]); + states[i].state = DEAD_AND_BURIED; + return; + } + + eprintf("%s died abnormally, respawning.", commands[i][0]); + spawn_server(i); +} + + /** * Perform the server's mission * @@ -273,7 +461,35 @@ int reexec_failure_recover(void) */ int master_loop(void) { - /* TODO */ - return 0; + int status, rc = 0; + size_t i; + + while (!reexecing && !terminating && live_count) + { + pid_t pid = waitpid(-1, &status, 0); + + if (reviving) + for (reviving = 0, i = 0; i < servers; i++) + if (states[i].state == DEAD_AND_BURIED) + spawn_server(i); + + if (pid == (pid_t)-1) + { + if (errno == EINTR) + continue; + perror(*argv); + rc = 1; + break; + } + + joined_with_server(pid, status); + } + + free(commands_args); + free(commands); + if (reexecing == 0) + free(states); + + return rc; } -- cgit v1.2.3-70-g09d2