Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent dockerd from restarting after runtime error #198

Merged
merged 1 commit into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,28 +240,31 @@ Following are the possible values of `Status`:
```text
-1 NOT STARTED The application is not started.
0 RUNNING The application is started and dockerd is running.
1 TLS CERT MISSING Use TLS is selected but there but certificates are missing on the device.
1 DOCKERD STOPPED Dockerd was stopped successfully and will soon be restarted.
2 DOCKERD RUNTIME ERROR Dockerd has reported an error during runtime that needs to be resolved by the operator.
Change at least one parameter or restart the application in order to start dockerd again.
3 TLS CERT MISSING Use TLS is selected but the certificates are missing on the device.
The application is running but dockerd is stopped.
Upload certificates and restart the application or de-select Use TLS.
2 NO SOCKET Neither TCP Socket or IPC Socket are selected.
4 NO SOCKET Neither TCP Socket nor IPC Socket is selected.
The application is running but dockerd is stopped.
Select one or both sockets.
3 NO SD CARD Use SD Card is selected but no SD Card is mounted in the device.
5 NO SD CARD Use SD Card is selected but no SD Card is mounted in the device.
The application is running but dockerd is stopped.
Insert and mount a SD Card.
4 SD CARD WRONG FS Use SD Card is selected but the mounted SD Card has the wrong file system.
Insert and mount an SD Card.
6 SD CARD WRONG FS Use SD Card is selected but the mounted SD Card has the wrong file system.
The application is running but dockerd is stopped.
Format the SD Card with the correct file system.
5 SD CARD WRONG PERMISSION Use SD Card is selected but the application user does not have the correct file
7 SD CARD WRONG PERMISSION Use SD Card is selected but the application user does not have the correct file
permissions to use it.
The application is running but dockerd is stopped.
Make sure no directories with the wrong user permissions are left on the
SD Card. Then restart the application.
6 SD CARD MIGRATION FAILED Use SD Card is selected but migrating data from the old data root location to the
SD Card, then restart the application.
8 SD CARD MIGRATION FAILED Use SD Card is selected but migrating data from the old data root location to the
new one has failed.
The application is running but dockerd is stopped.
Manually back up and remove either the old data root folder, or the new
data root folder, from the SD card. Then restart the application.
Manually back up and remove either the old or the new data root folder from the SD card,
then restart the application.
```

## Building the Docker ACAP
Expand Down
106 changes: 73 additions & 33 deletions app/dockerdwrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,10 @@
#define PARAM_STATUS "Status"

typedef enum {
STATUS_NOT_STARTED = 0,
STATUS_NOT_STARTED = 0, // Index in the array, not the actual status code
STATUS_RUNNING,
STATUS_DOCKERD_STOPPED,
STATUS_DOCKERD_RUNTIME_ERROR,
STATUS_TLS_CERT_MISSING,
STATUS_NO_SOCKET,
STATUS_NO_SD_CARD,
Expand All @@ -53,12 +55,14 @@ typedef enum {

// Text written to the Status parameter, indexed by status_code_t.
// The number at the start of each string is the status code shown to the
// operator; it is not the same as the enum value used as array index.
// Keep the order in sync with status_code_t.
static const char* const status_code_strs[STATUS_CODE_COUNT] = {
    "-1 NOT STARTED",
    "0 RUNNING",
    "1 DOCKERD STOPPED",
    "2 DOCKERD RUNTIME ERROR",
    "3 TLS CERT MISSING",
    "4 NO SOCKET",
    "5 NO SD CARD",
    "6 SD CARD WRONG FS",
    "7 SD CARD WRONG PERMISSION",
    "8 SD CARD MIGRATION FAILED",
};

struct settings {
char* data_root;
Expand All @@ -68,10 +72,18 @@ struct settings {
};

// State that main() owns and hands to the callbacks as their user data pointer.
struct app_state {
// Checked by main() before it starts dockerd. Set to true at startup and when a
// parameter changes; set to false when dockerd exits with a non-zero exit code.
bool allow_dockerd_to_start;
// Heap-allocated string, freed by main() on exit.
// NOTE(review): presumably the SD card storage path set by the SD card callback - confirm.
char* sd_card_area;
// AXParameter handle returned by setup_axparameter(); used to update the Status parameter.
AXParameter* param_handle;
};

// Describes how a child process terminated.
// If the process exited with an exit code, code holds it (>= 0) and signal is 0.
// If the process did not exit normally (e.g. was killed by a signal), code is -1
// and signal is set from the wait status of the process.
// NOTE(review): signal is assigned the raw wait status, not an extracted signal number.
struct exit_cause {
int code;
int signal;
};

/**
* @brief Callback called when the dockerd process exits.
*/
Expand Down Expand Up @@ -653,22 +665,43 @@ static void stop_dockerd(void) {
log_info("Stopped dockerd.");
}

static struct exit_cause child_process_exit_cause(int status, GError* error) {
struct exit_cause result;
result.code = -1;
result.signal = 0;

if (g_spawn_check_wait_status(status, &error) || error->domain == G_SPAWN_EXIT_ERROR)
result.code = error ? error->code : 0;
else if (error->domain == G_SPAWN_ERROR && error->code == G_SPAWN_ERROR_FAILED)
result.signal = status;

return result;
}

static void log_child_process_exit_cause(const char* name, GPid pid, int status) {
GError* error = NULL;
struct exit_cause exit_cause = child_process_exit_cause(status, error);

char msg[128];
const char* end = msg + sizeof(msg);
char* ptr = msg + g_snprintf(msg, end - msg, "Child process %s (%d)", name, pid);

if (g_spawn_check_wait_status(status, &error) || error->domain == G_SPAWN_EXIT_ERROR)
g_snprintf(ptr, end - ptr, " exited with exit code %d", error ? error->code : 0);
else if (error->domain == G_SPAWN_ERROR && error->code == G_SPAWN_ERROR_FAILED)
g_snprintf(ptr, end - ptr, " was killed by signal %d", status);
if (exit_cause.code >= 0)
g_snprintf(ptr, end - ptr, " exited with exit code %d", exit_cause.code);
else if (exit_cause.signal > 0)
g_snprintf(ptr, end - ptr, " was killed by signal %d", exit_cause.signal);
else
g_snprintf(ptr, end - ptr, " terminated in an unexpected way: %s", error->message);
g_clear_error(&error);
log_debug("%s", msg);
}

static bool child_process_exited_with_error(int status) {
GError* error = NULL;
struct exit_cause exit_cause = child_process_exit_cause(status, error);
g_clear_error(&error);
return exit_cause.code > 0;
}

/**
* @brief Callback called when the dockerd process exits.
*/
Expand All @@ -677,6 +710,11 @@ static void dockerd_process_exited_callback(GPid pid, gint status, gpointer app_

struct app_state* app_state = app_state_void_ptr;

bool runtime_error = child_process_exited_with_error(status);
app_state->allow_dockerd_to_start = !runtime_error;
status_code_t s = runtime_error ? STATUS_DOCKERD_RUNTIME_ERROR : STATUS_DOCKERD_STOPPED;
set_status_parameter(app_state->param_handle, s);

dockerd_process_pid = -1;
g_spawn_close_pid(pid);

Expand All @@ -700,26 +738,26 @@ static gboolean quit_main_loop(void*) {
* @param name Name of the updated parameter.
* @param value Value of the updated parameter.
*/
static void
parameter_changed_callback(const gchar* name, const gchar* value, gpointer app_state_void_ptr) {
    static const char prefix[] = "root." APP_NAME ".";
    const size_t prefix_len = sizeof(prefix) - 1;

    // Strip the "root.<app name>." prefix for logging, but only if it is
    // actually there, so that a shorter name is never read out of bounds.
    const gchar* parname = strncmp(name, prefix, prefix_len) == 0 ? name + prefix_len : name;

    log_info("%s changed to %s", parname, value);

    struct app_state* app_state = app_state_void_ptr;

    // If dockerd has failed before, this parameter change may have resolved the problem.
    app_state->allow_dockerd_to_start = true;

    // Trigger a restart of dockerd from main(), but delay it 1 second.
    // When there are multiple AXParameter callbacks in a queue, such as
    // during the first parameter change after installation, any parameter
    // usage, even outside a callback, will cause a 20 second deadlock per
    // queued callback.
    g_timeout_add_seconds(1, quit_main_loop, NULL);
}

static AXParameter* setup_axparameter(void) {
static AXParameter* setup_axparameter(struct app_state* app_state) {
bool success = false;
GError* error = NULL;
AXParameter* ax_parameter = ax_parameter_new(APP_NAME, &error);
Expand All @@ -733,7 +771,7 @@ static AXParameter* setup_axparameter(void) {
gboolean geresult = ax_parameter_register_callback(ax_parameter,
parameter_path,
parameter_changed_callback,
NULL,
app_state,
&error);
free(parameter_path);

Expand Down Expand Up @@ -784,7 +822,9 @@ int main(int argc, char** argv) {
parse_command_line(argc, argv, &log_settings);
log_init(&log_settings);

app_state.param_handle = setup_axparameter();
app_state.allow_dockerd_to_start = true;

app_state.param_handle = setup_axparameter(&app_state);
if (!app_state.param_handle) {
log_error("Error in setup_axparameter");
return EX_SOFTWARE;
Expand All @@ -797,17 +837,15 @@ int main(int argc, char** argv) {
struct sd_disk_storage* sd_disk_storage = sd_disk_storage_init(sd_card_callback, &app_state);

while (application_exit_code == EX_KEEP_RUNNING) {
if (dockerd_process_pid == -1)
if (dockerd_process_pid == -1 && app_state.allow_dockerd_to_start)
read_settings_and_start_dockerd(&app_state);

main_loop_run();

log_settings.debug = is_app_log_level_debug(app_state.param_handle);

stop_dockerd();
set_status_parameter(app_state.param_handle, STATUS_NOT_STARTED);
}

main_loop_unref();

if (app_state.param_handle != NULL) {
Expand All @@ -821,5 +859,7 @@ int main(int argc, char** argv) {

sd_disk_storage_free(sd_disk_storage);
free(app_state.sd_card_area);

set_status_parameter(app_state.param_handle, STATUS_NOT_STARTED);
return application_exit_code;
}