From 5bdd93081f2d5f06594a24b1535e1e95a6d7c1aa Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Mon, 13 Mar 2023 20:08:23 +0100 Subject: [PATCH 1/2] cgroup: systemd initialize rt limits Signed-off-by: Giuseppe Scrivano --- src/libcrun/cgroup-systemd.c | 74 +++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/src/libcrun/cgroup-systemd.c b/src/libcrun/cgroup-systemd.c index 35ae099346..e1051b1f82 100644 --- a/src/libcrun/cgroup-systemd.c +++ b/src/libcrun/cgroup-systemd.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #ifdef HAVE_SYSTEMD @@ -68,17 +69,79 @@ get_systemd_scope_and_slice (const char *id, const char *cgroup_path, char **sco } } +/* set the rt-runtime for the current cgroup and its parent if the path is not a scope. */ +static int +setup_rt_runtime (runtime_spec_schema_config_linux_resources *resources, + const char *path, libcrun_error_t *err) +{ + cleanup_free char *cgroup_path = NULL; + cleanup_close int dirfd = -1; + bool need_set_parent = true; + char fmt_buf[64]; + size_t len; + int ret; + + if (resources == NULL || resources->cpu == NULL) + return 0; + + if (has_suffix (path, ".scope")) + need_set_parent = false; + + ret = append_paths (&cgroup_path, err, CGROUP_ROOT, "cpu", path, NULL); + if (UNLIKELY (ret < 0)) + return ret; + + dirfd = open (cgroup_path, O_DIRECTORY | O_CLOEXEC); + if (UNLIKELY (dirfd < 0)) + return crun_make_error (err, errno, "open `%s`", cgroup_path); + + if (resources->cpu->realtime_period) + { + len = sprintf (fmt_buf, "%" PRIu64, resources->cpu->realtime_period); + + if (need_set_parent) + { + ret = write_file_at_with_flags (dirfd, O_WRONLY, 0, "../cpu.rt_period_us", fmt_buf, len, err); + if (UNLIKELY (ret < 0)) + return ret; + } + + ret = write_file_at_with_flags (dirfd, O_WRONLY, 0, "cpu.rt_period_us", fmt_buf, len, err); + if (UNLIKELY (ret < 0)) + return ret; + } + + if (resources->cpu->realtime_runtime) + { + len = sprintf (fmt_buf, "%" PRIu64, 
resources->cpu->realtime_runtime); + + if (need_set_parent) + { + ret = write_file_at_with_flags (dirfd, O_WRONLY, 0, "../cpu.rt_runtime_us", fmt_buf, len, err); + if (UNLIKELY (ret < 0)) + return ret; + } + + ret = write_file_at_with_flags (dirfd, O_WRONLY, 0, "cpu.rt_runtime_us", fmt_buf, len, err); + if (UNLIKELY (ret < 0)) + return ret; + } + return 0; +} + + static int systemd_finalize (struct libcrun_cgroup_args *args, char **path_out, int cgroup_mode, const char *suffix, libcrun_error_t *err) { + runtime_spec_schema_config_linux_resources *resources = args->resources; + cleanup_free char *cgroup_path = NULL; cleanup_free char *content = NULL; cleanup_free char *path = NULL; pid_t pid = args->pid; int ret; char *from, *to; char *saveptr = NULL; - cleanup_free char *cgroup_path = NULL; xasprintf (&cgroup_path, "/proc/%d/cgroup", pid); ret = read_all_file (cgroup_path, &content, NULL, err); @@ -148,6 +211,11 @@ systemd_finalize (struct libcrun_cgroup_args *args, char **path_out, } } } + + ret = setup_rt_runtime (resources, path, err); + if (UNLIKELY (ret < 0)) + return ret; + break; case CGROUP_MODE_UNIFIED: @@ -1071,6 +1139,10 @@ libcrun_update_resources_systemd (struct libcrun_cgroup_status *cgroup_status, goto exit; } + ret = setup_rt_runtime (resources, cgroup_status->path, err); + if (UNLIKELY (ret < 0)) + goto exit; + ret = 0; exit: From 6ba6a00250da6efb07d820d9835f92918e7eedaf Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Mon, 13 Mar 2023 16:15:00 +0100 Subject: [PATCH 2/2] container: add custom annotation to specify the scheduler This commit adds a new feature to the container runtime that allows users to set the scheduling policy of the container process. The new feature is implemented as a custom annotation. To set the scheduling policy and priority, users can add a `run.oci.scheduler` annotation with a value in the format POLICY[|OPTION][#PRIORITY]. 
If no scheduling policy or priority is specified, the container process will use the current scheduling policy and priority. Signed-off-by: Giuseppe Scrivano --- crun.1 | 19 ++++++- crun.1.md | 13 +++++ src/libcrun/cgroup-systemd.c | 1 - src/libcrun/container.c | 4 ++ src/libcrun/linux.c | 102 +++++++++++++++++++++++++++++++++-- src/libcrun/linux.h | 2 + 6 files changed, 135 insertions(+), 6 deletions(-) diff --git a/crun.1 b/crun.1 index 8b583f939a..3dfcdb9cb0 100644 --- a/crun.1 +++ b/crun.1 @@ -685,6 +685,21 @@ wasm module is relayed back via crun. .RE +.SH \fB\fCrun.oci.scheduler\fR +.PP +The \fB\fCrun.oci.scheduler\fR annotation allows you to set the scheduling +policy for the container process. The value of the annotation should +be in the format \fB\fCPOLICY[|OPTION][#PRIORITY]\fR, where \fB\fCPOLICY\fR is the +name of the scheduling policy, \fB\fCOPTION\fR can be \fB\fCSCHED_RESET_ON_FORK\fR +and \fB\fCPRIORITY\fR is an optional integer priority value. + +.PP +It is an experimental feature and will be removed once the feature is in the +OCI runtime specs. + +.PP +Please refer to \fB\fCsched_setscheduler(2)\fR for more information. + .SH tmpcopyup mount options .PP If the \fB\fCtmpcopyup\fR option is specified for a tmpfs, then the path that @@ -814,8 +829,8 @@ For example, the mapping: \fB\fCuids=@1-3-10\fR, given a configuration like .PP will be converted to the absolute value \fB\fCuids=1-4-10\fR, where 4 is -calculated by adding 3 (container ID in the \fB\fCuids=\fR mapping) -+ 1 (\fB\fChostID - containerID\fR for the user namespace mapping where +calculated by adding 3 (container ID in the \fB\fCuids=\fR mapping) and 1 +(\fB\fChostID - containerID\fR for the user namespace mapping where \fB\fCcontainerID = 1\fR is found). .PP diff --git a/crun.1.md b/crun.1.md index eba111fb47..677ac70c55 100644 --- a/crun.1.md +++ b/crun.1.md @@ -541,6 +541,19 @@ workload natively. 
Accepts a `.wasm` binary as input and if `.wat` is provided it will be automatically compiled into a wasm module. Stdout of wasm module is relayed back via crun. +## `run.oci.scheduler` + +The `run.oci.scheduler` annotation allows you to set the scheduling +policy for the container process. The value of the annotation should +be in the format `POLICY[|OPTION][#PRIORITY]`, where `POLICY` is the +name of the scheduling policy, `OPTION` can be `SCHED_RESET_ON_FORK` +and `PRIORITY` is an optional integer priority value. + +It is an experimental feature and will be removed once the feature is in the +OCI runtime specs. + +Please refer to `sched_setscheduler(2)` for more information. + ## tmpcopyup mount options If the `tmpcopyup` option is specified for a tmpfs, then the path that diff --git a/src/libcrun/cgroup-systemd.c b/src/libcrun/cgroup-systemd.c index e1051b1f82..3384635cc5 100644 --- a/src/libcrun/cgroup-systemd.c +++ b/src/libcrun/cgroup-systemd.c @@ -129,7 +129,6 @@ setup_rt_runtime (runtime_spec_schema_config_linux_resources *resources, return 0; } - static int systemd_finalize (struct libcrun_cgroup_args *args, char **path_out, int cgroup_mode, const char *suffix, libcrun_error_t *err) diff --git a/src/libcrun/container.c b/src/libcrun/container.c index 2c6bee7489..62f226e7fe 100644 --- a/src/libcrun/container.c +++ b/src/libcrun/container.c @@ -2376,6 +2376,10 @@ libcrun_container_run_internal (libcrun_container_t *container, libcrun_context_ if (UNLIKELY (ret < 0)) goto fail; + ret = libcrun_set_scheduler (pid, container, err); + if (UNLIKELY (ret < 0)) + return ret; + /* The container is waiting that we write back. In this phase we can launch the prestart hooks. 
*/ if (def->hooks && def->hooks->prestart_len) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 96dd2c5298..89d27f4f1d 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -4650,7 +4651,8 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin } static int -join_process_parent_helper (pid_t child_pid, int sync_socket_fd, +join_process_parent_helper (libcrun_container_t *container, + pid_t child_pid, int sync_socket_fd, libcrun_container_status_t *status, bool need_move_to_cgroup, const char *sub_cgroup, int *terminal_fd, libcrun_error_t *err) @@ -4700,6 +4702,11 @@ join_process_parent_helper (pid_t child_pid, int sync_socket_fd, if (UNLIKELY (ret < 0)) return ret; } + + /* Join the scheduler immediately after joining the cgroup. */ + ret = libcrun_set_scheduler (pid, container, err); + if (UNLIKELY (ret < 0)) + return ret; } /* The write unblocks the grandchild process so it can run once we setup @@ -4944,6 +4951,16 @@ libcrun_join_process (libcrun_container_t *container, pid_t pid_to_join, pid = syscall_clone3 (&clone3_args); + if (pid > 0) + { + /* We need to set the scheduler as soon as possible after joining the cgroup, + because if it is a RT scheduler, other processes in the container could already + take the entire cpu time and stall the new process. */ + ret = libcrun_set_scheduler (pid, container, err); + if (UNLIKELY (ret < 0)) + return ret; + } + /* On errors, fall back to fork(). 
*/ if (pid < 0) { @@ -4961,8 +4978,9 @@ libcrun_join_process (libcrun_container_t *container, pid_t pid_to_join, { close_and_reset (&sync_socket_fd[1]); sync_fd = sync_socket_fd[0]; - return join_process_parent_helper (pid, sync_fd, status, need_move_to_cgroup, - sub_cgroup, terminal_fd, err); + return join_process_parent_helper (container, pid, sync_fd, status, + need_move_to_cgroup, sub_cgroup, + terminal_fd, err); } close_and_reset (&sync_socket_fd[0]); @@ -5270,3 +5288,81 @@ libcrun_kill_linux (libcrun_container_status_t *status, int signal, libcrun_erro return crun_make_error (err, errno, "kill container"); return 0; } + +int +libcrun_set_scheduler (pid_t pid, libcrun_container_t *container, libcrun_error_t *err) +{ + cleanup_free char *copy = NULL; + struct sched_param param; + int ret, policy, option; + char *v_priority; + const char *v; + char *sptr; + struct + { + const char *name; + int value; + int option_value; + } policies[] = { + { "SCHED_OTHER", SCHED_OTHER, 0 }, + { "SCHED_BATCH", SCHED_BATCH, 0 }, + { "SCHED_IDLE", SCHED_IDLE, 0 }, + { "SCHED_FIFO", SCHED_FIFO, 0 }, + { "SCHED_RR", SCHED_RR, 0 }, + { "SCHED_RESET_ON_FORK", 0, SCHED_RESET_ON_FORK }, + { NULL, 0, 0 }, + }; + + v = find_annotation (container, "run.oci.scheduler"); + if (LIKELY (v == NULL)) + return 0; + + memset (&param, 0, sizeof (param)); + + copy = xstrdup (v); + v_priority = strchr (copy, '#'); + if (v_priority) + *v_priority = '\0'; + + policy = 0; + option = 0; + for (v = strtok_r (copy, "|", &sptr); v; v = strtok_r (NULL, "|", &sptr)) + { + int i; + + for (i = 0; policies[i].name; i++) + if (strcmp (v, policies[i].name) == 0) + { + policy |= policies[i].value; + option |= policies[i].option_value; + break; + } + if (UNLIKELY (policies[i].name == NULL)) + return crun_make_error (err, 0, "invalid scheduler `%s`", v); + } + + if (v_priority) + { + long long priority; + char *ep = NULL; + + errno = 0; + priority = strtoll (v_priority + 1, &ep, 10); + if (UNLIKELY (ep != NULL && *ep != 
'\0')) + return crun_make_error (err, EINVAL, "parse scheduler annotation"); + if (UNLIKELY (errno)) + return crun_make_error (err, errno, "parse scheduler annotation"); + + if (priority >= INT_MAX || priority <= INT_MIN + || priority < sched_get_priority_min (policy) || priority > sched_get_priority_max (policy)) + return crun_make_error (err, 0, "scheduler priority value `%lli` out of range", priority); + + param.sched_priority = (int) priority; + } + + ret = sched_setscheduler (pid, option | policy, &param); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "sched_setscheduler"); + + return 0; +} diff --git a/src/libcrun/linux.h b/src/libcrun/linux.h index f8800c9228..b385e58aee 100644 --- a/src/libcrun/linux.h +++ b/src/libcrun/linux.h @@ -116,4 +116,6 @@ int libcrun_create_dev (libcrun_container_t *container, int devfd, int parse_idmapped_mount_option (runtime_spec_schema_config_schema *def, bool is_uids, char *option, char **out, size_t *len, libcrun_error_t *err); +int libcrun_set_scheduler (pid_t pid, libcrun_container_t *container, libcrun_error_t *err); + #endif