From f5f17cde0de1026f92fe47f87a9f81e77e7bdae4 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 4 Aug 2022 11:57:31 -0700 Subject: [PATCH] tests/int/checkpoint: fix lazy migration flakiness When doing a lazy checkpoint/restore, we should not restore into the same cgroup, otherwise there is a race which results in occasional killing of the restored container (GH #2760, #2924). The fix is to use --manage-cgroups-mode=ignore, which allows restoring into a different cgroup. Note that since cgroupsPath is not set in config.json, the cgroup is derived from the container name, so calling set_cgroups_path is not needed. For the previous (unsuccessful) attempt to fix this, as well as a detailed (and apparently correct) analysis, see commit 36fe3cc28c35d7157dc8b. Signed-off-by: Kir Kolyshkin --- tests/integration/checkpoint.bats | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats index fe351e7476a..0a8e58a2c9c 100644 --- a/tests/integration/checkpoint.bats +++ b/tests/integration/checkpoint.bats @@ -224,7 +224,14 @@ function simple_cr() { # TCP port for lazy migration port=27277 - __runc checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_w} --work-path ./work-dir --image-path ./image-dir test_busybox & + __runc checkpoint \ + --lazy-pages \ + --page-server 0.0.0.0:${port} \ + --status-fd ${lazy_w} \ + --manage-cgroups-mode=ignore \ + --work-path ./work-dir \ + --image-path ./image-dir \ + test_busybox & cpt_pid=$! # wait for lazy page server to be ready @@ -246,14 +253,18 @@ function simple_cr() { lp_pid=$! # Restore lazily from checkpoint. - # The restored container needs a different name (as well as systemd - # unit name, in case systemd cgroup driver is used) as the checkpointed - # container is not yet destroyed. It is only destroyed at that point - # in time when the last page is lazily transferred to the destination. 
+ # + # The restored container needs a different name and a different cgroup + # (and a different systemd unit name, in case systemd cgroup driver is + # used) as the checkpointed container is not yet destroyed. It is only + # destroyed at that point in time when the last page is lazily + # transferred to the destination. + # # Killing the CRIU on the checkpoint side will let the container # continue to run if the migration failed at some point. - [ -v RUNC_USE_SYSTEMD ] && set_cgroups_path - runc_restore_with_pipes ./image-dir test_busybox_restore --lazy-pages + runc_restore_with_pipes ./image-dir test_busybox_restore \ + --lazy-pages \ + --manage-cgroups-mode=ignore wait $cpt_pid