From f5f17cde0de1026f92fe47f87a9f81e77e7bdae4 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 4 Aug 2022 11:57:31 -0700
Subject: [PATCH] tests/int/checkpoint: fix lazy migration flakiness

When doing a lazy checkpoint/restore, we should not restore into the
same cgroup, otherwise there is a race which results in occasional
killing of the restored container (GH #2760, #2924).

The fix is to use --manage-cgroups-mode=ignore, which allows restoring
into a different cgroup.

Note that since cgroupsPath is not set in config.json, the cgroup is
derived from the container name, so calling set_cgroups_path is not
needed.

For the previous (unsuccessful) attempt to fix this, as well as detailed
(and apparently correct) analysis, see commit 36fe3cc28c35d7157dc8b.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 tests/integration/checkpoint.bats | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats
index fe351e7476a..0a8e58a2c9c 100644
--- a/tests/integration/checkpoint.bats
+++ b/tests/integration/checkpoint.bats
@@ -224,7 +224,14 @@ function simple_cr() {
 	# TCP port for lazy migration
 	port=27277
 
-	__runc checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_w} --work-path ./work-dir --image-path ./image-dir test_busybox &
+	__runc checkpoint \
+		--lazy-pages \
+		--page-server 0.0.0.0:${port} \
+		--status-fd ${lazy_w} \
+		--manage-cgroups-mode=ignore \
+		--work-path ./work-dir \
+		--image-path ./image-dir \
+		test_busybox &
 	cpt_pid=$!
 
 	# wait for lazy page server to be ready
@@ -246,14 +253,18 @@ function simple_cr() {
 	lp_pid=$!
 
 	# Restore lazily from checkpoint.
-	# The restored container needs a different name (as well as systemd
-	# unit name, in case systemd cgroup driver is used) as the checkpointed
-	# container is not yet destroyed. It is only destroyed at that point
-	# in time when the last page is lazily transferred to the destination.
+	#
+	# The restored container needs a different name and a different cgroup
+	# (and a different systemd unit name, in case systemd cgroup driver is
+	# used) as the checkpointed container is not yet destroyed. It is only
+	# destroyed at that point in time when the last page is lazily
+	# transferred to the destination.
+	#
 	# Killing the CRIU on the checkpoint side will let the container
 	# continue to run if the migration failed at some point.
-	[ -v RUNC_USE_SYSTEMD ] && set_cgroups_path
-	runc_restore_with_pipes ./image-dir test_busybox_restore --lazy-pages
+	runc_restore_with_pipes ./image-dir test_busybox_restore \
+		--lazy-pages \
+		--manage-cgroups-mode=ignore
 
 	wait $cpt_pid