From fb0422c5587132d269ced2522937dd2c0d2f3603 Mon Sep 17 00:00:00 2001
From: Delyan Kratunov <delyank@meta.com>
Date: Thu, 3 Nov 2022 12:50:40 -0700
Subject: [PATCH] tests: teach runtime runner about multiple before clauses

As noted in iovisor/bpftrace#2402, usdt flakiness was made better by
508538a but not fully fixed.

This commit is what I should have done all along: it allows the test
runner to parse and wait for multiple BEFORE clauses and thus ensures
the processes have started before the test runs.

There are two minor changes:

  1. The check for child processes is now `ps --ppid` based to
     eventually allow parallel process runs in the same environment.
     That requires to use `ps` from the `procps` package on Alpine as
     the default BusyBox one doesn't have the `--ppid` option.
  2. Because of the `ps` usage, the name check is now truncated to 15
     chars, which will fail if TASK_COMM_LEN is not 16. That looks like
     a constant in the kernel, so I think we're good.
---
 docker/Dockerfile.alpine                   |  1 +
 tests/runtime/engine/parser.py             |  8 +--
 tests/runtime/engine/runner.py             | 70 +++++++++++++++-------
 tests/runtime/usdt                         |  3 +-
 tests/testprogs/two_usdt_semaphore_tests.c | 17 ------
 5 files changed, 55 insertions(+), 44 deletions(-)
 delete mode 100644 tests/testprogs/two_usdt_semaphore_tests.c

diff --git a/docker/Dockerfile.alpine b/docker/Dockerfile.alpine
index 3166bfccd6ab..85f32898e8ee 100644
--- a/docker/Dockerfile.alpine
+++ b/docker/Dockerfile.alpine
@@ -29,6 +29,7 @@ RUN apk add --update \
   llvm${LLVM_VERSION}-dev \
   llvm${LLVM_VERSION}-static \
   musl-obstack-dev \
+  procps \
   python3 \
   wget \
   xxd \
diff --git a/tests/runtime/engine/parser.py b/tests/runtime/engine/parser.py
index 6e6fb0b529ef..82b9240ac401 100644
--- a/tests/runtime/engine/parser.py
+++ b/tests/runtime/engine/parser.py
@@ -25,7 +25,7 @@ class InvalidFieldError(Exception):
         'prog',
         'expect',
         'timeout',
-        'before',
+        'befores',
         'after',
         'suite',
         'kernel_min',
@@ -101,7 +101,7 @@ def __read_test_struct(test, test_suite):
         prog = ''
         expect = ''
         timeout = ''
-        before = ''
+        befores = []
         after = ''
         kernel_min = ''
         kernel_max = ''
@@ -128,7 +128,7 @@ def __read_test_struct(test, test_suite):
             elif item_name == 'TIMEOUT':
                 timeout = int(line.strip(' '))
             elif item_name == 'BEFORE':
-                before = line
+                befores.append(line)
             elif item_name == 'AFTER':
                 after = line
             elif item_name == 'MIN_KERNEL':
@@ -194,7 +194,7 @@ def __read_test_struct(test, test_suite):
             prog,
             expect,
             timeout,
-            before,
+            befores,
             after,
             test_suite,
             kernel_min,
diff --git a/tests/runtime/engine/runner.py b/tests/runtime/engine/runner.py
index 63dcd1a76d9a..6181a64876c5 100644
--- a/tests/runtime/engine/runner.py
+++ b/tests/runtime/engine/runner.py
@@ -158,11 +158,13 @@ def run_test(test):
         signal.signal(signal.SIGALRM, Runner.__handler)
 
         p = None
-        before = None
+        befores = []
         bpftrace = None
         after = None
         try:
+            result = None
             timeout = False
+            output = ""
 
             print(ok("[ RUN      ] ") + "%s.%s" % (test.suite, test.name))
             if test.requirement:
@@ -195,22 +197,45 @@ def run_test(test):
                         print(warn("[   SKIP   ] ") + "%s.%s" % (test.suite, test.name))
                         return Runner.SKIP_FEATURE_REQUIREMENT_UNSATISFIED
 
-            if test.before:
-                before = subprocess.Popen(test.before, shell=True, preexec_fn=os.setsid)
-                waited=0
+            if test.befores:
+                for before in test.befores:
+                    before = subprocess.Popen(before.split(), preexec_fn=os.setsid)
+                    befores.append(before)
+
                 with open(os.devnull, 'w') as dn:
-                    # This might not work for complicated cases, such as if
-                    # a test program needs to accept arguments. It covers the
-                    # current simple calls with no arguments
-                    child_name = os.path.basename(test.before.split()[-1])
-                    while subprocess.call(["pidof", child_name], stdout=dn, stderr=dn) != 0:
+                    child_names = [os.path.basename(x.strip().split()[-1]) for x in test.befores]
+                    child_names = sorted((x[:15] for x in child_names))  # cut to comm length
+                    print(f"child_names: %{child_names}")
+
+                    # Print the names of all of our children and look
+                    # for the ones from BEFORE clauses
+                    waited=0
+                    while waited <= test.timeout:
+                        children = subprocess.run(["ps", "--ppid", str(os.getpid()), "--no-headers", "-o", "comm"],
+                                                  check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                        if children.returncode == 0 and children.stdout:
+                            lines = [line.decode('utf-8') for line in children.stdout.splitlines()]
+                            lines = sorted((line.strip() for line in lines if line != 'ps'))
+                            print(f"lines: %{lines}")
+                            if lines == child_names:
+                                break
+                        else:
+                            print(children.stderr)
+
                         time.sleep(0.1)
                         waited+=0.1
-                        if waited > test.timeout:
-                            raise TimeoutError('Timed out waiting for BEFORE %s ', test.before)
+
+                    if waited > test.timeout:
+                        raise TimeoutError(f'Timed out waiting for BEFORE(s) {test.befores}')
 
             bpf_call = Runner.prepare_bpf_call(test)
-            if test.before and '{{BEFORE_PID}}' in bpf_call:
+            if test.befores and '{{BEFORE_PID}}' in bpf_call:
+                if len(test.befores) > 1:
+                    raise ValueError(f"test has {len(test.befores)} BEFORE clauses but BEFORE_PID usage requires exactly one")
+
+                child_name = test.befores[0].strip().split()[-1]
+                child_name = os.path.basename(child_name)
+
                 childpid = subprocess.Popen(["pidof", child_name], stdout=subprocess.PIPE, universal_newlines=True).communicate()[0].split()[0]
                 bpf_call = re.sub("{{BEFORE_PID}}", str(childpid), bpf_call)
             env = {
@@ -234,8 +259,6 @@ def run_test(test):
 
             signal.alarm(ATTACH_TIMEOUT)
 
-            output = ""
-
             while p.poll() is None:
                 nextline = p.stdout.readline()
                 output += nextline
@@ -268,15 +291,18 @@ def run_test(test):
                     os.killpg(os.getpgid(p.pid), signal.SIGTERM)
                 output += p.communicate()[0]
                 result = re.search(test.expect, output)
-            if not result:
-                print(fail("[  TIMEOUT ] ") + "%s.%s" % (test.suite, test.name))
-                print('\tCommand: %s' % bpf_call)
-                print('\tTimeout: %s' % test.timeout)
-                print('\tCurrent output: %s' % output)
-                return Runner.TIMEOUT
+
+                if not result:
+                    print(fail("[  TIMEOUT ] ") + "%s.%s" % (test.suite, test.name))
+                    print('\tCommand: %s' % bpf_call)
+                    print('\tTimeout: %s' % test.timeout)
+                    print('\tCurrent output: %s' % output)
+                    return Runner.TIMEOUT
         finally:
-            if before and before.poll() is None:
-                os.killpg(os.getpgid(before.pid), signal.SIGKILL)
+            if befores:
+                for before in befores:
+                    if before.poll() is None:
+                        os.killpg(os.getpgid(before.pid), signal.SIGKILL)
 
             if bpftrace and bpftrace.poll() is None:
                 os.killpg(os.getpgid(p.pid), signal.SIGKILL)
diff --git a/tests/runtime/usdt b/tests/runtime/usdt
index 34c90edb3eb5..21231b823f6a 100644
--- a/tests/runtime/usdt
+++ b/tests/runtime/usdt
@@ -247,7 +247,8 @@ NAME "usdt probes - file based semaphore activation multi process"
 RUN {{BPFTRACE}} runtime/scripts/usdt_file_activation_multiprocess.bt --usdt-file-activation
 EXPECT found 2 processes
 TIMEOUT 5
-BEFORE ./testprogs/two_usdt_semaphore_tests
+BEFORE ./testprogs/usdt_semaphore_test
+BEFORE ./testprogs/usdt_semaphore_test
 REQUIRES ./testprogs/usdt_semaphore_test should_not_skip
 
 NAME "usdt probes - list probes by pid in separate mountns"
diff --git a/tests/testprogs/two_usdt_semaphore_tests.c b/tests/testprogs/two_usdt_semaphore_tests.c
deleted file mode 100644
index d271ea2cb6da..000000000000
--- a/tests/testprogs/two_usdt_semaphore_tests.c
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <unistd.h>
-
-int main(int argc __attribute__((unused)), char **argv)
-{
-  int pid = fork();
-
-  if (pid < 0)
-    return 1;
-  else if (pid == 0)
-    // child, test 1
-    execv("./testprogs/usdt_semaphore_test", argv);
-  else
-    // parent, test 2
-    execv("./testprogs/usdt_semaphore_test", argv);
-
-  return 0;
-}