From 9ca94a4b555e9fb59959a7a139f98a0d1b0270b8 Mon Sep 17 00:00:00 2001
From: Alexander Turenko <alexander.turenko@tarantool.org>
Date: Wed, 16 Dec 2020 01:33:20 +0300
Subject: [PATCH] Kill app server after fail of a non-default server

See a test case in #252. The idea is the following: the default server
(it is the app server) executes a script, which starts a non-default
server and hangs. The non-default instance fails after some time (before
--test-timeout), test-run's crash detector observed this and stops the
test (kills the greenlet). We kill all other non-default servers in the
case, but prior to this commit, we don't kill the app server itself.

Regarding the implementation. I updated AppServer.stop() to follow the
way how TarantoolServer.stop() works: verbose logging, use SIGKILL after
a timeout, wait for a process termination. And reused this code in the
finalization of an app test.

Introduced the AppTest.teardown() method. The idea is to have a method,
where all finalization after AppTest.execute() occurs. Maybe we'll
refactor process management in future and presence an explicit
finalization method will make things more clear.

I think, it is good point to close #65 and #157: most of cases, where
tarantool instances are not terminated or killed, are fixed. There is
kill_old_server(), which only sends SIGTERM, but we should not find
alive instances here after the 'Wait until residual servers will be
stopped' commit. There is #256, but it unlikely will hit us on real
tests. If there will be other cases of this kind, we should handle them
separately: file issue with a reproducer, investigate and fix them. I
see no reason to track 'kill hung server' as the general task anymore.

Fixes #65
Fixes #157
---
 lib/app_server.py | 76 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 10 deletions(-)

diff --git a/lib/app_server.py b/lib/app_server.py
index bb248d63..00a82b8f 100644
--- a/lib/app_server.py
+++ b/lib/app_server.py
@@ -9,6 +9,7 @@
 
 from lib.colorer import color_stdout
 from lib.colorer import color_log
+from lib.colorer import qa_notice
 from lib.options import Options
 from lib.preprocessor import TestState
 from lib.server import Server
@@ -18,6 +19,7 @@
 from lib.tarantool_server import TarantoolStartError
 from lib.utils import find_port
 from lib.utils import format_process
+from lib.utils import signame
 from lib.utils import warn_unix_socket
 from test import TestRunGreenlet, TestExecutionError
 from threading import Timer
@@ -72,15 +74,25 @@ def execute(self, server):
             # A non-default server failed to start.
             raise TestExecutionError
         finally:
-            # Stop any servers created by the test, except the
-            # default one.
-            #
-            # See a comment in LuaTest.execute() for motivation of
-            # SIGKILL usage.
-            ts.stop_nondefault(signal=signal.SIGKILL)
+            self.teardown(server, ts)
         if retval.get('returncode', None) != 0:
             raise TestExecutionError
 
+    def teardown(self, server, ts):
+        # Stop any servers created by the test, except the
+        # default one.
+        #
+        # See a comment in LuaTest.execute() for motivation of
+        # SIGKILL usage.
+        ts.stop_nondefault(signal=signal.SIGKILL)
+
+        # When a supplementary (non-default) server fails, we
+        # should not leave the process that executes an app test.
+        # Let's kill it.
+        #
+        # Reuse AppServer.stop() code for convenience.
+        server.stop(signal=signal.SIGKILL)
+
 
 class AppServer(Server):
     """A dummy server implementation for application server tests"""
@@ -153,16 +165,60 @@ def deploy(self, vardir=None, silent=True, need_init=True):
         # cannot check length of path of *.control unix socket created by it.
         # So for 'app' tests type we don't check *.control unix sockets paths.
 
-    def stop(self, silent):
+    def stop(self, silent=True, signal=signal.SIGTERM):
+        # FIXME: Extract common parts of AppServer.stop() and
+        # TarantoolServer.stop() to an utility function.
+
+        color_log('DEBUG: [app server] Stopping the server...\n',
+                  schema='info')
+
         if not self.process:
+            color_log(' | Nothing to do: the process does not exist\n',
+                      schema='info')
             return
-        color_log('AppServer.stop(): stopping the %s\n' %
-                  format_process(self.process.pid), schema='test_var')
+
+        if self.process.returncode:
+            if self.process.returncode < 0:
+                signaled_by = -self.process.returncode
+                color_log(' | Nothing to do: the process was terminated by '
+                          'signal {} ({})\n'.format(signaled_by,
+                                                    signame(signaled_by)),
+                          schema='info')
+            else:
+                color_log(' | Nothing to do: the process was exited with code '
+                          '{}\n'.format(self.process.returncode),
+                          schema='info')
+            return
+
+        color_log(' | Sending signal {0} ({1}) to {2}\n'.format(
+                  signal, signame(signal),
+                  format_process(self.process.pid)))
         try:
-            self.process.terminate()
+            self.process.send_signal(signal)
         except OSError:
             pass
 
+        # Waiting for stopping the server. If the timeout
+        # reached, send SIGKILL.
+        timeout = 5
+
+        def kill():
+            qa_notice('The app server does not stop during {} '
+                      'seconds after the {} ({}) signal.\n'
+                      'Info: {}\n'
+                      'Sending SIGKILL...'.format(
+                          timeout, signal, signame(signal),
+                          format_process(self.process.pid)))
+            try:
+                self.process.kill()
+            except OSError:
+                pass
+
+        timer = Timer(timeout, kill)
+        timer.start()
+        self.process.wait()
+        timer.cancel()
+
     @classmethod
     def find_exe(cls, builddir):
         cls.builddir = builddir