From 9ca94a4b555e9fb59959a7a139f98a0d1b0270b8 Mon Sep 17 00:00:00 2001 From: Alexander Turenko Date: Wed, 16 Dec 2020 01:33:20 +0300 Subject: [PATCH] Kill app server after fail of a non-default server See a test case in #252. The idea is the following: the default server (it is the app server) executes a script, which starts a non-default server and hangs. The non-default instance fails after some time (before --test-timeout), test-run's crash detector observed this and stops the test (kills the greenlet). We kill all other non-default servers in the case, but prior to this commit, we don't kill the app server itself. Regarding the implementation. I updated AppServer.stop() to follow the way how TarantoolServer.stop() works: verbose logging, use SIGKILL after a timeout, wait for a process termination. And reused this code in the finalization of an app test. Introduced the AppTest.teardown() method. The idea is to have a method, where all finalization after AppTest.execute() occurs. Maybe we'll refactor process management in future and presence an explicit finalization method will make things more clear. I think, it is good point to close #65 and #157: most of cases, where tarantool instances are not terminated or killed, are fixed. There is kill_old_server(), which only sends SIGTERM, but we should not find alive instances here after the 'Wait until residual servers will be stopped' commit. There is #256, but it unlikely will hit us on real tests. If there will be other cases of this kind, we should handle them separately: file issue with a reproducer, investigate and fix them. I see no reason to track 'kill hung server' as the general task anymore. Fixes #65 Fixes #157 --- lib/app_server.py | 76 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/lib/app_server.py b/lib/app_server.py index bb248d63..00a82b8f 100644 --- a/lib/app_server.py +++ b/lib/app_server.py @@ -9,6 +9,7 @@ from lib.colorer import color_stdout from lib.colorer import color_log +from lib.colorer import qa_notice from lib.options import Options from lib.preprocessor import TestState from lib.server import Server @@ -18,6 +19,7 @@ from lib.tarantool_server import TarantoolStartError from lib.utils import find_port from lib.utils import format_process +from lib.utils import signame from lib.utils import warn_unix_socket from test import TestRunGreenlet, TestExecutionError from threading import Timer @@ -72,15 +74,25 @@ def execute(self, server): # A non-default server failed to start. raise TestExecutionError finally: - # Stop any servers created by the test, except the - # default one. - # - # See a comment in LuaTest.execute() for motivation of - # SIGKILL usage. - ts.stop_nondefault(signal=signal.SIGKILL) + self.teardown(server, ts) if retval.get('returncode', None) != 0: raise TestExecutionError + def teardown(self, server, ts): + # Stop any servers created by the test, except the + # default one. + # + # See a comment in LuaTest.execute() for motivation of + # SIGKILL usage. + ts.stop_nondefault(signal=signal.SIGKILL) + + # When a supplementary (non-default) server fails, we + # should not leave the process that executes an app test. + # Let's kill it. + # + # Reuse AppServer.stop() code for convenience. + server.stop(signal=signal.SIGKILL) + class AppServer(Server): """A dummy server implementation for application server tests""" @@ -153,16 +165,60 @@ def deploy(self, vardir=None, silent=True, need_init=True): # cannot check length of path of *.control unix socket created by it. # So for 'app' tests type we don't check *.control unix sockets paths. - def stop(self, silent): + def stop(self, silent=True, signal=signal.SIGTERM): + # FIXME: Extract common parts of AppServer.stop() and + # TarantoolServer.stop() to an utility function. + + color_log('DEBUG: [app server] Stopping the server...\n', + schema='info') + if not self.process: + color_log(' | Nothing to do: the process does not exist\n', + schema='info') return - color_log('AppServer.stop(): stopping the %s\n' % - format_process(self.process.pid), schema='test_var') + + if self.process.returncode: + if self.process.returncode < 0: + signaled_by = -self.process.returncode + color_log(' | Nothing to do: the process was terminated by ' + 'signal {} ({})\n'.format(signaled_by, + signame(signaled_by)), + schema='info') + else: + color_log(' | Nothing to do: the process was exited with code ' + '{}\n'.format(self.process.returncode), + schema='info') + return + + color_log(' | Sending signal {0} ({1}) to {2}\n'.format( + signal, signame(signal), + format_process(self.process.pid))) try: - self.process.terminate() + self.process.send_signal(signal) except OSError: pass + # Waiting for stopping the server. If the timeout + # reached, send SIGKILL. + timeout = 5 + + def kill(): + qa_notice('The app server does not stop during {} ' + 'seconds after the {} ({}) signal.\n' + 'Info: {}\n' + 'Sending SIGKILL...'.format( + timeout, signal, signame(signal), + format_process(self.process.pid))) + try: + self.process.kill() + except OSError: + pass + + timer = Timer(timeout, kill) + timer.start() + self.process.wait() + timer.cancel() + @classmethod def find_exe(cls, builddir): cls.builddir = builddir