-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
vdk-jobs-troubleshooting: Add thread-dump utility
This change adds the implementation of a thread-dump utility to the vdk-jobs-troubleshooting plugin. The utility uses an HTTP server, through which an administrator is able to force a stack trace dump of all threads used by the Python process of the data job. The server is bound to a port on localhost, so to get the stack trace, one needs to be attached to the data job pod. Testing Done: Added unit tests for the utility registry, and tested the plugin itself locally by running a simple data job and examining the execution logs. Signed-off-by: Andon Andonov <[email protected]>
- Loading branch information
Showing
6 changed files
with
235 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
30 changes: 30 additions & 0 deletions
30
...bleshooting/src/vdk/plugin/jobs_troubleshoot/troubleshoot_utilities/healthcheck_server.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Copyright 2021 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from http.server import HTTPServer | ||
from http.server import SimpleHTTPRequestHandler | ||
from threading import Thread | ||
from typing import Any | ||
|
||
|
||
class HealthCheckServer:
    """
    Minimal HTTP server whose request loop runs in a background thread.

    The listening socket is created in the constructor; requests are only
    served between start() and stop(). Instances can be used as context
    managers, in which case start()/stop() are called on enter/exit.
    """

    def __init__(self, port: int, handler: Any = None):
        """
        :param port: TCP port to listen on.
        :param handler: Request handler class passed to HTTPServer. When
            falsy, SimpleHTTPRequestHandler is used.
        """
        # NOTE: an empty host string makes the socket listen on all
        # interfaces, not only on localhost.
        self._server = HTTPServer(("", port), handler or SimpleHTTPRequestHandler)
        self._thread = Thread(target=self._server.serve_forever)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, typ, value, traceback):
        self.stop()

    def start(self):
        """Start serving requests in the background thread."""
        self._thread.start()

    def stop(self):
        """
        Stop serving requests and release the listening socket.

        Safe to call even if start() was never called.
        """
        # shutdown() waits for serve_forever() to exit and therefore blocks
        # forever if the loop is not running; join() raises on a thread that
        # was never started. Only do both when the thread is actually alive.
        if self._thread.is_alive():
            self._server.shutdown()
            self._thread.join()
        self._server.server_close()
63 changes: 63 additions & 0 deletions
63
...bs-troubleshooting/src/vdk/plugin/jobs_troubleshoot/troubleshoot_utilities/thread_dump.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Copyright 2021 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
import json | ||
import logging | ||
import sys | ||
import threading | ||
import traceback | ||
from http.server import BaseHTTPRequestHandler | ||
|
||
from vdk.internal.core.config import Configuration | ||
from vdk.plugin.jobs_troubleshoot.api.troubleshoot_utility import ITroubleshootUtility | ||
from vdk.plugin.jobs_troubleshoot.troubleshoot_configuration import ( | ||
TROUBLESHOOT_PORT_TO_USE, | ||
) | ||
from vdk.plugin.jobs_troubleshoot.troubleshoot_utilities.healthcheck_server import ( | ||
HealthCheckServer, | ||
) | ||
|
||
log = logging.getLogger(__name__)


class ThreadDumpHandler(BaseHTTPRequestHandler):
    """
    HTTP request handler which, on GET /threads, writes a dump of all threads
    of the current process to the log.
    """

    def do_GET(self):
        """
        Reply 200 and log a thread dump for the /threads path; reply 404 for
        any other path.
        """
        if self.path != "/threads":
            self.send_error(404)
            return

        body = b"making a thread dump"
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.send_header("Content-Length", str(len(body)))
        # send_response() only buffers the status line and headers; they are
        # written to the client when end_headers() is called.
        self.end_headers()
        self.wfile.write(body)
        log.info("Dumping threads")
        self._log_thread_dump()

    @staticmethod
    def _log_thread_dump():
        """
        Log name/alive/daemon for every thread, followed by the current stack
        of every thread. Any failure is logged and not propagated.
        """
        try:
            for t in threading.enumerate():
                log.info(f"Thread:{t.name} alive:{t.is_alive()} daemon:{t.daemon}")
            log.info("--------------------------------------------------------------")
            log.info("Dumping threads stacks:")
            code = []
            for thread_id, stack in sys._current_frames().items():
                code.append(f"\n# ThreadID: {thread_id}")
                for filename, lineno, name, line in traceback.extract_stack(stack):
                    code.append(f'File: "{filename}", line {lineno}, in {name}')
                    if line:
                        code.append(f" {line.strip()}")
            log.info(f"Threads stacks:{json.dumps(code)}")
            log.info("--------------------------------------------------------------")
        except Exception:
            # Best effort: a failed dump must not break the request handling.
            log.exception("Thread dump failed")
|
||
|
||
class ThreadDumpUtility(ITroubleshootUtility):
    """
    Troubleshooting utility that serves ThreadDumpHandler over HTTP on the
    port configured through TROUBLESHOOT_PORT_TO_USE.
    """

    def __init__(self, job_configuration: Configuration):
        """
        :param job_configuration: The data job configuration the port is read
            from.
        """
        self.port_to_use = job_configuration.get_value(TROUBLESHOOT_PORT_TO_USE)
        # The server is created in start() rather than here: constructing
        # HealthCheckServer creates the HTTPServer and thereby binds the
        # port, which must not happen for a utility that is never started.
        self.server = None

    def start(self):
        """Create the HTTP server if needed and start serving requests."""
        if self.server is None:
            self.server = HealthCheckServer(self.port_to_use, ThreadDumpHandler)
        self.server.start()

    def stop(self):
        """Stop the HTTP server; does nothing if it was never started."""
        if self.server is not None:
            self.server.stop()
            self.server = None
65 changes: 65 additions & 0 deletions
65
...bleshooting/src/vdk/plugin/jobs_troubleshoot/troubleshoot_utilities/utilities_registry.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Copyright 2021 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
import logging | ||
from typing import Any | ||
from typing import Dict | ||
from typing import List | ||
from typing import Optional | ||
|
||
from vdk.internal.core.config import Configuration | ||
from vdk.plugin.jobs_troubleshoot.api.troubleshoot_utility import ITroubleshootUtility | ||
from vdk.plugin.jobs_troubleshoot.troubleshoot_configuration import ( | ||
TROUBLESHOOT_UTILITIES_TO_USE, | ||
) | ||
from vdk.plugin.jobs_troubleshoot.troubleshoot_utilities.thread_dump import ( | ||
ThreadDumpUtility, | ||
) | ||
|
||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
def utilities_registry(job_config: Configuration) -> Dict[str, Any]:
    """
    Build the registry of troubleshooting utilities. Every available utility
    object is initialized here.
    TODO: Come up with a more elegant approach to register utilities.
    :param job_config: The data job configuration.
    :return: A dictionary mapping utility names to utility objects.
    """
    thread_dump = ThreadDumpUtility(job_configuration=job_config)
    return {"thread-dump": thread_dump}
|
||
|
||
def get_utilities_to_use(
    job_config: Configuration,
) -> Optional[List[ITroubleshootUtility]]:
    """
    Get a list of the initialized troubleshooting utilities that are specified
    by the VDK_TROUBLESHOOT_UTILITIES_TO_USE configuration variable.
    The value is a comma-separated list of utility names; whitespace around
    names and empty entries are ignored. Unknown names are logged and skipped.
    :param job_config: Data Job configuration
    :return: A list of utility objects that are to be used. The list is empty
        when nothing is selected.
    """
    utilities: List[ITroubleshootUtility] = []
    selected_utilities = job_config.get_value(TROUBLESHOOT_UTILITIES_TO_USE)
    if not selected_utilities:
        # Nothing selected (unset or empty value): no need to build the
        # registry at all.
        return utilities

    registered_utilities: Dict = utilities_registry(job_config=job_config)

    for raw_name in selected_utilities.split(","):
        util = raw_name.strip()
        if not util:
            # Tolerate stray commas such as "a,,b" or a trailing ",".
            continue
        registered_util = registered_utilities.get(util)
        if registered_util:
            utilities.append(registered_util)
        else:
            log.info(
                f"""
Utility {util} is not in the list of available troubleshooting
utilities.
Available utilities: {registered_utilities.keys()}
"""
            )

    return utilities
6 changes: 0 additions & 6 deletions
6
projects/vdk-plugins/vdk-jobs-troubleshooting/tests/test_jobs_troubleshoot.py
This file was deleted.
Oops, something went wrong.
39 changes: 39 additions & 0 deletions
39
projects/vdk-plugins/vdk-jobs-troubleshooting/tests/test_utilities_registry.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# Copyright 2021 VMware, Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from typing import List | ||
|
||
from vdk.internal.core.config import ConfigurationBuilder | ||
from vdk.plugin.jobs_troubleshoot.troubleshoot_configuration import ( | ||
TROUBLESHOOT_PORT_TO_USE, | ||
) | ||
from vdk.plugin.jobs_troubleshoot.troubleshoot_configuration import ( | ||
TROUBLESHOOT_UTILITIES_TO_USE, | ||
) | ||
from vdk.plugin.jobs_troubleshoot.troubleshoot_utilities.utilities_registry import ( | ||
get_utilities_to_use, | ||
) | ||
|
||
|
||
def test_get_utilities_to_use():
    """A registered utility name selects exactly one utility object."""
    builder = ConfigurationBuilder()
    settings = (
        (TROUBLESHOOT_UTILITIES_TO_USE, "thread-dump"),
        (TROUBLESHOOT_PORT_TO_USE, 8783),
    )
    for option, option_value in settings:
        builder.set_value(key=option, value=option_value)

    selected = get_utilities_to_use(builder.build())

    assert isinstance(selected, List)
    assert len(selected) == 1
|
||
|
||
def test_get_utilities_to_use__nonexistent_utility():
    """An unknown utility name yields an empty selection."""
    builder = ConfigurationBuilder()
    settings = (
        (TROUBLESHOOT_UTILITIES_TO_USE, "non-existent-utility"),
        (TROUBLESHOOT_PORT_TO_USE, 8783),
    )
    for option, option_value in settings:
        builder.set_value(key=option, value=option_value)

    selected = get_utilities_to_use(builder.build())

    assert len(selected) == 0