diff --git a/.gitignore b/.gitignore index a3a658641..3c79ac98c 100755 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ src/WebUI/dotnet/WebPortal/hosting.json **/storage.ide src/RepairManager/.vs/* +src/RepairManager/mock-data/* /.vs/slnx.sqlite /.vs/ProjectSettings.json /.vs/VSWorkspaceState.json diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index 88cb8f112..16c466c7b 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -1,6 +1,7 @@ from Rules.rules_abc import Rule from kubernetes import client, config from utils import k8s_util, email +from tabulate import tabulate import requests import json import os @@ -27,23 +28,42 @@ def get_node_address_info(node_info): address_map[internal_ip] = hostname - logging.debug('node address map: %s ' % address_map) + logging.debug(f'node address map: {address_map}') return address_map def get_ECC_error_data(ecc_url): + try: + response = requests.get(ecc_url) + if response: + data = json.loads(response.text) + + if data: + ecc_metrics = data['data']['result'] + logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics)) + return ecc_metrics + else: + logging.warning(f'No response from {ecc_url} found.') - response = requests.get(ecc_url) - data = json.loads(response.text) - - if data: - ecc_metrics = data['data']['result'] - logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics)) + except: + logging.exception(f'Error retrieving data from {ecc_url}') - return ecc_metrics +def get_job_info_from_nodes(nodes): + pods = k8s_util.list_pod_for_all_namespaces() + jobs = {} + for pod in pods.items: + if pod.metadata and pod.metadata.labels: + if 'jobId' in pod.metadata.labels and 'userName' in pod.metadata.labels: + if pod.spec.node_name in nodes: + jobs[pod.metadata.labels['jobId']] = { + 'userName': pod.metadata.labels['userName'], + 'nodeName': pod.spec.node_name, + 'vcName': pod.metadata.labels['vcName'] + } + return jobs class ECCRule(Rule): @@ -79,25 +99,25 @@ def check_status(self): return False def take_action(self): - status = {} - + status = [] for node_name in self.ecc_hostnames: - - if not k8s_util.is_node_unschedulable(self.node_info, node_name): - success = k8s_util.cordon_node(node_name) - - if success != 0: - logging.warning(f'Unscheduling of node {node_name} not successful') - status[node_name] = 'Failed to mark as unschedulable' - else: - status[node_name] = 'Successfully marked as unschedulable' - - else: - status[node_name] = 'Previously marked as unschedulable' - - body = 'Uncorrectable ECC Error found on the following nodes:\n' - for node_name in status: - body += f'{node_name} -> \t{status[node_name]}\n' - - subject = 'Repair Manager Alert [ECC ERROR]' - self.alert.handle_email_alert(subject, body) \ No newline at end of file + output = k8s_util.cordon_node(node_name, dry_run=True) + status.append([node_name, output]) + + subject = f'Repair Manager Alert [ECC ERROR] [{self.config["cluster_name"]}]' + body = f'

Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:

' + body += tabulate(status, headers=['node name', 'action status'], tablefmt="html").replace('','
') + + body += f'

Impacted Jobs and Job Owners

' + job_owners = [] + job_info = [] + jobs = get_job_info_from_nodes(self.ecc_hostnames) + for jobId in jobs: + job_owners.append(jobs[jobId]['userName'] + '@microsoft.com') + job_info.append([jobId, jobs[jobId]['userName'], jobs[jobId]['nodeName'], jobs[jobId]['vcName']]) + body += tabulate(job_info, headers=['job id', 'job owner', 'node name', 'vc name' ], tablefmt="html").replace('
','
') + + if self.config['rules']['ecc_rule']['alert_job_owners']: + self.alert.handle_email_alert(subject, body, additional_recipients=job_owners) + else: + self.alert.handle_email_alert(subject, body) \ No newline at end of file diff --git a/src/RepairManager/config/email-config.yaml b/src/RepairManager/config/email-config.yaml index 4a75bfba6..317cdd004 100644 --- a/src/RepairManager/config/email-config.yaml +++ b/src/RepairManager/config/email-config.yaml @@ -4,5 +4,4 @@ password: {{cnf['repair-manager']['alert']['password']}} sender: {{cnf['repair-manager']['alert']['sender']}} receiver: {{cnf['repair-manager']['alert']['receiver']}} -# time to wait before resending email alert for exceptions (seconds) -alert_wait_seconds: {{cnf['repair-manager']['alert']['alert_wait_seconds']}} \ No newline at end of file +alert_wait_seconds: 86400 # time to wait before resending email alert for exceptions (seconds) \ No newline at end of file diff --git a/src/RepairManager/config/rule-config.yaml b/src/RepairManager/config/rule-config.yaml index 0912010a4..de7103df0 100644 --- a/src/RepairManager/config/rule-config.yaml +++ b/src/RepairManager/config/rule-config.yaml @@ -4,6 +4,10 @@ rules: module_name : Rules.ecc_rule class_name : ECCRule ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' + alert_job_owners: False # alert all impacted job owners # time to sleep between rule execution (seconds) rule_wait_time: 10 + +cluster_name: {{cnf['repair-manager']['cluster_name']}} + diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py index 12ca560e8..46feb0c99 100644 --- a/src/RepairManager/main.py +++ b/src/RepairManager/main.py @@ -55,13 +55,13 @@ def Run(): except Exception as e: logger.exception(f'Error executing {class_name} from module {module_name}\n') - subject = f'Repair Manager Alert [Exception executing {class_name}]' + subject = f'Repair Manager Alert [Rule Error] [{class_name}] [{config["cluster_name"]}]' body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) alert.handle_email_alert(subject, body) except Exception as e: logger.exception('Repair manager has stopped due to an unhandled exception:') - subject = '[Repair Manager Alert] Repair manager has stopped unexpectedly' + subject = f'Repair Manager Alert [Repair Manager Crashed] [{config["cluster_name"]}]' body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) alert.send(subject, body) diff --git a/src/RepairManager/utils/email.py b/src/RepairManager/utils/email.py index 59deece01..0cc1d6ed6 100644 --- a/src/RepairManager/utils/email.py +++ b/src/RepairManager/utils/email.py @@ -14,15 +14,25 @@ def load_config(self): with open('./config/email-config.yaml', 'r') as file: return yaml.safe_load(file) - def send(self, subject, body): - message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (self.config['sender'], self.config['receiver'], subject, body) + def send(self, subject, body, additional_recipients=None): + recepients = self.config['receiver'] + if additional_recipients is not None: + recepients = recepients + additional_recipients + + message = ( + f"From: {self.config['sender']}\r\n" + f"To: {';'.join(recepients)}\r\n" + f"MIME-Version: 1.0\r\n" + f"Content-type: text/html\r\n" + f"Subject: {subject}\r\n\r\n{body}" + ) try: with smtplib.SMTP(self.config['smtp_url']) as server: server.starttls() server.login(self.config['login'], self.config['password']) - server.sendmail(self.config['sender'], self.config['receiver'], message) - logging.info('Email Sent') + server.sendmail(self.config['sender'], recepients, message) + logging.info(f"Email sent to {', '.join(recepients)}") except smtplib.SMTPAuthenticationError: logging.warning('The server didn\'t accept the user\\password combination.') except smtplib.SMTPServerDisconnected: @@ -30,9 +40,8 @@ def send(self, subject, body): except smtplib.SMTPException as e: logging.exception('SMTP error occurred: ' + str(e)) - def handle_email_alert(self, subject, body): - + def handle_email_alert(self, subject, body, additional_recipients=None): # to avoid email spam, send email based on configured alert wait time if body not in self.alert_cache: - self.send(subject, body) + self.send(subject, body, additional_recipients) self.alert_cache[body] = 1 \ No newline at end of file diff --git a/src/RepairManager/utils/k8s_util.py b/src/RepairManager/utils/k8s_util.py index 4c83e1658..9c796d394 100644 --- a/src/RepairManager/utils/k8s_util.py +++ b/src/RepairManager/utils/k8s_util.py @@ -1,22 +1,37 @@ -import os -import logging -from kubernetes import client, config - -def cordon_node(node_name): - output = os.system('kubectl cordon %s' % node_name) - return output - - -def is_node_unschedulable(node_info, node_name): - for node in node_info.items: - for address in node.status.addresses: - if address.type == 'Hostname' and address.address == node_name: - return node.spec.unschedulable - - logging.warning(f"Could not find node with hostname {node_name}") - - -def list_node(): - config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml') - api_instance = client.CoreV1Api() - return api_instance.list_node() \ No newline at end of file +import subprocess +import logging +from kubernetes import client, config + +def cordon_node(node_name, dry_run=True): + args = ['kubectl', 'cordon', node_name] + + if dry_run: + args.append('--dry-run') + + try: + output = subprocess.check_output(args, stderr=subprocess.STDOUT) + logging.info(output.decode()) + return output.decode() + except subprocess.CalledProcessError as e: + logging.exception(f'Exception attempting to cordon node {node_name}') + return e.output.decode() + + +def is_node_unschedulable(node_info, node_name): + for node in node_info.items: + for address in node.status.addresses: + if address.type == 'Hostname' and address.address == node_name: + return node.spec.unschedulable + + logging.warning(f"Could not find node with hostname {node_name}") + + +def list_node(): + config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml') + api_instance = client.CoreV1Api() + return api_instance.list_node() + +def list_pod_for_all_namespaces(): + config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml',) + api_instance = client.CoreV1Api() + return api_instance.list_pod_for_all_namespaces() \ No newline at end of file diff --git a/src/docker-images/RepairManager/Dockerfile b/src/docker-images/RepairManager/Dockerfile index fcf6045fa..09d68d68a 100644 --- a/src/docker-images/RepairManager/Dockerfile +++ b/src/docker-images/RepairManager/Dockerfile @@ -9,6 +9,7 @@ RUN pip3 install wheel RUN pip3 install setuptools RUN pip3 install requests RUN pip3 install cachetools +RUN pip3 install tabulate COPY kubectl /usr/local/bin/kubectl RUN chmod +x /usr/local/bin/kubectl diff --git a/src/docker-images/RepairManager/prebuild.sh b/src/docker-images/RepairManager/prebuild.sh index b449bcf1d..3dd1aafd1 100644 --- a/src/docker-images/RepairManager/prebuild.sh +++ b/src/docker-images/RepairManager/prebuild.sh @@ -8,6 +8,7 @@ cp -r ../../../../RepairManager RepairManager # Render config file for email credentials cd ../../../ ./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml +./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml cd ./deploy/docker-images/RepairManager/ cp -r ../../../../ClusterBootstrap/deploy/bin/kubectl kubectl