From ecf149ae0af8cb775273f080852e0c5216ef7234 Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Thu, 21 Nov 2019 01:31:31 +0000 Subject: [PATCH 01/12] add repairmanager to params.py --- src/ClusterBootstrap/params.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 src/ClusterBootstrap/params.py diff --git a/src/ClusterBootstrap/params.py b/src/ClusterBootstrap/params.py old mode 100644 new mode 100755 From 82fadbc52b00c37e6ea31b126667ca32624e49dc Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Mon, 25 Nov 2019 20:19:46 +0000 Subject: [PATCH 02/12] Refactoring + Email Alerts --- src/RepairManager/Rules/ecc_rule.py | 83 +++++++++++++----------- src/RepairManager/k8s_util.py | 15 +++++ src/RepairManager/main.py | 98 +++++++++++++++++++---------- src/RepairManager/rule-config.yaml | 11 +++- src/RepairManager/util.py | 20 ++++-- 5 files changed, 154 insertions(+), 73 deletions(-) create mode 100644 src/RepairManager/k8s_util.py diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index da6b4fc14..62379df84 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -6,24 +6,27 @@ import time import yaml import util +import k8s_util import logging -def get_node_address_info(): +def list_node(): config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml') api_instance = client.CoreV1Api() - service_account_list = api_instance.list_node() + return api_instance.list_node() + +def get_node_address_info(node_info): # map InternalIP to Hostname address_map = {} - if (service_account_list): + if node_info: - for account in service_account_list.items: + for node in node_info.items: internal_ip = None hostname = None - for address in account.status.addresses: + for address in node.status.addresses: if address.type == 'InternalIP': internal_ip = address.address @@ -55,43 +58,51 @@ class ECCRule(Rule): def __init__(self): self.ecc_hostnames = [] + self.config = self.load_config() + self.node_info = {} + + def load_config(self): + with open('rule-config.yaml', 'r') as rule_file: + return yaml.safe_load(rule_file) def check_status(self): - try: - with open('rule-config.yaml', 'r') as rule_config: - config = yaml.safe_load(rule_config) - - address_map = get_node_address_info() - - ecc_url = os.environ['PROMETHEUS_HOST'] + config['rules']['ecc_rule']['ecc_error_url'] - ecc_metrics = get_ECC_error_data(ecc_url) - - if (ecc_metrics): - for m in ecc_metrics: - offending_node_ip = m['metric']['instance'].split(':')[0] - ecc_hostnames.append(address_map[offending_node_ip]) + # save node_info to reduce the number of API calls + self.node_info = list_node() + address_map = get_node_address_info(self.node_info) - logging.info('Uncorrectable ECC metrics found: ' + ecc_hostnames) - return True - - else: - logging.debug('No uncorrectable ECC metrics found.') - return False + ecc_url = os.environ['PROMETHEUS_HOST'] + self.config['rules']['ecc_rule']['ecc_error_url'] + ecc_metrics = get_ECC_error_data(ecc_url) + if ecc_metrics: + for m in ecc_metrics: + offending_node_ip = m['metric']['instance'].split(':')[0] + self.ecc_hostnames.append(address_map[offending_node_ip]) + logging.info(f'Uncorrectable ECC metrics found: {self.ecc_hostnames}') + return True + + else: + logging.debug('No uncorrectable ECC metrics found.') + return False - except Exception as e: - logging.exception('Error checking status for ECCRule') - #TODO: send email alert, raise exception? - def take_action(self): - try: - for node in ecc_hostnames: - success = util.cordon_node(node) + body = 'ECC Error found on the following nodes:\n' + all_nodes_already_unscheduled = True + + for node_name in self.ecc_hostnames: + + if not k8s_util.is_node_unschedulable(self.node_info, node_name): + all_nodes_already_unscheduled = False + success = k8s_util.cordon_node(node_name) + + if success != 0: + logging.warning(f'Unscheduling of node {node_name} not successful') + body += f'{node_name}: Failed to mark as unschedulable\n' + else: + body += f'{node_name}: Successfully marked as unschedulable\n' - if (success != 0): - logging.warning('Unscheduling of node ' + node + ' not successful') + if not all_nodes_already_unscheduled: + alert_info = self.config['email_alerts'] + subject = 'Repair Manager Alert [ECC ERROR]' + util.smtp_send_email(**alert_info, subject=subject, body=body) - except Exception as e: - logging.exception('Error taking action for ECCRule') - #TODO: send email alert, rasie exception? diff --git a/src/RepairManager/k8s_util.py b/src/RepairManager/k8s_util.py new file mode 100644 index 000000000..f4d6495af --- /dev/null +++ b/src/RepairManager/k8s_util.py @@ -0,0 +1,15 @@ +import os +import logging +from kubernetes import client, config + +def cordon_node(node_name): + output = os.system('kubectl cordon %s' % node_name) + return output + +def is_node_unschedulable(node_info, node_name): + for node in node_info.items: + for address in node.status.addresses: + if address.type == 'Hostname' and address.address == node_name: + return node.spec.unschedulable + + logging.warning(f"Could not find node with hostname {node_name}") \ No newline at end of file diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py index e8e758c58..7435f52ae 100644 --- a/src/RepairManager/main.py +++ b/src/RepairManager/main.py @@ -4,53 +4,87 @@ import logging import logging.config import importlib +import datetime +import util +import traceback import Rules - with open('logging.yaml', 'r') as log_file: log_config = yaml.safe_load(log_file) logging.config.dictConfig(log_config) logger = logging.getLogger(__name__) -logger.debug('Repair manager controller has started') -try: - while True: - try: - #reload module - importlib.reload(Rules) +def handle_email_alert(config, monitor_alerts, class_name, e): + email_params = config['email_alerts'] + traceback_str = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) + subject = f'Repair Manager Alert [Exception executing {class_name}]' + + # to avoid email clutter, send email based on configured alert wait time + if class_name in monitor_alerts: + time_now = datetime.datetime.now() + time_start = monitor_alerts[class_name] + time_delta = datetime.timedelta(hours=config['alert_wait_time']) + + if time_now - time_start > time_delta: + util.smtp_send_email(**email_params, subject=subject, body=traceback_str) + monitor_alerts[class_name] = datetime.datetime.now() + + else: + monitor_alerts[class_name] = datetime.datetime.now() + util.smtp_send_email(**email_params, subject=subject, body=traceback_str) + + return monitor_alerts + +def refresh_rules(): + try: + importlib.reload(Rules) + + with open('rule-config.yaml', 'r') as config_file: + config = yaml.safe_load(config_file) + + return config + + except Exception as e: + logger.exception('Error loading modules/rule config') - # refresh config - with open('rule-config.yaml', 'r') as rule_file: - rule_config = yaml.safe_load(rule_file) +def Run(): + try: + monitor_alerts = {} + while True: + config = refresh_rules() - rules = rule_config['rules'] - - except Exception as e: - logger.exception('Error loading modules/rule config') + # execute all rules listed in config + rules = config['rules'] + for r_key in rules.keys(): + try: + # retrieve module and class for given rule + module_name = rules[r_key]['module_name'] + class_name = rules[r_key]['class_name'] + r_module = sys.modules[module_name] + r_class = getattr(r_module, class_name) + rule = r_class() - # execute all rules listed in config - for r_key in rules.keys(): - try: - module_name = rules[r_key]['module_name'] - class_name = rules[r_key]['class_name'] + logger.debug(f'Executing {class_name} from module {module_name}') - r_module = sys.modules[module_name] - r_class = getattr(r_module, class_name) - rule = r_class() + if rule.check_status(): + rule.take_action() - logger.debug('Executing ' + class_name + ' from module ' + module_name) - - if rule.check_status(): - rule.take_action() + time.sleep(config['rule_wait_time']) - time.sleep(rule_config['wait_time']) + except Exception as e: + logger.exception(f'Error executing {class_name} from module {module_name}\n') + monitor_alerts = handle_email_alert(config, monitor_alerts, class_name, e) - except Exception as e: - logger.exception('Error executing ' + class_name + ' from module ' + module_name) - #TODO: send email alert? + except Exception as e: + logger.exception('Repair manager has stopped due to an unhandled exception:') + email_params = config['email_alerts'] + traceback_str = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) + subject = '[Repair Manager Alert] Repair manager has stopped' + body = f'Repair manager has stopped unexpectedly due to an unhandled exception:\n{traceback_str}' + util.smtp_send_email(**email_params, subject=subject, body=body) -except KeyboardInterrupt: - pass +if __name__ == '__main__': + Run() \ No newline at end of file diff --git a/src/RepairManager/rule-config.yaml b/src/RepairManager/rule-config.yaml index 32a21045d..b704a6e86 100644 --- a/src/RepairManager/rule-config.yaml +++ b/src/RepairManager/rule-config.yaml @@ -6,5 +6,14 @@ rules: ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' # time to sleep between rule execution -wait_time: 10 +rule_wait_time: 10 +email_alerts: + smtp_url: REPLACE SMTP URL + login: REPLACE EMAIL LOGING + password: REPLACE EMAIL PASSWORD + sender: REPLACE EMAIL SENDER + receiver: REPLACE EMAIL RECEIVER + +# time to wait before resending email alert for exceptions (hours) +alert_wait_time: 2 diff --git a/src/RepairManager/util.py b/src/RepairManager/util.py index 8be734c31..23f883203 100644 --- a/src/RepairManager/util.py +++ b/src/RepairManager/util.py @@ -1,6 +1,18 @@ -import os +import smtplib +import logging -def cordon_node(node): - output = os.system('kubectl cordon %s --dry-run' % node) - return output +def smtp_send_email(smtp_url, login, password, sender, receiver, subject, body): + message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (sender, receiver, subject, body) + try: + with smtplib.SMTP(smtp_url) as server: + server.starttls() + server.login(login, password) + server.sendmail(sender, receiver, message) + logging.info('Email Sent') + except smtplib.SMTPAuthenticationError: + logging.warning('The server didn\'t accept the user\\password combination.') + except smtplib.SMTPServerDisconnected: + logging.warning('Server unexpectedly disconnected') + except smtplib.SMTPException as e: + logging.exception('SMTP error occurred: ' + str(e)) \ No newline at end of file From a9853169ddf9a61cdc21bcd41d26c889d2cea7ce Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Tue, 3 Dec 2019 21:06:48 +0000 Subject: [PATCH 03/12] repair manager: more refactoring --- src/RepairManager/Rules/ecc_rule.py | 31 ++++++------ src/RepairManager/config/email-config.yaml | 8 ++++ src/RepairManager/{ => config}/logging.yaml | 0 src/RepairManager/config/rule-config.yaml | 9 ++++ src/RepairManager/main.py | 41 ++++------------ src/RepairManager/mock-data/ecc.json | 38 +++++++++++++++ src/RepairManager/rule-config.yaml | 19 -------- src/RepairManager/util.py | 18 ------- src/RepairManager/utils/email.py | 52 +++++++++++++++++++++ src/RepairManager/{ => utils}/k8s_util.py | 9 +++- src/docker-images/RepairManager/prebuild.sh | 6 +++ 11 files changed, 145 insertions(+), 86 deletions(-) create mode 100644 src/RepairManager/config/email-config.yaml rename src/RepairManager/{ => config}/logging.yaml (100%) create mode 100644 src/RepairManager/config/rule-config.yaml create mode 100644 src/RepairManager/mock-data/ecc.json delete mode 100644 src/RepairManager/rule-config.yaml delete mode 100644 src/RepairManager/util.py create mode 100644 src/RepairManager/utils/email.py rename src/RepairManager/{ => utils}/k8s_util.py (72%) mode change 100755 => 100644 src/docker-images/RepairManager/prebuild.sh diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index 62379df84..73a7a69d2 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -1,20 +1,14 @@ from Rules.rules_abc import Rule from kubernetes import client, config +from utils import k8s_util, email import requests import json import os import time import yaml -import util -import k8s_util import logging -def list_node(): - config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml') - api_instance = client.CoreV1Api() - - return api_instance.list_node() - +alert = email.EmailHandler() def get_node_address_info(node_info): # map InternalIP to Hostname @@ -45,6 +39,10 @@ def get_ECC_error_data(ecc_url): response = requests.get(ecc_url) data = json.loads(response.text) + + # use mock data + #file_object = open('./mock-data/ecc.json', 'r') + #data = json.load(file_object) if data: ecc_metrics = data['data']['result'] @@ -57,17 +55,18 @@ def get_ECC_error_data(ecc_url): class ECCRule(Rule): def __init__(self): + self.config = self.load_rule_config() self.ecc_hostnames = [] - self.config = self.load_config() self.node_info = {} - def load_config(self): - with open('rule-config.yaml', 'r') as rule_file: - return yaml.safe_load(rule_file) + def load_rule_config(self): + with open('./config/rule-config.yaml', 'r') as file: + return yaml.safe_load(file) def check_status(self): # save node_info to reduce the number of API calls - self.node_info = list_node() + self.node_info = k8s_util.list_node() + address_map = get_node_address_info(self.node_info) ecc_url = os.environ['PROMETHEUS_HOST'] + self.config['rules']['ecc_rule']['ecc_error_url'] @@ -101,8 +100,6 @@ def take_action(self): else: body += f'{node_name}: Successfully marked as unschedulable\n' - if not all_nodes_already_unscheduled: - alert_info = self.config['email_alerts'] - subject = 'Repair Manager Alert [ECC ERROR]' - util.smtp_send_email(**alert_info, subject=subject, body=body) + subject = 'Repair Manager Alert [ECC ERROR]' + alert.handle_email_alert(subject, body) diff --git a/src/RepairManager/config/email-config.yaml b/src/RepairManager/config/email-config.yaml new file mode 100644 index 000000000..b2a0aec00 --- /dev/null +++ b/src/RepairManager/config/email-config.yaml @@ -0,0 +1,8 @@ +smtp_url: {{cnf['repair-manager']['alert']['smtp_url']}} +login: {{cnf['repair-manager']['alert']['login']}} +password: {{cnf['repair-manager']['alert']['password']}} +sender: {{cnf['repair-manager']['alert']['sender']}} +receiver: {{cnf['repair-manager']['alert']['receiver']}} + +# time to wait before resending email alert for exceptions (hours) +alert_wait_time: {{cnf['repair-manager']['alert']['alert_wait_time']}} \ No newline at end of file diff --git a/src/RepairManager/logging.yaml b/src/RepairManager/config/logging.yaml similarity index 100% rename from src/RepairManager/logging.yaml rename to src/RepairManager/config/logging.yaml diff --git a/src/RepairManager/config/rule-config.yaml b/src/RepairManager/config/rule-config.yaml new file mode 100644 index 000000000..e923307f0 --- /dev/null +++ b/src/RepairManager/config/rule-config.yaml @@ -0,0 +1,9 @@ +--- +rules: + ecc_rule: + module_name : Rules.ecc_rule + class_name : ECCRule + ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' + +# time to sleep between rule execution +rule_wait_time: 2 diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py index 7435f52ae..3e06fcc90 100644 --- a/src/RepairManager/main.py +++ b/src/RepairManager/main.py @@ -4,45 +4,25 @@ import logging import logging.config import importlib -import datetime -import util import traceback +from utils import email import Rules -with open('logging.yaml', 'r') as log_file: +with open('./config/logging.yaml', 'r') as log_file: log_config = yaml.safe_load(log_file) logging.config.dictConfig(log_config) logger = logging.getLogger(__name__) +alert = email.EmailHandler() -def handle_email_alert(config, monitor_alerts, class_name, e): - email_params = config['email_alerts'] - traceback_str = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) - subject = f'Repair Manager Alert [Exception executing {class_name}]' - - # to avoid email clutter, send email based on configured alert wait time - if class_name in monitor_alerts: - time_now = datetime.datetime.now() - time_start = monitor_alerts[class_name] - time_delta = datetime.timedelta(hours=config['alert_wait_time']) - - if time_now - time_start > time_delta: - util.smtp_send_email(**email_params, subject=subject, body=traceback_str) - monitor_alerts[class_name] = datetime.datetime.now() - - else: - monitor_alerts[class_name] = datetime.datetime.now() - util.smtp_send_email(**email_params, subject=subject, body=traceback_str) - - return monitor_alerts def refresh_rules(): try: importlib.reload(Rules) - with open('rule-config.yaml', 'r') as config_file: + with open('./config/rule-config.yaml', 'r') as config_file: config = yaml.safe_load(config_file) return config @@ -52,7 +32,6 @@ def refresh_rules(): def Run(): try: - monitor_alerts = {} while True: config = refresh_rules() @@ -76,15 +55,15 @@ def Run(): except Exception as e: logger.exception(f'Error executing {class_name} from module {module_name}\n') - monitor_alerts = handle_email_alert(config, monitor_alerts, class_name, e) + subject = f'Repair Manager Alert [Exception executing {class_name}]' + body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) + alert.handle_email_alert(subject, body) except Exception as e: logger.exception('Repair manager has stopped due to an unhandled exception:') - email_params = config['email_alerts'] - traceback_str = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) - subject = '[Repair Manager Alert] Repair manager has stopped' - body = f'Repair manager has stopped unexpectedly due to an unhandled exception:\n{traceback_str}' - util.smtp_send_email(**email_params, subject=subject, body=body) + subject = '[Repair Manager Alert] Repair manager has stopped unexpectedly' + body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) + alert.send(subject, body) if __name__ == '__main__': Run() \ No newline at end of file diff --git a/src/RepairManager/mock-data/ecc.json b/src/RepairManager/mock-data/ecc.json new file mode 100644 index 000000000..9e8f42fbc --- /dev/null +++ b/src/RepairManager/mock-data/ecc.json @@ -0,0 +1,38 @@ +{ + "status": "success", + "data": { + "resultType": "vector", + "result": [ + { + "metric": { + "minor_number": "0", + "exporter_name": "job-exporter", + "scraped_from": "job-exporter-abcde", + "instance": "192.168.0.5:9102", + "job": "serivce_exporter", + "__name__": "nvidiasmi_ecc_error_count", + "type": "volatile_double" + }, + "value": [ + 1572393294.659, + "2" + ] + }, + { + "metric": { + "minor_number": "1", + "exporter_name": "job-exporter", + "scraped_from": "job-exporter-fghij", + "instance": "192.168.0.4:9102", + "job": "serivce_exporter", + "__name__": "nvidiasmi_ecc_error_count", + "type": "volatile_double" + }, + "value": [ + 1572393294.659, + "1" + ] + } + ] + } +} diff --git a/src/RepairManager/rule-config.yaml b/src/RepairManager/rule-config.yaml deleted file mode 100644 index b704a6e86..000000000 --- a/src/RepairManager/rule-config.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -rules: - ecc_rule: - module_name : Rules.ecc_rule - class_name : ECCRule - ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' - -# time to sleep between rule execution -rule_wait_time: 10 - -email_alerts: - smtp_url: REPLACE SMTP URL - login: REPLACE EMAIL LOGING - password: REPLACE EMAIL PASSWORD - sender: REPLACE EMAIL SENDER - receiver: REPLACE EMAIL RECEIVER - -# time to wait before resending email alert for exceptions (hours) -alert_wait_time: 2 diff --git a/src/RepairManager/util.py b/src/RepairManager/util.py deleted file mode 100644 index 23f883203..000000000 --- a/src/RepairManager/util.py +++ /dev/null @@ -1,18 +0,0 @@ -import smtplib -import logging - -def smtp_send_email(smtp_url, login, password, sender, receiver, subject, body): - message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (sender, receiver, subject, body) - - try: - with smtplib.SMTP(smtp_url) as server: - server.starttls() - server.login(login, password) - server.sendmail(sender, receiver, message) - logging.info('Email Sent') - except smtplib.SMTPAuthenticationError: - logging.warning('The server didn\'t accept the user\\password combination.') - except smtplib.SMTPServerDisconnected: - logging.warning('Server unexpectedly disconnected') - except smtplib.SMTPException as e: - logging.exception('SMTP error occurred: ' + str(e)) \ No newline at end of file diff --git a/src/RepairManager/utils/email.py b/src/RepairManager/utils/email.py new file mode 100644 index 000000000..f6459e159 --- /dev/null +++ b/src/RepairManager/utils/email.py @@ -0,0 +1,52 @@ +import smtplib +import logging +import yaml +import datetime + +class EmailHandler(): + + def __init__(self): + self.config=self.load_config() + self.monitor_alerts = {} + + def load_config(self): + with open('./config/email-config.yaml', 'r') as file: + return yaml.safe_load(file) + + def send(self, subject, body): + message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (self.config['sender'], self.config['receiver'], subject, body) + + try: + with smtplib.SMTP(self.config['smtp_url']) as server: + server.starttls() + server.login(self.config['login'], self.config['password']) + server.sendmail(self.config['sender'], self.config['receiver'], message) + logging.info('Email Sent') + except smtplib.SMTPAuthenticationError: + logging.warning('The server didn\'t accept the user\\password combination.') + except smtplib.SMTPServerDisconnected: + logging.warning('Server unexpectedly disconnected') + except smtplib.SMTPException as e: + logging.exception('SMTP error occurred: ' + str(e)) + + def handle_email_alert(self, subject, body): + time_now = datetime.datetime.now() + + # to avoid email clutter, send email based on configured alert wait time + for alert in self.monitor_alerts: + time_start = self.monitor_alerts[alert] + time_delta = datetime.timedelta(hours=self.config['alert_wait_time']) + wait_time_reached = time_now - time_start > time_delta + + if wait_time_reached: + + if alert == body: + self.send(subject, body) + self.monitor_alerts[alert] = datetime.datetime.now() + + else: + self.monitor_alerts.pop(alert) + + if body not in self.monitor_alerts: + self.send(subject, body) + self.monitor_alerts[body] = datetime.datetime.now() \ No newline at end of file diff --git a/src/RepairManager/k8s_util.py b/src/RepairManager/utils/k8s_util.py similarity index 72% rename from src/RepairManager/k8s_util.py rename to src/RepairManager/utils/k8s_util.py index f4d6495af..4c83e1658 100644 --- a/src/RepairManager/k8s_util.py +++ b/src/RepairManager/utils/k8s_util.py @@ -6,10 +6,17 @@ def cordon_node(node_name): output = os.system('kubectl cordon %s' % node_name) return output + def is_node_unschedulable(node_info, node_name): for node in node_info.items: for address in node.status.addresses: if address.type == 'Hostname' and address.address == node_name: return node.spec.unschedulable - logging.warning(f"Could not find node with hostname {node_name}") \ No newline at end of file + logging.warning(f"Could not find node with hostname {node_name}") + + +def list_node(): + config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml') + api_instance = client.CoreV1Api() + return api_instance.list_node() \ No newline at end of file diff --git a/src/docker-images/RepairManager/prebuild.sh b/src/docker-images/RepairManager/prebuild.sh old mode 100755 new mode 100644 index cf80d3017..b449bcf1d --- a/src/docker-images/RepairManager/prebuild.sh +++ b/src/docker-images/RepairManager/prebuild.sh @@ -4,4 +4,10 @@ rm -rf RepairManager cp -r ../../../../RepairManager RepairManager + +# Render config file for email credentials +cd ../../../ +./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml +cd ./deploy/docker-images/RepairManager/ + cp -r ../../../../ClusterBootstrap/deploy/bin/kubectl kubectl From 7f150f87cc10a26ced8769acc436d6cbb8839645 Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Wed, 4 Dec 2019 07:40:27 +0000 Subject: [PATCH 04/12] PR feedback --- src/RepairManager/Rules/ecc_rule.py | 16 ++++----------- src/RepairManager/config/email-config.yaml | 2 +- src/RepairManager/config/rule-config.yaml | 10 ++++----- src/RepairManager/main.py | 2 +- src/RepairManager/utils/email.py | 24 +++++----------------- src/docker-images/RepairManager/Dockerfile | 1 + 6 files changed, 17 insertions(+), 38 deletions(-) diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index 73a7a69d2..7555b2a10 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -8,8 +8,6 @@ import yaml import logging -alert = email.EmailHandler() - def get_node_address_info(node_info): # map InternalIP to Hostname address_map = {} @@ -37,13 +35,9 @@ def get_node_address_info(node_info): def get_ECC_error_data(ecc_url): - response = requests.get(ecc_url) + response = requests.get(ecc_url) data = json.loads(response.text) - # use mock data - #file_object = open('./mock-data/ecc.json', 'r') - #data = json.load(file_object) - if data: ecc_metrics = data['data']['result'] logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics)) @@ -54,10 +48,11 @@ def get_ECC_error_data(ecc_url): class ECCRule(Rule): - def __init__(self): + def __init__(self, alert): self.config = self.load_rule_config() self.ecc_hostnames = [] self.node_info = {} + self.alert = alert def load_rule_config(self): with open('./config/rule-config.yaml', 'r') as file: @@ -79,19 +74,16 @@ def check_status(self): logging.info(f'Uncorrectable ECC metrics found: {self.ecc_hostnames}') return True - else: logging.debug('No uncorrectable ECC metrics found.') return False def take_action(self): body = 'ECC Error found on the following nodes:\n' - all_nodes_already_unscheduled = True for node_name in self.ecc_hostnames: if not k8s_util.is_node_unschedulable(self.node_info, node_name): - all_nodes_already_unscheduled = False success = k8s_util.cordon_node(node_name) if success != 0: @@ -101,5 +93,5 @@ def take_action(self): body += f'{node_name}: Successfully marked as unschedulable\n' subject = 'Repair Manager Alert [ECC ERROR]' - alert.handle_email_alert(subject, body) + self.alert.handle_email_alert(subject, body) diff --git a/src/RepairManager/config/email-config.yaml b/src/RepairManager/config/email-config.yaml index b2a0aec00..78e750ba5 100644 --- a/src/RepairManager/config/email-config.yaml +++ b/src/RepairManager/config/email-config.yaml @@ -5,4 +5,4 @@ sender: {{cnf['repair-manager']['alert']['sender']}} receiver: {{cnf['repair-manager']['alert']['receiver']}} # time to wait before resending email alert for exceptions (hours) -alert_wait_time: {{cnf['repair-manager']['alert']['alert_wait_time']}} \ No newline at end of file +alert_wait_seconds: {{cnf['repair-manager']['alert']['alert_wait_seconds']}} \ No newline at end of file diff --git a/src/RepairManager/config/rule-config.yaml b/src/RepairManager/config/rule-config.yaml index e923307f0..67c86064f 100644 --- a/src/RepairManager/config/rule-config.yaml +++ b/src/RepairManager/config/rule-config.yaml @@ -1,9 +1,9 @@ --- rules: - ecc_rule: - module_name : Rules.ecc_rule - class_name : ECCRule - ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' - + ecc_rule: + module_name : Rules.ecc_rule + class_name : ECCRule + ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' + # time to sleep between rule execution rule_wait_time: 2 diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py index 3e06fcc90..12ca560e8 100644 --- a/src/RepairManager/main.py +++ b/src/RepairManager/main.py @@ -44,7 +44,7 @@ def Run(): class_name = rules[r_key]['class_name'] r_module = sys.modules[module_name] r_class = getattr(r_module, class_name) - rule = r_class() + rule = r_class(alert) logger.debug(f'Executing {class_name} from module {module_name}') diff --git a/src/RepairManager/utils/email.py b/src/RepairManager/utils/email.py index f6459e159..59deece01 100644 --- a/src/RepairManager/utils/email.py +++ b/src/RepairManager/utils/email.py @@ -2,12 +2,13 @@ import logging import yaml import datetime +from cachetools import TTLCache class EmailHandler(): def __init__(self): self.config=self.load_config() - self.monitor_alerts = {} + self.alert_cache=TTLCache(maxsize=1000, ttl=self.config['alert_wait_seconds']) def load_config(self): with open('./config/email-config.yaml', 'r') as file: @@ -30,23 +31,8 @@ def send(self, subject, body): logging.exception('SMTP error occurred: ' + str(e)) def handle_email_alert(self, subject, body): - time_now = datetime.datetime.now() - # to avoid email clutter, send email based on configured alert wait time - for alert in self.monitor_alerts: - time_start = self.monitor_alerts[alert] - time_delta = datetime.timedelta(hours=self.config['alert_wait_time']) - wait_time_reached = time_now - time_start > time_delta - - if wait_time_reached: - - if alert == body: - self.send(subject, body) - self.monitor_alerts[alert] = datetime.datetime.now() - - else: - self.monitor_alerts.pop(alert) - - if body not in self.monitor_alerts: + # to avoid email spam, send email based on configured alert wait time + if body not in self.alert_cache: self.send(subject, body) - self.monitor_alerts[body] = datetime.datetime.now() \ No newline at end of file + self.alert_cache[body] = 1 \ No newline at end of file diff --git a/src/docker-images/RepairManager/Dockerfile b/src/docker-images/RepairManager/Dockerfile index f2d40e9ac..fcf6045fa 100644 --- a/src/docker-images/RepairManager/Dockerfile +++ b/src/docker-images/RepairManager/Dockerfile @@ -8,6 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN pip3 install wheel RUN pip3 install setuptools RUN pip3 install requests +RUN pip3 install cachetools COPY kubectl /usr/local/bin/kubectl RUN chmod +x /usr/local/bin/kubectl From b8fd08f866a557753ae85cba726e9fe1fb7b49b0 Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Thu, 5 Dec 2019 01:02:32 +0000 Subject: [PATCH 05/12] update time between rules --- src/RepairManager/config/rule-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/RepairManager/config/rule-config.yaml b/src/RepairManager/config/rule-config.yaml index 67c86064f..86887940f 100644 --- a/src/RepairManager/config/rule-config.yaml +++ b/src/RepairManager/config/rule-config.yaml @@ -6,4 +6,4 @@ rules: ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' # time to sleep between rule execution -rule_wait_time: 2 +rule_wait_time: 10 From 2bc2f0c40388f2be413836f03c537f2ed917801f Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Thu, 5 Dec 2019 06:25:52 +0000 Subject: [PATCH 06/12] email config fixes --- src/RepairManager/Rules/ecc_rule.py | 16 +++++++++++----- src/RepairManager/config/email-config.yaml | 2 +- src/RepairManager/config/rule-config.yaml | 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index 7555b2a10..88cb8f112 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -79,7 +79,7 @@ def check_status(self): return False def take_action(self): - body = 'ECC Error found on the following nodes:\n' + status = {} for node_name in self.ecc_hostnames: @@ -88,10 +88,16 @@ def take_action(self): if success != 0: logging.warning(f'Unscheduling of node {node_name} not successful') - body += f'{node_name}: Failed to mark as unschedulable\n' + status[node_name] = 'Failed to mark as unschedulable' else: - body += f'{node_name}: Successfully marked as unschedulable\n' + status[node_name] = 'Successfully marked as unschedulable' - subject = 'Repair Manager Alert [ECC ERROR]' - self.alert.handle_email_alert(subject, body) + else: + status[node_name] = 'Previously marked as unschedulable' + + body = 'Uncorrectable ECC Error found on the following nodes:\n' + for node_name in status: + body += f'{node_name} -> \t{status[node_name]}\n' + subject = 'Repair Manager Alert [ECC ERROR]' + self.alert.handle_email_alert(subject, body) \ No newline at end of file diff --git a/src/RepairManager/config/email-config.yaml b/src/RepairManager/config/email-config.yaml index 78e750ba5..4a75bfba6 100644 --- a/src/RepairManager/config/email-config.yaml +++ b/src/RepairManager/config/email-config.yaml @@ -4,5 +4,5 @@ password: {{cnf['repair-manager']['alert']['password']}} sender: {{cnf['repair-manager']['alert']['sender']}} receiver: {{cnf['repair-manager']['alert']['receiver']}} -# time to wait before resending email alert for exceptions (hours) +# time to wait before resending email alert for exceptions (seconds) alert_wait_seconds: {{cnf['repair-manager']['alert']['alert_wait_seconds']}} \ No newline at end of file diff --git a/src/RepairManager/config/rule-config.yaml b/src/RepairManager/config/rule-config.yaml index 67c86064f..0912010a4 100644 --- a/src/RepairManager/config/rule-config.yaml +++ b/src/RepairManager/config/rule-config.yaml @@ -5,5 +5,5 @@ rules: class_name : ECCRule ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' -# time to sleep between rule execution -rule_wait_time: 2 +# time to sleep between rule execution (seconds) +rule_wait_time: 10 From c0d413ec076cf55ddeedfaa06029566311f0ba23 Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Mon, 9 Dec 2019 23:55:14 +0000 Subject: [PATCH 07/12] more descriptive email alert --- src/RepairManager/Rules/ecc_rule.py | 194 ++++++++++---------- src/RepairManager/config/rule-config.yaml | 20 +- src/RepairManager/utils/k8s_util.py | 53 +++--- src/docker-images/RepairManager/prebuild.sh | 1 + 4 files changed, 136 insertions(+), 132 deletions(-) diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index 88cb8f112..6277dd757 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -1,103 +1,93 @@ -from Rules.rules_abc import Rule -from kubernetes import client, config -from utils import k8s_util, email -import requests -import json -import os -import time -import yaml -import logging - -def get_node_address_info(node_info): - # map InternalIP to Hostname - address_map = {} - - if node_info: - - for node in node_info.items: - internal_ip = None - hostname = None - - for address in node.status.addresses: - if address.type == 'InternalIP': - internal_ip = address.address - - if address.type == 'Hostname': - hostname = address.address - - address_map[internal_ip] = hostname - - logging.debug('node address map: %s ' % address_map) - - return address_map - - - -def get_ECC_error_data(ecc_url): - - response = requests.get(ecc_url) - data = json.loads(response.text) - - if data: - ecc_metrics = data['data']['result'] - logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics)) - - return ecc_metrics - - - -class ECCRule(Rule): - - def __init__(self, alert): - self.config = self.load_rule_config() - self.ecc_hostnames = [] - self.node_info = {} - self.alert = alert - - def load_rule_config(self): - with open('./config/rule-config.yaml', 'r') as file: - return yaml.safe_load(file) - - def check_status(self): - # save node_info to reduce the number of API calls - self.node_info = k8s_util.list_node() - - address_map = get_node_address_info(self.node_info) - - ecc_url = os.environ['PROMETHEUS_HOST'] + self.config['rules']['ecc_rule']['ecc_error_url'] - ecc_metrics = get_ECC_error_data(ecc_url) - - if ecc_metrics: - for m in ecc_metrics: - offending_node_ip = m['metric']['instance'].split(':')[0] - self.ecc_hostnames.append(address_map[offending_node_ip]) - - logging.info(f'Uncorrectable ECC metrics found: {self.ecc_hostnames}') - return True - else: - logging.debug('No uncorrectable ECC metrics found.') - return False - - def take_action(self): - status = {} - - for node_name in self.ecc_hostnames: - - if not k8s_util.is_node_unschedulable(self.node_info, node_name): - success = k8s_util.cordon_node(node_name) - - if success != 0: - logging.warning(f'Unscheduling of node {node_name} not successful') - status[node_name] = 'Failed to mark as unschedulable' - else: - status[node_name] = 'Successfully marked as unschedulable' - - else: - status[node_name] = 'Previously marked as unschedulable' - - body = 'Uncorrectable ECC Error found on the following nodes:\n' - for node_name in status: - body += f'{node_name} -> \t{status[node_name]}\n' - - subject = 'Repair Manager Alert [ECC ERROR]' +from Rules.rules_abc import Rule +from kubernetes import client, config +from utils import k8s_util, email +import requests +import json +import os +import time +import yaml +import logging + +def get_node_address_info(node_info): + # map InternalIP to Hostname + address_map = {} + + if node_info: + + for node in node_info.items: + internal_ip = None + hostname = None + + for address in node.status.addresses: + if address.type == 'InternalIP': + internal_ip = address.address + + if address.type == 'Hostname': + hostname = address.address + + address_map[internal_ip] = hostname + + logging.debug('node address map: %s ' % address_map) + + return address_map + + + +def get_ECC_error_data(ecc_url): + + response = requests.get(ecc_url) + data = json.loads(response.text) + + if data: + ecc_metrics = data['data']['result'] + logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics)) + + return ecc_metrics + + + +class ECCRule(Rule): + + def __init__(self, alert): + self.config = self.load_rule_config() + self.ecc_hostnames = [] + self.node_info = {} + self.alert = alert + + def load_rule_config(self): + with open('./config/rule-config.yaml', 'r') as file: + return yaml.safe_load(file) + + def check_status(self): + # save node_info to reduce the number of API calls + self.node_info = k8s_util.list_node() + + address_map = get_node_address_info(self.node_info) + + ecc_url = os.environ['PROMETHEUS_HOST'] + self.config['rules']['ecc_rule']['ecc_error_url'] + ecc_metrics = get_ECC_error_data(ecc_url) + + if ecc_metrics: + for m in ecc_metrics: + offending_node_ip = m['metric']['instance'].split(':')[0] + self.ecc_hostnames.append(address_map[offending_node_ip]) + + logging.info(f'Uncorrectable ECC metrics found: {self.ecc_hostnames}') + return True + else: + logging.debug('No uncorrectable ECC metrics found.') + return False + + def take_action(self): + status = {} + + for node_name in self.ecc_hostnames: + output = k8s_util.cordon_node(node_name, dry_run=True) + status[node_name] = output + + body = f'Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:\n' + for node_name in status: + body += f'{node_name}:\t{status[node_name]}\n\n' + + subject = f'Repair Manager Alert [ECC ERROR] [{self.config["cluster_name"]}]' self.alert.handle_email_alert(subject, body) \ No newline at end of file diff --git a/src/RepairManager/config/rule-config.yaml b/src/RepairManager/config/rule-config.yaml index 0912010a4..d94c82e2f 100644 --- a/src/RepairManager/config/rule-config.yaml +++ b/src/RepairManager/config/rule-config.yaml @@ -1,9 +1,11 @@ ---- -rules: - ecc_rule: - module_name : Rules.ecc_rule - class_name : ECCRule - ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' - -# time to sleep between rule execution (seconds) -rule_wait_time: 10 +--- +rules: + ecc_rule: + module_name : Rules.ecc_rule + class_name : ECCRule + ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' + +# time to sleep between rule execution (seconds) +rule_wait_time: 10 + +cluster_name: {{cnf['repair-manager']['cluster_name']}} diff --git a/src/RepairManager/utils/k8s_util.py b/src/RepairManager/utils/k8s_util.py index 4c83e1658..acda6ff06 100644 --- a/src/RepairManager/utils/k8s_util.py +++ b/src/RepairManager/utils/k8s_util.py @@ -1,22 +1,33 @@ -import os -import logging -from kubernetes import client, config - -def cordon_node(node_name): - output = os.system('kubectl cordon %s' % node_name) - return output - - -def is_node_unschedulable(node_info, node_name): - for node in node_info.items: - for address in node.status.addresses: - if address.type == 'Hostname' and address.address == node_name: - return node.spec.unschedulable - - logging.warning(f"Could not find node with hostname {node_name}") - - -def list_node(): - config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml') - api_instance = client.CoreV1Api() +import subprocess +import logging +from kubernetes import client, config + +def cordon_node(node_name, dry_run=True): + args = ['kubectl', 'cordon', node_name] + + if dry_run: + args.append('--dry-run') + + try: + output = subprocess.check_output(args, stderr=subprocess.STDOUT) + output_decoded = output.decode() + logging.info(output_decoded) + return output_decoded + except Exception as e: + logging.exception(f'Exception attempting to cordon node {node_name}') + return str(e) + + +def is_node_unschedulable(node_info, node_name): + for node in node_info.items: + for address in node.status.addresses: + if address.type == 'Hostname' and address.address == node_name: + return node.spec.unschedulable + + logging.warning(f"Could not find node with hostname {node_name}") + + +def list_node(): + config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml') + api_instance = client.CoreV1Api() return api_instance.list_node() \ No newline at end of file diff --git a/src/docker-images/RepairManager/prebuild.sh b/src/docker-images/RepairManager/prebuild.sh index b449bcf1d..3dd1aafd1 100644 --- a/src/docker-images/RepairManager/prebuild.sh +++ b/src/docker-images/RepairManager/prebuild.sh @@ -8,6 +8,7 @@ cp -r ../../../../RepairManager RepairManager # Render config file for email credentials cd ../../../ ./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml +./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml cd ./deploy/docker-images/RepairManager/ cp -r ../../../../ClusterBootstrap/deploy/bin/kubectl kubectl From 46150c5dd115615a4259685d79133256a3ea4c85 Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Tue, 10 Dec 2019 01:05:37 +0000 Subject: [PATCH 08/12] nit --- src/RepairManager/Rules/ecc_rule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index 57eeb1aaf..762c40685 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -85,9 +85,9 @@ def take_action(self): output = k8s_util.cordon_node(node_name, dry_run=True) status[node_name] = output - body = f'Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:\n' + body = f'Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:\n\n' for node_name in status: - body += f'{node_name}:\t{status[node_name]}\n\n' + body += f'{node_name}:\t{status[node_name]}\n' subject = f'Repair Manager Alert [ECC ERROR] [{self.config["cluster_name"]}]' self.alert.handle_email_alert(subject, body) \ No newline at end of file From 802ce1f8c3269eee56536d2934e3ea457c277672 Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Mon, 9 Dec 2019 19:03:04 -0800 Subject: [PATCH 09/12] "fixing output error message" --- src/RepairManager/main.py | 4 ++-- src/RepairManager/utils/k8s_util.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py index 12ca560e8..ba724bb74 100644 --- a/src/RepairManager/main.py +++ b/src/RepairManager/main.py @@ -55,13 +55,13 @@ def Run(): except Exception as e: logger.exception(f'Error executing {class_name} from module {module_name}\n') - subject = f'Repair Manager Alert [Exception executing {class_name}]' + subject = f'Repair Manager Rule Error [{config["cluster_name"]}]' body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) alert.handle_email_alert(subject, body) except Exception as e: logger.exception('Repair manager has stopped due to an unhandled exception:') - subject = '[Repair Manager Alert] Repair manager has stopped unexpectedly' + subject = f'[Repair Manager Exception] [{config["cluster_name"]}]' body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) alert.send(subject, body) diff --git a/src/RepairManager/utils/k8s_util.py b/src/RepairManager/utils/k8s_util.py index acda6ff06..a50726980 100644 --- a/src/RepairManager/utils/k8s_util.py +++ b/src/RepairManager/utils/k8s_util.py @@ -10,12 +10,11 @@ def cordon_node(node_name, dry_run=True): try: output = subprocess.check_output(args, stderr=subprocess.STDOUT) - output_decoded = output.decode() - logging.info(output_decoded) - return output_decoded - except Exception as e: + logging.info(output.decode()) + return output.decode() + except subprocess.CalledProcessError as e: logging.exception(f'Exception attempting to cordon node {node_name}') - return str(e) + return e.output.decode() def is_node_unschedulable(node_info, node_name): From c17a367e9ce88cd2ee74e62b37ae97d439de746b Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Wed, 11 Dec 2019 23:27:23 +0000 Subject: [PATCH 10/12] try/catch for prometheus request --- src/RepairManager/Rules/ecc_rule.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index 762c40685..2427c9f64 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -35,14 +35,20 @@ def get_node_address_info(node_info): def get_ECC_error_data(ecc_url): - response = requests.get(ecc_url) - data = json.loads(response.text) - - if data: - ecc_metrics = data['data']['result'] - logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics)) + try: + response = requests.get(ecc_url) + if response: + data = json.loads(response.text) + + if data: + ecc_metrics = data['data']['result'] + logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics)) + return ecc_metrics + else: + logging.warning(f'No response from {ecc_url} found.') - return ecc_metrics + except Exception: + logging.exception(f'Error retrieving data from {ecc_url}') From fdc84d2100cef945c1ff157498045d959f10ff5c Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Fri, 13 Dec 2019 01:16:24 +0000 Subject: [PATCH 11/12] email multiple recipients, configurable --- src/RepairManager/utils/email.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/RepairManager/utils/email.py b/src/RepairManager/utils/email.py index 59deece01..e90404967 100644 --- a/src/RepairManager/utils/email.py +++ b/src/RepairManager/utils/email.py @@ -15,14 +15,14 @@ def load_config(self): return yaml.safe_load(file) def send(self, subject, body): - message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (self.config['sender'], self.config['receiver'], subject, body) + message = f"From: {self.config['sender']}\r\nTo: {';'.join(self.config['receiver'])}\r\nSubject: {subject}\r\n\r\n{body}" try: with smtplib.SMTP(self.config['smtp_url']) as server: server.starttls() server.login(self.config['login'], self.config['password']) server.sendmail(self.config['sender'], self.config['receiver'], message) - logging.info('Email Sent') + logging.info(f"Email sent to {', '.join(self.config['receiver'])}") except smtplib.SMTPAuthenticationError: logging.warning('The server didn\'t accept the user\\password combination.') except smtplib.SMTPServerDisconnected: From 80f3155dff6055a252e6b5161890740f33cc3903 Mon Sep 17 00:00:00 2001 From: debbie-alaine Date: Thu, 19 Dec 2019 03:05:52 +0000 Subject: [PATCH 12/12] add functionaity to email job owners --- .gitignore | 1 + src/RepairManager/Rules/ecc_rule.py | 46 ++++++++++++++++------ src/RepairManager/config/email-config.yaml | 3 +- src/RepairManager/config/rule-config.yaml | 1 + src/RepairManager/main.py | 4 +- src/RepairManager/utils/email.py | 23 +++++++---- src/RepairManager/utils/k8s_util.py | 7 +++- src/docker-images/RepairManager/Dockerfile | 1 + 8 files changed, 63 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index a3a658641..3c79ac98c 100755 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ src/WebUI/dotnet/WebPortal/hosting.json **/storage.ide src/RepairManager/.vs/* +src/RepairManager/mock-data/* /.vs/slnx.sqlite /.vs/ProjectSettings.json /.vs/VSWorkspaceState.json diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py index 2427c9f64..16c466c7b 100644 --- a/src/RepairManager/Rules/ecc_rule.py +++ b/src/RepairManager/Rules/ecc_rule.py @@ -1,6 +1,7 @@ from Rules.rules_abc import Rule from kubernetes import client, config from utils import k8s_util, email +from tabulate import tabulate import requests import json import os @@ -27,14 +28,13 @@ def get_node_address_info(node_info): address_map[internal_ip] = hostname - logging.debug('node address map: %s ' % address_map) + logging.debug(f'node address map: {address_map}') return address_map def get_ECC_error_data(ecc_url): - try: response = requests.get(ecc_url) if response: @@ -47,9 +47,23 @@ def get_ECC_error_data(ecc_url): else: logging.warning(f'No response from {ecc_url} found.') - except Exception: + except: logging.exception(f'Error retrieving data from {ecc_url}') +def get_job_info_from_nodes(nodes): + pods = k8s_util.list_pod_for_all_namespaces() + + jobs = {} + for pod in pods.items: + if pod.metadata and pod.metadata.labels: + if 'jobId' in pod.metadata.labels and 'userName' in pod.metadata.labels: + if pod.spec.node_name in nodes: + jobs[pod.metadata.labels['jobId']] = { + 'userName': pod.metadata.labels['userName'], + 'nodeName': pod.spec.node_name, + 'vcName': pod.metadata.labels['vcName'] + } + return jobs class ECCRule(Rule): @@ -85,15 +99,25 @@ def check_status(self): return False def take_action(self): - status = {} - + status = [] for node_name in self.ecc_hostnames: output = k8s_util.cordon_node(node_name, dry_run=True) - status[node_name] = output - - body = f'Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:\n\n' - for node_name in status: - body += f'{node_name}:\t{status[node_name]}\n' + status.append([node_name, output]) subject = f'Repair Manager Alert [ECC ERROR] [{self.config["cluster_name"]}]' - self.alert.handle_email_alert(subject, body) \ No newline at end of file + body = f'

Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:

' + body += tabulate(status, headers=['node name', 'action status'], tablefmt="html").replace('','
') + + body += f'

Impacted Jobs and Job Owners

' + job_owners = [] + job_info = [] + jobs = get_job_info_from_nodes(self.ecc_hostnames) + for jobId in jobs: + job_owners.append(jobs[jobId]['userName'] + '@microsoft.com') + job_info.append([jobId, jobs[jobId]['userName'], jobs[jobId]['nodeName'], jobs[jobId]['vcName']]) + body += tabulate(job_info, headers=['job id', 'job owner', 'node name', 'vc name' ], tablefmt="html").replace('
','
') + + if self.config['rules']['ecc_rule']['alert_job_owners']: + self.alert.handle_email_alert(subject, body, additional_recipients=job_owners) + else: + self.alert.handle_email_alert(subject, body) \ No newline at end of file diff --git a/src/RepairManager/config/email-config.yaml b/src/RepairManager/config/email-config.yaml index 4a75bfba6..317cdd004 100644 --- a/src/RepairManager/config/email-config.yaml +++ b/src/RepairManager/config/email-config.yaml @@ -4,5 +4,4 @@ password: {{cnf['repair-manager']['alert']['password']}} sender: {{cnf['repair-manager']['alert']['sender']}} receiver: {{cnf['repair-manager']['alert']['receiver']}} -# time to wait before resending email alert for exceptions (seconds) -alert_wait_seconds: {{cnf['repair-manager']['alert']['alert_wait_seconds']}} \ No newline at end of file +alert_wait_seconds: 86400 # time to wait before resending email alert for exceptions (seconds) \ No newline at end of file diff --git a/src/RepairManager/config/rule-config.yaml b/src/RepairManager/config/rule-config.yaml index 91c31c27c..de7103df0 100644 --- a/src/RepairManager/config/rule-config.yaml +++ b/src/RepairManager/config/rule-config.yaml @@ -4,6 +4,7 @@ rules: module_name : Rules.ecc_rule class_name : ECCRule ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0' + alert_job_owners: False # alert all impacted job owners # time to sleep between rule execution (seconds) rule_wait_time: 10 diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py index ba724bb74..46feb0c99 100644 --- a/src/RepairManager/main.py +++ b/src/RepairManager/main.py @@ -55,13 +55,13 @@ def Run(): except Exception as e: logger.exception(f'Error executing {class_name} from module {module_name}\n') - subject = f'Repair Manager Rule Error [{config["cluster_name"]}]' + subject = f'Repair Manager Alert [Rule Error] [{class_name}] [{config["cluster_name"]}]' body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) alert.handle_email_alert(subject, body) except Exception as e: logger.exception('Repair manager has stopped due to an unhandled exception:') - subject = f'[Repair Manager Exception] [{config["cluster_name"]}]' + subject = f'Repair Manager Alert [Repair Manager Crashed] [{config["cluster_name"]}]' body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) alert.send(subject, body) diff --git a/src/RepairManager/utils/email.py b/src/RepairManager/utils/email.py index e90404967..0cc1d6ed6 100644 --- a/src/RepairManager/utils/email.py +++ b/src/RepairManager/utils/email.py @@ -14,15 +14,25 @@ def load_config(self): with open('./config/email-config.yaml', 'r') as file: return yaml.safe_load(file) - def send(self, subject, body): - message = f"From: {self.config['sender']}\r\nTo: {';'.join(self.config['receiver'])}\r\nSubject: {subject}\r\n\r\n{body}" + def send(self, subject, body, additional_recipients=None): + recepients = self.config['receiver'] + if additional_recipients is not None: + recepients = recepients + additional_recipients + + message = ( + f"From: {self.config['sender']}\r\n" + f"To: {';'.join(recepients)}\r\n" + f"MIME-Version: 1.0\r\n" + f"Content-type: text/html\r\n" + f"Subject: {subject}\r\n\r\n{body}" + ) try: with smtplib.SMTP(self.config['smtp_url']) as server: server.starttls() server.login(self.config['login'], self.config['password']) - server.sendmail(self.config['sender'], self.config['receiver'], message) - logging.info(f"Email sent to {', '.join(self.config['receiver'])}") + server.sendmail(self.config['sender'], recepients, message) + logging.info(f"Email sent to {', '.join(recepients)}") except smtplib.SMTPAuthenticationError: logging.warning('The server didn\'t accept the user\\password combination.') except smtplib.SMTPServerDisconnected: @@ -30,9 +40,8 @@ def send(self, subject, body): except smtplib.SMTPException as e: logging.exception('SMTP error occurred: ' + str(e)) - def handle_email_alert(self, subject, body): - + def handle_email_alert(self, subject, body, additional_recipients=None): # to avoid email spam, send email based on configured alert wait time if body not in self.alert_cache: - self.send(subject, body) + self.send(subject, body, additional_recipients) self.alert_cache[body] = 1 \ No newline at end of file diff --git a/src/RepairManager/utils/k8s_util.py b/src/RepairManager/utils/k8s_util.py index a50726980..9c796d394 100644 --- a/src/RepairManager/utils/k8s_util.py +++ b/src/RepairManager/utils/k8s_util.py @@ -29,4 +29,9 @@ def is_node_unschedulable(node_info, node_name): def list_node(): config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml') api_instance = client.CoreV1Api() - return api_instance.list_node() \ No newline at end of file + return api_instance.list_node() + +def list_pod_for_all_namespaces(): + config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml',) + api_instance = client.CoreV1Api() + return api_instance.list_pod_for_all_namespaces() \ No newline at end of file diff --git a/src/docker-images/RepairManager/Dockerfile b/src/docker-images/RepairManager/Dockerfile index fcf6045fa..09d68d68a 100644 --- a/src/docker-images/RepairManager/Dockerfile +++ b/src/docker-images/RepairManager/Dockerfile @@ -9,6 +9,7 @@ RUN pip3 install wheel RUN pip3 install setuptools RUN pip3 install requests RUN pip3 install cachetools +RUN pip3 install tabulate COPY kubectl /usr/local/bin/kubectl RUN chmod +x /usr/local/bin/kubectl