Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

repair manager - add more details to alert emails #709

Merged
merged 15 commits
Dec 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ src/WebUI/dotnet/WebPortal/hosting.json
**/storage.ide

src/RepairManager/.vs/*
src/RepairManager/mock-data/*
/.vs/slnx.sqlite
/.vs/ProjectSettings.json
/.vs/VSWorkspaceState.json
Expand Down
78 changes: 49 additions & 29 deletions src/RepairManager/Rules/ecc_rule.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from Rules.rules_abc import Rule
from kubernetes import client, config
from utils import k8s_util, email
from tabulate import tabulate
import requests
import json
import os
Expand All @@ -27,23 +28,42 @@ def get_node_address_info(node_info):

address_map[internal_ip] = hostname

logging.debug('node address map: %s ' % address_map)
logging.debug(f'node address map: {address_map}')

return address_map



def get_ECC_error_data(ecc_url):
    """Query Prometheus for volatile ECC error metrics.

    Args:
        ecc_url: full Prometheus query URL (presumably the
            `nvidiasmi_ecc_error_count` query from rule-config — confirm).

    Returns:
        The `data.result` metric list from the Prometheus JSON response,
        or None when the response is empty/falsy or any error occurs.
    """
    try:
        response = requests.get(ecc_url)
        if response:
            data = json.loads(response.text)
            if data:
                ecc_metrics = data['data']['result']
                logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics))
                return ecc_metrics
        else:
            logging.warning(f'No response from {ecc_url} found.')
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; request and JSON-parse failures are still
        # logged and treated as "no data".
        logging.exception(f'Error retrieving data from {ecc_url}')
    # Explicit None instead of falling off the end (the old variant could
    # raise UnboundLocalError returning `ecc_metrics` after an exception).
    return None
def get_job_info_from_nodes(nodes):
    """Collect info on jobs currently scheduled on the given nodes.

    Args:
        nodes: collection of node names (membership-tested against
            `pod.spec.node_name`).

    Returns:
        dict mapping jobId -> {'userName', 'nodeName', 'vcName'} for every
        labeled job pod running on one of `nodes`. A job spanning several
        pods keeps the last pod's node/vc values.
    """
    pods = k8s_util.list_pod_for_all_namespaces()

    jobs = {}
    for pod in pods.items:
        # Guard clauses instead of three nested ifs.
        if not (pod.metadata and pod.metadata.labels):
            continue
        labels = pod.metadata.labels
        if 'jobId' not in labels or 'userName' not in labels:
            continue
        if pod.spec.node_name not in nodes:
            continue
        jobs[labels['jobId']] = {
            'userName': labels['userName'],
            'nodeName': pod.spec.node_name,
            # Unlike jobId/userName, vcName was read unguarded and could
            # raise KeyError on pods missing that label; .get() keeps the
            # job entry with vcName=None instead of crashing the rule.
            'vcName': labels.get('vcName'),
        }
    return jobs


class ECCRule(Rule):
Expand Down Expand Up @@ -79,25 +99,25 @@ def check_status(self):
return False

def take_action(self):
    """Dry-run cordon every node with ECC errors and email an HTML report.

    Builds two HTML tables — per-node cordon status and impacted jobs —
    and sends them via the alert handler. Job owners are CC'd only when
    `rules.ecc_rule.alert_job_owners` is enabled in config.
    """
    status = []
    for node_name in self.ecc_hostnames:
        # NOTE(review): cordon is invoked with dry_run=True, so nodes are
        # not actually marked unschedulable — confirm this is intended.
        output = k8s_util.cordon_node(node_name, dry_run=True)
        status.append([node_name, output])

    subject = f'Repair Manager Alert [ECC ERROR] [{self.config["cluster_name"]}]'
    # Fixed mismatched closing tag: header opened with <h3> but closed </h1>.
    body = f'<h3>Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:</h3>'
    body += tabulate(status, headers=['node name', 'action status'],
                     tablefmt="html").replace('<table>', '<table border="1">')

    body += '<h3>Impacted Jobs and Job Owners</h3>'
    job_owners = []
    job_info = []
    jobs = get_job_info_from_nodes(self.ecc_hostnames)
    for jobId in jobs:
        job_owners.append(jobs[jobId]['userName'] + '@microsoft.com')
        job_info.append([jobId, jobs[jobId]['userName'],
                         jobs[jobId]['nodeName'], jobs[jobId]['vcName']])
    body += tabulate(job_info, headers=['job id', 'job owner', 'node name', 'vc name'],
                     tablefmt="html").replace('<table>', '<table border="1">')

    if self.config['rules']['ecc_rule']['alert_job_owners']:
        self.alert.handle_email_alert(subject, body, additional_recipients=job_owners)
    else:
        self.alert.handle_email_alert(subject, body)
3 changes: 1 addition & 2 deletions src/RepairManager/config/email-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@ password: {{cnf['repair-manager']['alert']['password']}}
sender: {{cnf['repair-manager']['alert']['sender']}}
receiver: {{cnf['repair-manager']['alert']['receiver']}}

# time to wait before resending email alert for exceptions (seconds)
alert_wait_seconds: {{cnf['repair-manager']['alert']['alert_wait_seconds']}}
alert_wait_seconds: 86400 # time to wait before resending email alert for exceptions (seconds)
4 changes: 4 additions & 0 deletions src/RepairManager/config/rule-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ rules:
module_name : Rules.ecc_rule
class_name : ECCRule
ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0'
alert_job_owners: False # alert all impacted job owners

# time to sleep between rule execution (seconds)
rule_wait_time: 10

cluster_name: {{cnf['repair-manager']['cluster_name']}}

4 changes: 2 additions & 2 deletions src/RepairManager/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ def Run():

except Exception as e:
logger.exception(f'Error executing {class_name} from module {module_name}\n')
subject = f'Repair Manager Alert [Exception executing {class_name}]'
subject = f'Repair Manager Alert [Rule Error] [{class_name}] [{config["cluster_name"]}]'
body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))
alert.handle_email_alert(subject, body)

except Exception as e:
logger.exception('Repair manager has stopped due to an unhandled exception:')
subject = '[Repair Manager Alert] Repair manager has stopped unexpectedly'
subject = f'Repair Manager Alert [Repair Manager Crashed] [{config["cluster_name"]}]'
body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))
alert.send(subject, body)

Expand Down
23 changes: 16 additions & 7 deletions src/RepairManager/utils/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,34 @@ def load_config(self):
with open('./config/email-config.yaml', 'r') as file:
return yaml.safe_load(file)

def send(self, subject, body):
message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (self.config['sender'], self.config['receiver'], subject, body)
def send(self, subject, body, additional_recipients=None):
    """Send an HTML email via SMTP with STARTTLS.

    Args:
        subject: email subject line.
        body: HTML body.
        additional_recipients: optional list of extra addresses appended
            to the configured receiver list.

    SMTP auth/disconnect/protocol errors are logged and swallowed
    (best-effort alerting); other connection errors propagate as before.
    """
    # Fixed local-name typo ("recepients").
    recipients = self.config['receiver']
    if additional_recipients is not None:
        recipients = recipients + additional_recipients

    message = (
        f"From: {self.config['sender']}\r\n"
        # RFC 5322 address lists are comma-separated; the previous
        # ';' join produces a malformed To: header for some MTAs.
        f"To: {', '.join(recipients)}\r\n"
        f"MIME-Version: 1.0\r\n"
        f"Content-type: text/html\r\n"
        f"Subject: {subject}\r\n\r\n{body}"
    )

    try:
        with smtplib.SMTP(self.config['smtp_url']) as server:
            server.starttls()
            server.login(self.config['login'], self.config['password'])
            server.sendmail(self.config['sender'], recipients, message)
            logging.info(f"Email sent to {', '.join(recipients)}")
    except smtplib.SMTPAuthenticationError:
        logging.warning('The server didn\'t accept the user\\password combination.')
    except smtplib.SMTPServerDisconnected:
        logging.warning('Server unexpectedly disconnected')
    except smtplib.SMTPException as e:
        logging.exception('SMTP error occurred: ' + str(e))

def handle_email_alert(self, subject, body):

def handle_email_alert(self, subject, body, additional_recipients=None):
    """Send an alert unless an identical body was alerted recently.

    Dedupes on the exact body text; presumably `alert_cache` entries
    expire after the configured alert wait time (cache type not visible
    here — confirm).
    """
    if body in self.alert_cache:
        # Identical alert already sent; suppress to avoid email spam.
        return
    self.send(subject, body, additional_recipients)
    self.alert_cache[body] = 1
59 changes: 37 additions & 22 deletions src/RepairManager/utils/k8s_util.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
import logging
import os
import subprocess

from kubernetes import client, config

def cordon_node(node_name):
    """Mark `node_name` unschedulable via `kubectl cordon`.

    Returns the command's exit status; callers treat nonzero as failure,
    matching the previous os.system contract.
    """
    # Argument-list subprocess call instead of os.system('kubectl cordon %s'):
    # no shell, so a hostile/odd node name cannot be shell-interpolated.
    return subprocess.call(['kubectl', 'cordon', node_name])


def is_node_unschedulable(node_info, node_name):
    """Look up a node by its Hostname address and return its
    `spec.unschedulable` flag.

    Returns None (falsy) and logs a warning when no node matches.
    """
    for node in node_info.items:
        hostnames = (a.address for a in node.status.addresses
                     if a.type == 'Hostname')
        if node_name in hostnames:
            return node.spec.unschedulable

    logging.warning(f"Could not find node with hostname {node_name}")


def list_node():
    """Return the cluster node list from the Kubernetes API.

    Loads credentials from the restapi kubeconfig on every call.
    """
    config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml')
    api_instance = client.CoreV1Api()
    return api_instance.list_node()
import subprocess
import logging
from kubernetes import client, config

def cordon_node(node_name, dry_run=True):
    """Run `kubectl cordon` against `node_name` and return kubectl's
    combined stdout/stderr as text.

    With dry_run=True (the default) `--dry-run` is appended, so the node
    is not actually modified. On a nonzero exit the failure is logged and
    kubectl's output is still returned.
    """
    cmd = ['kubectl', 'cordon', node_name]
    if dry_run:
        cmd.append('--dry-run')

    try:
        result = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as err:
        logging.exception(f'Exception attempting to cordon node {node_name}')
        return err.output.decode()

    logging.info(result)
    return result


def is_node_unschedulable(node_info, node_name):
    """Return whether the node matching `node_name` (by Hostname address)
    is marked unschedulable.

    Args:
        node_info: node list object with `.items` (as from list_node()).
        node_name: hostname to look up.

    Returns:
        The node's `spec.unschedulable` flag, or False (with a warning
        logged) when no node matches.
    """
    for node in node_info.items:
        for address in node.status.addresses:
            if address.type == 'Hostname' and address.address == node_name:
                return node.spec.unschedulable

    logging.warning(f"Could not find node with hostname {node_name}")
    # Explicit False instead of an implicit None return: same truthiness
    # for existing callers, but an unambiguous boolean contract.
    return False


def list_node():
    """Return the cluster node list from the Kubernetes API.

    Loads credentials from the restapi kubeconfig on every call.
    """
    config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml')
    api_instance = client.CoreV1Api()
    return api_instance.list_node()

def list_pod_for_all_namespaces():
    """Return all pods across every namespace from the Kubernetes API.

    Loads credentials from the restapi kubeconfig on every call.
    """
    # Note: stray trailing comma in the call below is harmless but odd.
    config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml',)
    api_instance = client.CoreV1Api()
    return api_instance.list_pod_for_all_namespaces()
1 change: 1 addition & 0 deletions src/docker-images/RepairManager/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ RUN pip3 install wheel
RUN pip3 install setuptools
RUN pip3 install requests
RUN pip3 install cachetools
RUN pip3 install tabulate

COPY kubectl /usr/local/bin/kubectl
RUN chmod +x /usr/local/bin/kubectl
Expand Down
1 change: 1 addition & 0 deletions src/docker-images/RepairManager/prebuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ cp -r ../../../../RepairManager RepairManager
# Render config file for email credentials
cd ../../../
./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml
./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml
cd ./deploy/docker-images/RepairManager/

cp -r ../../../../ClusterBootstrap/deploy/bin/kubectl kubectl