diff --git a/.gitignore b/.gitignore
index a3a658641..3c79ac98c 100755
--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,7 @@ src/WebUI/dotnet/WebPortal/hosting.json
**/storage.ide
src/RepairManager/.vs/*
+src/RepairManager/mock-data/*
/.vs/slnx.sqlite
/.vs/ProjectSettings.json
/.vs/VSWorkspaceState.json
diff --git a/src/RepairManager/Rules/ecc_rule.py b/src/RepairManager/Rules/ecc_rule.py
index 88cb8f112..16c466c7b 100644
--- a/src/RepairManager/Rules/ecc_rule.py
+++ b/src/RepairManager/Rules/ecc_rule.py
@@ -1,6 +1,7 @@
from Rules.rules_abc import Rule
from kubernetes import client, config
from utils import k8s_util, email
+from tabulate import tabulate
import requests
import json
import os
@@ -27,23 +28,42 @@ def get_node_address_info(node_info):
address_map[internal_ip] = hostname
- logging.debug('node address map: %s ' % address_map)
+ logging.debug(f'node address map: {address_map}')
return address_map
def get_ECC_error_data(ecc_url):
+ try:
+ response = requests.get(ecc_url)
+ if response:
+ data = json.loads(response.text)
+
+ if data:
+ ecc_metrics = data['data']['result']
+ logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics))
+ return ecc_metrics
+ else:
+ logging.warning(f'No response from {ecc_url} found.')
- response = requests.get(ecc_url)
- data = json.loads(response.text)
-
- if data:
- ecc_metrics = data['data']['result']
- logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics))
+ except:
+ logging.exception(f'Error retrieving data from {ecc_url}')
- return ecc_metrics
+def get_job_info_from_nodes(nodes):
+ pods = k8s_util.list_pod_for_all_namespaces()
+ jobs = {}
+ for pod in pods.items:
+ if pod.metadata and pod.metadata.labels:
+ if 'jobId' in pod.metadata.labels and 'userName' in pod.metadata.labels:
+ if pod.spec.node_name in nodes:
+ jobs[pod.metadata.labels['jobId']] = {
+ 'userName': pod.metadata.labels['userName'],
+ 'nodeName': pod.spec.node_name,
+ 'vcName': pod.metadata.labels['vcName']
+ }
+ return jobs
class ECCRule(Rule):
@@ -79,25 +99,25 @@ def check_status(self):
return False
def take_action(self):
- status = {}
-
+ status = []
for node_name in self.ecc_hostnames:
-
- if not k8s_util.is_node_unschedulable(self.node_info, node_name):
- success = k8s_util.cordon_node(node_name)
-
- if success != 0:
- logging.warning(f'Unscheduling of node {node_name} not successful')
- status[node_name] = 'Failed to mark as unschedulable'
- else:
- status[node_name] = 'Successfully marked as unschedulable'
-
- else:
- status[node_name] = 'Previously marked as unschedulable'
-
- body = 'Uncorrectable ECC Error found on the following nodes:\n'
- for node_name in status:
- body += f'{node_name} -> \t{status[node_name]}\n'
-
- subject = 'Repair Manager Alert [ECC ERROR]'
- self.alert.handle_email_alert(subject, body)
\ No newline at end of file
+ output = k8s_util.cordon_node(node_name, dry_run=True)
+ status.append([node_name, output])
+
+ subject = f'Repair Manager Alert [ECC ERROR] [{self.config["cluster_name"]}]'
+ body = f'
Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:'
+ body += tabulate(status, headers=['node name', 'action status'], tablefmt="html").replace('
','')
+
+ body += f'Impacted Jobs and Job Owners
'
+ job_owners = []
+ job_info = []
+ jobs = get_job_info_from_nodes(self.ecc_hostnames)
+ for jobId in jobs:
+ job_owners.append(jobs[jobId]['userName'] + '@microsoft.com')
+ job_info.append([jobId, jobs[jobId]['userName'], jobs[jobId]['nodeName'], jobs[jobId]['vcName']])
+ body += tabulate(job_info, headers=['job id', 'job owner', 'node name', 'vc name' ], tablefmt="html").replace('','')
+
+ if self.config['rules']['ecc_rule']['alert_job_owners']:
+ self.alert.handle_email_alert(subject, body, additional_recipients=job_owners)
+ else:
+ self.alert.handle_email_alert(subject, body)
\ No newline at end of file
diff --git a/src/RepairManager/config/email-config.yaml b/src/RepairManager/config/email-config.yaml
index 4a75bfba6..317cdd004 100644
--- a/src/RepairManager/config/email-config.yaml
+++ b/src/RepairManager/config/email-config.yaml
@@ -4,5 +4,4 @@ password: {{cnf['repair-manager']['alert']['password']}}
sender: {{cnf['repair-manager']['alert']['sender']}}
receiver: {{cnf['repair-manager']['alert']['receiver']}}
-# time to wait before resending email alert for exceptions (seconds)
-alert_wait_seconds: {{cnf['repair-manager']['alert']['alert_wait_seconds']}}
\ No newline at end of file
+alert_wait_seconds: 86400 # time to wait before resending email alert for exceptions (seconds)
\ No newline at end of file
diff --git a/src/RepairManager/config/rule-config.yaml b/src/RepairManager/config/rule-config.yaml
index 0912010a4..de7103df0 100644
--- a/src/RepairManager/config/rule-config.yaml
+++ b/src/RepairManager/config/rule-config.yaml
@@ -4,6 +4,10 @@ rules:
module_name : Rules.ecc_rule
class_name : ECCRule
ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0'
+ alert_job_owners: False # alert all impacted job owners
# time to sleep between rule execution (seconds)
rule_wait_time: 10
+
+cluster_name: {{cnf['repair-manager']['cluster_name']}}
+
diff --git a/src/RepairManager/main.py b/src/RepairManager/main.py
index 12ca560e8..46feb0c99 100644
--- a/src/RepairManager/main.py
+++ b/src/RepairManager/main.py
@@ -55,13 +55,13 @@ def Run():
except Exception as e:
logger.exception(f'Error executing {class_name} from module {module_name}\n')
- subject = f'Repair Manager Alert [Exception executing {class_name}]'
+ subject = f'Repair Manager Alert [Rule Error] [{class_name}] [{config["cluster_name"]}]'
body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))
alert.handle_email_alert(subject, body)
except Exception as e:
logger.exception('Repair manager has stopped due to an unhandled exception:')
- subject = '[Repair Manager Alert] Repair manager has stopped unexpectedly'
+ subject = f'Repair Manager Alert [Repair Manager Crashed] [{config["cluster_name"]}]'
body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))
alert.send(subject, body)
diff --git a/src/RepairManager/utils/email.py b/src/RepairManager/utils/email.py
index 59deece01..0cc1d6ed6 100644
--- a/src/RepairManager/utils/email.py
+++ b/src/RepairManager/utils/email.py
@@ -14,15 +14,25 @@ def load_config(self):
with open('./config/email-config.yaml', 'r') as file:
return yaml.safe_load(file)
- def send(self, subject, body):
- message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (self.config['sender'], self.config['receiver'], subject, body)
+ def send(self, subject, body, additional_recipients=None):
+ recepients = self.config['receiver']
+ if additional_recipients is not None:
+ recepients = recepients + additional_recipients
+
+ message = (
+ f"From: {self.config['sender']}\r\n"
+ f"To: {';'.join(recepients)}\r\n"
+ f"MIME-Version: 1.0\r\n"
+ f"Content-type: text/html\r\n"
+ f"Subject: {subject}\r\n\r\n{body}"
+ )
try:
with smtplib.SMTP(self.config['smtp_url']) as server:
server.starttls()
server.login(self.config['login'], self.config['password'])
- server.sendmail(self.config['sender'], self.config['receiver'], message)
- logging.info('Email Sent')
+ server.sendmail(self.config['sender'], recepients, message)
+ logging.info(f"Email sent to {', '.join(recepients)}")
except smtplib.SMTPAuthenticationError:
logging.warning('The server didn\'t accept the user\\password combination.')
except smtplib.SMTPServerDisconnected:
@@ -30,9 +40,8 @@ def send(self, subject, body):
except smtplib.SMTPException as e:
logging.exception('SMTP error occurred: ' + str(e))
- def handle_email_alert(self, subject, body):
-
+ def handle_email_alert(self, subject, body, additional_recipients=None):
# to avoid email spam, send email based on configured alert wait time
if body not in self.alert_cache:
- self.send(subject, body)
+ self.send(subject, body, additional_recipients)
self.alert_cache[body] = 1
\ No newline at end of file
diff --git a/src/RepairManager/utils/k8s_util.py b/src/RepairManager/utils/k8s_util.py
index 4c83e1658..9c796d394 100644
--- a/src/RepairManager/utils/k8s_util.py
+++ b/src/RepairManager/utils/k8s_util.py
@@ -1,22 +1,37 @@
-import os
-import logging
-from kubernetes import client, config
-
-def cordon_node(node_name):
- output = os.system('kubectl cordon %s' % node_name)
- return output
-
-
-def is_node_unschedulable(node_info, node_name):
- for node in node_info.items:
- for address in node.status.addresses:
- if address.type == 'Hostname' and address.address == node_name:
- return node.spec.unschedulable
-
- logging.warning(f"Could not find node with hostname {node_name}")
-
-
-def list_node():
- config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml')
- api_instance = client.CoreV1Api()
- return api_instance.list_node()
\ No newline at end of file
+import subprocess
+import logging
+from kubernetes import client, config
+
+def cordon_node(node_name, dry_run=True):
+ args = ['kubectl', 'cordon', node_name]
+
+ if dry_run:
+ args.append('--dry-run')
+
+ try:
+ output = subprocess.check_output(args, stderr=subprocess.STDOUT)
+ logging.info(output.decode())
+ return output.decode()
+ except subprocess.CalledProcessError as e:
+ logging.exception(f'Exception attempting to cordon node {node_name}')
+ return e.output.decode()
+
+
+def is_node_unschedulable(node_info, node_name):
+ for node in node_info.items:
+ for address in node.status.addresses:
+ if address.type == 'Hostname' and address.address == node_name:
+ return node.spec.unschedulable
+
+ logging.warning(f"Could not find node with hostname {node_name}")
+
+
+def list_node():
+ config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml')
+ api_instance = client.CoreV1Api()
+ return api_instance.list_node()
+
+def list_pod_for_all_namespaces():
+ config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml',)
+ api_instance = client.CoreV1Api()
+ return api_instance.list_pod_for_all_namespaces()
\ No newline at end of file
diff --git a/src/docker-images/RepairManager/Dockerfile b/src/docker-images/RepairManager/Dockerfile
index fcf6045fa..09d68d68a 100644
--- a/src/docker-images/RepairManager/Dockerfile
+++ b/src/docker-images/RepairManager/Dockerfile
@@ -9,6 +9,7 @@ RUN pip3 install wheel
RUN pip3 install setuptools
RUN pip3 install requests
RUN pip3 install cachetools
+RUN pip3 install tabulate
COPY kubectl /usr/local/bin/kubectl
RUN chmod +x /usr/local/bin/kubectl
diff --git a/src/docker-images/RepairManager/prebuild.sh b/src/docker-images/RepairManager/prebuild.sh
index b449bcf1d..3dd1aafd1 100644
--- a/src/docker-images/RepairManager/prebuild.sh
+++ b/src/docker-images/RepairManager/prebuild.sh
@@ -8,6 +8,7 @@ cp -r ../../../../RepairManager RepairManager
# Render config file for email credentials
cd ../../../
./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml
+./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml
cd ./deploy/docker-images/RepairManager/
cp -r ../../../../ClusterBootstrap/deploy/bin/kubectl kubectl