Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

repair manager - add more details to alert emails #709

Merged
merged 15 commits
Dec 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ src/WebUI/dotnet/WebPortal/hosting.json
**/storage.ide

src/RepairManager/.vs/*
src/RepairManager/mock-data/*
/.vs/slnx.sqlite
/.vs/ProjectSettings.json
/.vs/VSWorkspaceState.json
Expand Down
78 changes: 49 additions & 29 deletions src/RepairManager/Rules/ecc_rule.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from Rules.rules_abc import Rule
from kubernetes import client, config
from utils import k8s_util, email
from tabulate import tabulate
import requests
import json
import os
Expand All @@ -27,23 +28,42 @@ def get_node_address_info(node_info):

address_map[internal_ip] = hostname

logging.debug('node address map: %s ' % address_map)
logging.debug(f'node address map: {address_map}')

return address_map



def get_ECC_error_data(ecc_url):
    """Query Prometheus for volatile ECC error metrics.

    Args:
        ecc_url: full Prometheus query URL (presumably the
            `nvidiasmi_ecc_error_count` query from rule-config — confirm).

    Returns:
        The `data.result` metric list from the Prometheus JSON response,
        or None when the response is empty/falsy or any error occurs.
    """
    try:
        response = requests.get(ecc_url)
        if response:
            data = json.loads(response.text)
            if data:
                ecc_metrics = data['data']['result']
                logging.info('ECC error metrics from prometheus: ' + json.dumps(ecc_metrics))
                return ecc_metrics
        else:
            logging.warning(f'No response from {ecc_url} found.')
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; request and JSON-parse failures are still
        # logged and treated as "no data".
        logging.exception(f'Error retrieving data from {ecc_url}')
    # Explicit None instead of falling off the end (the old variant could
    # raise UnboundLocalError returning `ecc_metrics` after an exception).
    return None
def get_job_info_from_nodes(nodes):
    """Collect info on jobs currently scheduled on the given nodes.

    Args:
        nodes: collection of node names (membership-tested against
            `pod.spec.node_name`).

    Returns:
        dict mapping jobId -> {'userName', 'nodeName', 'vcName'} for every
        labeled job pod running on one of `nodes`. A job spanning several
        pods keeps the last pod's node/vc values.
    """
    pods = k8s_util.list_pod_for_all_namespaces()

    jobs = {}
    for pod in pods.items:
        # Guard clauses instead of three nested ifs.
        if not (pod.metadata and pod.metadata.labels):
            continue
        labels = pod.metadata.labels
        if 'jobId' not in labels or 'userName' not in labels:
            continue
        if pod.spec.node_name not in nodes:
            continue
        jobs[labels['jobId']] = {
            'userName': labels['userName'],
            'nodeName': pod.spec.node_name,
            # Unlike jobId/userName, vcName was read unguarded and could
            # raise KeyError on pods missing that label; .get() keeps the
            # job entry with vcName=None instead of crashing the rule.
            'vcName': labels.get('vcName'),
        }
    return jobs


class ECCRule(Rule):
Expand Down Expand Up @@ -79,25 +99,25 @@ def check_status(self):
return False

def take_action(self):
    """Dry-run cordon every node with ECC errors and email an HTML report.

    Builds two HTML tables — per-node cordon status and impacted jobs —
    and sends them via the alert handler. Job owners are CC'd only when
    `rules.ecc_rule.alert_job_owners` is enabled in config.
    """
    status = []
    for node_name in self.ecc_hostnames:
        # NOTE(review): cordon is invoked with dry_run=True, so nodes are
        # not actually marked unschedulable — confirm this is intended.
        output = k8s_util.cordon_node(node_name, dry_run=True)
        status.append([node_name, output])

    subject = f'Repair Manager Alert [ECC ERROR] [{self.config["cluster_name"]}]'
    # Fixed mismatched closing tag: header opened with <h3> but closed </h1>.
    body = f'<h3>Uncorrectable ECC Error found in {self.config["cluster_name"]} cluster on the following nodes:</h3>'
    body += tabulate(status, headers=['node name', 'action status'],
                     tablefmt="html").replace('<table>', '<table border="1">')

    body += '<h3>Impacted Jobs and Job Owners</h3>'
    job_owners = []
    job_info = []
    jobs = get_job_info_from_nodes(self.ecc_hostnames)
    for jobId in jobs:
        job_owners.append(jobs[jobId]['userName'] + '@microsoft.com')
        job_info.append([jobId, jobs[jobId]['userName'],
                         jobs[jobId]['nodeName'], jobs[jobId]['vcName']])
    body += tabulate(job_info, headers=['job id', 'job owner', 'node name', 'vc name'],
                     tablefmt="html").replace('<table>', '<table border="1">')

    if self.config['rules']['ecc_rule']['alert_job_owners']:
        self.alert.handle_email_alert(subject, body, additional_recipients=job_owners)
    else:
        self.alert.handle_email_alert(subject, body)
3 changes: 1 addition & 2 deletions src/RepairManager/config/email-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@ password: {{cnf['repair-manager']['alert']['password']}}
sender: {{cnf['repair-manager']['alert']['sender']}}
receiver: {{cnf['repair-manager']['alert']['receiver']}}

# time to wait before resending email alert for exceptions (seconds)
alert_wait_seconds: {{cnf['repair-manager']['alert']['alert_wait_seconds']}}
alert_wait_seconds: 86400 # time to wait before resending email alert for exceptions (seconds)
4 changes: 4 additions & 0 deletions src/RepairManager/config/rule-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ rules:
module_name : Rules.ecc_rule
class_name : ECCRule
ecc_error_url: '/api/v1/query?query=nvidiasmi_ecc_error_count%7Btype%3D%22volatile_double%22%7D%3E0'
alert_job_owners: False # alert all impacted job owners

# time to sleep between rule execution (seconds)
rule_wait_time: 10

cluster_name: {{cnf['repair-manager']['cluster_name']}}

4 changes: 2 additions & 2 deletions src/RepairManager/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ def Run():

except Exception as e:
logger.exception(f'Error executing {class_name} from module {module_name}\n')
subject = f'Repair Manager Alert [Exception executing {class_name}]'
subject = f'Repair Manager Alert [Rule Error] [{class_name}] [{config["cluster_name"]}]'
body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))
alert.handle_email_alert(subject, body)

except Exception as e:
logger.exception('Repair manager has stopped due to an unhandled exception:')
subject = '[Repair Manager Alert] Repair manager has stopped unexpectedly'
subject = f'Repair Manager Alert [Repair Manager Crashed] [{config["cluster_name"]}]'
body = ''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))
alert.send(subject, body)

Expand Down
23 changes: 16 additions & 7 deletions src/RepairManager/utils/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,34 @@ def load_config(self):
with open('./config/email-config.yaml', 'r') as file:
return yaml.safe_load(file)

def send(self, subject, body):
message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (self.config['sender'], self.config['receiver'], subject, body)
def send(self, subject, body, additional_recipients=None):
    """Send an HTML email via SMTP with STARTTLS.

    Args:
        subject: email subject line.
        body: HTML body.
        additional_recipients: optional list of extra addresses appended
            to the configured receiver list.

    SMTP auth/disconnect/protocol errors are logged and swallowed
    (best-effort alerting); other connection errors propagate as before.
    """
    # Fixed local-name typo ("recepients").
    recipients = self.config['receiver']
    if additional_recipients is not None:
        recipients = recipients + additional_recipients

    message = (
        f"From: {self.config['sender']}\r\n"
        # RFC 5322 address lists are comma-separated; the previous
        # ';' join produces a malformed To: header for some MTAs.
        f"To: {', '.join(recipients)}\r\n"
        f"MIME-Version: 1.0\r\n"
        f"Content-type: text/html\r\n"
        f"Subject: {subject}\r\n\r\n{body}"
    )

    try:
        with smtplib.SMTP(self.config['smtp_url']) as server:
            server.starttls()
            server.login(self.config['login'], self.config['password'])
            server.sendmail(self.config['sender'], recipients, message)
            logging.info(f"Email sent to {', '.join(recipients)}")
    except smtplib.SMTPAuthenticationError:
        logging.warning('The server didn\'t accept the user\\password combination.')
    except smtplib.SMTPServerDisconnected:
        logging.warning('Server unexpectedly disconnected')
    except smtplib.SMTPException as e:
        logging.exception('SMTP error occurred: ' + str(e))

def handle_email_alert(self, subject, body):

def handle_email_alert(self, subject, body, additional_recipients=None):
    """Send an alert unless an identical body was alerted recently.

    Dedupes on the exact body text; presumably `alert_cache` entries
    expire after the configured alert wait time (cache type not visible
    here — confirm).
    """
    if body in self.alert_cache:
        # Identical alert already sent; suppress to avoid email spam.
        return
    self.send(subject, body, additional_recipients)
    self.alert_cache[body] = 1
59 changes: 37 additions & 22 deletions src/RepairManager/utils/k8s_util.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
import logging
import os
import subprocess

from kubernetes import client, config

def cordon_node(node_name):
    """Mark `node_name` unschedulable via `kubectl cordon`.

    Returns the command's exit status; callers treat nonzero as failure,
    matching the previous os.system contract.
    """
    # Argument-list subprocess call instead of os.system('kubectl cordon %s'):
    # no shell, so a hostile/odd node name cannot be shell-interpolated.
    return subprocess.call(['kubectl', 'cordon', node_name])


def is_node_unschedulable(node_info, node_name):
    """Look up a node by its Hostname address and return its
    `spec.unschedulable` flag.

    Returns None (falsy) and logs a warning when no node matches.
    """
    for node in node_info.items:
        hostnames = (a.address for a in node.status.addresses
                     if a.type == 'Hostname')
        if node_name in hostnames:
            return node.spec.unschedulable

    logging.warning(f"Could not find node with hostname {node_name}")


def list_node():
    """Return the cluster node list from the Kubernetes API.

    Loads credentials from the restapi kubeconfig on every call.
    """
    config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml')
    api_instance = client.CoreV1Api()
    return api_instance.list_node()
import subprocess
import logging
from kubernetes import client, config

def cordon_node(node_name, dry_run=True):
    """Run `kubectl cordon` against `node_name` and return kubectl's
    combined stdout/stderr as text.

    With dry_run=True (the default) `--dry-run` is appended, so the node
    is not actually modified. On a nonzero exit the failure is logged and
    kubectl's output is still returned.
    """
    cmd = ['kubectl', 'cordon', node_name]
    if dry_run:
        cmd.append('--dry-run')

    try:
        result = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as err:
        logging.exception(f'Exception attempting to cordon node {node_name}')
        return err.output.decode()

    logging.info(result)
    return result


def is_node_unschedulable(node_info, node_name):
    """Return whether the node matching `node_name` (by Hostname address)
    is marked unschedulable.

    Args:
        node_info: node list object with `.items` (as from list_node()).
        node_name: hostname to look up.

    Returns:
        The node's `spec.unschedulable` flag, or False (with a warning
        logged) when no node matches.
    """
    for node in node_info.items:
        for address in node.status.addresses:
            if address.type == 'Hostname' and address.address == node_name:
                return node.spec.unschedulable

    logging.warning(f"Could not find node with hostname {node_name}")
    # Explicit False instead of an implicit None return: same truthiness
    # for existing callers, but an unambiguous boolean contract.
    return False


def list_node():
    """Return the cluster node list from the Kubernetes API.

    Loads credentials from the restapi kubeconfig on every call.
    """
    config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml')
    api_instance = client.CoreV1Api()
    return api_instance.list_node()

def list_pod_for_all_namespaces():
    """Return all pods across every namespace from the Kubernetes API.

    Loads credentials from the restapi kubeconfig on every call.
    """
    # Note: stray trailing comma in the call below is harmless but odd.
    config.load_kube_config(config_file='/etc/kubernetes/restapi-kubeconfig.yaml',)
    api_instance = client.CoreV1Api()
    return api_instance.list_pod_for_all_namespaces()
1 change: 1 addition & 0 deletions src/docker-images/RepairManager/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ RUN pip3 install wheel
RUN pip3 install setuptools
RUN pip3 install requests
RUN pip3 install cachetools
RUN pip3 install tabulate

COPY kubectl /usr/local/bin/kubectl
RUN chmod +x /usr/local/bin/kubectl
Expand Down
1 change: 1 addition & 0 deletions src/docker-images/RepairManager/prebuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ cp -r ../../../../RepairManager RepairManager
# Render config file for email credentials
cd ../../../
./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/email-config.yaml
./deploy.py rendertemplate ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml ./deploy/docker-images/RepairManager/RepairManager/config/rule-config.yaml
cd ./deploy/docker-images/RepairManager/

cp -r ../../../../ClusterBootstrap/deploy/bin/kubectl kubectl