Skip to content

Commit

Permalink
feat(incident): Take screenshot of relevant instances
Browse files Browse the repository at this point in the history
  • Loading branch information
balamurali27 committed Jan 24, 2025
1 parent 3136948 commit 665ff08
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 30 deletions.
58 changes: 37 additions & 21 deletions press/press/doctype/incident/incident.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,14 @@
from base64 import b64encode
from datetime import timedelta
from functools import cached_property
from time import sleep
from typing import TYPE_CHECKING

import frappe
from frappe.types.DF import Phone
from frappe.utils import cint
from frappe.utils.background_jobs import enqueue_doc
from frappe.website.website_generator import WebsiteGenerator
from playwright.sync_api import sync_playwright
from playwright.sync_api import Page, sync_playwright
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
from tenacity.retry import retry_if_not_result
from twilio.base.exceptions import TwilioRestException
Expand All @@ -33,6 +32,7 @@
from press.press.doctype.incident_settings_user.incident_settings_user import (
IncidentSettingsUser,
)
from press.press.doctype.monitor_server.monitor_server import MonitorServer
from press.press.doctype.press_settings.press_settings import PressSettings

INCIDENT_ALERT = "Sites Down" # TODO: make it a field or child table somewhere #
Expand Down Expand Up @@ -258,33 +258,49 @@ def identify_problem(self):

# TODO: categorize proxy issues #

def take_grafana_screenshot(self):
press_settings: PressSettings = frappe.get_cached_doc("Press Settings")
if not (monitor_url := press_settings.monitor_server):
@property
def other_resource(self):
    """Return the name of the server paired with this incident's resource.

    For a "Database Server" resource this is ``self.server``; for a
    "Server" resource it is the ``database_server`` value stored on that
    Server record.

    Returns:
        The paired server's name as a string, or ``None`` when the resource
        type is neither of the above or no paired server is recorded.
    """
    if self.resource_type == "Database Server":
        paired = self.server
    elif self.resource_type == "Server":
        paired = frappe.db.get_value("Server", self.resource, "database_server")
    else:
        return None
    # str(None) is the truthy string "None"; returning it would let a
    # missing value pass for a real server name in truthiness checks.
    return str(paired) if paired else None

Check warning on line 267 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L263-L267

Added lines #L263 - L267 were not covered by tests

def add_node_exporter_screenshot(self, page: Page, instance: str | None):
if not instance:
return

Check warning on line 271 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L270-L271

Added lines #L270 - L271 were not covered by tests
monitor_server = frappe.get_cached_doc("Monitor Server", monitor_url)

grafana_username = str(monitor_server.grafana_username)
grafana_password = str(monitor_server.get_password("grafana_password"))
page.goto(

Check warning on line 273 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L273

Added line #L273 was not covered by tests
f"https://{self.monitor_server.name}{self.monitor_server.node_exporter_dashboard_path}&refresh=5m&var-DS_PROMETHEUS=Prometheus&var-job=node&var-node={instance}&from=now-1h&to=now"
)
page.wait_for_load_state("networkidle")

Check warning on line 276 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L276

Added line #L276 was not covered by tests

def get_basic_auth_header(username, password):
token = b64encode(f"{username}:{password}".encode()).decode("ascii")
return f"Basic {token}"
image = b64encode(page.screenshot()).decode("ascii")
self.add_description(f'<img src="data:image/png;base64,{image}" alt="grafana-image">')

Check warning on line 279 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L278-L279

Added lines #L278 - L279 were not covered by tests

@cached_property
def monitor_server(self) -> MonitorServer:
    """Return the Monitor Server document named in Press Settings.

    The lookup runs on first access only; ``cached_property`` stores the
    result on the instance afterwards.

    Raises:
        Whatever ``frappe.throw`` raises when Press Settings has no
        monitor server configured.
    """
    settings: PressSettings = frappe.get_cached_doc("Press Settings")
    monitor_name = settings.monitor_server
    if not monitor_name:
        frappe.throw("Monitor Server not set in Press Settings")
    return frappe.get_cached_doc("Monitor Server", monitor_name)

Check warning on line 286 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L283-L286

Added lines #L283 - L286 were not covered by tests

def get_grafana_auth_header(self):
    """Build the HTTP Basic ``Authorization`` header value for Grafana.

    Reads ``grafana_username`` and the stored ``grafana_password`` from
    ``self.monitor_server``, joins them as ``username:password`` and
    base64-encodes the pair.

    Returns:
        A string of the form ``"Basic <base64 credentials>"``.
    """
    server = self.monitor_server
    credentials = ":".join(
        (
            str(server.grafana_username),
            str(server.get_password("grafana_password")),
        )
    )
    encoded = b64encode(credentials.encode()).decode("ascii")
    return "Basic " + encoded

Check warning on line 292 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L289-L292

Added lines #L289 - L292 were not covered by tests

def take_grafana_screenshot(self):
with sync_playwright() as p:
browser = p.chromium.launch(headless=False)
page = browser.new_page()
page.set_extra_http_headers(
{"Authorization": get_basic_auth_header(grafana_username, grafana_password)}
)
page.goto(f"https://{monitor_url}/grafana/d/abc/node-exporter?orgId=1&refresh=5m")
sleep(5)
image = b64encode(page.screenshot()).decode("ascii")
self.add_description(f'<img src="data:image/png;base64,{image}" alt="grafana-image">')
self.save()
page.set_extra_http_headers({"Authorization": self.get_grafana_auth_header()})

Check warning on line 298 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L295-L298

Added lines #L295 - L298 were not covered by tests

# TODO: grafana path as field somewhere
# TODO: input server name
self.add_node_exporter_screenshot(page, self.resource)
self.add_node_exporter_screenshot(page, self.other_resource)

Check warning on line 301 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L300-L301

Added lines #L300 - L301 were not covered by tests

self.save()

Check warning on line 303 in press/press/doctype/incident/incident.py

View check run for this annotation

Codecov / codecov/patch

press/press/doctype/incident/incident.py#L303

Added line #L303 was not covered by tests

@frappe.whitelist()
def ignore_for_server(self):
Expand Down
8 changes: 1 addition & 7 deletions press/press/doctype/incident_settings/incident_settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
"field_order": [
"users",
"self_hosted_users",
"grafana_dashboard_url",
"section_break_rnxb",
"enable_incident_detection",
"phone_call_alerts",
Expand Down Expand Up @@ -101,17 +100,12 @@
"fieldname": "email_alerts",
"fieldtype": "Check",
"label": "Email Alerts"
},
{
"fieldname": "grafana_dashboard_url",
"fieldtype": "Data",
"label": "Grafana Dashboard URL"
}
],
"index_web_pages_for_search": 1,
"issingle": 1,
"links": [],
"modified": "2025-01-23 20:09:29.434754",
"modified": "2025-01-24 00:34:20.356424",
"modified_by": "Administrator",
"module": "Press",
"name": "Incident Settings",
Expand Down
1 change: 0 additions & 1 deletion press/press/doctype/incident_settings/incident_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ class IncidentSettings(Document):
confirmation_threshold_night: DF.Duration | None
email_alerts: DF.Check
enable_incident_detection: DF.Check
grafana_dashboard_url: DF.Data | None
phone_call_alerts: DF.Check
self_hosted_users: DF.Table[IncidentSettingsSelfHostedUser]
users: DF.Table[IncidentSettingsUser]
Expand Down
16 changes: 15 additions & 1 deletion press/press/doctype/monitor_server/monitor_server.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
"agent_password",
"grafana_section",
"grafana_username",
"default_server",
"column_break_ilpd",
"grafana_password",
"node_exporter_dashboard_path",
"ssh_section",
"frappe_user_password",
"frappe_public_key",
Expand Down Expand Up @@ -215,6 +217,18 @@
{
"fieldname": "column_break_ilpd",
"fieldtype": "Column Break"
},
{
"description": "This is the server that is selected by default in node exporter dashboard due to sorting. Used by incident detection to collect data.",
"fieldname": "default_server",
"fieldtype": "Data",
"label": "Default Server"
},
{
"description": "Begin with / but don't end with /",
"fieldname": "node_exporter_dashboard_path",
"fieldtype": "Data",
"label": "Node Exporter Dashboard Path"
}
],
"links": [
Expand All @@ -223,7 +237,7 @@
"link_fieldname": "server"
}
],
"modified": "2025-01-23 20:05:36.835000",
"modified": "2025-01-24 00:35:26.093683",
"modified_by": "Administrator",
"module": "Press",
"name": "Monitor Server",
Expand Down
2 changes: 2 additions & 0 deletions press/press/doctype/monitor_server/monitor_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class MonitorServer(BaseServer):

agent_password: DF.Password | None
cluster: DF.Link | None
default_server: DF.Data | None
domain: DF.Link | None
frappe_public_key: DF.Code | None
frappe_user_password: DF.Password | None
Expand All @@ -31,6 +32,7 @@ class MonitorServer(BaseServer):
ip: DF.Data | None
is_server_setup: DF.Check
monitoring_password: DF.Password | None
node_exporter_dashboard_path: DF.Data | None
private_ip: DF.Data
private_mac_address: DF.Data | None
private_vlan_id: DF.Data | None
Expand Down

0 comments on commit 665ff08

Please sign in to comment.