From 4327cf121b4bd051e9db31a74b3898970ae1a19b Mon Sep 17 00:00:00 2001 From: Stadicus Date: Thu, 5 Dec 2019 13:45:45 +0100 Subject: [PATCH 1/2] Prometheus: log public network availability https://github.com/shiftdevices/bitbox-base-internal/issues/370 Prometheus should log the availability of public internet without leaking privacy information. This pull request queries an external host with a lot of general traffic, Cloudflare, over Tor: ``` curl --socks5-hostname localhost:9050 1.1.1.1 ``` If Tor is not active, it's impossible to ping an external host without revealing one's own IP address, but the solution can just "mingle in the crowd", e.g. by `ping`ing Cloudflare, which should not be suspicious. ``` ping -c 1 1.1.1.1 ``` These queries are run regularly from `prometheus-base.py`, so that the result is collected by Prometheus and stored in its time-series database. This helps with analyzing/debugging incidents, as the public network availability can also be queried after the fact. This commit: * provides the new Prometheus metric 'base_internet_connectivity' that is 0 when OK, or an error code when NOT OK --- armbian/base/scripts/prometheus-base.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/armbian/base/scripts/prometheus-base.py b/armbian/base/scripts/prometheus-base.py index ba33dc35..446518be 100755 --- a/armbian/base/scripts/prometheus-base.py +++ b/armbian/base/scripts/prometheus-base.py @@ -27,6 +27,7 @@ BASE_SYSTEMD_LIGHTNINGD = Gauge("base_systemd_lightningd", "Systemd unit status for c-lightning") BASE_SYSTEMD_PROMETHEUS = Gauge("base_systemd_prometheus", "Systemd unit status for Prometheus") BASE_SYSTEMD_GRAFANA = Gauge("base_systemd_grafana", "Systemd unit status for Grafana") +BASE_INTERNET_CONNECTIVITY = Gauge("base_internet_connectivity", "Connectivity to public internet") r = redis.Redis( host='127.0.0.1', @@ -80,6 +81,27 @@ def getSystemdStatus(unit): print(unit, e.returncode, e.output) return e.returncode +def 
getInternetConnectivity(): + torEnabled = int(r.get('tor:base:enabled').decode("utf-8")) + + try: + if torEnabled == 1: + print("Tor ok") + subprocess.check_output(["curl", "--socks5-hostname", "localhost:9050", "1.1.1.1"], shell=False, timeout=5, stderr=subprocess.STDOUT) + else: + print("Tor not ok") + subprocess.check_output(["ping", "-c", "1", "1.1.1.1"], shell=False, timeout=5, stderr=subprocess.STDOUT) + + return 0 + + except subprocess.TimeoutExpired as e: + print("getInternetConnectivity(): subprocess.TimeoutExpired; torEnabled", torEnabled) + return 1 + + except subprocess.CalledProcessError as e: + print("getInternetConnectivity(): subprocess.CalledProcessError (", e.returncode, "); torEnabled", torEnabled, e.output) + return e.returncode + def main(): # Start up the server to expose the metrics. start_http_server(8400) @@ -91,6 +113,7 @@ def main(): BASE_SYSTEMD_LIGHTNINGD.set(int(getSystemdStatus("lightningd"))) BASE_SYSTEMD_PROMETHEUS.set(int(getSystemdStatus("prometheus"))) BASE_SYSTEMD_GRAFANA.set(int(getSystemdStatus("grafana-server"))) + BASE_INTERNET_CONNECTIVITY.set(int(getInternetConnectivity())) try: BASE_CPU_TEMP.set(readFile("/sys/class/thermal/thermal_zone0/temp")) From f64a68b121df419d17af090a5fba440940a39bfd Mon Sep 17 00:00:00 2001 From: Stadicus Date: Sat, 7 Dec 2019 15:44:07 +0100 Subject: [PATCH 2/2] prometheus: use 1 as 'success' value, not 0 related to: https://github.com/digitalbitbox/bitbox-base/pull/301#discussion_r354780098 Because: * The Base Prometheus scraper used the `systemctl` error code to indicate if a service is active: `0` == OK; otherwise NOT OK * This is not intuitive, especially when extending the monitoring to other statuses, like the network connectivity in this pull request. * It's better to use `1` as OK and `0` as NOT OK, which also makes visualization in Grafana easier. 
This commit: * changes logged values for network connectivity, `1` meaning OK * changes logged values for systemd units, `1` meaning OK * adjusts Grafana dashboard to use new values * extends Grafana dashboard with diskspace usage over time (long overdue) --- .../dashboards/grafana_bitbox_base.json | 448 ++++++++++++------ armbian/base/scripts/prometheus-base.py | 13 +- 2 files changed, 317 insertions(+), 144 deletions(-) diff --git a/armbian/base/rootfs/etc/grafana/dashboards/grafana_bitbox_base.json b/armbian/base/rootfs/etc/grafana/dashboards/grafana_bitbox_base.json index 9dfb3d94..f6fe1d5e 100644 --- a/armbian/base/rootfs/etc/grafana/dashboards/grafana_bitbox_base.json +++ b/armbian/base/rootfs/etc/grafana/dashboards/grafana_bitbox_base.json @@ -17,7 +17,7 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "iteration": 1568818067203, + "iteration": 1575728381645, "links": [], "panels": [ { @@ -38,9 +38,9 @@ "colorBackground": true, "colorValue": false, "colors": [ - "#76997f", + "#e63963", "#cc914e", - "#e63963" + "#76997f" ], "format": "none", "gauge": { @@ -99,7 +99,7 @@ "refId": "A" } ], - "thresholds": "1,2", + "thresholds": "0.5,1", "timeFrom": null, "timeShift": null, "title": "Bitcoin", @@ -109,12 +109,12 @@ { "op": "=", "text": "✓", - "value": "0" + "value": "1" }, { "op": "=", "text": "down", - "value": "3" + "value": "0" } ], "valueName": "current" @@ -124,9 +124,9 @@ "colorBackground": true, "colorValue": false, "colors": [ - "#74997e", + "#e63963", "#cc914e", - "#e63963" + "#74997e" ], "format": "none", "gauge": { @@ -185,7 +185,7 @@ "refId": "A" } ], - "thresholds": "1,2", + "thresholds": "0.5,1", "timeFrom": null, "timeShift": null, "title": "Electrum", @@ -195,12 +195,12 @@ { "op": "=", "text": "✓", - "value": "0" + "value": "1" }, { "op": "=", "text": "down", - "value": "3" + "value": "0" } ], "valueName": "current" @@ -210,9 +210,9 @@ "colorBackground": true, "colorValue": false, "colors": [ - "#74997e", + "#e63963", "#cc914e", - 
"#e63963" + "#74997e" ], "format": "none", "gauge": { @@ -271,7 +271,7 @@ "refId": "A" } ], - "thresholds": "1,2", + "thresholds": "0.5,1", "timeFrom": null, "timeShift": null, "title": "Lightning", @@ -281,12 +281,12 @@ { "op": "=", "text": "✓", - "value": "0" + "value": "1" }, { "op": "=", "text": "down", - "value": "3" + "value": "0" } ], "valueName": "current" @@ -1356,6 +1356,7 @@ "bars": false, "dashLength": 10, "dashes": false, + "description": "", "fill": 2, "gridPos": { "h": 8, @@ -1364,6 +1365,7 @@ "y": 14 }, "id": 52, + "interval": "", "legend": { "avg": false, "current": false, @@ -1387,27 +1389,20 @@ "steppedLine": false, "targets": [ { - "expr": "node_filesystem_size_bytes{instance=~\"$node:$port\",job=~\"$job\",fstype=\"tmpfs\"} - node_filesystem_avail_bytes{instance=~\"$node:$port\",job=~\"$job\",fstype=\"tmpfs\"}", + "expr": "node_filesystem_size_bytes{instance=~\"$node:$port\",job=~\"$job\",fstype!=\"tmpfs\"}-node_filesystem_avail_bytes{instance=~\"$node:$port\",job=~\"$job\",fstype!=\"tmpfs\"}", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 4, "legendFormat": "{{mountpoint}}", "refId": "A" - }, - { - "expr": "node_filesystem_free_bytes", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Size RAM-based TMPFS", + "title": "Diskspace used", "tooltip": { "shared": true, "sort": 0, @@ -1449,14 +1444,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "fill": 2, + "description": "", + "fill": 0, "gridPos": { "h": 8, - "w": 11, + "w": 10, "x": 13, "y": 14 }, - "id": 97, + "id": 101, + "interval": "", "legend": { "avg": false, "current": false, @@ -1474,41 +1471,26 @@ "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*Read.*/", - "color": "#73BF69" - }, - { - "alias": "/.*Write.*/", - "color": "#FF9830", - "transform": "negative-Y" - } - ], + 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "irate(node_disk_reads_completed_total{instance=~\"$node:$port\",job=~\"$job\",device=~\"mmcblk1|nvme.*|sda.*\"}[5m])", + "expr": "1 - (node_filesystem_avail_bytes{instance=~\"$node:$port\",job=~\"$job\",fstype!=\"tmpfs\"}) / node_filesystem_size_bytes{instance=~\"$node:$port\",job=~\"$job\",fstype!=\"tmpfs\"}", "format": "time_series", + "hide": false, + "interval": "", "intervalFactor": 4, - "legendFormat": "{{device}} - Reads completed", + "legendFormat": "{{mountpoint}}", "refId": "A" - }, - { - "expr": "irate(node_disk_writes_completed_total{instance=~\"$node:$port\",job=~\"$job\",device=~\"mmcblk1|nvme.*|sda.*\"}[5m]) ", - "format": "time_series", - "intervalFactor": 4, - "legendFormat": "{{device}} - Writes completed", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Disk IOps", + "title": "Diskspace used %", "tooltip": { "shared": true, "sort": 0, @@ -1524,11 +1506,11 @@ }, "yaxes": [ { - "format": "iops", + "format": "percentunit", "label": "IO write (-) / read (+)", "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -1537,7 +1519,7 @@ "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ], "yaxis": { @@ -1713,56 +1695,24 @@ } }, { - "aliasColors": { - "Recv_bytes_eth2": "#7EB26D", - "Recv_bytes_lo": "#0A50A1", - "Recv_drop_eth2": "#6ED0E0", - "Recv_drop_lo": "#E0F9D7", - "Recv_errs_eth2": "#BF1B00", - "Recv_errs_lo": "#CCA300", - "Trans_bytes_eth2": "#7EB26D", - "Trans_bytes_lo": "#0A50A1", - "Trans_drop_eth2": "#6ED0E0", - "Trans_drop_lo": "#E0F9D7", - "Trans_errs_eth2": "#BF1B00", - "Trans_errs_lo": "#CCA300", - "recv_bytes_lo": "#0A50A1", - "recv_drop_eth0": "#99440A", - "recv_drop_lo": "#967302", - "recv_errs_eth0": "#BF1B00", - "recv_errs_lo": "#890F02", - "trans_bytes_eth0": "#7EB26D", - "trans_bytes_lo": "#0A50A1", - "trans_drop_eth0": 
"#99440A", - "trans_drop_lo": "#967302", - "trans_errs_eth0": "#BF1B00", - "trans_errs_lo": "#890F02" - }, + "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", - "description": "", - "fill": 4, + "fill": 2, "gridPos": { "h": 8, - "w": 11, + "w": 10, "x": 13, "y": 22 }, - "id": 32, + "id": 100, "legend": { - "alignAsTable": false, "avg": false, "current": false, - "hideEmpty": false, - "hideZero": false, "max": false, "min": false, - "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": false }, @@ -1771,47 +1721,36 @@ "links": [], "nullPointMode": "null", "percentage": false, - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*send.*/", - "color": "#cc914e", - "transform": "negative-Y" - }, - { - "alias": "/.*recv.*/", - "color": "#74997e" - } - ], + "seriesOverrides": [], "spaceLength": 10, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes_total{instance=~\"$node:$port\",job=~\"$job\",device!=\"lo\"}[5m])", + "expr": "node_filesystem_size_bytes{instance=~\"$node:$port\",job=~\"$job\",fstype=\"tmpfs\"} - node_filesystem_avail_bytes{instance=~\"$node:$port\",job=~\"$job\",fstype=\"tmpfs\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "recv {{device}}", - "refId": "A", - "step": 240 + "hide": false, + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{mountpoint}}", + "refId": "A" }, { - "expr": "rate(node_network_transmit_bytes_total{instance=~\"$node:$port\",job=~\"$job\",device!=\"lo\"}[5m])", + "expr": "node_filesystem_free_bytes", "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "send {{device}} ", - "refId": "B", - "step": 240 + "hide": true, + "intervalFactor": 1, + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": 
"Network Traffic", + "title": "Size RAM-based TMPFS", "tooltip": { "shared": true, "sort": 0, @@ -1828,19 +1767,19 @@ "yaxes": [ { "format": "bytes", - "label": "send (-) / receive (+)", + "label": "IO write (-) / read (+)", "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { - "format": "pps", - "label": "", + "format": "short", + "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { @@ -1984,6 +1923,243 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 2, + "gridPos": { + "h": 8, + "w": 11, + "x": 13, + "y": 30 + }, + "id": 97, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "color": "#73BF69" + }, + { + "alias": "/.*Write.*/", + "color": "#FF9830", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_completed_total{instance=~\"$node:$port\",job=~\"$job\",device=~\"mmcblk1|nvme.*|sda.*\"}[5m])", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A" + }, + { + "expr": "irate(node_disk_writes_completed_total{instance=~\"$node:$port\",job=~\"$job\",device=~\"mmcblk1|nvme.*|sda.*\"}[5m]) ", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "{{device}} - Writes completed", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "iops", + "label": "IO write (-) / read (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Recv_bytes_eth2": "#7EB26D", + "Recv_bytes_lo": "#0A50A1", + "Recv_drop_eth2": "#6ED0E0", + "Recv_drop_lo": "#E0F9D7", + "Recv_errs_eth2": "#BF1B00", + "Recv_errs_lo": "#CCA300", + "Trans_bytes_eth2": "#7EB26D", + "Trans_bytes_lo": "#0A50A1", + "Trans_drop_eth2": "#6ED0E0", + "Trans_drop_lo": "#E0F9D7", + "Trans_errs_eth2": "#BF1B00", + "Trans_errs_lo": "#CCA300", + "recv_bytes_lo": "#0A50A1", + "recv_drop_eth0": "#99440A", + "recv_drop_lo": "#967302", + "recv_errs_eth0": "#BF1B00", + "recv_errs_lo": "#890F02", + "trans_bytes_eth0": "#7EB26D", + "trans_bytes_lo": "#0A50A1", + "trans_drop_eth0": "#99440A", + "trans_drop_lo": "#967302", + "trans_errs_eth0": "#BF1B00", + "trans_errs_lo": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "description": "", + "fill": 4, + "gridPos": { + "h": 8, + "w": 11, + "x": 3, + "y": 38 + }, + "id": 32, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*send.*/", + "color": "#cc914e", + "transform": "negative-Y" + }, + { + "alias": "/.*recv.*/", + "color": "#74997e" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"rate(node_network_receive_bytes_total{instance=~\"$node:$port\",job=~\"$job\",device!=\"lo\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "recv {{device}}", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_network_transmit_bytes_total{instance=~\"$node:$port\",job=~\"$job\",device!=\"lo\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "send {{device}} ", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "send (-) / receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "pps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "title": "System details", @@ -4217,8 +4393,8 @@ { "allValue": null, "current": { - "text": "ORANGESCAN", - "value": "ORANGESCAN" + "text": "SLIMYCALENDAR", + "value": "SLIMYCALENDAR" }, "datasource": "Prometheus", "definition": "label_values(lightning_node_info, alias)", @@ -4242,8 +4418,8 @@ { "allValue": null, "current": { - "text": "03cf3fe6592961579e5408a80290b22682b8d8915e7f58705d151b72134f9d507b", - "value": "03cf3fe6592961579e5408a80290b22682b8d8915e7f58705d151b72134f9d507b" + "text": "03517e54ba17e663be6bfc3476411afb7ab8e502f70c55c3003fdf6b8787ec348f", + "value": "03517e54ba17e663be6bfc3476411afb7ab8e502f70c55c3003fdf6b8787ec348f" }, "datasource": "Prometheus", "definition": "label_values(lightning_node_info, id)", @@ -4292,8 +4468,8 @@ { "allValue": null, "current": { - "text": "v0.7.2.1", - "value": "v0.7.2.1" + "text": "v0.7.3", + "value": "v0.7.3" }, "datasource": 
"Prometheus", "definition": "label_values(lightning_node_info, version)", @@ -4317,8 +4493,8 @@ { "allValue": null, "current": { - "text": "cleopatra", - "value": "cleopatra" + "text": "bitbox-base", + "value": "bitbox-base" }, "datasource": "Prometheus", "definition": "label_values(base_system_info, base_hostname)", @@ -4342,8 +4518,8 @@ { "allValue": null, "current": { - "text": "2019-09-08", - "value": "2019-09-08" + "text": "2019-12-05", + "value": "2019-12-05" }, "datasource": "Prometheus", "definition": "label_values(base_system_info, build_date)", @@ -4367,8 +4543,8 @@ { "allValue": null, "current": { - "text": "13:02", - "value": "13:02" + "text": "09:19", + "value": "09:19" }, "datasource": "Prometheus", "definition": "label_values(base_system_info, build_time)", @@ -4392,8 +4568,8 @@ { "allValue": null, "current": { - "text": "de28372", - "value": "de28372" + "text": "93b0917", + "value": "93b0917" }, "datasource": "Prometheus", "definition": "label_values(base_system_info, build_commit)", @@ -4417,8 +4593,8 @@ { "allValue": null, "current": { - "text": "0.0.3", - "value": "0.0.3" + "text": "0.1.0", + "value": "0.1.0" }, "datasource": "Prometheus", "definition": "label_values(base_system_info, base_version)", @@ -4469,5 +4645,5 @@ "timezone": "", "title": "BitBoxBase", "uid": "BitBoxBase", - "version": 25 -} \ No newline at end of file + "version": 26 +} diff --git a/armbian/base/scripts/prometheus-base.py b/armbian/base/scripts/prometheus-base.py index 446518be..4f0c3ce3 100755 --- a/armbian/base/scripts/prometheus-base.py +++ b/armbian/base/scripts/prometheus-base.py @@ -76,31 +76,28 @@ def getSystemInfo(): def getSystemdStatus(unit): try: subprocess.check_output(["systemctl", "is-active", unit]) - return 0 + return 1 except subprocess.CalledProcessError as e: print(unit, e.returncode, e.output) - return e.returncode + return 0 def getInternetConnectivity(): torEnabled = int(r.get('tor:base:enabled').decode("utf-8")) - try: if torEnabled == 1: - 
print("Tor ok") subprocess.check_output(["curl", "--socks5-hostname", "localhost:9050", "1.1.1.1"], shell=False, timeout=5, stderr=subprocess.STDOUT) else: - print("Tor not ok") subprocess.check_output(["ping", "-c", "1", "1.1.1.1"], shell=False, timeout=5, stderr=subprocess.STDOUT) - return 0 + return 1 except subprocess.TimeoutExpired as e: print("getInternetConnectivity(): subprocess.TimeoutExpired; torEnabled", torEnabled) - return 1 + return 0 except subprocess.CalledProcessError as e: print("getInternetConnectivity(): subprocess.CalledProcessError (", e.returncode, "); torEnabled", torEnabled, e.output) - return e.returncode + return 0 def main(): # Start up the server to expose the metrics.