From e6eb3bd5c969bdf6e79989b0c5960c965d2b0ad6 Mon Sep 17 00:00:00 2001 From: Nico Flaig Date: Thu, 16 Feb 2023 17:56:17 +0100 Subject: [PATCH] Support client monitoring by a remote service (e.g. beaconcha.in) (#5037) * Initial client monitoring implementation * Add monitoring to beacon node * Add monitoring to validator * Ensure that monitoring endpoint is a valid URL * Improve validation of monitoring endpoint * Improve error handling and timeout of remote server request * Wait for pending request before sending next one * Update request error handling * Update monitoring endpoint parsing * Export monitoring package * Fix process cpu seconds total metric Use "process_cpu_user_seconds_total" instead of "process_cpu_seconds_total" as it properly reports the CPU usage of the process and more importantly also triggers the collect method of the metric which ensures that always the latest value is shown. * Add option to collect system stats * Define system stats * Improve logs when monitoring service is started * Use the term "remote service" instead of "remote server" * Move Client type to service * Add monitoring args to beacon node test * Update description of monitoring cli args * Update monitoring service * Add metrics for collecting and sending data * Add monitoring panels to VM + host dashboard * Update send data metric buckets * Print out machine when starting monitoring service * Refactoring * Add unit tests * Add metric values for sync_eth1_connected and sync_eth1_fallback_configured * Use setTimeout instead of sleep for initial delay * Use milliseconds instead of seconds for time values * Add description to client stats properties * Remove sinon spies after tests are finished * Document client monitoring usage * Add enum to check status of monitoring service * Reduce info log when monitoring service is started Only print out information which is configurable by non-hidden CLI options and properly documented. --- dashboards/lodestar_vm_host.json | 198 +++++++++++- docs/usage/client-monitoring.md | 49 +++ mkdocs.yml | 1 + packages/beacon-node/package.json | 3 + packages/beacon-node/src/index.ts | 3 + .../beacon-node/src/monitoring/clientStats.ts | 297 ++++++++++++++++++ packages/beacon-node/src/monitoring/index.ts | 2 + .../beacon-node/src/monitoring/options.ts | 20 ++ .../beacon-node/src/monitoring/properties.ts | 154 +++++++++ .../beacon-node/src/monitoring/service.ts | 229 ++++++++++++++ packages/beacon-node/src/monitoring/types.ts | 32 ++ packages/beacon-node/src/node/nodejs.ts | 20 ++ packages/beacon-node/src/node/options.ts | 3 + .../test/unit/monitoring/clientStats.test.ts | 38 +++ .../test/unit/monitoring/properties.test.ts | 274 ++++++++++++++++ .../test/unit/monitoring/remoteService.ts | 86 +++++ .../test/unit/monitoring/schemas.ts | 63 ++++ .../test/unit/monitoring/service.test.ts | 266 ++++++++++++++++ packages/cli/src/cmds/dev/handler.ts | 1 + packages/cli/src/cmds/validator/handler.ts | 27 +- packages/cli/src/cmds/validator/options.ts | 54 ++++ .../src/options/beaconNodeOptions/index.ts | 4 + .../options/beaconNodeOptions/monitoring.ts | 61 ++++ .../unit/options/beaconNodeOptions.test.ts | 13 + 24 files changed, 1894 insertions(+), 4 deletions(-) create mode 100644 docs/usage/client-monitoring.md create mode 100644 packages/beacon-node/src/monitoring/clientStats.ts create mode 100644 packages/beacon-node/src/monitoring/index.ts create mode 100644 packages/beacon-node/src/monitoring/options.ts create mode 100644 packages/beacon-node/src/monitoring/properties.ts create mode 100644 packages/beacon-node/src/monitoring/service.ts create mode 100644 packages/beacon-node/src/monitoring/types.ts create mode 100644 packages/beacon-node/test/unit/monitoring/clientStats.test.ts create mode 100644 packages/beacon-node/test/unit/monitoring/properties.test.ts create mode 100644 packages/beacon-node/test/unit/monitoring/remoteService.ts create mode 100644 packages/beacon-node/test/unit/monitoring/schemas.ts create mode 100644 packages/beacon-node/test/unit/monitoring/service.test.ts create mode 100644 packages/cli/src/options/beaconNodeOptions/monitoring.ts diff --git a/dashboards/lodestar_vm_host.json b/dashboards/lodestar_vm_host.json index bebcc36473ff..bd6047ba1205 100644 --- a/dashboards/lodestar_vm_host.json +++ b/dashboards/lodestar_vm_host.json @@ -22,7 +22,7 @@ "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": 35, - "iteration": 1672991883959, + "iteration": 1674481074076, "links": [ { "asDropdown": true, @@ -2624,6 +2624,200 @@ "x": 0, "y": 44 }, + "id": 527, + "panels": [ + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 45 + }, + "id": 529, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "7.4.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": false, + "expr": "rate(lodestar_monitoring_collect_data_seconds_sum[$rate_interval])\r\n/\r\nrate(lodestar_monitoring_collect_data_seconds_count[$rate_interval])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Collect data duration", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 45 + }, + "id": 531, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "7.4.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": false, + "expr": "rate(lodestar_monitoring_send_data_seconds_sum[$rate_interval])\r\n/\r\nrate(lodestar_monitoring_send_data_seconds_count[$rate_interval])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{job}} - {{status}}", + "refId": "A" + } + ], + "title": "Send data duration", + "type": "timeseries" + } + ], + "title": "Monitoring", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 45 + }, "id": 86, "panels": [ { @@ -3158,7 +3352,7 @@ "h": 1, "w": 24, "x": 0, - "y": 45 + "y": 46 }, "id": 164, "panels": [ diff --git a/docs/usage/client-monitoring.md b/docs/usage/client-monitoring.md new file mode 100644 index 000000000000..f641ee1120e7 --- /dev/null +++ b/docs/usage/client-monitoring.md @@ -0,0 +1,49 @@ +# Client monitoring + +Lodestar has the ability to send client stats to a remote service for collection. +At the moment, the main service offering remote monitoring is [beaconcha.in](https://beaconcha.in/). + +Instructions for setting up client monitoring with *beaconcha.in* can be found in their docs about +[Mobile App <> Node Monitoring](https://kb.beaconcha.in/beaconcha.in-explorer/mobile-app-less-than-greater-than-beacon-node) +and in your [account settings](https://beaconcha.in/user/settings#app). + +## Configuration + +Lodestar provides CLI options to configure monitoring on both the beacon node and validator client. + +### Remote endpoint URL + +Client monitoring can be enabled by setting the `--monitoring.endpoint` flag to a remote service endpoint URL. +As monitoring relies on metrics data, it is required that metrics are also enabled by supplying the `--metrics` flag. + +```bash +lodestar beacon --monitoring.endpoint "https://beaconcha.in/api/v1/client/metrics?apikey={apikey}&machine={machineName}" --metrics +``` + +In case of *beaconcha.in*, the API key can be found in your [account settings](https://beaconcha.in/user/settings#api). +Setting the machine is optional but it is especially useful if you are monitoring multiple nodes. + + +!!! note + When sending data to a remote service you should be conscious about security: + + - Only use a service that you trust as this will send information which may identify you + and associate your validators, IP address and other personal information. + - Always use a HTTPS connection (i.e. a URL starting with `https://`) to prevent the traffic + from being intercepted in transit and leaking information. + + +More details about the data sent to the remote service can be found in the [specification](https://docs.google.com/document/d/1qPWAVRjPCENlyAjUBwGkHMvz9qLdd_6u9DPZcNxDBpc). + +It is also possible to print out the data sent to the remote service by enabling debug logs which can be done by supplying the `--logLevel debug` flag. + +### Monitoring interval + +It is possible to adjust the interval between sending client stats to the remote service by +setting the `--monitoring.interval` flag. It takes an integer value in milliseconds, the default is `60000` which means data is sent once a minute. + +```bash +lodestar beacon --monitoring.interval 60000 +``` + +Increasing the monitoring interval can be useful if you are running into rate limit errors when posting large amounts of data for multiple nodes. diff --git a/mkdocs.yml b/mkdocs.yml index 6775a5deb192..759a8dfd7151 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -76,6 +76,7 @@ nav: - Validator management: usage/validator-management.md - Prometheus & Grafana Setup: usage/prometheus-grafana.md - MEV Builder Integration: usage/mev-integration.md + - Client monitoring: usage/client-monitoring.md - Reference: - Command line: reference/cli.md - Libraries: libraries/index.md diff --git a/packages/beacon-node/package.json b/packages/beacon-node/package.json index 110f6285d369..7a773f5252d0 100644 --- a/packages/beacon-node/package.json +++ b/packages/beacon-node/package.json @@ -35,6 +35,9 @@ "./metrics": { "import": "./lib/metrics/index.js" }, + "./monitoring": { + "import": "./lib/monitoring/index.js" + }, "./network": { "import": "./lib/network/index.js" }, diff --git a/packages/beacon-node/src/index.ts b/packages/beacon-node/src/index.ts index 00dcf62cad3b..c8e9d8c339b7 100644 --- a/packages/beacon-node/src/index.ts +++ b/packages/beacon-node/src/index.ts @@ -7,6 +7,9 @@ export * from "./node/index.js"; // Export metrics utilities to de-duplicate validator metrics export {RegistryMetricCreator, collectNodeJSMetrics, HttpMetricsServer} from "./metrics/index.js"; +// Export monitoring service to make it usable by validator +export {MonitoringService} from "./monitoring/index.js"; + // Export generic RestApi server for CLI export {RestApiServer, RestApiServerOpts, RestApiServerModules, RestApiServerMetrics} from "./api/rest/base.js"; diff --git a/packages/beacon-node/src/monitoring/clientStats.ts b/packages/beacon-node/src/monitoring/clientStats.ts new file mode 100644 index 000000000000..06aa1f31c212 --- /dev/null +++ b/packages/beacon-node/src/monitoring/clientStats.ts @@ -0,0 +1,297 @@ +import {DynamicProperty, MetricProperty, StaticProperty} from "./properties.js"; +import {Client} from "./service.js"; +import {ClientStats, JsonType, ProcessType} from "./types.js"; + +// Definition of client stats based on specification +// See https://docs.google.com/document/d/1qPWAVRjPCENlyAjUBwGkHMvz9qLdd_6u9DPZcNxDBpc + +const CLIENT_STATS_SPEC_VERSION = 1; + +const CLIENT_NAME = "lodestar"; + +export function createClientStats(client: Client, collectSystemStats?: boolean): ClientStats[] { + const clientStats = []; + + if (client === "beacon") { + clientStats.push(createBeaconNodeStats()); + } else if (client === "validator") { + clientStats.push(createValidatorStats()); + } + + if (collectSystemStats) { + clientStats.push(createSystemStats()); + } + + return clientStats; +} + +function createCommonStats(process: ProcessType): ClientStats { + return { + version: new StaticProperty({ + jsonKey: "version", + value: CLIENT_STATS_SPEC_VERSION, + description: "Client Stats data specification version", + }), + timestamp: new DynamicProperty({ + jsonKey: "timestamp", + provider: Date.now, + description: "Unix timestamp in milliseconds", + }), + process: new StaticProperty({ + jsonKey: "process", + value: process, + description: "Process type, can be one of: validator, beaconnode, system", + }), + }; +} + +function createProcessStats(process: ProcessType): ClientStats { + return { + ...createCommonStats(process), + cpuProcessSecondsTotal: new MetricProperty({ + jsonKey: "cpu_process_seconds_total", + metricName: "process_cpu_user_seconds_total", + jsonType: JsonType.Number, + defaultValue: 0, + description: "CPU seconds consumed by the process", + }), + memoryProcessBytes: new MetricProperty({ + jsonKey: "memory_process_bytes", + metricName: "process_resident_memory_bytes", + jsonType: JsonType.Number, + defaultValue: 0, + description: "Amount of memory in bytes allocated to the process", + }), + clientName: new StaticProperty({ + jsonKey: "client_name", + value: CLIENT_NAME, + description: "Name of client, e.g. lodestar, prysm, lighthouse, teku, nimbus", + }), + clientVersion: new MetricProperty({ + jsonKey: "client_version", + metricName: "lodestar_version", + fromLabel: "version", + formatter: (value) => { + // remove "v" prefix from value + return (value as string).substring(1); + }, + jsonType: JsonType.String, + cacheResult: true, + defaultValue: "", + description: "Version of client, e.g. 1.3.0/2d0938e", + }), + clientBuild: new StaticProperty({ + jsonKey: "client_build", + value: 0, + description: "Incrementing integer representation of build for easier comparison", + }), + syncEth2FallbackConfigured: new StaticProperty({ + jsonKey: "sync_eth2_fallback_configured", + value: false, + description: "Whether the client has a fallback eth2 endpoint configured", + }), + syncEth2FallbackConnected: new StaticProperty({ + jsonKey: "sync_eth2_fallback_connected", + value: false, + description: "Whether the client is currently connected to a fallback eth2 endpoint", + }), + }; +} + +function createBeaconNodeStats(): ClientStats { + return { + ...createProcessStats(ProcessType.BeaconNode), + diskBeaconChainBytesTotal: new StaticProperty({ + jsonKey: "disk_beaconchain_bytes_total", + value: 0, + description: "Amount of bytes consumed on disk by the beacon node's database", + }), + networkLibp2pBytesTotalReceive: new MetricProperty({ + jsonKey: "network_libp2p_bytes_total_receive", + metricName: "libp2p_data_transfer_bytes_total", + withLabel: {name: "protocol", value: "global received"}, + jsonType: JsonType.Number, + defaultValue: 0, + description: "Number of bytes received via libp2p traffic", + }), + networkLibp2pBytesTotalTransmit: new MetricProperty({ + jsonKey: "network_libp2p_bytes_total_transmit", + metricName: "libp2p_data_transfer_bytes_total", + withLabel: {name: "protocol", value: "global sent"}, + jsonType: JsonType.Number, + defaultValue: 0, + description: "Number of bytes transmitted via libp2p traffic", + }), + networkPeersConnected: new MetricProperty({ + jsonKey: "network_peers_connected", + metricName: "libp2p_peers", + jsonType: JsonType.Number, + defaultValue: 0, + description: "Number of connected peers", + }), + syncEth1Connected: new MetricProperty({ + jsonKey: "sync_eth1_connected", + metricName: "lodestar_execution_engine_http_client_config_urls_count", + jsonType: JsonType.Boolean, + defaultValue: false, + description: "Whether the beacon node is connected to a eth1 node", + }), + syncEth2Synced: new MetricProperty({ + jsonKey: "sync_eth2_synced", + metricName: "lodestar_sync_status", + rangeValue: 3, + jsonType: JsonType.Boolean, + defaultValue: true, + description: "Whether the beacon node is in sync with the beacon chain network", + }), + syncBeaconHeadSlot: new MetricProperty({ + jsonKey: "sync_beacon_head_slot", + metricName: "beacon_head_slot", + jsonType: JsonType.Number, + defaultValue: 0, + description: "Slot of the head block of the beacon chain", + }), + syncEth1FallbackConfigured: new MetricProperty({ + jsonKey: "sync_eth1_fallback_configured", + metricName: "lodestar_execution_engine_http_client_config_urls_count", + threshold: 2, + jsonType: JsonType.Boolean, + defaultValue: false, + description: "Whether the beacon node has a fallback eth1 endpoint configured", + }), + syncEth1FallbackConnected: new StaticProperty({ + jsonKey: "sync_eth1_fallback_connected", + value: false, + description: "Whether the beacon node is currently connected to a fallback eth1 endpoint", + }), + slasherActive: new StaticProperty({ + jsonKey: "slasher_active", + value: false, + description: "Whether slasher functionality is enabled", + }), + }; +} + +function createValidatorStats(): ClientStats { + return { + ...createProcessStats(ProcessType.Validator), + validatorTotal: new MetricProperty({ + jsonKey: "validator_total", + metricName: "vc_indices_count", + jsonType: JsonType.Number, + defaultValue: 0, + description: "Number of validator keys in use", + }), + validatorActive: new MetricProperty({ + jsonKey: "validator_active", + metricName: "vc_indices_count", + jsonType: JsonType.Number, + defaultValue: 0, + description: "Number of validator keys that are currently active", + }), + }; +} + +function createSystemStats(): ClientStats { + return { + ...createCommonStats(ProcessType.System), + cpuCores: new DynamicProperty({ + jsonKey: "cpu_cores", + provider: () => 0, + cacheResult: true, + description: "Number of CPU cores available", + }), + cpuThreads: new DynamicProperty({ + jsonKey: "cpu_threads", + provider: () => 0, + cacheResult: true, + description: "Number of CPU threads available", + }), + cpuNodeSystemSecondsTotal: new DynamicProperty({ + jsonKey: "cpu_node_system_seconds_total", + provider: () => 0, + description: "CPU seconds consumed by all processes", + }), + cpuNodeUserSecondsTotal: new DynamicProperty({ + jsonKey: "cpu_node_user_seconds_total", + provider: () => 0, + description: "CPU seconds consumed by user processes", + }), + cpuNodeIOWaitSecondsTotal: new DynamicProperty({ + jsonKey: "cpu_node_iowait_seconds_total", + provider: () => 0, + description: "CPU seconds spent in I/O wait state", + }), + cpuNodeIdleSecondsTotal: new DynamicProperty({ + jsonKey: "cpu_node_idle_seconds_total", + provider: () => 0, + description: "CPU seconds spent in idle state", + }), + memoryNodeBytesTotal: new DynamicProperty({ + jsonKey: "memory_node_bytes_total", + provider: () => 0, + cacheResult: true, + description: "Total amount of memory in bytes available", + }), + memoryNodeBytesFree: new DynamicProperty({ + jsonKey: "memory_node_bytes_free", + provider: () => 0, + description: "Amount of free memory in bytes", + }), + memoryNodeBytesCached: new DynamicProperty({ + jsonKey: "memory_node_bytes_cached", + provider: () => 0, + description: "Amount of memory in bytes used by cache", + }), + memoryNodeBytesBuffers: new DynamicProperty({ + jsonKey: "memory_node_bytes_buffers", + provider: () => 0, + description: "Amount of memory in bytes used by buffers", + }), + diskNodeBytesTotal: new DynamicProperty({ + jsonKey: "disk_node_bytes_total", + provider: () => 0, + description: "Total amount of available disk space in bytes", + }), + diskNodeBytesFree: new DynamicProperty({ + jsonKey: "disk_node_bytes_free", + provider: () => 0, + description: "Amount of free disk space in bytes", + }), + diskNodeIOSeconds: new DynamicProperty({ + jsonKey: "disk_node_io_seconds", + provider: () => 0, + description: "Total time spent in seconds on disk I/O operations", + }), + diskNodeReadsTotal: new DynamicProperty({ + jsonKey: "disk_node_reads_total", + provider: () => 0, + description: "Total amount of bytes read from disk", + }), + diskNodeWritesTotal: new DynamicProperty({ + jsonKey: "disk_node_writes_total", + provider: () => 0, + description: "Total amount of bytes written to disk", + }), + networkNodeBytesTotalReceive: new DynamicProperty({ + jsonKey: "network_node_bytes_total_receive", + provider: () => 0, + description: "Total amount of bytes received over the network", + }), + networkNodeBytesTotalTransmit: new DynamicProperty({ + jsonKey: "network_node_bytes_total_transmit", + provider: () => 0, + description: "Total amount of bytes transmitted over the network", + }), + miscNodeBootTsSeconds: new DynamicProperty({ + jsonKey: "misc_node_boot_ts_seconds", + provider: () => 0, + description: "Unix timestamp in seconds of boot time", + }), + miscOs: new DynamicProperty({ + jsonKey: "misc_os", + provider: () => "unk", + description: "Operating system, can be one of: lin, win, mac, unk for unknown", + }), + }; +} diff --git a/packages/beacon-node/src/monitoring/index.ts b/packages/beacon-node/src/monitoring/index.ts new file mode 100644 index 000000000000..124cf97212b9 --- /dev/null +++ b/packages/beacon-node/src/monitoring/index.ts @@ -0,0 +1,2 @@ +export * from "./options.js"; +export {MonitoringService} from "./service.js"; diff --git a/packages/beacon-node/src/monitoring/options.ts b/packages/beacon-node/src/monitoring/options.ts new file mode 100644 index 000000000000..a9ab2cd8172e --- /dev/null +++ b/packages/beacon-node/src/monitoring/options.ts @@ -0,0 +1,20 @@ +export type MonitoringOptions = { + /** Remote endpoint URL where client stats are sent */ + endpoint: string; + /** Interval in milliseconds between sending client stats */ + interval?: number; + /** Initial delay in milliseconds before client stats are sent */ + initialDelay?: number; + /** Timeout in milliseconds for sending client stats */ + requestTimeout?: number; + /** Enable collecting system stats */ + collectSystemStats?: boolean; +}; + +export const defaultMonitoringOptions: Required = { + endpoint: "", + interval: 60_000, + initialDelay: 30_000, + requestTimeout: 10_000, + collectSystemStats: true, +}; diff --git a/packages/beacon-node/src/monitoring/properties.ts b/packages/beacon-node/src/monitoring/properties.ts new file mode 100644 index 000000000000..9b5de063f8b1 --- /dev/null +++ b/packages/beacon-node/src/monitoring/properties.ts @@ -0,0 +1,154 @@ +import {Registry} from "prom-client"; +import {JsonRecord, JsonType, MetricObject, MetricValue, MetricWithGetter, RecordValue} from "./types.js"; + +interface PropertyDefinition { + /** Key of value to be sent to remote service */ + jsonKey: string; + /** Description of the property */ + description?: string; +} + +interface StaticPropertyDefinition extends PropertyDefinition { + /** Static value */ + value: T; +} + +interface DynamicPropertyDefinition extends PropertyDefinition { + /** Value provider function */ + provider: () => T | Promise; + /** Only call provider function once and then use cached value */ + cacheResult?: boolean; +} + +interface MetricPropertyDefinition extends PropertyDefinition { + /** Type of value to be sent to remote service */ + jsonType: JsonType; + /** Name of the metric */ + metricName: string; + /** Get value from metric with label */ + withLabel?: {name: string; value: string}; + /** Get value from label instead of metric value */ + fromLabel?: string; + /** Range value to evaluate to true */ + rangeValue?: number; + /** Evaluate to true if value is greater than or equal to threshold */ + threshold?: number; + /** Function to format retrieved metric value */ + formatter?: (value: MetricValue) => MetricValue; + /** Only fetch metric once and then use cached value */ + cacheResult?: boolean; + /** Default value if metric does not exist */ + defaultValue: T; +} + +export interface ClientStatsProperty { + readonly definition: PropertyDefinition; + + getRecord(register: Registry): JsonRecord | Promise>; +} + +export class StaticProperty implements ClientStatsProperty { + constructor(readonly definition: StaticPropertyDefinition) {} + + getRecord(): JsonRecord { + return {key: this.definition.jsonKey, value: this.definition.value}; + } +} + +export class DynamicProperty implements ClientStatsProperty { + private cachedValue?: T; + + constructor(readonly definition: DynamicPropertyDefinition) {} + + async getRecord(): Promise> { + if (this.cachedValue != null) { + return {key: this.definition.jsonKey, value: this.cachedValue}; + } + + const value = await this.definition.provider(); + + if (this.definition.cacheResult) { + this.cachedValue = value; + } + + return {key: this.definition.jsonKey, value}; + } +} + +export class MetricProperty implements ClientStatsProperty { + private cachedValue?: T; + + constructor(readonly definition: MetricPropertyDefinition) {} + + async getRecord(register: Registry): Promise> { + if (this.cachedValue != null) { + return {key: this.definition.jsonKey, value: this.cachedValue}; + } + + const metric = register.getSingleMetric(this.definition.metricName); + + if (metric) { + const metricObject = await (metric as MetricWithGetter).get(); + + const metricValue = this.extractMetricValue(metricObject); + + if (metricValue != null) { + const formattedValue = this.formatMetricValue(metricValue); + + const typedValue = this.convertMetricValue(formattedValue) as T; + + if (this.definition.cacheResult) { + this.cachedValue = typedValue; + } + + return {key: this.definition.jsonKey, value: typedValue}; + } + } + + return {key: this.definition.jsonKey, value: this.definition.defaultValue}; + } + + private extractMetricValue(metricObject: MetricObject): MetricValue | undefined { + const {withLabel, fromLabel} = this.definition; + + if (withLabel) { + // get value from metric with specific label, e.g. protocol="global received" + return metricObject.values.find((v) => v.labels[withLabel.name] === withLabel.value)?.value; + } + + if (fromLabel) { + // get value from label, e.g. lodestar_version{version="v1.3.0/2d0938e"} => v1.3.0/2d0938e + return metricObject.values[0].labels[fromLabel]; + } + + // metric value e.g. beacon_head_slot 5603174 => 5603174 + return metricObject.values[0].value; + } + + private formatMetricValue(value: MetricValue): MetricValue { + if (!this.definition.formatter) { + return value; + } + return this.definition.formatter(value); + } + + private convertMetricValue(value: MetricValue): RecordValue { + if (typeof value === "number") { + switch (this.definition.jsonType) { + case JsonType.String: + return value.toString(); + case JsonType.Number: + return Math.round(value); + case JsonType.Boolean: + if (this.definition.rangeValue != null) { + return value === this.definition.rangeValue; + } else if (this.definition.threshold != null) { + return value >= this.definition.threshold; + } else { + return value > 0; + } + } + } + return value; + } +} diff --git a/packages/beacon-node/src/monitoring/service.ts b/packages/beacon-node/src/monitoring/service.ts new file mode 100644 index 000000000000..e51a2a341900 --- /dev/null +++ b/packages/beacon-node/src/monitoring/service.ts @@ -0,0 +1,229 @@ +import fetch from "cross-fetch"; +import {Registry} from "prom-client"; +import {ErrorAborted, ILogger, TimeoutError} from "@lodestar/utils"; +import {RegistryMetricCreator} from "../metrics/index.js"; +import {HistogramExtra} from "../metrics/utils/histogram.js"; +import {defaultMonitoringOptions, MonitoringOptions} from "./options.js"; +import {createClientStats} from "./clientStats.js"; +import {ClientStats} from "./types.js"; + +type MonitoringData = Record; + +export type RemoteServiceError = { + status: string; + data: null; +}; + +enum FetchAbortReason { + Stop = "stop", + Timeout = "timeout", +} + +enum Status { + Started = "started", + Stopped = "stopped", +} + +export type Client = "beacon" | "validator"; + +/** + * Service for sending clients stats to a remote service (e.g. beaconcha.in) + */ +export class MonitoringService { + private readonly clientStats: ClientStats[]; + private readonly remoteServiceUrl: URL; + private readonly remoteServiceHost: string; + private readonly options: Required; + private readonly register: Registry; + private readonly logger: ILogger; + + private readonly collectDataMetric: HistogramExtra; + private readonly sendDataMetric: HistogramExtra<"status">; + + private status = Status.Stopped; + private initialDelayTimeout?: NodeJS.Timeout; + private monitoringInterval?: NodeJS.Timer; + private fetchAbortController?: AbortController; + private pendingRequest?: Promise; + + constructor( + client: Client, + options: MonitoringOptions, + {register, logger}: {register: RegistryMetricCreator; logger: ILogger} + ) { + this.options = {...defaultMonitoringOptions, ...options}; + this.logger = logger; + this.register = register; + this.remoteServiceUrl = this.parseMonitoringEndpoint(this.options.endpoint); + this.remoteServiceHost = this.remoteServiceUrl.host; + this.clientStats = createClientStats(client, this.options.collectSystemStats); + + this.collectDataMetric = register.histogram({ + name: "lodestar_monitoring_collect_data_seconds", + help: "Time spent to collect monitoring data in seconds", + buckets: [0.001, 0.01, 0.1, 1], + }); + + this.sendDataMetric = register.histogram({ + name: "lodestar_monitoring_send_data_seconds", + help: "Time spent to send monitoring data to remote service in seconds", + labelNames: ["status"], + buckets: [0.3, 0.5, 1, Math.floor(this.options.requestTimeout / 1000)], + }); + } + + /** + * Start sending client stats based on configured interval + */ + start(): void { + if (this.status === Status.Started) return; + this.status = Status.Started; + + const {interval, initialDelay} = this.options; + + this.initialDelayTimeout = setTimeout(async () => { + await this.send(); + + this.monitoringInterval = setInterval(async () => { + await this.send(); + }, interval); + }, initialDelay); + + this.logger.info("Started monitoring service", { + remote: this.remoteServiceHost, + machine: this.remoteServiceUrl.searchParams.get("machine"), + interval, + }); + } + + /** + * Stop sending client stats + */ + stop(): void { + if (this.status === Status.Stopped) return; + this.status = Status.Stopped; + + if (this.initialDelayTimeout) { + clearTimeout(this.initialDelayTimeout); + } + if (this.monitoringInterval) { + clearInterval(this.monitoringInterval); + } + if (this.pendingRequest) { + this.fetchAbortController?.abort(FetchAbortReason.Stop); + } + } + + /** + * Collect and send client stats + */ + async send(): Promise { + if (!this.pendingRequest) { + this.pendingRequest = (async () => { + try { + const data = await this.collectData(); + + const res = await this.sendData(data); + + if (!res.ok) { + const error = (await res.json()) as RemoteServiceError; + throw new Error(error.status); + } + + this.logger.debug(`Sent client stats to ${this.remoteServiceHost}`, {data: JSON.stringify(data)}); + } catch (e) { + this.logger.error(`Failed to send client stats to ${this.remoteServiceHost}`, {}, e as Error); + } finally { + this.pendingRequest = undefined; + } + })(); + } + + await this.pendingRequest; + } + + private async collectData(): Promise { + const timer = this.collectDataMetric.startTimer(); + const data: MonitoringData[] = []; + const recordPromises = []; + + for (const [i, s] of this.clientStats.entries()) { + data[i] = {}; + + recordPromises.push( + ...Object.values(s).map(async (property) => { + const record = await property.getRecord(this.register); + data[i][record.key] = record.value; + }) + ); + } + + await Promise.all(recordPromises).finally(timer); + + return data; + } + + private async sendData(data: MonitoringData[]): Promise { + const timer = this.sendDataMetric.startTimer(); + this.fetchAbortController = new AbortController(); + + const timeout = setTimeout( + () => this.fetchAbortController?.abort(FetchAbortReason.Timeout), + this.options.requestTimeout + ); + + let res: Response | undefined; + + try { + res = await fetch(this.remoteServiceUrl, { + method: "POST", + headers: {"Content-Type": "application/json"}, + body: JSON.stringify(data), + signal: this.fetchAbortController.signal, + }); + + return res; + } catch (e) { + const {signal} = this.fetchAbortController; + + if (!signal.aborted) { + // error was thrown by fetch + throw e; + } + + // error was thrown by abort signal + if (signal.reason === FetchAbortReason.Stop) { + throw new ErrorAborted(`request to ${this.remoteServiceHost}`); + } else if (signal.reason === FetchAbortReason.Timeout) { + throw new TimeoutError(`reached for request to ${this.remoteServiceHost}`); + } else { + throw e; + } + } finally { + timer({status: res?.ok ? "success" : "error"}); + clearTimeout(timeout); + } + } + + private parseMonitoringEndpoint(endpoint: string): URL { + if (!endpoint) { + throw new Error("Monitoring endpoint must be provided"); + } + + try { + const url = new URL(endpoint); + + if (url.protocol === "http:") { + this.logger.warn( + "Insecure monitoring endpoint, please make sure to always use a HTTPS connection in production" + ); + } else if (url.protocol !== "https:") { + throw new Error(); + } + + return url; + } catch { + throw new Error("Monitoring endpoint must be a valid URL"); + } + } +} diff --git a/packages/beacon-node/src/monitoring/types.ts b/packages/beacon-node/src/monitoring/types.ts new file mode 100644 index 000000000000..8980c02b2422 --- /dev/null +++ b/packages/beacon-node/src/monitoring/types.ts @@ -0,0 +1,32 @@ +import {Metric} from "prom-client"; +import {ClientStatsProperty} from "./properties.js"; + +// get methods are missing in prom-client type definitions +// see https://github.com/siimon/prom-client/pull/531 +export type MetricWithGetter = Metric & { + get(): Promise; +}; + +export type MetricObject = { + values: Array<{value: number; labels: Record}>; +}; + +export type MetricValue = string | number; + +export type RecordValue = string | number | boolean; + +export type JsonRecord = {key: string; value: T}; + +export type ClientStats = Record>; + +export enum ProcessType { + BeaconNode = "beaconnode", + Validator = "validator", + System = "system", +} + +export enum JsonType { + String, + Number, + Boolean, +} diff --git a/packages/beacon-node/src/node/nodejs.ts b/packages/beacon-node/src/node/nodejs.ts index a0dccc3a2d63..4f3d3bdf3c21 100644 --- a/packages/beacon-node/src/node/nodejs.ts +++ b/packages/beacon-node/src/node/nodejs.ts @@ -15,6 +15,7 @@ import {BeaconSync, IBeaconSync} from "../sync/index.js"; import {BackfillSync} from "../sync/backfill/index.js"; import {BeaconChain, IBeaconChain, initBeaconMetrics} from "../chain/index.js"; import {createMetrics, IMetrics, HttpMetricsServer} from "../metrics/index.js"; +import {MonitoringService} from "../monitoring/index.js"; import {getApi, BeaconRestApiServer} from "../api/index.js"; import {initializeExecutionEngine, initializeExecutionBuilder} from "../execution/index.js"; import {initializeEth1ForBlockProduction} from "../eth1/index.js"; @@ -35,6 +36,7 @@ export interface IBeaconNodeModules { sync: IBeaconSync; backfillSync: BackfillSync | null; metricsServer?: HttpMetricsServer; + monitoring: MonitoringService | null; restApi?: BeaconRestApiServer; controller?: AbortController; } @@ -64,6 +66,7 @@ enum LoggerModule { chain = "chain", eth1 = "eth1", metrics = "metrics", + monitoring = "monitoring", network = "network", /** validator monitor */ vmon = "vmon", @@ -81,6 +84,7 @@ export class BeaconNode { db: IBeaconDb; metrics: IMetrics | null; metricsServer?: HttpMetricsServer; + monitoring: MonitoringService | null; network: INetwork; chain: IBeaconChain; api: {[K in keyof Api]: ServerApi}; @@ -97,6 +101,7 @@ export class BeaconNode { db, metrics, metricsServer, + monitoring, network, chain, api, @@ -109,6 +114,7 @@ export class BeaconNode { this.config = config; this.metrics = metrics; this.metricsServer = metricsServer; + this.monitoring = monitoring; this.db = db; this.chain = chain; this.api = api; @@ -171,6 +177,18 @@ export class BeaconNode { db.setMetrics(metrics.db); } + let monitoring = null; + if (opts.monitoring.endpoint) { + if (metrics == null) { + throw new Error("Metrics must be enabled to use monitoring"); + } + monitoring = new MonitoringService("beacon", opts.monitoring, { + register: metrics.register, + logger: logger.child({module: LoggerModule.monitoring}), + }); + monitoring.start(); + } + const chain = new BeaconChain(opts.chain, { config, db, @@ -274,6 +292,7 @@ export class BeaconNode { db, metrics, metricsServer, + monitoring, network, chain, api, @@ -294,6 +313,7 @@ export class BeaconNode { this.backfillSync?.close(); await this.network.close(); if (this.metricsServer) await this.metricsServer.stop(); + if (this.monitoring) this.monitoring.stop(); if (this.restApi) await this.restApi.close(); await this.chain.persistToDisk(); diff --git a/packages/beacon-node/src/node/options.ts b/packages/beacon-node/src/node/options.ts index 489a76e1d4c9..8e86edf27572 100644 --- a/packages/beacon-node/src/node/options.ts +++ b/packages/beacon-node/src/node/options.ts @@ -3,6 +3,7 @@ import {defaultChainOptions, IChainOptions} from "../chain/options.js"; import {defaultDbOptions, IDatabaseOptions} from "../db/options.js"; import {defaultEth1Options, Eth1Options} from "../eth1/options.js"; import {defaultMetricsOptions, MetricsOptions} from "../metrics/options.js"; +import {defaultMonitoringOptions, MonitoringOptions} from "../monitoring/options.js"; import {defaultNetworkOptions, INetworkOptions} from "../network/options.js"; import {defaultSyncOptions, SyncOptions} from "../sync/options.js"; import { @@ -22,6 +23,7 @@ export interface IBeaconNodeOptions { executionEngine: ExecutionEngineOpts; executionBuilder: ExecutionBuilderOpts; metrics: MetricsOptions; + monitoring: MonitoringOptions; network: INetworkOptions; sync: SyncOptions; } @@ -34,6 +36,7 @@ export const defaultOptions: IBeaconNodeOptions = { executionEngine: defaultExecutionEngineOpts, executionBuilder: defaultExecutionBuilderOpts, metrics: defaultMetricsOptions, + monitoring: defaultMonitoringOptions, network: defaultNetworkOptions, sync: defaultSyncOptions, }; diff --git a/packages/beacon-node/test/unit/monitoring/clientStats.test.ts b/packages/beacon-node/test/unit/monitoring/clientStats.test.ts new file mode 100644 index 000000000000..9391a5b956b1 --- /dev/null +++ b/packages/beacon-node/test/unit/monitoring/clientStats.test.ts @@ -0,0 +1,38 @@ +import {expect} from "chai"; +import {ClientStats} from "../../../src/monitoring/types.js"; +import {createClientStats} from "../../../src/monitoring/clientStats.js"; +import {beaconNodeStatsSchema, ClientStatsSchema, systemStatsSchema, validatorStatsSchema} from "./schemas.js"; + +describe("monitoring / clientStats", () => { + describe("BeaconNodeStats", () => { + it("should contain all required keys", () => { + const beaconNodeStats = createClientStats("beacon")[0]; + + expect(getJsonKeys(beaconNodeStats)).to.have.all.members(getSchemaKeys(beaconNodeStatsSchema)); + }); + }); + + describe("ValidatorStats", () => { + it("should contain all required keys", () => { + const validatorNodeStats = createClientStats("validator")[0]; + + expect(getJsonKeys(validatorNodeStats)).to.have.all.members(getSchemaKeys(validatorStatsSchema)); + }); + }); + + describe("SystemStats", () => { + it("should contain all required keys", () => { + const systemStats = createClientStats("beacon", true)[1]; + + expect(getJsonKeys(systemStats)).to.have.all.members(getSchemaKeys(systemStatsSchema)); + }); + }); +}); + +function getJsonKeys(stats: ClientStats): string[] { + return Object.values(stats).map((property) => property.definition.jsonKey); +} + +function getSchemaKeys(schema: ClientStatsSchema): string[] { + return schema.map((s) => s.key); +} diff --git a/packages/beacon-node/test/unit/monitoring/properties.test.ts b/packages/beacon-node/test/unit/monitoring/properties.test.ts new file mode 100644 index 000000000000..bb4c9bb6511d --- /dev/null +++ b/packages/beacon-node/test/unit/monitoring/properties.test.ts @@ -0,0 +1,274 @@ +import {expect} from "chai"; +import {IMetrics} from "../../../src/metrics/index.js"; +import {DynamicProperty, MetricProperty, StaticProperty} from "../../../src/monitoring/properties.js"; +import {JsonType} from "../../../src/monitoring/types.js"; +import {createMetricsTest} from "../metrics/utils.js"; + +describe("monitoring / properties", () => { + const jsonKey = "test_key"; + const value = 1; + + describe("StaticProperty", () => { + it("should return a json record with the configured key and value", () => { + const staticProperty = new StaticProperty({jsonKey, value}); + + const jsonRecord = staticProperty.getRecord(); + + expect(jsonRecord.key).to.equal(jsonKey); + expect(jsonRecord.value).to.equal(value); + }); + }); + + describe("DynamicProperty", () => { + it("should return a json record with the configured key and return value of provider", async () => { + const dynamicProperty = new DynamicProperty({jsonKey, provider: () => value}); + + const jsonRecord = await dynamicProperty.getRecord(); + + expect(jsonRecord.key).to.equal(jsonKey); + expect(jsonRecord.value).to.equal(value); + }); + + it("should return the same value on consecutive calls if cacheResult is set to true", async () => { + const initialValue = 1; + let updatedValue = initialValue; + + const provider = (): number => { + const value = updatedValue; + updatedValue++; + return value; + }; + + const dynamicProperty = new DynamicProperty({jsonKey, provider, cacheResult: true}); + + // ensure consecutive calls still return initial provider value + expect((await dynamicProperty.getRecord()).value).to.equal(initialValue); + expect((await dynamicProperty.getRecord()).value).to.equal(initialValue); + expect((await dynamicProperty.getRecord()).value).to.equal(initialValue); + }); + }); + + describe("MetricProperty", () => { + let metrics: IMetrics; + + before(() => { + metrics = createMetricsTest(); + }); + + it("should return a json record with the configured key and metric value", async () => { + const peerCount = 50; + metrics.peers.set(peerCount); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName: "libp2p_peers", + jsonType: JsonType.Number, + defaultValue: 0, + }); + + const jsonRecord = await metricProperty.getRecord(metrics.register); + + expect(jsonRecord.key).to.equal(jsonKey); + expect(jsonRecord.value).to.equal(peerCount); + }); + + it("should return the default value if metric with name does not exist", async () => { + const defaultValue = 10; + + const metricProperty = new MetricProperty({ + jsonKey, + metricName: "does_not_exist", + jsonType: JsonType.Number, + defaultValue, + }); + + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(defaultValue); + }); + + it("should get the value from label instead of metric value if fromLabel is defined", async () => { + const metricName = "static_metric"; + const labelName = "test_label"; + const labelValue = "test_value"; + + metrics.register.static({name: metricName, help: "fromLabel test", value: {[labelName]: labelValue}}); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + fromLabel: labelName, + jsonType: JsonType.String, + defaultValue: "", + }); + + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(labelValue); + }); + + it("should get the value from metric with label if withLabel is defined", async () => { + const metricName = "metric_with_labels"; + const labelName = "test_label_name"; + const labelValue = "test_label_value"; + const metricValue = 10; + + const metric = metrics.register.gauge({name: metricName, help: "withLabel test", labelNames: [labelName]}); + metric.set({[labelName]: "different_value"}, metricValue + 1); + metric.set({[labelName]: labelValue}, metricValue); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + withLabel: {name: labelName, value: labelValue}, + jsonType: JsonType.Number, + defaultValue: 0, + }); + + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(metricValue); + }); + + it("should return the same value on consecutive calls if cacheResult is set to true", async () => { + const metricName = "metric_test_caching"; + const initialValue = 10; + + const metric = metrics.register.gauge({name: metricName, help: "cacheResult test"}); + metric.set(initialValue); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + jsonType: JsonType.Number, + defaultValue: 0, + cacheResult: true, + }); + + // initial call which will cache the result + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(initialValue); + + // set different value + metric.set(initialValue + 1); + + // ensure consecutive calls still return initial value + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(initialValue); + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(initialValue); + }); + + it("should convert the metric value to a string if jsonType is JsonType.String", async () => { + const metricName = "metric_test_string"; + + const metric = metrics.register.gauge({name: metricName, help: "JsonType.String test"}); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + jsonType: JsonType.String, + defaultValue: "", + }); + + metric.set(10); + expect((await metricProperty.getRecord(metrics.register)).value).to.equal("10"); + }); + + it("should round the metric value to the nearest integer if jsonType is JsonType.Number", async () => { + const metricName = "metric_test_number"; + + const metric = metrics.register.gauge({name: metricName, help: "JsonType.Number test"}); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + jsonType: JsonType.Number, + defaultValue: 0, + }); + + metric.set(1.49); + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(1); + }); + + it("should convert the metric value to a boolean if jsonType is JsonType.Boolean", async () => { + const metricName = "metric_test_boolean"; + + const metric = metrics.register.gauge({name: metricName, help: "JsonType.Boolean test"}); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + jsonType: JsonType.Boolean, + defaultValue: false, + }); + + metric.set(0); + // metric value of 0 should be converted to false + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(false); + + metric.set(1); + // metric value > 0 should be converted to true + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(true); + }); + + it("should convert the metric value to true if the specified rangeValue is matched", async () => { + const metricName = "metric_test_range_value"; + const rangeValue = 3; + + const metric = metrics.register.gauge({name: metricName, help: "rangeValue test"}); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + rangeValue, + jsonType: JsonType.Boolean, + defaultValue: false, + }); + + metric.set(rangeValue + 1); + // value does not match range value and should be converted to false + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(false); + + metric.set(rangeValue); + // value matches range value and should be converted to true + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(true); + }); + + it("should convert the metric value to true if value is greater than or equal to threshold", async () => { + const metricName = "metric_test_threshold"; + const threshold = 2; + + const metric = metrics.register.gauge({name: metricName, help: "threshold test"}); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + threshold, + jsonType: JsonType.Boolean, + defaultValue: false, + }); + + metric.set(threshold - 1); + // value is below threshold and should be converted to false + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(false); + + metric.set(threshold); + // value is equal to threshold and should be converted to true + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(true); + + metric.set(threshold + 1); + // value is greater than threshold and should be converted to true + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(true); + }); + + it("should apply the defined formatter to the metric value", async () => { + const metricName = "metric_test_formatting"; + const metricValue = 10; + + const metric = metrics.register.gauge({name: metricName, help: "formatter test"}); + + const metricProperty = new MetricProperty({ + jsonKey, + metricName, + jsonType: JsonType.String, + formatter: (value) => `prefix_${value}`, + defaultValue: "", + }); + + metric.set(metricValue); + expect((await metricProperty.getRecord(metrics.register)).value).to.equal(`prefix_${metricValue}`); + }); + }); +}); diff --git a/packages/beacon-node/test/unit/monitoring/remoteService.ts b/packages/beacon-node/test/unit/monitoring/remoteService.ts new file mode 100644 index 000000000000..392c4046631f --- /dev/null +++ b/packages/beacon-node/test/unit/monitoring/remoteService.ts @@ -0,0 +1,86 @@ +import {expect} from "chai"; +import fastify from "fastify"; +import {RemoteServiceError} from "../../../src/monitoring/service.js"; +import {ProcessType} from "../../../src/monitoring/types.js"; +import {beaconNodeStatsSchema, ClientStatsSchema, systemStatsSchema, validatorStatsSchema} from "./schemas.js"; + +/* eslint-disable no-console */ + +type ReceivedData = Record; + +export const remoteServiceRoutes = { + success: "/success", + error: "/error", + pending: "/pending", +}; + +export const remoteServiceError: RemoteServiceError = {status: "error", data: null}; + +/** + * Starts mocked remote service to receive and validate client stats + */ +export async function startRemoteService(): Promise<{baseUrl: URL}> { + const server = fastify(); + + server.post(remoteServiceRoutes.success, {}, async function (request, reply) { + if (Array.isArray(request.body)) { + request.body.forEach(validateRequestData); + } else { + validateRequestData(request.body as ReceivedData); + } + + return reply.status(200).send(); + }); + + server.post(remoteServiceRoutes.error, {}, async function (_request, reply) { + return reply.status(400).send(remoteServiceError); + }); + + server.post(remoteServiceRoutes.pending, {}, function () { + // keep request pending until timeout is reached or aborted + }); + + server.addHook("onError", (_request, _reply, error, done) => { + console.log(`Error: ${error.message}`); + done(); + }); + + // ask the operating system to assign a free (ephemeral) port + // and use IPv4 localhost "127.0.0.1" to avoid known IPv6 issues + const baseUrl = await server.listen({host: "127.0.0.1", port: 0}); + + after(() => { + // there is no need to wait for server to be closed + server.close().catch(console.log); + }); + + return {baseUrl: new URL(baseUrl)}; +} + +function validateRequestData(data: ReceivedData): void { + switch (data.process) { + case ProcessType.BeaconNode: + validateClientStats(data, beaconNodeStatsSchema); + break; + case ProcessType.Validator: + validateClientStats(data, validatorStatsSchema); + break; + case ProcessType.System: + validateClientStats(data, systemStatsSchema); + break; + default: + throw new Error(`Invalid process type "${data.process}"`); + } +} + +function validateClientStats(data: ReceivedData, schema: ClientStatsSchema): void { + schema.forEach((s) => { + try { + expect(data[s.key]).to.be.a(s.type); + } catch { + throw new Error( + `Validation of property "${s.key}" failed. Expected type "${s.type}" but received "${typeof data[s.key]}".` + ); + } + }); +} diff --git a/packages/beacon-node/test/unit/monitoring/schemas.ts b/packages/beacon-node/test/unit/monitoring/schemas.ts new file mode 100644 index 000000000000..ec7efcd14d5e --- /dev/null +++ b/packages/beacon-node/test/unit/monitoring/schemas.ts @@ -0,0 +1,63 @@ +// See https://docs.google.com/document/d/1qPWAVRjPCENlyAjUBwGkHMvz9qLdd_6u9DPZcNxDBpc + +export type ClientStatsSchema = {key: string; type: "string" | "number" | "boolean"}[]; + +export const commonStatsSchema: ClientStatsSchema = [ + {key: "version", type: "number"}, + {key: "timestamp", type: "number"}, + {key: "process", type: "string"}, +]; + +export const processStatsSchema: ClientStatsSchema = [ + ...commonStatsSchema, + {key: "cpu_process_seconds_total", type: "number"}, + {key: "memory_process_bytes", type: "number"}, + {key: "client_name", type: "string"}, + {key: "client_version", type: "string"}, + {key: "client_build", type: "number"}, + {key: "sync_eth2_fallback_configured", type: "boolean"}, + {key: "sync_eth2_fallback_connected", type: "boolean"}, +]; + +export const beaconNodeStatsSchema: ClientStatsSchema = [ + ...processStatsSchema, + {key: "disk_beaconchain_bytes_total", type: "number"}, + {key: "network_libp2p_bytes_total_receive", type: "number"}, + {key: "network_libp2p_bytes_total_transmit", type: "number"}, + {key: "network_peers_connected", type: "number"}, + {key: "sync_eth1_connected", type: "boolean"}, + {key: "sync_eth2_synced", type: "boolean"}, + {key: "sync_beacon_head_slot", type: "number"}, + {key: "sync_eth1_fallback_configured", type: "boolean"}, + {key: "sync_eth1_fallback_connected", type: "boolean"}, + {key: "slasher_active", type: "boolean"}, +]; + +export const validatorStatsSchema: ClientStatsSchema = [ + ...processStatsSchema, + {key: "validator_total", type: "number"}, + {key: "validator_active", type: "number"}, +]; + +export const systemStatsSchema: ClientStatsSchema = [ + ...commonStatsSchema, + {key: "cpu_cores", type: "number"}, + {key: "cpu_threads", type: "number"}, + {key: "cpu_node_system_seconds_total", type: "number"}, + {key: "cpu_node_user_seconds_total", type: "number"}, + {key: "cpu_node_iowait_seconds_total", type: "number"}, + {key: "cpu_node_idle_seconds_total", type: "number"}, + {key: "memory_node_bytes_total", type: "number"}, + {key: "memory_node_bytes_free", type: "number"}, + {key: "memory_node_bytes_cached", type: "number"}, + {key: "memory_node_bytes_buffers", type: "number"}, + {key: "disk_node_bytes_total", type: "number"}, + {key: "disk_node_bytes_free", type: "number"}, + {key: "disk_node_io_seconds", type: "number"}, + {key: "disk_node_reads_total", type: "number"}, + {key: "disk_node_writes_total", type: "number"}, + {key: "network_node_bytes_total_receive", type: "number"}, + {key: "network_node_bytes_total_transmit", type: "number"}, + {key: "misc_node_boot_ts_seconds", type: "number"}, + {key: "misc_os", type: "string"}, +]; diff --git a/packages/beacon-node/test/unit/monitoring/service.test.ts b/packages/beacon-node/test/unit/monitoring/service.test.ts new file mode 100644 index 000000000000..34da774a1585 --- /dev/null +++ b/packages/beacon-node/test/unit/monitoring/service.test.ts @@ -0,0 +1,266 @@ +import {expect} from "chai"; +import sinon from "sinon"; +import {ErrorAborted, ILogger, TimeoutError} from "@lodestar/utils"; +import {RegistryMetricCreator} from "../../../src/index.js"; +import {HistogramExtra} from "../../../src/metrics/utils/histogram.js"; +import {MonitoringService} from "../../../src/monitoring/service.js"; +import {createStubbedLogger} from "../../utils/mocks/logger.js"; +import {MonitoringOptions} from "../../../src/monitoring/options.js"; +import {sleep} from "../../utils/sleep.js"; +import {startRemoteService, remoteServiceRoutes, remoteServiceError} from "./remoteService.js"; + +describe("monitoring / service", () => { + const sandbox = sinon.createSandbox(); + const endpoint = "https://test.example.com/api/v1/client/metrics"; + + let register: RegistryMetricCreator; + let logger: ILogger; + + beforeEach(() => { + // recreate to avoid "metric has already been registered" errors + register = new RegistryMetricCreator(); + logger = createStubbedLogger(); + }); + + after(() => { + sandbox.restore(); + }); + + describe("MonitoringService - constructor", () => { + it("should return an instance of the monitoring service", () => { + const service = new MonitoringService("beacon", {endpoint}, {register, logger}); + + expect(service.start).to.be.a("function"); + expect(service.stop).to.be.a("function"); + expect(service.send).to.be.a("function"); + }); + + it("should register metrics for collecting and sending data", () => { + new MonitoringService("beacon", {endpoint}, {register, logger}); + + expect(register.getSingleMetric("lodestar_monitoring_collect_data_seconds")).to.be.instanceOf(HistogramExtra); + expect(register.getSingleMetric("lodestar_monitoring_send_data_seconds")).to.be.instanceOf(HistogramExtra); + }); + + it("should log a warning message if insecure monitoring endpoint is provided ", () => { + const insecureEndpoint = "http://test.example.com/api/v1/client/metrics"; + + new MonitoringService("beacon", {endpoint: insecureEndpoint}, {register, logger}); + + expect(logger.warn).to.have.been.calledWith( + "Insecure monitoring endpoint, please make sure to always use a HTTPS connection in production" + ); + }); + + it("should throw an error if monitoring endpoint is not provided", () => { + expect(() => new MonitoringService("beacon", {endpoint: ""}, {register, logger})).to.throw( + "Monitoring endpoint must be provided" + ); + }); + + it("should throw an error if monitoring endpoint is not a valid URL", () => { + expect(() => new MonitoringService("beacon", {endpoint: "invalid"}, {register, logger})).to.throw( + "Monitoring endpoint must be a valid URL" + ); + }); + }); + + describe("MonitoringService - start", () => { + it("should set the status to started", async () => { + const service = await startedMonitoringService(); + + expect(service["status"]).to.equal("started"); + }); + + it("should set interval to continuously send client stats", async () => { + const setInterval = sandbox.spy(global, "setInterval"); + + const service = await startedMonitoringService(); + + expect(setInterval).to.have.been.calledOnce; + expect(service["monitoringInterval"]).to.be.an("object"); + }); + + it("should send client stats after initial delay", async () => { + const service = await startedMonitoringService(); + + expect(service.send).to.have.been.calledOnce; + }); + + it("should send client stats after interval", async () => { + const interval = 10; + + const service = await startedMonitoringService({interval}); + + // wait for interval to be executed + await sleep(interval); + + expect(service.send).to.have.been.calledTwice; + }); + + it("should log an info message that service was started", async () => { + await startedMonitoringService(); + + expect(logger.info).to.have.been.calledWith("Started monitoring service"); + }); + + it("should not send client stats if service is already started", async () => { + const service = await startedMonitoringService(); + + // invoke start a second time + service.start(); + await waitForStart(); + + expect(service.send).to.have.been.calledOnce; + }); + }); + + describe("MonitoringService - stop", () => { + it("should set the status to stopped", async () => { + const service = await startedMonitoringService(); + + service.stop(); + + expect(service["status"]).to.equal("stopped"); + }); + + it("should clear the monitoring interval", async () => { + const clearInterval = sandbox.spy(global, "clearInterval"); + + const service = await startedMonitoringService(); + + service.stop(); + + expect(clearInterval).to.have.been.calledOnceWith(service["monitoringInterval"]); + }); + + it("should clear the initial delay timeout", async () => { + const clearTimeout = sandbox.spy(global, "clearTimeout"); + + const service = await startedMonitoringService({initialDelay: 1000}); + + service.stop(); + + expect(clearTimeout).to.have.been.calledOnceWith(service["initialDelayTimeout"]); + }); + + it("should abort pending requests", async () => { + const service = await startedMonitoringService(); + service["pendingRequest"] = Promise.resolve(); + + service.stop(); + + expect(service["fetchAbortController"]?.abort).to.have.been.calledOnce; + }); + }); + + describe("MonitoringService - send", () => { + let remoteServiceUrl: URL; + let baseUrl: string; + + before(async () => { + ({baseUrl: remoteServiceUrl} = await startRemoteService()); + // get base URL from origin to remove trailing slash + baseUrl = remoteServiceUrl.origin; + }); + + (["beacon", "validator"] as const).forEach((client) => { + it(`should collect and send ${client} stats to remote service`, async () => { + const endpoint = `${baseUrl}${remoteServiceRoutes.success}`; + const service = new MonitoringService(client, {endpoint, collectSystemStats: true}, {register, logger}); + + await service.send(); + + // Validation of sent data happens inside the mocked remote service + // which returns a 500 error if data does not match expected schema. + // Fail test if error was logged due to a 500 response. + expect(logger.error).to.not.have.been.calledOnce; + }); + }); + + it("should properly handle remote service errors", async () => { + const endpoint = `${baseUrl}${remoteServiceRoutes.error}`; + const service = new MonitoringService("beacon", {endpoint, collectSystemStats: false}, {register, logger}); + + await service.send(); + + assertError({message: remoteServiceError.status}); + }); + + it("should properly handle errors if remote service is unreachable", async () => { + const differentPort = Number(remoteServiceUrl.port) - 1; + const endpoint = `http://127.0.0.1:${differentPort}`; + const service = new MonitoringService("beacon", {endpoint}, {register, logger}); + + await service.send(); + + assertError({name: "FetchError"}); + }); + + it("should abort pending requests if timeout is reached", async () => { + const endpoint = `${baseUrl}${remoteServiceRoutes.pending}`; + const service = new MonitoringService( + "beacon", + {endpoint, requestTimeout: 10, collectSystemStats: false}, + {register, logger} + ); + + await service.send(); + + assertError({message: new TimeoutError(`reached for request to ${remoteServiceUrl.host}`).message}); + }); + + it("should abort pending requests if monitoring service is stopped", (done) => { + const endpoint = `${baseUrl}${remoteServiceRoutes.pending}`; + const service = new MonitoringService("beacon", {endpoint, collectSystemStats: false}, {register, logger}); + service.start(); + + service.send().finally(() => { + try { + assertError({message: new ErrorAborted(`request to ${remoteServiceUrl.host}`).message}); + done(); + } catch (e) { + done(e); + } + }); + + // wait for request to be sent before stopping + setTimeout(() => service.stop(), 10); + }); + + function assertError(error: {name?: string; message?: string}): void { + expect(logger.error).to.have.been.calledOnce; + // errors are not thrown and need to be asserted based on the error log + expect(logger.error).to.have.been.calledWithMatch("Failed to send client stats", {}, error); + } + }); + + function stubbedMonitoringService(options: Partial = {}): MonitoringService { + const service = new MonitoringService( + "beacon", + {endpoint, initialDelay: 0, ...options}, + {register: new RegistryMetricCreator(), logger} + ); + service.send = sandbox.stub(); + service["fetchAbortController"] = sandbox.createStubInstance(AbortController); + + return service; + } + + async function startedMonitoringService(options: Partial = {}): Promise { + const service = stubbedMonitoringService(options); + service.start(); + + // ensure start is finished + await waitForStart(); + + after(service.stop); + + return service; + } + + async function waitForStart(): Promise { + // value of 0 seems to do the job + await sleep(0); + } +}); diff --git a/packages/cli/src/cmds/dev/handler.ts b/packages/cli/src/cmds/dev/handler.ts index 4b7d70c31f63..a48248f33865 100644 --- a/packages/cli/src/cmds/dev/handler.ts +++ b/packages/cli/src/cmds/dev/handler.ts @@ -80,6 +80,7 @@ export async function devHandler(args: IDevArgs & IGlobalArgs): Promise { // Note: recycle entire validator handler: // - keystore handling // - metrics + // - monitoring // - keymanager server await validatorHandler(args); } diff --git a/packages/cli/src/cmds/validator/handler.ts b/packages/cli/src/cmds/validator/handler.ts index 4ac87952c7fc..f73e4a4c982e 100644 --- a/packages/cli/src/cmds/validator/handler.ts +++ b/packages/cli/src/cmds/validator/handler.ts @@ -9,14 +9,14 @@ import { BuilderSelection, } from "@lodestar/validator"; import {getMetrics, MetricsRegister} from "@lodestar/validator"; -import {RegistryMetricCreator, collectNodeJSMetrics, HttpMetricsServer} from "@lodestar/beacon-node"; +import {RegistryMetricCreator, collectNodeJSMetrics, HttpMetricsServer, MonitoringService} from "@lodestar/beacon-node"; import {getBeaconConfigFromArgs} from "../../config/index.js"; import {IGlobalArgs} from "../../options/index.js"; import {YargsError, getDefaultGraffiti, mkdir, getCliLogger, cleanOldLogFiles} from "../../util/index.js"; import {onGracefulShutdown, parseFeeRecipient, parseProposerConfig} from "../../util/index.js"; import {getVersionData} from "../../util/version.js"; import {getAccountPaths, getValidatorPaths} from "./paths.js"; -import {IValidatorCliArgs, validatorMetricsDefaultOptions} from "./options.js"; +import {IValidatorCliArgs, validatorMetricsDefaultOptions, validatorMonitoringDefaultOptions} from "./options.js"; import {getSignersFromArgs} from "./signers/index.js"; import {logSigners} from "./signers/logSigners.js"; import {KeymanagerApi} from "./keymanager/impl.js"; @@ -127,6 +127,29 @@ export async function validatorHandler(args: IValidatorCliArgs & IGlobalArgs): P await metricsServer.start(); } + if (args["monitoring.endpoint"]) { + if (register == null) { + throw new Error("Metrics must be enabled to use monitoring"); + } + + const {interval, initialDelay, requestTimeout, collectSystemStats} = validatorMonitoringDefaultOptions; + + const monitoring = new MonitoringService( + "validator", + { + endpoint: args["monitoring.endpoint"], + interval: args["monitoring.interval"] ?? interval, + initialDelay: args["monitoring.initialDelay"] ?? initialDelay, + requestTimeout: args["monitoring.requestTimeout"] ?? requestTimeout, + collectSystemStats: args["monitoring.collectSystemStats"] ?? collectSystemStats, + }, + {register, logger} + ); + + onGracefulShutdownCbs.push(() => monitoring.stop()); + monitoring.start(); + } + // This promise resolves once genesis is available. // It will wait for genesis, so this promise can be potentially very long diff --git a/packages/cli/src/cmds/validator/options.ts b/packages/cli/src/cmds/validator/options.ts index e743c2cb545f..a99957d643f1 100644 --- a/packages/cli/src/cmds/validator/options.ts +++ b/packages/cli/src/cmds/validator/options.ts @@ -17,6 +17,13 @@ export const validatorMetricsDefaultOptions = { address: "127.0.0.1", }; +export const validatorMonitoringDefaultOptions = { + interval: 60_000, + initialDelay: 30_000, + requestTimeout: 10_000, + collectSystemStats: false, +}; + // Defined as variable to not set yargs.default to an array export const DEFAULT_BEACON_NODE_URL = ""; @@ -52,6 +59,12 @@ export type IValidatorCliArgs = AccountValidatorArgs & metrics?: boolean; "metrics.port"?: number; "metrics.address"?: string; + + "monitoring.endpoint"?: string; + "monitoring.interval"?: number; + "monitoring.initialDelay"?: number; + "monitoring.requestTimeout"?: number; + "monitoring.collectSystemStats"?: boolean; }; export type KeymanagerArgs = { @@ -287,6 +300,47 @@ export const validatorOptions: ICliCommandOptions = { group: "metrics", }, + // Monitoring + + "monitoring.endpoint": { + type: "string", + description: + "Enables monitoring service for sending clients stats to the specified endpoint of a remote service (e.g. beaconcha.in). It is required that metrics are also enabled by supplying the --metrics flag.", + group: "monitoring", + }, + + "monitoring.interval": { + type: "number", + description: "Interval in milliseconds between sending client stats to the remote service", + defaultDescription: String(validatorMonitoringDefaultOptions.interval), + group: "monitoring", + }, + + "monitoring.initialDelay": { + type: "number", + description: "Initial delay in milliseconds before client stats are sent to the remote service", + defaultDescription: String(validatorMonitoringDefaultOptions.initialDelay), + group: "monitoring", + hidden: true, + }, + + "monitoring.requestTimeout": { + type: "number", + description: "Timeout in milliseconds for sending client stats to the remote service", + defaultDescription: String(validatorMonitoringDefaultOptions.requestTimeout), + group: "monitoring", + hidden: true, + }, + + "monitoring.collectSystemStats": { + type: "boolean", + description: + "Enable collecting system stats. This should only be enabled if validator client and beacon node are running on different hosts.", + defaultDescription: String(validatorMonitoringDefaultOptions.collectSystemStats), + group: "monitoring", + hidden: true, + }, + // For testing only interopIndexes: { diff --git a/packages/cli/src/options/beaconNodeOptions/index.ts b/packages/cli/src/options/beaconNodeOptions/index.ts index 32122f80f142..d92910830314 100644 --- a/packages/cli/src/options/beaconNodeOptions/index.ts +++ b/packages/cli/src/options/beaconNodeOptions/index.ts @@ -7,6 +7,7 @@ import * as chain from "./chain.js"; import * as eth1 from "./eth1.js"; import * as execution from "./execution.js"; import * as metrics from "./metrics.js"; +import * as monitoring from "./monitoring.js"; import * as network from "./network.js"; import * as sync from "./sync.js"; @@ -16,6 +17,7 @@ export type IBeaconNodeArgs = api.IApiArgs & execution.ExecutionEngineArgs & builder.ExecutionBuilderArgs & metrics.IMetricsArgs & + monitoring.IMonitoringArgs & network.INetworkArgs & sync.ISyncArgs; @@ -29,6 +31,7 @@ export function parseBeaconNodeArgs(args: IBeaconNodeArgs): RecursivePartial = { + "monitoring.endpoint": { + type: "string", + description: + "Enables monitoring service for sending clients stats to the specified endpoint of a remote service (e.g. beaconcha.in). It is required that metrics are also enabled by supplying the --metrics flag.", + group: "monitoring", + }, + + "monitoring.interval": { + type: "number", + description: "Interval in milliseconds between sending client stats to the remote service", + defaultDescription: String(defaultOptions.monitoring.interval), + group: "monitoring", + }, + + "monitoring.initialDelay": { + type: "number", + description: "Initial delay in milliseconds before client stats are sent to the remote service", + defaultDescription: String(defaultOptions.monitoring.initialDelay), + group: "monitoring", + hidden: true, + }, + + "monitoring.requestTimeout": { + type: "number", + description: "Timeout in milliseconds for sending client stats to the remote service", + defaultDescription: String(defaultOptions.monitoring.requestTimeout), + group: "monitoring", + hidden: true, + }, + + "monitoring.collectSystemStats": { + type: "boolean", + description: + "Enable collecting system stats. By default, the beacon node will collect system stats but this can also be handled by the validator client.", + defaultDescription: String(defaultOptions.monitoring.collectSystemStats), + group: "monitoring", + hidden: true, + }, +}; diff --git a/packages/cli/test/unit/options/beaconNodeOptions.test.ts b/packages/cli/test/unit/options/beaconNodeOptions.test.ts index 3d3194015889..9a8ea63661a8 100644 --- a/packages/cli/test/unit/options/beaconNodeOptions.test.ts +++ b/packages/cli/test/unit/options/beaconNodeOptions.test.ts @@ -52,6 +52,12 @@ describe("options / beaconNodeOptions", () => { "metrics.port": 8765, "metrics.address": "0.0.0.0", + "monitoring.endpoint": "https://beaconcha.in/api/v1/client/metrics?apikey=secretKey&machine=machine1", + "monitoring.interval": 60000, + "monitoring.initialDelay": 30000, + "monitoring.requestTimeout": 10000, + "monitoring.collectSystemStats": true, + discv5: true, listenAddress: "127.0.0.1", port: 9001, @@ -132,6 +138,13 @@ describe("options / beaconNodeOptions", () => { port: 8765, address: "0.0.0.0", }, + monitoring: { + endpoint: "https://beaconcha.in/api/v1/client/metrics?apikey=secretKey&machine=machine1", + interval: 60000, + initialDelay: 30000, + requestTimeout: 10000, + collectSystemStats: true, + }, network: { discv5: { enabled: true,