From d04e85f4ce6b82b989a07087cf20fdd4c984573b Mon Sep 17 00:00:00 2001 From: Carlos Bermudez Porto <43155355+cbermudez97@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:15:06 -0400 Subject: [PATCH] feat: add generic prometheus endpoints (#209) - feat: add execution and additional metrics jobs to prometheus service - feat: add beacon metrics gazer to prometheus - fix: ignore not defined metrics info --- main.star | 57 +++++---- .../beacon_metrics_gazer_launcher.star | 18 ++- src/prometheus/prometheus_launcher.star | 117 ++++++++++++++++-- .../prometheus-config/prometheus.yml.tmpl | 27 ++-- 4 files changed, 168 insertions(+), 51 deletions(-) diff --git a/main.star b/main.star index 47ee3795e..143dc8e67 100644 --- a/main.star +++ b/main.star @@ -83,6 +83,7 @@ def run(plan, args={}): prometheus_config_template = read_file( static_files.PROMETHEUS_CONFIG_TEMPLATE_FILEPATH ) + prometheus_additional_metrics_jobs = [] plan.print("Read the prometheus, grafana templates") @@ -229,6 +230,7 @@ def run(plan, args={}): if not args_with_right_defaults.launch_additional_services: return + launch_prometheus_grafana = False for additional_service in args_with_right_defaults.additional_services: if additional_service == "tx_spammer": plan.print("Launching transaction spammer") @@ -283,12 +285,18 @@ def run(plan, args={}): beacon_metrics_gazer_config_template = read_file( static_files.BEACON_METRICS_GAZER_CONFIG_TEMPLATE_FILEPATH ) - beacon_metrics_gazer.launch_beacon_metrics_gazer( - plan, - beacon_metrics_gazer_config_template, - all_cl_client_contexts, - args_with_right_defaults.participants, - network_params, + beacon_metrics_gazer_prometheus_metrics_job = ( + beacon_metrics_gazer.launch_beacon_metrics_gazer( + plan, + beacon_metrics_gazer_config_template, + all_cl_client_contexts, + args_with_right_defaults.participants, + network_params, + ) + ) + launch_prometheus_grafana = True + prometheus_additional_metrics_jobs.append( + beacon_metrics_gazer_prometheus_metrics_job ) 
plan.print("Succesfully launched beacon metrics gazer") elif additional_service == "light_beaconchain_explorer": @@ -301,25 +309,28 @@ def run(plan, args={}): ) plan.print("Succesfully light-beaconchain-explorer") elif additional_service == "prometheus_grafana": - plan.print("Launching prometheus...") - prometheus_private_url = prometheus.launch_prometheus( - plan, - prometheus_config_template, - all_cl_client_contexts, - all_el_client_contexts, - ) - plan.print("Successfully launched Prometheus") - - plan.print("Launching grafana...") - grafana.launch_grafana( - plan, - grafana_datasource_config_template, - grafana_dashboards_config_template, - prometheus_private_url, - ) - plan.print("Succesfully launched grafana") + # Allow prometheus to be launched last so it is able to collect metrics from other services + launch_prometheus_grafana = True else: fail("Invalid additional service %s" % (additional_service)) + if launch_prometheus_grafana: + plan.print("Launching prometheus...") + prometheus_private_url = prometheus.launch_prometheus( + plan, + prometheus_config_template, + all_el_client_contexts, + all_cl_client_contexts, + prometheus_additional_metrics_jobs, + ) + + plan.print("Launching grafana...") + grafana.launch_grafana( + plan, + grafana_datasource_config_template, + grafana_dashboards_config_template, + prometheus_private_url, + ) + plan.print("Succesfully launched grafana") if args_with_right_defaults.wait_for_finalization: plan.print("Waiting for the first finalized epoch") diff --git a/src/beacon_metrics_gazer/beacon_metrics_gazer_launcher.star b/src/beacon_metrics_gazer/beacon_metrics_gazer_launcher.star index 316aebed3..3345eb833 100644 --- a/src/beacon_metrics_gazer/beacon_metrics_gazer_launcher.star +++ b/src/beacon_metrics_gazer/beacon_metrics_gazer_launcher.star @@ -1,6 +1,9 @@ shared_utils = import_module( "github.com/kurtosis-tech/ethereum-package/src/shared_utils/shared_utils.star" ) +prometheus = import_module( + 
"github.com/kurtosis-tech/ethereum-package/src/prometheus/prometheus_launcher.star" +) SERVICE_NAME = "beacon-metrics-gazer" @@ -9,6 +12,8 @@ IMAGE_NAME = "ethpandaops/beacon-metrics-gazer:master" HTTP_PORT_ID = "http" HTTP_PORT_NUMBER = 8080 +METRICS_PATH = "/metrics" + BEACON_METRICS_GAZER_CONFIG_FILENAME = "validator-ranges.yaml" BEACON_METRICS_GAZER_CONFIG_MOUNT_DIRPATH_ON_SERVICE = "/config" @@ -59,7 +64,18 @@ def launch_beacon_metrics_gazer( cl_client_contexts[0].http_port_num, ) - plan.add_service(SERVICE_NAME, config) + beacon_metrics_gazer_service = plan.add_service(SERVICE_NAME, config) + + return prometheus.new_metrics_job( + job_name=SERVICE_NAME, + endpoint="{0}:{1}".format( + beacon_metrics_gazer_service.ip_address, HTTP_PORT_NUMBER + ), + metrics_path=METRICS_PATH, + labels={ + "service": SERVICE_NAME, + }, + ) def get_config(config_files_artifact_name, ip_addr, http_port_num): diff --git a/src/prometheus/prometheus_launcher.star b/src/prometheus/prometheus_launcher.star index a4f73f822..66e417acc 100644 --- a/src/prometheus/prometheus_launcher.star +++ b/src/prometheus/prometheus_launcher.star @@ -4,6 +4,14 @@ shared_utils = import_module( SERVICE_NAME = "prometheus" +EXECUTION_CLIENT_TYPE = "execution" +BEACON_CLIENT_TYPE = "beacon" +VALIDATOR_CLIENT_TYPE = "validator" + +METRICS_INFO_NAME_KEY = "name" +METRICS_INFO_URL_KEY = "url" +METRICS_INFO_PATH_KEY = "path" + # TODO(old) I'm not sure if we should use latest version or ping an specific version instead IMAGE_NAME = "prom/prometheus:latest" @@ -22,17 +30,18 @@ USED_PORTS = { } -def launch_prometheus(plan, config_template, cl_client_contexts, el_client_contexts): - all_nodes_metrics_info = [] - for client in cl_client_contexts: - all_nodes_metrics_info.extend(client.cl_nodes_metrics_info) - - for client in el_client_contexts: - # etheruemjs doesn't populate metrics just yet - if client.el_metrics_info != [None]: - all_nodes_metrics_info.extend(client.el_metrics_info) - - template_data = 
new_config_template_data(all_nodes_metrics_info) +def launch_prometheus( + plan, + config_template, + el_client_contexts, + cl_client_contexts, + additional_metrics_jobs, +): + template_data = new_config_template_data( + el_client_contexts, + cl_client_contexts, + additional_metrics_jobs, + ) template_and_data = shared_utils.new_template_and_data( config_template, template_data ) @@ -75,5 +84,87 @@ def get_config(config_files_artifact_name): ) -def new_config_template_data(cl_nodes_metrics_info): - return {"CLNodesMetricsInfo": cl_nodes_metrics_info} +def new_config_template_data( + el_client_contexts, + cl_client_contexts, + additional_metrics_jobs, +): + metrics_jobs = [] + # Adding execution clients metrics jobs + for context in el_client_contexts: + if len(context.el_metrics_info) >= 1 and context.el_metrics_info[0] != None: + execution_metrics_info = context.el_metrics_info[0] + metrics_jobs.append( + new_metrics_job( + job_name=execution_metrics_info[METRICS_INFO_NAME_KEY], + endpoint=execution_metrics_info[METRICS_INFO_URL_KEY], + metrics_path=execution_metrics_info[METRICS_INFO_PATH_KEY], + labels={ + "service": context.service_name, + "client_type": EXECUTION_CLIENT_TYPE, + "client_name": context.client_name, + }, + ) + ) + # Adding consensus clients metrics jobs + for context in cl_client_contexts: + if ( + len(context.cl_nodes_metrics_info) >= 1 + and context.cl_nodes_metrics_info[0] != None + ): + # Adding beacon node metrics + beacon_metrics_info = context.cl_nodes_metrics_info[0] + metrics_jobs.append( + new_metrics_job( + job_name=beacon_metrics_info[METRICS_INFO_NAME_KEY], + endpoint=beacon_metrics_info[METRICS_INFO_URL_KEY], + metrics_path=beacon_metrics_info[METRICS_INFO_PATH_KEY], + labels={ + "service": context.beacon_service_name, + "client_type": BEACON_CLIENT_TYPE, + "client_name": context.client_name, + }, + ) + ) + if ( + len(context.cl_nodes_metrics_info) >= 2 + and context.cl_nodes_metrics_info[1] != None + ): + # Adding validator node 
metrics + validator_metrics_info = context.cl_nodes_metrics_info[1] + metrics_jobs.append( + new_metrics_job( + job_name=validator_metrics_info[METRICS_INFO_NAME_KEY], + endpoint=validator_metrics_info[METRICS_INFO_URL_KEY], + metrics_path=validator_metrics_info[METRICS_INFO_PATH_KEY], + labels={ + "service": context.validator_service_name, + "client_type": VALIDATOR_CLIENT_TYPE, + "client_name": context.client_name, + }, + ) + ) + # Adding additional metrics jobs + for job in additional_metrics_jobs: + if job == None: + continue + metrics_jobs.append(job) + return { + "MetricsJobs": metrics_jobs, + } + + +def new_metrics_job( + job_name, + endpoint, + metrics_path, + labels, + scrape_interval="15s", +): + return { + "Name": job_name, + "Endpoint": endpoint, + "MetricsPath": metrics_path, + "Labels": labels, + "ScrapeInterval": scrape_interval, + } diff --git a/static_files/prometheus-config/prometheus.yml.tmpl b/static_files/prometheus-config/prometheus.yml.tmpl index ff4058fc8..6e65bb500 100644 --- a/static_files/prometheus-config/prometheus.yml.tmpl +++ b/static_files/prometheus-config/prometheus.yml.tmpl @@ -1,16 +1,15 @@ global: - scrape_interval: 15s # By default, scrape targets every 15 seconds. - -# A scrape configuration containing exactly one endpoint to scrape: -# Here it's Prometheus itself. 
+ scrape_interval: 15s scrape_configs: - {{ range $clNode := .CLNodesMetricsInfo }} - - job_name: '{{ $clNode.name }}' - metrics_path: {{ $clNode.path }} - static_configs: - - targets: ['{{ $clNode.url }}'] - {{ end }} - - job_name: 'beacon-metrics-gazer' - metrics_path: '/metrics' - static_configs: - - targets: ['beacon-metrics-gazer:8080'] + {{- range $job := .MetricsJobs }} + - job_name: "{{ $job.Name }}" + metrics_path: "{{ $job.MetricsPath }}" + {{- if $job.ScrapeInterval }} + scrape_interval: {{ $job.ScrapeInterval }} + {{- end }} + static_configs: + - targets: ['{{ $job.Endpoint }}'] + labels:{{ range $labelName, $labelValue := $job.Labels }} + {{ $labelName }}: "{{ $labelValue }}" + {{- end }} + {{- end }}