From 519a8b615411d1779dc4d7d34ffa5c2084e34a0f Mon Sep 17 00:00:00 2001 From: Santiago Palladino Date: Fri, 17 Jan 2025 14:46:35 -0300 Subject: [PATCH] chore: Add OTEL_EXCLUDE_METRICS Adds an env var to allow excluding certain metrics from being exported to reduce noisiness and cost. Metrics are defined by prefix and comma-separated. --- .../aztec-network/templates/boot-node.yaml | 2 + spartan/aztec-network/templates/faucet.yaml | 2 + .../aztec-network/templates/prover-agent.yaml | 2 + .../templates/prover-broker.yaml | 2 + .../aztec-network/templates/prover-node.yaml | 2 + spartan/aztec-network/templates/pxe.yaml | 2 + .../templates/transaction-bot.yaml | 2 + .../aztec-network/templates/validator.yaml | 2 + spartan/aztec-network/values.yaml | 9 +++++ yarn-project/foundation/src/config/env_var.ts | 1 + yarn-project/telemetry-client/src/config.ts | 13 +++++++ yarn-project/telemetry-client/src/otel.ts | 8 +++- .../src/otel_filter_metric_exporter.ts | 38 +++++++++++++++++++ 13 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 yarn-project/telemetry-client/src/otel_filter_metric_exporter.ts diff --git a/spartan/aztec-network/templates/boot-node.yaml b/spartan/aztec-network/templates/boot-node.yaml index 3072c9d4de1..2b96d621400 100644 --- a/spartan/aztec-network/templates/boot-node.yaml +++ b/spartan/aztec-network/templates/boot-node.yaml @@ -243,6 +243,8 @@ spec: value: "{{ .Values.storage.worldStateMapSize }}" - name: USE_GCLOUD_OBSERVABILITY value: "{{ .Values.telemetry.useGcloudObservability }}" + - name: OTEL_EXCLUDE_METRICS + value: "{{ .Values.bootNode.otelExcludeMetrics }}" ports: - containerPort: {{ .Values.bootNode.service.nodePort }} - containerPort: {{ .Values.bootNode.service.p2pTcpPort }} diff --git a/spartan/aztec-network/templates/faucet.yaml b/spartan/aztec-network/templates/faucet.yaml index 4a3b492113f..54565bd03ac 100644 --- a/spartan/aztec-network/templates/faucet.yaml +++ b/spartan/aztec-network/templates/faucet.yaml @@ -86,6 +86,8 
@@ spec: value: faucet - name: USE_GCLOUD_OBSERVABILITY value: "{{ .Values.telemetry.useGcloudObservability }}" + - name: OTEL_EXCLUDE_METRICS + value: "{{ .Values.faucet.otelExcludeMetrics }}" ports: - name: http containerPort: {{ .Values.faucet.service.nodePort }} diff --git a/spartan/aztec-network/templates/prover-agent.yaml b/spartan/aztec-network/templates/prover-agent.yaml index 744b2bf2fcf..e68994c00be 100644 --- a/spartan/aztec-network/templates/prover-agent.yaml +++ b/spartan/aztec-network/templates/prover-agent.yaml @@ -105,6 +105,8 @@ spec: value: service.name={{ .Release.Name }},service.namespace={{ .Release.Namespace }},service.version={{ .Chart.AppVersion }},environment={{ .Values.environment | default "production" }} - name: USE_GCLOUD_OBSERVABILITY value: "{{ .Values.telemetry.useGcloudObservability }}" + - name: OTEL_EXCLUDE_METRICS + value: "{{ .Values.proverAgent.otelExcludeMetrics }}" resources: {{- toYaml .Values.proverAgent.resources | nindent 12 }} {{- end }} diff --git a/spartan/aztec-network/templates/prover-broker.yaml b/spartan/aztec-network/templates/prover-broker.yaml index 84eef388993..4d5d82de8cb 100644 --- a/spartan/aztec-network/templates/prover-broker.yaml +++ b/spartan/aztec-network/templates/prover-broker.yaml @@ -108,6 +108,8 @@ spec: value: service.name={{ .Release.Name }},service.namespace={{ .Release.Namespace }},service.version={{ .Chart.AppVersion }},environment={{ .Values.environment | default "production" }} - name: USE_GCLOUD_OBSERVABILITY value: "{{ .Values.telemetry.useGcloudObservability }}" + - name: OTEL_EXCLUDE_METRICS + value: "{{ .Values.proverBroker.otelExcludeMetrics }}" resources: {{- toYaml .Values.proverBroker.resources | nindent 12 }} volumes: diff --git a/spartan/aztec-network/templates/prover-node.yaml b/spartan/aztec-network/templates/prover-node.yaml index 6e21c5a9f0c..3df3d60afaa 100644 --- a/spartan/aztec-network/templates/prover-node.yaml +++ b/spartan/aztec-network/templates/prover-node.yaml @@ 
-190,6 +190,8 @@ spec: value: "{{ .Values.storage.worldStateMapSize }}" - name: USE_GCLOUD_OBSERVABILITY value: "{{ .Values.telemetry.useGcloudObservability }}" + - name: OTEL_EXCLUDE_METRICS + value: "{{ .Values.proverNode.otelExcludeMetrics }}" ports: - containerPort: {{ .Values.proverNode.service.nodePort }} - containerPort: {{ .Values.proverNode.service.p2pTcpPort }} diff --git a/spartan/aztec-network/templates/pxe.yaml b/spartan/aztec-network/templates/pxe.yaml index 738c6c5a73b..6922b71e55f 100644 --- a/spartan/aztec-network/templates/pxe.yaml +++ b/spartan/aztec-network/templates/pxe.yaml @@ -105,6 +105,8 @@ spec: value: "{{ .Values.aztec.realProofs }}" - name: USE_GCLOUD_OBSERVABILITY value: "{{ .Values.telemetry.useGcloudObservability }}" + - name: OTEL_EXCLUDE_METRICS + value: "{{ .Values.pxe.otelExcludeMetrics }}" ports: - name: http containerPort: {{ .Values.pxe.service.nodePort }} diff --git a/spartan/aztec-network/templates/transaction-bot.yaml b/spartan/aztec-network/templates/transaction-bot.yaml index 07f682cd857..20494f823cc 100644 --- a/spartan/aztec-network/templates/transaction-bot.yaml +++ b/spartan/aztec-network/templates/transaction-bot.yaml @@ -119,6 +119,8 @@ spec: value: "{{ .Values.bot.stopIfUnhealthy }}" - name: USE_GCLOUD_OBSERVABILITY value: "{{ .Values.telemetry.useGcloudObservability }}" + - name: OTEL_EXCLUDE_METRICS + value: "{{ .Values.bot.otelExcludeMetrics }}" ports: - name: http containerPort: {{ .Values.bot.service.nodePort }} diff --git a/spartan/aztec-network/templates/validator.yaml b/spartan/aztec-network/templates/validator.yaml index 38768f5fe55..39411ad2bf1 100644 --- a/spartan/aztec-network/templates/validator.yaml +++ b/spartan/aztec-network/templates/validator.yaml @@ -211,6 +211,8 @@ spec: value: "{{ .Values.storage.worldStateMapSize }}" - name: USE_GCLOUD_OBSERVABILITY value: "{{ .Values.telemetry.useGcloudObservability }}" + - name: OTEL_EXCLUDE_METRICS + value: "{{ .Values.validator.otelExcludeMetrics }}" ports: 
- containerPort: {{ .Values.validator.service.nodePort }} - containerPort: {{ .Values.validator.service.p2pTcpPort }} diff --git a/spartan/aztec-network/values.yaml b/spartan/aztec-network/values.yaml index 17557fd0559..962081904b3 100644 --- a/spartan/aztec-network/values.yaml +++ b/spartan/aztec-network/values.yaml @@ -85,6 +85,7 @@ bootNode: stakingAssetAddress: "" storageSize: "1Gi" dataDir: "/data" + otelExcludeMetrics: "" validator: # If true, the validator will use its peers to serve as the boot node. @@ -130,6 +131,7 @@ validator: dataDir: "/data" l1FixedPriorityFeePerGas: "" l1GasLimitBufferPercentage: "" + otelExcludeMetrics: "" proverNode: proverPublisherPrivateKey: "0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" @@ -166,6 +168,7 @@ proverNode: intervalMs: 1000 maxParallelRequests: 100 failedProofStore: "gs://aztec-develop/spartan/failed-proofs" + otelExcludeMetrics: "" pxe: logLevel: "debug; info: aztec:simulator, json-rpc" @@ -182,6 +185,7 @@ pxe: requests: memory: "4Gi" cpu: "1" + otelExcludeMetrics: "" bot: enabled: true @@ -211,6 +215,7 @@ bot: requests: memory: "4Gi" cpu: "1" + otelExcludeMetrics: "" ethereum: externalHost: "" @@ -236,6 +241,7 @@ ethereum: cpu: "1" storageSize: "80Gi" deployL1ContractsPrivateKey: + otelExcludeMetrics: "" proverAgent: service: @@ -254,6 +260,7 @@ proverAgent: memory: "4Gi" cpu: "1" pollInterval: 200 + otelExcludeMetrics: "" proverBroker: service: @@ -271,6 +278,7 @@ proverBroker: memory: "4Gi" cpu: "1" maxOldSpaceSize: "3584" + otelExcludeMetrics: "" jobs: deployL1Verifier: @@ -288,3 +296,4 @@ faucet: requests: memory: "2Gi" cpu: "200m" + otelExcludeMetrics: "" diff --git a/yarn-project/foundation/src/config/env_var.ts b/yarn-project/foundation/src/config/env_var.ts index ed21716f5f0..2fef4f1dcac 100644 --- a/yarn-project/foundation/src/config/env_var.ts +++ b/yarn-project/foundation/src/config/env_var.ts @@ -70,6 +70,7 @@ export type EnvVar = | 'OTEL_EXPORTER_OTLP_LOGS_ENDPOINT' | 
'OTEL_SERVICE_NAME' | 'OTEL_COLLECT_INTERVAL_MS' + | 'OTEL_EXCLUDE_METRICS' | 'OTEL_EXPORT_TIMEOUT_MS' | 'OUTBOX_CONTRACT_ADDRESS' | 'P2P_BLOCK_CHECK_INTERVAL_MS' diff --git a/yarn-project/telemetry-client/src/config.ts b/yarn-project/telemetry-client/src/config.ts index 523c47cdff1..f01be5ec461 100644 --- a/yarn-project/telemetry-client/src/config.ts +++ b/yarn-project/telemetry-client/src/config.ts @@ -12,6 +12,7 @@ export interface TelemetryClientConfig { k8sPodUid?: string; k8sPodName?: string; k8sNamespaceName?: string; + otelExcludeMetrics?: string[]; } export const telemetryClientConfigMappings: ConfigMappingsType = { @@ -57,6 +58,18 @@ export const telemetryClientConfigMappings: ConfigMappingsType parseInt(val), }, + otelExcludeMetrics: { + env: 'OTEL_EXCLUDE_METRICS', + description: 'A list of metric prefixes to exclude from export', + parseEnv: (val: string) => + val + ? val + .split(',') + .map(s => s.trim()) + .filter(s => s.length > 0) + : [], + defaultValue: [], + }, k8sPodUid: { env: 'K8S_POD_UID', description: 'The UID of the Kubernetes pod (injected automatically by k8s)', diff --git a/yarn-project/telemetry-client/src/otel.ts b/yarn-project/telemetry-client/src/otel.ts index 66a54390e56..9140827b4be 100644 --- a/yarn-project/telemetry-client/src/otel.ts +++ b/yarn-project/telemetry-client/src/otel.ts @@ -32,6 +32,7 @@ import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from '@opentelemetry/semantic import { type TelemetryClientConfig } from './config.js'; import { EventLoopMonitor } from './event_loop_monitor.js'; import { linearBuckets } from './histogram_utils.js'; +import { OtelFilterMetricExporter } from './otel_filter_metric_exporter.js'; import { registerOtelLoggerProvider } from './otel_logger_provider.js'; import { getOtelResource } from './otel_resource.js'; import { type Gauge, type TelemetryClient } from './telemetry.js'; @@ -247,7 +248,7 @@ export class OpenTelemetryClient implements TelemetryClient { tracerProvider.register(); const 
/**
 * Wraps a {@link PushMetricExporter} and removes metrics by name prefix before they are exported.
 *
 * A metric is dropped when its `descriptor.name` starts with any of the configured prefixes.
 * Scopes that end up with no metrics are dropped too, and whatever remains is handed to the
 * wrapped exporter. `forceFlush`, `shutdown` and, when the wrapped exporter defines them, the
 * aggregation selectors are delegated to the wrapped exporter unchanged.
 */
export class OtelFilterMetricExporter implements PushMetricExporter {
  /** Private, sanitized copy of the prefixes to exclude (trimmed, no blank entries). */
  private readonly excludeMetricPrefixes: readonly string[];

  /**
   * @param exporter - Exporter that receives the filtered batches.
   * @param excludeMetricPrefixes - Metric-name prefixes to exclude. Entries are trimmed and blank
   *   entries are ignored; the array is copied, so later changes by the caller have no effect.
   */
  constructor(private readonly exporter: PushMetricExporter, excludeMetricPrefixes: readonly string[]) {
    // A blank prefix would match every metric (`startsWith('')` is always true) and silently
    // suppress all exports, so such entries are discarded up front.
    this.excludeMetricPrefixes = excludeMetricPrefixes
      .map(prefix => prefix.trim())
      .filter(prefix => prefix.length > 0);

    // Expose the optional aggregation hooks only when the wrapped exporter defines them,
    // bound to it so they run with the wrapped exporter as `this`.
    if (exporter.selectAggregation) {
      (this as PushMetricExporter).selectAggregation = exporter.selectAggregation.bind(exporter);
    }
    if (exporter.selectAggregationTemporality) {
      (this as PushMetricExporter).selectAggregationTemporality = exporter.selectAggregationTemporality.bind(exporter);
    }
  }

  /**
   * Filters the batch and forwards it to the wrapped exporter.
   *
   * The incoming `metrics` object is not modified; a new batch object is built instead.
   *
   * @param metrics - Batch collected for export.
   * @param resultCallback - Passed through to the wrapped exporter untouched.
   */
  public export(metrics: ResourceMetrics, resultCallback: (result: ExportResult) => void): void {
    const scopeMetrics = metrics.scopeMetrics
      .map(scoped => ({ scope: scoped.scope, metrics: this.filterMetrics(scoped.metrics) }))
      .filter(scoped => scoped.metrics.length > 0);

    this.exporter.export({ resource: metrics.resource, scopeMetrics }, resultCallback);
  }

  /** Returns the metrics whose name does not start with any excluded prefix. */
  private filterMetrics(metrics: MetricData[]): MetricData[] {
    return metrics.filter(
      metric => !this.excludeMetricPrefixes.some(prefix => metric.descriptor.name.startsWith(prefix)),
    );
  }

  /** Delegates to the wrapped exporter. */
  public forceFlush(): Promise<void> {
    return this.exporter.forceFlush();
  }

  /** Delegates to the wrapped exporter. */
  public shutdown(): Promise<void> {
    return this.exporter.shutdown();
  }
}