diff --git a/api/v1beta1/flowcollector_types.go b/api/v1beta1/flowcollector_types.go index 18c48d90f..040b1dfec 100644 --- a/api/v1beta1/flowcollector_types.go +++ b/api/v1beta1/flowcollector_types.go @@ -352,13 +352,13 @@ type FLPMetrics struct { // +optional IgnoreTags []string `json:"ignoreTags"` - // `includeList` is a list of metric names to specify which metrics to generate. - // The names correspond to the name in Prometheus, without the prefix. For example, + // `includeList` is a list of metric names to specify which ones to generate. + // The names correspond to the names in Prometheus without the prefix. For example, // `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. - // Available names are: `namespace_egress_bytes_total`, `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, - // `namespace_ingress_packets_total`, `namespace_flows_total`, `node_egress_bytes_total`, `node_egress_packets_total`, - // `node_ingress_bytes_total`, `node_ingress_packets_total`, `node_flows_total`, `workload_egress_bytes_total`, - // `workload_egress_packets_total`, `workload_ingress_bytes_total`, `workload_ingress_packets_total`, `workload_flows_total`. + // Note that the more metrics you add, the bigger is the impact on Prometheus workload resources. + // Metrics enabled by default are: + // `namespace_flows_total`, `node_ingress_bytes_total`, `workload_ingress_bytes_total`, `namespace_drop_packets_total` (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` (when `FlowRTT` feature is enabled). 
+ // More information, with full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md // +optional IncludeList *[]string `json:"includeList,omitempty"` diff --git a/api/v1beta1/flowcollector_webhook_test.go b/api/v1beta1/flowcollector_webhook_test.go index 1617dae0a..2876b6a31 100644 --- a/api/v1beta1/flowcollector_webhook_test.go +++ b/api/v1beta1/flowcollector_webhook_test.go @@ -118,14 +118,14 @@ func TestBeta1ConversionRoundtrip_Metrics(t *testing.T) { assert.Equal([]v1beta2.FLPAlert{v1beta2.AlertLokiError}, converted.Spec.Processor.Metrics.DisableAlerts) assert.NotNil(converted.Spec.Processor.Metrics.IncludeList) - assert.Equal([]string{"namespace_egress_packets_total", "namespace_flows_total"}, *converted.Spec.Processor.Metrics.IncludeList) + assert.Equal([]string{"namespace_egress_packets_total", "namespace_flows_total", "namespace_rtt_seconds", "namespace_drop_packets_total"}, *converted.Spec.Processor.Metrics.IncludeList) // Other way var back FlowCollector err = back.ConvertFrom(&converted) assert.NoError(err) // Here, includeList is preserved; it takes precedence over ignoreTags - assert.Equal([]string{"namespace_egress_packets_total", "namespace_flows_total"}, *back.Spec.Processor.Metrics.IncludeList) + assert.Equal([]string{"namespace_egress_packets_total", "namespace_flows_total", "namespace_rtt_seconds", "namespace_drop_packets_total"}, *back.Spec.Processor.Metrics.IncludeList) assert.Equal(initial.Spec.Processor.Metrics.DisableAlerts, back.Spec.Processor.Metrics.DisableAlerts) assert.Equal(initial.Spec.Processor.Metrics.Server, back.Spec.Processor.Metrics.Server) } diff --git a/api/v1beta2/flowcollector_types.go b/api/v1beta2/flowcollector_types.go index d5e6fc1ad..7839ccdfe 100644 --- a/api/v1beta2/flowcollector_types.go +++ b/api/v1beta2/flowcollector_types.go @@ -344,13 +344,13 @@ type FLPMetrics struct { // +optional Server MetricsServerConfig `json:"server,omitempty"` - // 
`includeList` is a list of metric names to specify which metrics to generate. - // The names correspond to the name in Prometheus, without the prefix. For example, + // `includeList` is a list of metric names to specify which ones to generate. + // The names correspond to the names in Prometheus without the prefix. For example, // `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. - // Available names are: `namespace_egress_bytes_total`, `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, - // `namespace_ingress_packets_total`, `namespace_flows_total`, `node_egress_bytes_total`, `node_egress_packets_total`, - // `node_ingress_bytes_total`, `node_ingress_packets_total`, `node_flows_total`, `workload_egress_bytes_total`, - // `workload_egress_packets_total`, `workload_ingress_bytes_total`, `workload_ingress_packets_total`, `workload_flows_total`. + // Note that the more metrics you add, the bigger is the impact on Prometheus workload resources. + // Metrics enabled by default are: + // `namespace_flows_total`, `node_ingress_bytes_total`, `workload_ingress_bytes_total`, `namespace_drop_packets_total` (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` (when `FlowRTT` feature is enabled). + // More information, with full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md // +optional IncludeList *[]string `json:"includeList,omitempty"` diff --git a/bundle/manifests/flows.netobserv.io_flowcollectors.yaml b/bundle/manifests/flows.netobserv.io_flowcollectors.yaml index 278f34968..a0a7678e5 100644 --- a/bundle/manifests/flows.netobserv.io_flowcollectors.yaml +++ b/bundle/manifests/flows.netobserv.io_flowcollectors.yaml @@ -4827,17 +4827,16 @@ spec: type: array includeList: description: '`includeList` is a list of metric names to specify - which metrics to generate. 
The names correspond to the name - in Prometheus, without the prefix. For example, `namespace_egress_packets_total` + which ones to generate. The names correspond to the names + in Prometheus without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` - in Prometheus. Available names are: `namespace_egress_bytes_total`, - `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, - `namespace_ingress_packets_total`, `namespace_flows_total`, - `node_egress_bytes_total`, `node_egress_packets_total`, - `node_ingress_bytes_total`, `node_ingress_packets_total`, - `node_flows_total`, `workload_egress_bytes_total`, `workload_egress_packets_total`, - `workload_ingress_bytes_total`, `workload_ingress_packets_total`, - `workload_flows_total`.' + in Prometheus. Note that the more metrics you add, the bigger + is the impact on Prometheus workload resources. Metrics + enabled by default are: `namespace_flows_total`, `node_ingress_bytes_total`, + `workload_ingress_bytes_total`, `namespace_drop_packets_total` + (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` + (when `FlowRTT` feature is enabled). More information, with + full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md' items: type: string type: array @@ -7704,17 +7703,16 @@ spec: type: array includeList: description: '`includeList` is a list of metric names to specify - which metrics to generate. The names correspond to the name - in Prometheus, without the prefix. For example, `namespace_egress_packets_total` + which ones to generate. The names correspond to the names + in Prometheus without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` - in Prometheus. 
Available names are: `namespace_egress_bytes_total`, - `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, - `namespace_ingress_packets_total`, `namespace_flows_total`, - `node_egress_bytes_total`, `node_egress_packets_total`, - `node_ingress_bytes_total`, `node_ingress_packets_total`, - `node_flows_total`, `workload_egress_bytes_total`, `workload_egress_packets_total`, - `workload_ingress_bytes_total`, `workload_ingress_packets_total`, - `workload_flows_total`.' + in Prometheus. Note that the more metrics you add, the bigger + is the impact on Prometheus workload resources. Metrics + enabled by default are: `namespace_flows_total`, `node_ingress_bytes_total`, + `workload_ingress_bytes_total`, `namespace_drop_packets_total` + (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` + (when `FlowRTT` feature is enabled). More information, with + full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md' items: type: string type: array diff --git a/config/crd/bases/flows.netobserv.io_flowcollectors.yaml b/config/crd/bases/flows.netobserv.io_flowcollectors.yaml index 72f629672..699acb2e9 100644 --- a/config/crd/bases/flows.netobserv.io_flowcollectors.yaml +++ b/config/crd/bases/flows.netobserv.io_flowcollectors.yaml @@ -4813,17 +4813,16 @@ spec: type: array includeList: description: '`includeList` is a list of metric names to specify - which metrics to generate. The names correspond to the name - in Prometheus, without the prefix. For example, `namespace_egress_packets_total` + which ones to generate. The names correspond to the names + in Prometheus without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` - in Prometheus. 
Available names are: `namespace_egress_bytes_total`, - `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, - `namespace_ingress_packets_total`, `namespace_flows_total`, - `node_egress_bytes_total`, `node_egress_packets_total`, - `node_ingress_bytes_total`, `node_ingress_packets_total`, - `node_flows_total`, `workload_egress_bytes_total`, `workload_egress_packets_total`, - `workload_ingress_bytes_total`, `workload_ingress_packets_total`, - `workload_flows_total`.' + in Prometheus. Note that the more metrics you add, the bigger + is the impact on Prometheus workload resources. Metrics + enabled by default are: `namespace_flows_total`, `node_ingress_bytes_total`, + `workload_ingress_bytes_total`, `namespace_drop_packets_total` + (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` + (when `FlowRTT` feature is enabled). More information, with + full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md' items: type: string type: array @@ -7690,17 +7689,16 @@ spec: type: array includeList: description: '`includeList` is a list of metric names to specify - which metrics to generate. The names correspond to the name - in Prometheus, without the prefix. For example, `namespace_egress_packets_total` + which ones to generate. The names correspond to the names + in Prometheus without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` - in Prometheus. Available names are: `namespace_egress_bytes_total`, - `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, - `namespace_ingress_packets_total`, `namespace_flows_total`, - `node_egress_bytes_total`, `node_egress_packets_total`, - `node_ingress_bytes_total`, `node_ingress_packets_total`, - `node_flows_total`, `workload_egress_bytes_total`, `workload_egress_packets_total`, - `workload_ingress_bytes_total`, `workload_ingress_packets_total`, - `workload_flows_total`.' 
+ in Prometheus. Note that the more metrics you add, the bigger + is the impact on Prometheus workload resources. Metrics + enabled by default are: `namespace_flows_total`, `node_ingress_bytes_total`, + `workload_ingress_bytes_total`, `namespace_drop_packets_total` + (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` + (when `FlowRTT` feature is enabled). More information, with + full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md' items: type: string type: array diff --git a/controllers/flowlogspipeline/flp_test.go b/controllers/flowlogspipeline/flp_test.go index 04981295e..8d61e552d 100644 --- a/controllers/flowlogspipeline/flp_test.go +++ b/controllers/flowlogspipeline/flp_test.go @@ -915,10 +915,13 @@ func TestMergeMetricsConfiguration_Default(t *testing.T) { jsonStages, _ := json.Marshal(stages) assert.Equal(`[{"name":"ipfix"},{"name":"extract_conntrack","follows":"ipfix"},{"name":"enrich","follows":"extract_conntrack"},{"name":"loki","follows":"enrich"},{"name":"stdout","follows":"enrich"},{"name":"prometheus","follows":"enrich"}]`, string(jsonStages)) names := getSortedMetricsNames(parameters[5].Encode.Prom.Metrics) - assert.Len(names, 3) - assert.Equal("namespace_flows_total", names[0]) - assert.Equal("node_ingress_bytes_total", names[1]) - assert.Equal("workload_ingress_bytes_total", names[2]) + assert.Equal([]string{ + "namespace_drop_packets_total", + "namespace_flows_total", + "namespace_rtt_seconds", + "node_ingress_bytes_total", + "workload_ingress_bytes_total", + }, names) assert.Equal("netobserv_", parameters[5].Encode.Prom.Prefix) } diff --git a/docs/FlowCollector.md b/docs/FlowCollector.md index b5f52f3a4..97affae55 100644 --- a/docs/FlowCollector.md +++ b/docs/FlowCollector.md @@ -8556,7 +8556,7 @@ target specifies the target value for the given metric includeList []string - `includeList` is a list of metric names to specify which metrics to generate. 
The names correspond to the name in Prometheus, without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. Available names are: `namespace_egress_bytes_total`, `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, `namespace_ingress_packets_total`, `namespace_flows_total`, `node_egress_bytes_total`, `node_egress_packets_total`, `node_ingress_bytes_total`, `node_ingress_packets_total`, `node_flows_total`, `workload_egress_bytes_total`, `workload_egress_packets_total`, `workload_ingress_bytes_total`, `workload_ingress_packets_total`, `workload_flows_total`.
+ `includeList` is a list of metric names to specify which ones to generate. The names correspond to the names in Prometheus without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. Note that the more metrics you add, the bigger is the impact on Prometheus workload resources. Metrics enabled by default are: `namespace_flows_total`, `node_ingress_bytes_total`, `workload_ingress_bytes_total`, `namespace_drop_packets_total` (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` (when `FlowRTT` feature is enabled). More information, with full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md
false @@ -13717,7 +13717,7 @@ target specifies the target value for the given metric includeList []string - `includeList` is a list of metric names to specify which metrics to generate. The names correspond to the name in Prometheus, without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. Available names are: `namespace_egress_bytes_total`, `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, `namespace_ingress_packets_total`, `namespace_flows_total`, `node_egress_bytes_total`, `node_egress_packets_total`, `node_ingress_bytes_total`, `node_ingress_packets_total`, `node_flows_total`, `workload_egress_bytes_total`, `workload_egress_packets_total`, `workload_ingress_bytes_total`, `workload_ingress_packets_total`, `workload_flows_total`.
+ `includeList` is a list of metric names to specify which ones to generate. The names correspond to the names in Prometheus without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. Note that the more metrics you add, the bigger is the impact on Prometheus workload resources. Metrics enabled by default are: `namespace_flows_total`, `node_ingress_bytes_total`, `workload_ingress_bytes_total`, `namespace_drop_packets_total` (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` (when `FlowRTT` feature is enabled). More information, with full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md
false diff --git a/docs/Metrics.md b/docs/Metrics.md index 599af7d74..461fa67d5 100644 --- a/docs/Metrics.md +++ b/docs/Metrics.md @@ -1,35 +1,39 @@ # Metrics in the NetObserv Operator -Configuration of metrics to be collected are stored in the metrics_definitions folder. -These are defined in yaml files according to the format handled by the flp confgenerator. -The flp confgenerator was modified to produce output that can be easily consumed by the NetObserv Operator. -The flp confgenerator was further modified so that it may be called as a module, and provides its output as a data structure returned from a function rather than a yaml file. -All metrics that may be produced are included in the metrics_definitions library, and they are associated with tags. -A parameter is added to the Operator CRD to specify tags of metrics to not produce. - -On each iteration of the Operator, the Operator checks whether the CRD has been modified. -If the CRD has changed, the Operator reconciles the state of the cluster to the specification in the CRD. - -The implementation of the Operator specifies the flp Network Transform enrichment (in particular, kubernetes features). -The actual metrics to produce are taken from the metrics_definitions, based on the enrichment defined in the Operator. -The Operator allocates the extract_aggregate and encode_prom Stage structures for the flp pipeline, -and extract_aggregate and encode_prom entries are filled in using the results from the confgenerator. -The configuration is placed into a configMap. -Flp is then deployed using this combined configuration. -The configuration is not changed during runtime. -In order to change the configuration (e.g. exclude a different set of metrics), flp must be redeployed. - -Note that there are 2 data paths in flp. Data that is ingested is enriched and is then passed directly to Loki. 
-In addition, after the enrichment, we derive metrics (from the metrics_definitions), aggregate them, and report to prometheus. -The metrics_definitions does not impact the data that is sent to Loki. - -In the metrics_definitions yaml files, there are tags associated with each metric. -A user may specify to skip metrics that have a particular tag. -This is specified by a field in the CRD. -These tags are then specified to the confgenerator module to produce metrics that are not associated with the specified tag. - -## Parameters added to CRD to support metrics -Note: These parameters may be changed between interations, in which case the Operator redeploys flp. -- ignoreMetrics (list of tags to specify which metrics to ignore) - - +The NetObserv operator uses [flowlogs-pipeline](https://github.com/netobserv/flowlogs-pipeline/) to generate metrics out of flow logs. + +They can be configured in the `FlowCollector` custom resource, via `spec.processor.metrics.includeList`. It is a list of metric names that tells which ones to generate. + +The names correspond to the names in Prometheus without their prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. + +Note that the more metrics you add, the bigger the impact on Prometheus workload resources. Some metrics in particular have a higher cardinality, such as all metrics starting with `workload_`, which may result in stressing Prometheus if too many of them are enabled. It is recommended to monitor the impact on Prometheus when adding more metrics.
+ +Available names are: (names followed by `*` are enabled by default) +- `namespace_egress_bytes_total` +- `namespace_egress_packets_total` +- `namespace_ingress_bytes_total` +- `namespace_ingress_packets_total` +- `namespace_flows_total` `*` +- `node_egress_bytes_total` +- `node_egress_packets_total` +- `node_ingress_bytes_total` `*` +- `node_ingress_packets_total` +- `node_flows_total` +- `workload_egress_bytes_total` +- `workload_egress_packets_total` +- `workload_ingress_bytes_total` `*` +- `workload_ingress_packets_total` +- `workload_flows_total` + +When the `PacketDrop` feature is enabled in `spec.agent.ebpf.features` (with privileged mode), additional metrics are available: +- `namespace_drop_bytes_total` +- `namespace_drop_packets_total` `*` +- `node_drop_bytes_total` +- `node_drop_packets_total` +- `workload_drop_bytes_total` +- `workload_drop_packets_total` + +When the `FlowRTT` feature is enabled in `spec.agent.ebpf.features`, additional metrics are available: +- `namespace_rtt_seconds` `*` +- `node_rtt_seconds` +- `workload_rtt_seconds` diff --git a/hack/cloned.flows.netobserv.io_flowcollectors.yaml b/hack/cloned.flows.netobserv.io_flowcollectors.yaml index 6d06de434..d0bf5a406 100644 --- a/hack/cloned.flows.netobserv.io_flowcollectors.yaml +++ b/hack/cloned.flows.netobserv.io_flowcollectors.yaml @@ -3339,7 +3339,7 @@ spec: type: string type: array includeList: - description: '`includeList` is a list of metric names to specify which metrics to generate. The names correspond to the name in Prometheus, without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. 
Available names are: `namespace_egress_bytes_total`, `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, `namespace_ingress_packets_total`, `namespace_flows_total`, `node_egress_bytes_total`, `node_egress_packets_total`, `node_ingress_bytes_total`, `node_ingress_packets_total`, `node_flows_total`, `workload_egress_bytes_total`, `workload_egress_packets_total`, `workload_ingress_bytes_total`, `workload_ingress_packets_total`, `workload_flows_total`.' + description: '`includeList` is a list of metric names to specify which ones to generate. The names correspond to the names in Prometheus without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. Note that the more metrics you add, the bigger is the impact on Prometheus workload resources. Metrics enabled by default are: `namespace_flows_total`, `node_ingress_bytes_total`, `workload_ingress_bytes_total`, `namespace_drop_packets_total` (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` (when `FlowRTT` feature is enabled). More information, with full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md' items: type: string type: array @@ -5320,7 +5320,7 @@ spec: type: string type: array includeList: - description: '`includeList` is a list of metric names to specify which metrics to generate. The names correspond to the name in Prometheus, without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. 
Available names are: `namespace_egress_bytes_total`, `namespace_egress_packets_total`, `namespace_ingress_bytes_total`, `namespace_ingress_packets_total`, `namespace_flows_total`, `node_egress_bytes_total`, `node_egress_packets_total`, `node_ingress_bytes_total`, `node_ingress_packets_total`, `node_flows_total`, `workload_egress_bytes_total`, `workload_egress_packets_total`, `workload_ingress_bytes_total`, `workload_ingress_packets_total`, `workload_flows_total`.' + description: '`includeList` is a list of metric names to specify which ones to generate. The names correspond to the names in Prometheus without the prefix. For example, `namespace_egress_packets_total` will show up as `netobserv_namespace_egress_packets_total` in Prometheus. Note that the more metrics you add, the bigger is the impact on Prometheus workload resources. Metrics enabled by default are: `namespace_flows_total`, `node_ingress_bytes_total`, `workload_ingress_bytes_total`, `namespace_drop_packets_total` (when `PacketDrop` feature is enabled), `namespace_rtt_seconds` (when `FlowRTT` feature is enabled). 
More information, with full list of available metrics: https://github.com/netobserv/network-observability-operator/blob/main/docs/Metrics.md' items: type: string type: array diff --git a/pkg/dashboards/dashboard.go b/pkg/dashboards/dashboard.go index 6f52c7559..76e72ceed 100644 --- a/pkg/dashboards/dashboard.go +++ b/pkg/dashboards/dashboard.go @@ -16,19 +16,21 @@ type rowInfo struct { // Queries const ( - layerApps = "Applications" - layerInfra = "Infrastructure" - appsFilters1 = `SrcK8S_Namespace!~"|$NETOBSERV_NS|openshift.*"` - appsFilters2 = `SrcK8S_Namespace=~"$NETOBSERV_NS|openshift.*",DstK8S_Namespace!~"|$NETOBSERV_NS|openshift.*"` - infraFilters1 = `SrcK8S_Namespace=~"$NETOBSERV_NS|openshift.*"` - infraFilters2 = `SrcK8S_Namespace!~"$NETOBSERV_NS|openshift.*",DstK8S_Namespace=~"$NETOBSERV_NS|openshift.*"` - metricTagNamespaces = "namespaces" - metricTagNodes = "nodes" - metricTagWorkloads = "workloads" - metricTagIngress = "ingress" - metricTagEgress = "egress" - metricTagBytes = "bytes" - metricTagPackets = "packets" + layerApps = "Applications" + layerInfra = "Infrastructure" + appsFilters1 = `SrcK8S_Namespace!~"|$NETOBSERV_NS|openshift.*"` + appsFilters2 = `SrcK8S_Namespace=~"$NETOBSERV_NS|openshift.*",DstK8S_Namespace!~"|$NETOBSERV_NS|openshift.*"` + infraFilters1 = `SrcK8S_Namespace=~"$NETOBSERV_NS|openshift.*"` + infraFilters2 = `SrcK8S_Namespace!~"$NETOBSERV_NS|openshift.*",DstK8S_Namespace=~"$NETOBSERV_NS|openshift.*"` + metricTagNamespaces = "namespaces" + metricTagNodes = "nodes" + metricTagWorkloads = "workloads" + metricTagIngress = "ingress" + metricTagEgress = "egress" + metricTagBytes = "bytes" + metricTagPackets = "packets" + metricTagDropBytes = "drop_bytes" + metricTagDropPackets = "drop_packets" ) var ( @@ -85,6 +87,7 @@ var ( func init() { for _, group := range []string{metricTagNodes, metricTagNamespaces, metricTagWorkloads} { groupTrimmed := strings.TrimSuffix(group, "s") + // byte/pkt rates for _, vt := range []string{metricTagBytes, 
metricTagPackets} { for _, dir := range []string{metricTagEgress, metricTagIngress} { rowsInfo = append(rowsInfo, rowInfo{ @@ -95,6 +98,15 @@ func init() { }) } } + // drops + for _, vt := range []string{metricTagDropBytes, metricTagDropPackets} { + rowsInfo = append(rowsInfo, rowInfo{ + metric: fmt.Sprintf("netobserv_%s_%s_total", groupTrimmed, vt), + group: group, + valueType: vt, + }) + } + // TODO: RTT dashboard (after dashboard refactoring for exposed metrics; need to handle histogram queries) } } @@ -217,6 +229,10 @@ func flowMetricsRow(netobsNs string, rowInfo rowInfo) string { vt = "byte" case metricTagPackets: vt = "packet" + case metricTagDropBytes: + vt = "drop bytes" + case metricTagDropPackets: + vt = "drop packets" } title := fmt.Sprintf("Top %s rates %s per source and destination %s", vt, verb, rowInfo.group) var panels string diff --git a/pkg/dashboards/dashboard_test.go b/pkg/dashboards/dashboard_test.go index 01430c265..87bf99a09 100644 --- a/pkg/dashboards/dashboard_test.go +++ b/pkg/dashboards/dashboard_test.go @@ -18,7 +18,7 @@ func TestCreateFlowMetricsDashboard_All(t *testing.T) { assert.NoError(err) assert.Equal("NetObserv", d.Title) - assert.Len(d.Rows, 12) + assert.Len(d.Rows, 18) // First row row := 0 @@ -28,8 +28,8 @@ func TestCreateFlowMetricsDashboard_All(t *testing.T) { assert.Len(d.Rows[row].Panels[0].Targets, 1) assert.Contains(d.Rows[row].Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(10,sum(rate(netobserv_node_egress_bytes_total[1m])) by (SrcK8S_HostName, DstK8S_HostName))") - // 6th row - row = 5 + // 8th row + row = 7 assert.Equal("Top byte rates received per source and destination namespaces", d.Rows[row].Title) assert.Len(d.Rows[row].Panels, 2) assert.Equal("Applications", d.Rows[row].Panels[0].Title) @@ -42,8 +42,8 @@ func TestCreateFlowMetricsDashboard_All(t *testing.T) { `label_replace(label_replace(topk(10,sum(rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[1m]) or 
rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[1m])) by (SrcK8S_Namespace, DstK8S_Namespace))`, ) - // 12th row - row = 11 + // 16th row + row = 15 assert.Equal("Top packet rates received per source and destination workloads", d.Rows[row].Title) assert.Len(d.Rows[row].Panels, 2) assert.Equal("Applications", d.Rows[row].Panels[0].Title) @@ -88,7 +88,7 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert.NoError(err) assert.Equal("NetObserv", d.Title) - assert.Len(d.Rows, 3) + assert.Len(d.Rows, 4) // First row row := 0 @@ -113,8 +113,8 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[1m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[1m])) by (SrcK8S_Namespace, DstK8S_Namespace))`, ) - // 3rd row - row = 2 + // 4th row + row = 3 assert.Equal("Top byte rates received per source and destination workloads", d.Rows[row].Title) assert.Len(d.Rows[row].Panels, 2) assert.Equal("Applications", d.Rows[row].Panels[0].Title) diff --git a/pkg/metrics/predefined_metrics.go b/pkg/metrics/predefined_metrics.go index c5f49e316..92a9f51c4 100644 --- a/pkg/metrics/predefined_metrics.go +++ b/pkg/metrics/predefined_metrics.go @@ -34,7 +34,13 @@ var ( } predefinedMetrics []taggedMetricDefinition // Note that we set default in-code rather than in CRD, in order to keep track of value being unset or set intentionnally in FlowCollector - DefaultIncludeList = []string{"node_ingress_bytes_total", "workload_ingress_bytes_total", "namespace_flows_total"} + DefaultIncludeList = []string{ + "node_ingress_bytes_total", + "workload_ingress_bytes_total", + "namespace_flows_total", + "namespace_drop_packets_total", + "namespace_rtt_seconds", + } // Pre-deprecation 
default IgnoreTags list (1.4) - used before switching to whitelist approach, // to make sure there is no unintended new metrics being collected // Don't add anything here: this is not meant to evolve @@ -61,7 +67,7 @@ func init() { ValueKey: valueField, Filters: []flpapi.PromMetricsFilter{ {Key: "Duplicate", Value: "false"}, - {Key: "FlowDirection", Value: mapDirection[dir], Type: "regex"}, + {Key: "FlowDirection", Value: mapDirection[dir], Type: flpapi.PromFilterRegex}, }, Labels: labels, }, @@ -78,6 +84,47 @@ func init() { }, tags: []string{group, group + "-flows", "flows"}, }) + // RTT metrics + predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ + PromMetricsItem: flpapi.PromMetricsItem{ + Name: fmt.Sprintf("%s_rtt_seconds", groupTrimmed), + Type: "histogram", + ValueKey: "TimeFlowRttNs", + Filters: []flpapi.PromMetricsFilter{ + {Key: "TimeFlowRttNs", Type: flpapi.PromFilterPresence}, + }, + Labels: labels, + ValueScale: 1_000_000_000, // ns => s + }, + tags: []string{group, "rtt"}, + }) + // Drops metrics + predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ + PromMetricsItem: flpapi.PromMetricsItem{ + Name: fmt.Sprintf("%s_drop_packets_total", groupTrimmed), + Type: "counter", + ValueKey: "PktDropPackets", + Filters: []flpapi.PromMetricsFilter{ + {Key: "Duplicate", Value: "false"}, + {Key: "PktDropPackets", Type: flpapi.PromFilterPresence}, + }, + Labels: labels, + }, + tags: []string{group, tagPackets, "drops"}, + }) + predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ + PromMetricsItem: flpapi.PromMetricsItem{ + Name: fmt.Sprintf("%s_drop_bytes_total", groupTrimmed), + Type: "counter", + ValueKey: "PktDropBytes", + Filters: []flpapi.PromMetricsFilter{ + {Key: "Duplicate", Value: "false"}, + {Key: "PktDropBytes", Type: flpapi.PromFilterPresence}, + }, + Labels: labels, + }, + tags: []string{group, tagBytes, "drop"}, + }) } } diff --git a/pkg/metrics/predefined_metrics_test.go 
b/pkg/metrics/predefined_metrics_test.go index e2a2108ef..eec99b716 100644 --- a/pkg/metrics/predefined_metrics_test.go +++ b/pkg/metrics/predefined_metrics_test.go @@ -11,7 +11,17 @@ func TestIncludeExclude(t *testing.T) { // IgnoreTags set, Include list unset => resolving ignore tags res := GetEnabledNames([]string{"egress", "packets", "flows"}, nil) - assert.Equal([]string{"node_ingress_bytes_total", "namespace_ingress_bytes_total", "workload_ingress_bytes_total"}, res) + assert.Equal([]string{ + "node_ingress_bytes_total", + "node_rtt_seconds", + "node_drop_bytes_total", + "namespace_ingress_bytes_total", + "namespace_rtt_seconds", + "namespace_drop_bytes_total", + "workload_ingress_bytes_total", + "workload_rtt_seconds", + "workload_drop_bytes_total", + }, res) // IgnoreTags set, Include list set => keep include list res = GetEnabledNames([]string{"egress", "packets"}, &[]string{"namespace_flows_total"})