From aa36522952276c029c9b6fef632ad3833d7f3416 Mon Sep 17 00:00:00 2001 From: Tsvetomir Palashki Date: Wed, 2 Feb 2022 15:33:05 +0200 Subject: [PATCH] control-service: add counter to track data job watching task executions Currently, we lack monitoring for our data job watching task - this is the task that monitors the K8s namespace for data job changes and updates the execution and termination statuses of the data jobs along with the metrics exposed by the control service. We have experienced cases where this task stopped running. Considering the importance of this task, it is essential that we get an early alert when this happens. This commit introduces a new metric (counter) that exposes the number of executions of this task. This counter can then be used in dashboards to alert when the task stops executing for a period of time. Testing done: new unit tests; manually started the service and observed the new, gradually increasing metric. Signed-off-by: Tsvetomir Palashki --- .../pipelines-control-service/README.md | 2 ++ .../service/monitoring/DataJobMetrics.java | 18 ++++++++++++++++++ .../service/monitoring/DataJobMonitor.java | 1 + .../service/monitoring/DataJobMetricsTest.java | 17 +++++++++++++++++ 4 files changed, 38 insertions(+) diff --git a/projects/control-service/projects/helm_charts/pipelines-control-service/README.md b/projects/control-service/projects/helm_charts/pipelines-control-service/README.md index d9d57e2ba4..fbbe44645a 100644 --- a/projects/control-service/projects/helm_charts/pipelines-control-service/README.md +++ b/projects/control-service/projects/helm_charts/pipelines-control-service/README.md @@ -122,6 +122,8 @@ Custom metrics are: is allowed to be delayed from its schedule before an alert is triggered) * tags: * data_job - the data job name +* taurus.datajob.watch.task.invocations.counter (A counter that exposes the number of executions + of the data job monitoring task) ### Alerting diff --git 
a/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/monitoring/DataJobMetrics.java b/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/monitoring/DataJobMetrics.java index fb3f547cc6..2bda36d394 100644 --- a/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/monitoring/DataJobMetrics.java +++ b/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/monitoring/DataJobMetrics.java @@ -6,6 +6,7 @@ package com.vmware.taurus.service.monitoring; import com.vmware.taurus.service.model.DataJob; +import io.micrometer.core.instrument.Counter; import io.micrometer.core.instrument.Gauge; import io.micrometer.core.instrument.MeterRegistry; import io.micrometer.core.instrument.Tags; @@ -33,6 +34,7 @@ public class DataJobMetrics { public static final String TAURUS_DATAJOB_INFO_METRIC_NAME = "taurus.datajob.info"; public static final String TAURUS_DATAJOB_NOTIFICATION_DELAY_METRIC_NAME = "taurus.datajob.notification.delay"; public static final String TAURUS_DATAJOB_TERMINATION_STATUS_METRIC_NAME = "taurus.datajob.termination.status"; + public static final String TAURUS_DATAJOB_WATCH_TASK_INVOCATIONS_COUNTER_NAME = "taurus.datajob.watch.task.invocations.counter"; public static final String TAG_DATA_JOB = "data_job"; public static final String TAG_EXECUTION_ID = "execution_id"; public static final String TAG_TEAM = "team"; @@ -44,6 +46,7 @@ public class DataJobMetrics { public static final int DEFAULT_NOTIFICATION_DELAY_PERIOD_MINUTES = 240; private final MeterRegistry meterRegistry; + private final Counter watchTaskInvocationsCounter; private final Map infoGauges = new ConcurrentHashMap<>(); private final Map delayGauges = new ConcurrentHashMap<>(); private final Map statusGauges = new ConcurrentHashMap<>(); @@ -53,6 +56,21 @@ public class DataJobMetrics { @Autowired public 
DataJobMetrics(MeterRegistry meterRegistry) { this.meterRegistry = meterRegistry; + + watchTaskInvocationsCounter = Counter.builder(TAURUS_DATAJOB_WATCH_TASK_INVOCATIONS_COUNTER_NAME) + .description("Counts the number of times the data jobs watching task is called.") + .register(this.meterRegistry); + } + + /** + * Increments the counter used to track the number of times the {@link DataJobMonitor#watchJobs} method was invoked. + */ + public void incrementWatchTaskInvocations() { + try { + watchTaskInvocationsCounter.increment(); + } catch (Exception e) { + log.warn("Error while trying to increment counter.", e); + } } /** diff --git a/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/monitoring/DataJobMonitor.java b/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/monitoring/DataJobMonitor.java index 0703b7947a..f7531ba29c 100644 --- a/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/monitoring/DataJobMonitor.java +++ b/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/monitoring/DataJobMonitor.java @@ -91,6 +91,7 @@ public DataJobMonitor( initialDelayString = "${datajobs.status.watch.initial.delay:10000}") @SchedulerLock(name = "watchJobs_schedulerLock") public void watchJobs() { + dataJobMetrics.incrementWatchTaskInvocations(); try { dataJobsKubernetesService.watchJobs( labelsToWatch, diff --git a/projects/control-service/projects/pipelines_control_service/src/test/java/com/vmware/taurus/service/monitoring/DataJobMetricsTest.java b/projects/control-service/projects/pipelines_control_service/src/test/java/com/vmware/taurus/service/monitoring/DataJobMetricsTest.java index ca87f48fcb..c6881cb9c4 100644 --- a/projects/control-service/projects/pipelines_control_service/src/test/java/com/vmware/taurus/service/monitoring/DataJobMetricsTest.java +++ 
b/projects/control-service/projects/pipelines_control_service/src/test/java/com/vmware/taurus/service/monitoring/DataJobMetricsTest.java @@ -230,4 +230,21 @@ void testClearGauges_shouldClearAllGauges() { gauges = meterRegistry.find(DataJobMetrics.TAURUS_DATAJOB_TERMINATION_STATUS_METRIC_NAME).gauges(); Assertions.assertEquals(0, gauges.size()); } + + @Test + @Order(13) + void testIncrementWatchTaskInvocations() { + dataJobMetrics.incrementWatchTaskInvocations(); + + var counter = meterRegistry.counter(DataJobMetrics.TAURUS_DATAJOB_WATCH_TASK_INVOCATIONS_COUNTER_NAME); + Assertions.assertEquals(1.0, counter.count(), 0.001); + + dataJobMetrics.incrementWatchTaskInvocations(); + dataJobMetrics.incrementWatchTaskInvocations(); + dataJobMetrics.incrementWatchTaskInvocations(); + dataJobMetrics.incrementWatchTaskInvocations(); + + counter = meterRegistry.counter(DataJobMetrics.TAURUS_DATAJOB_WATCH_TASK_INVOCATIONS_COUNTER_NAME); + Assertions.assertEquals(5.0, counter.count(), 0.001); + } }