-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
control-service: killed job was shown as successful (#2116)
# Why We recently got the following feedback from our internal client: A data job was listed as successful even though it hit the 12-hour limit and was killed; the logs do not show that either - the last entry in the log just shows the last object that was sent for ingestion, but there is no summary of the data job. The problem is caused by the following fix - #1586. When the job hits the 12-hour limit, the K8S Pod is terminated and we construct a partial JobExecutionStatus, which enters the following if statement and returns Optional.empty() rather than the constructed object. https://github.com/vmware/versatile-data-kit/blob/4763ba877f43b270fbd4770bc1533216f7c5d618/projects/control-service/projects/pipelines_control_service/src/main/java/com/vmware/taurus/service/KubernetesService.java#L1656 As a result, the job execution becomes stuck in the Running status until it is detected by the emergency logic, which marks such executions as successful because no Pods are associated with them. # What Added validation for an already completed job in a more appropriate place. # Testing Done Added an integration test. Signed-off-by: Miroslav Ivanov [email protected] --------- Signed-off-by: Miroslav Ivanov [email protected] Co-authored-by: github-actions <>
- Loading branch information
1 parent
6a4c358
commit 67e739c
Showing
15 changed files
with
301 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
70 changes: 70 additions & 0 deletions
70
...ntegration-test/java/com/vmware/taurus/datajobs/it/TestDataJobBackoffLimitExceededIT.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/* | ||
* Copyright 2021-2023 VMware, Inc. | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package com.vmware.taurus.datajobs.it; | ||
|
||
import com.vmware.taurus.ControlplaneApplication; | ||
import com.vmware.taurus.controlplane.model.data.DataJobExecution; | ||
import com.vmware.taurus.datajobs.it.common.BaseIT; | ||
import com.vmware.taurus.datajobs.it.common.DataJobDeploymentExtension; | ||
import lombok.extern.slf4j.Slf4j; | ||
import org.apache.commons.lang3.tuple.ImmutablePair; | ||
import org.junit.jupiter.api.Test; | ||
import org.junit.jupiter.api.extension.RegisterExtension; | ||
import org.springframework.boot.test.context.SpringBootTest; | ||
import org.springframework.test.context.TestPropertySource; | ||
|
||
import static com.vmware.taurus.datajobs.it.common.JobExecutionUtil.*; | ||
|
||
// Integration test that starts a data job execution and expects it to be reported with | ||
// status USER_ERROR by the execution read/list endpoints helpers used below. | ||
// The test context overrides the property datajobs.control.k8s.data.job.template.file so that | ||
// a dedicated job template is used instead of the default one. | ||
@Slf4j | ||
@TestPropertySource( | ||
properties = { | ||
// This is a standard cron job template except activeDeadlineSeconds is set to 1 | ||
// NOTE(review): the template file name refers to a backoff limit while the comment above | ||
// mentions activeDeadlineSeconds - confirm which setting the template actually overrides. | ||
"datajobs.control.k8s.data.job.template.file=data_job_templates/backoff_limit_exceeded_cron_job.yaml" | ||
}) | ||
@SpringBootTest( | ||
webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT, | ||
classes = ControlplaneApplication.class) | ||
public class TestDataJobBackoffLimitExceededIT extends BaseIT { | ||
|
||
// Registered as a static JUnit extension for this test class. | ||
// Presumably it deploys the data job and resolves the String parameters of the test method | ||
// (jobName, teamName, username, deploymentId) - TODO confirm against DataJobDeploymentExtension. | ||
@RegisterExtension | ||
static DataJobDeploymentExtension dataJobDeploymentExtension = new DataJobDeploymentExtension(); | ||
|
||
// Starts one execution of the given data job and verifies, through three different helpers, | ||
// that the execution is reported with status USER_ERROR. | ||
@Test | ||
public void testDataJob_causesBackoffLimitExceeded_shouldCompleteWithUserError( | ||
String jobName, String teamName, String username, String deploymentId) throws Exception { | ||
// manually start job execution | ||
// The returned pair holds the op id (left) and the execution id (right). | ||
ImmutablePair<String, String> executeDataJobResult = | ||
executeDataJob(jobName, teamName, username, deploymentId, mockMvc); | ||
String opId = executeDataJobResult.getLeft(); | ||
String executionId = executeDataJobResult.getRight(); | ||
|
||
// Check the data job execution status | ||
// The same expected status (USER_ERROR) is asserted via the single-execution read helper, | ||
// the execution list helper and the deployment execution list helper. | ||
testDataJobExecutionRead( | ||
executionId, | ||
DataJobExecution.StatusEnum.USER_ERROR, | ||
opId, | ||
jobName, | ||
teamName, | ||
username, | ||
mockMvc); | ||
testDataJobExecutionList( | ||
executionId, | ||
DataJobExecution.StatusEnum.USER_ERROR, | ||
opId, | ||
jobName, | ||
teamName, | ||
username, | ||
mockMvc); | ||
testDataJobDeploymentExecutionList( | ||
executionId, | ||
DataJobExecution.StatusEnum.USER_ERROR, | ||
opId, | ||
jobName, | ||
teamName, | ||
username, | ||
mockMvc); | ||
} | ||
} |
Oops, something went wrong.