Skip to content

Commit

Permalink
control-service: fix oom tests (#2028)
Browse files Browse the repository at this point in the history
# Why
In this PR I fix a very unstable oom test. 
The test is very unstable for a number of reasons. 

1. Sometimes the job actually succeeds; it seems to be able to complete
within the memory limits after a number of retries.
We can't put the memory requirement any lower to ensure an error because
6Mi is the lower limit supported by k8s.
2. One of the assertions we were making is that the logs are there.
However, often if the job is killed at the very start there are no logs.


# What 
I fix it by using a cronjob template which doesn't let the job restart.
This means if it fails once it will report as failed.


# How has this been tested?
integration tests.

---------

Signed-off-by: murphp15 <[email protected]>
Co-authored-by: github-actions <>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
murphp15 and github-actions[bot] authored May 11, 2023
1 parent 66c6b34 commit 61f1192
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,22 @@
import com.vmware.taurus.datajobs.it.common.JobExecutionUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.RegisterExtension;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.TestPropertySource;

import static com.vmware.taurus.datajobs.it.common.JobExecutionUtil.*;

@Slf4j
@TestPropertySource(
properties = {
"datajobs.deployment.initContainer.resources.requests.memory=6Mi",
"datajobs.deployment.initContainer.resources.limits.memory=6Mi",
// This is a standard cron job template except restartPolicy is set to never so that when a
// job runs out of memory it is
// not retried but instead reports more quickly that it is a platform error
"datajobs.control.k8s.data.job.template.file=fast_failing_cron_job.yaml"
})
@SpringBootTest(
webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT,
Expand All @@ -30,7 +37,7 @@ public class DataJobInitContainerOOMIT extends BaseIT {
@RegisterExtension
static DataJobDeploymentExtension dataJobDeploymentExtension = new DataJobDeploymentExtension();

// @Test
@Test
public void testDataJob_causesOOM_shouldCompleteWithUserError(
String jobName, String teamName, String username, String deploymentId) throws Exception {
// manually start job execution
Expand All @@ -40,7 +47,23 @@ public void testDataJob_causesOOM_shouldCompleteWithUserError(
String executionId = executeDataJobResult.getRight();

// Check the data job execution status
JobExecutionUtil.checkDataJobExecutionStatus(
testDataJobExecutionRead(
executionId,
DataJobExecution.StatusEnum.PLATFORM_ERROR,
opId,
jobName,
teamName,
username,
mockMvc);
testDataJobExecutionList(
executionId,
DataJobExecution.StatusEnum.PLATFORM_ERROR,
opId,
jobName,
teamName,
username,
mockMvc);
testDataJobDeploymentExecutionList(
executionId,
DataJobExecution.StatusEnum.PLATFORM_ERROR,
opId,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ public static ImmutablePair<String, String> executeDataJob(
return ImmutablePair.of(opId, executionId);
}

private static void testDataJobExecutionRead(
public static void testDataJobExecutionRead(
String executionId,
com.vmware.taurus.controlplane.model.data.DataJobExecution.StatusEnum executionStatus,
String opId,
Expand Down Expand Up @@ -198,7 +198,7 @@ private static void testDataJobExecutionRead(
executionId, executionStatus, opId, dataJobExecution[0], jobName, username);
}

private static void testDataJobExecutionList(
public static void testDataJobExecutionList(
String executionId,
com.vmware.taurus.controlplane.model.data.DataJobExecution.StatusEnum executionStatus,
String opId,
Expand Down Expand Up @@ -232,7 +232,7 @@ private static void testDataJobExecutionList(
executionId, executionStatus, opId, dataJobExecutions.get(0), jobName, username);
}

private static void testDataJobDeploymentExecutionList(
public static void testDataJobDeploymentExecutionList(
String executionId,
com.vmware.taurus.controlplane.model.data.DataJobExecution.StatusEnum executionStatus,
String opId,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright 2021-2023 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0

# Template CronJob manifest. Values annotated "overridden by TPCS" or "merged with ..." are,
# per those annotations, replaced or extended by TPCS; the remaining values are the
# template's own settings.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
annotations: # merged with additional annotations from TPCS
name: cronjob-template-name # overridden by TPCS
spec:
concurrencyPolicy: Forbid
failedJobsHistoryLimit: 2
schedule: "*/10 * * * *" # overridden by TPCS
startingDeadlineSeconds: 1800
successfulJobsHistoryLimit: 1
suspend: false # overridden by TPCS
jobTemplate:
metadata:
annotations: # merged with additional annotations from TPCS
labels: # merged with additional labels from TPCS
spec:
activeDeadlineSeconds: 43200
# NOTE(review): with restartPolicy: Never below, the Job controller presumably may still
# create replacement pods up to this limit; confirm whether 0 is intended for a template
# that is meant to fail on the first error.
backoffLimit: 3
template:
metadata:
labels: # merged with additional labels from TPCS
spec:
containers: # overridden by TPCS
- command:
- /bin/sh
- -c
- date; echo '************** Cronjob Template ******************'
name: cronjob-template-container-name
image: busybox
imagePullPolicy: IfNotPresent
# Never: a failed container is not restarted inside its pod.
restartPolicy: Never
securityContext:
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
automountServiceAccountToken: false
ttlSecondsAfterFinished: 600
Original file line number Diff line number Diff line change
Expand Up @@ -332,16 +332,10 @@ private V1beta1CronJob loadConfigurableV1beta1CronjobTemplate() {
return null;
}

// Check whether the configurable datajob template file exists.
File datajobTemplateFile = new File(datajobTemplateFileLocation);
if (!datajobTemplateFile.isFile()) {
log.warn("Datajob template location '{}' is not a file.", datajobTemplateFileLocation);
return null;
}

try {
// Load the configurable datajob template file.
return loadV1beta1CronjobTemplate(datajobTemplateFile);
return loadV1beta1CronjobTemplate(
new ClassPathResource(datajobTemplateFileLocation).getFile());
} catch (Exception e) {
log.error("Error while loading the datajob template file.", e);
return null;
Expand All @@ -355,16 +349,9 @@ private V1CronJob loadConfigurableV1CronjobTemplate() {
return null;
}

// Check whether the configurable datajob template file exists.
File datajobTemplateFile = new File(datajobTemplateFileLocation);
if (!datajobTemplateFile.isFile()) {
log.warn("Datajob template location '{}' is not a file.", datajobTemplateFileLocation);
return null;
}

try {
// Load the configurable datajob template file.
return loadV1CronjobTemplate(datajobTemplateFile);
return loadV1CronjobTemplate(new ClassPathResource(datajobTemplateFileLocation).getFile());
} catch (Exception e) {
log.error("Error while loading the datajob template file.", e);
return null;
Expand Down

0 comments on commit 61f1192

Please sign in to comment.