-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
control-service: logs endpoint doesn't hang (#2370)
# Why Closes: #2041. If a job is running and we make a request for its logs, the control service hangs and, after a very long time, returns an error. # What The error was happening because we were reading the logs from Kubernetes using an input-stream function. Kubernetes was streaming the logs to the control plane and does not close the input stream until the job is completed. Previously we were using ```java new PodLogs(client).streamNamespacedPodLog(...) ``` Within this function Kubernetes calls ```java new CoreV1Api(client).readNamespacedPodLogCall(... , follow=true) ``` Instead of calling `PodLogs.streamNamespacedPodLog`, I now call `CoreV1Api.readNamespacedPodLogCall` directly and set follow=false. Now the function returns immediately. # How has this been tested Integration test. I have created a job that runs for 20 minutes. I query for its logs and make sure that it returns within 5 seconds. Signed-off-by: murphp15 <[email protected]> --------- Co-authored-by: github-actions <>
- Loading branch information
Showing
3 changed files
with
112 additions
and
5 deletions.
There are no files selected for viewing
87 changes: 87 additions & 0 deletions
87
...ol_service/src/integration-test/java/com/vmware/taurus/datajobs/it/DataJobViewLogsIT.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
/* | ||
* Copyright 2021-2023 VMware, Inc. | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package com.vmware.taurus.datajobs.it; | ||
|
||
import com.vmware.taurus.ControlplaneApplication; | ||
import com.vmware.taurus.controlplane.model.data.DataJobExecution; | ||
import com.vmware.taurus.datajobs.it.common.BaseIT; | ||
import com.vmware.taurus.datajobs.it.common.DataJobDeploymentExtension; | ||
import com.vmware.taurus.datajobs.it.common.JobExecutionUtil; | ||
import com.vmware.taurus.service.execution.JobExecutionService; | ||
import lombok.extern.slf4j.Slf4j; | ||
import org.apache.commons.lang3.tuple.ImmutablePair; | ||
import org.junit.jupiter.api.Test; | ||
import org.junit.jupiter.api.extension.RegisterExtension; | ||
import org.springframework.beans.factory.annotation.Autowired; | ||
import org.springframework.boot.test.context.SpringBootTest; | ||
import org.testcontainers.shaded.org.awaitility.Awaitility; | ||
|
||
import java.util.concurrent.ExecutionException; | ||
import java.util.concurrent.Executors; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.TimeoutException; | ||
|
||
import static com.vmware.taurus.datajobs.it.common.JobExecutionUtil.testDataJobExecutionRead; | ||
|
||
@Slf4j | ||
@SpringBootTest( | ||
webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT, | ||
classes = ControlplaneApplication.class) | ||
public class DataJobViewLogsIT extends BaseIT { | ||
|
||
@Autowired private JobExecutionService executionService; | ||
|
||
// simple_job_read_logs.zip contains a job that would run for more than 30 minutes if allowed | ||
@RegisterExtension | ||
static DataJobDeploymentExtension dataJobDeploymentExtension = | ||
new DataJobDeploymentExtension("simple_job_read_logs.zip"); | ||
|
||
@Test | ||
public void testJobLogsViewing( | ||
String jobName, String username, String deploymentId, String teamName) throws Exception { | ||
// manually start job execution | ||
ImmutablePair<String, String> executeDataJobResult = | ||
JobExecutionUtil.executeDataJob(jobName, teamName, username, deploymentId, mockMvc); | ||
String executionId = executeDataJobResult.getRight(); | ||
String opId = executeDataJobResult.getLeft(); | ||
testDataJobExecutionRead( | ||
executionId, | ||
DataJobExecution.StatusEnum.RUNNING, | ||
opId, | ||
jobName, | ||
teamName, | ||
username, | ||
mockMvc); | ||
Awaitility.await() | ||
.atMost(6, TimeUnit.MINUTES) | ||
.ignoreExceptionsMatching( | ||
(e) -> | ||
// It is only by looking 3 exceptions deep that we can tell the reason the logs read | ||
// failed. | ||
e.getCause() | ||
.getCause() | ||
// If 400 is in the error message then the pod is not up yet and we should keep | ||
// trying to poll for the logs. | ||
.getMessage() | ||
.contains("400")) | ||
.until( | ||
() -> { | ||
getLogsFast(jobName, teamName, executionId); | ||
return true; | ||
}); | ||
} | ||
|
||
/** We are testing that the server returns logs quickly. */ | ||
private void getLogsFast(String jobName, String teamName, String executionId) | ||
throws InterruptedException, ExecutionException, TimeoutException { | ||
Executors.newSingleThreadExecutor() | ||
.submit( | ||
() -> { | ||
executionService.getJobExecutionLogs(teamName, jobName, executionId, 0); | ||
}) | ||
.get(5, TimeUnit.SECONDS); | ||
} | ||
} |
Binary file added
BIN
+2.69 KB
...pelines_control_service/src/integration-test/resources/data_jobs/simple_job_read_logs.zip
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters