From b9ce1a6fcbd11373a5cc82807af15c1cca0dd48e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Olav=20L=C3=B8ite?= Date: Wed, 18 Dec 2024 16:38:13 +0100 Subject: [PATCH] fix: retry specific internal errors (#3565) * chore: make internal auth backend errors retryable Spanner occasionally returns INTERNAL errors regarding the auth backend server. These errors should be regarded as retryable. * fix: retry specific internal errors Some specific internal errors should be retrid. Instead of adding INTERNAL as a standard retryable error code, we use an interceptor to catch and translate those specific errors. See also b/375684610 * chore: address review comments * fix: wait for session pool to initialize * fix: register errors before creating the client --- .../spanner/IsRetryableInternalError.java | 38 ++++---- .../spi/v1/SpannerErrorInterceptor.java | 7 ++ .../spanner/IsRetryableInternalErrorTest.java | 11 +++ .../spanner/RetryableInternalErrorTest.java | 95 +++++++++++++++++++ 4 files changed, 133 insertions(+), 18 deletions(-) create mode 100644 google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java index d250c0ad6c4..e69e1ec9d78 100644 --- a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java +++ b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java @@ -18,33 +18,35 @@ import com.google.api.gax.rpc.InternalException; import com.google.common.base.Predicate; +import com.google.common.collect.ImmutableList; import io.grpc.Status; +import io.grpc.Status.Code; import io.grpc.StatusRuntimeException; public class IsRetryableInternalError implements Predicate { + public static final IsRetryableInternalError INSTANCE = new IsRetryableInternalError(); - private static final String HTTP2_ERROR_MESSAGE = "HTTP/2 error code: INTERNAL_ERROR"; - private static final String CONNECTION_CLOSED_ERROR_MESSAGE = - "Connection closed with unknown cause"; - private static final String EOS_ERROR_MESSAGE = - "Received unexpected EOS on DATA frame from server"; + private static final ImmutableList RETRYABLE_ERROR_MESSAGES = + ImmutableList.of( + "HTTP/2 error code: INTERNAL_ERROR", + "Connection closed with unknown cause", + "Received unexpected EOS on DATA frame from server", + "stream terminated by RST_STREAM", + "Authentication backend internal server error. Please retry."); - private static final String RST_STREAM_ERROR_MESSAGE = "stream terminated by RST_STREAM"; + public boolean isRetryableInternalError(Status status) { + return status.getCode() == Code.INTERNAL + && status.getDescription() != null + && isRetryableErrorMessage(status.getDescription()); + } @Override public boolean apply(Throwable cause) { - if (isInternalError(cause)) { - if (cause.getMessage().contains(HTTP2_ERROR_MESSAGE)) { - return true; - } else if (cause.getMessage().contains(CONNECTION_CLOSED_ERROR_MESSAGE)) { - return true; - } else if (cause.getMessage().contains(EOS_ERROR_MESSAGE)) { - return true; - } else if (cause.getMessage().contains(RST_STREAM_ERROR_MESSAGE)) { - return true; - } - } - return false; + return isInternalError(cause) && isRetryableErrorMessage(cause.getMessage()); + } + + private boolean isRetryableErrorMessage(String errorMessage) { + return RETRYABLE_ERROR_MESSAGES.stream().anyMatch(errorMessage::contains); } private boolean isInternalError(Throwable cause) { diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java index 65db088ffac..549ea18a97f 100644 --- a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java +++ b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java @@ -16,6 +16,7 @@ package com.google.cloud.spanner.spi.v1; +import com.google.cloud.spanner.IsRetryableInternalError; import com.google.rpc.BadRequest; import com.google.rpc.Help; import com.google.rpc.LocalizedMessage; @@ -32,6 +33,7 @@ import io.grpc.Metadata; import io.grpc.MethodDescriptor; import io.grpc.Status; +import io.grpc.Status.Code; import io.grpc.protobuf.ProtoUtils; import java.util.logging.Level; import java.util.logging.Logger; @@ -69,6 +71,11 @@ public void start(Listener responseListener, Metadata headers) { @Override public void onClose(Status status, Metadata trailers) { try { + // Translate INTERNAL errors that should be retried to a retryable error code. + if (IsRetryableInternalError.INSTANCE.isRetryableInternalError(status)) { + status = + Status.fromCode(Code.UNAVAILABLE).withDescription(status.getDescription()); + } if (trailers.containsKey(LOCALIZED_MESSAGE_KEY)) { status = Status.fromCodeValue(status.getCode().value()) diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/IsRetryableInternalErrorTest.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/IsRetryableInternalErrorTest.java index 63039fcd237..514b1e96b7f 100644 --- a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/IsRetryableInternalErrorTest.java +++ b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/IsRetryableInternalErrorTest.java @@ -127,6 +127,17 @@ public void rstStreamInternalExceptionIsRetryable() { assertTrue(predicate.apply(e)); } + @Test + public void testAuthenticationBackendInternalServerErrorIsRetryable() { + final StatusRuntimeException exception = + new StatusRuntimeException( + Status.fromCode(Code.INTERNAL) + .withDescription( + "INTERNAL: Authentication backend internal server error. Please retry.")); + + assertTrue(predicate.apply(exception)); + } + @Test public void genericInternalExceptionIsNotRetryable() { final InternalException e = diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java new file mode 100644 index 00000000000..3106bd16526 --- /dev/null +++ b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java @@ -0,0 +1,95 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.spanner; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import com.google.cloud.NoCredentials; +import com.google.cloud.spanner.MockSpannerServiceImpl.SimulatedExecutionTime; +import com.google.cloud.spanner.connection.AbstractMockServerTest; +import com.google.spanner.v1.BatchCreateSessionsRequest; +import com.google.spanner.v1.ExecuteSqlRequest; +import io.grpc.ManagedChannelBuilder; +import io.grpc.Status; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.threeten.bp.Duration; + +@RunWith(JUnit4.class) +public class RetryableInternalErrorTest extends AbstractMockServerTest { + @Test + public void testTranslateInternalException() { + mockSpanner.setBatchCreateSessionsExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); + mockSpanner.setExecuteStreamingSqlExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); + + try (Spanner spanner = + SpannerOptions.newBuilder() + .setProjectId("my-project") + .setHost(String.format("http://localhost:%d", getPort())) + .setChannelConfigurator(ManagedChannelBuilder::usePlaintext) + .setCredentials(NoCredentials.getInstance()) + .setSessionPoolOption( + SessionPoolOptions.newBuilder() + .setMinSessions(1) + .setMaxSessions(1) + .setWaitForMinSessions(Duration.ofSeconds(5)) + .build()) + .build() + .getService()) { + + DatabaseClient client = spanner.getDatabaseClient(DatabaseId.of("p", "i", "d")); + // Execute a query. This will block until a BatchCreateSessions call has finished and then + // invoke ExecuteStreamingSql. Both of these RPCs should be retried. + try (ResultSet resultSet = client.singleUse().executeQuery(SELECT1_STATEMENT)) { + assertTrue(resultSet.next()); + assertFalse(resultSet.next()); + } + // Verify that both the BatchCreateSessions call and the ExecuteStreamingSql call were + // retried. + assertEquals(2, mockSpanner.countRequestsOfType(BatchCreateSessionsRequest.class)); + assertEquals(2, mockSpanner.countRequestsOfType(ExecuteSqlRequest.class)); + // Clear the requests before the next test. + mockSpanner.clearRequests(); + + // Execute a DML statement. This uses the ExecuteSql RPC. + assertEquals(0, mockSpanner.countRequestsOfType(ExecuteSqlRequest.class)); + mockSpanner.setExecuteSqlExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); + assertEquals( + Long.valueOf(1L), + client + .readWriteTransaction() + .run(transaction -> transaction.executeUpdate(INSERT_STATEMENT))); + // Verify that also this request was retried. + assertEquals(2, mockSpanner.countRequestsOfType(ExecuteSqlRequest.class)); + } + } +}