-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Retry broadcast OOM with BHJ disabled within the same spark session
Presto on Spark uses temp storage for storing and distributing broadcast tables. Spark driver performs the necessary threshold checks on broadcast table and if the size is over the threshold, the query fails with broadcast oom. The only way to fix this failure is to disable broadcast join in the query. As we are able to detect broadcast OOM on driver confidently, we can just disable broadcast join, replan and resubmit the query for execution. This can happen within the same spark session itself and thus would not need any users intervention for fixing such failures.
- Loading branch information
Showing
16 changed files
with
635 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
92 changes: 92 additions & 0 deletions
92
presto-spark-base/src/main/java/com/facebook/presto/spark/util/PrestoSparkFailureUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
/* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package com.facebook.presto.spark.util; | ||
|
||
import com.facebook.presto.Session; | ||
import com.facebook.presto.execution.ExecutionFailureInfo; | ||
import com.facebook.presto.spark.classloader_interface.PrestoSparkFailure; | ||
import com.facebook.presto.spark.classloader_interface.RetryExecutionStrategy; | ||
import com.facebook.presto.spi.ErrorCode; | ||
import com.google.common.collect.ImmutableList; | ||
|
||
import javax.annotation.Nullable; | ||
|
||
import java.util.List; | ||
import java.util.Optional; | ||
|
||
import static com.facebook.presto.execution.ExecutionFailureInfo.toStackTraceElement; | ||
import static com.facebook.presto.spark.PrestoSparkSessionProperties.isRetryOnOutOfMemoryBroadcastJoinEnabled; | ||
import static com.facebook.presto.spark.classloader_interface.RetryExecutionStrategy.DISABLE_BROADCAST_JOIN; | ||
import static com.facebook.presto.spi.StandardErrorCode.EXCEEDED_LOCAL_BROADCAST_JOIN_MEMORY_LIMIT; | ||
import static com.google.common.base.Preconditions.checkState; | ||
import static java.util.Objects.requireNonNull; | ||
|
||
public class PrestoSparkFailureUtils | ||
{ | ||
private PrestoSparkFailureUtils() {} | ||
|
||
public static PrestoSparkFailure toPrestoSparkFailure(Session session, ExecutionFailureInfo executionFailureInfo) | ||
{ | ||
requireNonNull(executionFailureInfo, "executionFailureInfo is null"); | ||
PrestoSparkFailure prestoSparkFailure = toPrestoSparkFailure(executionFailureInfo); | ||
checkState(prestoSparkFailure != null); | ||
|
||
Optional<RetryExecutionStrategy> retryExecutionStrategy = getRetryExecutionStrategy(session, executionFailureInfo.getErrorCode(), executionFailureInfo.getMessage()); | ||
return new PrestoSparkFailure( | ||
prestoSparkFailure.getMessage(), | ||
prestoSparkFailure.getCause(), | ||
prestoSparkFailure.getType(), | ||
prestoSparkFailure.getErrorCode(), | ||
retryExecutionStrategy); | ||
} | ||
|
||
@Nullable | ||
private static PrestoSparkFailure toPrestoSparkFailure(ExecutionFailureInfo executionFailureInfo) | ||
{ | ||
if (executionFailureInfo == null) { | ||
return null; | ||
} | ||
|
||
PrestoSparkFailure prestoSparkFailure = new PrestoSparkFailure( | ||
executionFailureInfo.getMessage(), | ||
toPrestoSparkFailure(executionFailureInfo.getCause()), | ||
executionFailureInfo.getType(), | ||
executionFailureInfo.getErrorCode() == null ? "" : executionFailureInfo.getErrorCode().getName(), | ||
Optional.empty()); | ||
|
||
for (ExecutionFailureInfo suppressed : executionFailureInfo.getSuppressed()) { | ||
prestoSparkFailure.addSuppressed(requireNonNull(toPrestoSparkFailure(suppressed), "suppressed failure is null")); | ||
} | ||
ImmutableList.Builder<StackTraceElement> stackTraceBuilder = ImmutableList.builder(); | ||
for (String stack : executionFailureInfo.getStack()) { | ||
stackTraceBuilder.add(toStackTraceElement(stack)); | ||
} | ||
List<StackTraceElement> stackTrace = stackTraceBuilder.build(); | ||
prestoSparkFailure.setStackTrace(stackTrace.toArray(new StackTraceElement[stackTrace.size()])); | ||
return prestoSparkFailure; | ||
} | ||
|
||
private static Optional<RetryExecutionStrategy> getRetryExecutionStrategy(Session session, ErrorCode errorCode, String message) | ||
{ | ||
if (errorCode == null || message == null) { | ||
return Optional.empty(); | ||
} | ||
|
||
if (isRetryOnOutOfMemoryBroadcastJoinEnabled(session) && errorCode == EXCEEDED_LOCAL_BROADCAST_JOIN_MEMORY_LIMIT.toErrorCode()) { | ||
return Optional.of(DISABLE_BROADCAST_JOIN); | ||
} | ||
|
||
return Optional.empty(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.