Add Databricks 13.3 shim boilerplate code and refactor Databricks 12.2 shim [databricks] #9510

Merged

Commits (26)

38ac8ed  Shim boiler plate (razajafri, Oct 12, 2023)
2e41266  DB 13.3 changes (razajafri, Oct 12, 2023)
a4aac4a  Fixed Offset And Limit (razajafri, Oct 18, 2023)
19f18e6  reformat (razajafri, Oct 21, 2023)
c11f2a7  refactored Shim expression traits in 332db (razajafri, Oct 22, 2023)
79b8bd0  Signing off (razajafri, Oct 23, 2023)
9663ab8  fixed line length (razajafri, Oct 23, 2023)
64e32ce  removed duplicated code (razajafri, Oct 24, 2023)
b6a3478  refactored GpuBatchScanExec (razajafri, Oct 25, 2023)
51b29c8  Merge remote-tracking branch 'origin/branch-23.12' into fix-compilation (razajafri, Oct 25, 2023)
c9de866  upmerged (razajafri, Oct 25, 2023)
3299b08  fixed imports (razajafri, Oct 25, 2023)
d6db907  refactored HiveProviderCmdShim (razajafri, Oct 25, 2023)
d8be460  added the missing override (razajafri, Oct 25, 2023)
7cbad8a  addressed review comments (razajafri, Oct 25, 2023)
a97accb  removed ununsed import (razajafri, Oct 25, 2023)
9c7a4c1  addressed review comments (razajafri, Oct 26, 2023)
ed23a30  fixed build failures (razajafri, Oct 26, 2023)
efabf47  Update sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shi… (razajafri, Oct 26, 2023)
0c3efa0  Update sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shi… (razajafri, Oct 26, 2023)
4c26672  refactored more common code (razajafri, Oct 27, 2023)
5059231  resolved conflicts (razajafri, Oct 27, 2023)
aca9bae  fixed import order (razajafri, Oct 27, 2023)
1b57595  revert bad commit (razajafri, Oct 27, 2023)
76985bb  Moved shimExpressions to spark330db (razajafri, Oct 27, 2023)
6325aa0  removed left over imports from the bad commit (razajafri, Oct 27, 2023)

------------------------------------------------------------------------
@@ -33,7 +33,6 @@
{"spark": "332cdh"}
{"spark": "332db"}
{"spark": "333"}
{"spark": "341db"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

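Every shim source in spark-rapids carries a spark-rapids-shim-json-lines header like the one above, listing the Spark builds that compile that copy of the file. Dropping {"spark": "341db"} from this shared copy means the new Databricks 13.3 (spark341db) shim no longer uses it and presumably gets its own copy as part of the boilerplate this PR adds. As a rough, hedged sketch of what the header encodes (made-up names, not the plugin's actual build logic):

object ShimHeaderSketch {
  private val Marker = "spark-rapids-shim-json-lines"

  // Pull the {"spark": "..."} build tags out of a source file's header comment.
  def shimVersions(source: String): Set[String] = {
    val lines = source.split("\n").toSeq
    val start = lines.indexWhere(_.contains(Marker))
    val end   = lines.indexWhere(_.contains(Marker), start + 1)
    if (start < 0 || end <= start) Set.empty
    else lines.slice(start + 1, end).flatMap { l =>
      val t = l.trim
      if (t.startsWith("{\"spark\"")) Some(t.split(":")(1).replaceAll("[\"}\\s]", ""))
      else None
    }.toSet
  }

  // A source file belongs to a given shim build only if that build is listed.
  def includedIn(source: String, buildVer: String): Boolean =
    shimVersions(source).contains(buildVer)
}

Under that sketch, includedIn(fileText, "341db") would report false for this file after the change above.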
------------------------------------------------------------------------
@@ -0,0 +1,32 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*** spark-rapids-shim-json-lines
{"spark": "320"}
{"spark": "321"}
{"spark": "321cdh"}
{"spark": "321db"}
{"spark": "322"}
{"spark": "323"}
{"spark": "324"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

import org.apache.spark.sql.connector.read._

trait FilteredPartitions {
@transient protected val filteredPartitions: Seq[InputPartition]
}
------------------------------------------------------------------------
@@ -26,29 +26,20 @@ spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

import com.google.common.base.Objects
import com.nvidia.spark.rapids.{GpuBatchScanExecMetrics, GpuScan}
import com.nvidia.spark.rapids.GpuScan

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, DynamicPruningExpression, Expression, Literal}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
import org.apache.spark.sql.catalyst.util.truncatedString
import org.apache.spark.sql.connector.read._
import org.apache.spark.sql.execution.datasources.rapids.DataSourceStrategyUtils
import org.apache.spark.sql.execution.datasources.v2._
import org.apache.spark.sql.vectorized.ColumnarBatch

case class GpuBatchScanExec(
output: Seq[AttributeReference],
@transient scan: GpuScan,
runtimeFilters: Seq[Expression] = Seq.empty)
extends DataSourceV2ScanExecBase with GpuBatchScanExecMetrics {
@transient lazy val batch: Batch = scan.toBatch

// All expressions are filter expressions used on the CPU.
override def gpuExpressions: Seq[Expression] = Nil
extends GpuBatchScanExecBase(scan, runtimeFilters) {

// TODO: unify the equal/hashCode implementation for all data source v2 query plans.
override def equals(other: Any): Boolean = other match {
@@ -62,7 +53,7 @@ case class GpuBatchScanExec(

@transient override lazy val partitions: Seq[InputPartition] = batch.planInputPartitions()

@transient private lazy val filteredPartitions: Seq[InputPartition] = {
@transient override protected lazy val filteredPartitions: Seq[InputPartition] = {
val dataSourceFilters = runtimeFilters.flatMap {
case DynamicPruningExpression(e) => DataSourceStrategyUtils.translateRuntimeFilter(e)
case _ => None
@@ -77,7 +68,6 @@

// call toBatch again to get filtered partitions
val newPartitions = scan.toBatch.planInputPartitions()

originalPartitioning match {
case p: DataSourcePartitioning if p.numPartitions != newPartitions.size =>
throw new SparkException(
@@ -94,38 +84,11 @@
}
}

override lazy val readerFactory: PartitionReaderFactory = batch.createReaderFactory()

override lazy val inputRDD: RDD[InternalRow] = {
scan.metrics = allMetrics
if (filteredPartitions.isEmpty && outputPartitioning == SinglePartition) {
// return an empty RDD with 1 partition if dynamic filtering removed the only split
sparkContext.parallelize(Array.empty[InternalRow], 1)
} else {
new GpuDataSourceRDD(sparkContext, filteredPartitions, readerFactory)
}
}

override def doCanonicalize(): GpuBatchScanExec = {
this.copy(
output = output.map(QueryPlan.normalizeExpressions(_, output)),
runtimeFilters = QueryPlan.normalizePredicates(
runtimeFilters.filterNot(_ == DynamicPruningExpression(Literal.TrueLiteral)),
output))
}

override def simpleString(maxFields: Int): String = {
val truncatedOutputString = truncatedString(output, "[", ", ", "]", maxFields)
val runtimeFiltersString = s"RuntimeFilters: ${runtimeFilters.mkString("[", ",", "]")}"
val result = s"$nodeName$truncatedOutputString ${scan.description()} $runtimeFiltersString"
redact(result)
}

override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputRows = longMetric("numOutputRows")
inputRDD.asInstanceOf[RDD[ColumnarBatch]].map { b =>
numOutputRows += b.numRows()
b
}
}
}
}
------------------------------------------------------------------------
@@ -0,0 +1,87 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*** spark-rapids-shim-json-lines
{"spark": "320"}
{"spark": "321"}
{"spark": "321cdh"}
{"spark": "321db"}
{"spark": "322"}
{"spark": "323"}
{"spark": "324"}
{"spark": "330"}
{"spark": "330cdh"}
{"spark": "330db"}
{"spark": "331"}
{"spark": "332"}
{"spark": "332cdh"}
{"spark": "332db"}
{"spark": "333"}
{"spark": "340"}
{"spark": "341"}
{"spark": "341db"}
{"spark": "350"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

import com.nvidia.spark.rapids.{GpuBatchScanExecMetrics, GpuScan}

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
import org.apache.spark.sql.catalyst.util.truncatedString
import org.apache.spark.sql.connector.read._
import org.apache.spark.sql.execution.datasources.v2._
import org.apache.spark.sql.vectorized.ColumnarBatch

abstract class GpuBatchScanExecBase(
@transient scan: GpuScan,
runtimeFilters: Seq[Expression] = Seq.empty)
extends DataSourceV2ScanExecBase with GpuBatchScanExecMetrics with FilteredPartitions {

// All expressions are filter expressions used on the CPU.
override def gpuExpressions: Seq[Expression] = Nil

override lazy val readerFactory: PartitionReaderFactory = batch.createReaderFactory()

@transient lazy val batch: Batch = scan.toBatch

override lazy val inputRDD: RDD[InternalRow] = {
scan.metrics = allMetrics
if (filteredPartitions.isEmpty && outputPartitioning == SinglePartition) {
// return an empty RDD with 1 partition if dynamic filtering removed the only split
sparkContext.parallelize(Array.empty[InternalRow], 1)
} else {
new GpuDataSourceRDD(sparkContext, filteredPartitions, readerFactory)
}
}

override def simpleString(maxFields: Int): String = {
val truncatedOutputString = truncatedString(output, "[", ", ", "]", maxFields)
val runtimeFiltersString = s"RuntimeFilters: ${runtimeFilters.mkString("[", ",", "]")}"
val result = s"$nodeName$truncatedOutputString ${scan.description()} $runtimeFiltersString"
redact(result)
}

override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputRows = longMetric("numOutputRows")
inputRDD.asInstanceOf[RDD[ColumnarBatch]].map { b =>
numOutputRows += b.numRows()
b
}
}
}
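
The new GpuBatchScanExecBase above is where this PR hoists the members that were duplicated in every per-version GpuBatchScanExec: gpuExpressions, readerFactory, batch, inputRDD, simpleString, and internalDoExecuteColumnar. The version-specific case classes (see the surrounding diffs) now mainly supply filteredPartitions through the FilteredPartitions trait, plus their own equality and canonicalization logic. A toy sketch of that shape, with illustrative names rather than the plugin's real classes:

// Illustrative only: the real classes build on Spark's DataSourceV2 machinery.
object BatchScanRefactorSketch {
  trait PartitionsShim {                                  // stands in for FilteredPartitions
    protected val filteredPartitions: Seq[Int]
  }

  abstract class ScanExecBase extends PartitionsShim {    // stands in for GpuBatchScanExecBase
    // Shared behavior, written once: fall back to a single placeholder split
    // when dynamic filtering has pruned every partition (mirroring inputRDD above).
    final def plannedSplits: Seq[Int] =
      if (filteredPartitions.isEmpty) Seq(0) else filteredPartitions
  }

  // A per-version subclass only supplies its version-specific filtering rule.
  final class ScanExecForOneVersion(parts: Seq[Int], keep: Int => Boolean)
      extends ScanExecBase {
    override protected val filteredPartitions: Seq[Int] = parts.filter(keep)
  }

  def main(args: Array[String]): Unit = {
    val exec = new ScanExecForOneVersion(Seq(1, 2, 3), _ % 2 == 1)
    println(exec.plannedSplits)   // List(1, 3)
  }
}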
------------------------------------------------------------------------
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*** spark-rapids-shim-json-lines
{"spark": "330"}
{"spark": "330cdh"}
{"spark": "330db"}
{"spark": "331"}
{"spark": "332"}
{"spark": "332cdh"}
{"spark": "332db"}
{"spark": "333"}
{"spark": "340"}
{"spark": "341"}
{"spark": "341db"}
{"spark": "350"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

import org.apache.spark.sql.connector.read._

trait FilteredPartitions {
@transient protected val filteredPartitions: Seq[Seq[InputPartition]]
}
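
Note how this copy differs from the 3.2.x copy earlier in the diff: there filteredPartitions is a flat Seq[InputPartition], while Spark 3.3.0 and later group input partitions for KeyGroupedPartitioning, so this copy declares Seq[Seq[InputPartition]]. Each shim build compiles exactly one of the two sources, and GpuBatchScanExecBase only relies on operations both shapes support (such as the isEmpty check in inputRDD), with the per-shim GpuDataSourceRDD presumably accepting the matching shape. A small hedged sketch of why shape-agnostic code compiles against either form (toy types, illustrative names):

// Illustrative only: logic that needs nothing shape-specific can be written
// once over Seq[P], whether P is one partition or a group of partitions.
object PartitionShapeSketch {
  def isEmptyScan[P](parts: Seq[P]): Boolean = parts.isEmpty

  def main(args: Array[String]): Unit = {
    val flat: Seq[Int]         = Seq(1, 2, 3)            // pre-3.3.0 shape
    val grouped: Seq[Seq[Int]] = Seq(Seq(1), Seq(2, 3))  // 3.3.0+ key-grouped shape
    println(isEmptyScan(flat))     // false
    println(isEmptyScan(grouped))  // false
  }
}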
------------------------------------------------------------------------
@@ -25,30 +25,23 @@ spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

import com.google.common.base.Objects
import com.nvidia.spark.rapids.{GpuBatchScanExecMetrics, GpuScan}
import com.nvidia.spark.rapids.GpuScan

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, DynamicPruningExpression, Expression, Literal}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.physical.{KeyGroupedPartitioning, SinglePartition}
import org.apache.spark.sql.catalyst.util.{truncatedString, InternalRowSet}
import org.apache.spark.sql.catalyst.plans.physical.KeyGroupedPartitioning
import org.apache.spark.sql.catalyst.util.InternalRowSet
import org.apache.spark.sql.connector.read._
import org.apache.spark.sql.execution.datasources.rapids.DataSourceStrategyUtils
import org.apache.spark.sql.execution.datasources.v2._
import org.apache.spark.sql.vectorized.ColumnarBatch

case class GpuBatchScanExec(
output: Seq[AttributeReference],
@transient scan: GpuScan,
runtimeFilters: Seq[Expression] = Seq.empty,
keyGroupedPartitioning: Option[Seq[Expression]] = None)
extends DataSourceV2ScanExecBase with GpuBatchScanExecMetrics {
@transient lazy val batch: Batch = scan.toBatch

// All expressions are filter expressions used on the CPU.
override def gpuExpressions: Seq[Expression] = Nil
extends GpuBatchScanExecBase(scan, runtimeFilters) {

// TODO: unify the equal/hashCode implementation for all data source v2 query plans.
override def equals(other: Any): Boolean = other match {
@@ -62,7 +55,7 @@ case class GpuBatchScanExec(

@transient override lazy val inputPartitions: Seq[InputPartition] = batch.planInputPartitions()

@transient private lazy val filteredPartitions: Seq[Seq[InputPartition]] = {
@transient override protected lazy val filteredPartitions: Seq[Seq[InputPartition]] = {
val dataSourceFilters = runtimeFilters.flatMap {
case DynamicPruningExpression(e) => DataSourceStrategyUtils.translateRuntimeFilter(e)
case _ => None
@@ -114,38 +107,11 @@
}
}

override lazy val readerFactory: PartitionReaderFactory = batch.createReaderFactory()

override lazy val inputRDD: RDD[InternalRow] = {
scan.metrics = allMetrics
if (filteredPartitions.isEmpty && outputPartitioning == SinglePartition) {
// return an empty RDD with 1 partition if dynamic filtering removed the only split
sparkContext.parallelize(Array.empty[InternalRow], 1)
} else {
new GpuDataSourceRDD(sparkContext, filteredPartitions, readerFactory)
}
}

override def doCanonicalize(): GpuBatchScanExec = {
this.copy(
output = output.map(QueryPlan.normalizeExpressions(_, output)),
runtimeFilters = QueryPlan.normalizePredicates(
runtimeFilters.filterNot(_ == DynamicPruningExpression(Literal.TrueLiteral)),
output))
}

override def simpleString(maxFields: Int): String = {
val truncatedOutputString = truncatedString(output, "[", ", ", "]", maxFields)
val runtimeFiltersString = s"RuntimeFilters: ${runtimeFilters.mkString("[", ",", "]")}"
val result = s"$nodeName$truncatedOutputString ${scan.description()} $runtimeFiltersString"
redact(result)
}

override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputRows = longMetric("numOutputRows")
inputRDD.asInstanceOf[RDD[ColumnarBatch]].map { b =>
numOutputRows += b.numRows()
b
}
}
}
------------------------------------------------------------------------
@@ -28,15 +28,13 @@ spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

import com.nvidia.spark.rapids._
import org.apache.parquet.schema.MessageType

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, FilePartition, FileScanRDD, PartitionedFile}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters
import org.apache.spark.sql.execution.datasources.{FilePartition, FileScanRDD, PartitionedFile}
import org.apache.spark.sql.rapids.shims.{GpuDivideYMInterval, GpuMultiplyYMInterval}
import org.apache.spark.sql.types.StructType

@@ -53,22 +51,6 @@ trait Spark330PlusShims extends Spark321PlusShims with Spark320PlusNonDBShims {
new FileScanRDD(sparkSession, readFunction, filePartitions, readDataSchema, metadataColumns)
}

override def getParquetFilters(
schema: MessageType,
pushDownDate: Boolean,
pushDownTimestamp: Boolean,
pushDownDecimal: Boolean,
pushDownStartWith: Boolean,
pushDownInFilterThreshold: Int,
caseSensitive: Boolean,
lookupFileMeta: String => String,
dateTimeRebaseModeFromConf: String): ParquetFilters = {
val datetimeRebaseMode = DataSourceUtils
.datetimeRebaseSpec(lookupFileMeta, dateTimeRebaseModeFromConf)
new ParquetFilters(schema, pushDownDate, pushDownTimestamp, pushDownDecimal, pushDownStartWith,
pushDownInFilterThreshold, caseSensitive, datetimeRebaseMode)
}

// GPU support ANSI interval types from 330
override def getExprs: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = {
val map: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = Seq(
------------------------------------------------------------------------
@@ -17,7 +17,6 @@
/*** spark-rapids-shim-json-lines
{"spark": "330db"}
{"spark": "332db"}
{"spark": "341db"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims
