fix: potential fix for false positive Arrow memory leak
advancedxy committed May 6, 2024
1 parent 4e4d528 commit 48517ef
Showing 4 changed files with 23 additions and 11 deletions.
@@ -50,13 +50,21 @@ case class StreamReader(channel: ReadableByteChannel, source: String) extends Au
   }

   override def close(): Unit = {
+    close(false)
+  }
+
+  def close(forceCloseAllocator: Boolean): Unit = {
     if (root != null) {
       arrowReader.close()
       root.close()
-      allocator.close()

       arrowReader = null
       root = null
     }
+
+    // don't close the allocator unless it's empty or forced to.
+    if (allocator != null && (forceCloseAllocator || allocator.getAllocatedMemory == 0)) {
+      allocator.close()
+      allocator = null
+    }
   }
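
The "false positive" comes from how Arrow's Java allocator does leak accounting: closing a BufferAllocator while buffers allocated from it are still outstanding raises an IllegalStateException reporting leaked memory, even if those buffers are released shortly afterwards. A minimal standalone sketch of that behaviour and of the conditional-close guard used above (the object name is illustrative; the only assumed dependency is arrow-memory):

import org.apache.arrow.memory.RootAllocator

object AllocatorCloseSketch {
  def main(args: Array[String]): Unit = {
    val allocator = new RootAllocator(Long.MaxValue)
    val buf = allocator.buffer(64) // 64 bytes still outstanding

    // Closing the allocator here would report a leak even though `buf`
    // is released just below, so mirror the guard from StreamReader.close:
    if (allocator.getAllocatedMemory == 0) {
      allocator.close() // skipped: 64 bytes are still allocated
    }

    buf.close()       // release the outstanding buffer first ...
    allocator.close() // ... then the allocator closes without complaint
  }
}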
@@ -67,12 +67,12 @@ class ArrowReaderIterator(channel: ReadableByteChannel, source: String)
     reader.nextBatch()
   }

-  def close(): Unit =
+  def close(forceCloseAllocator: Boolean): Unit =
     synchronized {
       if (currentBatch != null) {
         currentBatch.close()
         currentBatch = null
       }
-      reader.close()
+      reader.close(forceCloseAllocator)
    }
}
@@ -32,6 +32,7 @@ import org.apache.spark.serializer.SerializerManager
 import org.apache.spark.shuffle.BaseShuffleHandle
 import org.apache.spark.shuffle.ShuffleReader
 import org.apache.spark.shuffle.ShuffleReadMetricsReporter
+import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.storage.BlockId
 import org.apache.spark.storage.BlockManager
 import org.apache.spark.storage.BlockManagerId
@@ -98,19 +99,25 @@ class CometBlockStoreShuffleReader[K, C](
     // read iterators, it may blow up the call stack and cause OOM.
     context.addTaskCompletionListener[Unit] { _ =>
       if (currentReadIterator != null) {
-        currentReadIterator.close()
+        currentReadIterator.close(true)
      }
    }

-    IpcInputStreamIterator(inputStream, decompressingNeeded = true, context)
+    // accumulated readers/allocator to be closed after the input stream is consumed
+    val accumulatedReaders = new scala.collection.mutable.ArrayBuffer[ArrowReaderIterator]()
+    val iter = IpcInputStreamIterator(inputStream, decompressingNeeded = true, context)
       .flatMap { channel =>
         if (currentReadIterator != null) {
           // Closes previous read iterator.
-          currentReadIterator.close()
+          currentReadIterator.close(false)
+          accumulatedReaders.append(currentReadIterator)
        }
         currentReadIterator = new ArrowReaderIterator(channel, this.getClass.getSimpleName)
         currentReadIterator.map((0, _)) // use 0 as key since it's not used
      }
+    CompletionIterator[(Int, ColumnarBatch), Iterator[(Int, ColumnarBatch)]](
+      iter,
+      accumulatedReaders.foreach(_.close(true)))
  }

   // Update the context task metrics for each record read.
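
Force-closing the accumulated readers has to wait until the shuffle stream has actually been drained, which is what Spark's CompletionIterator provides: it wraps an iterator and runs a callback once the wrapped iterator is exhausted. A small sketch of that pattern with a stand-in resource (FakeReader and the package name are illustrative; CompletionIterator is private[spark], so the sketch, like the Comet shuffle reader itself, has to live under the org.apache.spark package):

package org.apache.spark.sketch

import org.apache.spark.util.CompletionIterator

// Stand-in for the accumulated ArrowReaderIterators.
class FakeReader extends AutoCloseable {
  var closed = false
  override def close(): Unit = closed = true
}

object CompletionIteratorSketch {
  def main(args: Array[String]): Unit = {
    val readers = Seq(new FakeReader, new FakeReader)
    val rows = Iterator(1, 2, 3)

    // The second argument is a by-name callback that fires once `rows`
    // is exhausted, mirroring accumulatedReaders.foreach(_.close(true)).
    val iter = CompletionIterator[Int, Iterator[Int]](rows, readers.foreach(_.close()))

    iter.foreach(println)            // drain the iterator ...
    assert(readers.forall(_.closed)) // ... after which every reader is closed
  }
}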
@@ -72,15 +72,12 @@ abstract class CometColumnarShuffleSuite extends CometTestBase with AdaptiveSpar
       CometConf.COMET_EXEC_ENABLED.key -> "true",
       SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
       SQLConf.COALESCE_PARTITIONS_ENABLED.key -> coalescePartitionsEnabled.toString,
+      "spark.comet.shuffle.enforceMode.enabled" -> "true",
       SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") {
       val df = sql(
         "SELECT * FROM (SELECT * FROM testData WHERE key = 0) t1 FULL JOIN " +
           "testData2 t2 ON t1.key = t2.a")
-      if (coalescePartitionsEnabled) {
-        checkShuffleAnswer(df, 0)
-      } else {
-        checkShuffleAnswer(df, 2)
-      }
+      checkShuffleAnswer(df, 2)
    }
  }
}
