
Commit 939e128

[SPARK-56521][SQL] Refactor BatchScanExec: guard cast with runtimeFilters.nonEmpty, simplify partPredicates
1 parent e085be4 commit 939e128
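
The gist of the change, as a simplified standalone sketch (the Filterable trait and the before/after helpers here are invented stand-ins for illustration, not the actual Spark types):

    trait Filterable { def filter(filters: Array[String]): Unit }

    // Before: the cast ran even when no runtime filters were assigned, so a
    // scan that does not implement the filtering trait could fail the cast.
    def before(scan: Any, runtimeFilters: Seq[String]): Unit = {
      val filterableScan = scan.asInstanceOf[Filterable]
      if (runtimeFilters.nonEmpty) filterableScan.filter(runtimeFilters.toArray)
    }

    // After: the cast is reached only when runtime filters exist, and the
    // planner assigns runtime filters only to scans that can be filtered,
    // so the cast is safe by construction.
    def after(scan: Any, runtimeFilters: Seq[String]): Unit = {
      if (runtimeFilters.nonEmpty) {
        scan.asInstanceOf[Filterable].filter(runtimeFilters.toArray)
      }
    }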

1 file changed

Lines changed: 78 additions & 64 deletions

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala

@@ -61,83 +61,97 @@ case class BatchScanExec(

   // Visible for testing
   @transient private[sql] lazy val filteredPartitions: Seq[Option[InputPartition]] = {
-    val dataSourceFilters = runtimeFilters.flatMap {
-      case DynamicPruningExpression(e) => DataSourceV2Strategy.translateRuntimeFilterV2(e)
-      case f => DataSourceV2Strategy.translateScalarSubqueryFilterV2(f)
-    }
-
     val originalPartitioning = outputPartitioning
-    // the cast is safe as runtime filters are only assigned if the scan can be filtered
-    val filterableScan = scan.asInstanceOf[SupportsRuntimeV2Filtering]
-    var filtered = false
-
-    if (dataSourceFilters.nonEmpty) {
-      filterableScan.filter(dataSourceFilters.toArray)
-      filtered = true
-    }
+    if (runtimeFilters.nonEmpty) {
+      // the cast is safe as runtime filters are only assigned if the scan can be filtered
+      val filterableScan = scan.asInstanceOf[SupportsRuntimeV2Filtering]
+
+      // push down translatable runtime filters
+      val dataSourceFilters = runtimeFilters.flatMap {
+        case DynamicPruningExpression(e) => DataSourceV2Strategy.translateRuntimeFilterV2(e)
+        case f => DataSourceV2Strategy.translateScalarSubqueryFilterV2(f)
+      }
+      if (dataSourceFilters.nonEmpty) {
+        filterableScan.filter(dataSourceFilters.toArray)
+      }

-    // If the scan supports iterative filtering, derive PartitionPredicates from the
-    // runtime filters and push them in a second pass. (See SPARK-55596)
-    if (filterableScan.supportsIterativeFiltering()) {
-      PushDownUtils.getPartitionPredicateSchema(table, output).foreach { partitionFields =>
-        val partPredicates =
+      // If the scan supports iterative filtering, derive PartitionPredicates from the
+      // runtime filters and push them in a second pass. (See SPARK-55596)
+      val partPredicates = if (filterableScan.supportsIterativeFiltering()) {
+        PushDownUtils.getPartitionPredicateSchema(table, output).map { partitionFields =>
           PushDownUtils.createRuntimePartitionPredicates(runtimeFilters, partitionFields)
-        if (partPredicates.nonEmpty) {
-          filterableScan.filter(partPredicates.toArray)
-          filtered = true
-        }
+        }.getOrElse(Seq.empty)
+      } else {
+        Seq.empty
+      }
+      if (partPredicates.nonEmpty) {
+        filterableScan.filter(partPredicates.toArray)
       }
-    }

-    if (filtered) {
-      // call toBatch again to get filtered partitions
-      val newPartitions = scan.toBatch.planInputPartitions()
+      if (dataSourceFilters.nonEmpty || partPredicates.nonEmpty) {
+        // call toBatch again to get filtered partitions
+        val newPartitions = scan.toBatch.planInputPartitions()
+
+        originalPartitioning match {
+          case k: KeyedPartitioning =>
+            if (newPartitions.exists(!_.isInstanceOf[HasPartitionKey])) {
+              throw new SparkException(
+                "Data source must have preserved the original partitioning " +
+                "during runtime filtering: not all partitions implement " +
+                "HasPartitionKey after filtering")
+            }

-      originalPartitioning match {
-        case k: KeyedPartitioning =>
-          if (newPartitions.exists(!_.isInstanceOf[HasPartitionKey])) {
-            throw new SparkException("Data source must have preserved the original partitioning " +
-              "during runtime filtering: not all partitions implement HasPartitionKey after " +
-              "filtering")
-          }
-
-          val inputMap = k.partitionKeys.groupBy(identity).view.mapValues(_.size)
-          val comparableKeyWrapperFactory = InternalRowComparableWrapper
-            .getInternalRowComparableWrapperFactory(k.expressionDataTypes)
-          val filteredMap = newPartitions.groupBy(
-            p => comparableKeyWrapperFactory(p.asInstanceOf[HasPartitionKey].partitionKey())
-          )
-
-          if (!filteredMap.keySet.subsetOf(inputMap.keySet)) {
-            throw new SparkException("During runtime filtering, data source must not report new " +
-              "partition keys that are not present in the original partitioning.")
-          }
-
-          inputMap.toSeq
-            .sortBy(_._1)(k.keyOrdering)
-            .flatMap { case (key, size) =>
-              // We require the new number of partitions to be equal or less than the old number of
-              // partitions for a given key. In the case of less than, empty partitions are added.
-              val fps = filteredMap.getOrElse(key, Array.empty)
-
-              if (fps.size > size) {
-                throw new SparkException("During runtime filtering, data source must not report " +
-                  s"new partitions for a given key. Before: $size partitions. " +
-                  s"After: ${fps.size} partitions")
+            val inputMap = k.partitionKeys.groupBy(identity).view.mapValues(_.size)
+            val comparableKeyWrapperFactory = InternalRowComparableWrapper
+              .getInternalRowComparableWrapperFactory(k.expressionDataTypes)
+            val filteredMap = newPartitions.groupBy(
+              p => comparableKeyWrapperFactory(
+                p.asInstanceOf[HasPartitionKey].partitionKey()))
+
+            if (!filteredMap.keySet.subsetOf(inputMap.keySet)) {
+              throw new SparkException(
+                "During runtime filtering, data source must not report new " +
+                "partition keys that are not present in the original partitioning.")
+            }
+
+            inputMap.toSeq
+              .sortBy(_._1)(k.keyOrdering)
+              .flatMap { case (key, size) =>
+                // We require the new number of partitions to be equal or less than
+                // the old number of partitions for a given key. In the case of less
+                // than, empty partitions are added.
+                val fps = filteredMap.getOrElse(key, Array.empty)
+
+                if (fps.size > size) {
+                  throw new SparkException(
+                    "During runtime filtering, data source must not report " +
+                    s"new partitions for a given key. Before: $size partitions. " +
+                    s"After: ${fps.size} partitions")
+                }
+
+                fps.map(Some).padTo(size, None)
               }

-              fps.map(Some).padTo(size, None)
-            }
+          case _ =>
+            // no validation is needed as the data source did not report any specific
+            // partitioning
+            newPartitions.toSeq.map(Some)
+        }

-        case _ =>
-          // no validation is needed as the data source did not report any specific partitioning
-          newPartitions.toSeq.map(Some)
-      }
+      } else {
+        (originalPartitioning match {
+          case k: KeyedPartitioning =>
+            inputPartitions.sortBy(
+              _.asInstanceOf[HasPartitionKey].partitionKey())(k.keyRowOrdering)

+          case _ => inputPartitions
+        }).map(Some)
+      }
     } else {
       (originalPartitioning match {
         case k: KeyedPartitioning =>
-          inputPartitions.sortBy(_.asInstanceOf[HasPartitionKey].partitionKey())(k.keyRowOrdering)
+          inputPartitions.sortBy(
+            _.asInstanceOf[HasPartitionKey].partitionKey())(k.keyRowOrdering)

         case _ => inputPartitions
       }).map(Some)
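
For context, a scan opts into this code path by implementing the DSv2 SupportsRuntimeV2Filtering interface. Below is a minimal sketch of such a scan; the DatePartition type and the predicate-evaluation callback are invented for illustration (real sources evaluate V2 Predicates against their own partition metadata), and the supportsIterativeFiltering() hook referenced in the diff is omitted:

    import org.apache.spark.sql.connector.expressions.NamedReference
    import org.apache.spark.sql.connector.expressions.filter.Predicate
    import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan, SupportsRuntimeV2Filtering}
    import org.apache.spark.sql.types.StructType

    // Hypothetical partition that carries its partition value as metadata.
    case class DatePartition(date: String) extends InputPartition

    class ExampleFilterableScan(
        schema: StructType,
        allPartitions: Seq[DatePartition],
        partitionCol: NamedReference,
        // Evaluating V2 Predicates against partition metadata is source-specific,
        // so this sketch takes the evaluation as a callback.
        survives: (DatePartition, Predicate) => Boolean)
      extends Scan with SupportsRuntimeV2Filtering {

      // Partitions remaining after runtime filtering; starts as the full set.
      private var current: Seq[DatePartition] = allPartitions

      override def readSchema(): StructType = schema

      // Columns on which Spark may push runtime filters to this scan.
      override def filterAttributes(): Array[NamedReference] = Array(partitionCol)

      // BatchScanExec calls this with the translated runtime filters; filtering
      // may only drop partitions, never add partitions or new partition keys.
      override def filter(predicates: Array[Predicate]): Unit = {
        current = current.filter(p => predicates.forall(survives(p, _)))
      }

      // BatchScanExec calls toBatch again after filter() to pick up the
      // reduced partition list.
      override def toBatch: Batch = new Batch {
        override def planInputPartitions(): Array[InputPartition] = current.toArray
        override def createReaderFactory(): PartitionReaderFactory =
          throw new UnsupportedOperationException("reader omitted in this sketch")
      }
    }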

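The KeyedPartitioning branch above enforces two invariants: runtime filtering may not introduce new partition keys, and may not grow the partition count for any key; per-key shortfalls are padded with None. A standalone sketch of that bookkeeping, with plain String keys standing in for the InternalRowComparableWrapper keys and natural ordering standing in for k.keyOrdering:

    // Per-key validation and padding, simplified from the diff above.
    def padFiltered(
        originalKeys: Seq[String],               // one entry per original partition
        filteredByKey: Map[String, Seq[String]]  // surviving partitions per key
      ): Seq[Option[String]] = {
      val inputMap = originalKeys.groupBy(identity).view.mapValues(_.size)

      // Filtering must not introduce keys the original partitioning lacked.
      require(filteredByKey.keySet.subsetOf(inputMap.keySet),
        "runtime filtering reported a new partition key")

      inputMap.toSeq.sortBy(_._1).flatMap { case (key, size) =>
        val fps = filteredByKey.getOrElse(key, Seq.empty)
        // Per key, the count may only shrink; pad with None so the overall
        // partition count and per-key order are preserved.
        require(fps.size <= size, s"key $key grew from $size to ${fps.size}")
        fps.map(Some(_)).padTo(size, None)
      }
    }

    // e.g. padFiltered(Seq("a", "a", "b"), Map("a" -> Seq("a-part1")))
    //   == Seq(Some("a-part1"), None, None)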