@@ -157,13 +157,22 @@ case class DataSourceV2Relation(
* @param keyGroupedPartitioning if set, the partitioning expressions that are used to split the
* rows in the scan across different partitions
* @param ordering if set, the ordering provided by the scan
* @param pushedFilters Catalyst expressions for filters that were fully pushed to the data
* source and do not appear as post-scan filters
*/
case class DataSourceV2ScanRelation(
relation: DataSourceV2Relation,
scan: Scan,
output: Seq[AttributeReference],
keyGroupedPartitioning: Option[Seq[Expression]] = None,
ordering: Option[Seq[SortOrder]] = None) extends LeafNode with NamedRelation {
ordering: Option[Seq[SortOrder]] = None,
pushedFilters: Seq[Expression] = Seq.empty) extends LeafNode with NamedRelation {

// TODO: Override validConstraints to return ExpressionSet(pushedFilters) so that pushed
Contributor:

@cloud-fan, what do you think?

Contributor:

I don't have a strong opinion. We can do it in this PR and update golden files if needed, or defer it to a followup PR.

Contributor Author:

Thank you both for the review! I'd like to defer that to a later PR, to avoid blocking this one on golden-file adjustments and other changes that might take quite some investigation time.

Contributor:

As discussed offline, let's remove this comment?

Contributor Author:

Hmm, I think it's still the right thing to do in the long term; we just don't want to do it very soon. So from a completeness standpoint, I think there's still value in keeping this comment?
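For context, the deferred override that the TODO describes would plausibly look like the sketch below. This is hypothetical, not part of this PR; the exact visibility modifier and interaction with plan-stability golden files would need verification.

```scala
// Hypothetical future override on DataSourceV2ScanRelation (deferred, not in
// this PR): surface fully pushed filters as constraints so that rules like
// InferFiltersFromConstraints and PruneFilters can reuse them.
override protected lazy val validConstraints: ExpressionSet =
  ExpressionSet(pushedFilters)
```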

// filters participate in constraint propagation (InferFiltersFromConstraints, PruneFilters).
// This changes which filters InferFiltersFromConstraints adds or removes (e.g., it may
// skip adding IsNotNull when the scan already implies it, or infer new filters across
// joins), so plan stability testing is needed first.

override def name: String = relation.name

@@ -197,7 +206,8 @@ case class DataSourceV2ScanRelation(
),
ordering = ordering.map(
_.map(o => o.copy(child = QueryPlan.normalizeExpressions(o.child, output)))
)
),
pushedFilters = pushedFilters.map(QueryPlan.normalizeExpressions(_, output))
Contributor:

The comment in pruneColumns (line 809-814 of V2ScanRelationPushDown.scala) says stale references are "acceptable while pushedFilters is informational only." However, this line normalizes pushedFilters against output via normalizeExpressions, which returns ordinal -1 for attributes not in the output, leaving the original exprId intact. Two equivalent plans would have different exprIds in their canonical form, breaking canonicalized == comparison (used by subquery dedup, plan caching, etc.).

Consider filtering out pushed filters with stale references in pruneColumns:

val remappedPushedFilters = sHolder.pushedFilterExpressions.map(projectionFunc)
  .filter(_.references.subsetOf(AttributeSet(output)))

This ensures only filters with valid references participate in canonicalization, and is consistent with the "drop filters with stale references" option already mentioned in the comment.
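The exprId leak described here can be illustrated with a toy standalone Scala model. This is not Spark's actual normalizeExpressions; it only mimics the shape of the problem: attributes found in the output are replaced by their ordinal, while stale attributes keep their run-specific exprId, so two otherwise-equal plans canonicalize differently.

```scala
// Toy model (NOT Spark's classes) of the canonicalization hazard: attributes
// present in `output` normalize to a position-based form, stale ones leak
// their exprId.
final case class AttrRef(name: String, exprId: Long)

def normalize(a: AttrRef, output: Seq[AttrRef]): String = {
  val ordinal = output.indexWhere(_.exprId == a.exprId)
  if (ordinal >= 0) s"ref#$ordinal"  // position-based, exprId erased
  else s"${a.name}#${a.exprId}"      // stale reference: exprId leaks through
}

// Two logically identical plans, analyzed in different runs, so every
// attribute got a different exprId:
val outputA = Seq(AttrRef("j", 1L))
val outputB = Seq(AttrRef("j", 10L))
val staleA = AttrRef("i", 2L)   // pruned column referenced by a pushed filter
val staleB = AttrRef("i", 20L)

// In-output attributes canonicalize identically...
println(normalize(AttrRef("j", 1L), outputA))  // ref#0
println(normalize(AttrRef("j", 10L), outputB)) // ref#0
// ...but the stale ones do not, breaking canonicalized == comparison:
println(normalize(staleA, outputA)) // i#2
println(normalize(staleB, outputB)) // i#20
```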

Contributor Author:

I debated this a bit and decided not to include pushed filters whose references are no longer in the pruned output.

The reason I included them before is that I was worried a fully pushed filter could be lost from pushedFilters when the filtered column is not projected:

SELECT j FROM t WHERE i > 3 with a connector that pushes both GreaterThan and IsNotNull:

  1. After analysis: Project([j], Filter(i > 3, DataSourceV2Relation([i, j])))
  2. InferFiltersFromConstraints (runs before V2ScanRelationPushDown): derives IsNotNull(i) → Project([j], Filter(i > 3 AND IsNotNull(i), DataSourceV2Relation([i, j])))
  3. pushDownFilters: both i > 3 and IsNotNull(i) fully pushed → Filter node removed → Project([j], ScanBuilderHolder), pushedFilterExpressions = [i > 3]
  4. pruneColumns: only j is referenced → output = [j], column i pruned → pushed filter i > 3 references pruned i → dropped

And I was a bit worried because part of the intention for this field was to record what guarantees the connector can make, and now we wouldn't have the complete information.

But for now I think this is acceptable, since 1/ pushedFilters is informational only, and a potential future improvement could be to keep fully pushed filters as post-scan Filter nodes, which would prevent the column from being pruned in the first place; and 2/ the Spark plan validator will also catch and reject this case when it notices that some references in expressions are not resolvable.
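The dropping behavior discussed above can be sketched with a tiny standalone Scala model. The names are illustrative only, not Spark's actual API: after pruning, any pushed filter referencing a column outside the surviving output is discarded.

```scala
// Toy model (hypothetical names, not Spark's API) of dropping pushed filters
// with stale references after column pruning.
final case class Attr(name: String)
final case class PushedFilter(repr: String, references: Set[Attr])

val i = Attr("i")
val j = Attr("j")

// SELECT j FROM t WHERE i > 3: after pruning, only j survives in the output.
val prunedOutput: Set[Attr] = Set(j)
val pushed = Seq(
  PushedFilter("i > 3", Set(i)),       // references the pruned column i
  PushedFilter("IsNotNull(j)", Set(j)) // references only surviving columns
)

// Keep only filters whose references are all within the pruned output:
val kept = pushed.filter(_.references.subsetOf(prunedOutput))
println(kept.map(_.repr)) // List(IsNotNull(j)) — "i > 3" is dropped
```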

)
}
}
@@ -23,7 +23,7 @@ import scala.collection.mutable

import org.apache.spark.SparkException
import org.apache.spark.internal.LogKeys.{AGGREGATE_FUNCTIONS, COLUMN_NAMES, GROUP_BY_EXPRS, JOIN_CONDITION, JOIN_TYPE, POST_SCAN_FILTERS, PUSHED_FILTERS, RELATION_NAME, RELATION_OUTPUT}
import org.apache.spark.sql.catalyst.expressions.{aggregate, Alias, And, Attribute, AttributeMap, AttributeReference, AttributeSet, Cast, Expression, ExprId, IntegerLiteral, Literal, NamedExpression, PredicateHelper, ProjectionOverSchema, SortOrder, SubqueryExpression}
import org.apache.spark.sql.catalyst.expressions.{aggregate, Alias, And, Attribute, AttributeMap, AttributeReference, AttributeSet, Cast, Expression, ExpressionSet, ExprId, IntegerLiteral, Literal, NamedExpression, PredicateHelper, ProjectionOverSchema, SortOrder, SubqueryExpression}
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.optimizer.CollapseProject
import org.apache.spark.sql.catalyst.planning.{PhysicalOperation, ScanOperation}
@@ -95,6 +95,14 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {

val postScanFilters = postScanFiltersWithoutSubquery ++ normalizedFiltersWithSubquery

// Compute the pushed filter expressions: the normalized filters that were fully pushed
// down (i.e., not in postScanFilters). These are stored on the scan relation for
// potential future use in constraint propagation.
Contributor:

ExpressionSet.contains only checks its internal baseSet, which excludes non-deterministic expressions (they go to originals only — see ExpressionSet.add at ExpressionSet.scala:97-103). So if a non-deterministic filter like rand() > 0.5 is untranslatable and ends up in postScanFiltersWithoutSubquery, postScanFilterSet.contains will return false for it, and it will be incorrectly included in pushedFilterExpressions.

Trace: SELECT * FROM t WHERE i > 3 AND rand() > 0.5:

  1. normalizedFiltersWithoutSubquery = [i > 3, rand() > 0.5]
  2. PushDownUtils.pushFilters can't translate rand() > 0.5 → it lands in untranslatableExprs → postScanFiltersWithoutSubquery
  3. ExpressionSet(postScanFiltersWithoutSubquery) → rand() > 0.5 goes into originals only
  4. .filterNot(postScanFilterSet.contains) → contains(rand() > 0.5) returns false → NOT filtered out
  5. Result: pushedFilterExpressions incorrectly includes rand() > 0.5

Consider filtering the result:

Suggested change
// potential future use in constraint propagation.
val postScanFilterSet = ExpressionSet(postScanFiltersWithoutSubquery)
sHolder.pushedFilterExpressions =
normalizedFiltersWithoutSubquery.filterNot(postScanFilterSet.contains).filter(_.deterministic)
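The baseSet-vs-originals behavior can be mimicked with a minimal standalone Scala sketch. It models, but does not use, Spark's ExpressionSet: deterministic expressions are indexed for membership checks, while non-deterministic ones are only kept verbatim, so `contains` never matches them.

```scala
// Minimal pure-Scala model (NOT Spark's real classes) of why ExpressionSet's
// contains misses non-deterministic expressions.
final case class Expr(repr: String, deterministic: Boolean)

final class MiniExpressionSet(exprs: Seq[Expr]) {
  // Only deterministic expressions are indexed (Spark's baseSet);
  // non-deterministic ones live in `originals` and are invisible to contains.
  private val baseSet: Set[String] =
    exprs.filter(_.deterministic).map(_.repr).toSet
  def contains(e: Expr): Boolean = e.deterministic && baseSet.contains(e.repr)
}

val gt  = Expr("i > 3", deterministic = true)
val rnd = Expr("rand() > 0.5", deterministic = false)

// rand() > 0.5 was untranslatable, so it stayed as a post-scan filter:
val postScanFilterSet = new MiniExpressionSet(Seq(rnd))
val normalized = Seq(gt, rnd)

// Without the determinism guard, rand() > 0.5 slips into pushed filters:
val pushedWrong = normalized.filterNot(postScanFilterSet.contains)
// With it, only the truly pushed, deterministic filter remains:
val pushedRight = pushedWrong.filter(_.deterministic)

println(pushedWrong.map(_.repr)) // List(i > 3, rand() > 0.5)
println(pushedRight.map(_.repr)) // List(i > 3)
```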

Contributor Author:

Thanks for the info, TIL!

val postScanFilterSet = ExpressionSet(postScanFiltersWithoutSubquery)
sHolder.pushedFilterExpressions = normalizedFiltersWithoutSubquery
.filterNot(postScanFilterSet.contains)
.filter(_.deterministic)

logInfo(
log"""
|Pushing operators to ${MDC(RELATION_NAME, sHolder.relation.name)}
@@ -698,6 +706,8 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
assert(realOutput.length == holder.output.length,
"The data source returns unexpected number of columns")
val wrappedScan = getWrappedScan(scan, holder)
// Note: holder.pushedFilterExpressions is not propagated here because the output schema
Contributor:

Is it hard to fix?

Contributor Author:

In theory it's not, but the aggregate path replaces the output schema entirely (table columns -> aggregate columns), so the original filter expressions can't be remapped to the new output. So I'd prefer to defer this and revisit if there's a concrete use case.

// changes to aggregate columns. When validConstraints is wired up, this needs revisiting.
val scanRelation = DataSourceV2ScanRelation(holder.relation, wrappedScan, realOutput)
val projectList = realOutput.zip(holder.output).map { case (a1, a2) =>
// The data source may return columns with arbitrary data types and it's safer to cast them
@@ -715,6 +725,8 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
assert(realOutput.length == holder.output.length,
"The data source returns unexpected number of columns")
val wrappedScan = getWrappedScan(scan, holder)
// Note: holder.pushedFilterExpressions is not propagated here because the output schema
// changes with pushed join. When validConstraints is wired up, this needs revisiting.
val scanRelation = DataSourceV2ScanRelation(holder.relation, wrappedScan, realOutput)

// When join is pushed down, the real output is going to be, for example,
@@ -737,6 +749,8 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
val scan = holder.builder.build()
val realOutput = toAttributes(scan.readSchema())
val wrappedScan = getWrappedScan(scan, holder)
// Note: holder.pushedFilterExpressions is not propagated here because the output schema
// changes with variant extraction. When validConstraints is wired up, this needs revisiting.
val scanRelation = DataSourceV2ScanRelation(holder.relation, wrappedScan, realOutput)

// Create projection to map real output to expected output (with transformed types)
@@ -787,14 +801,19 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {

val wrappedScan = getWrappedScan(scan, sHolder)

val scanRelation = DataSourceV2ScanRelation(sHolder.relation, wrappedScan, output)

val projectionOverSchema =
ProjectionOverSchema(output.toStructType, AttributeSet(output))
val projectionFunc = (expr: Expression) => expr transformDown {
case projectionOverSchema(newExpr) => newExpr
}

// Remap pushed filter attributes to the pruned output schema and drop filters
// whose references are no longer in the pruned output.
val remappedPushedFilters = sHolder.pushedFilterExpressions.map(projectionFunc)
.filter(_.references.subsetOf(AttributeSet(output)))
val scanRelation = DataSourceV2ScanRelation(sHolder.relation, wrappedScan, output,
pushedFilters = remappedPushedFilters)

val finalFilters = normalizedFilters.map(projectionFunc)
// bottom-most filters are put in the left of the list.
val withFilter = finalFilters.foldLeft[LogicalPlan](scanRelation)((plan, cond) => {
@@ -1018,6 +1037,8 @@ case class ScanBuilderHolder(
var pushedVariantAttributeMap: Map[ExprId, AttributeReference] = Map.empty

var pushedVariants: Option[VariantInRelation] = None

var pushedFilterExpressions: Seq[Expression] = Seq.empty
}

// A wrapper for v1 scan to carry the translated filters and the handled ones, along with