diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala index 2340385dcdd66..f1cb20ca40619 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import scala.annotation.tailrec -import org.apache.spark.sql.catalyst.analysis.MultiAlias +import org.apache.spark.sql.catalyst.analysis.{MultiAlias, UnresolvedFunction} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Project} import org.apache.spark.sql.catalyst.trees.CurrentOrigin @@ -112,6 +112,10 @@ trait AliasHelper { } protected def trimAliases(e: Expression): Expression = e match { + // SPARK-48091: Do not descend into unresolved function calls. Aliases inside them + // (e.g., UnresolvedFunction("struct", Seq(Alias(x, "data")))) carry semantic information + // that ResolveFunctions -> CreateStruct.apply consumes to produce field names. + case u: UnresolvedFunction => u // The children of `CreateNamedStruct` may use `Alias` to carry metadata and we should not // trim them. case c: CreateNamedStruct => c.mapChildren { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala index 015ea9defae94..58f399bf797f5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.trees.LeafLike import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{IntegerType, StructType} +import org.apache.spark.sql.types.{ArrayType, IntegerType, StructType} class GeneratorFunctionSuite extends SharedSparkSession { import testImplicits._ @@ -765,6 +765,37 @@ class GeneratorFunctionSuite extends SharedSparkSession { Seq(Row(0, 10, 0, 10), Row(1, 20, 1, 20)) ) } + + test("SPARK-48091: explode with transform should preserve struct field aliases") { + val df = spark.createDataFrame(Seq((1, Array(1, 2, 3), Array(4, 5, 6)))) + .toDF("id", "my_array", "my_array2") + + // Without explode - aliases should work (baseline) + val good = df.select( + transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct") + ) + assert(good.schema("my_struct").dataType.asInstanceOf[ArrayType] + .elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data")) + + // With explode in same select - aliases should still be preserved + val result = df.select( + explode(col("my_array")).as("exploded"), + transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct") + ) + assert(result.schema("my_struct").dataType.asInstanceOf[ArrayType] + .elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data")) + + // Multiple aliases inside struct + val result2 = df.select( + explode(col("my_array")).as("exploded"), + transform(col("my_array2"), + x => struct(x.as("value"), col("id").as("key")) + ).as("my_struct") + ) + val fields2 = result2.schema("my_struct").dataType.asInstanceOf[ArrayType] + .elementType.asInstanceOf[StructType].fieldNames.toSeq + assert(fields2 === Seq("value", "key")) + } } case class EmptyGenerator() extends Generator with LeafLike[Expression] {