Commit 47ef17e

Fix test failures
1 parent 13aa686 commit 47ef17e

File tree

6 files changed: +180 -177 lines changed


python/pyspark/sql/connect/functions/builtin.py

Lines changed: 7 additions & 2 deletions
@@ -4337,12 +4337,17 @@ def hll_union(
 def theta_sketch_agg(
     col: "ColumnOrName",
     lgNomEntries: Optional[Union[int, Column]] = None,
+    family: Optional[str] = None,
 ) -> Column:
     fn = "theta_sketch_agg"
-    if lgNomEntries is None:
+    if lgNomEntries is None and family is None:
         return _invoke_function_over_columns(fn, col)
-    else:
+    elif family is None:
         return _invoke_function_over_columns(fn, col, lit(lgNomEntries))
+    else:
+        if lgNomEntries is None:
+            lgNomEntries = 12  # default value
+        return _invoke_function_over_columns(fn, col, lit(lgNomEntries), lit(family))


 theta_sketch_agg.__doc__ = pysparkfuncs.theta_sketch_agg.__doc__
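
For orientation, a minimal usage sketch of the updated Spark Connect signature; it assumes an active SparkSession bound to `spark` and reuses the toy data from the doctests in this commit:

    from pyspark.sql import functions as sf

    # Same toy data as the doctests in this commit.
    df = spark.createDataFrame([1, 2, 2, 3], "INT")

    # No arguments: defaults to lgNomEntries=12 and the QUICKSELECT family.
    df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show()

    # Explicit lgNomEntries only: family still defaults to QUICKSELECT.
    df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15))).show()

    # Explicit lgNomEntries and family.
    df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15, "ALPHA"))).show()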

python/pyspark/sql/functions/builtin.py

Lines changed: 20 additions & 20 deletions
@@ -25978,25 +25978,25 @@ def theta_sketch_agg(
     >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame([1,2,2,3], "INT")
     >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show()
-    +--------------------------------------------------+
-    |theta_sketch_estimate(theta_sketch_agg(value, 12))|
-    +--------------------------------------------------+
-    |                                                 3|
-    +--------------------------------------------------+
+    +---------------------------------------------------------------+
+    |theta_sketch_estimate(theta_sketch_agg(value, 12, QUICKSELECT))|
+    +---------------------------------------------------------------+
+    |                                                              3|
+    +---------------------------------------------------------------+

     >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15))).show()
-    +--------------------------------------------------+
-    |theta_sketch_estimate(theta_sketch_agg(value, 15))|
-    +--------------------------------------------------+
-    |                                                 3|
-    +--------------------------------------------------+
+    +---------------------------------------------------------------+
+    |theta_sketch_estimate(theta_sketch_agg(value, 15, QUICKSELECT))|
+    +---------------------------------------------------------------+
+    |                                                              3|
+    +---------------------------------------------------------------+

     >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15, "ALPHA"))).show()
-    +-------------------------------------------------------+
-    |theta_sketch_estimate(theta_sketch_agg(value, 15, AL..|
-    +-------------------------------------------------------+
-    |                                                      3|
-    +-------------------------------------------------------+
+    +---------------------------------------------------------+
+    |theta_sketch_estimate(theta_sketch_agg(value, 15, ALPHA))|
+    +---------------------------------------------------------+
+    |                                                        3|
+    +---------------------------------------------------------+
     """
     fn = "theta_sketch_agg"
     if lgNomEntries is None and family is None:

@@ -26133,11 +26133,11 @@ def theta_sketch_estimate(col: "ColumnOrName") -> Column:
     >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame([1,2,2,3], "INT")
     >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show()
-    +--------------------------------------------------+
-    |theta_sketch_estimate(theta_sketch_agg(value, 12))|
-    +--------------------------------------------------+
-    |                                                 3|
-    +--------------------------------------------------+
+    +---------------------------------------------------------------+
+    |theta_sketch_estimate(theta_sketch_agg(value, 12, QUICKSELECT))|
+    +---------------------------------------------------------------+
+    |                                                              3|
+    +---------------------------------------------------------------+
     """

     fn = "theta_sketch_estimate"

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala

Lines changed: 2 additions & 4 deletions
@@ -76,16 +76,14 @@ case class FinalizedSketch(sketch: CompactSketch) extends ThetaSketchState {
     _FUNC_(expr, lgNomEntries, family) - Returns the ThetaSketch compact binary representation.
       `lgNomEntries` (optional) is the log-base-2 of nominal entries, with nominal entries deciding
       the number buckets or slots for the ThetaSketch.
-      `family` (optional) is the sketch family, either 'QUICKSELECT' or 'ALPHA' (defaults to 'QUICKSELECT').
-      Note: You can pass family as the second parameter to use default lgNomEntries with a specific family.""",
+      `family` (optional) is the sketch family, either 'QUICKSELECT' or 'ALPHA' (defaults to
+      'QUICKSELECT').""",
   examples = """
     Examples:
       > SELECT theta_sketch_estimate(_FUNC_(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col);
        3
       > SELECT theta_sketch_estimate(_FUNC_(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col);
        3
-      > SELECT theta_sketch_estimate(_FUNC_(col, 'ALPHA')) FROM VALUES (1), (1), (2), (2), (3) tab(col);
-       3
       > SELECT theta_sketch_estimate(_FUNC_(col, 15, 'ALPHA')) FROM VALUES (1), (1), (2), (2), (3) tab(col);
        3
     """,

sql/core/src/test/resources/sql-functions/sql-expression-schema.md

Lines changed: 5 additions & 5 deletions
@@ -342,10 +342,10 @@
 | org.apache.spark.sql.catalyst.expressions.Subtract | - | SELECT 2 - 1 | struct<(2 - 1):int> |
 | org.apache.spark.sql.catalyst.expressions.Tan | tan | SELECT tan(0) | struct<TAN(0):double> |
 | org.apache.spark.sql.catalyst.expressions.Tanh | tanh | SELECT tanh(0) | struct<TANH(0):double> |
-| org.apache.spark.sql.catalyst.expressions.ThetaDifference | theta_difference | SELECT theta_sketch_estimate(theta_difference(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct<theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12))):bigint> |
-| org.apache.spark.sql.catalyst.expressions.ThetaIntersection | theta_intersection | SELECT theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct<theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12))):bigint> |
-| org.apache.spark.sql.catalyst.expressions.ThetaSketchEstimate | theta_sketch_estimate | SELECT theta_sketch_estimate(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct<theta_sketch_estimate(theta_sketch_agg(col, 12)):bigint> |
-| org.apache.spark.sql.catalyst.expressions.ThetaUnion | theta_union | SELECT theta_sketch_estimate(theta_union(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (1, 4), (1, 4), (2, 5), (2, 5), (3, 6) tab(col1, col2) | struct<theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 12)):bigint> |
+| org.apache.spark.sql.catalyst.expressions.ThetaDifference | theta_difference | SELECT theta_sketch_estimate(theta_difference(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct<theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12, QUICKSELECT), theta_sketch_agg(col2, 12, QUICKSELECT))):bigint> |
+| org.apache.spark.sql.catalyst.expressions.ThetaIntersection | theta_intersection | SELECT theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct<theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12, QUICKSELECT), theta_sketch_agg(col2, 12, QUICKSELECT))):bigint> |
+| org.apache.spark.sql.catalyst.expressions.ThetaSketchEstimate | theta_sketch_estimate | SELECT theta_sketch_estimate(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct<theta_sketch_estimate(theta_sketch_agg(col, 12, QUICKSELECT)):bigint> |
+| org.apache.spark.sql.catalyst.expressions.ThetaUnion | theta_union | SELECT theta_sketch_estimate(theta_union(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (1, 4), (1, 4), (2, 5), (2, 5), (3, 6) tab(col1, col2) | struct<theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12, QUICKSELECT), theta_sketch_agg(col2, 12, QUICKSELECT), 12)):bigint> |
 | org.apache.spark.sql.catalyst.expressions.TimeDiff | time_diff | SELECT time_diff('HOUR', TIME'20:30:29', TIME'21:30:28') | struct<time_diff(HOUR, TIME '20:30:29', TIME '21:30:28'):bigint> |
 | org.apache.spark.sql.catalyst.expressions.TimeTrunc | time_trunc | SELECT time_trunc('HOUR', TIME'09:32:05.359') | struct<time_trunc(HOUR, TIME '09:32:05.359'):time(6)> |
 | org.apache.spark.sql.catalyst.expressions.TimeWindow | window | SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, window(b, '5 minutes') ORDER BY a, start | struct<a:string,start:timestamp,end:timestamp,cnt:bigint> |

@@ -468,7 +468,7 @@
 | org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | stddev_samp | SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col) | struct<stddev_samp(col):double> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.Sum | sum | SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct<sum(col):bigint> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.ThetaIntersectionAgg | theta_intersection_agg | SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col)) | struct<theta_sketch_estimate(theta_intersection_agg(sketch)):bigint> |
-| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaSketchAgg | theta_sketch_agg | SELECT theta_sketch_estimate(theta_sketch_agg(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct<theta_sketch_estimate(theta_sketch_agg(col, 12)):bigint> |
+| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaSketchAgg | theta_sketch_agg | SELECT theta_sketch_estimate(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct<theta_sketch_estimate(theta_sketch_agg(col, 12, QUICKSELECT)):bigint> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.ThetaUnionAgg | theta_union_agg | SELECT theta_sketch_estimate(theta_union_agg(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col)) | struct<theta_sketch_estimate(theta_union_agg(sketch, 12)):bigint> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.TryAverageExpressionBuilder | try_avg | SELECT try_avg(col) FROM VALUES (1), (2), (3) AS tab(col) | struct<try_avg(col):double> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.TrySumExpressionBuilder | try_sum | SELECT try_sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct<try_sum(col):bigint> |
