Commit 47ef17e

Fix test failures
1 parent 13aa686 commit 47ef17e

File tree

6 files changed: +180 -177 lines changed


python/pyspark/sql/connect/functions/builtin.py

Lines changed: 7 additions & 2 deletions
@@ -4337,12 +4337,17 @@ def hll_union(
 def theta_sketch_agg(
     col: "ColumnOrName",
     lgNomEntries: Optional[Union[int, Column]] = None,
+    family: Optional[str] = None,
 ) -> Column:
     fn = "theta_sketch_agg"
-    if lgNomEntries is None:
+    if lgNomEntries is None and family is None:
         return _invoke_function_over_columns(fn, col)
-    else:
+    elif family is None:
         return _invoke_function_over_columns(fn, col, lit(lgNomEntries))
+    else:
+        if lgNomEntries is None:
+            lgNomEntries = 12  # default value
+        return _invoke_function_over_columns(fn, col, lit(lgNomEntries), lit(family))


 theta_sketch_agg.__doc__ = pysparkfuncs.theta_sketch_agg.__doc__
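
For orientation, a minimal usage sketch of the updated Spark Connect signature; it assumes an active SparkSession bound to `spark` and reuses the toy data from the doctests in this commit:

    from pyspark.sql import functions as sf

    # Same toy data as the doctests in this commit.
    df = spark.createDataFrame([1, 2, 2, 3], "INT")

    # No arguments: defaults to lgNomEntries=12 and the QUICKSELECT family.
    df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show()

    # Explicit lgNomEntries only: family still defaults to QUICKSELECT.
    df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15))).show()

    # Explicit lgNomEntries and family.
    df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15, "ALPHA"))).show()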

python/pyspark/sql/functions/builtin.py

Lines changed: 20 additions & 20 deletions
@@ -25978,25 +25978,25 @@ def theta_sketch_agg(
     >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame([1,2,2,3], "INT")
     >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show()
-    +--------------------------------------------------+
-    |theta_sketch_estimate(theta_sketch_agg(value, 12))|
-    +--------------------------------------------------+
-    |                                                 3|
-    +--------------------------------------------------+
+    +---------------------------------------------------------------+
+    |theta_sketch_estimate(theta_sketch_agg(value, 12, QUICKSELECT))|
+    +---------------------------------------------------------------+
+    |                                                              3|
+    +---------------------------------------------------------------+

     >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15))).show()
-    +--------------------------------------------------+
-    |theta_sketch_estimate(theta_sketch_agg(value, 15))|
-    +--------------------------------------------------+
-    |                                                 3|
-    +--------------------------------------------------+
+    +---------------------------------------------------------------+
+    |theta_sketch_estimate(theta_sketch_agg(value, 15, QUICKSELECT))|
+    +---------------------------------------------------------------+
+    |                                                              3|
+    +---------------------------------------------------------------+

     >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15, "ALPHA"))).show()
-    +-------------------------------------------------------+
-    |theta_sketch_estimate(theta_sketch_agg(value, 15, AL..|
-    +-------------------------------------------------------+
-    |                                                      3|
-    +-------------------------------------------------------+
+    +---------------------------------------------------------+
+    |theta_sketch_estimate(theta_sketch_agg(value, 15, ALPHA))|
+    +---------------------------------------------------------+
+    |                                                        3|
+    +---------------------------------------------------------+
     """
     fn = "theta_sketch_agg"
     if lgNomEntries is None and family is None:

@@ -26133,11 +26133,11 @@ def theta_sketch_estimate(col: "ColumnOrName") -> Column:
     >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame([1,2,2,3], "INT")
     >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show()
-    +--------------------------------------------------+
-    |theta_sketch_estimate(theta_sketch_agg(value, 12))|
-    +--------------------------------------------------+
-    |                                                 3|
-    +--------------------------------------------------+
+    +---------------------------------------------------------------+
+    |theta_sketch_estimate(theta_sketch_agg(value, 12, QUICKSELECT))|
+    +---------------------------------------------------------------+
+    |                                                              3|
+    +---------------------------------------------------------------+
     """

     fn = "theta_sketch_estimate"

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala

Lines changed: 2 additions & 4 deletions
@@ -76,16 +76,14 @@ case class FinalizedSketch(sketch: CompactSketch) extends ThetaSketchState {
     _FUNC_(expr, lgNomEntries, family) - Returns the ThetaSketch compact binary representation.
       `lgNomEntries` (optional) is the log-base-2 of nominal entries, with nominal entries deciding
       the number buckets or slots for the ThetaSketch.
-      `family` (optional) is the sketch family, either 'QUICKSELECT' or 'ALPHA' (defaults to 'QUICKSELECT').
-      Note: You can pass family as the second parameter to use default lgNomEntries with a specific family.""",
+      `family` (optional) is the sketch family, either 'QUICKSELECT' or 'ALPHA' (defaults to
+      'QUICKSELECT').""",
   examples = """
     Examples:
       > SELECT theta_sketch_estimate(_FUNC_(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col);
        3
       > SELECT theta_sketch_estimate(_FUNC_(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col);
        3
-      > SELECT theta_sketch_estimate(_FUNC_(col, 'ALPHA')) FROM VALUES (1), (1), (2), (2), (3) tab(col);
-       3
       > SELECT theta_sketch_estimate(_FUNC_(col, 15, 'ALPHA')) FROM VALUES (1), (1), (2), (2), (3) tab(col);
        3
     """,

sql/core/src/test/resources/sql-functions/sql-expression-schema.md

Lines changed: 5 additions & 5 deletions
@@ -342,10 +342,10 @@
 | org.apache.spark.sql.catalyst.expressions.Subtract | - | SELECT 2 - 1 | struct<(2 - 1):int> |
 | org.apache.spark.sql.catalyst.expressions.Tan | tan | SELECT tan(0) | struct<TAN(0):double> |
 | org.apache.spark.sql.catalyst.expressions.Tanh | tanh | SELECT tanh(0) | struct<TANH(0):double> |
-| org.apache.spark.sql.catalyst.expressions.ThetaDifference | theta_difference | SELECT theta_sketch_estimate(theta_difference(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct<theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12))):bigint> |
-| org.apache.spark.sql.catalyst.expressions.ThetaIntersection | theta_intersection | SELECT theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct<theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12))):bigint> |
-| org.apache.spark.sql.catalyst.expressions.ThetaSketchEstimate | theta_sketch_estimate | SELECT theta_sketch_estimate(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct<theta_sketch_estimate(theta_sketch_agg(col, 12)):bigint> |
-| org.apache.spark.sql.catalyst.expressions.ThetaUnion | theta_union | SELECT theta_sketch_estimate(theta_union(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (1, 4), (1, 4), (2, 5), (2, 5), (3, 6) tab(col1, col2) | struct<theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 12)):bigint> |
+| org.apache.spark.sql.catalyst.expressions.ThetaDifference | theta_difference | SELECT theta_sketch_estimate(theta_difference(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct<theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12, QUICKSELECT), theta_sketch_agg(col2, 12, QUICKSELECT))):bigint> |
+| org.apache.spark.sql.catalyst.expressions.ThetaIntersection | theta_intersection | SELECT theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct<theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12, QUICKSELECT), theta_sketch_agg(col2, 12, QUICKSELECT))):bigint> |
+| org.apache.spark.sql.catalyst.expressions.ThetaSketchEstimate | theta_sketch_estimate | SELECT theta_sketch_estimate(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct<theta_sketch_estimate(theta_sketch_agg(col, 12, QUICKSELECT)):bigint> |
+| org.apache.spark.sql.catalyst.expressions.ThetaUnion | theta_union | SELECT theta_sketch_estimate(theta_union(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (1, 4), (1, 4), (2, 5), (2, 5), (3, 6) tab(col1, col2) | struct<theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12, QUICKSELECT), theta_sketch_agg(col2, 12, QUICKSELECT), 12)):bigint> |
 | org.apache.spark.sql.catalyst.expressions.TimeDiff | time_diff | SELECT time_diff('HOUR', TIME'20:30:29', TIME'21:30:28') | struct<time_diff(HOUR, TIME '20:30:29', TIME '21:30:28'):bigint> |
 | org.apache.spark.sql.catalyst.expressions.TimeTrunc | time_trunc | SELECT time_trunc('HOUR', TIME'09:32:05.359') | struct<time_trunc(HOUR, TIME '09:32:05.359'):time(6)> |
 | org.apache.spark.sql.catalyst.expressions.TimeWindow | window | SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, window(b, '5 minutes') ORDER BY a, start | struct<a:string,start:timestamp,end:timestamp,cnt:bigint> |

@@ -468,7 +468,7 @@
 | org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | stddev_samp | SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col) | struct<stddev_samp(col):double> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.Sum | sum | SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct<sum(col):bigint> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.ThetaIntersectionAgg | theta_intersection_agg | SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col)) | struct<theta_sketch_estimate(theta_intersection_agg(sketch)):bigint> |
-| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaSketchAgg | theta_sketch_agg | SELECT theta_sketch_estimate(theta_sketch_agg(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct<theta_sketch_estimate(theta_sketch_agg(col, 12)):bigint> |
+| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaSketchAgg | theta_sketch_agg | SELECT theta_sketch_estimate(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct<theta_sketch_estimate(theta_sketch_agg(col, 12, QUICKSELECT)):bigint> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.ThetaUnionAgg | theta_union_agg | SELECT theta_sketch_estimate(theta_union_agg(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col)) | struct<theta_sketch_estimate(theta_union_agg(sketch, 12)):bigint> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.TryAverageExpressionBuilder | try_avg | SELECT try_avg(col) FROM VALUES (1), (2), (3) AS tab(col) | struct<try_avg(col):double> |
 | org.apache.spark.sql.catalyst.expressions.aggregate.TrySumExpressionBuilder | try_sum | SELECT try_sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct<try_sum(col):bigint> |
