
Gold Standard: spark-only version for creating and comparing golden files #361

Open · wants to merge 4 commits into master
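The diff adds approved golden plan files for TPC-DS-style queries, generated and checked with plain Spark only. As a rough illustration of that round trip (the names goldenDir, checkPlan, and the regenerate flag below are illustrative assumptions, not this PR's actual API), a golden-file check might look like this:

import java.nio.file.{Files, Paths, StandardOpenOption}
import org.apache.spark.sql.SparkSession

// Illustrative sketch of a golden-file round trip: run a query, capture its
// physical plan as text, and either (re)generate the golden file or compare
// the current plan against it. goldenDir, checkPlan, and regenerate are
// hypothetical names used only for this example.
object PlanGoldenFiles {
  val goldenDir = Paths.get("src/test/resources/tpcds/approved-plans")

  def checkPlan(spark: SparkSession, queryName: String, queryText: String, regenerate: Boolean): Unit = {
    val plan = spark.sql(queryText).queryExecution.executedPlan.toString()
    val file = goldenDir.resolve(s"$queryName.explain.txt")
    if (regenerate) {
      // Rewrite the golden file from the current plan.
      Files.createDirectories(goldenDir)
      Files.write(file, plan.getBytes("UTF-8"),
        StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
    } else {
      // Compare the current plan against the stored golden file.
      val expected = new String(Files.readAllBytes(file), "UTF-8")
      assert(expected == plan, s"Physical plan for $queryName differs from the golden file")
    }
  }
}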
@@ -0,0 +1,43 @@
== Physical Plan ==
TakeOrderedAndProject(limit=100, orderBy=[c_customer_id#1 ASC NULLS FIRST], output=[c_customer_id#1])
+- *(9) Project [c_customer_id#1]
+- *(9) BroadcastHashJoin [ctr_customer_sk#2], [cast(c_customer_sk#3 as bigint)], Inner, BuildRight
:- *(9) Project [ctr_customer_sk#2]
: +- *(9) BroadcastHashJoin [ctr_store_sk#4], [cast(s_store_sk#5 as bigint)], Inner, BuildRight
: :- *(9) Project [ctr_customer_sk#2, ctr_store_sk#4]
: : +- *(9) BroadcastHashJoin [ctr_store_sk#4], [ctr_store_sk#4#6], Inner, BuildRight, (cast(ctr_total_return#7 as decimal(24,7)) > (CAST(avg(ctr_total_return) AS DECIMAL(21,6)) * CAST(1.2 AS DECIMAL(21,6)))#8)
: : :- *(9) Filter isnotnull(ctr_total_return#7)
: : : +- *(9) HashAggregate(keys=[sr_customer_sk#9, sr_store_sk#10], functions=[sum(UnscaledValue(sr_return_amt#11))])
: : : +- Exchange hashpartitioning(sr_customer_sk#9, sr_store_sk#10, 200)
: : : +- *(2) HashAggregate(keys=[sr_customer_sk#9, sr_store_sk#10], functions=[partial_sum(UnscaledValue(sr_return_amt#11))])
: : : +- *(2) Project [sr_customer_sk#9, sr_store_sk#10, sr_return_amt#11]
: : : +- *(2) BroadcastHashJoin [sr_returned_date_sk#12], [cast(d_date_sk#13 as bigint)], Inner, BuildRight
: : : :- *(2) Project [sr_returned_date_sk#12, sr_customer_sk#9, sr_store_sk#10, sr_return_amt#11]
: : : : +- *(2) Filter ((isnotnull(sr_returned_date_sk#12) && isnotnull(sr_store_sk#10)) && isnotnull(sr_customer_sk#9))
: : : : +- *(2) FileScan parquet default.store_returns[sr_returned_date_sk#12,sr_customer_sk#9,sr_store_sk#10,sr_return_amt#11] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/store_returns], PartitionFilters: [], PushedFilters: [IsNotNull(sr_returned_date_sk), IsNotNull(sr_store_sk), IsNotNull(sr_customer_sk)], ReadSchema: struct<sr_returned_date_sk:bigint,sr_customer_sk:bigint,sr_store_sk:bigint,sr_return_amt:decimal(...
: : : +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: : : +- *(1) Project [d_date_sk#13]
: : : +- *(1) Filter ((isnotnull(d_year#14) && (d_year#14 = 2000)) && isnotnull(d_date_sk#13))
: : : +- *(1) FileScan parquet default.date_dim[d_date_sk#13,d_year#14] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/date_dim], PartitionFilters: [], PushedFilters: [IsNotNull(d_year), EqualTo(d_year,2000), IsNotNull(d_date_sk)], ReadSchema: struct<d_date_sk:int,d_year:int>
: : +- BroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true]))
: : +- *(6) Filter isnotnull((CAST(avg(ctr_total_return) AS DECIMAL(21,6)) * CAST(1.2 AS DECIMAL(21,6)))#8)
: : +- *(6) HashAggregate(keys=[ctr_store_sk#4], functions=[avg(ctr_total_return#7)])
: : +- Exchange hashpartitioning(ctr_store_sk#4, 200)
: : +- *(5) HashAggregate(keys=[ctr_store_sk#4], functions=[partial_avg(ctr_total_return#7)])
: : +- *(5) HashAggregate(keys=[sr_customer_sk#9, sr_store_sk#10], functions=[sum(UnscaledValue(sr_return_amt#11))])
: : +- Exchange hashpartitioning(sr_customer_sk#9, sr_store_sk#10, 200)
: : +- *(4) HashAggregate(keys=[sr_customer_sk#9, sr_store_sk#10], functions=[partial_sum(UnscaledValue(sr_return_amt#11))])
: : +- *(4) Project [sr_customer_sk#9, sr_store_sk#10, sr_return_amt#11]
: : +- *(4) BroadcastHashJoin [sr_returned_date_sk#12], [cast(d_date_sk#13 as bigint)], Inner, BuildRight
: : :- *(4) Project [sr_returned_date_sk#12, sr_customer_sk#9, sr_store_sk#10, sr_return_amt#11]
: : : +- *(4) Filter (isnotnull(sr_returned_date_sk#12) && isnotnull(sr_store_sk#10))
: : : +- *(4) FileScan parquet default.store_returns[sr_returned_date_sk#12,sr_customer_sk#9,sr_store_sk#10,sr_return_amt#11] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/store_returns], PartitionFilters: [], PushedFilters: [IsNotNull(sr_returned_date_sk), IsNotNull(sr_store_sk)], ReadSchema: struct<sr_returned_date_sk:bigint,sr_customer_sk:bigint,sr_store_sk:bigint,sr_return_amt:decimal(...
: : +- ReusedExchange [d_date_sk#13], BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: +- *(7) Project [s_store_sk#5]
: +- *(7) Filter ((isnotnull(s_state#15) && (s_state#15 = TN)) && isnotnull(s_store_sk#5))
: +- *(7) FileScan parquet default.store[s_store_sk#5,s_state#15] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/store], PartitionFilters: [], PushedFilters: [IsNotNull(s_state), EqualTo(s_state,TN), IsNotNull(s_store_sk)], ReadSchema: struct<s_store_sk:int,s_state:string>
+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
+- *(8) Project [c_customer_sk#3, c_customer_id#1]
+- *(8) Filter isnotnull(c_customer_sk#3)
+- *(8) FileScan parquet default.customer[c_customer_sk#3,c_customer_id#1] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/customer], PartitionFilters: [], PushedFilters: [IsNotNull(c_customer_sk)], ReadSchema: struct<c_customer_sk:int,c_customer_id:string>
@@ -0,0 +1,58 @@
TakeOrderedAndProject [c_customer_id]
WholeStageCodegen
Project [c_customer_id]
BroadcastHashJoin [ctr_customer_sk,c_customer_sk]
Project [ctr_customer_sk]
BroadcastHashJoin [ctr_store_sk,s_store_sk]
Project [ctr_customer_sk,ctr_store_sk]
BroadcastHashJoin [ctr_store_sk,ctr_store_skL,ctr_total_return,(CAST(avg(ctr_total_return) AS DECIMAL(21,6)) * CAST(1.2 AS DECIMAL(21,6)))]
Filter [ctr_total_return]
HashAggregate [sr_customer_sk,sr_store_sk,sum,sum(UnscaledValue(sr_return_amt))] [ctr_customer_sk,ctr_total_return,ctr_store_sk,sum,sum(UnscaledValue(sr_return_amt))]
InputAdapter
Exchange [sr_customer_sk,sr_store_sk] #1
WholeStageCodegen
HashAggregate [sr_customer_sk,sum,sr_return_amt,sum,sr_store_sk] [sum,sum]
Project [sr_customer_sk,sr_store_sk,sr_return_amt]
BroadcastHashJoin [sr_returned_date_sk,d_date_sk]
Project [sr_returned_date_sk,sr_customer_sk,sr_store_sk,sr_return_amt]
Filter [sr_returned_date_sk,sr_store_sk,sr_customer_sk]
Scan parquet default.store_returns [sr_returned_date_sk,sr_customer_sk,sr_store_sk,sr_return_amt] [sr_returned_date_sk,sr_customer_sk,sr_store_sk,sr_return_amt]
InputAdapter
BroadcastExchange #2
WholeStageCodegen
Project [d_date_sk]
Filter [d_year,d_date_sk]
Scan parquet default.date_dim [d_date_sk,d_year] [d_date_sk,d_year]
InputAdapter
BroadcastExchange #3
WholeStageCodegen
Filter [(CAST(avg(ctr_total_return) AS DECIMAL(21,6)) * CAST(1.2 AS DECIMAL(21,6)))]
HashAggregate [ctr_store_sk,sum,count,avg(ctr_total_return)] [avg(ctr_total_return),ctr_store_skL,sum,count,(CAST(avg(ctr_total_return) AS DECIMAL(21,6)) * CAST(1.2 AS DECIMAL(21,6)))]
InputAdapter
Exchange [ctr_store_sk] #4
WholeStageCodegen
HashAggregate [ctr_store_sk,ctr_total_return,sum,count,sum,count] [sum,count,sum,count]
HashAggregate [sr_customer_sk,sr_store_sk,sum,sum(UnscaledValue(sr_return_amt))] [sum(UnscaledValue(sr_return_amt)),ctr_store_sk,ctr_total_return,sum]
InputAdapter
Exchange [sr_customer_sk,sr_store_sk] #5
WholeStageCodegen
HashAggregate [sum,sr_customer_sk,sum,sr_return_amt,sr_store_sk] [sum,sum]
Project [sr_customer_sk,sr_store_sk,sr_return_amt]
BroadcastHashJoin [sr_returned_date_sk,d_date_sk]
Project [sr_returned_date_sk,sr_customer_sk,sr_store_sk,sr_return_amt]
Filter [sr_returned_date_sk,sr_store_sk]
Scan parquet default.store_returns [sr_returned_date_sk,sr_customer_sk,sr_store_sk,sr_return_amt] [sr_returned_date_sk,sr_customer_sk,sr_store_sk,sr_return_amt]
InputAdapter
ReusedExchange [d_date_sk] [d_date_sk] #2
InputAdapter
BroadcastExchange #6
WholeStageCodegen
Project [s_store_sk]
Filter [s_state,s_store_sk]
Scan parquet default.store [s_store_sk,s_state] [s_store_sk,s_state]
InputAdapter
BroadcastExchange #7
WholeStageCodegen
Project [c_customer_sk,c_customer_id]
Filter [c_customer_sk]
Scan parquet default.customer [c_customer_sk,c_customer_id] [c_customer_sk,c_customer_id]
@@ -0,0 +1,49 @@
== Physical Plan ==
TakeOrderedAndProject(limit=100, orderBy=[cd_gender#1 ASC NULLS FIRST,cd_marital_status#2 ASC NULLS FIRST,cd_education_status#3 ASC NULLS FIRST,cd_purchase_estimate#4 ASC NULLS FIRST,cd_credit_rating#5 ASC NULLS FIRST,cd_dep_count#6 ASC NULLS FIRST,cd_dep_employed_count#7 ASC NULLS FIRST,cd_dep_college_count#8 ASC NULLS FIRST], output=[cd_gender#1,cd_marital_status#2,cd_education_status#3,cnt1#9,cd_purchase_estimate#4,cnt2#10,cd_credit_rating#5,cnt3#11,cd_dep_count#6,cnt4#12,cd_dep_employed_count#7,cnt5#13,cd_dep_college_count#8,cnt6#14])
+- *(10) HashAggregate(keys=[cd_gender#1, cd_marital_status#2, cd_education_status#3, cd_purchase_estimate#4, cd_credit_rating#5, cd_dep_count#6, cd_dep_employed_count#7, cd_dep_college_count#8], functions=[count(1)])
+- Exchange hashpartitioning(cd_gender#1, cd_marital_status#2, cd_education_status#3, cd_purchase_estimate#4, cd_credit_rating#5, cd_dep_count#6, cd_dep_employed_count#7, cd_dep_college_count#8, 200)
+- *(9) HashAggregate(keys=[cd_gender#1, cd_marital_status#2, cd_education_status#3, cd_purchase_estimate#4, cd_credit_rating#5, cd_dep_count#6, cd_dep_employed_count#7, cd_dep_college_count#8], functions=[partial_count(1)])
+- *(9) Project [cd_gender#1, cd_marital_status#2, cd_education_status#3, cd_purchase_estimate#4, cd_credit_rating#5, cd_dep_count#6, cd_dep_employed_count#7, cd_dep_college_count#8]
+- *(9) BroadcastHashJoin [c_current_cdemo_sk#15], [cd_demo_sk#16], Inner, BuildRight
:- *(9) Project [c_current_cdemo_sk#15]
: +- *(9) BroadcastHashJoin [c_current_addr_sk#17], [ca_address_sk#18], Inner, BuildRight
: :- *(9) Project [c_current_cdemo_sk#15, c_current_addr_sk#17]
: : +- *(9) Filter (exists#19 || exists#20)
: : +- *(9) BroadcastHashJoin [c_customer_sk#21], [cs_ship_customer_sk#22], ExistenceJoin(exists#20), BuildRight
: : :- *(9) BroadcastHashJoin [c_customer_sk#21], [ws_bill_customer_sk#23], ExistenceJoin(exists#19), BuildRight
: : : :- *(9) BroadcastHashJoin [c_customer_sk#21], [ss_customer_sk#24], LeftSemi, BuildRight
: : : : :- *(9) Project [c_customer_sk#21, c_current_cdemo_sk#15, c_current_addr_sk#17]
: : : : : +- *(9) Filter (isnotnull(c_current_addr_sk#17) && isnotnull(c_current_cdemo_sk#15))
: : : : : +- *(9) FileScan parquet default.customer[c_customer_sk#21,c_current_cdemo_sk#15,c_current_addr_sk#17] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/customer], PartitionFilters: [], PushedFilters: [IsNotNull(c_current_addr_sk), IsNotNull(c_current_cdemo_sk)], ReadSchema: struct<c_customer_sk:int,c_current_cdemo_sk:int,c_current_addr_sk:int>
: : : : +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: : : : +- *(2) Project [ss_customer_sk#24]
: : : : +- *(2) BroadcastHashJoin [ss_sold_date_sk#25], [d_date_sk#26], Inner, BuildRight
: : : : :- *(2) Project [ss_sold_date_sk#25, ss_customer_sk#24]
: : : : : +- *(2) Filter isnotnull(ss_sold_date_sk#25)
: : : : : +- *(2) FileScan parquet default.store_sales[ss_sold_date_sk#25,ss_customer_sk#24] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/store_sales], PartitionFilters: [], PushedFilters: [IsNotNull(ss_sold_date_sk)], ReadSchema: struct<ss_sold_date_sk:int,ss_customer_sk:int>
: : : : +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: : : : +- *(1) Project [d_date_sk#26]
: : : : +- *(1) Filter (((((isnotnull(d_year#27) && isnotnull(d_moy#28)) && (d_year#27 = 2002)) && (d_moy#28 >= 1)) && (d_moy#28 <= 4)) && isnotnull(d_date_sk#26))
: : : : +- *(1) FileScan parquet default.date_dim[d_date_sk#26,d_year#27,d_moy#28] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/date_dim], PartitionFilters: [], PushedFilters: [IsNotNull(d_year), IsNotNull(d_moy), EqualTo(d_year,2002), GreaterThanOrEqual(d_moy,1), LessThan..., ReadSchema: struct<d_date_sk:int,d_year:int,d_moy:int>
: : : +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: : : +- *(4) Project [ws_bill_customer_sk#23]
: : : +- *(4) BroadcastHashJoin [ws_sold_date_sk#29], [d_date_sk#26], Inner, BuildRight
: : : :- *(4) Project [ws_sold_date_sk#29, ws_bill_customer_sk#23]
: : : : +- *(4) Filter isnotnull(ws_sold_date_sk#29)
: : : : +- *(4) FileScan parquet default.web_sales[ws_sold_date_sk#29,ws_bill_customer_sk#23] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/web_sales], PartitionFilters: [], PushedFilters: [IsNotNull(ws_sold_date_sk)], ReadSchema: struct<ws_sold_date_sk:int,ws_bill_customer_sk:int>
: : : +- ReusedExchange [d_date_sk#26], BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: : +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: : +- *(6) Project [cs_ship_customer_sk#22]
: : +- *(6) BroadcastHashJoin [cs_sold_date_sk#30], [d_date_sk#26], Inner, BuildRight
: : :- *(6) Project [cs_sold_date_sk#30, cs_ship_customer_sk#22]
: : : +- *(6) Filter isnotnull(cs_sold_date_sk#30)
: : : +- *(6) FileScan parquet default.catalog_sales[cs_sold_date_sk#30,cs_ship_customer_sk#22] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/catalog_sales], PartitionFilters: [], PushedFilters: [IsNotNull(cs_sold_date_sk)], ReadSchema: struct<cs_sold_date_sk:int,cs_ship_customer_sk:int>
: : +- ReusedExchange [d_date_sk#26], BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
: +- *(7) Project [ca_address_sk#18]
: +- *(7) Filter (ca_county#31 IN (Rush County,Toole County,Jefferson County,Dona Ana County,La Porte County) && isnotnull(ca_address_sk#18))
: +- *(7) FileScan parquet default.customer_address[ca_address_sk#18,ca_county#31] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/customer_address], PartitionFilters: [], PushedFilters: [In(ca_county, [Rush County,Toole County,Jefferson County,Dona Ana County,La Porte County]), IsNo..., ReadSchema: struct<ca_address_sk:int,ca_county:string>
+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)))
+- *(8) Project [cd_demo_sk#16, cd_gender#1, cd_marital_status#2, cd_education_status#3, cd_purchase_estimate#4, cd_credit_rating#5, cd_dep_count#6, cd_dep_employed_count#7, cd_dep_college_count#8]
+- *(8) Filter isnotnull(cd_demo_sk#16)
+- *(8) FileScan parquet default.customer_demographics[cd_demo_sk#16,cd_gender#1,cd_marital_status#2,cd_education_status#3,cd_purchase_estimate#4,cd_credit_rating#5,cd_dep_count#6,cd_dep_employed_count#7,cd_dep_college_count#8] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:/Users/apdave/github/hyperspace-1/spark-warehouse/customer_demographics], PartitionFilters: [], PushedFilters: [IsNotNull(cd_demo_sk)], ReadSchema: struct<cd_demo_sk:int,cd_gender:string,cd_marital_status:string,cd_education_status:string,cd_pur...
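Golden plans like the ones above embed run-specific details, such as expression IDs (for example sr_store_sk#10), whole-stage codegen stage IDs like *(9), and the local spark-warehouse paths inside the FileScan locations, so a literal string comparison would be brittle across machines. A minimal sketch of the kind of normalization a comparison could apply first (the regexes and placeholder tokens are assumptions for illustration, not this PR's implementation):

// Hypothetical normalization applied to a plan string before comparing it with
// a golden file. Masks expression IDs, whole-stage codegen stage IDs, and the
// machine-specific InMemoryFileIndex locations seen in the plans above.
object PlanNormalization {
  def normalize(plan: String): String =
    plan
      .replaceAll("#\\d+L?", "#x")                                             // sr_store_sk#10 -> sr_store_sk#x
      .replaceAll("\\*\\(\\d+\\)", "*")                                        // *(9) -> *
      .replaceAll("InMemoryFileIndex\\[[^\\]]*\\]", "InMemoryFileIndex [masked]") // drop local paths
}

With a step like this, a golden file recorded on one machine (here, under a C:/Users/... warehouse path) would still match on another, since only the normalized strings are compared.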