diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java index 2a947c5e0eed..ede30bfb946d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java @@ -1977,8 +1977,9 @@ private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx) LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats); LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows()); } + boolean useColStats = roi.filterStats.getColumnStats() != null; StatsUtils.updateStats(roi.filterStats, newNumRows, - true, roi.filterOperator, roi.colNames); + useColStats, roi.filterOperator, roi.colNames); if (LOG.isDebugEnabled()) { LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats); } diff --git a/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q b/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q new file mode 100644 index 000000000000..dc4984960ec0 --- /dev/null +++ b/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q @@ -0,0 +1,19 @@ +-- HIVE-29516: Verify that query compilation succeeds when column statistics +-- are missing during semijoin optimization in removeSemijoinOptimizationByBenefit. + +set hive.tez.dynamic.partition.pruning=true; +set hive.tez.dynamic.semijoin.reduction=true; +set hive.tez.bigtable.minsize.semijoin.reduction=1; +set hive.tez.min.bloom.filter.entries=1; +set hive.tez.bloom.filter.factor=1.0f; +set hive.auto.convert.join=false; + +create table t1_nocolstats (id int, val string); +create table t2_nocolstats (id int, val string); + +alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000'); +alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000'); + +explain +select t1.id, t1.val, t2.val +from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id; diff --git a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out new file mode 100644 index 000000000000..8a82a858f96d --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out @@ -0,0 +1,161 @@ +PREHOOK: query: create table t1_nocolstats (id int, val string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1_nocolstats +POSTHOOK: query: create table t1_nocolstats (id int, val string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1_nocolstats +PREHOOK: query: create table t2_nocolstats (id int, val string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t2_nocolstats +POSTHOOK: query: create table t2_nocolstats (id int, val string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2_nocolstats +PREHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t1_nocolstats +PREHOOK: Output: default@t1_nocolstats +POSTHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t1_nocolstats +POSTHOOK: Output: default@t1_nocolstats +PREHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t2_nocolstats +PREHOOK: Output: default@t2_nocolstats +POSTHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t2_nocolstats +POSTHOOK: Output: default@t2_nocolstats +PREHOOK: query: explain +select t1.id, t1.val, t2.val +from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_nocolstats +PREHOOK: Input: default@t2_nocolstats +#### A masked pattern was here #### +POSTHOOK: query: explain +select t1.id, t1.val, t2.val +from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_nocolstats +POSTHOOK: Input: default@t2_nocolstats +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) + Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean) + Statistics: Num rows: 100000000 Data size: 17860000188 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean) + Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: int), val (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: t2 + filterExpr: id is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 178788 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: id is not null (type: boolean) + Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: int), val (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=950) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=950) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +