Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HIVE-28259: Common table expression detection and rewrites using CBO #5249

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
4 changes: 4 additions & 0 deletions common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.hadoop.hive.common.ZooKeeperHiveHelper;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.common.classification.InterfaceAudience.LimitedPrivate;
import org.apache.hadoop.hive.common.classification.InterfaceStability;
import org.apache.hadoop.hive.common.type.TimestampTZUtil;
import org.apache.hadoop.hive.conf.Validator.PatternSet;
import org.apache.hadoop.hive.conf.Validator.RangeValidator;
Expand Down Expand Up @@ -2762,6 +2763,9 @@ public static enum ConfVars {
+ " provides an optimization if it is accurate."),

// CTE
@InterfaceStability.Unstable
HIVE_CTE_SUGGESTER_CLASS("hive.optimize.cte.suggester.class", "",
"Class for finding and suggesting common table expressions (CTEs) based on a given query. The class must implement the CommonTableExpressionSuggester interface."),
HIVE_CTE_MATERIALIZE_THRESHOLD("hive.optimize.cte.materialize.threshold", 3,
"If the number of references to a CTE clause exceeds this threshold, Hive will materialize it\n" +
"before executing the main query block. -1 will disable this feature."),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.calcite;

import org.apache.calcite.plan.Contexts;
import org.apache.calcite.plan.hep.HepPlanner;
import org.apache.calcite.plan.hep.HepProgram;
import org.apache.calcite.plan.hep.HepProgramBuilder;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.metadata.RelMetadataQuery;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.common.classification.InterfaceStability;
import org.apache.hadoop.hive.ql.optimizer.calcite.rules.CommonRelSubExprRegisterRule;

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

/**
* Suggester for common table expressions that appear as is (identical trees) more than once in the query plan.
*/
@InterfaceAudience.Public
@InterfaceStability.Unstable
public class CommonTableExpressionIdentitySuggester implements CommonTableExpressionSuggester {

@Override
public List<RelNode> suggest(final RelNode input, final Configuration configuration) {
CommonTableExpressionRegistry localRegistry = new CommonTableExpressionRegistry();
HepProgram ruleProgram = new HepProgramBuilder()
.addRuleInstance(CommonRelSubExprRegisterRule.JOIN)
.addRuleInstance(CommonRelSubExprRegisterRule.AGGREGATE)
.addRuleInstance(CommonRelSubExprRegisterRule.FILTER)
.addRuleInstance(CommonRelSubExprRegisterRule.PROJECT)
.build();
HepPlanner planner = new HepPlanner(ruleProgram, Contexts.of(localRegistry));
planner.setRoot(input);
planner.findBestExp();
RelMetadataQuery mq = input.getCluster().getMetadataQuery();
Comparator<RelNode> rowCountCmp = Comparator.comparing(mq::getRowCount).reversed();
Comparator<RelNode> rowSizeCmp = Comparator.comparing(mq::getAverageRowSize).reversed();
// Criterion based sorting to ensure deterministic ordering of results.
return localRegistry.entries()
.sorted(Comparator.comparing(HiveCalciteUtil::countNodes).thenComparing(rowCountCmp).thenComparing(rowSizeCmp))
.collect(Collectors.toList());
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.calcite;

import org.apache.calcite.plan.RelOptUtil;
import org.apache.calcite.rel.RelNode;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.ql.session.SessionState;

import java.util.List;

/**
* Suggester for printing common table expressions to the Session's console.
*/
@InterfaceAudience.Private
public class CommonTableExpressionPrintSuggester implements CommonTableExpressionSuggester {

private final CommonTableExpressionSuggester internal = new CommonTableExpressionIdentitySuggester();

@Override
public List<RelNode> suggest(final RelNode input, final Configuration configuration) {
List<RelNode> result = internal.suggest(input, configuration);
for (int i = 0; i < result.size(); i++) {
SessionState.getConsole().printInfo("CTE Suggestion " + i + ":" + RelOptUtil.toString(result.get(i)), false);
}
return result;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.calcite;

import org.apache.calcite.rel.RelNode;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;

import java.util.HashMap;
import java.util.Map;
import java.util.stream.Stream;

/**
* A registry of common table expressions for a given query.
* <p>The registry is only meant to hold common expressions for a single query.</p>
* <p>The class is not thread-safe.</p>
*/
@InterfaceAudience.Private
public final class CommonTableExpressionRegistry {
/**
* A unique collection of common table expressions.
* <p>The expressions may contain internal planning concepts such as {@link org.apache.calcite.plan.hep.HepRelVertex}.
* </p>
*/
private final Map<String, RelNode> ctes = new HashMap<>();

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, all CTEs are weighted equally. A potentially future enhancement could be one where there is weight/rank associated with it which would feed into the optimizer decisions. This only matters if we want to restrict the number of CTEs to be considered (for shorter compilation time).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Weighting and ranking CTEs definitely makes sense and I see this as a responsibility of the CommonTableExpressionSuggester; concrete implementations of the interface could trade off speed for quality or the opposite.

Furthermore, the CommonTableExpressionSuggester interface currently accepts a Configuration object so it is technically possible to pass arbitrary user-specified properties all the way to the suggester (e.g., set mysupersuggester.cte.limit=5) and fine tune specific aspects of its behavior.


/**
* Adds the specified common table expression to this registry.
* @param cte common table expression to be added to the registry.
*/
public void add(RelNode cte) {
this.ctes.put(cte.getDigest(), cte);
}

/**
* @return a stream with all common table expression entries
*/
public Stream<RelNode> entries() {
return ctes.values().stream().map(HiveCalciteUtil::stripHepVertices);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.calcite;

import org.apache.calcite.rel.RelNode;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.common.classification.InterfaceStability;

import java.util.List;

/**
* Suggester for finding and returning interesting expressions that appear more than once in a query. The notion of
* "interesting" is specific to the actual implementation of this interface.
* <p>In some cases the interesting expressions may be readily available in the input query while in others they could be
* revealed after applying various transformations and algebraic equivalences.</p>
* <p>The final decision about using (or not) the suggestions provided by this class lies to the query optimizer. For
* various reasons (e.g, incorrect or expensive expressions) the optimizer may choose to reject/ignore the results
* provided by this class.</p>
* <p>Implementations of this interface must have a public no argument constructor since they are instantiated via reflection.
* The class is instantiated once for each query under compilation and then thrown away.</p>
* <p>The interface is experimental and subject to change without notice.</p>
*/
@InterfaceAudience.Public
@InterfaceStability.Unstable
public interface CommonTableExpressionSuggester {
/**
* Suggests interesting expressions for the specified query and configuration.
* @param input a relational expression representing the query under compilation.
* @param configuration the configuration to use for identifying and returning interesting expressions.
* @return a list with interesting expressions for the specified query
*/
List<RelNode> suggest(RelNode input, Configuration configuration);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.calcite;

import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.conf.HiveConf;

import java.util.Collections;

@InterfaceAudience.Private
public final class CommonTableExpressionSuggesterFactory {

public static CommonTableExpressionSuggester create(HiveConf configuration) {
String suggestName = configuration.getVar(HiveConf.ConfVars.HIVE_CTE_SUGGESTER_CLASS);
if (suggestName == null || suggestName.isEmpty()) {
return (query, conf) -> Collections.emptyList();
}
try {
Class<?> suggesterClass = Class.forName(suggestName);
if (CommonTableExpressionSuggester.class.isAssignableFrom(suggesterClass)) {
return (CommonTableExpressionSuggester) suggesterClass.newInstance();
}
throw new IllegalArgumentException(
suggesterClass.getSimpleName() + " must implement " + CommonTableExpressionSuggester.class.getSimpleName());
} catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
throw new RuntimeException(e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
import org.apache.calcite.plan.RelTraitSet;
import org.apache.calcite.plan.RelOptUtil.InputFinder;
import org.apache.calcite.plan.RelOptUtil.InputReferencedVisitor;
import org.apache.calcite.plan.hep.HepRelVertex;
import org.apache.calcite.rel.RelHomogeneousShuttle;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.Aggregate;
import org.apache.calcite.rel.core.AggregateCall;
Expand Down Expand Up @@ -1343,4 +1345,32 @@ public RexTableInputRef visitTableInputRef(RexTableInputRef ref) {
rexNode.accept(visitor);
return rexTableInputRefs;
}

public static long countNodes(RelNode root) {
long[] count = new long[] { 0 };
root.accept(new RelHomogeneousShuttle() {
@Override
public RelNode visit(final RelNode other) {
count[0]++;
return super.visit(other);
}
});
return count[0];
}

public static RelNode stripHepVertices(RelNode rel) {
if (rel instanceof HepRelVertex) {
rel = ((HepRelVertex) rel).getCurrentRel();
}
List<RelNode> oldInputs = rel.getInputs();
List<RelNode> newInputs = new ArrayList<>();
for (RelNode oldInput : oldInputs) {
newInputs.add(stripHepVertices(oldInput));
}
if (oldInputs.equals(newInputs)) {
return rel;
} else {
return rel.copy(rel.getTraitSet(), newInputs);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveDefaultCostModel;
import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveOnTezCostModel;
import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveRelMdCost;
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdAggregatedColumns;
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdColumnUniqueness;
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdCollation;
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdCumulativeCost;
Expand All @@ -40,6 +41,7 @@
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdRuntimeRowCount;
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdSelectivity;
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdSize;
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdTableReferences;
import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdUniqueKeys;

import com.google.common.collect.ImmutableList;
Expand Down Expand Up @@ -67,6 +69,8 @@ public class HiveDefaultRelMetadataProvider {
HiveRelMdDistribution.SOURCE,
HiveRelMdCollation.SOURCE,
HiveRelMdPredicates.SOURCE,
HiveRelMdTableReferences.SOURCE,
HiveRelMdAggregatedColumns.SOURCE,
JaninoRelMetadataProvider.DEFAULT)));

private final RelMetadataProvider metadataProvider;
Expand Down Expand Up @@ -102,6 +106,8 @@ private RelMetadataProvider init(HiveConf hiveConf, List<Class<? extends RelNode
HiveRelMdDistribution.SOURCE,
HiveRelMdCollation.SOURCE,
HiveRelMdPredicates.SOURCE,
HiveRelMdTableReferences.SOURCE,
HiveRelMdAggregatedColumns.SOURCE,
JaninoRelMetadataProvider.DEFAULT)));

if (nodeClasses != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAntiJoin;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableSpool;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveValues;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExcept;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter;
Expand Down Expand Up @@ -91,6 +92,9 @@ public class HiveRelFactories {
public static final SetOpFactory HIVE_SET_OP_FACTORY =
new HiveSetOpFactoryImpl();

public static final RelFactories.SpoolFactory HIVE_SPOOL_FACTORY =
(input, readType, writeType, table) -> new HiveTableSpool(input, readType, writeType, table);

public static final RelBuilderFactory HIVE_BUILDER =
HiveRelBuilder.proto(
Contexts.of(
Expand All @@ -102,7 +106,8 @@ public class HiveRelFactories {
HIVE_SORT_EXCHANGE_FACTORY,
HIVE_VALUES_FACTORY,
HIVE_AGGREGATE_FACTORY,
HIVE_SET_OP_FACTORY));
HIVE_SET_OP_FACTORY,
HIVE_SPOOL_FACTORY));

private HiveRelFactories() {
}
Expand Down
Loading
Loading