Skip to content

feat: change Expr OuterReferenceColumn to Box type for reducing expr struct size #16771

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/catalog-listing/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool {
}
Expr::Literal(_, _)
| Expr::Alias(_)
| Expr::OuterReferenceColumn(_, _)
| Expr::OuterReferenceColumn(_)
| Expr::ScalarVariable(_, _)
| Expr::Not(_)
| Expr::IsNotNull(_)
Expand Down
33 changes: 23 additions & 10 deletions datafusion/expr/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

use std::cmp::Ordering;
use std::collections::{BTreeMap, HashSet};
use std::fmt::{self, Display, Formatter, Write};
use std::fmt::{self, Debug, Display, Formatter, Write};
use std::hash::{Hash, Hasher};
use std::mem;
use std::sync::Arc;
Expand Down Expand Up @@ -362,7 +362,7 @@ pub enum Expr {
Placeholder(Placeholder),
/// A placeholder which holds a reference to a qualified field
/// in the outer query, used for correlated sub queries.
OuterReferenceColumn(DataType, Column),
OuterReferenceColumn(Box<OuterReference>),
/// Unnest expression
Unnest(Unnest),
}
Expand Down Expand Up @@ -413,6 +413,18 @@ impl<'a> TreeNodeContainer<'a, Self> for Expr {
}
}

#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct OuterReference {
pub data_type: DataType,
pub column: Column,
}

impl Debug for OuterReference {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
write!(f, "{:?}, {:?}", self.data_type, self.column)
}
}

/// Literal metadata
///
/// Stores metadata associated with a literal expressions
Expand Down Expand Up @@ -1443,7 +1455,7 @@ impl Expr {
Expr::Case { .. } => "Case",
Expr::Cast { .. } => "Cast",
Expr::Column(..) => "Column",
Expr::OuterReferenceColumn(_, _) => "Outer",
Expr::OuterReferenceColumn(_) => "Outer",
Expr::Exists { .. } => "Exists",
Expr::GroupingSet(..) => "GroupingSet",
Expr::InList { .. } => "InList",
Expand Down Expand Up @@ -2018,7 +2030,7 @@ impl Expr {
| Expr::SimilarTo(..)
| Expr::Not(..)
| Expr::Negative(..)
| Expr::OuterReferenceColumn(_, _)
| Expr::OuterReferenceColumn(_)
| Expr::TryCast(..)
| Expr::Unnest(..)
| Expr::Wildcard { .. }
Expand Down Expand Up @@ -2601,9 +2613,9 @@ impl HashNode for Expr {
Expr::Placeholder(place_holder) => {
place_holder.hash(state);
}
Expr::OuterReferenceColumn(data_type, column) => {
data_type.hash(state);
column.hash(state);
Expr::OuterReferenceColumn(boxed_orc) => {
boxed_orc.data_type.hash(state);
boxed_orc.column.hash(state);
}
Expr::Unnest(Unnest { expr: _expr }) => {}
};
Expand Down Expand Up @@ -2908,7 +2920,7 @@ struct SqlDisplay<'a>(&'a Expr);
impl Display for SqlDisplay<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self.0 {
Expr::Literal(scalar, _) => scalar.fmt(f),
Expr::Literal(scalar, _) => Display::fmt(&scalar, f),
Expr::Alias(Alias { name, .. }) => write!(f, "{name}"),
Expr::Between(Between {
expr,
Expand Down Expand Up @@ -3171,7 +3183,8 @@ impl Display for Expr {
match self {
Expr::Alias(Alias { expr, name, .. }) => write!(f, "{expr} AS {name}"),
Expr::Column(c) => write!(f, "{c}"),
Expr::OuterReferenceColumn(_, c) => {
Expr::OuterReferenceColumn(boxed_orc) => {
let c = &boxed_orc.column;
write!(f, "{OUTER_REFERENCE_COLUMN_PREFIX}({c})")
}
Expr::ScalarVariable(_, var_names) => write!(f, "{}", var_names.join(".")),
Expand Down Expand Up @@ -3818,7 +3831,7 @@ mod test {
// If this test fails when you change `Expr`, please try
// `Box`ing the fields to make `Expr` smaller
// See https://github.com/apache/datafusion/issues/16199 for details
assert_eq!(size_of::<Expr>(), 128);
assert_eq!(size_of::<Expr>(), 112);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change saves 16 bytes per Expr. Nice.

I will run some planning benchmarks and see if we can see any difference

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @alamb , i am doing more amazing experiment, try to reduce from 128 to 80, so we can save 48 bytes per Expr!

assert_eq!(size_of::<ScalarValue>(), 64);
assert_eq!(size_of::<DataType>(), 24); // 3 ptrs
assert_eq!(size_of::<Vec<Expr>>(), 24);
Expand Down
11 changes: 9 additions & 2 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@

use crate::expr::{
AggregateFunction, BinaryExpr, Cast, Exists, GroupingSet, InList, InSubquery,
Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction, WindowFunctionParams,
OuterReference, Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction,
WindowFunctionParams,
};
use crate::function::{
AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory,
Expand Down Expand Up @@ -69,7 +70,13 @@ pub fn col(ident: impl Into<Column>) -> Expr {
/// Create an out reference column which hold a reference that has been resolved to a field
/// outside of the current plan.
pub fn out_ref_col(dt: DataType, ident: impl Into<Column>) -> Expr {
Expr::OuterReferenceColumn(dt, ident.into())
Expr::OuterReferenceColumn(
OuterReference {
data_type: dt,
column: ident.into(),
}
.into(),
)
}

/// Create an unqualified column expression from the provided name, without normalizing
Expand Down
5 changes: 3 additions & 2 deletions datafusion/expr/src/expr_rewriter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,9 @@ pub fn unnormalize_cols(exprs: impl IntoIterator<Item = Expr>) -> Vec<Expr> {
pub fn strip_outer_reference(expr: Expr) -> Expr {
expr.transform(|expr| {
Ok({
if let Expr::OuterReferenceColumn(_, col) = expr {
Transformed::yes(Expr::Column(col))
// Match the boxed (DataType, Column) tuple and extract the Column
if let Expr::OuterReferenceColumn(boxed_pair) = expr {
Transformed::yes(Expr::Column(boxed_pair.column))
} else {
Transformed::no(expr)
}
Expand Down
14 changes: 9 additions & 5 deletions datafusion/expr/src/expr_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ impl ExprSchemable for Expr {
},
Expr::Negative(expr) => expr.get_type(schema),
Expr::Column(c) => Ok(schema.data_type(c)?.clone()),
Expr::OuterReferenceColumn(ty, _) => Ok(ty.clone()),
Expr::OuterReferenceColumn(boxed_orc) => {
Ok(boxed_orc.as_ref().data_type.clone())
}
Expr::ScalarVariable(ty, _) => Ok(ty.clone()),
Expr::Literal(l, _) => Ok(l.data_type()),
Expr::Case(case) => {
Expand Down Expand Up @@ -276,7 +278,7 @@ impl ExprSchemable for Expr {
|| high.nullable(input_schema)?),

Expr::Column(c) => input_schema.nullable(c),
Expr::OuterReferenceColumn(_, _) => Ok(true),
Expr::OuterReferenceColumn(_) => Ok(true),
Expr::Literal(value, _) => Ok(value.is_null()),
Expr::Case(case) => {
// This expression is nullable if any of the input expressions are nullable
Expand Down Expand Up @@ -411,9 +413,11 @@ impl ExprSchemable for Expr {
}
Expr::Negative(expr) => expr.to_field(schema).map(|(_, f)| f),
Expr::Column(c) => schema.field_from_column(c).map(|f| Arc::new(f.clone())),
Expr::OuterReferenceColumn(ty, _) => {
Ok(Arc::new(Field::new(&schema_name, ty.clone(), true)))
}
Expr::OuterReferenceColumn(boxed_alias) => Ok(Arc::new(Field::new(
&schema_name,
boxed_alias.as_ref().data_type.clone(),
true,
))),
Expr::ScalarVariable(ty, _) => {
Ok(Arc::new(Field::new(&schema_name, ty.clone(), true)))
}
Expand Down
4 changes: 2 additions & 2 deletions datafusion/expr/src/tree_node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ impl TreeNode for Expr {
#[expect(deprecated)]
Expr::Column(_)
// Treat OuterReferenceColumn as a leaf expression
| Expr::OuterReferenceColumn(_, _)
| Expr::OuterReferenceColumn(_)
| Expr::ScalarVariable(_, _)
| Expr::Literal(_, _)
| Expr::Exists { .. }
Expand Down Expand Up @@ -122,7 +122,7 @@ impl TreeNode for Expr {
Expr::Column(_)
| Expr::Wildcard { .. }
| Expr::Placeholder(Placeholder { .. })
| Expr::OuterReferenceColumn(_, _)
| Expr::OuterReferenceColumn(_)
| Expr::Exists { .. }
| Expr::ScalarSubquery(_)
| Expr::ScalarVariable(_, _)
Expand Down
2 changes: 1 addition & 1 deletion datafusion/optimizer/src/analyzer/type_coercion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
| Expr::Wildcard { .. }
| Expr::GroupingSet(_)
| Expr::Placeholder(_)
| Expr::OuterReferenceColumn(_, _) => Ok(Transformed::no(expr)),
| Expr::OuterReferenceColumn(_) => Ok(Transformed::no(expr)),
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions datafusion/optimizer/src/optimize_projections/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -637,8 +637,8 @@ fn outer_columns<'a>(expr: &'a Expr, columns: &mut HashSet<&'a Column>) {
// inspect_expr_pre doesn't handle subquery references, so find them explicitly
expr.apply(|expr| {
match expr {
Expr::OuterReferenceColumn(_, col) => {
columns.insert(col);
Expr::OuterReferenceColumn(boxed_orc) => {
columns.insert(&boxed_orc.as_ref().column);
}
Expr::ScalarSubquery(subquery) => {
outer_columns_helper_multi(&subquery.outer_ref_columns, columns);
Expand Down
2 changes: 1 addition & 1 deletion datafusion/optimizer/src/push_down_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ fn can_evaluate_as_join_condition(predicate: &Expr) -> Result<bool> {
Expr::Exists { .. }
| Expr::InSubquery(_)
| Expr::ScalarSubquery(_)
| Expr::OuterReferenceColumn(_, _)
| Expr::OuterReferenceColumn(_)
| Expr::Unnest(_) => {
is_evaluate = false;
Ok(TreeNodeRecursion::Stop)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ impl<'a> ConstEvaluator<'a> {
Expr::AggregateFunction { .. }
| Expr::ScalarVariable(_, _)
| Expr::Column(_)
| Expr::OuterReferenceColumn(_, _)
| Expr::OuterReferenceColumn(_)
| Expr::Exists { .. }
| Expr::InSubquery(_)
| Expr::ScalarSubquery(_)
Expand Down
20 changes: 13 additions & 7 deletions datafusion/sql/src/expr/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,18 @@
// specific language governing permissions and limitations
// under the License.

use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
use arrow::datatypes::Field;
use datafusion_common::{
internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema,
DataFusionError, Result, Span, TableReference,
};
use datafusion_expr::expr::OuterReference;
use datafusion_expr::planner::PlannerResult;
use datafusion_expr::UNNAMED_TABLE;
use datafusion_expr::{Case, Expr};
use sqlparser::ast::{CaseWhen, Expr as SQLExpr, Ident};

use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
use datafusion_expr::UNNAMED_TABLE;

impl<S: ContextProvider> SqlToRel<'_, S> {
pub(super) fn sql_identifier_to_expr(
&self,
Expand Down Expand Up @@ -75,8 +75,11 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
{
// Found an exact match on a qualified name in the outer plan schema, so this is an outer reference column
return Ok(Expr::OuterReferenceColumn(
field.data_type().clone(),
Column::from((qualifier, field)),
OuterReference {
data_type: field.data_type().clone(),
column: Column::from((qualifier, field)),
}
.into(),
));
}
}
Expand Down Expand Up @@ -182,8 +185,11 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
Some((field, qualifier, _nested_names)) => {
// Found an exact match on a qualified name in the outer plan schema, so this is an outer reference column
Ok(Expr::OuterReferenceColumn(
field.data_type().clone(),
Column::from((qualifier, field)),
OuterReference {
data_type: field.data_type().clone(),
column: Column::from((qualifier, field)),
}
.into(),
))
}
// Found no matching field, will return a default
Expand Down
4 changes: 3 additions & 1 deletion datafusion/sql/src/unparser/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,9 @@ impl Unparser<'_> {
Expr::Placeholder(p) => {
Ok(ast::Expr::value(ast::Value::Placeholder(p.id.to_string())))
}
Expr::OuterReferenceColumn(_, col) => self.col_to_sql(col),
Expr::OuterReferenceColumn(boxed_orc) => {
self.col_to_sql(&boxed_orc.as_ref().column)
}
Expr::Unnest(unnest) => self.unnest_to_sql(unnest),
}
}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/substrait/src/logical_plan/producer/expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ pub fn to_substrait_rex(
Expr::Wildcard { .. } => not_impl_err!("Cannot convert {expr:?} to Substrait"),
Expr::GroupingSet(expr) => not_impl_err!("Cannot convert {expr:?} to Substrait"),
Expr::Placeholder(expr) => not_impl_err!("Cannot convert {expr:?} to Substrait"),
Expr::OuterReferenceColumn(_, _) => {
Expr::OuterReferenceColumn(_) => {
not_impl_err!("Cannot convert {expr:?} to Substrait")
}
Expr::Unnest(expr) => not_impl_err!("Cannot convert {expr:?} to Substrait"),
Expand Down