diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 1c1228949171..30140101df7b 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -56,6 +56,9 @@ pub struct ExprSimplifier { /// Guarantees about the values of columns. This is provided by the user /// in [ExprSimplifier::with_guarantees()]. guarantees: Vec<(Expr, NullableInterval)>, + /// Should expressions be canonicalized before simplification? Defaults to + /// true + canonicalize: bool, } pub const THRESHOLD_INLINE_INLIST: usize = 3; @@ -70,6 +73,7 @@ impl ExprSimplifier { Self { info, guarantees: vec![], + canonicalize: true, } } @@ -137,6 +141,12 @@ impl ExprSimplifier { let mut inlist_simplifier = InListSimplifier::new(); let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees); + let expr = if self.canonicalize { + expr.rewrite(&mut Canonicalizer::new())? + } else { + expr + }; + // TODO iterate until no changes are made during rewrite // (evaluating constants can enable new simplifications and // simplifications can enable new constant evaluation) @@ -151,10 +161,6 @@ impl ExprSimplifier { .rewrite(&mut simplifier) } - pub fn canonicalize(&self, expr: Expr) -> Result { - let mut canonicalizer = Canonicalizer::new(); - expr.rewrite(&mut canonicalizer) - } /// Apply type coercion to an [`Expr`] so that it can be /// evaluated as a [`PhysicalExpr`](datafusion_physical_expr::PhysicalExpr). /// @@ -229,6 +235,60 @@ impl ExprSimplifier { self.guarantees = guarantees; self } + + /// Should [`Canonicalizer`] be applied before simplification? + /// + /// If true (the default), the expression will be rewritten to canonical + /// form before simplification. This is useful to ensure that the simplifier + /// can apply all possible simplifications. + /// + /// Some expressions, such as those in some Joins, can not be canonicalized + /// without changing their meaning. In these cases, canonicalization should + /// be disabled. + /// + /// ```rust + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, Expr}; + /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; + /// use datafusion_common::{Result, ScalarValue, ToDFSchema}; + /// use datafusion_physical_expr::execution_props::ExecutionProps; + /// use datafusion_optimizer::simplify_expressions::{ + /// ExprSimplifier, SimplifyContext}; + /// + /// let schema = Schema::new(vec![ + /// Field::new("a", DataType::Int64, false), + /// Field::new("b", DataType::Int64, false), + /// Field::new("c", DataType::Int64, false), + /// ]) + /// .to_dfschema_ref().unwrap(); + /// + /// // Create the simplifier + /// let props = ExecutionProps::new(); + /// let context = SimplifyContext::new(&props) + /// .with_schema(schema); + /// let simplifier = ExprSimplifier::new(context); + /// + /// // Expression: a = c AND 1 = b + /// let expr = col("a").eq(col("c")).and(lit(1).eq(col("b"))); + /// + /// // With canonicalization, the expression is rewritten to canonical form + /// // (though it is no simpler in this case): + /// let canonical = simplifier.simplify(expr.clone()).unwrap(); + /// // Expression has been rewritten to: (c = a AND b = 1) + /// assert_eq!(canonical, col("c").eq(col("a")).and(col("b").eq(lit(1)))); + /// + /// // If canonicalization is disabled, the expression is not changed + /// let non_canonicalized = simplifier + /// .with_canonicalize(false) + /// .simplify(expr.clone()) + /// .unwrap(); + /// + /// assert_eq!(non_canonicalized, expr); + /// ``` + pub fn with_canonicalize(mut self, canonicalize: bool) -> Self { + self.canonicalize = canonicalize; + self + } } /// Canonicalize any BinaryExprs that are not in canonical form @@ -236,7 +296,7 @@ impl ExprSimplifier { /// ` ` is rewritten to ` ` /// /// ` ` is rewritten so that the name of `col1` sorts higher -/// than `col2` (`b > a` would be canonicalized to `a < b`) +/// than `col2` (`a > b` would be canonicalized to `b < a`) struct Canonicalizer {} impl Canonicalizer { @@ -2889,8 +2949,7 @@ mod tests { let simplifier = ExprSimplifier::new( SimplifyContext::new(&execution_props).with_schema(schema), ); - let cano = simplifier.canonicalize(expr)?; - simplifier.simplify(cano) + simplifier.simplify(expr) } fn simplify(expr: Expr) -> Expr { diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index d68474dcde0b..f36cd8f838fb 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -85,44 +85,39 @@ impl SimplifyExpressions { }; let info = SimplifyContext::new(execution_props).with_schema(schema); - let simplifier = ExprSimplifier::new(info); - let new_inputs = plan .inputs() .iter() .map(|input| Self::optimize_internal(input, execution_props)) .collect::>>()?; - let expr = match plan { - // Canonicalize step won't reorder expressions in a Join on clause. - // The left and right expressions in a Join on clause are not commutative, - // since the order of the columns must match the order of the children. - LogicalPlan::Join(_) => { - plan.expressions() - .into_iter() - .map(|e| { - // TODO: unify with `rewrite_preserving_name` - let original_name = e.name_for_alias()?; - let new_e = simplifier.simplify(e)?; - new_e.alias_if_changed(original_name) - }) - .collect::>>()? - } - _ => { - plan.expressions() - .into_iter() - .map(|e| { - // TODO: unify with `rewrite_preserving_name` - let original_name = e.name_for_alias()?; - let cano_e = simplifier.canonicalize(e)?; - let new_e = simplifier.simplify(cano_e)?; - new_e.alias_if_changed(original_name) - }) - .collect::>>()? - } + let simplifier = ExprSimplifier::new(info); + + // The left and right expressions in a Join on clause are not + // commutative, for reasons that are not entirely clear. Thus, do not + // reorder expressions in Join while simplifying. + // + // This is likely related to the fact that order of the columns must + // match the order of the children. see + // https://github.com/apache/arrow-datafusion/pull/8780 for more details + let simplifier = if let LogicalPlan::Join(_) = plan { + simplifier.with_canonicalize(false) + } else { + simplifier }; - plan.with_new_exprs(expr, new_inputs) + let exprs = plan + .expressions() + .into_iter() + .map(|e| { + // TODO: unify with `rewrite_preserving_name` + let original_name = e.name_for_alias()?; + let new_e = simplifier.simplify(e)?; + new_e.alias_if_changed(original_name) + }) + .collect::>>()?; + + plan.with_new_exprs(exprs, new_inputs) } }