From 36d1111594d07d5aed9cb8c44212218e8b160ce7 Mon Sep 17 00:00:00 2001 From: Eduard Karacharov Date: Tue, 28 May 2024 03:20:31 +0300 Subject: [PATCH] minor: inconsistent group by position planning (#10679) * fix: inconsistent group by position planning * Update datafusion/sql/src/utils.rs Co-authored-by: Andrew Lamb * improved error message --------- Co-authored-by: Andrew Lamb --- datafusion/sql/src/select.rs | 7 +- datafusion/sql/src/utils.rs | 75 +++++++++++++------ datafusion/sql/tests/sql_integration.rs | 12 +-- .../sqllogictest/test_files/group_by.slt | 5 ++ 4 files changed, 68 insertions(+), 31 deletions(-) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index d2cd1bcf3a064..e8e016bf09812 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -131,7 +131,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // // SELECT c1, MAX(c2) AS m FROM t GROUP BY c1 HAVING MAX(c2) > 10; // - let having_expr = resolve_aliases_to_exprs(&having_expr, &alias_map)?; + let having_expr = resolve_aliases_to_exprs(having_expr, &alias_map)?; normalize_col(having_expr, &projected_plan) }) .transpose()?; @@ -163,10 +163,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { alias_map.remove(f.name()); } let group_by_expr = - resolve_aliases_to_exprs(&group_by_expr, &alias_map)?; + resolve_aliases_to_exprs(group_by_expr, &alias_map)?; let group_by_expr = - resolve_positions_to_exprs(&group_by_expr, &select_exprs) - .unwrap_or(group_by_expr); + resolve_positions_to_exprs(group_by_expr, &select_exprs)?; let group_by_expr = normalize_col(group_by_expr, &projected_plan)?; self.validate_schema_satisfies_exprs( base_plan.schema(), diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 35fd3b48f02c9..51bacb5f702b0 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -149,47 +149,51 @@ pub(crate) fn extract_aliases(exprs: &[Expr]) -> HashMap { } /// Given an expression that's literal int encoding position, lookup the corresponding expression -/// in the select_exprs list, if the index is within the bounds and it is indeed a position literal; -/// Otherwise, return None +/// in the select_exprs list, if the index is within the bounds and it is indeed a position literal, +/// otherwise, returns planning error. +/// If input expression is not an int literal, returns expression as-is. pub(crate) fn resolve_positions_to_exprs( - expr: &Expr, + expr: Expr, select_exprs: &[Expr], -) -> Option { +) -> Result { match expr { // sql_expr_to_logical_expr maps number to i64 // https://github.com/apache/datafusion/blob/8d175c759e17190980f270b5894348dc4cff9bbf/datafusion/src/sql/planner.rs#L882-L887 Expr::Literal(ScalarValue::Int64(Some(position))) - if position > &0_i64 && position <= &(select_exprs.len() as i64) => + if position > 0_i64 && position <= select_exprs.len() as i64 => { let index = (position - 1) as usize; let select_expr = &select_exprs[index]; - Some(match select_expr { + Ok(match select_expr { Expr::Alias(Alias { expr, .. }) => *expr.clone(), _ => select_expr.clone(), }) } - _ => None, + Expr::Literal(ScalarValue::Int64(Some(position))) => plan_err!( + "Cannot find column with position {} in SELECT clause. Valid columns: 1 to {}", + position, select_exprs.len() + ), + _ => Ok(expr), } } /// Rebuilds an `Expr` with columns that refer to aliases replaced by the /// alias' underlying `Expr`. pub(crate) fn resolve_aliases_to_exprs( - expr: &Expr, + expr: Expr, aliases: &HashMap, ) -> Result { - expr.clone() - .transform_up(|nested_expr| match nested_expr { - Expr::Column(c) if c.relation.is_none() => { - if let Some(aliased_expr) = aliases.get(&c.name) { - Ok(Transformed::yes(aliased_expr.clone())) - } else { - Ok(Transformed::no(Expr::Column(c))) - } + expr.transform_up(|nested_expr| match nested_expr { + Expr::Column(c) if c.relation.is_none() => { + if let Some(aliased_expr) = aliases.get(&c.name) { + Ok(Transformed::yes(aliased_expr.clone())) + } else { + Ok(Transformed::no(Expr::Column(c))) } - _ => Ok(Transformed::no(nested_expr)), - }) - .data() + } + _ => Ok(Transformed::no(nested_expr)), + }) + .data() } /// given a slice of window expressions sharing the same sort key, find their common partition @@ -346,9 +350,9 @@ mod tests { use arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; use arrow_schema::Fields; use datafusion_common::{DFSchema, Result}; - use datafusion_expr::{col, lit, unnest, EmptyRelation, LogicalPlan}; + use datafusion_expr::{col, count, lit, unnest, EmptyRelation, LogicalPlan}; - use crate::utils::recursive_transform_unnest; + use crate::utils::{recursive_transform_unnest, resolve_positions_to_exprs}; #[test] fn test_recursive_transform_unnest() -> Result<()> { @@ -437,4 +441,33 @@ mod tests { Ok(()) } + + #[test] + fn test_resolve_positions_to_exprs() -> Result<()> { + let select_exprs = vec![col("c1"), col("c2"), count(lit(1))]; + + // Assert 1 resolved as first column in select list + let resolved = resolve_positions_to_exprs(lit(1i64), &select_exprs)?; + assert_eq!(resolved, col("c1")); + + // Assert error if index out of select clause bounds + let resolved = resolve_positions_to_exprs(lit(-1i64), &select_exprs); + assert!(resolved.is_err_and(|e| e.message().contains( + "Cannot find column with position -1 in SELECT clause. Valid columns: 1 to 3" + ))); + + let resolved = resolve_positions_to_exprs(lit(5i64), &select_exprs); + assert!(resolved.is_err_and(|e| e.message().contains( + "Cannot find column with position 5 in SELECT clause. Valid columns: 1 to 3" + ))); + + // Assert expression returned as-is + let resolved = resolve_positions_to_exprs(lit("text"), &select_exprs)?; + assert_eq!(resolved, lit("text")); + + let resolved = resolve_positions_to_exprs(col("fake"), &select_exprs)?; + assert_eq!(resolved, col("fake")); + + Ok(()) + } } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 0d84d9b2ab724..e32a01dbc38be 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -1484,16 +1484,16 @@ fn select_simple_aggregate_with_groupby_position_out_of_range() { let sql = "SELECT state, MIN(age) FROM person GROUP BY 0"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.state could not be resolved from available columns: Int64(0), MIN(person.age)", - err.strip_backtrace() - ); + "Error during planning: Cannot find column with position 0 in SELECT clause. Valid columns: 1 to 2", + err.strip_backtrace() + ); let sql2 = "SELECT state, MIN(age) FROM person GROUP BY 5"; let err2 = logical_plan(sql2).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.state could not be resolved from available columns: Int64(5), MIN(person.age)", - err2.strip_backtrace() - ); + "Error during planning: Cannot find column with position 5 in SELECT clause. Valid columns: 1 to 2", + err2.strip_backtrace() + ); } #[test] diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 43bbf6bed643c..7f4106a27caba 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -5059,3 +5059,8 @@ query II SELECT a + 1 AS d, a + 1 + b AS c FROM (SELECT 1 AS a, 2 AS b) GROUP BY a + 1, a + 1 + b; ---- 2 4 + +statement error DataFusion error: Error during planning: Cannot find column with position 4 in SELECT clause. Valid columns: 1 to 3 +SELECT a, b, COUNT(1) +FROM multiple_ordered_table +GROUP BY 1, 2, 4, 5, 6;