This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 4be56108d [BugFix] abort plan if order by column not in select list 
(#5132)
4be56108d is described below

commit 4be56108d55ba301d1677e1a5554308a536479dd
Author: xyz <[email protected]>
AuthorDate: Sat Feb 11 21:38:31 2023 +0800

    [BugFix] abort plan if order by column not in select list (#5132)
    
    Currently, if the order by column is not in select list, we will
    always add_missing_columns to the select list. This will lead to the bug in 
issue apache#5065.
    
    In this pr, we modify the behaviour, for select distinct, we will throw 
error msg
    if order by expressions are not int select list.
    
    Signed-off-by: xyz <[email protected]>
---
 datafusion/core/tests/dataframe.rs                 | 34 ++++++++++++++++++++++
 .../core/tests/sqllogictests/test_files/select.slt | 27 ++++++++++++++++-
 datafusion/expr/src/logical_plan/builder.rs        | 22 ++++++++++++++
 datafusion/sql/tests/integration_test.rs           | 22 ++++++++++++++
 4 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/tests/dataframe.rs 
b/datafusion/core/tests/dataframe.rs
index 644cc1a0b..f7274fdab 100644
--- a/datafusion/core/tests/dataframe.rs
+++ b/datafusion/core/tests/dataframe.rs
@@ -24,6 +24,7 @@ use arrow::{
     record_batch::RecordBatch,
 };
 use datafusion::from_slice::FromSlice;
+use datafusion_common::DataFusionError;
 use std::sync::Arc;
 
 use datafusion::dataframe::DataFrame;
@@ -127,6 +128,39 @@ async fn sort_on_unprojected_columns() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn sort_on_distinct_unprojected_columns() -> Result<()> {
+    let schema = Schema::new(vec![
+        Field::new("a", DataType::Int32, false),
+        Field::new("b", DataType::Int32, false),
+    ]);
+
+    let batch = RecordBatch::try_new(
+        Arc::new(schema.clone()),
+        vec![
+            Arc::new(Int32Array::from_slice([1, 10, 10, 100])),
+            Arc::new(Int32Array::from_slice([2, 12, 12, 120])),
+        ],
+    )
+    .unwrap();
+
+    let ctx = SessionContext::new();
+    ctx.register_batch("t", batch).unwrap();
+
+    assert!(matches!(
+        ctx.table("t")
+            .await
+            .unwrap()
+            .select(vec![col("a")])
+            .unwrap()
+            .distinct()
+            .unwrap()
+            .sort(vec![Expr::Sort(Sort::new(Box::new(col("b")), false, 
true))]),
+        Err(DataFusionError::Plan(_))
+    ));
+    Ok(())
+}
+
 #[tokio::test]
 async fn filter_with_alias_overwrite() -> Result<()> {
     let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
diff --git a/datafusion/core/tests/sqllogictests/test_files/select.slt 
b/datafusion/core/tests/sqllogictests/test_files/select.slt
index 1b5ede066..65353b0c1 100644
--- a/datafusion/core/tests/sqllogictests/test_files/select.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/select.slt
@@ -55,4 +55,29 @@ query IC rowsort
 VALUES (1,'a'),(2,'b')
 ----
 1   a
-2   b
\ No newline at end of file
+2   b
+
+# table foo for distinct order by
+statement ok
+CREATE TABLE foo AS VALUES
+(1, 2),
+(3, 4),
+(5, 6);
+
+# foo distinct
+query I
+select distinct '1' from foo;
+----
+1
+
+# foo order by
+query I
+select '1' from foo order by column1;
+----
+1
+1
+1
+
+# foo distinct order by 
+statement error DataFusion error: Error during planning: For SELECT DISTINCT, 
ORDER BY expressions column1 must appear in select list
+select distinct '1' from foo order by column1;
\ No newline at end of file
diff --git a/datafusion/expr/src/logical_plan/builder.rs 
b/datafusion/expr/src/logical_plan/builder.rs
index 4bbb83bb7..e1092b96b 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -406,6 +406,28 @@ impl LogicalPlanBuilder {
                 Ok(())
             })?;
 
+        // if current plan is distinct or current plan is repartition and its 
child plan is distinct,
+        // then this plan is a select distinct plan
+        let is_select_distinct = match self.plan {
+            LogicalPlan::Distinct(_) => true,
+            LogicalPlan::Repartition(Repartition { ref input, .. }) => {
+                matches!(input.as_ref(), &LogicalPlan::Distinct(_))
+            }
+            _ => false,
+        };
+
+        // for select distinct, order by expressions must exist in select list
+        if is_select_distinct && !missing_cols.is_empty() {
+            let missing_col_names = missing_cols
+                .iter()
+                .map(|col| col.flat_name())
+                .collect::<String>();
+            let error_msg = format!(
+                "For SELECT DISTINCT, ORDER BY expressions {missing_col_names} 
must appear in select list",
+            );
+            return Err(DataFusionError::Plan(error_msg));
+        }
+
         if missing_cols.is_empty() {
             return Ok(Self::from(LogicalPlan::Sort(Sort {
                 expr: normalize_cols(exprs, &self.plan)?,
diff --git a/datafusion/sql/tests/integration_test.rs 
b/datafusion/sql/tests/integration_test.rs
index 38ede4e6b..c75f93d36 100644
--- a/datafusion/sql/tests/integration_test.rs
+++ b/datafusion/sql/tests/integration_test.rs
@@ -3213,6 +3213,28 @@ fn test_select_join_key_inner_join() {
     quick_test(sql, expected);
 }
 
+#[test]
+fn test_select_order_by() {
+    let sql = "SELECT '1' from person order by id";
+
+    let expected = "Projection: Utf8(\"1\")\n  Sort: person.id ASC NULLS 
LAST\n    Projection: Utf8(\"1\"), person.id\n      TableScan: person";
+    quick_test(sql, expected);
+}
+
+#[test]
+fn test_select_distinct_order_by() {
+    let sql = "SELECT distinct '1' from person order by id";
+
+    let expected =
+        "Error during planning: For SELECT DISTINCT, ORDER BY expressions id 
must appear in select list";
+
+    // It should return error.
+    let result = logical_plan(sql);
+    assert!(result.is_err());
+    let err = result.err().unwrap();
+    assert_eq!(err.to_string(), expected);
+}
+
 #[test]
 fn test_duplicated_left_join_key_inner_join() {
     //  person.id * 2 happen twice in left side.

Reply via email to