This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 4be56108d [BugFix] abort plan if order by column not in select list
(#5132)
4be56108d is described below
commit 4be56108d55ba301d1677e1a5554308a536479dd
Author: xyz <[email protected]>
AuthorDate: Sat Feb 11 21:38:31 2023 +0800
[BugFix] abort plan if order by column not in select list (#5132)
Currently, if the order by column is not in select list, we will
always add_missing_columns to the select list. This will lead to the bug in
issue apache#5065.
In this pr, we modify the behaviour, for select distinct, we will throw
error msg
if order by expressions are not int select list.
Signed-off-by: xyz <[email protected]>
---
datafusion/core/tests/dataframe.rs | 34 ++++++++++++++++++++++
.../core/tests/sqllogictests/test_files/select.slt | 27 ++++++++++++++++-
datafusion/expr/src/logical_plan/builder.rs | 22 ++++++++++++++
datafusion/sql/tests/integration_test.rs | 22 ++++++++++++++
4 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/datafusion/core/tests/dataframe.rs
b/datafusion/core/tests/dataframe.rs
index 644cc1a0b..f7274fdab 100644
--- a/datafusion/core/tests/dataframe.rs
+++ b/datafusion/core/tests/dataframe.rs
@@ -24,6 +24,7 @@ use arrow::{
record_batch::RecordBatch,
};
use datafusion::from_slice::FromSlice;
+use datafusion_common::DataFusionError;
use std::sync::Arc;
use datafusion::dataframe::DataFrame;
@@ -127,6 +128,39 @@ async fn sort_on_unprojected_columns() -> Result<()> {
Ok(())
}
+#[tokio::test]
+async fn sort_on_distinct_unprojected_columns() -> Result<()> {
+ let schema = Schema::new(vec![
+ Field::new("a", DataType::Int32, false),
+ Field::new("b", DataType::Int32, false),
+ ]);
+
+ let batch = RecordBatch::try_new(
+ Arc::new(schema.clone()),
+ vec![
+ Arc::new(Int32Array::from_slice([1, 10, 10, 100])),
+ Arc::new(Int32Array::from_slice([2, 12, 12, 120])),
+ ],
+ )
+ .unwrap();
+
+ let ctx = SessionContext::new();
+ ctx.register_batch("t", batch).unwrap();
+
+ assert!(matches!(
+ ctx.table("t")
+ .await
+ .unwrap()
+ .select(vec![col("a")])
+ .unwrap()
+ .distinct()
+ .unwrap()
+ .sort(vec![Expr::Sort(Sort::new(Box::new(col("b")), false,
true))]),
+ Err(DataFusionError::Plan(_))
+ ));
+ Ok(())
+}
+
#[tokio::test]
async fn filter_with_alias_overwrite() -> Result<()> {
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
diff --git a/datafusion/core/tests/sqllogictests/test_files/select.slt
b/datafusion/core/tests/sqllogictests/test_files/select.slt
index 1b5ede066..65353b0c1 100644
--- a/datafusion/core/tests/sqllogictests/test_files/select.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/select.slt
@@ -55,4 +55,29 @@ query IC rowsort
VALUES (1,'a'),(2,'b')
----
1 a
-2 b
\ No newline at end of file
+2 b
+
+# table foo for distinct order by
+statement ok
+CREATE TABLE foo AS VALUES
+(1, 2),
+(3, 4),
+(5, 6);
+
+# foo distinct
+query I
+select distinct '1' from foo;
+----
+1
+
+# foo order by
+query I
+select '1' from foo order by column1;
+----
+1
+1
+1
+
+# foo distinct order by
+statement error DataFusion error: Error during planning: For SELECT DISTINCT,
ORDER BY expressions column1 must appear in select list
+select distinct '1' from foo order by column1;
\ No newline at end of file
diff --git a/datafusion/expr/src/logical_plan/builder.rs
b/datafusion/expr/src/logical_plan/builder.rs
index 4bbb83bb7..e1092b96b 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -406,6 +406,28 @@ impl LogicalPlanBuilder {
Ok(())
})?;
+ // if current plan is distinct or current plan is repartition and its
child plan is distinct,
+ // then this plan is a select distinct plan
+ let is_select_distinct = match self.plan {
+ LogicalPlan::Distinct(_) => true,
+ LogicalPlan::Repartition(Repartition { ref input, .. }) => {
+ matches!(input.as_ref(), &LogicalPlan::Distinct(_))
+ }
+ _ => false,
+ };
+
+ // for select distinct, order by expressions must exist in select list
+ if is_select_distinct && !missing_cols.is_empty() {
+ let missing_col_names = missing_cols
+ .iter()
+ .map(|col| col.flat_name())
+ .collect::<String>();
+ let error_msg = format!(
+ "For SELECT DISTINCT, ORDER BY expressions {missing_col_names}
must appear in select list",
+ );
+ return Err(DataFusionError::Plan(error_msg));
+ }
+
if missing_cols.is_empty() {
return Ok(Self::from(LogicalPlan::Sort(Sort {
expr: normalize_cols(exprs, &self.plan)?,
diff --git a/datafusion/sql/tests/integration_test.rs
b/datafusion/sql/tests/integration_test.rs
index 38ede4e6b..c75f93d36 100644
--- a/datafusion/sql/tests/integration_test.rs
+++ b/datafusion/sql/tests/integration_test.rs
@@ -3213,6 +3213,28 @@ fn test_select_join_key_inner_join() {
quick_test(sql, expected);
}
+#[test]
+fn test_select_order_by() {
+ let sql = "SELECT '1' from person order by id";
+
+ let expected = "Projection: Utf8(\"1\")\n Sort: person.id ASC NULLS
LAST\n Projection: Utf8(\"1\"), person.id\n TableScan: person";
+ quick_test(sql, expected);
+}
+
+#[test]
+fn test_select_distinct_order_by() {
+ let sql = "SELECT distinct '1' from person order by id";
+
+ let expected =
+ "Error during planning: For SELECT DISTINCT, ORDER BY expressions id
must appear in select list";
+
+ // It should return error.
+ let result = logical_plan(sql);
+ assert!(result.is_err());
+ let err = result.err().unwrap();
+ assert_eq!(err.to_string(), expected);
+}
+
#[test]
fn test_duplicated_left_join_key_inner_join() {
// person.id * 2 happen twice in left side.