This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 8353a2ca2f Port tests in `group_by.rs` to sqllogictest (#8834)
8353a2ca2f is described below

commit 8353a2ca2fd1f0ed5fc764b7463dfbcaa033ceef
Author: Trevor Hilton <[email protected]>
AuthorDate: Fri Jan 12 17:25:51 2024 -0500

    Port tests in `group_by.rs` to sqllogictest (#8834)
    
    * Add group_by_date_trunc and group_by_limit tests to groupby.slt
    
    * Remove two group_by_date_trunc and group_by_limit from .rs tests
    
    * port group_by_dictionary to groupby.slt
    
    * remove group_by.rs
    
    * rename groupby.slt to group_by.slt
    
    * fix typo in comment
---
 datafusion/core/tests/sql/group_by.rs              | 253 ------------
 datafusion/core/tests/sql/mod.rs                   |   1 -
 .../test_files/{groupby.slt => group_by.slt}       | 430 +++++++++++++++++++++
 3 files changed, 430 insertions(+), 254 deletions(-)
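
For context, the sqllogictest format these tests were ported to pairs each
directive with the SQL to run: `statement ok` asserts a statement succeeds,
while `query` runs SQL and diffs its output against the rows listed after a
`----` separator. The letters after `query` declare the result column types
(here `I` appears to be integer, `P` timestamp, `T` text, and `?` other),
and `rowsort` makes the comparison order-insensitive, much as
`assert_batches_sorted_eq!` did in the removed Rust tests. A minimal
sketch, using a hypothetical table `t`:

    # `statement ok` expects success and produces no output
    statement ok
    CREATE TABLE t (x INT) AS VALUES (1), (2), (2);

    # `II` declares two integer result columns; expected rows follow `----`
    query II rowsort
    SELECT x, count(*) FROM t GROUP BY x;
    ----
    1 1
    2 2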

diff --git a/datafusion/core/tests/sql/group_by.rs b/datafusion/core/tests/sql/group_by.rs
deleted file mode 100644
index 58f0ac21d9..0000000000
--- a/datafusion/core/tests/sql/group_by.rs
+++ /dev/null
@@ -1,253 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use super::*;
-use arrow::util::pretty::pretty_format_batches;
-use arrow_schema::{DataType, TimeUnit};
-
-#[tokio::test]
-async fn group_by_date_trunc() -> Result<()> {
-    let tmp_dir = TempDir::new()?;
-    let ctx = SessionContext::new();
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("c2", DataType::UInt64, false),
-        Field::new(
-            "t1",
-            DataType::Timestamp(TimeUnit::Microsecond, None),
-            false,
-        ),
-    ]));
-
-    // generate a partitioned file
-    for partition in 0..4 {
-        let filename = format!("partition-{}.{}", partition, "csv");
-        let file_path = tmp_dir.path().join(filename);
-        let mut file = File::create(file_path)?;
-
-        // generate some data
-        for i in 0..10 {
-            let data = format!("{},2020-12-{}T00:00:00.000Z\n", i, i + 10);
-            file.write_all(data.as_bytes())?;
-        }
-    }
-
-    ctx.register_csv(
-        "test",
-        tmp_dir.path().to_str().unwrap(),
-        CsvReadOptions::new().schema(&schema).has_header(false),
-    )
-    .await?;
-
-    let results = plan_and_collect(
-        &ctx,
-        "SELECT date_trunc('week', t1) as week, SUM(c2) FROM test GROUP BY 
date_trunc('week', t1)",
-    ).await?;
-
-    let expected = [
-        "+---------------------+--------------+",
-        "| week                | SUM(test.c2) |",
-        "+---------------------+--------------+",
-        "| 2020-12-07T00:00:00 | 24           |",
-        "| 2020-12-14T00:00:00 | 156          |",
-        "+---------------------+--------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &results);
-
-    Ok(())
-}
-
-#[tokio::test]
-async fn group_by_limit() -> Result<()> {
-    let tmp_dir = TempDir::new()?;
-    let ctx = create_groupby_context(&tmp_dir).await?;
-
-    let sql = "SELECT trace_id, MAX(ts) from traces group by trace_id order by 
MAX(ts) desc limit 4";
-    let dataframe = ctx.sql(sql).await?;
-
-    // ensure we see `lim=[4]`
-    let physical_plan = dataframe.create_physical_plan().await?;
-    let mut expected_physical_plan = r#"
-GlobalLimitExec: skip=0, fetch=4
-  SortExec: TopK(fetch=4), expr=[MAX(traces.ts)@1 DESC]
-    AggregateExec: mode=Single, gby=[trace_id@0 as trace_id], aggr=[MAX(traces.ts)], lim=[4]
-    "#.trim().to_string();
-    let actual_phys_plan =
-        format_plan(physical_plan.clone(), &mut expected_physical_plan);
-    assert_eq!(actual_phys_plan, expected_physical_plan);
-
-    let batches = collect(physical_plan, ctx.task_ctx()).await?;
-    let expected = r#"
-+----------+----------------------+
-| trace_id | MAX(traces.ts)       |
-+----------+----------------------+
-| 9        | 2020-12-01T00:00:18Z |
-| 8        | 2020-12-01T00:00:17Z |
-| 7        | 2020-12-01T00:00:16Z |
-| 6        | 2020-12-01T00:00:15Z |
-+----------+----------------------+
-"#
-    .trim();
-    let actual = format!("{}", pretty_format_batches(&batches)?);
-    assert_eq!(actual, expected);
-
-    Ok(())
-}
-
-fn format_plan(
-    physical_plan: Arc<dyn ExecutionPlan>,
-    expected_phys_plan: &mut String,
-) -> String {
-    let actual_phys_plan = displayable(physical_plan.as_ref()).indent(true).to_string();
-    let last_line = actual_phys_plan
-        .as_str()
-        .lines()
-        .last()
-        .expect("Plan should not be empty");
-
-    expected_phys_plan.push('\n');
-    expected_phys_plan.push_str(last_line);
-    expected_phys_plan.push('\n');
-    actual_phys_plan
-}
-
-async fn create_groupby_context(tmp_dir: &TempDir) -> Result<SessionContext> {
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("trace_id", DataType::Utf8, false),
-        Field::new(
-            "ts",
-            DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())),
-            false,
-        ),
-    ]));
-
-    // generate a file
-    let filename = "traces.csv";
-    let file_path = tmp_dir.path().join(filename);
-    let mut file = File::create(file_path)?;
-
-    // generate some data
-    for trace_id in 0..10 {
-        for ts in 0..10 {
-            let ts = trace_id + ts;
-            let data = format!("\"{trace_id}\",2020-12-01T00:00:{ts:02}.000Z\n");
-            file.write_all(data.as_bytes())?;
-        }
-    }
-
-    let cfg = SessionConfig::new().with_target_partitions(1);
-    let ctx = SessionContext::new_with_config(cfg);
-    ctx.register_csv(
-        "traces",
-        tmp_dir.path().to_str().unwrap(),
-        CsvReadOptions::new().schema(&schema).has_header(false),
-    )
-    .await?;
-    Ok(ctx)
-}
-
-#[tokio::test]
-async fn group_by_dictionary() {
-    async fn run_test_case<K: ArrowDictionaryKeyType>() {
-        let ctx = SessionContext::new();
-
-        // input data looks like:
-        // A, 1
-        // B, 2
-        // A, 2
-        // A, 4
-        // C, 1
-        // A, 1
-
-        let dict_array: DictionaryArray<K> =
-            vec!["A", "B", "A", "A", "C", "A"].into_iter().collect();
-        let dict_array = Arc::new(dict_array);
-
-        let val_array: Int64Array = vec![1, 2, 2, 4, 1, 1].into();
-        let val_array = Arc::new(val_array);
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("dict", dict_array.data_type().clone(), false),
-            Field::new("val", val_array.data_type().clone(), false),
-        ]));
-
-        let batch =
-            RecordBatch::try_new(schema.clone(), vec![dict_array, val_array]).unwrap();
-
-        ctx.register_batch("t", batch).unwrap();
-
-        let results =
-            plan_and_collect(&ctx, "SELECT dict, count(val) FROM t GROUP BY 
dict")
-                .await
-                .expect("ran plan correctly");
-
-        let expected = [
-            "+------+--------------+",
-            "| dict | COUNT(t.val) |",
-            "+------+--------------+",
-            "| A    | 4            |",
-            "| B    | 1            |",
-            "| C    | 1            |",
-            "+------+--------------+",
-        ];
-        assert_batches_sorted_eq!(expected, &results);
-
-        // Now, use dict as an aggregate
-        let results =
-            plan_and_collect(&ctx, "SELECT val, count(dict) FROM t GROUP BY 
val")
-                .await
-                .expect("ran plan correctly");
-
-        let expected = [
-            "+-----+---------------+",
-            "| val | COUNT(t.dict) |",
-            "+-----+---------------+",
-            "| 1   | 3             |",
-            "| 2   | 2             |",
-            "| 4   | 1             |",
-            "+-----+---------------+",
-        ];
-        assert_batches_sorted_eq!(expected, &results);
-
-        // Now, use dict as an aggregate
-        let results = plan_and_collect(
-            &ctx,
-            "SELECT val, count(distinct dict) FROM t GROUP BY val",
-        )
-        .await
-        .expect("ran plan correctly");
-
-        let expected = [
-            "+-----+------------------------+",
-            "| val | COUNT(DISTINCT t.dict) |",
-            "+-----+------------------------+",
-            "| 1   | 2                      |",
-            "| 2   | 2                      |",
-            "| 4   | 1                      |",
-            "+-----+------------------------+",
-        ];
-        assert_batches_sorted_eq!(expected, &results);
-    }
-
-    run_test_case::<Int8Type>().await;
-    run_test_case::<Int16Type>().await;
-    run_test_case::<Int32Type>().await;
-    run_test_case::<Int64Type>().await;
-    run_test_case::<UInt8Type>().await;
-    run_test_case::<UInt16Type>().await;
-    run_test_case::<UInt32Type>().await;
-    run_test_case::<UInt64Type>().await;
-}
diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs
index 849d85dec6..3f52d2aae8 100644
--- a/datafusion/core/tests/sql/mod.rs
+++ b/datafusion/core/tests/sql/mod.rs
@@ -76,7 +76,6 @@ pub mod create_drop;
 pub mod csv_files;
 pub mod explain_analyze;
 pub mod expr;
-pub mod group_by;
 pub mod joins;
 pub mod order;
 pub mod partitioned_csv;
diff --git a/datafusion/sqllogictest/test_files/groupby.slt b/datafusion/sqllogictest/test_files/group_by.slt
similarity index 92%
rename from datafusion/sqllogictest/test_files/groupby.slt
rename to datafusion/sqllogictest/test_files/group_by.slt
index b09ff79e88..7c5803d385 100644
--- a/datafusion/sqllogictest/test_files/groupby.slt
+++ b/datafusion/sqllogictest/test_files/group_by.slt
@@ -4284,3 +4284,433 @@ LIMIT 5
 1 FRA 3 2022-01-02T12:00:00 EUR 200
 1 TUR 2 2022-01-01T11:30:00 TRY 75
 1 TUR 4 2022-01-03T10:00:00 TRY 100
+
+# Create a table with timestamp data
+statement ok
+CREATE TABLE src_table (
+       t1 TIMESTAMP,
+       c2 INT,
+) AS VALUES
+('2020-12-10T00:00:00.00Z', 0),
+('2020-12-11T00:00:00.00Z', 1),
+('2020-12-12T00:00:00.00Z', 2),
+('2020-12-13T00:00:00.00Z', 3),
+('2020-12-14T00:00:00.00Z', 4),
+('2020-12-15T00:00:00.00Z', 5),
+('2020-12-16T00:00:00.00Z', 6),
+('2020-12-17T00:00:00.00Z', 7),
+('2020-12-18T00:00:00.00Z', 8),
+('2020-12-19T00:00:00.00Z', 9);
+
+# Use src_table to create a partitioned file
+query PI
+COPY (SELECT * FROM src_table)
+TO 'test_files/scratch/group_by/timestamp_table/0.csv'
+(FORMAT CSV, SINGLE_FILE_OUTPUT true);
+----
+10
+
+query PI
+COPY (SELECT * FROM src_table)
+TO 'test_files/scratch/group_by/timestamp_table/1.csv'
+(FORMAT CSV, SINGLE_FILE_OUTPUT true);
+----
+10
+
+query PI
+COPY (SELECT * FROM src_table)
+TO 'test_files/scratch/group_by/timestamp_table/2.csv'
+(FORMAT CSV, SINGLE_FILE_OUTPUT true);
+----
+10
+
+query PI
+COPY (SELECT * FROM src_table)
+TO 'test_files/scratch/group_by/timestamp_table/3.csv'
+(FORMAT CSV, SINGLE_FILE_OUTPUT true);
+----
+10
+
+# Create a table from the generated CSV files:
+statement ok
+CREATE EXTERNAL TABLE timestamp_table (
+       t1 TIMESTAMP,
+       c2 INT,
+)
+STORED AS CSV
+WITH HEADER ROW
+LOCATION 'test_files/scratch/group_by/timestamp_table';
+
+# Group By using date_trunc
+query PI rowsort
+SELECT date_trunc('week', t1) as week, sum(c2)
+FROM timestamp_table
+GROUP BY date_trunc('week', t1)
+----
+2020-12-07T00:00:00 24
+2020-12-14T00:00:00 156
+
+# GROUP BY using LIMIT
+query IP
+SELECT c2, MAX(t1)
+FROM timestamp_table
+GROUP BY c2
+ORDER BY MAX(t1) DESC
+LIMIT 4;
+----
+9 2020-12-19T00:00:00
+8 2020-12-18T00:00:00
+7 2020-12-17T00:00:00
+6 2020-12-16T00:00:00
+
+# Explain the GROUP BY with LIMIT to ensure the plan contains `lim=[4]`
+query TT
+EXPLAIN
+SELECT c2, MAX(t1)
+FROM timestamp_table
+GROUP BY c2
+ORDER BY MAX(t1) DESC
+LIMIT 4;
+----
+logical_plan
+Limit: skip=0, fetch=4
+--Sort: MAX(timestamp_table.t1) DESC NULLS FIRST, fetch=4
+----Aggregate: groupBy=[[timestamp_table.c2]], aggr=[[MAX(timestamp_table.t1)]]
+------TableScan: timestamp_table projection=[t1, c2]
+physical_plan
+GlobalLimitExec: skip=0, fetch=4
+--SortPreservingMergeExec: [MAX(timestamp_table.t1)@1 DESC], fetch=4
+----SortExec: TopK(fetch=4), expr=[MAX(timestamp_table.t1)@1 DESC]
+------AggregateExec: mode=FinalPartitioned, gby=[c2@0 as c2], aggr=[MAX(timestamp_table.t1)], lim=[4]
+--------CoalesceBatchesExec: target_batch_size=2
+----------RepartitionExec: partitioning=Hash([c2@0], 8), input_partitions=8
+------------AggregateExec: mode=Partial, gby=[c2@1 as c2], aggr=[MAX(timestamp_table.t1)], lim=[4]
+--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4
+----------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/0.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/2.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/3.csv]]}, projection=[t1, c2], has_header=true
+
+# Clean up
+statement ok
+DROP TABLE src_table;
+
+statement ok
+DROP TABLE timestamp_table;
+
+### BEGIN Group By with Dictionary Variants ###
+#
+# The following tests use GROUP BY on tables with dictionary columns.
+# The same test is repeated using dictionaries with the key types:
+#
+# - Int8
+# - Int16
+# - Int32
+# - Int64
+# - UInt8
+# - UInt16
+# - UInt32
+# - UInt64
+
+# Table with an int column and Dict<Int8> column:
+statement ok
+CREATE TABLE int8_dict AS VALUES 
+(1, arrow_cast('A', 'Dictionary(Int8, Utf8)')),
+(2, arrow_cast('B', 'Dictionary(Int8, Utf8)')),
+(2, arrow_cast('A', 'Dictionary(Int8, Utf8)')),
+(4, arrow_cast('A', 'Dictionary(Int8, Utf8)')),
+(1, arrow_cast('C', 'Dictionary(Int8, Utf8)')),
+(1, arrow_cast('A', 'Dictionary(Int8, Utf8)'));
+
+# Group by the non-dict column
+query ?I rowsort
+SELECT column2, count(column1) FROM int8_dict GROUP BY column2;
+----
+A 4
+B 1
+C 1
+
+# Group by the value column with dict as aggregate
+query II rowsort
+SELECT column1, count(column2) FROM int8_dict GROUP BY column1;
+----
+1 3
+2 2
+4 1
+
+# Group by with dict as aggregate using distinct
+query II rowsort
+SELECT column1, count(distinct column2) FROM int8_dict GROUP BY column1;
+----
+1 2
+2 2
+4 1
+
+# Clean up
+statement ok
+DROP TABLE int8_dict;
+
+# Table with an int column and Dict<Int16> column:
+statement ok
+CREATE TABLE int16_dict AS VALUES 
+(1, arrow_cast('A', 'Dictionary(Int16, Utf8)')),
+(2, arrow_cast('B', 'Dictionary(Int16, Utf8)')),
+(2, arrow_cast('A', 'Dictionary(Int16, Utf8)')),
+(4, arrow_cast('A', 'Dictionary(Int16, Utf8)')),
+(1, arrow_cast('C', 'Dictionary(Int16, Utf8)')),
+(1, arrow_cast('A', 'Dictionary(Int16, Utf8)'));
+
+# Group by the non-dict column
+query ?I rowsort
+SELECT column2, count(column1) FROM int16_dict GROUP BY column2;
+----
+A 4
+B 1
+C 1
+
+# Group by the value column with dict as aggregate
+query II rowsort
+SELECT column1, count(column2) FROM int16_dict GROUP BY column1;
+----
+1 3
+2 2
+4 1
+
+# Group by with dict as aggregate using distinct
+query II rowsort
+SELECT column1, count(distinct column2) FROM int16_dict GROUP BY column1;
+----
+1 2
+2 2
+4 1
+
+# Clean up
+statement ok
+DROP TABLE int16_dict;
+
+# Table with an int column and Dict<Int32> column:
+statement ok
+CREATE TABLE int32_dict AS VALUES 
+(1, arrow_cast('A', 'Dictionary(Int32, Utf8)')),
+(2, arrow_cast('B', 'Dictionary(Int32, Utf8)')),
+(2, arrow_cast('A', 'Dictionary(Int32, Utf8)')),
+(4, arrow_cast('A', 'Dictionary(Int32, Utf8)')),
+(1, arrow_cast('C', 'Dictionary(Int32, Utf8)')),
+(1, arrow_cast('A', 'Dictionary(Int32, Utf8)'));
+
+# Group by the non-dict column
+query ?I rowsort
+SELECT column2, count(column1) FROM int32_dict GROUP BY column2;
+----
+A 4
+B 1
+C 1
+
+# Group by the value column with dict as aggregate
+query II rowsort
+SELECT column1, count(column2) FROM int32_dict GROUP BY column1;
+----
+1 3
+2 2
+4 1
+
+# Group by with dict as aggregate using distinct
+query II rowsort
+SELECT column1, count(distinct column2) FROM int32_dict GROUP BY column1;
+----
+1 2
+2 2
+4 1
+
+# Clean up
+statement ok
+DROP TABLE int32_dict;
+
+# Table with an int column and Dict<Int64> column:
+statement ok
+CREATE TABLE int64_dict AS VALUES 
+(1, arrow_cast('A', 'Dictionary(Int64, Utf8)')),
+(2, arrow_cast('B', 'Dictionary(Int64, Utf8)')),
+(2, arrow_cast('A', 'Dictionary(Int64, Utf8)')),
+(4, arrow_cast('A', 'Dictionary(Int64, Utf8)')),
+(1, arrow_cast('C', 'Dictionary(Int64, Utf8)')),
+(1, arrow_cast('A', 'Dictionary(Int64, Utf8)'));
+
+# Group by the non-dict column
+query ?I rowsort
+SELECT column2, count(column1) FROM int64_dict GROUP BY column2;
+----
+A 4
+B 1
+C 1
+
+# Group by the value column with dict as aggregate
+query II rowsort
+SELECT column1, count(column2) FROM int64_dict GROUP BY column1;
+----
+1 3
+2 2
+4 1
+
+# Group by with dict as aggregate using distinct
+query II rowsort
+SELECT column1, count(distinct column2) FROM int64_dict GROUP BY column1;
+----
+1 2
+2 2
+4 1
+
+# Clean up
+statement ok
+DROP TABLE int64_dict;
+
+# Table with an int column and Dict<UInt8> column:
+statement ok
+CREATE TABLE uint8_dict AS VALUES 
+(1, arrow_cast('A', 'Dictionary(UInt8, Utf8)')),
+(2, arrow_cast('B', 'Dictionary(UInt8, Utf8)')),
+(2, arrow_cast('A', 'Dictionary(UInt8, Utf8)')),
+(4, arrow_cast('A', 'Dictionary(UInt8, Utf8)')),
+(1, arrow_cast('C', 'Dictionary(UInt8, Utf8)')),
+(1, arrow_cast('A', 'Dictionary(UInt8, Utf8)'));
+
+# Group by the non-dict column
+query ?I rowsort
+SELECT column2, count(column1) FROM uint8_dict GROUP BY column2;
+----
+A 4
+B 1
+C 1
+
+# Group by the value column with dict as aggregate
+query II rowsort
+SELECT column1, count(column2) FROM uint8_dict GROUP BY column1;
+----
+1 3
+2 2
+4 1
+
+# Group by with dict as aggregate using distinct
+query II rowsort
+SELECT column1, count(distinct column2) FROM uint8_dict GROUP BY column1;
+----
+1 2
+2 2
+4 1
+
+# Clean up
+statement ok
+DROP TABLE uint8_dict;
+
+# Table with an int column and Dict<UInt16> column:
+statement ok
+CREATE TABLE uint16_dict AS VALUES 
+(1, arrow_cast('A', 'Dictionary(UInt16, Utf8)')),
+(2, arrow_cast('B', 'Dictionary(UInt16, Utf8)')),
+(2, arrow_cast('A', 'Dictionary(UInt16, Utf8)')),
+(4, arrow_cast('A', 'Dictionary(UInt16, Utf8)')),
+(1, arrow_cast('C', 'Dictionary(UInt16, Utf8)')),
+(1, arrow_cast('A', 'Dictionary(UInt16, Utf8)'));
+
+# Group by the non-dict column
+query ?I rowsort
+SELECT column2, count(column1) FROM uint16_dict GROUP BY column2;
+----
+A 4
+B 1
+C 1
+
+# Group by the value column with dict as aggregate
+query II rowsort
+SELECT column1, count(column2) FROM uint16_dict GROUP BY column1;
+----
+1 3
+2 2
+4 1
+
+# Group by with dict as aggregate using distinct
+query II rowsort
+SELECT column1, count(distinct column2) FROM uint16_dict GROUP BY column1;
+----
+1 2
+2 2
+4 1
+
+# Clean up
+statement ok
+DROP TABLE uint16_dict;
+
+# Table with an int column and Dict<UInt32> column:
+statement ok
+CREATE TABLE uint32_dict AS VALUES 
+(1, arrow_cast('A', 'Dictionary(UInt32, Utf8)')),
+(2, arrow_cast('B', 'Dictionary(UInt32, Utf8)')),
+(2, arrow_cast('A', 'Dictionary(UInt32, Utf8)')),
+(4, arrow_cast('A', 'Dictionary(UInt32, Utf8)')),
+(1, arrow_cast('C', 'Dictionary(UInt32, Utf8)')),
+(1, arrow_cast('A', 'Dictionary(UInt32, Utf8)'));
+
+# Group by the non-dict column
+query ?I rowsort
+SELECT column2, count(column1) FROM uint32_dict GROUP BY column2;
+----
+A 4
+B 1
+C 1
+
+# Group by the value column with dict as aggregate
+query II rowsort
+SELECT column1, count(column2) FROM uint32_dict GROUP BY column1;
+----
+1 3
+2 2
+4 1
+
+# Group by with dict as aggregate using distinct
+query II rowsort
+SELECT column1, count(distinct column2) FROM uint32_dict GROUP BY column1;
+----
+1 2
+2 2
+4 1
+
+# Clean up
+statement ok
+DROP TABLE uint32_dict;
+
+# Table with an int column and Dict<UInt64> column:
+statement ok
+CREATE TABLE uint64_dict AS VALUES 
+(1, arrow_cast('A', 'Dictionary(UInt64, Utf8)')),
+(2, arrow_cast('B', 'Dictionary(UInt64, Utf8)')),
+(2, arrow_cast('A', 'Dictionary(UInt64, Utf8)')),
+(4, arrow_cast('A', 'Dictionary(UInt64, Utf8)')),
+(1, arrow_cast('C', 'Dictionary(UInt64, Utf8)')),
+(1, arrow_cast('A', 'Dictionary(UInt64, Utf8)'));
+
+# Group by the non-dict column
+query ?I rowsort
+SELECT column2, count(column1) FROM uint64_dict GROUP BY column2;
+----
+A 4
+B 1
+C 1
+
+# Group by the value column with dict as aggregate
+query II rowsort
+SELECT column1, count(column2) FROM uint64_dict GROUP BY column1;
+----
+1 3
+2 2
+4 1
+
+# Group by with dict as aggregate using distinct
+query II rowsort
+SELECT column1, count(distinct column2) FROM uint64_dict GROUP BY column1;
+----
+1 2
+2 2
+4 1
+
+# Clean up
+statement ok
+DROP TABLE uint64_dict;
+
+### END Group By with Dictionary Variants ###
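
To run just the ported tests, DataFusion's sqllogictest harness accepts a
file-name filter, so something like `cargo test --test sqllogictests -- group_by`
should execute only `group_by.slt` (exact invocation assumed; see the
sqllogictest README in the repository for details).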
