mustafasrepo commented on code in PR #6234:
URL: https://github.com/apache/arrow-datafusion/pull/6234#discussion_r1188480502
##########
datafusion/core/tests/sqllogictests/test_files/groupby.slt:
##########
@@ -1921,3 +1921,96 @@ SELECT DISTINCT + col1 FROM tab2 AS cor0 GROUP BY
cor0.col1
41
59
61
+
+
+
+# Columns in the table are a,b,c,d. Source is CsvExec which is ordered by
+# a,b,c column. Column a has cardinality 2, column b has cardinality 4.
+# Column c has cardinality 100 (unique entries). Column d has cardinality 5.
+statement ok
+CREATE EXTERNAL TABLE annotated_data_finite2 (
+ a0 INTEGER,
+ a INTEGER,
+ b INTEGER,
+ c INTEGER,
+ d INTEGER
+)
+STORED AS CSV
+WITH HEADER ROW
+WITH ORDER (a ASC, b ASC, c ASC)
+LOCATION 'tests/data/window_2.csv';
+
+
+# test_source_sorted_groupby
+query TT
+EXPLAIN SELECT a, b,
+ SUM(c) as summation1
+ FROM annotated_data_finite2
+ GROUP BY b, a
+----
+logical_plan
+Projection: annotated_data_finite2.a, annotated_data_finite2.b,
SUM(annotated_data_finite2.c) AS summation1
+ Aggregate: groupBy=[[annotated_data_finite2.b, annotated_data_finite2.a]],
aggr=[[SUM(annotated_data_finite2.c)]]
+ TableScan: annotated_data_finite2 projection=[a, b, c]
+physical_plan
+ProjectionExec: expr=[a@1 as a, b@0 as b, SUM(annotated_data_finite2.c)@2 as
summation1]
+ AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a],
aggr=[SUM(annotated_data_finite2.c)]
+ CoalesceBatchesExec: target_batch_size=8192
+ RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 },
Column { name: "a", index: 1 }], 4), input_partitions=4
+ AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a],
aggr=[SUM(annotated_data_finite2.c)], ordering_mode=FullyOrdered
+ RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+ CsvExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b,
c], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS
LAST], has_header=true
Review Comment:
The above changes would turn this section into
```sql
AggregateExec: mode=Single, gby=[b@1 as b, a@0 as a],
aggr=[SUM(annotated_data_finite2.c)], ordering_mode=FullyOrdered
CsvExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b,
c], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS
LAST, c@2 ASC NULLS LAST], has_header=true
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]