Copilot commented on code in PR #19538: URL: https://github.com/apache/datafusion/pull/19538#discussion_r2715298718
########## datafusion/sqllogictest/test_files/projection_pushdown.slt: ########## @@ -0,0 +1,1002 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +# Tests for projection pushdown behavior with get_field expressions +# +# This file tests the ExtractTrivialProjections optimizer rule and +# physical projection pushdown for: +# - get_field expressions (struct field access like s['foo']) +# - Pushdown through Filter, Sort, and TopK operators +# - Multi-partition scenarios with SortPreservingMergeExec +########## + +##################### +# Section 1: Setup - Single Partition Tests +##################### + +# Set target_partitions = 1 for deterministic plan output +statement ok +SET datafusion.execution.target_partitions = 1; + +# Create parquet file with struct column containing value and label fields +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, {value: 200, label: 'beta'}), + (3, {value: 150, label: 'gamma'}), + (4, {value: 300, label: 'delta'}), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/simple.parquet' +STORED AS PARQUET; + +# Create table for simple struct tests +statement ok +CREATE EXTERNAL TABLE simple_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/simple.parquet'; + +# Create parquet file with nested struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as nested + FROM VALUES + (1, {outer: {inner: 10, name: 'one'}, extra: 'x'}), + (2, {outer: {inner: 20, name: 'two'}, extra: 'y'}), + (3, {outer: {inner: 30, name: 'three'}, extra: 'z'}) +) TO 'test_files/scratch/projection_pushdown/nested.parquet' +STORED AS PARQUET; + +# Create table for nested struct tests +statement ok +CREATE EXTERNAL TABLE nested_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nested.parquet'; + +# Create parquet file with nullable struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, NULL), + (3, {value: 150, label: 'gamma'}), + (4, NULL), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/nullable.parquet' +STORED AS PARQUET; + +# Create table for nullable struct tests +statement ok +CREATE EXTERNAL TABLE nullable_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nullable.parquet'; + + +##################### +# Section 2: Basic get_field Pushdown (Projection above scan) +##################### + +### +# Test 2.1: Simple s['value'] - pushed into DataSourceExec +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 2.2: Multiple get_field expressions - all pushed +### + +query TT +EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet + +# Verify correctness +query IIT +SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id; +---- +1 100 alpha +2 200 beta +3 150 gamma +4 300 delta +5 250 epsilon + +### +# Test 2.3: Nested s['outer']['inner'] - pushed +### + +query TT +EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; +---- +logical_plan +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +02)--TableScan: nested_struct projection=[id, nested] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet + +# Verify correctness +query II +SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id; +---- +1 10 +2 20 +3 30 + +### +# Test 2.4: s['value'] + 1 - entire expression pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 2.5: s['label'] || '_suffix' - pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id; +---- +1 alpha_suffix +2 beta_suffix +3 gamma_suffix +4 delta_suffix +5 epsilon_suffix + + +##################### +# Section 3: Projection Through Filter +##################### + +### +# Test 3.1: Simple get_field through Filter - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)FilterExec: id@0 > 2 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 150 +4 300 +5 250 + +### +# Test 3.2: s['value'] + 1 through Filter - get_field extracted and pushed +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)ProjectionExec: expr=[id@1 as id, __extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 151 +4 301 +5 251 + +### +# Test 3.3: Filter on get_field expression +### + +query TT +EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) +02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +physical_plan +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] +02)--FilterExec: get_field(s@1, value) > 150 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150 ORDER BY id; +---- +2 beta +4 delta +5 epsilon + + +##################### +# Section 4: Projection Through Sort (no LIMIT) +##################### + +### +# Test 4.1: Simple get_field through Sort - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 4.2: s['value'] + 1 through Sort - split projection +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + Review Comment: This section label says "split projection", but the expected physical plan shows the full `get_field(s@1, value) + 1` expression pushed into `DataSourceExec` (no separate extracted projection). Please clarify the comment or adjust expected output to reflect whether splitting is expected here. ########## datafusion/sqllogictest/test_files/projection_pushdown.slt: ########## @@ -0,0 +1,1002 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +# Tests for projection pushdown behavior with get_field expressions +# +# This file tests the ExtractTrivialProjections optimizer rule and +# physical projection pushdown for: +# - get_field expressions (struct field access like s['foo']) +# - Pushdown through Filter, Sort, and TopK operators +# - Multi-partition scenarios with SortPreservingMergeExec +########## + +##################### +# Section 1: Setup - Single Partition Tests +##################### + +# Set target_partitions = 1 for deterministic plan output +statement ok +SET datafusion.execution.target_partitions = 1; + +# Create parquet file with struct column containing value and label fields +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, {value: 200, label: 'beta'}), + (3, {value: 150, label: 'gamma'}), + (4, {value: 300, label: 'delta'}), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/simple.parquet' +STORED AS PARQUET; + +# Create table for simple struct tests +statement ok +CREATE EXTERNAL TABLE simple_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/simple.parquet'; + +# Create parquet file with nested struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as nested + FROM VALUES + (1, {outer: {inner: 10, name: 'one'}, extra: 'x'}), + (2, {outer: {inner: 20, name: 'two'}, extra: 'y'}), + (3, {outer: {inner: 30, name: 'three'}, extra: 'z'}) +) TO 'test_files/scratch/projection_pushdown/nested.parquet' +STORED AS PARQUET; + +# Create table for nested struct tests +statement ok +CREATE EXTERNAL TABLE nested_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nested.parquet'; + +# Create parquet file with nullable struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, NULL), + (3, {value: 150, label: 'gamma'}), + (4, NULL), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/nullable.parquet' +STORED AS PARQUET; + +# Create table for nullable struct tests +statement ok +CREATE EXTERNAL TABLE nullable_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nullable.parquet'; + + +##################### +# Section 2: Basic get_field Pushdown (Projection above scan) +##################### + +### +# Test 2.1: Simple s['value'] - pushed into DataSourceExec +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 2.2: Multiple get_field expressions - all pushed +### + +query TT +EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet + +# Verify correctness +query IIT +SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id; +---- +1 100 alpha +2 200 beta +3 150 gamma +4 300 delta +5 250 epsilon + +### +# Test 2.3: Nested s['outer']['inner'] - pushed +### + +query TT +EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; +---- +logical_plan +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +02)--TableScan: nested_struct projection=[id, nested] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet + +# Verify correctness +query II +SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id; +---- +1 10 +2 20 +3 30 + +### +# Test 2.4: s['value'] + 1 - entire expression pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 2.5: s['label'] || '_suffix' - pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id; +---- +1 alpha_suffix +2 beta_suffix +3 gamma_suffix +4 delta_suffix +5 epsilon_suffix + + +##################### +# Section 3: Projection Through Filter +##################### + +### +# Test 3.1: Simple get_field through Filter - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)FilterExec: id@0 > 2 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 150 +4 300 +5 250 + +### +# Test 3.2: s['value'] + 1 through Filter - get_field extracted and pushed +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)ProjectionExec: expr=[id@1 as id, __extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 151 +4 301 +5 251 + +### +# Test 3.3: Filter on get_field expression +### + +query TT +EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) +02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +physical_plan +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] +02)--FilterExec: get_field(s@1, value) > 150 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150 ORDER BY id; +---- +2 beta +4 delta +5 epsilon + + +##################### +# Section 4: Projection Through Sort (no LIMIT) +##################### + +### +# Test 4.1: Simple get_field through Sort - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 4.2: s['value'] + 1 through Sort - split projection +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 4.3: Sort by get_field expression +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; +---- +logical_plan +01)Sort: simple_struct.s[value] ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; +---- +1 100 +3 150 +2 200 +5 250 +4 300 + + +##################### +# Section 5: Projection Through TopK (ORDER BY + LIMIT) +##################### + +### +# Test 5.1: Simple get_field through TopK - pushed (trivial) +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 +2 200 +3 150 + +### +# Test 5.2: s['value'] + 1 through TopK - NOT pushed (non-trivial stays above) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; +---- +1 101 +2 201 +3 151 + +### +# Test 5.3: Multiple get_field through TopK - all pushed +### + +query TT +EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query IIT +SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 alpha +2 200 beta +3 150 gamma + +### +# Test 5.4: Nested get_field through TopK - pushed +### + +query TT +EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT 2; +---- +logical_plan +01)Sort: nested_struct.id ASC NULLS LAST, fetch=2 +02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +03)----TableScan: nested_struct projection=[id, nested] +physical_plan +01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT 2; +---- +1 10 +2 20 + +### +# Test 5.5: String concat through TopK - NOT pushed (non-trivial) +### + +query TT +EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query IT +SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT 3; +---- +1 alpha_suffix +2 beta_suffix +3 gamma_suffix + + +##################### +# Section 6: Combined Operators +##################### + +### +# Test 6.1: Filter + Sort + get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value']; +---- +logical_plan +01)Sort: simple_struct.s[value] ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----Filter: simple_struct.id > Int64(1) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +physical_plan +01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--FilterExec: id@0 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value']; +---- +3 150 +2 200 +5 250 +4 300 + +### +# Test 6.2: Filter + TopK + get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value'] LIMIT 2; +---- +logical_plan +01)Sort: simple_struct.s[value] ASC NULLS LAST, fetch=2 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----Filter: simple_struct.id > Int64(1) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +physical_plan +01)SortExec: TopK(fetch=2), expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--FilterExec: id@0 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value'] LIMIT 2; +---- +3 150 +2 200 + +### +# Test 6.3: Filter + TopK + get_field with arithmetic +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 2; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----Filter: simple_struct.id > Int64(1) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +physical_plan +01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--ProjectionExec: expr=[id@1 as id, __extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 2; +---- +2 201 +3 151 + + +##################### +# Section 7: Multi-Partition Tests +##################### + +# Set target_partitions = 4 for parallel execution +statement ok +SET datafusion.execution.target_partitions = 4; + +# Create 5 parquet files (more than partitions) for parallel tests +statement ok +COPY (SELECT 1 as id, {value: 100, label: 'alpha'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part1.parquet' +STORED AS PARQUET; + +statement ok +COPY (SELECT 2 as id, {value: 200, label: 'beta'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part2.parquet' +STORED AS PARQUET; + +statement ok +COPY (SELECT 3 as id, {value: 150, label: 'gamma'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part3.parquet' +STORED AS PARQUET; + +statement ok +COPY (SELECT 4 as id, {value: 300, label: 'delta'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part4.parquet' +STORED AS PARQUET; + +statement ok +COPY (SELECT 5 as id, {value: 250, label: 'epsilon'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part5.parquet' +STORED AS PARQUET; + +# Create table from multiple parquet files +statement ok +CREATE EXTERNAL TABLE multi_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/multi/'; + +### +# Test 7.1: Multi-partition Sort with get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id; +---- +logical_plan +01)Sort: multi_struct.id ASC NULLS LAST +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +03)----TableScan: multi_struct projection=[id, s] +physical_plan +01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] +02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) as multi_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM multi_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 7.2: Multi-partition TopK with get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +03)----TableScan: multi_struct projection=[id, s] +physical_plan +01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 +02)--SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) as multi_struct.s[value]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3; +---- +1 100 +2 200 +3 150 + +### +# Test 7.3: Multi-partition TopK with arithmetic (non-trivial stays above merge) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: multi_struct projection=[id, s] +physical_plan +01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 +02)--SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) + 1 as multi_struct.s[value] + Int64(1)], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3; +---- +1 101 +2 201 +3 151 + +### +# Test 7.4: Multi-partition Filter with get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id; +---- +logical_plan +01)Sort: multi_struct.id ASC NULLS LAST +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +03)----Filter: multi_struct.id > Int64(2) +04)------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] +physical_plan +01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] +02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----FilterExec: id@0 > 2 +04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 +05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) as multi_struct.s[value]], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id; +---- +3 150 +4 300 +5 250 + +### +# Test 7.5: Aggregation with get_field (CoalescePartitions) +### + +query TT +EXPLAIN SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label']; +---- +logical_plan +01)Aggregate: groupBy=[[get_field(multi_struct.s, Utf8("label"))]], aggr=[[sum(get_field(multi_struct.s, Utf8("value")))]] +02)--TableScan: multi_struct projection=[s] +physical_plan +01)AggregateExec: mode=FinalPartitioned, gby=[multi_struct.s[label]@0 as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] +02)--RepartitionExec: partitioning=Hash([multi_struct.s[label]@0], 4), input_partitions=3 +03)----AggregateExec: mode=Partial, gby=[get_field(s@0, label) as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] +04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[s], file_type=parquet + +# Verify correctness +query TI +SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label'] ORDER BY s['label']; +---- +alpha 100 +beta 200 +delta 300 +epsilon 250 +gamma 150 + + +##################### +# Section 8: Edge Cases +##################### + +# Reset to single partition for edge case tests +statement ok +SET datafusion.execution.target_partitions = 1; + +### +# Test 8.1: get_field on nullable struct column +### + +query TT +EXPLAIN SELECT id, s['value'] FROM nullable_struct; +---- +logical_plan +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) +02)--TableScan: nullable_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, get_field(s@1, value) as nullable_struct.s[value]], file_type=parquet + +# Verify correctness (NULL struct returns NULL field) +query II +SELECT id, s['value'] FROM nullable_struct ORDER BY id; +---- +1 100 +2 NULL +3 150 +4 NULL +5 250 + +### +# Test 8.2: get_field returning NULL values +### + +query TT +EXPLAIN SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL; +---- +logical_plan +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) +02)--Filter: get_field(nullable_struct.s, Utf8("value")) IS NOT NULL +03)----TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] +physical_plan +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as nullable_struct.s[label]] +02)--FilterExec: get_field(s@1, value) IS NOT NULL +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, s], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL ORDER BY id; +---- +1 alpha +3 gamma +5 epsilon + +### +# Test 8.3: Mixed trivial and non-trivial in same projection +### + +query TT +EXPLAIN SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, __common_expr_1 AS simple_struct.s[value], __common_expr_1 AS simple_struct.s[value] + Int64(10), get_field(simple_struct.s, Utf8("label")) +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __common_expr_1, simple_struct.id, simple_struct.s +04)------TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, value) + 10 as simple_struct.s[value] + Int64(10), get_field(s@1, label) as simple_struct.s[label]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query IIIT +SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 110 alpha +2 200 210 beta +3 150 160 gamma + +### +# Test 8.4: Literal projection through TopK (NOT pushed through TopK, stays above) +### + +query TT +EXPLAIN SELECT id, 42 as constant FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, Int64(42) AS constant +03)----TableScan: simple_struct projection=[id] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, 42 as constant], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, 42 as constant FROM simple_struct ORDER BY id LIMIT 3; +---- +1 42 +2 42 +3 42 + +### +# Test 8.5: Simple column through TopK (baseline comparison) +### + +query TT +EXPLAIN SELECT id FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--TableScan: simple_struct projection=[id] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query I +SELECT id FROM simple_struct ORDER BY id LIMIT 3; +---- +1 +2 +3 + + +##################### +# Section 9: Coverage Tests - Edge Cases for Uncovered Code Paths +##################### + +### +# Test 9.1: TopK with non-trivial projection that should NOT be pushed +# This tests the is_trivial_or_narrows_schema guard in sort.rs +# The projection adds a computed column (literal + column), which should not be pushed into TopK +### + +query TT +EXPLAIN SELECT id, id + 100 as computed FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, simple_struct.id + Int64(100) AS computed +03)----TableScan: simple_struct projection=[id] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, id@0 + 100 as computed], file_type=parquet, predicate=DynamicFilter [ empty ] + Review Comment: The comment claims this query exercises a guard that prevents pushing computed columns into TopK, but the expected physical plan shows `id@0 + 100 as computed` pushed into `DataSourceExec`. Please update the comment or expected plan so they agree on whether this pushdown is allowed. ########## datafusion/sqllogictest/test_files/projection_pushdown.slt: ########## @@ -0,0 +1,1002 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +# Tests for projection pushdown behavior with get_field expressions +# +# This file tests the ExtractTrivialProjections optimizer rule and +# physical projection pushdown for: +# - get_field expressions (struct field access like s['foo']) +# - Pushdown through Filter, Sort, and TopK operators +# - Multi-partition scenarios with SortPreservingMergeExec +########## + +##################### +# Section 1: Setup - Single Partition Tests +##################### + +# Set target_partitions = 1 for deterministic plan output +statement ok +SET datafusion.execution.target_partitions = 1; + +# Create parquet file with struct column containing value and label fields +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, {value: 200, label: 'beta'}), + (3, {value: 150, label: 'gamma'}), + (4, {value: 300, label: 'delta'}), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/simple.parquet' +STORED AS PARQUET; + +# Create table for simple struct tests +statement ok +CREATE EXTERNAL TABLE simple_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/simple.parquet'; + +# Create parquet file with nested struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as nested + FROM VALUES + (1, {outer: {inner: 10, name: 'one'}, extra: 'x'}), + (2, {outer: {inner: 20, name: 'two'}, extra: 'y'}), + (3, {outer: {inner: 30, name: 'three'}, extra: 'z'}) +) TO 'test_files/scratch/projection_pushdown/nested.parquet' +STORED AS PARQUET; + +# Create table for nested struct tests +statement ok +CREATE EXTERNAL TABLE nested_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nested.parquet'; + +# Create parquet file with nullable struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, NULL), + (3, {value: 150, label: 'gamma'}), + (4, NULL), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/nullable.parquet' +STORED AS PARQUET; + +# Create table for nullable struct tests +statement ok +CREATE EXTERNAL TABLE nullable_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nullable.parquet'; + + +##################### +# Section 2: Basic get_field Pushdown (Projection above scan) +##################### + +### +# Test 2.1: Simple s['value'] - pushed into DataSourceExec +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 2.2: Multiple get_field expressions - all pushed +### + +query TT +EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet + +# Verify correctness +query IIT +SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id; +---- +1 100 alpha +2 200 beta +3 150 gamma +4 300 delta +5 250 epsilon + +### +# Test 2.3: Nested s['outer']['inner'] - pushed +### + +query TT +EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; +---- +logical_plan +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +02)--TableScan: nested_struct projection=[id, nested] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet + +# Verify correctness +query II +SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id; +---- +1 10 +2 20 +3 30 + +### +# Test 2.4: s['value'] + 1 - entire expression pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 2.5: s['label'] || '_suffix' - pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id; +---- +1 alpha_suffix +2 beta_suffix +3 gamma_suffix +4 delta_suffix +5 epsilon_suffix + + +##################### +# Section 3: Projection Through Filter +##################### + +### +# Test 3.1: Simple get_field through Filter - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)FilterExec: id@0 > 2 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 150 +4 300 +5 250 + +### +# Test 3.2: s['value'] + 1 through Filter - get_field extracted and pushed +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)ProjectionExec: expr=[id@1 as id, __extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 151 +4 301 +5 251 + +### +# Test 3.3: Filter on get_field expression +### + +query TT +EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) +02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +physical_plan +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] +02)--FilterExec: get_field(s@1, value) > 150 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150 ORDER BY id; +---- +2 beta +4 delta +5 epsilon + + +##################### +# Section 4: Projection Through Sort (no LIMIT) +##################### + +### +# Test 4.1: Simple get_field through Sort - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 4.2: s['value'] + 1 through Sort - split projection +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 4.3: Sort by get_field expression +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; +---- +logical_plan +01)Sort: simple_struct.s[value] ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; +---- +1 100 +3 150 +2 200 +5 250 +4 300 + + +##################### +# Section 5: Projection Through TopK (ORDER BY + LIMIT) +##################### + +### +# Test 5.1: Simple get_field through TopK - pushed (trivial) +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 +2 200 +3 150 + +### +# Test 5.2: s['value'] + 1 through TopK - NOT pushed (non-trivial stays above) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; +---- +1 101 +2 201 +3 151 + +### +# Test 5.3: Multiple get_field through TopK - all pushed +### + +query TT +EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query IIT +SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 alpha +2 200 beta +3 150 gamma + +### +# Test 5.4: Nested get_field through TopK - pushed +### + +query TT +EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT 2; +---- +logical_plan +01)Sort: nested_struct.id ASC NULLS LAST, fetch=2 +02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +03)----TableScan: nested_struct projection=[id, nested] +physical_plan +01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT 2; +---- +1 10 +2 20 + +### +# Test 5.5: String concat through TopK - NOT pushed (non-trivial) +### + +query TT +EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet, predicate=DynamicFilter [ empty ] + Review Comment: Test header says string concat is "NOT pushed" through TopK, but the expected physical plan shows the concat expression pushed into `DataSourceExec`. Please update the comment to match the asserted plan (or adjust the plan if this should remain above SortExec). ########## datafusion/physical-plan/src/filter.rs: ########## @@ -562,8 +562,10 @@ impl ExecutionPlan for FilterExec { &self, projection: &ProjectionExec, ) -> Result<Option<Arc<dyn ExecutionPlan>>> { - // If the projection does not narrow the schema, we should not try to push it down: - if projection.expr().len() < projection.input().schema().fields().len() { + // Push projection through filter if: + // - It narrows the schema (drops columns), OR + // - It's trivial (columns, literals, or cheap expressions like get_field) Review Comment: The comment says projections with "literals" can be pushed through FilterExec, but `is_trivial_or_narrows_schema` only treats Column/TrivialExpr as trivial (and literals are excluded unless the projection narrows the schema). Please update this comment to match actual behavior (or adjust the helper if literals are intended to be treated as trivial here). ```suggestion // - It's otherwise considered trivial by `is_trivial_or_narrows_schema` ``` ########## datafusion/physical-plan/src/sorts/sort_preserving_merge.rs: ########## @@ -391,8 +393,10 @@ impl ExecutionPlan for SortPreservingMergeExec { &self, projection: &ProjectionExec, ) -> Result<Option<Arc<dyn ExecutionPlan>>> { - // If the projection does not narrow the schema, we should not try to push it down. - if projection.expr().len() >= projection.input().schema().fields().len() { + // When SortPreservingMergeExec has a fetch, it acts as a filter reducing rows. + // Only push projections that are trivial or narrow the schema to avoid + // evaluating expressions (like literals) on all input rows. + if self.fetch.is_some() && !is_trivial_or_narrows_schema(projection) { Review Comment: `SortPreservingMergeExec::try_swapping_with_projection` used to block swapping when the projection didn't narrow the schema (to avoid increasing the amount of data the merge has to carry). With this change, when `fetch` is `None` the method will now allow swapping *any* projection (including non-trivial computed columns) below the merge. That can increase memory/CPU for the merge step. Consider applying `is_trivial_or_narrows_schema(projection)` regardless of `fetch` (similar to `SortExec`), or otherwise restricting pushdown to cases proven beneficial. ```suggestion // Only push projections that are trivial or narrow the schema to avoid // increasing the amount of data or evaluating expressions (like literals) // on all input rows. if !is_trivial_or_narrows_schema(projection) { ``` ########## datafusion/sqllogictest/test_files/projection_pushdown.slt: ########## @@ -0,0 +1,1002 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +# Tests for projection pushdown behavior with get_field expressions +# +# This file tests the ExtractTrivialProjections optimizer rule and +# physical projection pushdown for: +# - get_field expressions (struct field access like s['foo']) +# - Pushdown through Filter, Sort, and TopK operators +# - Multi-partition scenarios with SortPreservingMergeExec +########## + +##################### +# Section 1: Setup - Single Partition Tests +##################### + +# Set target_partitions = 1 for deterministic plan output +statement ok +SET datafusion.execution.target_partitions = 1; + +# Create parquet file with struct column containing value and label fields +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, {value: 200, label: 'beta'}), + (3, {value: 150, label: 'gamma'}), + (4, {value: 300, label: 'delta'}), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/simple.parquet' +STORED AS PARQUET; + +# Create table for simple struct tests +statement ok +CREATE EXTERNAL TABLE simple_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/simple.parquet'; + +# Create parquet file with nested struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as nested + FROM VALUES + (1, {outer: {inner: 10, name: 'one'}, extra: 'x'}), + (2, {outer: {inner: 20, name: 'two'}, extra: 'y'}), + (3, {outer: {inner: 30, name: 'three'}, extra: 'z'}) +) TO 'test_files/scratch/projection_pushdown/nested.parquet' +STORED AS PARQUET; + +# Create table for nested struct tests +statement ok +CREATE EXTERNAL TABLE nested_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nested.parquet'; + +# Create parquet file with nullable struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, NULL), + (3, {value: 150, label: 'gamma'}), + (4, NULL), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/nullable.parquet' +STORED AS PARQUET; + +# Create table for nullable struct tests +statement ok +CREATE EXTERNAL TABLE nullable_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nullable.parquet'; + + +##################### +# Section 2: Basic get_field Pushdown (Projection above scan) +##################### + +### +# Test 2.1: Simple s['value'] - pushed into DataSourceExec +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 2.2: Multiple get_field expressions - all pushed +### + +query TT +EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet + +# Verify correctness +query IIT +SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id; +---- +1 100 alpha +2 200 beta +3 150 gamma +4 300 delta +5 250 epsilon + +### +# Test 2.3: Nested s['outer']['inner'] - pushed +### + +query TT +EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; +---- +logical_plan +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +02)--TableScan: nested_struct projection=[id, nested] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet + +# Verify correctness +query II +SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id; +---- +1 10 +2 20 +3 30 + +### +# Test 2.4: s['value'] + 1 - entire expression pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 2.5: s['label'] || '_suffix' - pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id; +---- +1 alpha_suffix +2 beta_suffix +3 gamma_suffix +4 delta_suffix +5 epsilon_suffix + + +##################### +# Section 3: Projection Through Filter +##################### + +### +# Test 3.1: Simple get_field through Filter - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)FilterExec: id@0 > 2 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 150 +4 300 +5 250 + +### +# Test 3.2: s['value'] + 1 through Filter - get_field extracted and pushed +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)ProjectionExec: expr=[id@1 as id, __extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 151 +4 301 +5 251 + +### +# Test 3.3: Filter on get_field expression +### + +query TT +EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) +02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +physical_plan +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] +02)--FilterExec: get_field(s@1, value) > 150 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150 ORDER BY id; +---- +2 beta +4 delta +5 epsilon + + +##################### +# Section 4: Projection Through Sort (no LIMIT) +##################### + +### +# Test 4.1: Simple get_field through Sort - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 4.2: s['value'] + 1 through Sort - split projection +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 4.3: Sort by get_field expression +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; +---- +logical_plan +01)Sort: simple_struct.s[value] ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; +---- +1 100 +3 150 +2 200 +5 250 +4 300 + + +##################### +# Section 5: Projection Through TopK (ORDER BY + LIMIT) +##################### + +### +# Test 5.1: Simple get_field through TopK - pushed (trivial) +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 +2 200 +3 150 + +### +# Test 5.2: s['value'] + 1 through TopK - NOT pushed (non-trivial stays above) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet, predicate=DynamicFilter [ empty ] + Review Comment: Test header says this non-trivial projection is "NOT pushed" through TopK, but the expected physical plan shows the full expression pushed into `DataSourceExec` (no separate `ProjectionExec` above SortExec). Please align the comment with the expected plan (or update the plan if pushdown is unintended). ########## datafusion/sqllogictest/test_files/projection_pushdown.slt: ########## @@ -0,0 +1,1002 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +# Tests for projection pushdown behavior with get_field expressions +# +# This file tests the ExtractTrivialProjections optimizer rule and +# physical projection pushdown for: +# - get_field expressions (struct field access like s['foo']) +# - Pushdown through Filter, Sort, and TopK operators +# - Multi-partition scenarios with SortPreservingMergeExec +########## + +##################### +# Section 1: Setup - Single Partition Tests +##################### + +# Set target_partitions = 1 for deterministic plan output +statement ok +SET datafusion.execution.target_partitions = 1; + +# Create parquet file with struct column containing value and label fields +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, {value: 200, label: 'beta'}), + (3, {value: 150, label: 'gamma'}), + (4, {value: 300, label: 'delta'}), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/simple.parquet' +STORED AS PARQUET; + +# Create table for simple struct tests +statement ok +CREATE EXTERNAL TABLE simple_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/simple.parquet'; + +# Create parquet file with nested struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as nested + FROM VALUES + (1, {outer: {inner: 10, name: 'one'}, extra: 'x'}), + (2, {outer: {inner: 20, name: 'two'}, extra: 'y'}), + (3, {outer: {inner: 30, name: 'three'}, extra: 'z'}) +) TO 'test_files/scratch/projection_pushdown/nested.parquet' +STORED AS PARQUET; + +# Create table for nested struct tests +statement ok +CREATE EXTERNAL TABLE nested_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nested.parquet'; + +# Create parquet file with nullable struct column +statement ok +COPY ( + SELECT + column1 as id, + column2 as s + FROM VALUES + (1, {value: 100, label: 'alpha'}), + (2, NULL), + (3, {value: 150, label: 'gamma'}), + (4, NULL), + (5, {value: 250, label: 'epsilon'}) +) TO 'test_files/scratch/projection_pushdown/nullable.parquet' +STORED AS PARQUET; + +# Create table for nullable struct tests +statement ok +CREATE EXTERNAL TABLE nullable_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/nullable.parquet'; + + +##################### +# Section 2: Basic get_field Pushdown (Projection above scan) +##################### + +### +# Test 2.1: Simple s['value'] - pushed into DataSourceExec +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 2.2: Multiple get_field expressions - all pushed +### + +query TT +EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet + +# Verify correctness +query IIT +SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id; +---- +1 100 alpha +2 200 beta +3 150 gamma +4 300 delta +5 250 epsilon + +### +# Test 2.3: Nested s['outer']['inner'] - pushed +### + +query TT +EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; +---- +logical_plan +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +02)--TableScan: nested_struct projection=[id, nested] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet + +# Verify correctness +query II +SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id; +---- +1 10 +2 20 +3 30 + +### +# Test 2.4: s['value'] + 1 - entire expression pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 2.5: s['label'] || '_suffix' - pushed (directly above scan) +### + +query TT +EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +02)--TableScan: simple_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id; +---- +1 alpha_suffix +2 beta_suffix +3 gamma_suffix +4 delta_suffix +5 epsilon_suffix + + +##################### +# Section 3: Projection Through Filter +##################### + +### +# Test 3.1: Simple get_field through Filter - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)FilterExec: id@0 > 2 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 150 +4 300 +5 250 + +### +# Test 3.2: s['value'] + 1 through Filter - get_field extracted and pushed +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Filter: simple_struct.id > Int64(2) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +physical_plan +01)ProjectionExec: expr=[id@1 as id, __extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2 ORDER BY id; +---- +3 151 +4 301 +5 251 + +### +# Test 3.3: Filter on get_field expression +### + +query TT +EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; +---- +logical_plan +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) +02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +physical_plan +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] +02)--FilterExec: get_field(s@1, value) > 150 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150 ORDER BY id; +---- +2 beta +4 delta +5 epsilon + + +##################### +# Section 4: Projection Through Sort (no LIMIT) +##################### + +### +# Test 4.1: Simple get_field through Sort - pushed +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 4.2: s['value'] + 1 through Sort - split projection +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; +---- +1 101 +2 201 +3 151 +4 301 +5 251 + +### +# Test 4.3: Sort by get_field expression +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; +---- +logical_plan +01)Sort: simple_struct.s[value] ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; +---- +1 100 +3 150 +2 200 +5 250 +4 300 + + +##################### +# Section 5: Projection Through TopK (ORDER BY + LIMIT) +##################### + +### +# Test 5.1: Simple get_field through TopK - pushed (trivial) +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 +2 200 +3 150 + +### +# Test 5.2: s['value'] + 1 through TopK - NOT pushed (non-trivial stays above) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; +---- +1 101 +2 201 +3 151 + +### +# Test 5.3: Multiple get_field through TopK - all pushed +### + +query TT +EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query IIT +SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 alpha +2 200 beta +3 150 gamma + +### +# Test 5.4: Nested get_field through TopK - pushed +### + +query TT +EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT 2; +---- +logical_plan +01)Sort: nested_struct.id ASC NULLS LAST, fetch=2 +02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +03)----TableScan: nested_struct projection=[id, nested] +physical_plan +01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT 2; +---- +1 10 +2 20 + +### +# Test 5.5: String concat through TopK - NOT pushed (non-trivial) +### + +query TT +EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +03)----TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query IT +SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT 3; +---- +1 alpha_suffix +2 beta_suffix +3 gamma_suffix + + +##################### +# Section 6: Combined Operators +##################### + +### +# Test 6.1: Filter + Sort + get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value']; +---- +logical_plan +01)Sort: simple_struct.s[value] ASC NULLS LAST +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----Filter: simple_struct.id > Int64(1) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +physical_plan +01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--FilterExec: id@0 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value']; +---- +3 150 +2 200 +5 250 +4 300 + +### +# Test 6.2: Filter + TopK + get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value'] LIMIT 2; +---- +logical_plan +01)Sort: simple_struct.s[value] ASC NULLS LAST, fetch=2 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +03)----Filter: simple_struct.id > Int64(1) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +physical_plan +01)SortExec: TopK(fetch=2), expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--FilterExec: id@0 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value'] LIMIT 2; +---- +3 150 +2 200 + +### +# Test 6.3: Filter + TopK + get_field with arithmetic +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 2; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +03)----Filter: simple_struct.id > Int64(1) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +physical_plan +01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--ProjectionExec: expr=[id@1 as id, __extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 2; +---- +2 201 +3 151 + + +##################### +# Section 7: Multi-Partition Tests +##################### + +# Set target_partitions = 4 for parallel execution +statement ok +SET datafusion.execution.target_partitions = 4; + +# Create 5 parquet files (more than partitions) for parallel tests +statement ok +COPY (SELECT 1 as id, {value: 100, label: 'alpha'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part1.parquet' +STORED AS PARQUET; + +statement ok +COPY (SELECT 2 as id, {value: 200, label: 'beta'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part2.parquet' +STORED AS PARQUET; + +statement ok +COPY (SELECT 3 as id, {value: 150, label: 'gamma'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part3.parquet' +STORED AS PARQUET; + +statement ok +COPY (SELECT 4 as id, {value: 300, label: 'delta'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part4.parquet' +STORED AS PARQUET; + +statement ok +COPY (SELECT 5 as id, {value: 250, label: 'epsilon'} as s) +TO 'test_files/scratch/projection_pushdown/multi/part5.parquet' +STORED AS PARQUET; + +# Create table from multiple parquet files +statement ok +CREATE EXTERNAL TABLE multi_struct STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/multi/'; + +### +# Test 7.1: Multi-partition Sort with get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id; +---- +logical_plan +01)Sort: multi_struct.id ASC NULLS LAST +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +03)----TableScan: multi_struct projection=[id, s] +physical_plan +01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] +02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) as multi_struct.s[value]], file_type=parquet + +# Verify correctness +query II +SELECT id, s['value'] FROM multi_struct ORDER BY id; +---- +1 100 +2 200 +3 150 +4 300 +5 250 + +### +# Test 7.2: Multi-partition TopK with get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +03)----TableScan: multi_struct projection=[id, s] +physical_plan +01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 +02)--SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) as multi_struct.s[value]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3; +---- +1 100 +2 200 +3 150 + +### +# Test 7.3: Multi-partition TopK with arithmetic (non-trivial stays above merge) +### + +query TT +EXPLAIN SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) +03)----TableScan: multi_struct projection=[id, s] +physical_plan +01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 +02)--SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) + 1 as multi_struct.s[value] + Int64(1)], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query II +SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3; +---- +1 101 +2 201 +3 151 + +### +# Test 7.4: Multi-partition Filter with get_field +### + +query TT +EXPLAIN SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id; +---- +logical_plan +01)Sort: multi_struct.id ASC NULLS LAST +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +03)----Filter: multi_struct.id > Int64(2) +04)------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] +physical_plan +01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] +02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----FilterExec: id@0 > 2 +04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 +05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) as multi_struct.s[value]], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] + +# Verify correctness +query II +SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id; +---- +3 150 +4 300 +5 250 + +### +# Test 7.5: Aggregation with get_field (CoalescePartitions) +### + +query TT +EXPLAIN SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label']; +---- +logical_plan +01)Aggregate: groupBy=[[get_field(multi_struct.s, Utf8("label"))]], aggr=[[sum(get_field(multi_struct.s, Utf8("value")))]] +02)--TableScan: multi_struct projection=[s] +physical_plan +01)AggregateExec: mode=FinalPartitioned, gby=[multi_struct.s[label]@0 as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] +02)--RepartitionExec: partitioning=Hash([multi_struct.s[label]@0], 4), input_partitions=3 +03)----AggregateExec: mode=Partial, gby=[get_field(s@0, label) as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] +04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[s], file_type=parquet + +# Verify correctness +query TI +SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label'] ORDER BY s['label']; +---- +alpha 100 +beta 200 +delta 300 +epsilon 250 +gamma 150 + + +##################### +# Section 8: Edge Cases +##################### + +# Reset to single partition for edge case tests +statement ok +SET datafusion.execution.target_partitions = 1; + +### +# Test 8.1: get_field on nullable struct column +### + +query TT +EXPLAIN SELECT id, s['value'] FROM nullable_struct; +---- +logical_plan +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) +02)--TableScan: nullable_struct projection=[id, s] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, get_field(s@1, value) as nullable_struct.s[value]], file_type=parquet + +# Verify correctness (NULL struct returns NULL field) +query II +SELECT id, s['value'] FROM nullable_struct ORDER BY id; +---- +1 100 +2 NULL +3 150 +4 NULL +5 250 + +### +# Test 8.2: get_field returning NULL values +### + +query TT +EXPLAIN SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL; +---- +logical_plan +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) +02)--Filter: get_field(nullable_struct.s, Utf8("value")) IS NOT NULL +03)----TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] +physical_plan +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as nullable_struct.s[label]] +02)--FilterExec: get_field(s@1, value) IS NOT NULL +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, s], file_type=parquet + +# Verify correctness +query IT +SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL ORDER BY id; +---- +1 alpha +3 gamma +5 epsilon + +### +# Test 8.3: Mixed trivial and non-trivial in same projection +### + +query TT +EXPLAIN SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, __common_expr_1 AS simple_struct.s[value], __common_expr_1 AS simple_struct.s[value] + Int64(10), get_field(simple_struct.s, Utf8("label")) +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __common_expr_1, simple_struct.id, simple_struct.s +04)------TableScan: simple_struct projection=[id, s] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, value) + 10 as simple_struct.s[value] + Int64(10), get_field(s@1, label) as simple_struct.s[label]], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness +query IIIT +SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct ORDER BY id LIMIT 3; +---- +1 100 110 alpha +2 200 210 beta +3 150 160 gamma + +### +# Test 8.4: Literal projection through TopK (NOT pushed through TopK, stays above) +### + +query TT +EXPLAIN SELECT id, 42 as constant FROM simple_struct ORDER BY id LIMIT 3; +---- +logical_plan +01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 +02)--Projection: simple_struct.id, Int64(42) AS constant +03)----TableScan: simple_struct projection=[id] +physical_plan +01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, 42 as constant], file_type=parquet, predicate=DynamicFilter [ empty ] + Review Comment: Test header says literal projection through TopK is "NOT pushed" and stays above, but the expected physical plan shows `42 as constant` pushed into `DataSourceExec`. Please align the comment with the expected plan. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
