Re: [PR] Row group limit pruning [datafusion]

via GitHub Tue, 06 Jan 2026 11:35:12 -0800


alamb commented on code in PR #18868:
URL: https://github.com/apache/datafusion/pull/18868#discussion_r2666002608



##########
datafusion/datasource-parquet/src/metrics.rs:
##########
@@ -93,6 +95,11 @@ impl ParquetFileMetrics {
             .with_type(MetricType::SUMMARY)
             .pruning_metrics("row_groups_pruned_bloom_filter", partition);
 
+        let limit_pruned_row_groups = MetricBuilder::new(metrics)

Review Comment:
   Can you please also document this metric in the docs here? 
https://github.com/apache/datafusion/blob/9e9eb944c06279a4abbfd1924b80192a2f4b4ef6/docs/source/user-guide/explain-usage.md?plain=1#L229-L228



##########
datafusion/datasource-parquet/src/row_group_filter.rs:
##########
@@ -70,6 +78,52 @@ impl RowGroupAccessPlanFilter {
         self.access_plan
     }
 
+    /// Returns the is_fully_matched vector
+    pub fn is_fully_matched(&self) -> &Vec<bool> {
+        &self.is_fully_matched
+    }
+
+    /// Prunes the access plan based on the limit and fully contained row 
groups.
+    /// See the 
[description](https://github.com/apache/datafusion/issues/18860#issuecomment-3563442093)

Review Comment:
   I think it would be better if you summarized how this works in the comments 
rather than having to follow a link. 



##########
datafusion/datasource-parquet/src/row_group_filter.rs:
##########
@@ -153,6 +218,68 @@ impl RowGroupAccessPlanFilter {
         }
     }
 
+    /// Identifies row groups that are fully matched by the predicate.
+    ///
+    /// This optimization checks whether all rows in a row group satisfy the 
predicate
+    /// by inverting the predicate and checking if it prunes the row group. If 
the
+    /// inverted predicate prunes a row group, it means no rows match the 
inverted
+    /// predicate, which implies all rows match the original predicate.
+    ///
+    /// Note: This optimization is relatively inexpensive for a limited number 
of row groups.
+    fn identify_fully_matched_row_groups(
+        &mut self,
+        candidate_row_group_indices: &[usize],
+        arrow_schema: &Schema,
+        parquet_schema: &SchemaDescriptor,
+        groups: &[RowGroupMetaData],
+        predicate: &PruningPredicate,
+        metrics: &ParquetFileMetrics,
+    ) {
+        if candidate_row_group_indices.is_empty() {
+            return;
+        }
+
+        // Use NotExpr to create the inverted predicate
+        let inverted_expr = 
Arc::new(NotExpr::new(Arc::clone(predicate.orig_expr())));
+
+        // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0)
+        // before building the pruning predicate
+        let simplifier = PhysicalExprSimplifier::new(arrow_schema);

Review Comment:
   my only concern is that this code is now called for every row file and it 
makes an entirely new pruning predicate (even when there is no limit and thus 
the results aren't used). Could we only run this code when there is a limit?



##########
datafusion/physical-plan/src/limit.rs:
##########
@@ -51,6 +52,9 @@ pub struct GlobalLimitExec {
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     cache: PlanProperties,
+    /// If the child plan is a sort node, after the sort node is removed during
+    /// physical optimization, we should add the required ordering to the 
limit node

Review Comment:
   I think it might also help to explain why this is needed. Something like
   ```suggestion
       /// Does the limit have to preserve the order if its input, and if so 
what is it?
       /// Some optimizations may reorder the input if no particular sort is 
required
   ```



##########
datafusion/core/tests/parquet/row_group_pruning.rs:
##########
@@ -1636,3 +1722,241 @@ async fn test_bloom_filter_decimal_dict() {
         .test_row_group_prune()
         .await;
 }
+
+// Helper function to create a batch with a single Int32 column.
+fn make_i32_batch(
+    name: &str,
+    values: Vec<i32>,
+) -> datafusion_common::error::Result<RecordBatch> {
+    let schema = Arc::new(Schema::new(vec![Field::new(name, DataType::Int32, 
false)]));
+    let array: ArrayRef = Arc::new(Int32Array::from(values));
+    RecordBatch::try_new(schema, vec![array]).map_err(DataFusionError::from)
+}
+
+// Helper function to create a batch with two Int32 columns
+fn make_two_col_i32_batch(
+    name_a: &str,
+    name_b: &str,
+    values_a: Vec<i32>,
+    values_b: Vec<i32>,
+) -> datafusion_common::error::Result<RecordBatch> {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new(name_a, DataType::Int32, false),
+        Field::new(name_b, DataType::Int32, false),
+    ]));
+    let array_a: ArrayRef = Arc::new(Int32Array::from(values_a));
+    let array_b: ArrayRef = Arc::new(Int32Array::from(values_b));
+    RecordBatch::try_new(schema, vec![array_a, 
array_b]).map_err(DataFusionError::from)
+}
+
+#[tokio::test]
+async fn test_limit_pruning_basic() -> datafusion_common::error::Result<()> {
+    // Scenario: Simple integer column, multiple row groups
+    // Query: SELECT c1 FROM  t WHERE c1 = 0 LIMIT 2
+    // We expect 2 rows in total.
+
+    // Row Group 0: c1 = [0, -2] -> Partially matched, 1 row
+    // Row Group 1: c1 = [1, 2] -> Fully matched, 2 rows
+    // Row Group 2: c1 = [3, 4] -> Fully matched, 2 rows
+    // Row Group 3: c1 = [5, 6] -> Fully matched, 2 rows
+    // Row Group 4: c1 = [-1, -2] -> Not matched
+
+    // If limit = 2, and RG1 is fully matched and has 2 rows, we should
+    // only scan RG1 and prune other row groups
+    // RG4 is pruned by statistics. RG2 and RG3 are pruned by limit.
+    // So 2 row groups are effectively pruned due to limit pruning.
+
+    let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, 
false)]));
+    let query = "SELECT c1 FROM t WHERE c1 >= 0 LIMIT 2";
+
+    let batches = vec![
+        make_i32_batch("c1", vec![0, -2])?,
+        make_i32_batch("c1", vec![0, 0])?,
+        make_i32_batch("c1", vec![0, 0])?,
+        make_i32_batch("c1", vec![0, 0])?,
+        make_i32_batch("c1", vec![-1, -2])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int) // Assuming Scenario::Int can handle 
this data
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(2)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4))
+        .with_fully_matched_by_stats(Some(3))
+        .with_pruned_by_stats(Some(1))
+        .with_limit_pruned_row_groups(Some(3))
+        .test_row_group_prune_with_custom_data(schema, batches, 2)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_complex_filter() -> 
datafusion_common::error::Result<()> {
+    // Test Case 1: Complex filter with two columns (a = 1 AND b > 1 AND b < 4)
+    // Row Group 0: a=[1,1,1], b=[0,2,3] -> Partially matched, 2 rows match 
(b=2,3)
+    // Row Group 1: a=[1,1,1], b=[2,2,2] -> Fully matched, 3 rows
+    // Row Group 2: a=[1,1,1], b=[2,3,3] -> Fully matched, 3 rows
+    // Row Group 3: a=[1,1,1], b=[2,2,3] -> Fully matched, 3 rows
+    // Row Group 4: a=[2,2,2], b=[2,2,2] -> Not matched (a != 1)
+    // Row Group 5: a=[1,1,1], b=[5,6,7] -> Not matched (b >= 4)
+
+    // With LIMIT 5, we need RG1 (3 rows) + RG2 (2 rows from 3) = 5 rows
+    // RG4 and RG5 should be pruned by statistics
+    // RG3 should be pruned by limit
+    // RG0 is partially matched, so it depends on the order
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, false),
+        Field::new("b", DataType::Int32, false),
+    ]));
+    let query = "SELECT a, b FROM t WHERE a = 1 AND b > 1 AND b < 4 LIMIT 5";
+
+    let batches = vec![
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![0, 2, 3])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 2])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 3, 3])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 3])?,
+        make_two_col_i32_batch("a", "b", vec![2, 2, 2], vec![2, 2, 2])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![5, 6, 7])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(5)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 are matched
+        .with_fully_matched_by_stats(Some(3))
+        .with_pruned_by_stats(Some(2)) // RG4,5 are pruned
+        .with_limit_pruned_row_groups(Some(2)) // RG0, RG3 is pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 3)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_multiple_fully_matched()
+-> datafusion_common::error::Result<()> {
+    // Test Case 2: Limit requires multiple fully matched row groups
+    // Row Group 0: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 1: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 2: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 3: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 4: a=[1,2,3,4] -> Not matched
+
+    // With LIMIT 8, we need RG0 (4 rows) + RG1 (4 rows)  8 rows
+    // RG2,3 should be pruned by limit
+    // RG4 should be pruned by statistics
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, 
false)]));
+    let query = "SELECT a FROM t WHERE a = 5 LIMIT 8";
+
+    let batches = vec![
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![1, 2, 3, 4])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(8)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(4))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(2)) // RG2,3 pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 4)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_no_fully_matched() -> 
datafusion_common::error::Result<()> {
+    // Test Case 3: No fully matched row groups - all are partially matched
+    // Row Group 0: a=[1,2,3] -> Partially matched, 1 row (a=2)
+    // Row Group 1: a=[2,3,4] -> Partially matched, 1 row (a=2)
+    // Row Group 2: a=[2,5,6] -> Partially matched, 1 row (a=2)
+    // Row Group 3: a=[2,7,8] -> Partially matched, 1 row (a=2)
+    // Row Group 4: a=[9,10,11] -> Not matched
+
+    // With LIMIT 3, we need to scan RG0,1,2 to get 3 matching rows
+    // Cannot prune much by limit since all matching RGs are partial
+    // RG4 should be pruned by statistics
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, 
false)]));
+    let query = "SELECT a FROM t WHERE a = 2 LIMIT 3";
+
+    let batches = vec![
+        make_i32_batch("a", vec![1, 2, 3])?,
+        make_i32_batch("a", vec![2, 3, 4])?,
+        make_i32_batch("a", vec![2, 5, 6])?,
+        make_i32_batch("a", vec![2, 7, 8])?,
+        make_i32_batch("a", vec![9, 10, 11])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(3)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(0))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(0)) // RG3 pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 3)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_exceeds_fully_matched() -> 
datafusion_common::error::Result<()>
+{
+    // Test Case 4: Limit exceeds all fully matched rows, need partially 
matched
+    // Row Group 0: a=[10,11,12,12] -> Partially matched, 1 row (a=10)
+    // Row Group 1: a=[10,10,10,10] -> Fully matched, 4 rows
+    // Row Group 2: a=[10,10,10,10] -> Fully matched, 4 rows
+    // Row Group 3: a=[10,13,14,11] -> Partially matched, 1 row (a=10)
+    // Row Group 4: a=[20,21,22,22] -> Not matched
+
+    // With LIMIT 10, we need RG1 (4) + RG2 (4) = 8 from fully matched
+    // Still need 2 more, so we need to scan partially matched RG0 and RG3
+    // All matching row groups should be scanned, only RG4 pruned by statistics
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, 
false)]));
+    let query = "SELECT a FROM t WHERE a = 10 LIMIT 10";
+
+    let batches = vec![
+        make_i32_batch("a", vec![10, 11, 12, 12])?,
+        make_i32_batch("a", vec![10, 10, 10, 10])?,
+        make_i32_batch("a", vec![10, 10, 10, 10])?,
+        make_i32_batch("a", vec![10, 13, 14, 11])?,
+        make_i32_batch("a", vec![20, 21, 22, 22])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(10) // Total: 1 + 3 + 4 + 1 = 9 (less than limit)

Review Comment:
   I don't understand this comment -- the limit is 10 so we would expect that 
we have 10 rows, right? What is the `9` here? 



##########
datafusion/sqllogictest/test_files/limit_pruning.slt:
##########
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+
+statement ok
+CREATE TABLE t AS VALUES
+  ('Anow Vole', 7),
+  ('Brown Bear', 133),
+  ('Gray Wolf', 82),
+  ('Lynx', 71),
+  ('Red Fox', 40),
+  ('Alpine Bat', 6),
+  ('Nlpine Ibex', 101),
+  ('Nlpine Goat', 76),
+  ('Nlpine Sheep', 83),
+  ('Europ. Mole', 4),
+  ('Polecat', 16),
+  ('Alpine Ibex', 97);

Review Comment:
   When reviewing the test results, I needed to break it up like this so the 
expected data was easier to verify. It might help other readers to do the same
   
   ```suggestion
   CREATE TABLE t AS VALUES
   -- ***** Row Group 0 *****
     ('Anow Vole', 7),
     ('Brown Bear', 133),
     ('Gray Wolf', 82),
   -- ***** Row Group 1 *****
     ('Lynx', 71),
     ('Red Fox', 40),
     ('Alpine Bat', 6),
   -- ***** Row Group 2 *****
     ('Nlpine Ibex', 101),
     ('Nlpine Goat', 76),
     ('Nlpine Sheep', 83),
   -- ***** Row Group 3 *****
     ('Europ. Mole', 4),
     ('Polecat', 16),
     ('Alpine Ibex', 97);
   ```



##########
datafusion/core/tests/parquet/mod.rs:
##########
@@ -231,20 +269,41 @@ impl TestOutput {
 /// and the appropriate scenario
 impl ContextWithParquet {
     async fn new(scenario: Scenario, unit: Unit) -> Self {
-        Self::with_config(scenario, unit, SessionConfig::new()).await
+        Self::with_config(scenario, unit, SessionConfig::new(), None, 
None).await
+    }
+
+    /// Set custom schema and batches for the test
+    pub async fn with_custom_data(
+        scenario: Scenario,
+        unit: Unit,
+        schema: Arc<Schema>,
+        batches: Vec<RecordBatch>,
+    ) -> Self {
+        Self::with_config(
+            scenario,
+            unit,
+            SessionConfig::new(),
+            Some(schema),
+            Some(batches),
+        )
+        .await
     }
 
     async fn with_config(
         scenario: Scenario,
         unit: Unit,
         mut config: SessionConfig,
+        custom_schema: Option<Arc<Schema>>,

Review Comment:
   nit: typically this is `SchemaRef` rather than `ArcSchema` I think



##########
datafusion/datasource-parquet/src/row_group_filter.rs:
##########
@@ -46,13 +48,19 @@ use parquet::{
 pub struct RowGroupAccessPlanFilter {
     /// which row groups should be accessed
     access_plan: ParquetAccessPlan,
+    /// Row groups where ALL rows are known to match the pruning predicate

Review Comment:
   ```suggestion
       /// Row groups where ALL rows are known to match the pruning predicate 
       /// (the predicate does not filter any rows)
   ```



##########
datafusion/datasource-parquet/src/row_group_filter.rs:
##########
@@ -70,6 +78,52 @@ impl RowGroupAccessPlanFilter {
         self.access_plan
     }
 
+    /// Returns the is_fully_matched vector
+    pub fn is_fully_matched(&self) -> &Vec<bool> {
+        &self.is_fully_matched
+    }
+
+    /// Prunes the access plan based on the limit and fully contained row 
groups.
+    /// See the 
[description](https://github.com/apache/datafusion/issues/18860#issuecomment-3563442093)
+    /// for how the pruning works and improves performance.
+    /// For more information, see the 
[paper](https://arxiv.org/pdf/2504.11540)'s "Pruning for LIMIT Queries" part
+    pub fn prune_by_limit(
+        &mut self,
+        limit: usize,
+        rg_metadata: &[RowGroupMetaData],
+        metrics: &ParquetFileMetrics,
+    ) {
+        let mut fully_matched_row_group_indexes: Vec<usize> = Vec::new();
+        let mut fully_matched_rows_count: usize = 0;
+
+        // Iterate through the currently accessible row groups
+        for &idx in self.access_plan.row_group_indexes().iter() {
+            if self.is_fully_matched[idx] {
+                let row_group_row_count = rg_metadata[idx].num_rows() as usize;
+                fully_matched_row_group_indexes.push(idx);
+                fully_matched_rows_count += row_group_row_count;
+                if fully_matched_rows_count >= limit {
+                    break;
+                }
+            }
+        }
+
+        if fully_matched_rows_count >= limit {

Review Comment:
   ```suggestion
           // If we can satisfy the limit with fully matching row groups, 
           // rewrite the plan to do so
           if fully_matched_rows_count >= limit {
   ```



##########
datafusion/sqllogictest/test_files/limit_pruning.slt:
##########
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+
+statement ok

Review Comment:
   Shall we credit https://arxiv.org/pdf/2504.11540 ? I love these names



##########
datafusion/datasource-parquet/src/row_group_filter.rs:
##########
@@ -70,6 +78,52 @@ impl RowGroupAccessPlanFilter {
         self.access_plan
     }
 
+    /// Returns the is_fully_matched vector
+    pub fn is_fully_matched(&self) -> &Vec<bool> {
+        &self.is_fully_matched
+    }
+
+    /// Prunes the access plan based on the limit and fully contained row 
groups.
+    /// See the 
[description](https://github.com/apache/datafusion/issues/18860#issuecomment-3563442093)
+    /// for how the pruning works and improves performance.
+    /// For more information, see the 
[paper](https://arxiv.org/pdf/2504.11540)'s "Pruning for LIMIT Queries" part
+    pub fn prune_by_limit(
+        &mut self,
+        limit: usize,
+        rg_metadata: &[RowGroupMetaData],
+        metrics: &ParquetFileMetrics,
+    ) {
+        let mut fully_matched_row_group_indexes: Vec<usize> = Vec::new();
+        let mut fully_matched_rows_count: usize = 0;
+
+        // Iterate through the currently accessible row groups

Review Comment:
   ```suggestion
           // Iterate through the currently accessible row groups, and try and
           // find a set of matching row groups that can satisfy the limit
   ```



##########
datafusion/sqllogictest/test_files/limit_pruning.slt:
##########
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+
+statement ok
+CREATE TABLE t AS VALUES
+  ('Anow Vole', 7),
+  ('Brown Bear', 133),
+  ('Gray Wolf', 82),
+  ('Lynx', 71),
+  ('Red Fox', 40),
+  ('Alpine Bat', 6),
+  ('Nlpine Ibex', 101),
+  ('Nlpine Goat', 76),
+  ('Nlpine Sheep', 83),
+  ('Europ. Mole', 4),
+  ('Polecat', 16),
+  ('Alpine Ibex', 97);
+
+statement ok
+COPY (SELECT column1 as a, column2 as b FROM t)
+TO 'test_files/scratch/limit_pruning/data.parquet'
+STORED AS PARQUET
+OPTIONS (
+  'format.max_row_group_size' '3'
+);
+
+statement ok
+drop table t;
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/limit_pruning/data.parquet';
+
+
+statement ok
+set datafusion.explain.analyze_level = summary;
+
+# row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched

Review Comment:
   this is because Row Group 2 is entirely matched (`Nlpine*`) right?



##########
test_files/scratch/limit_pruning/data.parquet:
##########


Review Comment:
   I don't think this file (test_files/scratch/limit_pruning/data.parquet) was 
meant to be checked in
   
   <img width="639" height="115" alt="Image" 
src="https://github.com/user-attachments/assets/060f5bdc-ef6b-4748-91de-bae0f95090e4";
 />



##########
datafusion/sqllogictest/test_files/limit_pruning.slt:
##########
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+
+statement ok
+CREATE TABLE t AS VALUES
+  ('Anow Vole', 7),
+  ('Brown Bear', 133),
+  ('Gray Wolf', 82),
+  ('Lynx', 71),
+  ('Red Fox', 40),
+  ('Alpine Bat', 6),
+  ('Nlpine Ibex', 101),
+  ('Nlpine Goat', 76),
+  ('Nlpine Sheep', 83),
+  ('Europ. Mole', 4),
+  ('Polecat', 16),
+  ('Alpine Ibex', 97);
+
+statement ok
+COPY (SELECT column1 as a, column2 as b FROM t)

Review Comment:
   nit is I would find it easier to read these tests if the columns are named 
'species' and `s`, following the paper



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Row group limit pruning [datafusion]

Reply via email to