Ted-Jiang commented on code in PR #3780:
URL: https://github.com/apache/arrow-datafusion/pull/3780#discussion_r992227669


##########
datafusion/core/src/physical_plan/file_format/parquet.rs:
##########
@@ -828,6 +996,97 @@ fn prune_row_groups(
     filtered
 }
 
+fn prune_pages_in_one_row_group(
+    group: &RowGroupMetaData,
+    predicate: Option<PruningPredicate>,
+    offset_indexes: Option<&Vec<Vec<PageLocation>>>,
+    page_indexes: Option<&Vec<Index>>,
+    metrics: &ParquetFileMetrics,
+) -> Result<Vec<RowSelector>> {
+    let num_rows = group.num_rows() as usize;
+    if let (Some(predicate), Some(offset_indexes), Some(page_indexes)) =
+        (&predicate, offset_indexes, page_indexes)
+    {
+        let pruning_stats = PagesPruningStatistics {
+            page_indexes,
+            offset_indexes,
+            parquet_schema: predicate.schema().as_ref(),
+            // now we assume only support one col.
+            col_id: *predicate
+                .need_input_columns_ids()
+                .iter()
+                .take(1)
+                .next()
+                .unwrap(),
+        };
+
+        match predicate.prune(&pruning_stats) {
+            Ok(values) => {
+                let mut vec = Vec::with_capacity(values.len());
+                if let Some(cols_offset_indexes) =
+                    offset_indexes.get(pruning_stats.col_id)
+                {
+                    let row_vec =
+                        create_row_count_in_each_page(cols_offset_indexes, 
num_rows);
+                    assert_eq!(row_vec.len(), values.len());
+                    let mut sum_row = *row_vec.first().unwrap();
+                    let mut selected = *values.first().unwrap();
+
+                    for (i, &f) in values.iter().skip(1).enumerate() {
+                        if f == selected {
+                            sum_row += *row_vec.get(i).unwrap();
+                        } else {
+                            let selector = if selected {
+                                RowSelector::select(sum_row)
+                            } else {
+                                RowSelector::skip(sum_row)
+                            };
+                            vec.push(selector);
+                            sum_row = *row_vec.get(i).unwrap();
+                            selected = f;
+                        }
+                    }
+
+                    let selector = if selected {
+                        RowSelector::select(sum_row)
+                    } else {
+                        RowSelector::skip(sum_row)
+                    };
+                    vec.push(selector);

Review Comment:
   The selectors must alternate between `skip` and `select`, but this is not 
stated in the docs. I think arrow-rs should provide an API for this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to