alamb commented on code in PR #7821:
URL: https://github.com/apache/arrow-datafusion/pull/7821#discussion_r1366808645


##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -84,6 +84,9 @@ pub struct ParquetExec {
     /// Override for `Self::with_enable_page_index`. If None, uses
     /// values from base_config
     enable_page_index: Option<bool>,
+    /// Override for `Self::with_enable_bloom_filter`. If None, uses
+    /// values from base_config
+    enable_bloom_filter: Option<bool>,

Review Comment:
   Looks good to me. Thank you



##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -246,6 +250,18 @@ impl ParquetExec {
             .unwrap_or(config_options.execution.parquet.enable_page_index)
     }
 
+    /// If enabled, the reader will read by the bloom filter
+    pub fn with_enable_bloom_filter(mut self, enable_bloom_filter: bool) -> 
Self {
+        self.enable_bloom_filter = Some(enable_bloom_filter);
+        self
+    }
+
+    /// Return the value described in [`Self::with_enable_bloom_filter`]
+    fn enable_bloom_filter(&self, config_options: &ConfigOptions) -> bool {
+        self.enable_bloom_filter
+            .unwrap_or(config_options.execution.parquet.bloom_filter_enabled)

Review Comment:
   👍 



##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -246,6 +250,18 @@ impl ParquetExec {
             .unwrap_or(config_options.execution.parquet.enable_page_index)
     }
 
+    /// If enabled, the reader will read by the bloom filter
+    pub fn with_enable_bloom_filter(mut self, enable_bloom_filter: bool) -> 
Self {
+        self.enable_bloom_filter = Some(enable_bloom_filter);
+        self
+    }
+
+    /// Return the value described in [`Self::with_enable_bloom_filter`]
+    fn enable_bloom_filter(&self, config_options: &ConfigOptions) -> bool {
+        self.enable_bloom_filter
+            .unwrap_or(config_options.execution.parquet.bloom_filter_enabled)

Review Comment:
   👍 



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -92,6 +98,242 @@ pub(crate) fn prune_row_groups(
     filtered
 }
 
+/// Prune row groups by bloom filters
+///
+/// Returns a vector of indexes into `groups` which should be scanned.
+///
+/// If an index is NOT present in the returned Vec it means the
+/// predicate filtered all the row group.
+///
+/// If an index IS present in the returned Vec it means the predicate
+/// did not filter out that row group.
+pub(crate) async fn prune_row_groups_by_bloom_filters<
+    T: AsyncFileReader + Send + 'static,
+>(
+    builder: &mut ParquetRecordBatchStreamBuilder<T>,
+    row_groups: &[usize],
+    groups: &[RowGroupMetaData],
+    predicate: &PruningPredicate,
+    metrics: &ParquetFileMetrics,
+) -> Vec<usize> {
+    let bf_predicates = match 
BloomFilterPruningPredicate::try_new(predicate.orig_expr())

Review Comment:
   BTW I have some ideas (working on them in 
https://github.com/alamb/arrow-datafusion/pull/14) of how to support `InList` 
expressions directly by using the interval analysis library.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to