Re: [PR] parquet reader: move pruning predicate creation from ParquetSource to ParquetOpener [datafusion]

via GitHub Fri, 04 Apr 2025 19:24:12 -0700


zhuqi-lucas commented on code in PR #15561:
URL: https://github.com/apache/datafusion/pull/15561#discussion_r2029658117



##########
datafusion/datasource-parquet/src/opener.rs:
##########
@@ -109,47 +108,84 @@ impl FileOpener for ParquetOpener {
             .schema_adapter_factory
             .create(projected_schema, Arc::clone(&self.table_schema));
         let predicate = self.predicate.clone();
-        let pruning_predicate = self.pruning_predicate.clone();
-        let page_pruning_predicate = self.page_pruning_predicate.clone();
         let table_schema = Arc::clone(&self.table_schema);
         let reorder_predicates = self.reorder_filters;
         let pushdown_filters = self.pushdown_filters;
-        let enable_page_index = should_enable_page_index(
-            self.enable_page_index,
-            &self.page_pruning_predicate,
-        );
         let enable_bloom_filter = self.enable_bloom_filter;
+        let enable_row_group_stats_pruning = self.enable_row_group_stats_pruning;
         let limit = self.limit;
 
-        Ok(Box::pin(async move {
-            let options = ArrowReaderOptions::new().with_page_index(enable_page_index);
+        let predicate_creation_errors = MetricBuilder::new(&self.metrics)
+            .global_counter("num_predicate_creation_errors");
+
+        let enable_page_index = self.enable_page_index;
 
+        Ok(Box::pin(async move {
+            // Don't load the page index yet. Since it is not stored inline in
+            // the footer, loading the page index if it is not needed will do
+            // unnecessary I/O. We decide later if it is needed to evaluate the
+            // pruning predicates. Thus default to not requesting it from the
+            // underlying reader.
+            let mut options = ArrowReaderOptions::new().with_page_index(false);
             let mut metadata_timer = file_metrics.metadata_load_time.timer();
-            let metadata =
-                ArrowReaderMetadata::load_async(&mut reader, options.clone()).await?;
-            let mut schema = Arc::clone(metadata.schema());
 
-            // read with view types
-            if let Some(merged) = apply_file_schema_type_coercions(&table_schema, &schema)
+            // Begin by loading the metadata from the underlying reader (note
+            // the returned metadata may actually include page indexes as some
+            // readers may return page indexes even when not requested -- for
+            // example when they are cached)
+            let mut reader_metadata =
+                ArrowReaderMetadata::load_async(&mut async_file_reader, options.clone())
+                    .await?;
+
+            // Note about schemas: we are actually dealing with **3 different schemas** here:
+            // - The table schema as defined by the TableProvider. This is what the user sees, what they get when they `SELECT * FROM table`, etc.
+            // - The "virtual" file schema: this is the table schema minus any hive partition columns and projections. This is what the file schema is coerced to.
+            // - The physical file schema: this is the schema as defined by the parquet file. This is what the parquet file actually contains.
+            let mut physical_file_schema = Arc::clone(reader_metadata.schema());
+
+            // The schema loaded from the file may not be the same as the
+            // desired schema (for example if we want to instruct the parquet
+            // reader to read strings using Utf8View instead). Update if necessary
+            if let Some(merged) =
+                apply_file_schema_type_coercions(&table_schema, &physical_file_schema)
             {
-                schema = Arc::new(merged);
+                physical_file_schema = Arc::new(merged);
+                options = options.with_schema(Arc::clone(&physical_file_schema));
+                reader_metadata = ArrowReaderMetadata::try_new(
+                    Arc::clone(reader_metadata.metadata()),
+                    options.clone(),
+                )?;
             }
 
-            let options = ArrowReaderOptions::new()
-                .with_page_index(enable_page_index)
-                .with_schema(Arc::clone(&schema));
-            let metadata =
-                ArrowReaderMetadata::try_new(Arc::clone(metadata.metadata()), options)?;
+            // Build predicates for this specific file
+            let (pruning_predicate, page_pruning_predicate) = build_pruning_predicates(
+                &predicate,
+                &physical_file_schema,
+                &predicate_creation_errors,
+            );
 
-            metadata_timer.stop();
+            // The page index is not stored inline in the parquet footer so the
+            // code above may not have raed the page index structures yet. If we

Review Comment:
   Minor: there is a typo in the comment above: `raed` should be `read`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] parquet reader: move pruning predicate creation from ParquetSource to ParquetOpener [datafusion]

Reply via email to