This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 1e7c3a123f Clarify the generality of the embedded parquet index (#16692)
1e7c3a123f is described below

commit 1e7c3a123fee755f2a329ff261104cf50245bd4b
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Jul 7 07:28:44 2025 -0400

    Clarify the generality of the embedded parquet index (#16692)
---
 .../examples/parquet_embedded_index.rs             | 70 ++++++++++++----------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/datafusion-examples/examples/parquet_embedded_index.rs b/datafusion-examples/examples/parquet_embedded_index.rs
index b1e16e899d..54a5f213a2 100644
--- a/datafusion-examples/examples/parquet_embedded_index.rs
+++ b/datafusion-examples/examples/parquet_embedded_index.rs
@@ -31,37 +31,41 @@
 //! metadata because the footer must be read and parsed by all readers,
 //! even those that do not use the index.
 //!
+//! This example uses a file level index for skipping entire files, but any
+//! index can be stored using the same techniques and used to skip row groups,
+//! data pages, or rows using the APIs on [`TableProvider`] and [`ParquetSource`].
+//!
 //! The resulting Parquet file layout is as follows:
 //!
 //! ```text
-//!                   ┌──────────────────────┐                           
-//!                   │┌───────────────────┐ │                           
-//!                   ││     DataPage      │ │                           
-//!                   │└───────────────────┘ │                           
-//!  Standard Parquet │┌───────────────────┐ │                           
-//!  Data Pages       ││     DataPage      │ │                           
-//!                   │└───────────────────┘ │                           
-//!                   │        ...           │                           
-//!                   │┌───────────────────┐ │                           
-//!                   ││     DataPage      │ │                           
-//!                   │└───────────────────┘ │                           
-//!                   │┏━━━━━━━━━━━━━━━━━━━┓ │                           
-//! Non standard      │┃                   ┃ │                           
-//! index (ignored by │┃Custom Binary Index┃ │                           
-//! other Parquet     │┃ (Distinct Values) ┃◀│─ ─ ─                      
-//! readers)          │┃                   ┃ │     │                     
-//!                   │┗━━━━━━━━━━━━━━━━━━━┛ │                           
+//!                   ┌──────────────────────┐
+//!                   │┌───────────────────┐ │
+//!                   ││     DataPage      │ │
+//!                   │└───────────────────┘ │
+//!  Standard Parquet │┌───────────────────┐ │
+//!  Data Pages       ││     DataPage      │ │
+//!                   │└───────────────────┘ │
+//!                   │        ...           │
+//!                   │┌───────────────────┐ │
+//!                   ││     DataPage      │ │
+//!                   │└───────────────────┘ │
+//!                   │┏━━━━━━━━━━━━━━━━━━━┓ │
+//! Non standard      │┃                   ┃ │
+//! index (ignored by │┃Custom Binary Index┃ │
+//! other Parquet     │┃ (Distinct Values) ┃◀│─ ─ ─
+//! readers)          │┃                   ┃ │     │
+//!                   │┗━━━━━━━━━━━━━━━━━━━┛ │
 //! Standard Parquet  │┏━━━━━━━━━━━━━━━━━━━┓ │     │  key/value metadata
-//! Page Index        │┃    Page Index     ┃ │        contains location  
-//!                   │┗━━━━━━━━━━━━━━━━━━━┛ │     │  of special index   
-//!                   │╔═══════════════════╗ │                           
-//!                   │║ Parquet Footer w/ ║ │     │                     
-//!                   │║     Metadata      ║ ┼ ─ ─                       
-//!                   │║ (Thrift Encoded)  ║ │                           
-//!                   │╚═══════════════════╝ │                           
-//!                   └──────────────────────┘                           
-//!                                                                      
-//!                         Parquet File                                 
+//! Page Index        │┃    Page Index     ┃ │        contains location
+//!                   │┗━━━━━━━━━━━━━━━━━━━┛ │     │  of special index
+//!                   │╔═══════════════════╗ │
+//!                   │║ Parquet Footer w/ ║ │     │
+//!                   │║     Metadata      ║ ┼ ─ ─
+//!                   │║ (Thrift Encoded)  ║ │
+//!                   │╚═══════════════════╝ │
+//!                   └──────────────────────┘
+//!
+//!                         Parquet File
 //!
 //! # High Level Flow
 //!
@@ -420,17 +424,19 @@ impl TableProvider for DistinctIndexTable {
 
         println!("Scanning only files: {files_to_scan:?}");
 
-        // Build ParquetSource to sctually read the files
+        // Build ParquetSource to actually read the files
         let url = ObjectStoreUrl::parse("file://")?;
         let source = Arc::new(ParquetSource::default().with_enable_page_index(true));
         let mut builder = FileScanConfigBuilder::new(url, self.schema.clone(), source);
         for file in files_to_scan {
             let path = self.dir.join(file);
             let len = std::fs::metadata(&path)?.len();
-            builder = builder.with_file(PartitionedFile::new(
-                path.to_str().unwrap().to_string(),
-                len,
-            ));
+            // If the index contained information about row groups or pages,
+            // you could also pass that information here to further prune
+            // the data read from the file.
+            let partitioned_file =
+                PartitionedFile::new(path.to_str().unwrap().to_string(), len);
+            builder = builder.with_file(partitioned_file);
         }
         Ok(DataSourceExec::from_data_source(builder.build()))
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to