This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 1e7c3a123f Clarify the generality of the embedded parquet index (#16692)
1e7c3a123f is described below
commit 1e7c3a123fee755f2a329ff261104cf50245bd4b
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Jul 7 07:28:44 2025 -0400
Clarify the generality of the embedded parquet index (#16692)
---
.../examples/parquet_embedded_index.rs | 70 ++++++++++++----------
1 file changed, 38 insertions(+), 32 deletions(-)
diff --git a/datafusion-examples/examples/parquet_embedded_index.rs b/datafusion-examples/examples/parquet_embedded_index.rs
index b1e16e899d..54a5f213a2 100644
--- a/datafusion-examples/examples/parquet_embedded_index.rs
+++ b/datafusion-examples/examples/parquet_embedded_index.rs
@@ -31,37 +31,41 @@
//! metadata because the footer must be read and parsed by all readers,
//! even those that do not use the index.
//!
+//! This example uses a file level index for skipping entire files, but any
+//! index can be stored using the same techniques and used to skip row groups,
+//! data pages, or rows using the APIs on [`TableProvider`] and [`ParquetSource`].
+//!
//! The resulting Parquet file layout is as follows:
//!
//! ```text
-//! ┌──────────────────────┐
-//! │┌───────────────────┐ │
-//! ││ DataPage │ │
-//! │└───────────────────┘ │
-//! Standard Parquet │┌───────────────────┐ │
-//! Data Pages ││ DataPage │ │
-//! │└───────────────────┘ │
-//! │ ... │
-//! │┌───────────────────┐ │
-//! ││ DataPage │ │
-//! │└───────────────────┘ │
-//! │┏━━━━━━━━━━━━━━━━━━━┓ │
-//! Non standard │┃ ┃ │
-//! index (ignored by │┃Custom Binary Index┃ │
-//! other Parquet │┃ (Distinct Values) ┃◀│─ ─ ─
-//! readers) │┃ ┃ │ │
-//! │┗━━━━━━━━━━━━━━━━━━━┛ │
+//! ┌──────────────────────┐
+//! │┌───────────────────┐ │
+//! ││ DataPage │ │
+//! │└───────────────────┘ │
+//! Standard Parquet │┌───────────────────┐ │
+//! Data Pages ││ DataPage │ │
+//! │└───────────────────┘ │
+//! │ ... │
+//! │┌───────────────────┐ │
+//! ││ DataPage │ │
+//! │└───────────────────┘ │
+//! │┏━━━━━━━━━━━━━━━━━━━┓ │
+//! Non standard │┃ ┃ │
+//! index (ignored by │┃Custom Binary Index┃ │
+//! other Parquet │┃ (Distinct Values) ┃◀│─ ─ ─
+//! readers) │┃ ┃ │ │
+//! │┗━━━━━━━━━━━━━━━━━━━┛ │
//! Standard Parquet │┏━━━━━━━━━━━━━━━━━━━┓ │ │ key/value metadata
-//! Page Index │┃ Page Index ┃ │ contains location
-//! │┗━━━━━━━━━━━━━━━━━━━┛ │ │ of special index
-//! │╔═══════════════════╗ │
-//! │║ Parquet Footer w/ ║ │ │
-//! │║ Metadata ║ ┼ ─ ─
-//! │║ (Thrift Encoded) ║ │
-//! │╚═══════════════════╝ │
-//! └──────────────────────┘
-//!
-//! Parquet File
+//! Page Index │┃ Page Index ┃ │ contains location
+//! │┗━━━━━━━━━━━━━━━━━━━┛ │ │ of special index
+//! │╔═══════════════════╗ │
+//! │║ Parquet Footer w/ ║ │ │
+//! │║ Metadata ║ ┼ ─ ─
+//! │║ (Thrift Encoded) ║ │
+//! │╚═══════════════════╝ │
+//! └──────────────────────┘
+//!
+//! Parquet File
//!
//! # High Level Flow
//!
@@ -420,17 +424,19 @@ impl TableProvider for DistinctIndexTable {
println!("Scanning only files: {files_to_scan:?}");
- // Build ParquetSource to sctually read the files
+ // Build ParquetSource to actually read the files
let url = ObjectStoreUrl::parse("file://")?;
let source =
Arc::new(ParquetSource::default().with_enable_page_index(true));
let mut builder = FileScanConfigBuilder::new(url, self.schema.clone(), source);
for file in files_to_scan {
let path = self.dir.join(file);
let len = std::fs::metadata(&path)?.len();
- builder = builder.with_file(PartitionedFile::new(
- path.to_str().unwrap().to_string(),
- len,
- ));
+ // If the index contained information about row groups or pages,
+ // you could also pass that information here to further prune
+ // the data read from the file.
+ let partitioned_file =
+ PartitionedFile::new(path.to_str().unwrap().to_string(), len);
+ builder = builder.with_file(partitioned_file);
}
Ok(DataSourceExec::from_data_source(builder.build()))
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]