advancedxy commented on code in PR #10813: URL: https://github.com/apache/datafusion/pull/10813#discussion_r1634409535
########## datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs: ########## @@ -405,6 +459,53 @@ mod test { ); } + #[test] + fn test_invalid_too_few() { + let access_plan = ParquetAccessPlan::new(vec![ + RowGroupAccess::Scan, + // select 12 rows, but row group 1 has 20 Review Comment: Nit: -> `specifies 12 rows`? I think the select is referred to as selection, which the following code also includes a skip. ########## datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs: ########## @@ -348,14 +384,22 @@ mod test { let access_plan = ParquetAccessPlan::new(vec![ RowGroupAccess::Scan, RowGroupAccess::Selection( - vec![RowSelector::select(5), RowSelector::skip(7)].into(), + // select / skip all 20 rows in row group 1 Review Comment: Nit: to be consistent with L427 in this file, it would be better to call it as `specifies all 20 rows in row group `. ########## datafusion/core/src/datasource/physical_plan/parquet/opener.rs: ########## @@ -212,3 +213,34 @@ impl FileOpener for ParquetOpener { })) } } + +/// Return the initial [`ParquetAccessPlan`] +/// +/// If the user has supplied one as an extension, use that +/// otherwise return a plan that scans all row groups +/// +/// Returns an error if an invalid `ParquetAccessPlan` is provided +/// +/// Note: file_name is only used for error messages +fn create_initial_plan( + file_name: &str, + extensions: Option<Arc<dyn std::any::Any + Send + Sync>>, + row_group_count: usize, +) -> Result<ParquetAccessPlan> { + if let Some(extensions) = extensions { + if let Some(access_plan) = extensions.downcast_ref::<ParquetAccessPlan>() { + let plan_len = access_plan.len(); + if plan_len != row_group_count { + return exec_err!( + "Invalid ParquetAccessPlan for {file_name}. Specified {plan_len} row groups, but file has {row_group_count}" + ); + } + + // check row group count matches the plan + return Ok(access_plan.clone()); + } Review Comment: Nit: is it better to add a logging in the else branch? ########## datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs: ########## @@ -405,6 +459,53 @@ mod test { ); } + #[test] + fn test_invalid_too_few() { + let access_plan = ParquetAccessPlan::new(vec![ + RowGroupAccess::Scan, + // select 12 rows, but row group 1 has 20 + RowGroupAccess::Selection( + vec![RowSelector::select(5), RowSelector::skip(7)].into(), + ), + RowGroupAccess::Scan, + RowGroupAccess::Scan, + ]); + + let row_group_indexes = access_plan.row_group_indexes(); + let err = access_plan + .into_overall_row_selection(row_group_metadata()) + .unwrap_err() + .to_string(); + assert_eq!(row_group_indexes, vec![0, 1, 2, 3]); + assert_contains!(err, "Internal error: Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 12 rows"); + } + + #[test] + fn test_invalid_too_many() { + let access_plan = ParquetAccessPlan::new(vec![ + RowGroupAccess::Scan, + // select 22 rows, but row group 1 has only 20 Review Comment: ditto. ########## datafusion/core/src/datasource/physical_plan/parquet/mod.rs: ########## @@ -145,6 +145,52 @@ pub use writer::plan_to_parquet; /// custom reader is used, it supplies the metadata directly and this parameter /// is ignored. [`ParquetExecBuilder::with_metadata_size_hint`] for more details. /// +/// * User provided [`ParquetAccessPlan`]s to skip row groups and/or pages +/// based on external information. See "Implementing External Indexes" below +/// +/// # Implementing External Indexes +/// +/// It is possible to restrict the row groups and selections within those row +/// groups that the ParquetExec will consider by providing an initial +/// [`ParquetAccessPlan`] as `extensions` on [`PartitionedFile`]. This can be +/// used to implement external indexes on top of parquet files and select only +/// portions of the files. +/// +/// The `ParquetExec` will try and further reduce any provided +/// `ParquetAccessPlan` further based on the contents of `ParquetMetadata` and Review Comment: Nit: there are two `further` in this sentence. How about: ``` /// The `ParquetExec` will try and reduce any provided /// `ParquetAccessPlan` further based on the contents ... ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org