Copilot commented on code in PR #19405:
URL: https://github.com/apache/datafusion/pull/19405#discussion_r2634720785
##########
datafusion/datasource-parquet/src/opener.rs:
##########
@@ -988,6 +988,173 @@ mod test {
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;
+ /// Builder for creating [`ParquetOpener`] instances with sensible defaults for tests.
+ /// This helps reduce code duplication and makes it clear what differs between test cases.
+ struct ParquetOpenerBuilder {
+ store: Option<Arc<dyn ObjectStore>>,
+ file_schema: Option<SchemaRef>,
+ table_schema: Option<TableSchema>,
+ partition_index: usize,
+ projection_indices: Option<Vec<usize>>,
+ projection: Option<ProjectionExprs>,
+ batch_size: usize,
+ limit: Option<usize>,
+ predicate: Option<Arc<dyn PhysicalExpr>>,
+ metadata_size_hint: Option<usize>,
+ metrics: ExecutionPlanMetricsSet,
+ pushdown_filters: bool,
+ reorder_filters: bool,
+ force_filter_selections: bool,
+ enable_page_index: bool,
+ enable_bloom_filter: bool,
+ enable_row_group_stats_pruning: bool,
+ coerce_int96: Option<arrow::datatypes::TimeUnit>,
+ max_predicate_cache_size: Option<usize>,
+ reverse_row_groups: bool,
+ }
+
+ impl ParquetOpenerBuilder {
+ /// Create a new builder with sensible defaults for tests.
+ fn new() -> Self {
+ Self {
+ store: None,
+ file_schema: None,
+ table_schema: None,
+ partition_index: 0,
+ projection_indices: None,
+ projection: None,
+ batch_size: 1024,
+ limit: None,
+ predicate: None,
+ metadata_size_hint: None,
+ metrics: ExecutionPlanMetricsSet::new(),
+ pushdown_filters: false,
+ reorder_filters: false,
+ force_filter_selections: false,
+ enable_page_index: false,
+ enable_bloom_filter: false,
+ enable_row_group_stats_pruning: false,
+ coerce_int96: None,
+ max_predicate_cache_size: None,
+ reverse_row_groups: false,
+ }
+ }
+
+ /// Set the object store (required for building).
+ fn with_store(mut self, store: Arc<dyn ObjectStore>) -> Self {
+ self.store = Some(store);
+ self
+ }
+
+ /// Set the file schema and create a simple table schema (for files without partition columns).
+ fn with_schema(mut self, file_schema: SchemaRef) -> Self {
+ self.file_schema = Some(Arc::clone(&file_schema));
+ self.table_schema = Some(TableSchema::from_file_schema(file_schema));
+ self
+ }
+
+ /// Set a custom table schema (for files with partition columns).
+ fn with_table_schema(mut self, table_schema: TableSchema) -> Self {
+ let file_schema = Arc::clone(table_schema.file_schema());
+ self.table_schema = Some(table_schema);
+ if self.file_schema.is_none() {
+ self.file_schema = Some(file_schema);
+ }
Review Comment:
The logic in `with_table_schema` could be confusing if someone calls
`with_schema()` first and then `with_table_schema()`. The `file_schema`
extracted from `table_schema` won't override the previously set `file_schema`,
potentially causing inconsistency between `file_schema` and
`table_schema.file_schema()`. Consider either: (1) always setting `file_schema`
to match the table_schema's file_schema regardless of whether it's None, or (2)
adding documentation that clarifies these methods are mutually exclusive.
```suggestion
// Ensure file_schema always matches the table schema's file_schema
self.file_schema = Some(file_schema);
```
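
To make the hazard concrete, here is a minimal sketch of the call order that leaves the two schemas out of sync (the test name and schemas are illustrative, assuming the builder fields shown in this diff):
```rust
#[test]
fn with_schema_then_with_table_schema_diverges() {
    use std::sync::Arc;
    use arrow::datatypes::{DataType, Field, Schema, SchemaRef};

    // Illustrative schemas; any two distinct schemas demonstrate the problem.
    let schema_a: SchemaRef =
        Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let schema_b: SchemaRef =
        Arc::new(Schema::new(vec![Field::new("b", DataType::Utf8, true)]));

    // with_schema() sets file_schema = A and table_schema = A.
    // with_table_schema() then replaces table_schema with B, but because
    // file_schema is already Some(A), the `is_none()` guard leaves it untouched.
    let builder = ParquetOpenerBuilder::new()
        .with_schema(schema_a)
        .with_table_schema(TableSchema::from_file_schema(Arc::clone(&schema_b)));

    // file_schema (A) no longer matches table_schema.file_schema() (B).
    assert_ne!(
        builder.file_schema.as_ref().unwrap().as_ref(),
        builder.table_schema.as_ref().unwrap().file_schema().as_ref(),
    );
}
```
With the suggested change applied, the last call always wins for both fields, so this sequence can no longer produce mismatched schemas.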
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]