(datafusion) branch main updated: Use TableSchema in FileScanConfig (#18231)

github-bot Thu, 23 Oct 2025 22:22:04 -0700

This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 665a55268a Use TableSchema in FileScanConfig (#18231)
665a55268a is described below

commit 665a55268a18b51e4d1e9216fd1ab9391624e7e9
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Fri Oct 24 00:21:20 2025 -0500

    Use TableSchema in FileScanConfig (#18231)
    
    Steps towards #14993
---
 datafusion/datasource/src/file_scan_config.rs | 42 +++++++++++++--------------
 datafusion/datasource/src/table_schema.rs     | 14 +++++++++
 2 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/datafusion/datasource/src/file_scan_config.rs 
b/datafusion/datasource/src/file_scan_config.rs
index d557a99274..4dfb6a4ec3 100644
--- a/datafusion/datasource/src/file_scan_config.rs
+++ b/datafusion/datasource/src/file_scan_config.rs
@@ -156,6 +156,11 @@ pub struct FileScanConfig {
     /// Schema information including the file schema, table partition columns,
     /// and the combined table schema.
     ///
+    /// The table schema (file schema + partition columns) is the schema 
exposed
+    /// upstream of [`FileScanConfig`] (e.g. in [`DataSourceExec`]).
+    ///
+    /// See [`TableSchema`] for more information.
+    ///
     /// [`DataSourceExec`]: crate::source::DataSourceExec
     pub table_schema: TableSchema,
     /// List of files to be processed, grouped into partitions
@@ -244,23 +249,19 @@ pub struct FileScanConfig {
 #[derive(Clone)]
 pub struct FileScanConfigBuilder {
     object_store_url: ObjectStoreUrl,
-    /// Table schema before any projections or partition columns are applied.
+    /// Schema information including the file schema, table partition columns,
+    /// and the combined table schema.
     ///
-    /// This schema is used to read the files, but is **not** necessarily the
-    /// schema of the physical files. Rather this is the schema that the
+    /// This schema is used to read the files, but the file schema is **not** 
necessarily
+    /// the schema of the physical files. Rather this is the schema that the
     /// physical file schema will be mapped onto, and the schema that the
     /// [`DataSourceExec`] will return.
     ///
-    /// This is usually the same as the table schema as specified by the 
`TableProvider` minus any partition columns.
-    ///
-    /// This probably would be better named `table_schema`
-    ///
     /// [`DataSourceExec`]: crate::source::DataSourceExec
-    file_schema: SchemaRef,
+    table_schema: TableSchema,
     file_source: Arc<dyn FileSource>,
     limit: Option<usize>,
     projection: Option<Vec<usize>>,
-    table_partition_cols: Vec<FieldRef>,
     constraints: Option<Constraints>,
     file_groups: Vec<FileGroup>,
     statistics: Option<Statistics>,
@@ -285,7 +286,7 @@ impl FileScanConfigBuilder {
     ) -> Self {
         Self {
             object_store_url,
-            file_schema,
+            table_schema: TableSchema::from_file_schema(file_schema),
             file_source,
             file_groups: vec![],
             statistics: None,
@@ -294,7 +295,6 @@ impl FileScanConfigBuilder {
             new_lines_in_values: None,
             limit: None,
             projection: None,
-            table_partition_cols: vec![],
             constraints: None,
             batch_size: None,
             expr_adapter_factory: None,
@@ -326,10 +326,13 @@ impl FileScanConfigBuilder {
 
     /// Set the partitioning columns
     pub fn with_table_partition_cols(mut self, table_partition_cols: 
Vec<Field>) -> Self {
-        self.table_partition_cols = table_partition_cols
+        let table_partition_cols: Vec<FieldRef> = table_partition_cols
             .into_iter()
             .map(|f| Arc::new(f) as FieldRef)
             .collect();
+        self.table_schema = self
+            .table_schema
+            .with_table_partition_cols(table_partition_cols);
         self
     }
 
@@ -427,11 +430,10 @@ impl FileScanConfigBuilder {
     pub fn build(self) -> FileScanConfig {
         let Self {
             object_store_url,
-            file_schema,
+            table_schema,
             file_source,
             limit,
             projection,
-            table_partition_cols,
             constraints,
             file_groups,
             statistics,
@@ -443,19 +445,16 @@ impl FileScanConfigBuilder {
         } = self;
 
         let constraints = constraints.unwrap_or_default();
-        let statistics =
-            statistics.unwrap_or_else(|| 
Statistics::new_unknown(&file_schema));
+        let statistics = statistics
+            .unwrap_or_else(|| 
Statistics::new_unknown(table_schema.file_schema()));
 
         let file_source = file_source
             .with_statistics(statistics.clone())
-            .with_schema(Arc::clone(&file_schema));
+            .with_schema(Arc::clone(table_schema.file_schema()));
         let file_compression_type =
             file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED);
         let new_lines_in_values = new_lines_in_values.unwrap_or(false);
 
-        // Create TableSchema from file_schema and table_partition_cols
-        let table_schema = TableSchema::new(file_schema, table_partition_cols);
-
         FileScanConfig {
             object_store_url,
             table_schema,
@@ -477,7 +476,7 @@ impl From<FileScanConfig> for FileScanConfigBuilder {
     fn from(config: FileScanConfig) -> Self {
         Self {
             object_store_url: config.object_store_url,
-            file_schema: Arc::clone(config.table_schema.file_schema()),
+            table_schema: config.table_schema,
             file_source: Arc::<dyn FileSource>::clone(&config.file_source),
             file_groups: config.file_groups,
             statistics: config.file_source.statistics().ok(),
@@ -486,7 +485,6 @@ impl From<FileScanConfig> for FileScanConfigBuilder {
             new_lines_in_values: Some(config.new_lines_in_values),
             limit: config.limit,
             projection: config.projection,
-            table_partition_cols: 
config.table_schema.table_partition_cols().clone(),
             constraints: Some(config.constraints),
             batch_size: config.batch_size,
             expr_adapter_factory: config.expr_adapter_factory,
diff --git a/datafusion/datasource/src/table_schema.rs 
b/datafusion/datasource/src/table_schema.rs
index 9413bd9ef2..8e95585ce8 100644
--- a/datafusion/datasource/src/table_schema.rs
+++ b/datafusion/datasource/src/table_schema.rs
@@ -121,6 +121,20 @@ impl TableSchema {
         }
     }
 
+    /// Create a new TableSchema from a file schema with no partition columns.
+    pub fn from_file_schema(file_schema: SchemaRef) -> Self {
+        Self::new(file_schema, vec![])
+    }
+
+    /// Set the table partition columns and rebuild the table schema.
+    pub fn with_table_partition_cols(
+        mut self,
+        table_partition_cols: Vec<FieldRef>,
+    ) -> TableSchema {
+        self.table_partition_cols = table_partition_cols;
+        self
+    }
+
     /// Get the file schema (without partition columns).
     ///
     /// This is the schema of the actual data files on disk.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion) branch main updated: Use TableSchema in FileScanConfig (#18231)

Reply via email to