adriangb commented on code in PR #22026:
URL: https://github.com/apache/datafusion/pull/22026#discussion_r3319230237
##########
datafusion/datasource/src/table_schema.rs:
##########
@@ -166,13 +203,43 @@ impl TableSchema {
&self.table_partition_cols
}
- /// Get the full table schema (file schema + partition columns).
+ /// Get the virtual columns.
///
- /// This is the complete schema that will be seen by queries, combining
- /// both the columns from the files and the partition columns.
+ /// Virtual columns are produced by the file reader (e.g. Parquet
+ /// `row_number`) and are not stored in the data files or derived from
+ /// partition paths.
+ pub fn virtual_columns(&self) -> &Fields {
+ &self.virtual_columns
+ }
+
+ /// Get the full table schema (file schema + partition columns + virtual columns).
+ ///
+ /// This is the complete schema that will be seen by queries. Fields appear
+ /// in the order: file columns, partition columns, virtual columns.
pub fn table_schema(&self) -> &SchemaRef {
&self.table_schema
}
+
+ /// Schema of columns that can be referenced by predicates pushed into the
+ /// file reader: file columns plus partition columns, excluding virtual
+ /// columns.
+ ///
+ /// Virtual columns are produced by the reader itself (e.g. Parquet
+ /// `row_number`) and cannot be referenced inside the reader's row filter,
+ /// so predicates that reference them must stay above the scan. Callers
+ /// deciding which filters to push down should check against this schema
+ /// rather than [`Self::table_schema`].
+ ///
+ /// When there are no virtual columns this returns the same schema as
+ /// [`Self::table_schema`].
+ pub fn schema_without_virtual_columns(&self) -> SchemaRef {
+ if self.virtual_columns.is_empty() {
+ return Arc::clone(&self.table_schema);
+ }
+ let mut builder = SchemaBuilder::from(self.file_schema.as_ref());
+ builder.extend(self.table_partition_cols.iter().cloned());
+ Arc::new(builder.finish())
Review Comment:
We could potentially cache this schema on the struct at construction time rather than rebuilding it on each call.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]