liurenjie1024 commented on code in PR #1824:
URL: https://github.com/apache/iceberg-rust/pull/1824#discussion_r2541439845


##########
crates/iceberg/src/metadata_columns.rs:
##########
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metadata columns (virtual/reserved fields) for Iceberg tables.
+//!
+//! This module defines metadata columns that can be requested in projections
+//! but are not stored in data files. Instead, they are computed on-the-fly
+//! during reading. Examples include the _file column (file path) and future
+//! columns like partition values or row numbers.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use arrow_schema::{DataType, Field};
+use once_cell::sync::Lazy;
+use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
+
+use crate::{Error, ErrorKind, Result};
+
+/// Reserved field ID for the file path (_file) column per Iceberg spec
+pub const RESERVED_FIELD_ID_FILE: i32 = i32::MAX - 1;
+
+/// Reserved column name for the file path metadata column
+pub const RESERVED_COL_NAME_FILE: &str = "_file";
+
+/// Lazy-initialized Arrow Field definition for the _file metadata column.
+/// Uses Run-End Encoding for memory efficiency.
+static FILE_FIELD: Lazy<Arc<Field>> = Lazy::new(|| {
+    let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, 
false));
+    let values_field = Arc::new(Field::new("values", DataType::Utf8, true));
+    Arc::new(
+        Field::new(
+            RESERVED_COL_NAME_FILE,
+            DataType::RunEndEncoded(run_ends_field, values_field),
+            false,
+        )
+        .with_metadata(HashMap::from([(
+            PARQUET_FIELD_ID_META_KEY.to_string(),
+            RESERVED_FIELD_ID_FILE.to_string(),
+        )])),
+    )
+});
+
+/// Returns the Arrow Field definition for the _file metadata column.
+///
+/// # Returns
+/// A reference to the _file field definition (RunEndEncoded type)
+pub fn file_field() -> &'static Arc<Field> {
+    &FILE_FIELD
+}
+
+/// Returns the Arrow Field definition for a metadata field ID.
+///
+/// # Arguments
+/// * `field_id` - The metadata field ID
+///
+/// # Returns
+/// The Arrow Field definition for the metadata column, or an error if not a 
metadata field
+pub fn get_metadata_field(field_id: i32) -> Result<Arc<Field>> {
+    match field_id {
+        RESERVED_FIELD_ID_FILE => Ok(Arc::clone(file_field())),

Review Comment:
   This is fine for now, but I'm thinking maybe a better approach is to have a 
static map for this so that we don't need to repeat such pattern matching in 
different places.



##########
crates/iceberg/src/scan/mod.rs:
##########
@@ -124,6 +127,45 @@ impl<'a> TableScanBuilder<'a> {
         self
     }
 
+    /// Include the _file metadata column in the scan.
+    ///
+    /// This is a convenience method that adds the _file column to the current 
selection.
+    /// If no columns are currently selected (select_all), this will select 
all columns plus _file.
+    /// If specific columns are selected, this adds _file to that selection.
+    ///
+    /// # Example
+    /// ```no_run
+    /// # use iceberg::table::Table;
+    /// # async fn example(table: Table) -> iceberg::Result<()> {
+    /// // Select id, name, and _file
+    /// let scan = table
+    ///     .scan()
+    ///     .select(["id", "name"])
+    ///     .with_file_column()
+    ///     .build()?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn with_file_column(mut self) -> Self {

Review Comment:
   It's odd to add this function, use could just `select("_file")`



##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -163,32 +164,49 @@ impl RecordBatchTransformerBuilder {
         Self {
             snapshot_schema,
             projected_iceberg_field_ids: projected_iceberg_field_ids.to_vec(),
-            partition_spec: None,
-            partition_data: None,
+            constant_fields: HashMap::new(),
         }
     }
 
+    /// Add a constant value for a specific field ID.
+    /// This is used for virtual/metadata fields like _file that have constant 
values per batch.
+    ///
+    /// # Arguments
+    /// * `field_id` - The field ID to associate with the constant
+    /// * `value` - The constant value for this field
+    pub(crate) fn with_constant(mut self, field_id: i32, value: 
PrimitiveLiteral) -> Result<Self> {
+        let arrow_type = 
RecordBatchTransformer::primitive_literal_to_arrow_type(&value)?;

Review Comment:
   The `primitive_literal_to_arrow_type` doesn't make sense to me. 
`PrimitiveLiteral` is designed to be a literal without type, and we should not 
guess its type.



##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -111,6 +113,9 @@ enum SchemaComparison {
 pub(crate) struct RecordBatchTransformer {
     snapshot_schema: Arc<IcebergSchema>,
     projected_iceberg_field_ids: Vec<i32>,
+    // Pre-computed constant field information: field_id -> (arrow_type, value)
+    // Avoids duplicate lookups and type conversions during batch processing
+    constant_fields: HashMap<i32, (DataType, PrimitiveLiteral)>,

Review Comment:
   What's the benefit of using arrow's DataType here?



##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -163,32 +164,49 @@ impl RecordBatchTransformerBuilder {
         Self {
             snapshot_schema,
             projected_iceberg_field_ids: projected_iceberg_field_ids.to_vec(),
-            partition_spec: None,
-            partition_data: None,
+            constant_fields: HashMap::new(),
         }
     }
 
+    /// Add a constant value for a specific field ID.
+    /// This is used for virtual/metadata fields like _file that have constant 
values per batch.
+    ///
+    /// # Arguments
+    /// * `field_id` - The field ID to associate with the constant
+    /// * `value` - The constant value for this field
+    pub(crate) fn with_constant(mut self, field_id: i32, value: 
PrimitiveLiteral) -> Result<Self> {
+        let arrow_type = 
RecordBatchTransformer::primitive_literal_to_arrow_type(&value)?;
+        self.constant_fields.insert(field_id, (arrow_type, value));
+        Ok(self)
+    }
+
     /// Set partition spec and data together for identifying 
identity-transformed partition columns.
     ///
     /// Both partition_spec and partition_data must be provided together since 
the spec defines
     /// which fields are identity-partitioned, and the data provides their 
constant values.
-    /// One without the other cannot produce a valid constants map.
+    /// This method computes the partition constants and merges them into 
constant_fields.
     pub(crate) fn with_partition(
         mut self,
         partition_spec: Arc<PartitionSpec>,
         partition_data: Struct,
-    ) -> Self {
-        self.partition_spec = Some(partition_spec);
-        self.partition_data = Some(partition_data);
-        self
+    ) -> Result<Self> {
+        // Compute partition constants for identity-transformed fields
+        let partition_constants = constants_map(&partition_spec, 
&partition_data);
+
+        // Add partition constants to constant_fields (compute REE types from 
literals)
+        for (field_id, value) in partition_constants {
+            let arrow_type = 
RecordBatchTransformer::primitive_literal_to_arrow_type(&value)?;

Review Comment:
   Ditto.



##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -311,22 +321,39 @@ impl RecordBatchTransformer {
         let fields: Result<Vec<_>> = projected_iceberg_field_ids
             .iter()
             .map(|field_id| {
-                Ok(field_id_to_mapped_schema_map
-                    .get(field_id)
-                    .ok_or(Error::new(ErrorKind::Unexpected, "field not 
found"))?
-                    .0
-                    .clone())
+                // Check if this is a constant field (virtual or partition)

Review Comment:
   ```suggestion
                   // Check if this is a constant field
   ```



##########
crates/iceberg/src/metadata_columns.rs:
##########
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metadata columns (virtual/reserved fields) for Iceberg tables.
+//!
+//! This module defines metadata columns that can be requested in projections
+//! but are not stored in data files. Instead, they are computed on-the-fly
+//! during reading. Examples include the _file column (file path) and future
+//! columns like partition values or row numbers.
+
+use crate::{Error, ErrorKind, Result};
+
+/// Reserved field ID for the file path (_file) column per Iceberg spec
+pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646;
+
+/// Reserved column name for the file path metadata column
+pub const RESERVED_COL_NAME_FILE: &str = "_file";
+

Review Comment:
   We should not create arrow field. Please remember that iceberg tries to be 
engine independent, and such core abstractions should use iceberg's own 
abastractions.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to