vustef commented on code in PR #1824:
URL: https://github.com/apache/iceberg-rust/pull/1824#discussion_r2517455795


##########
crates/iceberg/src/metadata_columns.rs:
##########
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metadata columns (virtual/reserved fields) for Iceberg tables.
+//!
+//! This module defines metadata columns that can be requested in projections
+//! but are not stored in data files. Instead, they are computed on-the-fly
+//! during reading. Examples include the _file column (file path) and future
+//! columns like partition values or row numbers.
+
+use crate::{Error, ErrorKind, Result};
+
+/// Reserved field ID for the file path (_file) column per Iceberg spec
+pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646;
+
+/// Reserved column name for the file path metadata column
+pub const RESERVED_COL_NAME_FILE: &str = "_file";
+
+/// Returns the column name for a metadata field ID.
+///
+/// # Arguments
+/// * `field_id` - The metadata field ID
+///
+/// # Returns
+/// The name of the metadata column, or an error if the field ID is not 
recognized
+pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> {
+    match field_id {
+        RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata field ID: {field_id}"),
+        )),
+    }
+}
+
+/// Returns the field ID for a metadata column name.
+///
+/// # Arguments
+/// * `column_name` - The metadata column name
+///
+/// # Returns
+/// The field ID of the metadata column, or an error if the column name is not 
recognized
+pub fn get_metadata_field_id(column_name: &str) -> Result<i32> {
+    match column_name {
+        RESERVED_COL_NAME_FILE => Ok(RESERVED_FIELD_ID_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata column name: {column_name}"),
+        )),
+    }
+}
+
+/// Checks if a field ID is a metadata field.
+///
+/// # Arguments
+/// * `field_id` - The field ID to check
+///
+/// # Returns
+/// `true` if the field ID is a metadata field, `false` otherwise
+pub fn is_metadata_field(field_id: i32) -> bool {
+    field_id == RESERVED_FIELD_ID_FILE

Review Comment:
   I had a suggestion to check start and end of the range (since reserved field 
IDs are a range according to the spec). Perhaps it got lost in the refactorings.



##########
crates/iceberg/src/metadata_columns.rs:
##########
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metadata columns (virtual/reserved fields) for Iceberg tables.
+//!
+//! This module defines metadata columns that can be requested in projections
+//! but are not stored in data files. Instead, they are computed on-the-fly
+//! during reading. Examples include the _file column (file path) and future
+//! columns like partition values or row numbers.
+
+use crate::{Error, ErrorKind, Result};
+
+/// Reserved field ID for the file path (_file) column per Iceberg spec
+pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646;
+
+/// Reserved column name for the file path metadata column
+pub const RESERVED_COL_NAME_FILE: &str = "_file";
+
+/// Returns the column name for a metadata field ID.
+///
+/// # Arguments
+/// * `field_id` - The metadata field ID
+///
+/// # Returns
+/// The name of the metadata column, or an error if the field ID is not 
recognized
+pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> {
+    match field_id {
+        RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata field ID: {field_id}"),
+        )),
+    }
+}
+
+/// Returns the field ID for a metadata column name.
+///
+/// # Arguments
+/// * `column_name` - The metadata column name
+///
+/// # Returns
+/// The field ID of the metadata column, or an error if the column name is not 
recognized
+pub fn get_metadata_field_id(column_name: &str) -> Result<i32> {
+    match column_name {
+        RESERVED_COL_NAME_FILE => Ok(RESERVED_FIELD_ID_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata column name: {column_name}"),
+        )),
+    }
+}
+
+/// Checks if a field ID is a metadata field.
+///
+/// # Arguments
+/// * `field_id` - The field ID to check
+///
+/// # Returns
+/// `true` if the field ID is a metadata field, `false` otherwise
+pub fn is_metadata_field(field_id: i32) -> bool {
+    field_id == RESERVED_FIELD_ID_FILE
+    // Additional metadata fields can be checked here in the future
+}
+
+/// Checks if a column name is a metadata column.
+///
+/// # Arguments
+/// * `column_name` - The column name to check
+///
+/// # Returns
+/// `true` if the column name is a metadata column, `false` otherwise

Review Comment:
   This will only return true for the currently supported metadata columns. 
E.g. if users pass in some other metadata column, even though it is a metadata 
column, it will return false.



##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -429,6 +471,27 @@ impl RecordBatchTransformer {
                 let vals: Vec<Option<f64>> = vec![None; num_rows];
                 Arc::new(Float64Array::from(vals))
             }
+            (DataType::RunEndEncoded(_, _), 
Some(PrimitiveLiteral::String(value))) => {

Review Comment:
   It seems a bit random to me that only strings use REE in 
`primitive_literal_to_arrow_type` below. Yes, they might use the most memory 
otherwise, but other types also have similar kind of redundancy suitable for 
the REE.



##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -270,11 +302,21 @@ impl RecordBatchTransformer {
         snapshot_schema: &IcebergSchema,
         projected_iceberg_field_ids: &[i32],
         field_id_to_mapped_schema_map: HashMap<i32, (FieldRef, usize)>,
+        constants_map: &HashMap<i32, PrimitiveLiteral>,
     ) -> Result<Vec<ColumnSource>> {
         let field_id_to_source_schema_map =
             Self::build_field_id_to_arrow_schema_map(source_schema)?;
 
         projected_iceberg_field_ids.iter().map(|field_id|{
+            // Check if this is a constant/virtual field first
+            if let Some(constant_value) = constants_map.get(field_id) {
+                // This is a virtual field - add it with the constant value
+                return Ok(ColumnSource::Add {
+                    value: Some(constant_value.clone()),
+                    target_type: 
Self::primitive_literal_to_arrow_type(constant_value)?,

Review Comment:
   Feels a bit redundant to have to not only lookup `constants_map` twice, but 
call `primitive_literal_to_arrow_type` twice (once here and once in 
`generate_batch_transform`)



##########
crates/iceberg/src/metadata_columns.rs:
##########
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metadata columns (virtual/reserved fields) for Iceberg tables.
+//!
+//! This module defines metadata columns that can be requested in projections
+//! but are not stored in data files. Instead, they are computed on-the-fly
+//! during reading. Examples include the _file column (file path) and future
+//! columns like partition values or row numbers.
+
+use crate::{Error, ErrorKind, Result};
+
+/// Reserved field ID for the file path (_file) column per Iceberg spec
+pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646;
+
+/// Reserved column name for the file path metadata column
+pub const RESERVED_COL_NAME_FILE: &str = "_file";
+
+/// Returns the column name for a metadata field ID.
+///
+/// # Arguments
+/// * `field_id` - The metadata field ID
+///
+/// # Returns
+/// The name of the metadata column, or an error if the field ID is not 
recognized
+pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> {
+    match field_id {
+        RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata field ID: {field_id}"),
+        )),
+    }
+}
+
+/// Returns the field ID for a metadata column name.
+///
+/// # Arguments
+/// * `column_name` - The metadata column name
+///
+/// # Returns
+/// The field ID of the metadata column, or an error if the column name is not 
recognized
+pub fn get_metadata_field_id(column_name: &str) -> Result<i32> {
+    match column_name {
+        RESERVED_COL_NAME_FILE => Ok(RESERVED_FIELD_ID_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata column name: {column_name}"),
+        )),
+    }
+}
+
+/// Checks if a field ID is a metadata field.
+///
+/// # Arguments
+/// * `field_id` - The field ID to check
+///
+/// # Returns
+/// `true` if the field ID is a metadata field, `false` otherwise
+pub fn is_metadata_field(field_id: i32) -> bool {
+    field_id == RESERVED_FIELD_ID_FILE
+    // Additional metadata fields can be checked here in the future
+}
+
+/// Checks if a column name is a metadata column.
+///
+/// # Arguments
+/// * `column_name` - The column name to check
+///
+/// # Returns
+/// `true` if the column name is a metadata column, `false` otherwise
+pub fn is_metadata_column_name(column_name: &str) -> bool {
+    column_name == RESERVED_COL_NAME_FILE

Review Comment:
   seems like there's already `get_metadata_field_id` which you can use



##########
crates/iceberg/src/metadata_columns.rs:
##########
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metadata columns (virtual/reserved fields) for Iceberg tables.
+//!
+//! This module defines metadata columns that can be requested in projections
+//! but are not stored in data files. Instead, they are computed on-the-fly
+//! during reading. Examples include the _file column (file path) and future
+//! columns like partition values or row numbers.
+
+use crate::{Error, ErrorKind, Result};
+
+/// Reserved field ID for the file path (_file) column per Iceberg spec
+pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646;
+
+/// Reserved column name for the file path metadata column
+pub const RESERVED_COL_NAME_FILE: &str = "_file";
+
+/// Returns the column name for a metadata field ID.
+///
+/// # Arguments
+/// * `field_id` - The metadata field ID
+///
+/// # Returns
+/// The name of the metadata column, or an error if the field ID is not 
recognized
+pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> {
+    match field_id {
+        RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata field ID: {field_id}"),
+        )),
+    }
+}
+
+/// Returns the field ID for a metadata column name.
+///
+/// # Arguments
+/// * `column_name` - The metadata column name
+///
+/// # Returns
+/// The field ID of the metadata column, or an error if the column name is not 
recognized
+pub fn get_metadata_field_id(column_name: &str) -> Result<i32> {
+    match column_name {
+        RESERVED_COL_NAME_FILE => Ok(RESERVED_FIELD_ID_FILE),

Review Comment:
   wish that we could somehow reuse the mapping from 
`get_metadata_column_name`, to form some bidirectional 1-1 mapping. Perhaps 
some macro magic or something else?



##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -452,6 +515,33 @@ impl RecordBatchTransformer {
             }
         })
     }
+
+    /// Converts a PrimitiveLiteral to its corresponding Arrow DataType.
+    /// This is used for virtual fields to determine the Arrow type based on 
the constant value.
+    fn primitive_literal_to_arrow_type(literal: &PrimitiveLiteral) -> 
Result<DataType> {
+        Ok(match literal {
+            PrimitiveLiteral::Boolean(_) => DataType::Boolean,
+            PrimitiveLiteral::Int(_) => DataType::Int32,
+            PrimitiveLiteral::Long(_) => DataType::Int64,
+            PrimitiveLiteral::Float(_) => DataType::Float32,
+            PrimitiveLiteral::Double(_) => DataType::Float64,
+            PrimitiveLiteral::String(_) => {
+                // Use Run-End Encoding for constant strings (memory efficient)
+                let run_ends_field = Arc::new(Field::new("run_ends", 
DataType::Int32, false));
+                let values_field = Arc::new(Field::new("values", 
DataType::Utf8, true));

Review Comment:
   maybe add comment we had why this is nullable



##########
crates/iceberg/src/metadata_columns.rs:
##########
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metadata columns (virtual/reserved fields) for Iceberg tables.
+//!
+//! This module defines metadata columns that can be requested in projections
+//! but are not stored in data files. Instead, they are computed on-the-fly
+//! during reading. Examples include the _file column (file path) and future
+//! columns like partition values or row numbers.
+
+use crate::{Error, ErrorKind, Result};
+
+/// Reserved field ID for the file path (_file) column per Iceberg spec
+pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646;
+
+/// Reserved column name for the file path metadata column
+pub const RESERVED_COL_NAME_FILE: &str = "_file";
+
+/// Returns the column name for a metadata field ID.
+///
+/// # Arguments
+/// * `field_id` - The metadata field ID
+///
+/// # Returns
+/// The name of the metadata column, or an error if the field ID is not 
recognized
+pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> {
+    match field_id {
+        RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata field ID: {field_id}"),
+        )),
+    }
+}
+
+/// Returns the field ID for a metadata column name.
+///
+/// # Arguments
+/// * `column_name` - The metadata column name
+///
+/// # Returns
+/// The field ID of the metadata column, or an error if the column name is not 
recognized
+pub fn get_metadata_field_id(column_name: &str) -> Result<i32> {
+    match column_name {
+        RESERVED_COL_NAME_FILE => Ok(RESERVED_FIELD_ID_FILE),
+        _ => Err(Error::new(
+            ErrorKind::Unexpected,
+            format!("Unknown metadata column name: {column_name}"),
+        )),
+    }
+}
+
+/// Checks if a field ID is a metadata field.
+///
+/// # Arguments
+/// * `field_id` - The field ID to check
+///
+/// # Returns
+/// `true` if the field ID is a metadata field, `false` otherwise
+pub fn is_metadata_field(field_id: i32) -> bool {
+    field_id == RESERVED_FIELD_ID_FILE
+    // Additional metadata fields can be checked here in the future
+}
+
+/// Checks if a column name is a metadata column.
+///
+/// # Arguments
+/// * `column_name` - The column name to check
+///
+/// # Returns
+/// `true` if the column name is a metadata column, `false` otherwise
+pub fn is_metadata_column_name(column_name: &str) -> bool {
+    column_name == RESERVED_COL_NAME_FILE

Review Comment:
   to leverage field ID range as proposed above, maybe we could establish a 
const map from names to field IDs (and perhaps vice versa), and then just check 
field IDs?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to