mbutrovich commented on PR #1821:
URL: https://github.com/apache/iceberg-rust/pull/1821#issuecomment-3497835126
Here is a test that demonstrates the scenario that fails if I strictly follow what the spec says:
```rust
/// Test demonstrating field ID conflict from Spark-written partitioned Parquet files.
///
/// This reproduces the issue found in TestAddFilesProcedure.addPartitionToPartitioned():
/// When Spark writes partitioned Parquet files, it:
/// 1. Excludes partition columns from the file (they're in directory structure)
/// 2. RENUMBERS remaining field IDs starting from 1
///
/// # Scenario
///
/// Iceberg schema after add_files:
/// - field_id=1 -> "id" (partition column, from constants_map)
/// - field_id=2 -> "name"
/// - field_id=3 -> "dept"
/// - field_id=4 -> "subdept"
///
/// Spark-written partitioned Parquet file:
/// - field_id=1 -> "name" (RENUMBERED! should be 2)
/// - field_id=2 -> "dept" (RENUMBERED! should be 3)
/// - field_id=3 -> "subdept" (RENUMBERED! should be 4)
/// - "id" excluded (in directory: id=1/)
///
/// # The Problem
///
/// Per spec "Columns in Iceberg data files are selected by field id":
/// - We look for field_id=1 ("id")
/// - Find field_id=1 in Parquet (but it's "name"!)
/// - Use it directly -> WRONG COLUMN
///
/// Expected: id=1 (from partition metadata)
/// Actual: returns "name" column data
///
/// # Note
///
/// This test currently FAILS, demonstrating the issue. It should pass once
/// we implement field ID conflict detection (checking both ID and name match).
#[test]
#[should_panic(expected = "assertion `left == right` failed")]
fn spark_partitioned_file_field_id_conflict() {
    use crate::spec::{MappedField, NameMapping, Transform};

    // Iceberg schema after add_files with partition column
    let snapshot_schema = Arc::new(
        Schema::builder()
            .with_schema_id(0)
            .with_fields(vec![
                NestedField::optional(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
                NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(),
                NestedField::optional(3, "dept", Type::Primitive(PrimitiveType::String)).into(),
                NestedField::optional(4, "subdept", Type::Primitive(PrimitiveType::String)).into(),
            ])
            .build()
            .unwrap(),
    );

    // Parquet schema with RENUMBERED field IDs (simulating Spark's behavior).
    // Partition column "id" is excluded, remaining fields renumbered from 1.
    let parquet_schema = Arc::new(ArrowSchema::new(vec![
        simple_field("name", DataType::Utf8, true, "1"), // WRONG! Should be field_id=2
        simple_field("dept", DataType::Utf8, true, "2"), // WRONG! Should be field_id=3
        simple_field("subdept", DataType::Utf8, true, "3"), // WRONG! Should be field_id=4
    ]));

    // Partition spec: id is a partition column with identity transform
    let partition_spec = Arc::new(
        PartitionSpec::builder(snapshot_schema.clone())
            .with_spec_id(0)
            .add_partition_field("id", "id", Transform::Identity)
            .unwrap()
            .build()
            .unwrap(),
    );

    // Partition data: id=1 (from directory structure: id=1/)
    let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]);

    // Name mapping to resolve columns by name when field IDs conflict
    let name_mapping = Arc::new(NameMapping::new(vec![
        MappedField::new(Some(2), vec!["name".to_string()], vec![]),
        MappedField::new(Some(3), vec!["dept".to_string()], vec![]),
        MappedField::new(Some(4), vec!["subdept".to_string()], vec![]),
    ]));

    let projected_field_ids = [1, 2, 3, 4];
    let mut transformer = RecordBatchTransformer::build_with_partition_data(
        snapshot_schema,
        &projected_field_ids,
        Some(partition_spec),
        Some(partition_data),
        Some(name_mapping),
    );

    // Parquet batch with actual data
    let parquet_batch = RecordBatch::try_new(
        parquet_schema,
        vec![
            Arc::new(StringArray::from(vec!["Alice"])),
            Arc::new(StringArray::from(vec!["Engineering"])),
            Arc::new(StringArray::from(vec!["Backend"])),
        ],
    )
    .unwrap();

    let result = transformer.process_record_batch(parquet_batch).unwrap();

    assert_eq!(result.num_columns(), 4);
    assert_eq!(result.num_rows(), 1);

    // Column 0: id should be 1 (from partition metadata)
    // BUG: Currently returns "Alice" because we use Parquet field_id=1 which is "name"
    let id_column = result
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(
        id_column.value(0),
        1,
        "Expected id=1 from partition metadata, but got wrong column due to field ID conflict"
    );

    // Column 1: name should be "Alice"
    // BUG: Currently returns "Engineering" because we use Parquet field_id=2 which is "dept"
    let name_column = result
        .column(1)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    assert_eq!(
        name_column.value(0),
        "Alice",
        "Expected name='Alice', but got wrong column due to field ID conflict"
    );

    // Column 2: dept should be "Engineering"
    // BUG: Currently returns "Backend" because we use Parquet field_id=3 which is "subdept"
    let dept_column = result
        .column(2)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    assert_eq!(
        dept_column.value(0),
        "Engineering",
        "Expected dept='Engineering', but got wrong column due to field ID conflict"
    );

    // Column 3: subdept should be "Backend"
    // This one works because we don't find field_id=4, so we fall through to name mapping
    let subdept_column = result
        .column(3)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    assert_eq!(subdept_column.value(0), "Backend");
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]