mbutrovich commented on PR #1821:
URL: https://github.com/apache/iceberg-rust/pull/1821#issuecomment-3497835126
Here is a test that demonstrates the scenario that fails if I strictly follow what the spec says:
```rust
/// Test demonstrating field ID conflict from Spark-written partitioned Parquet files.
///
/// This reproduces the issue found in TestAddFilesProcedure.addPartitionToPartitioned():
/// When Spark writes partitioned Parquet files, it:
/// 1. Excludes partition columns from the file (they're in directory structure)
/// 2. RENUMBERS remaining field IDs starting from 1
///
/// # Scenario
///
/// Iceberg schema after add_files:
/// - field_id=1 -> "id" (partition column, from constants_map)
/// - field_id=2 -> "name"
/// - field_id=3 -> "dept"
/// - field_id=4 -> "subdept"
///
/// Spark-written partitioned Parquet file:
/// - field_id=1 -> "name" (RENUMBERED! should be 2)
/// - field_id=2 -> "dept" (RENUMBERED! should be 3)
/// - field_id=3 -> "subdept" (RENUMBERED! should be 4)
/// - "id" excluded (in directory: id=1/)
///
/// # The Problem
///
/// Per spec "Columns in Iceberg data files are selected by field id":
/// - We look for field_id=1 ("id")
/// - Find field_id=1 in Parquet (but it's "name"!)
/// - Use it directly -> WRONG COLUMN
///
/// Expected: id=1 (from partition metadata)
/// Actual: returns "name" column data
///
/// # Note
///
/// This test currently FAILS, demonstrating the issue. It should pass once
/// we implement field ID conflict detection (checking both ID and name match).
#[test]
#[should_panic(expected = "assertion `left == right` failed")]
fn spark_partitioned_file_field_id_conflict() {
    use crate::spec::{MappedField, NameMapping, Transform};

    // Iceberg schema after add_files with partition column
    let snapshot_schema = Arc::new(
        Schema::builder()
            .with_schema_id(0)
            .with_fields(vec![
                NestedField::optional(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
                NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(),
                NestedField::optional(3, "dept", Type::Primitive(PrimitiveType::String)).into(),
                NestedField::optional(4, "subdept", Type::Primitive(PrimitiveType::String)).into(),
            ])
            .build()
            .unwrap(),
    );

    // Parquet schema with RENUMBERED field IDs (simulating Spark's behavior).
    // Partition column "id" is excluded, remaining fields renumbered from 1.
    let parquet_schema = Arc::new(ArrowSchema::new(vec![
        simple_field("name", DataType::Utf8, true, "1"), // WRONG! Should be field_id=2
        simple_field("dept", DataType::Utf8, true, "2"), // WRONG! Should be field_id=3
        simple_field("subdept", DataType::Utf8, true, "3"), // WRONG! Should be field_id=4
    ]));

    // Partition spec: id is a partition column with identity transform
    let partition_spec = Arc::new(
        PartitionSpec::builder(snapshot_schema.clone())
            .with_spec_id(0)
            .add_partition_field("id", "id", Transform::Identity)
            .unwrap()
            .build()
            .unwrap(),
    );

    // Partition data: id=1 (from directory structure: id=1/)
    let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]);

    // Name mapping to resolve columns by name when field IDs conflict
    let name_mapping = Arc::new(NameMapping::new(vec![
        MappedField::new(Some(2), vec!["name".to_string()], vec![]),
        MappedField::new(Some(3), vec!["dept".to_string()], vec![]),
        MappedField::new(Some(4), vec!["subdept".to_string()], vec![]),
    ]));

    let projected_field_ids = [1, 2, 3, 4];
    let mut transformer = RecordBatchTransformer::build_with_partition_data(
        snapshot_schema,
        &projected_field_ids,
        Some(partition_spec),
        Some(partition_data),
        Some(name_mapping),
    );

    // Parquet batch with actual data
    let parquet_batch = RecordBatch::try_new(
        parquet_schema,
        vec![
            Arc::new(StringArray::from(vec!["Alice"])),
            Arc::new(StringArray::from(vec!["Engineering"])),
            Arc::new(StringArray::from(vec!["Backend"])),
        ],
    )
    .unwrap();

    let result = transformer.process_record_batch(parquet_batch).unwrap();

    assert_eq!(result.num_columns(), 4);
    assert_eq!(result.num_rows(), 1);

    // Column 0: id should be 1 (from partition metadata)
    // BUG: Currently returns "Alice" because we use Parquet field_id=1 which is "name"
    let id_column = result
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(
        id_column.value(0),
        1,
        "Expected id=1 from partition metadata, but got wrong column due to field ID conflict"
    );

    // Column 1: name should be "Alice"
    // BUG: Currently returns "Engineering" because we use Parquet field_id=2 which is "dept"
    let name_column = result
        .column(1)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    assert_eq!(
        name_column.value(0),
        "Alice",
        "Expected name='Alice', but got wrong column due to field ID conflict"
    );

    // Column 2: dept should be "Engineering"
    // BUG: Currently returns "Backend" because we use Parquet field_id=3 which is "subdept"
    let dept_column = result
        .column(2)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    assert_eq!(
        dept_column.value(0),
        "Engineering",
        "Expected dept='Engineering', but got wrong column due to field ID conflict"
    );

    // Column 3: subdept should be "Backend"
    // This one works because we don't find field_id=4, so we fall through to name mapping
    let subdept_column = result
        .column(3)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    assert_eq!(subdept_column.value(0), "Backend");
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]