yshcz opened a new issue, #1973:
URL: https://github.com/apache/iceberg-rust/issues/1973
### Apache Iceberg Rust version
None
### Describe the bug
The spec requires that manifests written in format version 2+ include
the `content` field in the Avro file key-value metadata.
Currently the writer only writes the content metadata for V2 manifests. V3
manifests are missing this required field. This causes a roundtrip problem
where V3 delete manifests written by iceberg-rust are read back as data
manifests.
The fix is probably just to update the condition that writes the content
metadata field so that it also covers V3.
### To Reproduce
Add the following test to `crates/iceberg/src/spec/manifest/mod.rs`:
```rust
/// Reproduces the V3 delete-manifest round-trip bug.
///
/// Writes a single position-delete entry through the V3 deletes writer, then
/// parses the resulting Avro file back. The `ManifestFile` returned by the
/// writer reports `Deletes`, but the parsed manifest metadata reports `Data`,
/// which is the behaviour this issue describes. The assertions below pin the
/// *current, buggy* behaviour; the expected assertion is kept as a comment at
/// the end of the test.
#[tokio::test]
async fn test_v3_delete_manifest_delte_file_roundtrip() {
    // Two optional columns are enough; the schema content is irrelevant to the bug.
    let schema = Arc::new(
        Schema::builder()
            .with_fields(vec![
                Arc::new(NestedField::optional(
                    1,
                    "id",
                    Type::Primitive(PrimitiveType::Long),
                )),
                Arc::new(NestedField::optional(
                    2,
                    "data",
                    Type::Primitive(PrimitiveType::String),
                )),
            ])
            .build()
            .unwrap(),
    );

    // Spec id 0 with no partition fields added, matching `Struct::empty()` below.
    let partition_spec = PartitionSpec::builder(schema.clone())
        .with_spec_id(0)
        .build()
        .unwrap();

    // Create a position delete file entry
    let delete_entry = ManifestEntry {
        status: ManifestStatus::Added,
        snapshot_id: None,
        sequence_number: None,
        file_sequence_number: None,
        data_file: DataFile {
            content: DataContentType::PositionDeletes,
            file_path: "s3://bucket/table/data/delete-00000.parquet".to_string(),
            file_format: DataFileFormat::Parquet,
            partition: Struct::empty(),
            record_count: 10,
            file_size_in_bytes: 1024,
            column_sizes: HashMap::new(),
            value_counts: HashMap::new(),
            null_value_counts: HashMap::new(),
            nan_value_counts: HashMap::new(),
            lower_bounds: HashMap::new(),
            upper_bounds: HashMap::new(),
            key_metadata: None,
            split_offsets: None,
            equality_ids: None,
            sort_order_id: None,
            partition_spec_id: 0,
            first_row_id: None,
            referenced_data_file: None,
            content_offset: None,
            content_size_in_bytes: None,
        },
    };

    // Write a V3 delete manifest
    let tmp_dir = TempDir::new().unwrap();
    let path = tmp_dir.path().join("v3_delete_manifest.avro");
    let io = FileIOBuilder::new_fs_io().build().unwrap();
    let output_file = io.new_output(path.to_str().unwrap()).unwrap();
    let mut writer = ManifestWriterBuilder::new(
        output_file,
        Some(1),
        None,
        schema.clone(),
        partition_spec.clone(),
    )
    .build_v3_deletes();
    writer.add_entry(delete_entry).unwrap();
    let manifest_file = writer.write_manifest_file().await.unwrap();

    // The returned ManifestFile correctly reports Deletes content
    assert_eq!(manifest_file.content, ManifestContentType::Deletes);

    // Read back the manifest file
    let actual_manifest =
        Manifest::parse_avro(fs::read(&path).expect("read_file must succeed").as_slice())
            .unwrap();

    // The content type reads as Data due to the bug.
    assert_eq!(
        actual_manifest.metadata().content,
        ManifestContentType::Data,
    );

    // Expected:
    // assert_eq!(
    //     actual_manifest.metadata().content,
    //     ManifestContentType::Deletes,
    // );
}
```
### Expected behavior
_No response_
### Willingness to contribute
None
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]