This is an automated email from the ASF dual-hosted git repository. xuanwo pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push: new 921f3895 refactor: Move equality-ids closer to the spec (#1705) 921f3895 is described below commit 921f38955443dfecedcc0a94fa4bd9bbed2b2afb Author: Fokko Driesprong <fo...@apache.org> AuthorDate: Tue Sep 23 18:56:43 2025 +0200 refactor: Move equality-ids closer to the spec (#1705) ## Which issue does this PR close? Right now the equality-deletes can't be not-null, while the spec states that it should be null in the case of a manifest-entry that's not an equality delete: <img width="835" height="341" alt="image" src="https://github.com/user-attachments/assets/60a88f37-7c50-48b7-8878-ecfe4bd70509" /> ## What changes are included in this PR? <!-- Provide a summary of the modifications in this PR. List the main changes such as new features, bug fixes, refactoring, or any other updates. --> ## Are these changes tested? <!-- Specify what test covers (unit test, integration test, etc.). If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> --- bindings/python/src/data_file.rs | 2 +- bindings/python/tests/test_manifest.py | 2 +- .../src/arrow/caching_delete_file_loader.rs | 2 +- crates/iceberg/src/arrow/delete_filter.rs | 6 +++--- .../src/expr/visitors/expression_evaluator.rs | 4 ++-- .../expr/visitors/inclusive_metrics_evaluator.rs | 12 +++++------ .../src/expr/visitors/strict_metrics_evaluator.rs | 8 ++++---- crates/iceberg/src/scan/task.rs | 4 ++-- crates/iceberg/src/spec/manifest/_serde.rs | 17 +++++++-------- crates/iceberg/src/spec/manifest/data_file.rs | 6 +++--- crates/iceberg/src/spec/manifest/mod.rs | 24 +++++++++++----------- crates/iceberg/src/spec/manifest/writer.rs | 6 +++--- crates/iceberg/src/spec/snapshot_summary.rs | 10 ++++----- .../writer/base_writer/equality_delete_writer.rs | 2 +- 14 files changed, 51 insertions(+), 54 deletions(-) diff --git a/bindings/python/src/data_file.rs b/bindings/python/src/data_file.rs index 3339b384..900d6c60 100644 --- a/bindings/python/src/data_file.rs +++ b/bindings/python/src/data_file.rs @@ -148,7 +148,7 @@ impl PyDataFile { } #[getter] - fn equality_ids(&self) -> &[i32] { + fn equality_ids(&self) -> Option<Vec<i32>> { self.inner.equality_ids() } diff --git a/bindings/python/tests/test_manifest.py b/bindings/python/tests/test_manifest.py index 0e838c39..701eac25 100644 --- a/bindings/python/tests/test_manifest.py +++ b/bindings/python/tests/test_manifest.py @@ -138,5 +138,5 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: } assert data_file.key_metadata is None assert data_file.split_offsets == [4] - assert data_file.equality_ids == [] + assert data_file.equality_ids is None assert data_file.sort_order_id == 0 diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs b/crates/iceberg/src/arrow/caching_delete_file_loader.rs index c6a8943d..9cf60568 100644 --- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs +++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs @@ -233,7 +233,7 @@ impl CachingDeleteFileLoader { ) .await?, sender, - equality_ids: HashSet::from_iter(task.equality_ids.clone()), + equality_ids: HashSet::from_iter(task.equality_ids.clone().unwrap()), }) } diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs index 0dd53a34..b853baa9 100644 --- a/crates/iceberg/src/arrow/delete_filter.rs +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -311,21 +311,21 @@ pub(crate) mod tests { file_path: format!("{}/pos-del-1.parquet", table_location.to_str().unwrap()), file_type: DataContentType::PositionDeletes, partition_spec_id: 0, - equality_ids: vec![], + equality_ids: None, }; let pos_del_2 = FileScanTaskDeleteFile { file_path: format!("{}/pos-del-2.parquet", table_location.to_str().unwrap()), file_type: DataContentType::PositionDeletes, partition_spec_id: 0, - equality_ids: vec![], + equality_ids: None, }; let pos_del_3 = FileScanTaskDeleteFile { file_path: format!("{}/pos-del-3.parquet", table_location.to_str().unwrap()), file_type: DataContentType::PositionDeletes, partition_spec_id: 0, - equality_ids: vec![], + equality_ids: None, }; let file_scan_tasks = vec![ diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs b/crates/iceberg/src/expr/visitors/expression_evaluator.rs index 4db1ad7d..3675ce35 100644 --- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs @@ -347,7 +347,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -375,7 +375,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs index a00376e1..2b65cf12 100644 --- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs @@ -1996,7 +1996,7 @@ mod test { upper_bounds: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2022,7 +2022,7 @@ mod test { upper_bounds: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2084,7 +2084,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2115,7 +2115,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2147,7 +2147,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2179,7 +2179,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs index f74ce3a6..e17c44c6 100644 --- a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs @@ -579,7 +579,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -605,7 +605,7 @@ mod test { upper_bounds: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -631,7 +631,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -658,7 +658,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/scan/task.rs b/crates/iceberg/src/scan/task.rs index 7b111e4f..32fe3ae3 100644 --- a/crates/iceberg/src/scan/task.rs +++ b/crates/iceberg/src/scan/task.rs @@ -112,6 +112,6 @@ pub struct FileScanTaskDeleteFile { /// partition id pub partition_spec_id: i32, - /// equality ids for equality deletes (empty for positional deletes) - pub equality_ids: Vec<i32>, + /// equality ids for equality deletes (null for anything other than equality-deletes) + pub equality_ids: Option<Vec<i32>>, } diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs index 462128cf..7738af46 100644 --- a/crates/iceberg/src/spec/manifest/_serde.rs +++ b/crates/iceberg/src/spec/manifest/_serde.rs @@ -116,7 +116,6 @@ pub(super) struct DataFileSerde { upper_bounds: Option<Vec<BytesEntry>>, key_metadata: Option<serde_bytes::ByteBuf>, split_offsets: Option<Vec<i64>>, - #[serde(default)] equality_ids: Option<Vec<i32>>, sort_order_id: Option<i32>, first_row_id: Option<i64>, @@ -155,7 +154,7 @@ impl DataFileSerde { upper_bounds: Some(to_bytes_entry(value.upper_bounds)?), key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from), split_offsets: Some(value.split_offsets), - equality_ids: Some(value.equality_ids), + equality_ids: value.equality_ids, sort_order_id: value.sort_order_id, first_row_id: value.first_row_id, referenced_data_file: value.referenced_data_file, @@ -224,7 +223,7 @@ impl DataFileSerde { .unwrap_or_default(), key_metadata: self.key_metadata.map(|v| v.to_vec()), split_offsets: self.split_offsets.unwrap_or_default(), - equality_ids: self.equality_ids.unwrap_or_default(), + equality_ids: self.equality_ids, sort_order_id: self.sort_order_id, partition_spec_id, first_row_id: self.first_row_id, @@ -382,7 +381,7 @@ mod tests { upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -517,9 +516,8 @@ mod tests { "DataFileSerde should convert content 0 to DataContentType::Data" ); assert_eq!( - v2_entry.data_file.equality_ids, - Vec::<i32>::new(), - "DataFileSerde should convert None equality_ids to empty vec" + v2_entry.data_file.equality_ids, None, + "DataFileSerde should preserve None equality_ids as None" ); // Verify other fields are preserved during conversion @@ -581,9 +579,8 @@ mod tests { "content 0 should convert to DataContentType::Data" ); assert_eq!( - data_file.equality_ids, - Vec::<i32>::new(), - "None equality_ids should convert to empty vec via unwrap_or_default()" + data_file.equality_ids, None, + "None equality_ids should remain as None" ); // Verify other fields are handled correctly during conversion diff --git a/crates/iceberg/src/spec/manifest/data_file.rs b/crates/iceberg/src/spec/manifest/data_file.rs index d191ba2e..931f9441 100644 --- a/crates/iceberg/src/spec/manifest/data_file.rs +++ b/crates/iceberg/src/spec/manifest/data_file.rs @@ -135,7 +135,7 @@ pub struct DataFile { /// otherwise. Fields with ids listed in this column must be present /// in the delete file #[builder(default)] - pub(crate) equality_ids: Vec<i32>, + pub(crate) equality_ids: Option<Vec<i32>>, /// field id: 140 /// /// ID representing sort order for this file. @@ -249,8 +249,8 @@ impl DataFile { /// Get the equality ids of the data file. /// Field ids used to determine row equality in equality delete files. /// null when content is not EqualityDeletes. - pub fn equality_ids(&self) -> &[i32] { - &self.equality_ids + pub fn equality_ids(&self) -> Option<Vec<i32>> { + self.equality_ids.clone() } /// Get the first row id in the data file. pub fn first_row_id(&self) -> Option<i64> { diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index da039773..a1a5612c 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -256,7 +256,7 @@ mod tests { snapshot_id: None, sequence_number: None, file_sequence_number: None, - data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)] [...] + data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)] [...] } ]; @@ -435,7 +435,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: Some(Vec::new()), sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -532,7 +532,7 @@ mod tests { upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -640,7 +640,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -749,7 +749,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -840,7 +840,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -922,7 +922,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -957,7 +957,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -992,7 +992,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -1027,7 +1027,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -1182,7 +1182,7 @@ mod tests { "upper_bounds": [], "key_metadata": null, "split_offsets": [], - "equality_ids": [], + "equality_ids": null, "sort_order_id": null, "first_row_id": null, "referenced_data_file": null, @@ -1213,7 +1213,7 @@ mod tests { "upper_bounds": [], "key_metadata": null, "split_offsets": [], - "equality_ids": [], + "equality_ids": null, "sort_order_id": null, "first_row_id": null, "referenced_data_file": null, diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 39945a51..673f8b5d 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -545,7 +545,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -574,7 +574,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -603,7 +603,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/spec/snapshot_summary.rs b/crates/iceberg/src/spec/snapshot_summary.rs index e374e567..1b07ce3f 100644 --- a/crates/iceberg/src/spec/snapshot_summary.rs +++ b/crates/iceberg/src/spec/snapshot_summary.rs @@ -768,7 +768,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -800,7 +800,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -910,7 +910,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -938,7 +938,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -992,7 +992,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs index f710544d..765ff1ca 100644 --- a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs +++ b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs @@ -156,7 +156,7 @@ impl<B: FileWriterBuilder> IcebergWriter for EqualityDeleteFileWriter<B> { .into_iter() .map(|mut res| { res.content(crate::spec::DataContentType::EqualityDeletes); - res.equality_ids(self.equality_ids.iter().copied().collect_vec()); + res.equality_ids(Some(self.equality_ids.iter().copied().collect_vec())); res.partition(self.partition_value.clone()); res.partition_spec_id(self.partition_spec_id); res.build().expect("msg")