This is an automated email from the ASF dual-hosted git repository.

xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git


The following commit(s) were added to refs/heads/main by this push:
     new 921f3895 refactor: Move equality-ids closer to the spec (#1705)
921f3895 is described below

commit 921f38955443dfecedcc0a94fa4bd9bbed2b2afb
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Tue Sep 23 18:56:43 2025 +0200

    refactor: Move equality-ids closer to the spec (#1705)
    
    ## Which issue does this PR close?
    
    Right now the equality-deletes can't be not-null, while the spec states
    that it should be null in the case of a manifest-entry that's not an
    equality delete:
    
    <img width="835" height="341" alt="image"
    
src="https://github.com/user-attachments/assets/60a88f37-7c50-48b7-8878-ecfe4bd70509";
    />
    
    
    ## What changes are included in this PR?
    
    <!--
    Provide a summary of the modifications in this PR. List the main changes
    such as new features, bug fixes, refactoring, or any other updates.
    -->
    
    ## Are these changes tested?
    
    <!--
    Specify what test covers (unit test, integration test, etc.).
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    -->
---
 bindings/python/src/data_file.rs                   |  2 +-
 bindings/python/tests/test_manifest.py             |  2 +-
 .../src/arrow/caching_delete_file_loader.rs        |  2 +-
 crates/iceberg/src/arrow/delete_filter.rs          |  6 +++---
 .../src/expr/visitors/expression_evaluator.rs      |  4 ++--
 .../expr/visitors/inclusive_metrics_evaluator.rs   | 12 +++++------
 .../src/expr/visitors/strict_metrics_evaluator.rs  |  8 ++++----
 crates/iceberg/src/scan/task.rs                    |  4 ++--
 crates/iceberg/src/spec/manifest/_serde.rs         | 17 +++++++--------
 crates/iceberg/src/spec/manifest/data_file.rs      |  6 +++---
 crates/iceberg/src/spec/manifest/mod.rs            | 24 +++++++++++-----------
 crates/iceberg/src/spec/manifest/writer.rs         |  6 +++---
 crates/iceberg/src/spec/snapshot_summary.rs        | 10 ++++-----
 .../writer/base_writer/equality_delete_writer.rs   |  2 +-
 14 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/bindings/python/src/data_file.rs b/bindings/python/src/data_file.rs
index 3339b384..900d6c60 100644
--- a/bindings/python/src/data_file.rs
+++ b/bindings/python/src/data_file.rs
@@ -148,7 +148,7 @@ impl PyDataFile {
     }
 
     #[getter]
-    fn equality_ids(&self) -> &[i32] {
+    fn equality_ids(&self) -> Option<Vec<i32>> {
         self.inner.equality_ids()
     }
 
diff --git a/bindings/python/tests/test_manifest.py 
b/bindings/python/tests/test_manifest.py
index 0e838c39..701eac25 100644
--- a/bindings/python/tests/test_manifest.py
+++ b/bindings/python/tests/test_manifest.py
@@ -138,5 +138,5 @@ def test_read_manifest_entry(generated_manifest_entry_file: 
str) -> None:
     }
     assert data_file.key_metadata is None
     assert data_file.split_offsets == [4]
-    assert data_file.equality_ids == []
+    assert data_file.equality_ids is None
     assert data_file.sort_order_id == 0
diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs 
b/crates/iceberg/src/arrow/caching_delete_file_loader.rs
index c6a8943d..9cf60568 100644
--- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs
+++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs
@@ -233,7 +233,7 @@ impl CachingDeleteFileLoader {
                     )
                     .await?,
                     sender,
-                    equality_ids: 
HashSet::from_iter(task.equality_ids.clone()),
+                    equality_ids: 
HashSet::from_iter(task.equality_ids.clone().unwrap()),
                 })
             }
 
diff --git a/crates/iceberg/src/arrow/delete_filter.rs 
b/crates/iceberg/src/arrow/delete_filter.rs
index 0dd53a34..b853baa9 100644
--- a/crates/iceberg/src/arrow/delete_filter.rs
+++ b/crates/iceberg/src/arrow/delete_filter.rs
@@ -311,21 +311,21 @@ pub(crate) mod tests {
             file_path: format!("{}/pos-del-1.parquet", 
table_location.to_str().unwrap()),
             file_type: DataContentType::PositionDeletes,
             partition_spec_id: 0,
-            equality_ids: vec![],
+            equality_ids: None,
         };
 
         let pos_del_2 = FileScanTaskDeleteFile {
             file_path: format!("{}/pos-del-2.parquet", 
table_location.to_str().unwrap()),
             file_type: DataContentType::PositionDeletes,
             partition_spec_id: 0,
-            equality_ids: vec![],
+            equality_ids: None,
         };
 
         let pos_del_3 = FileScanTaskDeleteFile {
             file_path: format!("{}/pos-del-3.parquet", 
table_location.to_str().unwrap()),
             file_type: DataContentType::PositionDeletes,
             partition_spec_id: 0,
-            equality_ids: vec![],
+            equality_ids: None,
         };
 
         let file_scan_tasks = vec![
diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs 
b/crates/iceberg/src/expr/visitors/expression_evaluator.rs
index 4db1ad7d..3675ce35 100644
--- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs
@@ -347,7 +347,7 @@ mod tests {
             upper_bounds: HashMap::new(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -375,7 +375,7 @@ mod tests {
             upper_bounds: HashMap::new(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs 
b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
index a00376e1..2b65cf12 100644
--- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
@@ -1996,7 +1996,7 @@ mod test {
             upper_bounds: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -2022,7 +2022,7 @@ mod test {
             upper_bounds: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -2084,7 +2084,7 @@ mod test {
             column_sizes: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -2115,7 +2115,7 @@ mod test {
             column_sizes: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -2147,7 +2147,7 @@ mod test {
             column_sizes: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -2179,7 +2179,7 @@ mod test {
             column_sizes: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
diff --git a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs 
b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
index f74ce3a6..e17c44c6 100644
--- a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
@@ -579,7 +579,7 @@ mod test {
             column_sizes: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -605,7 +605,7 @@ mod test {
             upper_bounds: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -631,7 +631,7 @@ mod test {
             column_sizes: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
@@ -658,7 +658,7 @@ mod test {
             column_sizes: Default::default(),
             key_metadata: None,
             split_offsets: vec![],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: None,
             partition_spec_id: 0,
             first_row_id: None,
diff --git a/crates/iceberg/src/scan/task.rs b/crates/iceberg/src/scan/task.rs
index 7b111e4f..32fe3ae3 100644
--- a/crates/iceberg/src/scan/task.rs
+++ b/crates/iceberg/src/scan/task.rs
@@ -112,6 +112,6 @@ pub struct FileScanTaskDeleteFile {
     /// partition id
     pub partition_spec_id: i32,
 
-    /// equality ids for equality deletes (empty for positional deletes)
-    pub equality_ids: Vec<i32>,
+    /// equality ids for equality deletes (null for anything other than 
equality-deletes)
+    pub equality_ids: Option<Vec<i32>>,
 }
diff --git a/crates/iceberg/src/spec/manifest/_serde.rs 
b/crates/iceberg/src/spec/manifest/_serde.rs
index 462128cf..7738af46 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -116,7 +116,6 @@ pub(super) struct DataFileSerde {
     upper_bounds: Option<Vec<BytesEntry>>,
     key_metadata: Option<serde_bytes::ByteBuf>,
     split_offsets: Option<Vec<i64>>,
-    #[serde(default)]
     equality_ids: Option<Vec<i32>>,
     sort_order_id: Option<i32>,
     first_row_id: Option<i64>,
@@ -155,7 +154,7 @@ impl DataFileSerde {
             upper_bounds: Some(to_bytes_entry(value.upper_bounds)?),
             key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from),
             split_offsets: Some(value.split_offsets),
-            equality_ids: Some(value.equality_ids),
+            equality_ids: value.equality_ids,
             sort_order_id: value.sort_order_id,
             first_row_id: value.first_row_id,
             referenced_data_file: value.referenced_data_file,
@@ -224,7 +223,7 @@ impl DataFileSerde {
                 .unwrap_or_default(),
             key_metadata: self.key_metadata.map(|v| v.to_vec()),
             split_offsets: self.split_offsets.unwrap_or_default(),
-            equality_ids: self.equality_ids.unwrap_or_default(),
+            equality_ids: self.equality_ids,
             sort_order_id: self.sort_order_id,
             partition_spec_id,
             first_row_id: self.first_row_id,
@@ -382,7 +381,7 @@ mod tests {
             upper_bounds: 
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
             key_metadata: None,
             split_offsets: vec![4],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: Some(0),
             partition_spec_id: 0,
             first_row_id: None,
@@ -517,9 +516,8 @@ mod tests {
             "DataFileSerde should convert content 0 to DataContentType::Data"
         );
         assert_eq!(
-            v2_entry.data_file.equality_ids,
-            Vec::<i32>::new(),
-            "DataFileSerde should convert None equality_ids to empty vec"
+            v2_entry.data_file.equality_ids, None,
+            "DataFileSerde should preserve None equality_ids as None"
         );
 
         // Verify other fields are preserved during conversion
@@ -581,9 +579,8 @@ mod tests {
             "content 0 should convert to DataContentType::Data"
         );
         assert_eq!(
-            data_file.equality_ids,
-            Vec::<i32>::new(),
-            "None equality_ids should convert to empty vec via 
unwrap_or_default()"
+            data_file.equality_ids, None,
+            "None equality_ids should remain as None"
         );
 
         // Verify other fields are handled correctly during conversion
diff --git a/crates/iceberg/src/spec/manifest/data_file.rs 
b/crates/iceberg/src/spec/manifest/data_file.rs
index d191ba2e..931f9441 100644
--- a/crates/iceberg/src/spec/manifest/data_file.rs
+++ b/crates/iceberg/src/spec/manifest/data_file.rs
@@ -135,7 +135,7 @@ pub struct DataFile {
     /// otherwise. Fields with ids listed in this column must be present
     /// in the delete file
     #[builder(default)]
-    pub(crate) equality_ids: Vec<i32>,
+    pub(crate) equality_ids: Option<Vec<i32>>,
     /// field id: 140
     ///
     /// ID representing sort order for this file.
@@ -249,8 +249,8 @@ impl DataFile {
     /// Get the equality ids of the data file.
     /// Field ids used to determine row equality in equality delete files.
     /// null when content is not EqualityDeletes.
-    pub fn equality_ids(&self) -> &[i32] {
-        &self.equality_ids
+    pub fn equality_ids(&self) -> Option<Vec<i32>> {
+        self.equality_ids.clone()
     }
     /// Get the first row id in the data file.
     pub fn first_row_id(&self) -> Option<i64> {
diff --git a/crates/iceberg/src/spec/manifest/mod.rs 
b/crates/iceberg/src/spec/manifest/mod.rs
index da039773..a1a5612c 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -256,7 +256,7 @@ mod tests {
                     snapshot_id: None,
                     sequence_number: None,
                     file_sequence_number: None,
-                    data_file: DataFile 
{content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]
 [...]
+                    data_file: DataFile 
{content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]
 [...]
                 }
             ];
 
@@ -435,7 +435,7 @@ mod tests {
                     upper_bounds: HashMap::new(),
                     key_metadata: None,
                     split_offsets: vec![4],
-                    equality_ids: vec![],
+                    equality_ids: Some(Vec::new()),
                     sort_order_id: None,
                     partition_spec_id: 0,
                     first_row_id: None,
@@ -532,7 +532,7 @@ mod tests {
                     upper_bounds: 
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
                     key_metadata: None,
                     split_offsets: vec![4],
-                    equality_ids: vec![],
+                    equality_ids: None,
                     sort_order_id: Some(0),
                     partition_spec_id: 0,
                     first_row_id: None,
@@ -640,7 +640,7 @@ mod tests {
                         ]),
                         key_metadata: None,
                         split_offsets: vec![4],
-                        equality_ids: vec![],
+                        equality_ids: None,
                         sort_order_id: Some(0),
                         partition_spec_id: 0,
                         first_row_id: None,
@@ -749,7 +749,7 @@ mod tests {
                     ]),
                     key_metadata: None,
                     split_offsets: vec![4],
-                    equality_ids: vec![],
+                    equality_ids: None,
                     sort_order_id: None,
                     partition_spec_id: 0,
                     first_row_id: None,
@@ -840,7 +840,7 @@ mod tests {
                     ]),
                     key_metadata: None,
                     split_offsets: vec![4],
-                    equality_ids: vec![],
+                    equality_ids: None,
                     sort_order_id: None,
                     partition_spec_id: 0,
                     first_row_id: None,
@@ -922,7 +922,7 @@ mod tests {
                     upper_bounds: HashMap::new(),
                     key_metadata: None,
                     split_offsets: vec![4],
-                    equality_ids: Vec::new(),
+                    equality_ids: None,
                     sort_order_id: None,
                     partition_spec_id: 0,
                     first_row_id: None,
@@ -957,7 +957,7 @@ mod tests {
                         upper_bounds: HashMap::new(),
                         key_metadata: None,
                         split_offsets: vec![4],
-                        equality_ids: Vec::new(),
+                        equality_ids: None,
                         sort_order_id: None,
                         partition_spec_id: 0,
                         first_row_id: None,
@@ -992,7 +992,7 @@ mod tests {
                         upper_bounds: HashMap::new(),
                         key_metadata: None,
                         split_offsets: vec![4],
-                        equality_ids: Vec::new(),
+                        equality_ids: None,
                         sort_order_id: None,
                         partition_spec_id: 0,
                         first_row_id: None,
@@ -1027,7 +1027,7 @@ mod tests {
                         upper_bounds: HashMap::new(),
                         key_metadata: None,
                         split_offsets: vec![4],
-                        equality_ids: Vec::new(),
+                        equality_ids: None,
                         sort_order_id: None,
                         partition_spec_id: 0,
                         first_row_id: None,
@@ -1182,7 +1182,7 @@ mod tests {
             "upper_bounds": [],
             "key_metadata": null,
             "split_offsets": [],
-            "equality_ids": [],
+            "equality_ids": null,
             "sort_order_id": null,
             "first_row_id": null,
             "referenced_data_file": null,
@@ -1213,7 +1213,7 @@ mod tests {
             "upper_bounds": [],
             "key_metadata": null,
             "split_offsets": [],
-            "equality_ids": [],
+            "equality_ids": null,
             "sort_order_id": null,
             "first_row_id": null,
             "referenced_data_file": null,
diff --git a/crates/iceberg/src/spec/manifest/writer.rs 
b/crates/iceberg/src/spec/manifest/writer.rs
index 39945a51..673f8b5d 100644
--- a/crates/iceberg/src/spec/manifest/writer.rs
+++ b/crates/iceberg/src/spec/manifest/writer.rs
@@ -545,7 +545,7 @@ mod tests {
                         upper_bounds: HashMap::new(),
                         key_metadata: Some(Vec::new()),
                         split_offsets: vec![4],
-                        equality_ids: Vec::new(),
+                        equality_ids: None,
                         sort_order_id: None,
                         partition_spec_id: 0,
                         first_row_id: None,
@@ -574,7 +574,7 @@ mod tests {
                         upper_bounds: HashMap::new(),
                         key_metadata: Some(Vec::new()),
                         split_offsets: vec![4],
-                        equality_ids: Vec::new(),
+                        equality_ids: None,
                         sort_order_id: None,
                         partition_spec_id: 0,
                         first_row_id: None,
@@ -603,7 +603,7 @@ mod tests {
                         upper_bounds: HashMap::new(),
                         key_metadata: Some(Vec::new()),
                         split_offsets: vec![4],
-                        equality_ids: Vec::new(),
+                        equality_ids: None,
                         sort_order_id: None,
                         partition_spec_id: 0,
                         first_row_id: None,
diff --git a/crates/iceberg/src/spec/snapshot_summary.rs 
b/crates/iceberg/src/spec/snapshot_summary.rs
index e374e567..1b07ce3f 100644
--- a/crates/iceberg/src/spec/snapshot_summary.rs
+++ b/crates/iceberg/src/spec/snapshot_summary.rs
@@ -768,7 +768,7 @@ mod tests {
             ]),
             key_metadata: None,
             split_offsets: vec![4],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: Some(0),
             partition_spec_id: 0,
             first_row_id: None,
@@ -800,7 +800,7 @@ mod tests {
             ]),
             key_metadata: None,
             split_offsets: vec![4],
-            equality_ids: vec![],
+            equality_ids: None,
             sort_order_id: Some(0),
             partition_spec_id: 0,
             first_row_id: None,
@@ -910,7 +910,7 @@ mod tests {
                 upper_bounds: HashMap::new(),
                 key_metadata: None,
                 split_offsets: vec![],
-                equality_ids: vec![],
+                equality_ids: None,
                 sort_order_id: None,
                 partition_spec_id: 0,
                 first_row_id: None,
@@ -938,7 +938,7 @@ mod tests {
                 upper_bounds: HashMap::new(),
                 key_metadata: None,
                 split_offsets: vec![],
-                equality_ids: vec![],
+                equality_ids: None,
                 sort_order_id: None,
                 partition_spec_id: 0,
                 first_row_id: None,
@@ -992,7 +992,7 @@ mod tests {
                 upper_bounds: HashMap::new(),
                 key_metadata: None,
                 split_offsets: vec![],
-                equality_ids: vec![],
+                equality_ids: None,
                 sort_order_id: None,
                 partition_spec_id: 0,
                 first_row_id: None,
diff --git a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs 
b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
index f710544d..765ff1ca 100644
--- a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
+++ b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
@@ -156,7 +156,7 @@ impl<B: FileWriterBuilder> IcebergWriter for 
EqualityDeleteFileWriter<B> {
                 .into_iter()
                 .map(|mut res| {
                     res.content(crate::spec::DataContentType::EqualityDeletes);
-                    
res.equality_ids(self.equality_ids.iter().copied().collect_vec());
+                    
res.equality_ids(Some(self.equality_ids.iter().copied().collect_vec()));
                     res.partition(self.partition_value.clone());
                     res.partition_spec_id(self.partition_spec_id);
                     res.build().expect("msg")

Reply via email to