This is an automated email from the ASF dual-hosted git repository.
liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new b4bc6dd1 feat: Add deletion vector related fields in spec types (#1276)
b4bc6dd1 is described below
commit b4bc6dd15ae4f7a4b1c9ce555e3458089a83e228
Author: dentiny <[email protected]>
AuthorDate: Wed Apr 30 22:50:16 2025 -0700
feat: Add deletion vector related fields in spec types (#1276)
---
.../src/expr/visitors/expression_evaluator.rs | 8 +
.../expr/visitors/inclusive_metrics_evaluator.rs | 24 ++
.../src/expr/visitors/strict_metrics_evaluator.rs | 16 ++
crates/iceberg/src/spec/manifest/_serde.rs | 18 +-
crates/iceberg/src/spec/manifest/data_file.rs | 53 ++++-
crates/iceberg/src/spec/manifest/entry.rs | 44 ++++
crates/iceberg/src/spec/manifest/mod.rs | 244 ++++++++++++---------
crates/iceberg/src/spec/manifest/writer.rs | 18 +-
crates/iceberg/src/spec/snapshot_summary.rs | 20 ++
9 files changed, 335 insertions(+), 110 deletions(-)
diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs
b/crates/iceberg/src/expr/visitors/expression_evaluator.rs
index 561b3fb1..4715b164 100644
--- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs
@@ -347,6 +347,10 @@ mod tests {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
@@ -371,6 +375,10 @@ mod tests {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
index d34bb99b..30235998 100644
--- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
@@ -1997,6 +1997,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
@@ -2019,6 +2023,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
@@ -2077,6 +2085,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
fn get_test_file_2() -> DataFile {
@@ -2104,6 +2116,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
@@ -2132,6 +2148,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
@@ -2160,6 +2180,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
}
diff --git a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
index 4fba3eaf..841b743e 100644
--- a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
@@ -582,6 +582,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
@@ -604,6 +608,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
@@ -626,6 +634,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
@@ -649,6 +661,10 @@ mod test {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}
diff --git a/crates/iceberg/src/spec/manifest/_serde.rs
b/crates/iceberg/src/spec/manifest/_serde.rs
index 74c1faf8..951480e1 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -119,6 +119,10 @@ pub(super) struct DataFileSerde {
#[serde(default)]
equality_ids: Option<Vec<i32>>,
sort_order_id: Option<i32>,
+ first_row_id: Option<i64>,
+ referenced_data_file: Option<String>,
+ content_offset: Option<i64>,
+ content_size_in_bytes: Option<i64>,
}
impl DataFileSerde {
@@ -149,6 +153,10 @@ impl DataFileSerde {
split_offsets: Some(value.split_offsets),
equality_ids: Some(value.equality_ids),
sort_order_id: value.sort_order_id,
+ first_row_id: value.first_row_id,
+ referenced_data_file: value.referenced_data_file,
+ content_offset: value.content_offset,
+ content_size_in_bytes: value.content_size_in_bytes,
})
}
@@ -215,6 +223,10 @@ impl DataFileSerde {
equality_ids: self.equality_ids.unwrap_or_default(),
sort_order_id: self.sort_order_id,
partition_spec_id,
+ first_row_id: self.first_row_id,
+ referenced_data_file: self.referenced_data_file,
+ content_offset: self.content_offset,
+ content_size_in_bytes: self.content_size_in_bytes,
})
}
}
@@ -359,7 +371,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: vec![],
sort_order_id: Some(0),
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}];
let mut buffer = Vec::new();
diff --git a/crates/iceberg/src/spec/manifest/data_file.rs
b/crates/iceberg/src/spec/manifest/data_file.rs
index df03b646..cd867b10 100644
--- a/crates/iceberg/src/spec/manifest/data_file.rs
+++ b/crates/iceberg/src/spec/manifest/data_file.rs
@@ -43,7 +43,7 @@ pub struct DataFile {
pub(crate) file_path: String,
/// field id: 101
///
- /// String file format name, avro, orc or parquet
+ /// String file format name, `avro`, `orc`, `parquet`, or `puffin`
pub(crate) file_format: DataFileFormat,
/// field id: 102
///
@@ -52,7 +52,7 @@ pub struct DataFile {
pub(crate) partition: Struct,
/// field id: 103
///
- /// Number of records in this file
+ /// Number of records in this file, or the cardinality of a deletion vector
pub(crate) record_count: u64,
/// field id: 104
///
@@ -148,9 +148,35 @@ pub struct DataFile {
/// delete files.
#[builder(default, setter(strip_option))]
pub(crate) sort_order_id: Option<i32>,
+ /// field id: 142
+ ///
+ /// The _row_id for the first row in the data file.
+ /// For more details, refer to
https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance
+ #[builder(default)]
+ pub(crate) first_row_id: Option<i64>,
/// This field is not included in spec. It is just store in memory
representation used
/// in process.
pub(crate) partition_spec_id: i32,
+ /// field id: 143
+ ///
+ /// Fully qualified location (URI with FS scheme) of a data file that all
deletes reference.
+ /// Position delete metadata can use `referenced_data_file` when all
deletes tracked by the
+ /// entry are in a single data file. Setting the referenced file is
required for deletion vectors.
+ #[builder(default)]
+ pub(crate) referenced_data_file: Option<String>,
+ /// field id: 144
+ ///
+ /// The offset in the file where the content starts.
+ /// The `content_offset` and `content_size_in_bytes` fields are used to
reference a specific blob
+ /// for direct access to a deletion vector. For deletion vectors, these
values are required and must
+ /// exactly match the `offset` and `length` stored in the Puffin footer
for the deletion vector blob.
+ #[builder(default)]
+ pub(crate) content_offset: Option<i64>,
+ /// field id: 145
+ ///
+ /// The length of the referenced content stored in the file; required if
`content_offset` is present
+ #[builder(default)]
+ pub(crate) content_size_in_bytes: Option<i64>,
}
impl DataFile {
@@ -226,6 +252,10 @@ impl DataFile {
pub fn equality_ids(&self) -> &[i32] {
&self.equality_ids
}
+ /// Get the first row id in the data file.
+ pub fn first_row_id(&self) -> Option<i64> {
+ self.first_row_id
+ }
/// Get the sort order id of the data file.
/// Only data files and equality delete files should be
/// written with a non-null order id. Position deletes are required to be
@@ -235,6 +265,21 @@ impl DataFile {
pub fn sort_order_id(&self) -> Option<i32> {
self.sort_order_id
}
+ /// Get the fully qualified referenced location for the corresponding data
file.
+ /// Positional delete files could have the field set, and deletion vectors
must have the field set.
+ pub fn referenced_data_file(&self) -> Option<String> {
+ self.referenced_data_file.clone()
+ }
+ /// Get the offset in the file where the blob content starts.
+ /// Only meaningful for puffin blobs, and required for deletion vectors.
+ pub fn content_offset(&self) -> Option<i64> {
+ self.content_offset
+ }
+ /// Get the length of a puffin blob.
+ /// Only meaningful for puffin blobs, and required for deletion vectors.
+ pub fn content_size_in_bytes(&self) -> Option<i64> {
+ self.content_size_in_bytes
+ }
}
/// Convert data files to avro bytes and write to writer.
@@ -323,6 +368,8 @@ pub enum DataFileFormat {
Orc,
/// Parquet file format: <https://parquet.apache.org/>
Parquet,
+ /// Puffin file format: <https://iceberg.apache.org/puffin-spec/>
+ Puffin,
}
impl FromStr for DataFileFormat {
@@ -333,6 +380,7 @@ impl FromStr for DataFileFormat {
"avro" => Ok(Self::Avro),
"orc" => Ok(Self::Orc),
"parquet" => Ok(Self::Parquet),
+ "puffin" => Ok(Self::Puffin),
_ => Err(Error::new(
ErrorKind::DataInvalid,
format!("Unsupported data file format: {}", s),
@@ -347,6 +395,7 @@ impl std::fmt::Display for DataFileFormat {
DataFileFormat::Avro => write!(f, "avro"),
DataFileFormat::Orc => write!(f, "orc"),
DataFileFormat::Parquet => write!(f, "parquet"),
+ DataFileFormat::Puffin => write!(f, "puffin"),
}
}
}
diff --git a/crates/iceberg/src/spec/manifest/entry.rs
b/crates/iceberg/src/spec/manifest/entry.rs
index f02b3014..85022a11 100644
--- a/crates/iceberg/src/spec/manifest/entry.rs
+++ b/crates/iceberg/src/spec/manifest/entry.rs
@@ -469,6 +469,46 @@ static SORT_ORDER_ID: Lazy<NestedFieldRef> = {
})
};
+static FIRST_ROW_ID: Lazy<NestedFieldRef> = {
+ Lazy::new(|| {
+ Arc::new(NestedField::optional(
+ 142,
+ "first_row_id",
+ Type::Primitive(PrimitiveType::Long),
+ ))
+ })
+};
+
+static REFERENCE_DATA_FILE: Lazy<NestedFieldRef> = {
+ Lazy::new(|| {
+ Arc::new(NestedField::optional(
+ 143,
+ "referenced_data_file",
+ Type::Primitive(PrimitiveType::String),
+ ))
+ })
+};
+
+static CONTENT_OFFSET: Lazy<NestedFieldRef> = {
+ Lazy::new(|| {
+ Arc::new(NestedField::optional(
+ 144,
+ "content_offset",
+ Type::Primitive(PrimitiveType::Long),
+ ))
+ })
+};
+
+static CONTENT_SIZE_IN_BYTES: Lazy<NestedFieldRef> = {
+ Lazy::new(|| {
+ Arc::new(NestedField::optional(
+ 145,
+ "content_size_in_bytes",
+ Type::Primitive(PrimitiveType::Long),
+ ))
+ })
+};
+
fn data_file_fields_v2(partition_type: &StructType) -> Vec<NestedFieldRef> {
vec![
CONTENT.clone(),
@@ -491,6 +531,10 @@ fn data_file_fields_v2(partition_type: &StructType) ->
Vec<NestedFieldRef> {
SPLIT_OFFSETS.clone(),
EQUALITY_IDS.clone(),
SORT_ORDER_ID.clone(),
+ FIRST_ROW_ID.clone(),
+ REFERENCE_DATA_FILE.clone(),
+ CONTENT_OFFSET.clone(),
+ CONTENT_SIZE_IN_BYTES.clone(),
]
}
diff --git a/crates/iceberg/src/spec/manifest/mod.rs
b/crates/iceberg/src/spec/manifest/mod.rs
index eec7fcaf..5b53abae 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -215,7 +215,7 @@ mod tests {
snapshot_id: None,
sequence_number: None,
file_sequence_number: None,
- data_file: DataFile
{content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]
[...]
+ data_file: DataFile
{content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]
[...]
}
];
@@ -396,7 +396,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: vec![],
sort_order_id: None,
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
}];
@@ -489,7 +493,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: vec![],
sort_order_id: Some(0),
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
}];
@@ -593,7 +601,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: vec![],
sort_order_id: Some(0),
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
}
];
@@ -697,7 +709,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: vec![],
sort_order_id: None,
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
}];
@@ -784,7 +800,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: vec![],
sort_order_id: None,
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
})],
};
@@ -834,6 +854,41 @@ mod tests {
format_version: FormatVersion::V2,
};
let entries = vec![
+ ManifestEntry {
+ status: ManifestStatus::Added,
+ snapshot_id: None,
+ sequence_number: None,
+ file_sequence_number: None,
+ data_file: DataFile {
+ content: DataContentType::Data,
+ file_path:
"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),
+ file_format: DataFileFormat::Parquet,
+ partition: Struct::from_iter(
+ vec![
+ Some(Literal::int(2021)),
+ Some(Literal::float(1.0)),
+ Some(Literal::double(2.0)),
+ ]
+ ),
+ record_count: 1,
+ file_size_in_bytes: 5442,
+ column_sizes:
HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),
+ value_counts:
HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),
+ null_value_counts:
HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),
+ nan_value_counts: HashMap::new(),
+ lower_bounds: HashMap::new(),
+ upper_bounds: HashMap::new(),
+ key_metadata: None,
+ split_offsets: vec![4],
+ equality_ids: Vec::new(),
+ sort_order_id: None,
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
+ }
+ },
ManifestEntry {
status: ManifestStatus::Added,
snapshot_id: None,
@@ -845,9 +900,9 @@ mod tests {
file_format: DataFileFormat::Parquet,
partition: Struct::from_iter(
vec![
- Some(Literal::int(2021)),
- Some(Literal::float(1.0)),
- Some(Literal::double(2.0)),
+ Some(Literal::int(1111)),
+ Some(Literal::float(15.5)),
+ Some(Literal::double(25.5)),
]
),
record_count: 1,
@@ -862,103 +917,84 @@ mod tests {
split_offsets: vec![4],
equality_ids: Vec::new(),
sort_order_id: None,
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
}
},
- ManifestEntry {
- status: ManifestStatus::Added,
- snapshot_id: None,
- sequence_number: None,
- file_sequence_number: None,
- data_file: DataFile {
- content: DataContentType::Data,
- file_path:
"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),
- file_format: DataFileFormat::Parquet,
- partition: Struct::from_iter(
- vec![
- Some(Literal::int(1111)),
- Some(Literal::float(15.5)),
- Some(Literal::double(25.5)),
- ]
- ),
- record_count: 1,
- file_size_in_bytes: 5442,
- column_sizes:
HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),
- value_counts:
HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),
- null_value_counts:
HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),
- nan_value_counts: HashMap::new(),
- lower_bounds: HashMap::new(),
- upper_bounds: HashMap::new(),
- key_metadata: None,
- split_offsets: vec![4],
- equality_ids: Vec::new(),
- sort_order_id: None,
- partition_spec_id: 0
- }
- },
- ManifestEntry {
- status: ManifestStatus::Added,
- snapshot_id: None,
- sequence_number: None,
- file_sequence_number: None,
- data_file: DataFile {
- content: DataContentType::Data,
- file_path:
"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),
- file_format: DataFileFormat::Parquet,
- partition: Struct::from_iter(
- vec![
- Some(Literal::int(1211)),
- Some(Literal::float(f32::NAN)),
- Some(Literal::double(1.0)),
- ]
- ),
- record_count: 1,
- file_size_in_bytes: 5442,
- column_sizes:
HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),
- value_counts:
HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),
- null_value_counts:
HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),
- nan_value_counts: HashMap::new(),
- lower_bounds: HashMap::new(),
- upper_bounds: HashMap::new(),
- key_metadata: None,
- split_offsets: vec![4],
- equality_ids: Vec::new(),
- sort_order_id: None,
- partition_spec_id: 0
- }
- },
- ManifestEntry {
- status: ManifestStatus::Added,
- snapshot_id: None,
- sequence_number: None,
- file_sequence_number: None,
- data_file: DataFile {
- content: DataContentType::Data,
- file_path:
"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),
- file_format: DataFileFormat::Parquet,
- partition: Struct::from_iter(
- vec![
- Some(Literal::int(1111)),
- None,
- Some(Literal::double(11.0)),
- ]
- ),
- record_count: 1,
- file_size_in_bytes: 5442,
- column_sizes:
HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),
- value_counts:
HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),
- null_value_counts:
HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),
- nan_value_counts: HashMap::new(),
- lower_bounds: HashMap::new(),
- upper_bounds: HashMap::new(),
- key_metadata: None,
- split_offsets: vec![4],
- equality_ids: Vec::new(),
- sort_order_id: None,
- partition_spec_id: 0
- }
- },
- ];
+ ManifestEntry {
+ status: ManifestStatus::Added,
+ snapshot_id: None,
+ sequence_number: None,
+ file_sequence_number: None,
+ data_file: DataFile {
+ content: DataContentType::Data,
+ file_path:
"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),
+ file_format: DataFileFormat::Parquet,
+ partition: Struct::from_iter(
+ vec![
+ Some(Literal::int(1211)),
+ Some(Literal::float(f32::NAN)),
+ Some(Literal::double(1.0)),
+ ]
+ ),
+ record_count: 1,
+ file_size_in_bytes: 5442,
+ column_sizes:
HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),
+ value_counts:
HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),
+ null_value_counts:
HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),
+ nan_value_counts: HashMap::new(),
+ lower_bounds: HashMap::new(),
+ upper_bounds: HashMap::new(),
+ key_metadata: None,
+ split_offsets: vec![4],
+ equality_ids: Vec::new(),
+ sort_order_id: None,
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
+ }
+ },
+ ManifestEntry {
+ status: ManifestStatus::Added,
+ snapshot_id: None,
+ sequence_number: None,
+ file_sequence_number: None,
+ data_file: DataFile {
+ content: DataContentType::Data,
+ file_path:
"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),
+ file_format: DataFileFormat::Parquet,
+ partition: Struct::from_iter(
+ vec![
+ Some(Literal::int(1111)),
+ None,
+ Some(Literal::double(11.0)),
+ ]
+ ),
+ record_count: 1,
+ file_size_in_bytes: 5442,
+ column_sizes:
HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),
+ value_counts:
HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),
+ null_value_counts:
HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),
+ nan_value_counts: HashMap::new(),
+ lower_bounds: HashMap::new(),
+ upper_bounds: HashMap::new(),
+ key_metadata: None,
+ split_offsets: vec![4],
+ equality_ids: Vec::new(),
+ sort_order_id: None,
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
+ }
+ },
+ ];
// write manifest to file
let tmp_dir = TempDir::new().unwrap();
diff --git a/crates/iceberg/src/spec/manifest/writer.rs
b/crates/iceberg/src/spec/manifest/writer.rs
index a2232451..35d9acb6 100644
--- a/crates/iceberg/src/spec/manifest/writer.rs
+++ b/crates/iceberg/src/spec/manifest/writer.rs
@@ -542,7 +542,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: Vec::new(),
sort_order_id: None,
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
},
ManifestEntry {
@@ -567,7 +571,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: Vec::new(),
sort_order_id: None,
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
},
ManifestEntry {
@@ -592,7 +600,11 @@ mod tests {
split_offsets: vec![4],
equality_ids: Vec::new(),
sort_order_id: None,
- partition_spec_id: 0
+ partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
},
];
diff --git a/crates/iceberg/src/spec/snapshot_summary.rs
b/crates/iceberg/src/spec/snapshot_summary.rs
index 05f9fb8e..3e123229 100644
--- a/crates/iceberg/src/spec/snapshot_summary.rs
+++ b/crates/iceberg/src/spec/snapshot_summary.rs
@@ -769,6 +769,10 @@ mod tests {
equality_ids: vec![],
sort_order_id: Some(0),
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
};
let file2 = DataFile {
@@ -797,6 +801,10 @@ mod tests {
equality_ids: vec![],
sort_order_id: Some(0),
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
};
collector.add_file(&file1, schema.clone(), partition_spec.clone());
@@ -901,6 +909,10 @@ mod tests {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
schema.clone(),
partition_spec.clone(),
@@ -925,6 +937,10 @@ mod tests {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
schema.clone(),
partition_spec.clone(),
@@ -975,6 +991,10 @@ mod tests {
equality_ids: vec![],
sort_order_id: None,
partition_spec_id: 0,
+ first_row_id: None,
+ referenced_data_file: None,
+ content_offset: None,
+ content_size_in_bytes: None,
},
schema.clone(),
partition_spec.clone(),