This is an automated email from the ASF dual-hosted git repository.

liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git


The following commit(s) were added to refs/heads/main by this push:
     new 76d8e2d3 fix: support reading compressed metadata (#1802)
76d8e2d3 is described below

commit 76d8e2d31e36d4c5013f24f8bac51eaab1d9c1db
Author: Colin Marc <[email protected]>
AuthorDate: Wed Nov 5 11:27:04 2025 +0100

    fix: support reading compressed metadata (#1802)
    
    The spec mentions this naming convention here:
    
    
    https://iceberg.apache.org/spec/#naming-for-gzip-compressed-metadata-json-files
    
    ## Which issue does this PR close?
    
    
    - Closes #1801
    
    ## What changes are included in this PR?
    
    Support for reading compressed metadata.
    
    ## Are these changes tested?
    
    Yes.
    
    Co-authored-by: Renjie Liu <[email protected]>
---
 Cargo.lock                                |  1 +
 Cargo.toml                                |  1 +
 crates/iceberg/Cargo.toml                 |  1 +
 crates/iceberg/src/spec/table_metadata.rs | 50 ++++++++++++++++++++++++++++++-
 crates/iceberg/src/table.rs               |  4 +--
 5 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 38d284e5..2edc51a4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3516,6 +3516,7 @@ dependencies = [
  "ctor",
  "derive_builder",
  "expect-test",
+ "flate2",
  "fnv",
  "futures",
  "iceberg_test_utils",
diff --git a/Cargo.toml b/Cargo.toml
index 5161810b..c10c01d9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -71,6 +71,7 @@ enum-ordinalize = "4.3.0"
 env_logger = "0.11.8"
 expect-test = "1"
 faststr = "0.2.31"
+flate2 = "1.1.5"
 fnv = "1.0.7"
 fs-err = "3.1.0"
 futures = "0.3"
diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml
index c9b1bb31..895a5cf5 100644
--- a/crates/iceberg/Cargo.toml
+++ b/crates/iceberg/Cargo.toml
@@ -63,6 +63,7 @@ bytes = { workspace = true }
 chrono = { workspace = true }
 derive_builder = { workspace = true }
 expect-test = { workspace = true }
+flate2 = { workspace = true }
 fnv = { workspace = true }
 futures = { workspace = true }
 itertools = { workspace = true }
diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs
index 437b0df5..06b32cc8 100644
--- a/crates/iceberg/src/spec/table_metadata.rs
+++ b/crates/iceberg/src/spec/table_metadata.rs
@@ -22,10 +22,12 @@ use std::cmp::Ordering;
 use std::collections::HashMap;
 use std::fmt::{Display, Formatter};
 use std::hash::Hash;
+use std::io::Read as _;
 use std::sync::Arc;
 
 use _serde::TableMetadataEnum;
 use chrono::{DateTime, Utc};
+use flate2::read::GzDecoder;
 use serde::{Deserialize, Serialize};
 use serde_repr::{Deserialize_repr, Serialize_repr};
 use uuid::Uuid;
@@ -426,9 +428,30 @@ impl TableMetadata {
         file_io: &FileIO,
         metadata_location: impl AsRef<str>,
     ) -> Result<TableMetadata> {
+        let metadata_location = metadata_location.as_ref();
         let input_file = file_io.new_input(metadata_location)?;
         let metadata_content = input_file.read().await?;
-        let metadata = serde_json::from_slice::<TableMetadata>(&metadata_content)?;
+
+        // Check if the file is compressed by looking for the gzip "magic number".
+        let metadata = if metadata_content.len() > 2
+            && metadata_content[0] == 0x1F
+            && metadata_content[1] == 0x8B
+        {
+            let mut decoder = GzDecoder::new(metadata_content.as_ref());
+            let mut decompressed_data = Vec::new();
+            decoder.read_to_end(&mut decompressed_data).map_err(|e| {
+                Error::new(
+                    ErrorKind::DataInvalid,
+                    "Trying to read compressed metadata file",
+                )
+                .with_context("file_path", metadata_location)
+                .with_source(e)
+            })?;
+            serde_json::from_slice(&decompressed_data)?
+        } else {
+            serde_json::from_slice(&metadata_content)?
+        };
+
         Ok(metadata)
     }
 
@@ -1516,6 +1539,7 @@ impl SnapshotLog {
 mod tests {
     use std::collections::HashMap;
     use std::fs;
+    use std::io::Write as _;
     use std::sync::Arc;
 
     use anyhow::Result;
@@ -3524,6 +3548,30 @@ mod tests {
         assert_eq!(read_metadata, original_metadata);
     }
 
+    #[tokio::test]
+    async fn test_table_metadata_read_compressed() {
+        let temp_dir = TempDir::new().unwrap();
+        let metadata_location = temp_dir.path().join("v1.gz.metadata.json");
+
+        let original_metadata: TableMetadata = get_test_table_metadata("TableMetadataV2Valid.json");
+        let json = serde_json::to_string(&original_metadata).unwrap();
+
+        let mut encoder = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
+        encoder.write_all(json.as_bytes()).unwrap();
+        std::fs::write(&metadata_location, encoder.finish().unwrap())
+            .expect("failed to write metadata");
+
+        // Read the metadata back
+        let file_io = FileIOBuilder::new_fs_io().build().unwrap();
+        let metadata_location = metadata_location.to_str().unwrap();
+        let read_metadata = TableMetadata::read_from(&file_io, metadata_location)
+            .await
+            .unwrap();
+
+        // Verify the metadata matches
+        assert_eq!(read_metadata, original_metadata);
+    }
+
     #[tokio::test]
     async fn test_table_metadata_read_nonexistent_file() {
         // Create a FileIO instance
diff --git a/crates/iceberg/src/table.rs b/crates/iceberg/src/table.rs
index 80e10b2f..9c789e21 100644
--- a/crates/iceberg/src/table.rs
+++ b/crates/iceberg/src/table.rs
@@ -297,9 +297,7 @@ impl StaticTable {
         table_ident: TableIdent,
         file_io: FileIO,
     ) -> Result<Self> {
-        let metadata_file = file_io.new_input(metadata_location)?;
-        let metadata_file_content = metadata_file.read().await?;
-        let metadata = serde_json::from_slice::<TableMetadata>(&metadata_file_content)?;
+        let metadata = TableMetadata::read_from(&file_io, metadata_location).await?;
 
         let table = Table::builder()
             .metadata(metadata)

Reply via email to