liurenjie1024 commented on code in PR #1851:
URL: https://github.com/apache/iceberg-rust/pull/1851#discussion_r2549282238


##########
crates/iceberg/src/spec/avro_util.rs:
##########
@@ -0,0 +1,310 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for working with Apache Avro in Iceberg.
+
+use apache_avro::Codec;
+use log::warn;
+
+use crate::spec::TableProperties;
+
+/// Settings for compression codec and level.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CompressionSettings {

Review Comment:
   I feel this struct is complete unnecessary. We should just add a 
`avro_write_codec: Codec` in `TableProperties`, and everything else should just 
use that field.



##########
crates/iceberg/src/spec/table_properties.rs:
##########
@@ -137,11 +148,21 @@ impl TableProperties {
     pub const PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES: &str = 
"write.target-file-size-bytes";
     /// Default target file size
     pub const PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT: usize = 512 * 
1024 * 1024; // 512 MB
+
+    /// Compression codec for Avro files (manifests, manifest lists)
+    pub const PROPERTY_AVRO_COMPRESSION_CODEC: &str = 
"write.avro.compression-codec";
+    /// Default Avro compression codec - gzip
+    pub const PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT: &str = "gzip";
+
+    /// Compression level for Avro files
+    pub const PROPERTY_AVRO_COMPRESSION_LEVEL: &str = 
"write.avro.compression-level";
+    /// Default Avro compression level (None, uses codec-specific defaults: 
gzip=9, zstd=1)
+    pub const PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT: Option<u8> = None;

Review Comment:
   The spec says there is no default value, I don't think we need this one.



##########
crates/iceberg/src/spec/avro_util.rs:
##########
@@ -0,0 +1,310 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for working with Apache Avro in Iceberg.
+
+use apache_avro::Codec;
+use log::warn;
+
+use crate::spec::TableProperties;
+
+/// Settings for compression codec and level.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CompressionSettings {
+    /// The compression codec name (e.g., "gzip", "zstd", "snappy", 
"uncompressed")
+    pub codec: String,
+    /// The compression level (None uses codec-specific defaults: gzip=9, 
zstd=1)
+    pub level: Option<u8>,
+}
+
+impl CompressionSettings {
+    /// Create a new CompressionSettings with the specified codec and level.
+    pub fn new(codec: String, level: Option<u8>) -> Self {
+        Self { codec, level }
+    }
+
+    /// Convert to apache_avro::Codec using the codec_from_str helper function.
+    pub(crate) fn to_codec(&self) -> Codec {
+        codec_from_str(Some(&self.codec), self.level)
+    }
+}
+
+impl Default for CompressionSettings {
+    fn default() -> Self {
+        Self {
+            codec: 
TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(),
+            level: None,
+        }
+    }
+}
+
+/// Convert codec name and level to apache_avro::Codec.
+/// Returns Codec::Null for unknown or unsupported codecs.
+///
+/// # Arguments
+///
+/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", 
"snappy", "uncompressed")
+/// * `level` - The compression level (None uses codec defaults: gzip=9, 
zstd=1). For gzip:
+///   - 0: NoCompression
+///   - 1: BestSpeed
+///   - 9: BestCompression
+///   - 10: UberCompression
+///   - Other values: DefaultLevel (6)
+///
+/// # Supported Codecs
+///
+/// - `gzip`: Uses Deflate compression with specified level (default: 9)
+/// - `zstd`: Uses Zstandard compression (default: 1, level clamped to valid 
zstd range 0-22)
+/// - `snappy`: Uses Snappy compression (level parameter ignored)
+/// - `uncompressed` or `None`: No compression
+/// - Any other value: Defaults to no compression (Codec::Null)
+///
+/// # Compression Levels
+///
+/// The compression level mapping is based on miniz_oxide's CompressionLevel 
enum:
+/// - Level 0: No compression
+/// - Level 1: Best speed (fastest)
+/// - Level 9: Best compression (slower, better compression)
+/// - Level 10: Uber compression (slowest, best compression)
+/// - Other: Default level (balanced speed/compression)
+pub(crate) fn codec_from_str(codec: Option<&str>, level: Option<u8>) -> Codec {
+    use apache_avro::{DeflateSettings, ZstandardSettings};
+
+    // Use case-insensitive comparison to match Java implementation
+    match codec.map(|s| s.to_lowercase()).as_deref() {
+        Some("gzip") => {
+            // Map compression level to miniz_oxide::deflate::CompressionLevel
+            // Reference: 
https://docs.rs/miniz_oxide/latest/miniz_oxide/deflate/enum.CompressionLevel.html
+            // Default level for gzip/deflate is 9 (BestCompression) to match 
Java
+            use miniz_oxide::deflate::CompressionLevel;
+
+            let compression_level = match level.unwrap_or(9) {

Review Comment:
   Ditto.



##########
crates/iceberg/src/spec/table_properties.rs:
##########
@@ -49,6 +56,10 @@ pub struct TableProperties {
     pub write_format_default: String,
     /// The target file size for files.
     pub write_target_file_size_bytes: usize,
+    /// Compression codec for Avro files (manifests, manifest lists)
+    pub avro_compression_codec: String,

Review Comment:
   ```suggestion
       pub avro_compression_codec: Codec,
   ```



##########
crates/iceberg/src/spec/manifest/writer.rs:
##########
@@ -43,6 +44,7 @@ pub struct ManifestWriterBuilder {
     key_metadata: Option<Vec<u8>>,
     schema: SchemaRef,
     partition_spec: PartitionSpec,
+    compression: CompressionSettings,

Review Comment:
   This should be just `Codec`.



##########
Cargo.toml:
##########
@@ -40,7 +40,7 @@ rust-version = "1.87"
 
 [workspace.dependencies]
 anyhow = "1.0.72"
-apache-avro = { version = "0.20", features = ["zstandard"] }
+apache-avro = { version = "0.20", features = ["zstandard", "snappy"] }

Review Comment:
   Why we need this?



##########
crates/iceberg/Cargo.toml:
##########
@@ -67,6 +67,8 @@ flate2 = { workspace = true }
 fnv = { workspace = true }
 futures = { workspace = true }
 itertools = { workspace = true }
+log = { workspace = true }
+miniz_oxide = "0.8"

Review Comment:
   We manage dependencies version in root workspace.



##########
crates/iceberg/src/spec/avro_util.rs:
##########
@@ -0,0 +1,310 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for working with Apache Avro in Iceberg.
+
+use apache_avro::Codec;
+use log::warn;
+
+use crate::spec::TableProperties;
+
+/// Settings for compression codec and level.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CompressionSettings {
+    /// The compression codec name (e.g., "gzip", "zstd", "snappy", 
"uncompressed")
+    pub codec: String,
+    /// The compression level (None uses codec-specific defaults: gzip=9, 
zstd=1)
+    pub level: Option<u8>,
+}
+
+impl CompressionSettings {
+    /// Create a new CompressionSettings with the specified codec and level.
+    pub fn new(codec: String, level: Option<u8>) -> Self {
+        Self { codec, level }
+    }
+
+    /// Convert to apache_avro::Codec using the codec_from_str helper function.
+    pub(crate) fn to_codec(&self) -> Codec {
+        codec_from_str(Some(&self.codec), self.level)
+    }
+}
+
+impl Default for CompressionSettings {
+    fn default() -> Self {
+        Self {
+            codec: 
TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(),
+            level: None,
+        }
+    }
+}
+
+/// Convert codec name and level to apache_avro::Codec.
+/// Returns Codec::Null for unknown or unsupported codecs.
+///
+/// # Arguments
+///
+/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", 
"snappy", "uncompressed")
+/// * `level` - The compression level (None uses codec defaults: gzip=9, 
zstd=1). For gzip:
+///   - 0: NoCompression
+///   - 1: BestSpeed
+///   - 9: BestCompression
+///   - 10: UberCompression
+///   - Other values: DefaultLevel (6)
+///
+/// # Supported Codecs
+///
+/// - `gzip`: Uses Deflate compression with specified level (default: 9)
+/// - `zstd`: Uses Zstandard compression (default: 1, level clamped to valid 
zstd range 0-22)
+/// - `snappy`: Uses Snappy compression (level parameter ignored)
+/// - `uncompressed` or `None`: No compression
+/// - Any other value: Defaults to no compression (Codec::Null)
+///
+/// # Compression Levels
+///
+/// The compression level mapping is based on miniz_oxide's CompressionLevel 
enum:
+/// - Level 0: No compression
+/// - Level 1: Best speed (fastest)
+/// - Level 9: Best compression (slower, better compression)
+/// - Level 10: Uber compression (slowest, best compression)
+/// - Other: Default level (balanced speed/compression)
+pub(crate) fn codec_from_str(codec: Option<&str>, level: Option<u8>) -> Codec {
+    use apache_avro::{DeflateSettings, ZstandardSettings};
+
+    // Use case-insensitive comparison to match Java implementation
+    match codec.map(|s| s.to_lowercase()).as_deref() {
+        Some("gzip") => {

Review Comment:
   Use constants rather than raw string.



##########
crates/iceberg/src/spec/avro_util.rs:
##########
@@ -0,0 +1,310 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for working with Apache Avro in Iceberg.
+
+use apache_avro::Codec;
+use log::warn;
+
+use crate::spec::TableProperties;
+
+/// Settings for compression codec and level.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CompressionSettings {
+    /// The compression codec name (e.g., "gzip", "zstd", "snappy", 
"uncompressed")
+    pub codec: String,
+    /// The compression level (None uses codec-specific defaults: gzip=9, 
zstd=1)
+    pub level: Option<u8>,
+}
+
+impl CompressionSettings {
+    /// Create a new CompressionSettings with the specified codec and level.
+    pub fn new(codec: String, level: Option<u8>) -> Self {
+        Self { codec, level }
+    }
+
+    /// Convert to apache_avro::Codec using the codec_from_str helper function.
+    pub(crate) fn to_codec(&self) -> Codec {
+        codec_from_str(Some(&self.codec), self.level)
+    }
+}
+
+impl Default for CompressionSettings {
+    fn default() -> Self {
+        Self {
+            codec: 
TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(),
+            level: None,
+        }
+    }
+}
+
+/// Convert codec name and level to apache_avro::Codec.
+/// Returns Codec::Null for unknown or unsupported codecs.
+///
+/// # Arguments
+///
+/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", 
"snappy", "uncompressed")
+/// * `level` - The compression level (None uses codec defaults: gzip=9, 
zstd=1). For gzip:
+///   - 0: NoCompression
+///   - 1: BestSpeed
+///   - 9: BestCompression
+///   - 10: UberCompression
+///   - Other values: DefaultLevel (6)
+///
+/// # Supported Codecs
+///
+/// - `gzip`: Uses Deflate compression with specified level (default: 9)
+/// - `zstd`: Uses Zstandard compression (default: 1, level clamped to valid 
zstd range 0-22)
+/// - `snappy`: Uses Snappy compression (level parameter ignored)
+/// - `uncompressed` or `None`: No compression
+/// - Any other value: Defaults to no compression (Codec::Null)
+///
+/// # Compression Levels
+///
+/// The compression level mapping is based on miniz_oxide's CompressionLevel 
enum:
+/// - Level 0: No compression
+/// - Level 1: Best speed (fastest)
+/// - Level 9: Best compression (slower, better compression)
+/// - Level 10: Uber compression (slowest, best compression)
+/// - Other: Default level (balanced speed/compression)
+pub(crate) fn codec_from_str(codec: Option<&str>, level: Option<u8>) -> Codec {
+    use apache_avro::{DeflateSettings, ZstandardSettings};
+
+    // Use case-insensitive comparison to match Java implementation
+    match codec.map(|s| s.to_lowercase()).as_deref() {
+        Some("gzip") => {
+            // Map compression level to miniz_oxide::deflate::CompressionLevel
+            // Reference: 
https://docs.rs/miniz_oxide/latest/miniz_oxide/deflate/enum.CompressionLevel.html
+            // Default level for gzip/deflate is 9 (BestCompression) to match 
Java
+            use miniz_oxide::deflate::CompressionLevel;

Review Comment:
   Why put importing here?



##########
crates/iceberg/src/spec/table_properties.rs:
##########
@@ -49,6 +56,10 @@ pub struct TableProperties {
     pub write_format_default: String,
     /// The target file size for files.
     pub write_target_file_size_bytes: usize,
+    /// Compression codec for Avro files (manifests, manifest lists)
+    pub avro_compression_codec: String,
+    /// Compression level for Avro files (None uses codec-specific defaults: 
gzip=9, zstd=1)
+    pub avro_compression_level: Option<u8>,

Review Comment:
   We don't need this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to