This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 93c7a12cff Mark Encoding::BIT_PACKED as deprecated and document its 
compatibility issues (#5348)
93c7a12cff is described below

commit 93c7a12cff17cf485c87909ebef5f1095078bc3f
Author: Jörn Horstmann <[email protected]>
AuthorDate: Wed Jan 31 11:35:44 2024 +0100

    Mark Encoding::BIT_PACKED as deprecated and document its compatibility 
issues (#5348)
    
    * Mark Encoding::BIT_PACKED as deprecated and document its compatibility 
issues
    
    * Allow deprecated BIT_PACKED in parquet-layout binary
---
 parquet/src/arrow/record_reader/definition_levels.rs |  1 +
 parquet/src/basic.rs                                 | 17 ++++++++++++++++-
 parquet/src/bin/parquet-layout.rs                    |  1 +
 parquet/src/column/reader.rs                         |  1 +
 parquet/src/column/reader/decoder.rs                 |  1 +
 parquet/src/encodings/decoding.rs                    |  1 +
 parquet/src/encodings/encoding/mod.rs                |  1 +
 parquet/src/encodings/levels.rs                      |  2 ++
 parquet/src/file/serialized_reader.rs                |  4 +++-
 9 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/parquet/src/arrow/record_reader/definition_levels.rs 
b/parquet/src/arrow/record_reader/definition_levels.rs
index 87f531df09..fcd04fbb9b 100644
--- a/parquet/src/arrow/record_reader/definition_levels.rs
+++ b/parquet/src/arrow/record_reader/definition_levels.rs
@@ -276,6 +276,7 @@ impl PackedDecoder {
         self.packed_offset = 0;
         self.packed_count = match encoding {
             Encoding::RLE => 0,
+            #[allow(deprecated)]
             Encoding::BIT_PACKED => data.len() * 8,
             _ => unreachable!("invalid level encoding: {}", encoding),
         };
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 2327e1d84b..2dd252a589 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -256,10 +256,21 @@ pub enum Encoding {
     /// Usable for definition/repetition levels encoding and boolean values.
     RLE,
 
-    /// Bit packed encoding.
+    /// **Deprecated** Bit-packed encoding.
     ///
     /// This can only be used if the data has a known max width.
     /// Usable for definition/repetition levels encoding.
+    ///
+    /// There are compatibility issues with files using this encoding.
+    /// The parquet standard specifies the bits to be packed starting from the
+    /// most-significant bit, several implementations do not follow this bit 
order.
+    /// Several other implementations also have issues reading this encoding
+    /// because of incorrect assumptions about the length of the encoded data.
+    ///
+    /// The RLE/bit-packing hybrid is more cpu and memory efficient and should 
be used instead.
+    #[deprecated(
+        note = "Please see documentation for compatibility issues and use the 
RLE/bit-packing hybrid encoding instead"
+    )]
     BIT_PACKED,
 
     /// Delta encoding for integers, either INT32 or INT64.
@@ -301,6 +312,7 @@ impl FromStr for Encoding {
             "PLAIN" | "plain" => Ok(Encoding::PLAIN),
             "PLAIN_DICTIONARY" | "plain_dictionary" => 
Ok(Encoding::PLAIN_DICTIONARY),
             "RLE" | "rle" => Ok(Encoding::RLE),
+            #[allow(deprecated)]
             "BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED),
             "DELTA_BINARY_PACKED" | "delta_binary_packed" => 
Ok(Encoding::DELTA_BINARY_PACKED),
             "DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => {
@@ -910,6 +922,7 @@ impl TryFrom<parquet::Encoding> for Encoding {
             parquet::Encoding::PLAIN => Encoding::PLAIN,
             parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY,
             parquet::Encoding::RLE => Encoding::RLE,
+            #[allow(deprecated)]
             parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED,
             parquet::Encoding::DELTA_BINARY_PACKED => 
Encoding::DELTA_BINARY_PACKED,
             parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => 
Encoding::DELTA_LENGTH_BYTE_ARRAY,
@@ -927,6 +940,7 @@ impl From<Encoding> for parquet::Encoding {
             Encoding::PLAIN => parquet::Encoding::PLAIN,
             Encoding::PLAIN_DICTIONARY => parquet::Encoding::PLAIN_DICTIONARY,
             Encoding::RLE => parquet::Encoding::RLE,
+            #[allow(deprecated)]
             Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED,
             Encoding::DELTA_BINARY_PACKED => 
parquet::Encoding::DELTA_BINARY_PACKED,
             Encoding::DELTA_LENGTH_BYTE_ARRAY => 
parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
@@ -1114,6 +1128,7 @@ impl str::FromStr for LogicalType {
 }
 
 #[cfg(test)]
+#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module
 mod tests {
     use super::*;
 
diff --git a/parquet/src/bin/parquet-layout.rs 
b/parquet/src/bin/parquet-layout.rs
index b6d655757b..79a0acb5f5 100644
--- a/parquet/src/bin/parquet-layout.rs
+++ b/parquet/src/bin/parquet-layout.rs
@@ -200,6 +200,7 @@ fn encoding(encoding: parquet::format::Encoding) -> 
&'static str {
         Ok(Encoding::PLAIN) => "plain",
         Ok(Encoding::PLAIN_DICTIONARY) => "plain_dictionary",
         Ok(Encoding::RLE) => "rle",
+        #[allow(deprecated)]
         Ok(Encoding::BIT_PACKED) => "bit_packed",
         Ok(Encoding::DELTA_BINARY_PACKED) => "delta_binary_packed",
         Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY) => "delta_length_byte_array",
diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs
index 4f861917c9..53d56dd6b9 100644
--- a/parquet/src/column/reader.rs
+++ b/parquet/src/column/reader.rs
@@ -580,6 +580,7 @@ fn parse_v1_level(
                 buf.slice(i32_size..i32_size + data_size),
             ))
         }
+        #[allow(deprecated)]
         Encoding::BIT_PACKED => {
             let bit_width = num_required_bits(max_level as u64);
             let num_bytes = ceil(num_buffered_values as usize * bit_width as 
usize, 8);
diff --git a/parquet/src/column/reader/decoder.rs 
b/parquet/src/column/reader/decoder.rs
index 9889973b67..afd58b3cd1 100644
--- a/parquet/src/column/reader/decoder.rs
+++ b/parquet/src/column/reader/decoder.rs
@@ -268,6 +268,7 @@ impl LevelDecoder {
                 decoder.set_data(data);
                 Self::Rle(decoder)
             }
+            #[allow(deprecated)]
             Encoding::BIT_PACKED => Self::Packed(BitReader::new(data), 
bit_width),
             _ => unreachable!("invalid level encoding: {}", encoding),
         }
diff --git a/parquet/src/encodings/decoding.rs 
b/parquet/src/encodings/decoding.rs
index 88bc3920f3..a1924f72e8 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -1136,6 +1136,7 @@ mod tests {
         );
 
         // unsupported
+        #[allow(deprecated)]
         create_and_check_decoder::<Int32Type>(
             Encoding::BIT_PACKED,
             Some(nyi_err!("Encoding BIT_PACKED is not supported")),
diff --git a/parquet/src/encodings/encoding/mod.rs 
b/parquet/src/encodings/encoding/mod.rs
index ef49f03e2f..9099187a54 100644
--- a/parquet/src/encodings/encoding/mod.rs
+++ b/parquet/src/encodings/encoding/mod.rs
@@ -754,6 +754,7 @@ mod tests {
         );
 
         // unsupported
+        #[allow(deprecated)]
         create_and_check_encoder::<Int32Type>(
             Encoding::BIT_PACKED,
             Some(nyi_err!("Encoding BIT_PACKED is not supported")),
diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs
index 62c3b89db3..8d4e48ba16 100644
--- a/parquet/src/encodings/levels.rs
+++ b/parquet/src/encodings/levels.rs
@@ -35,6 +35,7 @@ pub fn max_buffer_size(
     let bit_width = num_required_bits(max_level as u64);
     match encoding {
         Encoding::RLE => RleEncoder::max_buffer_size(bit_width, 
num_buffered_values),
+        #[allow(deprecated)]
         Encoding::BIT_PACKED => ceil(num_buffered_values * bit_width as usize, 
8),
         _ => panic!("Unsupported encoding type {encoding}"),
     }
@@ -66,6 +67,7 @@ impl LevelEncoder {
                 buffer.extend_from_slice(&[0; 4]);
                 LevelEncoder::Rle(RleEncoder::new_from_buf(bit_width, buffer))
             }
+            #[allow(deprecated)]
             Encoding::BIT_PACKED => {
                 // Here we set full byte buffer without adjusting for 
num_buffered_values,
                 // because byte buffer will already be allocated with size from
diff --git a/parquet/src/file/serialized_reader.rs 
b/parquet/src/file/serialized_reader.rs
index f960b1830e..7e06cde376 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -963,7 +963,9 @@ mod tests {
                     assert_eq!(num_values, 8);
                     assert_eq!(encoding, Encoding::PLAIN_DICTIONARY);
                     assert_eq!(def_level_encoding, Encoding::RLE);
-                    assert_eq!(rep_level_encoding, Encoding::BIT_PACKED);
+                    #[allow(deprecated)]
+                    let expected_rep_level_encoding = Encoding::BIT_PACKED;
+                    assert_eq!(rep_level_encoding, 
expected_rep_level_encoding);
                     assert!(statistics.is_none());
                     true
                 }

Reply via email to