This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 93c7a12cff Mark Encoding::BIT_PACKED as deprecated and document its
compatibility issues (#5348)
93c7a12cff is described below
commit 93c7a12cff17cf485c87909ebef5f1095078bc3f
Author: Jörn Horstmann <[email protected]>
AuthorDate: Wed Jan 31 11:35:44 2024 +0100
Mark Encoding::BIT_PACKED as deprecated and document its compatibility
issues (#5348)
* Mark Encoding::BIT_PACKED as deprecated and document its compatibility
issues
* Allow deprecated BIT_PACKED in parquet-layout binary
---
parquet/src/arrow/record_reader/definition_levels.rs | 1 +
parquet/src/basic.rs | 17 ++++++++++++++++-
parquet/src/bin/parquet-layout.rs | 1 +
parquet/src/column/reader.rs | 1 +
parquet/src/column/reader/decoder.rs | 1 +
parquet/src/encodings/decoding.rs | 1 +
parquet/src/encodings/encoding/mod.rs | 1 +
parquet/src/encodings/levels.rs | 2 ++
parquet/src/file/serialized_reader.rs | 4 +++-
9 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/parquet/src/arrow/record_reader/definition_levels.rs
b/parquet/src/arrow/record_reader/definition_levels.rs
index 87f531df09..fcd04fbb9b 100644
--- a/parquet/src/arrow/record_reader/definition_levels.rs
+++ b/parquet/src/arrow/record_reader/definition_levels.rs
@@ -276,6 +276,7 @@ impl PackedDecoder {
self.packed_offset = 0;
self.packed_count = match encoding {
Encoding::RLE => 0,
+ #[allow(deprecated)]
Encoding::BIT_PACKED => data.len() * 8,
_ => unreachable!("invalid level encoding: {}", encoding),
};
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 2327e1d84b..2dd252a589 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -256,10 +256,21 @@ pub enum Encoding {
/// Usable for definition/repetition levels encoding and boolean values.
RLE,
- /// Bit packed encoding.
+ /// **Deprecated** Bit-packed encoding.
///
/// This can only be used if the data has a known max width.
/// Usable for definition/repetition levels encoding.
+ ///
+ /// There are compatibility issues with files using this encoding.
+ /// The parquet standard specifies the bits to be packed starting from the
+ /// most-significant bit, several implementations do not follow this bit
order.
+ /// Several other implementations also have issues reading this encoding
+ /// because of incorrect assumptions about the length of the encoded data.
+ ///
+ /// The RLE/bit-packing hybrid is more cpu and memory efficient and should
be used instead.
+ #[deprecated(
+ note = "Please see documentation for compatibility issues and use the
RLE/bit-packing hybrid encoding instead"
+ )]
BIT_PACKED,
/// Delta encoding for integers, either INT32 or INT64.
@@ -301,6 +312,7 @@ impl FromStr for Encoding {
"PLAIN" | "plain" => Ok(Encoding::PLAIN),
"PLAIN_DICTIONARY" | "plain_dictionary" =>
Ok(Encoding::PLAIN_DICTIONARY),
"RLE" | "rle" => Ok(Encoding::RLE),
+ #[allow(deprecated)]
"BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED),
"DELTA_BINARY_PACKED" | "delta_binary_packed" =>
Ok(Encoding::DELTA_BINARY_PACKED),
"DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => {
@@ -910,6 +922,7 @@ impl TryFrom<parquet::Encoding> for Encoding {
parquet::Encoding::PLAIN => Encoding::PLAIN,
parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY,
parquet::Encoding::RLE => Encoding::RLE,
+ #[allow(deprecated)]
parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED,
parquet::Encoding::DELTA_BINARY_PACKED =>
Encoding::DELTA_BINARY_PACKED,
parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY =>
Encoding::DELTA_LENGTH_BYTE_ARRAY,
@@ -927,6 +940,7 @@ impl From<Encoding> for parquet::Encoding {
Encoding::PLAIN => parquet::Encoding::PLAIN,
Encoding::PLAIN_DICTIONARY => parquet::Encoding::PLAIN_DICTIONARY,
Encoding::RLE => parquet::Encoding::RLE,
+ #[allow(deprecated)]
Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED,
Encoding::DELTA_BINARY_PACKED =>
parquet::Encoding::DELTA_BINARY_PACKED,
Encoding::DELTA_LENGTH_BYTE_ARRAY =>
parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
@@ -1114,6 +1128,7 @@ impl str::FromStr for LogicalType {
}
#[cfg(test)]
+#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module
mod tests {
use super::*;
diff --git a/parquet/src/bin/parquet-layout.rs
b/parquet/src/bin/parquet-layout.rs
index b6d655757b..79a0acb5f5 100644
--- a/parquet/src/bin/parquet-layout.rs
+++ b/parquet/src/bin/parquet-layout.rs
@@ -200,6 +200,7 @@ fn encoding(encoding: parquet::format::Encoding) ->
&'static str {
Ok(Encoding::PLAIN) => "plain",
Ok(Encoding::PLAIN_DICTIONARY) => "plain_dictionary",
Ok(Encoding::RLE) => "rle",
+ #[allow(deprecated)]
Ok(Encoding::BIT_PACKED) => "bit_packed",
Ok(Encoding::DELTA_BINARY_PACKED) => "delta_binary_packed",
Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY) => "delta_length_byte_array",
diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs
index 4f861917c9..53d56dd6b9 100644
--- a/parquet/src/column/reader.rs
+++ b/parquet/src/column/reader.rs
@@ -580,6 +580,7 @@ fn parse_v1_level(
buf.slice(i32_size..i32_size + data_size),
))
}
+ #[allow(deprecated)]
Encoding::BIT_PACKED => {
let bit_width = num_required_bits(max_level as u64);
let num_bytes = ceil(num_buffered_values as usize * bit_width as
usize, 8);
diff --git a/parquet/src/column/reader/decoder.rs
b/parquet/src/column/reader/decoder.rs
index 9889973b67..afd58b3cd1 100644
--- a/parquet/src/column/reader/decoder.rs
+++ b/parquet/src/column/reader/decoder.rs
@@ -268,6 +268,7 @@ impl LevelDecoder {
decoder.set_data(data);
Self::Rle(decoder)
}
+ #[allow(deprecated)]
Encoding::BIT_PACKED => Self::Packed(BitReader::new(data),
bit_width),
_ => unreachable!("invalid level encoding: {}", encoding),
}
diff --git a/parquet/src/encodings/decoding.rs
b/parquet/src/encodings/decoding.rs
index 88bc3920f3..a1924f72e8 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -1136,6 +1136,7 @@ mod tests {
);
// unsupported
+ #[allow(deprecated)]
create_and_check_decoder::<Int32Type>(
Encoding::BIT_PACKED,
Some(nyi_err!("Encoding BIT_PACKED is not supported")),
diff --git a/parquet/src/encodings/encoding/mod.rs
b/parquet/src/encodings/encoding/mod.rs
index ef49f03e2f..9099187a54 100644
--- a/parquet/src/encodings/encoding/mod.rs
+++ b/parquet/src/encodings/encoding/mod.rs
@@ -754,6 +754,7 @@ mod tests {
);
// unsupported
+ #[allow(deprecated)]
create_and_check_encoder::<Int32Type>(
Encoding::BIT_PACKED,
Some(nyi_err!("Encoding BIT_PACKED is not supported")),
diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs
index 62c3b89db3..8d4e48ba16 100644
--- a/parquet/src/encodings/levels.rs
+++ b/parquet/src/encodings/levels.rs
@@ -35,6 +35,7 @@ pub fn max_buffer_size(
let bit_width = num_required_bits(max_level as u64);
match encoding {
Encoding::RLE => RleEncoder::max_buffer_size(bit_width,
num_buffered_values),
+ #[allow(deprecated)]
Encoding::BIT_PACKED => ceil(num_buffered_values * bit_width as usize,
8),
_ => panic!("Unsupported encoding type {encoding}"),
}
@@ -66,6 +67,7 @@ impl LevelEncoder {
buffer.extend_from_slice(&[0; 4]);
LevelEncoder::Rle(RleEncoder::new_from_buf(bit_width, buffer))
}
+ #[allow(deprecated)]
Encoding::BIT_PACKED => {
// Here we set full byte buffer without adjusting for
num_buffered_values,
// because byte buffer will already be allocated with size from
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index f960b1830e..7e06cde376 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -963,7 +963,9 @@ mod tests {
assert_eq!(num_values, 8);
assert_eq!(encoding, Encoding::PLAIN_DICTIONARY);
assert_eq!(def_level_encoding, Encoding::RLE);
- assert_eq!(rep_level_encoding, Encoding::BIT_PACKED);
+ #[allow(deprecated)]
+ let expected_rep_level_encoding = Encoding::BIT_PACKED;
+ assert_eq!(rep_level_encoding,
expected_rep_level_encoding);
assert!(statistics.is_none());
true
}