This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new d941ff1c37 Support parquet bloom filter length (#4885)
d941ff1c37 is described below
commit d941ff1c3741ba4e18022d8be8edfbbca8b0af17
Author: Letian Jiang <[email protected]>
AuthorDate: Mon Oct 2 22:14:23 2023 +0800
Support parquet bloom filter length (#4885)
* Support parquet bloom filter length
Signed-off-by: Letian Jiang <[email protected]>
* update
Signed-off-by: Letian Jiang <[email protected]>
---------
Signed-off-by: Letian Jiang <[email protected]>
---
parquet/src/bloom_filter/mod.rs | 30 ++-
parquet/src/file/metadata.rs | 17 ++
parquet/src/file/writer.rs | 9 +-
parquet/src/format.rs | 505 ++++++++++++----------------------------
parquet/src/schema/printer.rs | 5 +
5 files changed, 195 insertions(+), 371 deletions(-)
diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs
index 4d2040b7f2..c893d492b5 100644
--- a/parquet/src/bloom_filter/mod.rs
+++ b/parquet/src/bloom_filter/mod.rs
@@ -135,13 +135,12 @@ pub struct Sbbf(Vec<Block>);
const SBBF_HEADER_SIZE_ESTIMATE: usize = 20;
-/// given an initial offset, and a [ChunkReader], try to read out a bloom
filter header and return
+/// given an initial offset, and a byte buffer, try to read out a bloom filter
header and return
/// both the header and the offset after it (for bitset).
-fn chunk_read_bloom_filter_header_and_offset<R: ChunkReader>(
+fn chunk_read_bloom_filter_header_and_offset(
offset: u64,
- reader: Arc<R>,
+ buffer: Bytes,
) -> Result<(BloomFilterHeader, u64), ParquetError> {
- let buffer = reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE)?;
let (header, length) = read_bloom_filter_header_and_length(buffer)?;
Ok((header, offset + length))
}
@@ -271,8 +270,13 @@ impl Sbbf {
return Ok(None);
};
+ let buffer = match column_metadata.bloom_filter_length() {
+ Some(length) => reader.get_bytes(offset, length as usize),
+ None => reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE),
+ }?;
+
let (header, bitset_offset) =
- chunk_read_bloom_filter_header_and_offset(offset, reader.clone())?;
+ chunk_read_bloom_filter_header_and_offset(offset, buffer.clone())?;
match header.algorithm {
BloomFilterAlgorithm::BLOCK(_) => {
@@ -289,11 +293,17 @@ impl Sbbf {
// this match exists to future proof the singleton hash enum
}
}
- // length in bytes
- let length: usize = header.num_bytes.try_into().map_err(|_| {
- ParquetError::General("Bloom filter length is invalid".to_string())
- })?;
- let bitset = reader.get_bytes(bitset_offset, length)?;
+
+ let bitset = match column_metadata.bloom_filter_length() {
+ Some(_) => buffer.slice((bitset_offset - offset) as usize..),
+ None => {
+ let bitset_length: usize =
header.num_bytes.try_into().map_err(|_| {
+ ParquetError::General("Bloom filter length is
invalid".to_string())
+ })?;
+ reader.get_bytes(bitset_offset, bitset_length)?
+ }
+ };
+
Ok(Some(Self::new(&bitset)))
}
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index aaa3d28e20..1f46c8105e 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -474,6 +474,7 @@ pub struct ColumnChunkMetaData {
statistics: Option<Statistics>,
encoding_stats: Option<Vec<PageEncodingStats>>,
bloom_filter_offset: Option<i64>,
+ bloom_filter_length: Option<i32>,
offset_index_offset: Option<i64>,
offset_index_length: Option<i32>,
column_index_offset: Option<i64>,
@@ -591,6 +592,11 @@ impl ColumnChunkMetaData {
self.bloom_filter_offset
}
+ /// Returns the offset for the bloom filter.
+ pub fn bloom_filter_length(&self) -> Option<i32> {
+ self.bloom_filter_length
+ }
+
/// Returns the offset for the column index.
pub fn column_index_offset(&self) -> Option<i64> {
self.column_index_offset
@@ -657,6 +663,7 @@ impl ColumnChunkMetaData {
})
.transpose()?;
let bloom_filter_offset = col_metadata.bloom_filter_offset;
+ let bloom_filter_length = col_metadata.bloom_filter_length;
let offset_index_offset = cc.offset_index_offset;
let offset_index_length = cc.offset_index_length;
let column_index_offset = cc.column_index_offset;
@@ -677,6 +684,7 @@ impl ColumnChunkMetaData {
statistics,
encoding_stats,
bloom_filter_offset,
+ bloom_filter_length,
offset_index_offset,
offset_index_length,
column_index_offset,
@@ -722,6 +730,7 @@ impl ColumnChunkMetaData {
.as_ref()
.map(|vec|
vec.iter().map(page_encoding_stats::to_thrift).collect()),
bloom_filter_offset: self.bloom_filter_offset,
+ bloom_filter_length: self.bloom_filter_length,
}
}
@@ -752,6 +761,7 @@ impl ColumnChunkMetaDataBuilder {
statistics: None,
encoding_stats: None,
bloom_filter_offset: None,
+ bloom_filter_length: None,
offset_index_offset: None,
offset_index_length: None,
column_index_offset: None,
@@ -837,6 +847,12 @@ impl ColumnChunkMetaDataBuilder {
self
}
+ /// Sets optional bloom filter length in bytes.
+ pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
+ self.0.bloom_filter_length = value;
+ self
+ }
+
/// Sets optional offset index offset in bytes.
pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
self.0.offset_index_offset = value;
@@ -1053,6 +1069,7 @@ mod tests {
},
])
.set_bloom_filter_offset(Some(6000))
+ .set_bloom_filter_length(Some(25))
.set_offset_index_offset(Some(7000))
.set_offset_index_length(Some(25))
.set_column_index_offset(Some(8000))
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index cafb176135..af25cc9689 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -267,12 +267,15 @@ impl<W: Write + Send> SerializedFileWriter<W> {
Some(bloom_filter) => {
let start_offset = self.buf.bytes_written();
bloom_filter.write(&mut self.buf)?;
+ let end_offset = self.buf.bytes_written();
// set offset and index for bloom filter
- column_chunk
+ let column_chunk_meta = column_chunk
.meta_data
.as_mut()
- .expect("can't have bloom filter without column
metadata")
- .bloom_filter_offset = Some(start_offset as i64);
+ .expect("can't have bloom filter without column
metadata");
+ column_chunk_meta.bloom_filter_offset =
Some(start_offset as i64);
+ column_chunk_meta.bloom_filter_length =
+ Some((end_offset - start_offset) as i32);
}
None => {}
}
diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index 0851b2287f..12c572c23c 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -1,9 +1,10 @@
-// Autogenerated by Thrift Compiler (0.17.0)
+// Autogenerated by Thrift Compiler (0.19.0)
// DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+#![allow(dead_code)]
#![allow(unused_imports)]
#![allow(unused_extern_crates)]
-#![allow(clippy::too_many_arguments, clippy::type_complexity, clippy::vec_box)]
+#![allow(clippy::too_many_arguments, clippy::type_complexity, clippy::vec_box,
clippy::wrong_self_convention)]
#![cfg_attr(rustfmt, rustfmt_skip)]
use std::cell::RefCell;
@@ -99,7 +100,7 @@ impl From<&Type> for i32 {
/// DEPRECATED: Common types used by frameworks(e.g. hive, pig) using parquet.
/// ConvertedType is superseded by LogicalType. This enum should not be
extended.
-///
+///
/// See LogicalTypes.md for conversion between ConvertedType and LogicalType.
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct ConvertedType(pub i32);
@@ -117,12 +118,12 @@ impl ConvertedType {
/// an enum is converted into a binary field
pub const ENUM: ConvertedType = ConvertedType(4);
/// A decimal value.
- ///
+ ///
/// This may be used to annotate binary or fixed primitive types. The
/// underlying byte array stores the unscaled value encoded as two's
/// complement using big-endian byte order (the most significant byte is the
/// zeroth element). The value of the decimal is the value * 10^{-scale}.
- ///
+ ///
/// This must be accompanied by a (maximum) precision and a scale in the
/// SchemaElement. The precision specifies the number of digits in the
decimal
/// and the scale stores the location of the decimal point. For example 1.23
@@ -130,62 +131,62 @@ impl ConvertedType {
/// 2 digits over).
pub const DECIMAL: ConvertedType = ConvertedType(5);
/// A Date
- ///
+ ///
/// Stored as days since Unix epoch, encoded as the INT32 physical type.
- ///
+ ///
pub const DATE: ConvertedType = ConvertedType(6);
/// A time
- ///
+ ///
/// The total number of milliseconds since midnight. The value is stored
/// as an INT32 physical type.
pub const TIME_MILLIS: ConvertedType = ConvertedType(7);
/// A time.
- ///
+ ///
/// The total number of microseconds since midnight. The value is stored as
/// an INT64 physical type.
pub const TIME_MICROS: ConvertedType = ConvertedType(8);
/// A date/time combination
- ///
+ ///
/// Date and time recorded as milliseconds since the Unix epoch. Recorded as
/// a physical type of INT64.
pub const TIMESTAMP_MILLIS: ConvertedType = ConvertedType(9);
/// A date/time combination
- ///
+ ///
/// Date and time recorded as microseconds since the Unix epoch. The value
is
/// stored as an INT64 physical type.
pub const TIMESTAMP_MICROS: ConvertedType = ConvertedType(10);
/// An unsigned integer value.
- ///
+ ///
/// The number describes the maximum number of meaningful data bits in
/// the stored value. 8, 16 and 32 bit values are stored using the
/// INT32 physical type. 64 bit values are stored using the INT64
/// physical type.
- ///
+ ///
pub const UINT_8: ConvertedType = ConvertedType(11);
pub const UINT_16: ConvertedType = ConvertedType(12);
pub const UINT_32: ConvertedType = ConvertedType(13);
pub const UINT_64: ConvertedType = ConvertedType(14);
/// A signed integer value.
- ///
+ ///
/// The number describes the maximum number of meaningful data bits in
/// the stored value. 8, 16 and 32 bit values are stored using the
/// INT32 physical type. 64 bit values are stored using the INT64
/// physical type.
- ///
+ ///
pub const INT_8: ConvertedType = ConvertedType(15);
pub const INT_16: ConvertedType = ConvertedType(16);
pub const INT_32: ConvertedType = ConvertedType(17);
pub const INT_64: ConvertedType = ConvertedType(18);
/// An embedded JSON document
- ///
+ ///
/// A JSON document embedded within a single UTF8 column.
pub const JSON: ConvertedType = ConvertedType(19);
/// An embedded BSON document
- ///
+ ///
/// A BSON document embedded within a single BINARY column.
pub const BSON: ConvertedType = ConvertedType(20);
/// An interval of time
- ///
+ ///
/// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12
/// This data is composed of three separate little endian unsigned
/// integers. Each stores a component of a duration of time. The first
@@ -443,11 +444,11 @@ impl From<&Encoding> for i32 {
}
/// Supported compression algorithms.
-///
+///
/// Codecs added in format version X.Y can be read by readers based on X.Y and
later.
/// Codec support may vary between readers based on the format version and
/// libraries available at runtime.
-///
+///
/// See Compression.md for a detailed specification of these algorithms.
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct CompressionCodec(pub i32);
@@ -637,17 +638,17 @@ impl From<&BoundaryOrder> for i32 {
/// Statistics per row group and per page
/// All fields are optional.
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct Statistics {
/// DEPRECATED: min and max value of the column. Use min_value and max_value.
- ///
+ ///
/// Values are encoded using PLAIN encoding, except that variable-length byte
/// arrays do not include a length prefix.
- ///
+ ///
/// These fields encode min and max values determined by signed comparison
/// only. New files should use the correct order for a column's logical type
/// and store the values in the min_value and max_value fields.
- ///
+ ///
/// To support older readers, these may be set when the column order is
/// signed.
pub max: Option<Vec<u8>>,
@@ -657,7 +658,7 @@ pub struct Statistics {
/// count of distinct values occurring
pub distinct_count: Option<i64>,
/// Min and max values for the column, determined by its ColumnOrder.
- ///
+ ///
/// Values are encoded using PLAIN encoding, except that variable-length byte
/// arrays do not include a length prefix.
pub max_value: Option<Vec<u8>>,
@@ -772,25 +773,12 @@ impl TSerializable for Statistics {
}
}
-impl Default for Statistics {
- fn default() -> Self {
- Statistics{
- max: Some(Vec::new()),
- min: Some(Vec::new()),
- null_count: Some(0),
- distinct_count: Some(0),
- max_value: Some(Vec::new()),
- min_value: Some(Vec::new()),
- }
- }
-}
-
//
// StringType
//
/// Empty structs to use as logical type annotations
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct StringType {
}
@@ -808,12 +796,7 @@ impl TSerializable for StringType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -828,17 +811,11 @@ impl TSerializable for StringType {
}
}
-impl Default for StringType {
- fn default() -> Self {
- StringType{}
- }
-}
-
//
// UUIDType
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct UUIDType {
}
@@ -856,12 +833,7 @@ impl TSerializable for UUIDType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -876,17 +848,11 @@ impl TSerializable for UUIDType {
}
}
-impl Default for UUIDType {
- fn default() -> Self {
- UUIDType{}
- }
-}
-
//
// MapType
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct MapType {
}
@@ -904,12 +870,7 @@ impl TSerializable for MapType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -924,17 +885,11 @@ impl TSerializable for MapType {
}
}
-impl Default for MapType {
- fn default() -> Self {
- MapType{}
- }
-}
-
//
// ListType
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct ListType {
}
@@ -952,12 +907,7 @@ impl TSerializable for ListType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -972,17 +922,11 @@ impl TSerializable for ListType {
}
}
-impl Default for ListType {
- fn default() -> Self {
- ListType{}
- }
-}
-
//
// EnumType
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct EnumType {
}
@@ -1000,12 +944,7 @@ impl TSerializable for EnumType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -1020,17 +959,11 @@ impl TSerializable for EnumType {
}
}
-impl Default for EnumType {
- fn default() -> Self {
- EnumType{}
- }
-}
-
//
// DateType
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct DateType {
}
@@ -1048,12 +981,7 @@ impl TSerializable for DateType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -1068,22 +996,16 @@ impl TSerializable for DateType {
}
}
-impl Default for DateType {
- fn default() -> Self {
- DateType{}
- }
-}
-
//
// NullType
//
/// Logical type to annotate a column that is always null.
-///
+///
/// Sometimes when discovering the schema of existing data, values are always
/// null and the physical type can't be determined. This annotation signals
/// the case where the physical type was guessed from all null values.
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct NullType {
}
@@ -1101,12 +1023,7 @@ impl TSerializable for NullType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -1121,21 +1038,18 @@ impl TSerializable for NullType {
}
}
-impl Default for NullType {
- fn default() -> Self {
- NullType{}
- }
-}
-
//
// DecimalType
//
/// Decimal logical type annotation
-///
+///
+/// Scale must be zero or a positive integer less than or equal to the
precision.
+/// Precision must be a non-zero positive integer.
+///
/// To maintain forward-compatibility in v1, implementations using this logical
/// type must also set scale and precision on the annotated SchemaElement.
-///
+///
/// Allowed for physical types: INT32, INT64, FIXED, and BINARY
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct DecimalType {
@@ -1206,7 +1120,7 @@ impl TSerializable for DecimalType {
//
/// Time units for logical types
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct MilliSeconds {
}
@@ -1224,12 +1138,7 @@ impl TSerializable for MilliSeconds {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -1244,17 +1153,11 @@ impl TSerializable for MilliSeconds {
}
}
-impl Default for MilliSeconds {
- fn default() -> Self {
- MilliSeconds{}
- }
-}
-
//
// MicroSeconds
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct MicroSeconds {
}
@@ -1272,12 +1175,7 @@ impl TSerializable for MicroSeconds {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -1292,17 +1190,11 @@ impl TSerializable for MicroSeconds {
}
}
-impl Default for MicroSeconds {
- fn default() -> Self {
- MicroSeconds{}
- }
-}
-
//
// NanoSeconds
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct NanoSeconds {
}
@@ -1320,12 +1212,7 @@ impl TSerializable for NanoSeconds {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -1340,12 +1227,6 @@ impl TSerializable for NanoSeconds {
}
}
-impl Default for NanoSeconds {
- fn default() -> Self {
- NanoSeconds{}
- }
-}
-
//
// TimeUnit
//
@@ -1450,7 +1331,7 @@ impl TSerializable for TimeUnit {
//
/// Timestamp logical type annotation
-///
+///
/// Allowed for physical types: INT64
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct TimestampType {
@@ -1521,7 +1402,7 @@ impl TSerializable for TimestampType {
//
/// Time logical type annotation
-///
+///
/// Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct TimeType {
@@ -1592,9 +1473,9 @@ impl TSerializable for TimeType {
//
/// Integer logical type annotation
-///
+///
/// bitWidth must be 8, 16, 32, or 64.
-///
+///
/// Allowed for physical types: INT32, INT64
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct IntType {
@@ -1665,9 +1546,9 @@ impl TSerializable for IntType {
//
/// Embedded JSON logical type annotation
-///
+///
/// Allowed for physical types: BINARY
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct JsonType {
}
@@ -1685,12 +1566,7 @@ impl TSerializable for JsonType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -1705,20 +1581,14 @@ impl TSerializable for JsonType {
}
}
-impl Default for JsonType {
- fn default() -> Self {
- JsonType{}
- }
-}
-
//
// BsonType
//
/// Embedded BSON logical type annotation
-///
+///
/// Allowed for physical types: BINARY
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct BsonType {
}
@@ -1736,12 +1606,7 @@ impl TSerializable for BsonType {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -1756,12 +1621,6 @@ impl TSerializable for BsonType {
}
}
-impl Default for BsonType {
- fn default() -> Self {
- BsonType{}
- }
-}
-
//
// LogicalType
//
@@ -2003,7 +1862,7 @@ impl TSerializable for LogicalType {
pub struct SchemaElement {
/// Data type for this field. Not set if the current element is a non-leaf
node
pub type_: Option<Type>,
- /// If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the vales.
+ /// If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
/// Otherwise, if specified, this is the maximum bit length to store any of
the values.
/// (e.g. a low cardinality INT col could have this set to 3). Note that
this is
/// in the schema, and therefore fixed for the entire file.
@@ -2020,12 +1879,12 @@ pub struct SchemaElement {
pub num_children: Option<i32>,
/// DEPRECATED: When the schema is the result of a conversion from another
model.
/// Used to record the original type to help with cross conversion.
- ///
+ ///
/// This is superseded by logicalType.
pub converted_type: Option<ConvertedType>,
/// DEPRECATED: Used when this column contains decimal data.
/// See the DECIMAL converted type for more details.
- ///
+ ///
/// This is superseded by using the DecimalType annotation in logicalType.
pub scale: Option<i32>,
pub precision: Option<i32>,
@@ -2033,7 +1892,7 @@ pub struct SchemaElement {
/// original field id in the parquet schema
pub field_id: Option<i32>,
/// The logical type of this SchemaElement
- ///
+ ///
/// LogicalType replaces ConvertedType, but ConvertedType is still required
/// for some logical types to ensure forward-compatibility in format v1.
pub logical_type: Option<LogicalType>,
@@ -2309,7 +2168,7 @@ impl TSerializable for DataPageHeader {
// IndexPageHeader
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct IndexPageHeader {
}
@@ -2327,12 +2186,7 @@ impl TSerializable for IndexPageHeader {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -2347,16 +2201,14 @@ impl TSerializable for IndexPageHeader {
}
}
-impl Default for IndexPageHeader {
- fn default() -> Self {
- IndexPageHeader{}
- }
-}
-
//
// DictionaryPageHeader
//
+/// The dictionary page must be placed at the first position of the column
chunk
+/// if it is partly or completely dictionary encoded. At most one dictionary
page
+/// can be placed in a column chunk.
+///
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct DictionaryPageHeader {
/// Number of values in the dictionary *
@@ -2444,7 +2296,7 @@ impl TSerializable for DictionaryPageHeader {
/// New page format allowing reading levels without decompressing the data
/// Repetition and definition levels are uncompressed
/// The remaining section containing the data is compressed if is_compressed
is true
-///
+///
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct DataPageHeaderV2 {
/// Number of values, including NULLs, in this data page. *
@@ -2601,7 +2453,7 @@ impl TSerializable for DataPageHeaderV2 {
//
/// Block-based algorithm type annotation. *
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct SplitBlockAlgorithm {
}
@@ -2619,12 +2471,7 @@ impl TSerializable for SplitBlockAlgorithm {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -2639,12 +2486,6 @@ impl TSerializable for SplitBlockAlgorithm {
}
}
-impl Default for SplitBlockAlgorithm {
- fn default() -> Self {
- SplitBlockAlgorithm{}
- }
-}
-
//
// BloomFilterAlgorithm
//
@@ -2724,8 +2565,8 @@ impl TSerializable for BloomFilterAlgorithm {
/// Hash strategy type annotation. xxHash is an extremely fast
non-cryptographic hash
/// algorithm. It uses 64 bits version of xxHash.
-///
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+///
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct XxHash {
}
@@ -2743,12 +2584,7 @@ impl TSerializable for XxHash {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -2763,12 +2599,6 @@ impl TSerializable for XxHash {
}
}
-impl Default for XxHash {
- fn default() -> Self {
- XxHash{}
- }
-}
-
//
// BloomFilterHash
//
@@ -2847,8 +2677,8 @@ impl TSerializable for BloomFilterHash {
//
/// The compression used in the Bloom filter.
-///
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+///
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct Uncompressed {
}
@@ -2866,12 +2696,7 @@ impl TSerializable for Uncompressed {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -2886,12 +2711,6 @@ impl TSerializable for Uncompressed {
}
}
-impl Default for Uncompressed {
- fn default() -> Self {
- Uncompressed{}
- }
-}
-
//
// BloomFilterCompression
//
@@ -2971,7 +2790,7 @@ impl TSerializable for BloomFilterCompression {
/// Bloom filter header is stored at beginning of Bloom filter data of each
column
/// and followed by its bitset.
-///
+///
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct BloomFilterHeader {
/// The size of bitset in bytes *
@@ -3076,32 +2895,22 @@ pub struct PageHeader {
pub uncompressed_page_size: i32,
/// Compressed (and potentially encrypted) page size in bytes, not including
this header *
pub compressed_page_size: i32,
- /// The 32bit CRC for the page, to be be calculated as follows:
- /// - Using the standard CRC32 algorithm
- /// - On the data only, i.e. this header should not be included. 'Data'
- /// hereby refers to the concatenation of the repetition levels, the
- /// definition levels and the column value, in this exact order.
- /// - On the encoded versions of the repetition levels, definition levels and
- /// column values
- /// - On the compressed versions of the repetition levels, definition levels
- /// and column values where possible;
- /// - For v1 data pages, the repetition levels, definition levels and
column
- /// values are always compressed together. If a compression scheme is
- /// specified, the CRC shall be calculated on the compressed version of
- /// this concatenation. If no compression scheme is specified, the CRC
- /// shall be calculated on the uncompressed version of this
concatenation.
- /// - For v2 data pages, the repetition levels and definition levels are
- /// handled separately from the data and are never compressed (only
- /// encoded). If a compression scheme is specified, the CRC shall be
- /// calculated on the concatenation of the uncompressed repetition
levels,
- /// uncompressed definition levels and the compressed column values.
- /// If no compression scheme is specified, the CRC shall be calculated on
- /// the uncompressed concatenation.
- /// - In encrypted columns, CRC is calculated after page encryption; the
- /// encryption itself is performed after page compression (if compressed)
+ /// The 32-bit CRC checksum for the page, to be be calculated as follows:
+ ///
+ /// - The standard CRC32 algorithm is used (with polynomial 0x04C11DB7,
+ /// the same as in e.g. GZip).
+ /// - All page types can have a CRC (v1 and v2 data pages, dictionary pages,
+ /// etc.).
+ /// - The CRC is computed on the serialization binary representation of the
page
+ /// (as written to disk), excluding the page header. For example, for v1
+ /// data pages, the CRC is computed on the concatenation of repetition
levels,
+ /// definition levels and column values (optionally compressed, optionally
+ /// encrypted).
+ /// - The CRC computation therefore takes place after any compression
+ /// and encryption steps, if any.
+ ///
/// If enabled, this allows for disabling checksumming in HDFS if only a few
/// pages need to be read.
- ///
pub crc: Option<i32>,
pub data_page_header: Option<DataPageHeader>,
pub index_page_header: Option<IndexPageHeader>,
@@ -3516,10 +3325,16 @@ pub struct ColumnMetaData {
pub encoding_stats: Option<Vec<PageEncodingStats>>,
/// Byte offset from beginning of file to Bloom filter data. *
pub bloom_filter_offset: Option<i64>,
+ /// Size of Bloom filter data including the serialized header, in bytes.
+ /// Added in 2.10 so readers may not read this field from old files and
+ /// it can be obtained after the BloomFilterHeader has been deserialized.
+ /// Writers should write this field so readers can read the bloom filter
+ /// in a single I/O.
+ pub bloom_filter_length: Option<i32>,
}
impl ColumnMetaData {
- pub fn new<F8, F10, F11, F12, F13, F14>(type_: Type, encodings:
Vec<Encoding>, path_in_schema: Vec<String>, codec: CompressionCodec,
num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64,
key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10,
dictionary_page_offset: F11, statistics: F12, encoding_stats: F13,
bloom_filter_offset: F14) -> ColumnMetaData where F8:
Into<Option<Vec<KeyValue>>>, F10: Into<Option<i64>>, F11: Into<Option<i64>>,
F12: Into<Opt [...]
+ pub fn new<F8, F10, F11, F12, F13, F14, F15>(type_: Type, encodings:
Vec<Encoding>, path_in_schema: Vec<String>, codec: CompressionCodec,
num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64,
key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10,
dictionary_page_offset: F11, statistics: F12, encoding_stats: F13,
bloom_filter_offset: F14, bloom_filter_length: F15) -> ColumnMetaData where F8:
Into<Option<Vec<KeyValue>>>, F10: Into<Option<i64>>, F11: I [...]
ColumnMetaData {
type_,
encodings,
@@ -3535,6 +3350,7 @@ impl ColumnMetaData {
statistics: statistics.into(),
encoding_stats: encoding_stats.into(),
bloom_filter_offset: bloom_filter_offset.into(),
+ bloom_filter_length: bloom_filter_length.into(),
}
}
}
@@ -3556,6 +3372,7 @@ impl TSerializable for ColumnMetaData {
let mut f_12: Option<Statistics> = None;
let mut f_13: Option<Vec<PageEncodingStats>> = None;
let mut f_14: Option<i64> = None;
+ let mut f_15: Option<i32> = None;
loop {
let field_ident = i_prot.read_field_begin()?;
if field_ident.field_type == TType::Stop {
@@ -3643,6 +3460,10 @@ impl TSerializable for ColumnMetaData {
let val = i_prot.read_i64()?;
f_14 = Some(val);
},
+ 15 => {
+ let val = i_prot.read_i32()?;
+ f_15 = Some(val);
+ },
_ => {
i_prot.skip(field_ident.field_type)?;
},
@@ -3673,6 +3494,7 @@ impl TSerializable for ColumnMetaData {
statistics: f_12,
encoding_stats: f_13,
bloom_filter_offset: f_14,
+ bloom_filter_length: f_15,
};
Ok(ret)
}
@@ -3749,6 +3571,11 @@ impl TSerializable for ColumnMetaData {
o_prot.write_i64(fld_var)?;
o_prot.write_field_end()?
}
+ if let Some(fld_var) = self.bloom_filter_length {
+ o_prot.write_field_begin(&TFieldIdentifier::new("bloom_filter_length",
TType::I32, 15))?;
+ o_prot.write_i32(fld_var)?;
+ o_prot.write_field_end()?
+ }
o_prot.write_field_stop()?;
o_prot.write_struct_end()
}
@@ -3758,7 +3585,7 @@ impl TSerializable for ColumnMetaData {
// EncryptionWithFooterKey
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct EncryptionWithFooterKey {
}
@@ -3776,12 +3603,7 @@ impl TSerializable for EncryptionWithFooterKey {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -3796,12 +3618,6 @@ impl TSerializable for EncryptionWithFooterKey {
}
}
-impl Default for EncryptionWithFooterKey {
- fn default() -> Self {
- EncryptionWithFooterKey{}
- }
-}
-
//
// EncryptionWithColumnKey
//
@@ -3977,14 +3793,14 @@ impl TSerializable for ColumnCryptoMetaData {
pub struct ColumnChunk {
/// File where column data is stored. If not set, assumed to be same file as
/// metadata. This path is relative to the current file.
- ///
+ ///
pub file_path: Option<String>,
/// Byte offset in file_path to the ColumnMetaData *
pub file_offset: i64,
/// Column metadata for this chunk. This is the same content as what is at
/// file_path/file_offset. Having it here has it replicated in the file
/// metadata.
- ///
+ ///
pub meta_data: Option<ColumnMetaData>,
/// File offset of ColumnChunk's OffsetIndex *
pub offset_index_offset: Option<i64>,
@@ -4151,7 +3967,7 @@ impl TSerializable for ColumnChunk {
pub struct RowGroup {
/// Metadata for each column chunk in this row group.
/// This list must have the same order as the SchemaElement list in
FileMetaData.
- ///
+ ///
pub columns: Vec<ColumnChunk>,
/// Total byte size of all the uncompressed column data in this row group *
pub total_byte_size: i64,
@@ -4312,7 +4128,7 @@ impl TSerializable for RowGroup {
//
/// Empty struct to signal the order defined by the physical or logical type
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct TypeDefinedOrder {
}
@@ -4330,12 +4146,7 @@ impl TSerializable for TypeDefinedOrder {
if field_ident.field_type == TType::Stop {
break;
}
- let field_id = field_id(&field_ident)?;
- match field_id {
- _ => {
- i_prot.skip(field_ident.field_type)?;
- },
- };
+ i_prot.skip(field_ident.field_type)?;
i_prot.read_field_end()?;
}
i_prot.read_struct_end()?;
@@ -4350,12 +4161,6 @@ impl TSerializable for TypeDefinedOrder {
}
}
-impl Default for TypeDefinedOrder {
- fn default() -> Self {
- TypeDefinedOrder{}
- }
-}
-
//
// ColumnOrder
//
@@ -4596,13 +4401,14 @@ pub struct ColumnIndex {
/// byte\[0\], so that all lists have the same length. If false, the
/// corresponding entries in min_values and max_values must be valid.
pub null_pages: Vec<bool>,
- /// Two lists containing lower and upper bounds for the values of each page.
- /// These may be the actual minimum and maximum values found on a page, but
- /// can also be (more compact) values that do not exist on a page. For
- /// example, instead of storing ""Blart Versenwald III", a writer may set
- /// min_values\[i\]="B", max_values\[i\]="C". Such more compact values must
still
- /// be valid values within the column's logical type. Readers must make sure
- /// that list entries are populated before using them by inspecting
null_pages.
+ /// Two lists containing lower and upper bounds for the values of each page
+ /// determined by the ColumnOrder of the column. These may be the actual
+ /// minimum and maximum values found on a page, but can also be (more
compact)
+ /// values that do not exist on a page. For example, instead of storing
""Blart
+ /// Versenwald III", a writer may set min_values\[i\]="B",
max_values\[i\]="C".
+ /// Such more compact values must still be valid values within the column's
+ /// logical type. Readers must make sure that list entries are populated
before
+ /// using them by inspecting null_pages.
pub min_values: Vec<Vec<u8>>,
pub max_values: Vec<Vec<u8>>,
/// Stores whether both min_values and max_values are ordered and if so, in
@@ -4750,7 +4556,7 @@ impl TSerializable for ColumnIndex {
// AesGcmV1
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct AesGcmV1 {
/// AAD prefix *
pub aad_prefix: Option<Vec<u8>>,
@@ -4833,21 +4639,11 @@ impl TSerializable for AesGcmV1 {
}
}
-impl Default for AesGcmV1 {
- fn default() -> Self {
- AesGcmV1{
- aad_prefix: Some(Vec::new()),
- aad_file_unique: Some(Vec::new()),
- supply_aad_prefix: Some(false),
- }
- }
-}
-
//
// AesGcmCtrV1
//
-#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct AesGcmCtrV1 {
/// AAD prefix *
pub aad_prefix: Option<Vec<u8>>,
@@ -4930,16 +4726,6 @@ impl TSerializable for AesGcmCtrV1 {
}
}
-impl Default for AesGcmCtrV1 {
- fn default() -> Self {
- AesGcmCtrV1{
- aad_prefix: Some(Vec::new()),
- aad_file_unique: Some(Vec::new()),
- supply_aad_prefix: Some(false),
- }
- }
-}
-
//
// EncryptionAlgorithm
//
@@ -5051,19 +4837,22 @@ pub struct FileMetaData {
/// String for application that wrote this file. This should be in the
format
/// `<Application>` version `<App Version>` (build `<App Build Hash>`).
/// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
- ///
+ ///
pub created_by: Option<String>,
- /// Sort order used for the min_value and max_value fields of each column in
- /// this file. Sort orders are listed in the order matching the columns in
the
- /// schema. The indexes are not necessary the same though, because only leaf
- /// nodes of the schema are represented in the list of sort orders.
- ///
- /// Without column_orders, the meaning of the min_value and max_value fields
is
- /// undefined. To ensure well-defined behaviour, if min_value and max_value
are
- /// written to a Parquet file, column_orders must be written as well.
- ///
- /// The obsolete min and max fields are always sorted by signed comparison
- /// regardless of column_orders.
+ /// Sort order used for the min_value and max_value fields in the Statistics
+ /// objects and the min_values and max_values fields in the ColumnIndex
+ /// objects of each column in this file. Sort orders are listed in the order
+ /// matching the columns in the schema. The indexes are not necessary the
same
+ /// though, because only leaf nodes of the schema are represented in the list
+ /// of sort orders.
+ ///
+ /// Without column_orders, the meaning of the min_value and max_value fields
+ /// in the Statistics object and the ColumnIndex object is undefined. To
ensure
+ /// well-defined behaviour, if these fields are written to a Parquet file,
+ /// column_orders must be written as well.
+ ///
+ /// The obsolete min and max fields in the Statistics object are always
sorted
+ /// by signed comparison regardless of column_orders.
pub column_orders: Option<Vec<ColumnOrder>>,
/// Encryption algorithm. This field is set only in encrypted files
/// with plaintext footer. Files with encrypted footer store algorithm id
diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs
index 0c90c5405a..fe63e758b2 100644
--- a/parquet/src/schema/printer.rs
+++ b/parquet/src/schema/printer.rs
@@ -167,6 +167,11 @@ fn print_column_chunk_metadata(
Some(bfo) => bfo.to_string(),
};
writeln!(out, "bloom filter offset: {bloom_filter_offset_str}");
+ let bloom_filter_length_str = match cc_metadata.bloom_filter_length() {
+ None => "N/A".to_owned(),
+ Some(bfo) => bfo.to_string(),
+ };
+ writeln!(out, "bloom filter length: {bloom_filter_length_str}");
let offset_index_offset_str = match cc_metadata.offset_index_offset() {
None => "N/A".to_owned(),
Some(oio) => oio.to_string(),