This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch 53.0.0-dev
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/53.0.0-dev by this push:
new fe04e09b63 Update Parquet thrift generated structures (#6045)
fe04e09b63 is described below
commit fe04e09b63fce64c8f54b39ce7200ecea93f87ef
Author: Ed Seidl <[email protected]>
AuthorDate: Tue Jul 16 15:16:38 2024 -0700
Update Parquet thrift generated structures (#6045)
* update to latest thrift (as of 11 Jul 2024) from parquet-format
* pass None for optional size statistics
* escape HTML tags
* don't need to escape brackets in arrays
---
parquet/regen.sh | 2 +-
parquet/src/file/metadata/mod.rs | 5 +-
parquet/src/format.rs | 397 ++++++++++++++++++++++++++++++++-------
3 files changed, 339 insertions(+), 65 deletions(-)
diff --git a/parquet/regen.sh b/parquet/regen.sh
index d1b82108a0..39999c7872 100755
--- a/parquet/regen.sh
+++ b/parquet/regen.sh
@@ -17,7 +17,7 @@
# specific language governing permissions and limitations
# under the License.
-REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2
+REVISION=5b564f3c47679526cf72e54f207013f28f53acc4
SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index 40922d52bf..278d1e464e 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -790,6 +790,7 @@ impl ColumnChunkMetaData {
.map(|vec|
vec.iter().map(page_encoding_stats::to_thrift).collect()),
bloom_filter_offset: self.bloom_filter_offset,
bloom_filter_length: self.bloom_filter_length,
+ size_statistics: None,
}
}
@@ -1004,6 +1005,8 @@ impl ColumnIndexBuilder {
self.max_values,
self.boundary_order,
self.null_counts,
+ None,
+ None,
)
}
}
@@ -1052,7 +1055,7 @@ impl OffsetIndexBuilder {
.zip(self.first_row_index_array.iter())
.map(|((offset, size), row_index)| PageLocation::new(*offset,
*size, *row_index))
.collect::<Vec<_>>();
- OffsetIndex::new(locations)
+ OffsetIndex::new(locations, None)
}
}
diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index b210d6ec1b..6c93097b73 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -117,12 +117,12 @@ impl ConvertedType {
/// a list is converted into an optional field containing a repeated field
for its
/// values
pub const LIST: ConvertedType = ConvertedType(3);
- /// an enum is converted into a binary field
+ /// an enum is converted into a BYTE_ARRAY field
pub const ENUM: ConvertedType = ConvertedType(4);
/// A decimal value.
///
- /// This may be used to annotate binary or fixed primitive types. The
- /// underlying byte array stores the unscaled value encoded as two's
+ /// This may be used to annotate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY primitive
+ /// types. The underlying byte array stores the unscaled value encoded as
two's
/// complement using big-endian byte order (the most significant byte is the
/// zeroth element). The value of the decimal is the value * 10^{-scale}.
///
@@ -185,7 +185,7 @@ impl ConvertedType {
pub const JSON: ConvertedType = ConvertedType(19);
/// An embedded BSON document
///
- /// A BSON document embedded within a single BINARY column.
+ /// A BSON document embedded within a single BYTE_ARRAY column.
pub const BSON: ConvertedType = ConvertedType(20);
/// An interval of time
///
@@ -288,9 +288,9 @@ impl From<&ConvertedType> for i32 {
pub struct FieldRepetitionType(pub i32);
impl FieldRepetitionType {
- /// This field is required (can not be null) and each record has exactly 1
value.
+ /// This field is required (can not be null) and each row has exactly 1
value.
pub const REQUIRED: FieldRepetitionType = FieldRepetitionType(0);
- /// The field is optional (can be null) and each record has 0 or 1 values.
+ /// The field is optional (can be null) and each row has 0 or 1 values.
pub const OPTIONAL: FieldRepetitionType = FieldRepetitionType(1);
/// The field is repeated and can contain 0 or more values
pub const REPEATED: FieldRepetitionType = FieldRepetitionType(2);
@@ -379,12 +379,15 @@ impl Encoding {
pub const DELTA_BYTE_ARRAY: Encoding = Encoding(7);
/// Dictionary encoding: the ids are encoded using the RLE encoding
pub const RLE_DICTIONARY: Encoding = Encoding(8);
- /// Encoding for floating-point data.
+ /// Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64,
FIXED_LEN_BYTE_ARRAY).
/// K byte-streams are created where K is the size in bytes of the data type.
- /// The individual bytes of an FP value are scattered to the corresponding
stream and
+ /// The individual bytes of a value are scattered to the corresponding
stream and
/// the streams are concatenated.
/// This itself does not reduce the size of the data but can lead to better
compression
/// afterwards.
+ ///
+ /// Added in 2.8 for FLOAT and DOUBLE.
+ /// Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11.
pub const BYTE_STREAM_SPLIT: Encoding = Encoding(9);
pub const ENUM_VALUES: &'static [Self] = &[
Self::PLAIN,
@@ -634,6 +637,143 @@ impl From<&BoundaryOrder> for i32 {
}
}
+//
+// SizeStatistics
+//
+
+/// A structure for capturing metadata for estimating the unencoded,
+/// uncompressed size of data written. This is useful for readers to estimate
+/// how much memory is needed to reconstruct data in their memory model and for
+/// fine grained filter pushdown on nested structures (the histograms contained
+/// in this structure can help determine the number of nulls at a particular
+/// nesting level and maximum length of lists).
+#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct SizeStatistics {
+ /// The number of physical bytes stored for BYTE_ARRAY data values assuming
+ /// no encoding. This is exclusive of the bytes needed to store the length of
+ /// each byte array. In other words, this field is equivalent to the `(size
+ /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
+ /// written)`. To determine unencoded sizes of other types readers can use
+ /// schema information multiplied by the number of non-null and null values.
+ /// The number of null/non-null values can be inferred from the histograms
+ /// below.
+ ///
+ /// For example, if a column chunk is dictionary-encoded with dictionary
+ /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
+ /// then this value for that data page should be 7 (1 + 1 + 2 + 3).
+ ///
+ /// This field should only be set for types that use BYTE_ARRAY as their
+ /// physical type.
+ pub unencoded_byte_array_data_bytes: Option<i64>,
+ /// When present, there is expected to be one element corresponding to each
+ /// repetition (i.e. size=max repetition_level+1) where each element
+ /// represents the number of times the repetition level was observed in the
+ /// data.
+ ///
+ /// This field may be omitted if max_repetition_level is 0 without loss
+ /// of information.
+ ///
+ pub repetition_level_histogram: Option<Vec<i64>>,
+ /// Same as repetition_level_histogram except for definition levels.
+ ///
+ /// This field may be omitted if max_definition_level is 0 or 1 without
+ /// loss of information.
+ ///
+ pub definition_level_histogram: Option<Vec<i64>>,
+}
+
+impl SizeStatistics {
+ pub fn new<F1, F2, F3>(unencoded_byte_array_data_bytes: F1,
repetition_level_histogram: F2, definition_level_histogram: F3) ->
SizeStatistics where F1: Into<Option<i64>>, F2: Into<Option<Vec<i64>>>, F3:
Into<Option<Vec<i64>>> {
+ SizeStatistics {
+ unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(),
+ repetition_level_histogram: repetition_level_histogram.into(),
+ definition_level_histogram: definition_level_histogram.into(),
+ }
+ }
+}
+
+impl crate::thrift::TSerializable for SizeStatistics {
+ fn read_from_in_protocol<T: TInputProtocol>(i_prot: &mut T) ->
thrift::Result<SizeStatistics> {
+ i_prot.read_struct_begin()?;
+ let mut f_1: Option<i64> = None;
+ let mut f_2: Option<Vec<i64>> = None;
+ let mut f_3: Option<Vec<i64>> = None;
+ loop {
+ let field_ident = i_prot.read_field_begin()?;
+ if field_ident.field_type == TType::Stop {
+ break;
+ }
+ let field_id = field_id(&field_ident)?;
+ match field_id {
+ 1 => {
+ let val = i_prot.read_i64()?;
+ f_1 = Some(val);
+ },
+ 2 => {
+ let list_ident = i_prot.read_list_begin()?;
+ let mut val: Vec<i64> = Vec::with_capacity(list_ident.size as usize);
+ for _ in 0..list_ident.size {
+ let list_elem_0 = i_prot.read_i64()?;
+ val.push(list_elem_0);
+ }
+ i_prot.read_list_end()?;
+ f_2 = Some(val);
+ },
+ 3 => {
+ let list_ident = i_prot.read_list_begin()?;
+ let mut val: Vec<i64> = Vec::with_capacity(list_ident.size as usize);
+ for _ in 0..list_ident.size {
+ let list_elem_1 = i_prot.read_i64()?;
+ val.push(list_elem_1);
+ }
+ i_prot.read_list_end()?;
+ f_3 = Some(val);
+ },
+ _ => {
+ i_prot.skip(field_ident.field_type)?;
+ },
+ };
+ i_prot.read_field_end()?;
+ }
+ i_prot.read_struct_end()?;
+ let ret = SizeStatistics {
+ unencoded_byte_array_data_bytes: f_1,
+ repetition_level_histogram: f_2,
+ definition_level_histogram: f_3,
+ };
+ Ok(ret)
+ }
+ fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {
+ let struct_ident = TStructIdentifier::new("SizeStatistics");
+ o_prot.write_struct_begin(&struct_ident)?;
+ if let Some(fld_var) = self.unencoded_byte_array_data_bytes {
+
o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes",
TType::I64, 1))?;
+ o_prot.write_i64(fld_var)?;
+ o_prot.write_field_end()?
+ }
+ if let Some(ref fld_var) = self.repetition_level_histogram {
+
o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histogram",
TType::List, 2))?;
+ o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len()
as i32))?;
+ for e in fld_var {
+ o_prot.write_i64(*e)?;
+ }
+ o_prot.write_list_end()?;
+ o_prot.write_field_end()?
+ }
+ if let Some(ref fld_var) = self.definition_level_histogram {
+
o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histogram",
TType::List, 3))?;
+ o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len()
as i32))?;
+ for e in fld_var {
+ o_prot.write_i64(*e)?;
+ }
+ o_prot.write_list_end()?;
+ o_prot.write_field_end()?
+ }
+ o_prot.write_field_stop()?;
+ o_prot.write_struct_end()
+ }
+}
+
//
// Statistics
//
@@ -1123,7 +1263,7 @@ impl crate::thrift::TSerializable for NullType {
/// To maintain forward-compatibility in v1, implementations using this logical
/// type must also set scale and precision on the annotated SchemaElement.
///
-/// Allowed for physical types: INT32, INT64, FIXED, and BINARY
+/// Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and
BYTE_ARRAY.
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct DecimalType {
pub scale: i32,
@@ -1620,7 +1760,7 @@ impl crate::thrift::TSerializable for IntType {
/// Embedded JSON logical type annotation
///
-/// Allowed for physical types: BINARY
+/// Allowed for physical types: BYTE_ARRAY
#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct JsonType {
}
@@ -1660,7 +1800,7 @@ impl crate::thrift::TSerializable for JsonType {
/// Embedded BSON logical type annotation
///
-/// Allowed for physical types: BINARY
+/// Allowed for physical types: BYTE_ARRAY
#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct BsonType {
}
@@ -2146,7 +2286,12 @@ impl crate::thrift::TSerializable for SchemaElement {
/// Data page header
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct DataPageHeader {
- /// Number of values, including NULLs, in this data page. *
+ /// Number of values, including NULLs, in this data page.
+ ///
+ /// If an OffsetIndex is present, a page must begin at a row
+ /// boundary (repetition_level = 0). Otherwise, pages may begin
+ /// within a row (repetition_level > 0).
+ ///
pub num_values: i32,
/// Encoding used for this data page *
pub encoding: Encoding,
@@ -2154,7 +2299,7 @@ pub struct DataPageHeader {
pub definition_level_encoding: Encoding,
/// Encoding used for repetition levels *
pub repetition_level_encoding: Encoding,
- /// Optional statistics for the data in this page*
+ /// Optional statistics for the data in this page *
pub statistics: Option<Statistics>,
}
@@ -2390,21 +2535,24 @@ pub struct DataPageHeaderV2 {
/// Number of NULL values, in this data page.
/// Number of non-null = num_values - num_nulls which is also the number of
values in the data section *
pub num_nulls: i32,
- /// Number of rows in this data page. which means pages change on record
boundaries (r = 0) *
+ /// Number of rows in this data page. Every page must begin at a
+ /// row boundary (repetition_level = 0): rows must **not** be
+ /// split across page boundaries when using V2 data pages.
+ ///
pub num_rows: i32,
/// Encoding used for data in this page *
pub encoding: Encoding,
- /// length of the definition levels
+ /// Length of the definition levels
pub definition_levels_byte_length: i32,
- /// length of the repetition levels
+ /// Length of the repetition levels
pub repetition_levels_byte_length: i32,
- /// whether the values are compressed.
+ /// Whether the values are compressed.
/// Which means the section of the page between
/// definition_levels_byte_length + repetition_levels_byte_length + 1 and
compressed_page_size (included)
/// is compressed with the compression_codec.
/// If missing it is considered compressed
pub is_compressed: Option<bool>,
- /// optional statistics for the data in this page *
+ /// Optional statistics for the data in this page *
pub statistics: Option<Statistics>,
}
@@ -3207,10 +3355,10 @@ impl crate::thrift::TSerializable for KeyValue {
// SortingColumn
//
-/// Wrapper struct to specify sort order
+/// Sort order within a RowGroup of a leaf column
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct SortingColumn {
- /// The column index (in this row group) *
+ /// The ordinal position of the column (in this row group) *
pub column_idx: i32,
/// If true, indicates this column is sorted in descending order. *
pub descending: bool,
@@ -3417,10 +3565,15 @@ pub struct ColumnMetaData {
/// Writers should write this field so readers can read the bloom filter
/// in a single I/O.
pub bloom_filter_length: Option<i32>,
+ /// Optional statistics to help estimate total memory when converted to
in-memory
+ /// representations. The histograms contained in these statistics can
+ /// also be useful in some cases for more fine-grained nullability/list
length
+ /// filter pushdown.
+ pub size_statistics: Option<SizeStatistics>,
}
impl ColumnMetaData {
- pub fn new<F8, F10, F11, F12, F13, F14, F15>(type_: Type, encodings:
Vec<Encoding>, path_in_schema: Vec<String>, codec: CompressionCodec,
num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64,
key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10,
dictionary_page_offset: F11, statistics: F12, encoding_stats: F13,
bloom_filter_offset: F14, bloom_filter_length: F15) -> ColumnMetaData where F8:
Into<Option<Vec<KeyValue>>>, F10: Into<Option<i64>>, F11: I [...]
+ pub fn new<F8, F10, F11, F12, F13, F14, F15, F16>(type_: Type, encodings:
Vec<Encoding>, path_in_schema: Vec<String>, codec: CompressionCodec,
num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64,
key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10,
dictionary_page_offset: F11, statistics: F12, encoding_stats: F13,
bloom_filter_offset: F14, bloom_filter_length: F15, size_statistics: F16) ->
ColumnMetaData where F8: Into<Option<Vec<KeyValue>>>, F10 [...]
ColumnMetaData {
type_,
encodings,
@@ -3437,6 +3590,7 @@ impl ColumnMetaData {
encoding_stats: encoding_stats.into(),
bloom_filter_offset: bloom_filter_offset.into(),
bloom_filter_length: bloom_filter_length.into(),
+ size_statistics: size_statistics.into(),
}
}
}
@@ -3459,6 +3613,7 @@ impl crate::thrift::TSerializable for ColumnMetaData {
let mut f_13: Option<Vec<PageEncodingStats>> = None;
let mut f_14: Option<i64> = None;
let mut f_15: Option<i32> = None;
+ let mut f_16: Option<SizeStatistics> = None;
loop {
let field_ident = i_prot.read_field_begin()?;
if field_ident.field_type == TType::Stop {
@@ -3474,8 +3629,8 @@ impl crate::thrift::TSerializable for ColumnMetaData {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<Encoding> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_0 = Encoding::read_from_in_protocol(i_prot)?;
- val.push(list_elem_0);
+ let list_elem_2 = Encoding::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_2);
}
i_prot.read_list_end()?;
f_2 = Some(val);
@@ -3484,8 +3639,8 @@ impl crate::thrift::TSerializable for ColumnMetaData {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<String> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_1 = i_prot.read_string()?;
- val.push(list_elem_1);
+ let list_elem_3 = i_prot.read_string()?;
+ val.push(list_elem_3);
}
i_prot.read_list_end()?;
f_3 = Some(val);
@@ -3510,8 +3665,8 @@ impl crate::thrift::TSerializable for ColumnMetaData {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<KeyValue> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_2 = KeyValue::read_from_in_protocol(i_prot)?;
- val.push(list_elem_2);
+ let list_elem_4 = KeyValue::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_4);
}
i_prot.read_list_end()?;
f_8 = Some(val);
@@ -3536,8 +3691,8 @@ impl crate::thrift::TSerializable for ColumnMetaData {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<PageEncodingStats> =
Vec::with_capacity(list_ident.size as usize);
for _ in 0..list_ident.size {
- let list_elem_3 =
PageEncodingStats::read_from_in_protocol(i_prot)?;
- val.push(list_elem_3);
+ let list_elem_5 =
PageEncodingStats::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_5);
}
i_prot.read_list_end()?;
f_13 = Some(val);
@@ -3550,6 +3705,10 @@ impl crate::thrift::TSerializable for ColumnMetaData {
let val = i_prot.read_i32()?;
f_15 = Some(val);
},
+ 16 => {
+ let val = SizeStatistics::read_from_in_protocol(i_prot)?;
+ f_16 = Some(val);
+ },
_ => {
i_prot.skip(field_ident.field_type)?;
},
@@ -3581,6 +3740,7 @@ impl crate::thrift::TSerializable for ColumnMetaData {
encoding_stats: f_13,
bloom_filter_offset: f_14,
bloom_filter_length: f_15,
+ size_statistics: f_16,
};
Ok(ret)
}
@@ -3662,6 +3822,11 @@ impl crate::thrift::TSerializable for ColumnMetaData {
o_prot.write_i32(fld_var)?;
o_prot.write_field_end()?
}
+ if let Some(ref fld_var) = self.size_statistics {
+ o_prot.write_field_begin(&TFieldIdentifier::new("size_statistics",
TType::Struct, 16))?;
+ fld_var.write_to_out_protocol(o_prot)?;
+ o_prot.write_field_end()?
+ }
o_prot.write_field_stop()?;
o_prot.write_struct_end()
}
@@ -3741,8 +3906,8 @@ impl crate::thrift::TSerializable for
EncryptionWithColumnKey {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<String> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_4 = i_prot.read_string()?;
- val.push(list_elem_4);
+ let list_elem_6 = i_prot.read_string()?;
+ val.push(list_elem_6);
}
i_prot.read_list_end()?;
f_1 = Some(val);
@@ -3881,11 +4046,19 @@ pub struct ColumnChunk {
/// metadata. This path is relative to the current file.
///
pub file_path: Option<String>,
- /// Byte offset in file_path to the ColumnMetaData *
+ /// Deprecated: Byte offset in file_path to the ColumnMetaData
+ ///
+ /// Past use of this field has been inconsistent, with some implementations
+ /// using it to point to the ColumnMetaData and some using it to point to
+ /// the first page in the column chunk. In many cases, the ColumnMetaData at
this
+ /// location is wrong. This field is now deprecated and should not be used.
+ /// Writers should set this field to 0 if no ColumnMetaData has been written
outside
+ /// the footer.
pub file_offset: i64,
- /// Column metadata for this chunk. This is the same content as what is at
- /// file_path/file_offset. Having it here has it replicated in the file
- /// metadata.
+ /// Column metadata for this chunk. Some writers may also replicate this at
the
+ /// location pointed to by file_path/file_offset.
+ /// Note: while marked as optional, this field is in fact required by most
major
+ /// Parquet implementations. As such, writers MUST populate this field.
///
pub meta_data: Option<ColumnMetaData>,
/// File offset of ColumnChunk's OffsetIndex *
@@ -4107,8 +4280,8 @@ impl crate::thrift::TSerializable for RowGroup {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<ColumnChunk> = Vec::with_capacity(list_ident.size
as usize);
for _ in 0..list_ident.size {
- let list_elem_5 = ColumnChunk::read_from_in_protocol(i_prot)?;
- val.push(list_elem_5);
+ let list_elem_7 = ColumnChunk::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_7);
}
i_prot.read_list_end()?;
f_1 = Some(val);
@@ -4125,8 +4298,8 @@ impl crate::thrift::TSerializable for RowGroup {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<SortingColumn> = Vec::with_capacity(list_ident.size
as usize);
for _ in 0..list_ident.size {
- let list_elem_6 = SortingColumn::read_from_in_protocol(i_prot)?;
- val.push(list_elem_6);
+ let list_elem_8 = SortingColumn::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_8);
}
i_prot.read_list_end()?;
f_4 = Some(val);
@@ -4331,8 +4504,9 @@ pub struct PageLocation {
/// Size of the page, including header. Sum of compressed_page_size and
header
/// length
pub compressed_page_size: i32,
- /// Index within the RowGroup of the first row of the page; this means pages
- /// change on record boundaries (r = 0).
+ /// Index within the RowGroup of the first row of the page. When an
+ /// OffsetIndex is present, pages must begin on row boundaries
+ /// (repetition_level = 0).
pub first_row_index: i64,
}
@@ -4409,17 +4583,28 @@ impl crate::thrift::TSerializable for PageLocation {
// OffsetIndex
//
+/// Optional offsets for each data page in a ColumnChunk.
+///
+/// Forms part of the page index, along with ColumnIndex.
+///
+/// OffsetIndex may be present even if ColumnIndex is not.
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct OffsetIndex {
/// PageLocations, ordered by increasing PageLocation.offset. It is required
/// that page_locations\[i\].first_row_index <
page_locations\[i+1\].first_row_index.
pub page_locations: Vec<PageLocation>,
+ /// Unencoded/uncompressed size for BYTE_ARRAY types.
+ ///
+ /// See documentation for unencoded_byte_array_data_bytes in SizeStatistics for
+ /// more details on this field.
+ pub unencoded_byte_array_data_bytes: Option<Vec<i64>>,
}
impl OffsetIndex {
- pub fn new(page_locations: Vec<PageLocation>) -> OffsetIndex {
+ pub fn new<F2>(page_locations: Vec<PageLocation>,
unencoded_byte_array_data_bytes: F2) -> OffsetIndex where F2:
Into<Option<Vec<i64>>> {
OffsetIndex {
page_locations,
+ unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(),
}
}
}
@@ -4428,6 +4613,7 @@ impl crate::thrift::TSerializable for OffsetIndex {
fn read_from_in_protocol<T: TInputProtocol>(i_prot: &mut T) ->
thrift::Result<OffsetIndex> {
i_prot.read_struct_begin()?;
let mut f_1: Option<Vec<PageLocation>> = None;
+ let mut f_2: Option<Vec<i64>> = None;
loop {
let field_ident = i_prot.read_field_begin()?;
if field_ident.field_type == TType::Stop {
@@ -4439,12 +4625,22 @@ impl crate::thrift::TSerializable for OffsetIndex {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<PageLocation> = Vec::with_capacity(list_ident.size
as usize);
for _ in 0..list_ident.size {
- let list_elem_7 = PageLocation::read_from_in_protocol(i_prot)?;
- val.push(list_elem_7);
+ let list_elem_9 = PageLocation::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_9);
}
i_prot.read_list_end()?;
f_1 = Some(val);
},
+ 2 => {
+ let list_ident = i_prot.read_list_begin()?;
+ let mut val: Vec<i64> = Vec::with_capacity(list_ident.size as usize);
+ for _ in 0..list_ident.size {
+ let list_elem_10 = i_prot.read_i64()?;
+ val.push(list_elem_10);
+ }
+ i_prot.read_list_end()?;
+ f_2 = Some(val);
+ },
_ => {
i_prot.skip(field_ident.field_type)?;
},
@@ -4455,6 +4651,7 @@ impl crate::thrift::TSerializable for OffsetIndex {
verify_required_field_exists("OffsetIndex.page_locations", &f_1)?;
let ret = OffsetIndex {
page_locations: f_1.expect("auto-generated code should have checked for
presence of required fields"),
+ unencoded_byte_array_data_bytes: f_2,
};
Ok(ret)
}
@@ -4468,6 +4665,15 @@ impl crate::thrift::TSerializable for OffsetIndex {
}
o_prot.write_list_end()?;
o_prot.write_field_end()?;
+ if let Some(ref fld_var) = self.unencoded_byte_array_data_bytes {
+
o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes",
TType::List, 2))?;
+ o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len()
as i32))?;
+ for e in fld_var {
+ o_prot.write_i64(*e)?;
+ }
+ o_prot.write_list_end()?;
+ o_prot.write_field_end()?
+ }
o_prot.write_field_stop()?;
o_prot.write_struct_end()
}
@@ -4477,8 +4683,14 @@ impl crate::thrift::TSerializable for OffsetIndex {
// ColumnIndex
//
-/// Description for ColumnIndex.
-/// Each `<array-field>`\[i\] refers to the page at
OffsetIndex.page_locations\[i\]
+/// Optional statistics for each data page in a ColumnChunk.
+///
+/// Forms part of the page index, along with OffsetIndex.
+///
+/// If this structure is present, OffsetIndex must also be present.
+///
+/// For each field in this structure, `<field>`\[i\] refers to the page at
+/// OffsetIndex.page_locations\[i\]
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct ColumnIndex {
/// A list of Boolean values to determine the validity of the corresponding
@@ -4504,16 +4716,33 @@ pub struct ColumnIndex {
pub boundary_order: BoundaryOrder,
/// A list containing the number of null values for each page *
pub null_counts: Option<Vec<i64>>,
+ /// Contains repetition level histograms for each page
+ /// concatenated together. The repetition_level_histogram field on
+ /// SizeStatistics contains more details.
+ ///
+ /// When present the length should always be (number of pages *
+ /// (max_repetition_level + 1)) elements.
+ ///
+ /// Element 0 is the first element of the histogram for the first page.
+ /// Element (max_repetition_level + 1) is the first element of the histogram
+ /// for the second page.
+ ///
+ pub repetition_level_histograms: Option<Vec<i64>>,
+ /// Same as repetition_level_histograms except for definition levels.
+ ///
+ pub definition_level_histograms: Option<Vec<i64>>,
}
impl ColumnIndex {
- pub fn new<F5>(null_pages: Vec<bool>, min_values: Vec<Vec<u8>>, max_values:
Vec<Vec<u8>>, boundary_order: BoundaryOrder, null_counts: F5) -> ColumnIndex
where F5: Into<Option<Vec<i64>>> {
+ pub fn new<F5, F6, F7>(null_pages: Vec<bool>, min_values: Vec<Vec<u8>>,
max_values: Vec<Vec<u8>>, boundary_order: BoundaryOrder, null_counts: F5,
repetition_level_histograms: F6, definition_level_histograms: F7) ->
ColumnIndex where F5: Into<Option<Vec<i64>>>, F6: Into<Option<Vec<i64>>>, F7:
Into<Option<Vec<i64>>> {
ColumnIndex {
null_pages,
min_values,
max_values,
boundary_order,
null_counts: null_counts.into(),
+ repetition_level_histograms: repetition_level_histograms.into(),
+ definition_level_histograms: definition_level_histograms.into(),
}
}
}
@@ -4526,6 +4755,8 @@ impl crate::thrift::TSerializable for ColumnIndex {
let mut f_3: Option<Vec<Vec<u8>>> = None;
let mut f_4: Option<BoundaryOrder> = None;
let mut f_5: Option<Vec<i64>> = None;
+ let mut f_6: Option<Vec<i64>> = None;
+ let mut f_7: Option<Vec<i64>> = None;
loop {
let field_ident = i_prot.read_field_begin()?;
if field_ident.field_type == TType::Stop {
@@ -4537,8 +4768,8 @@ impl crate::thrift::TSerializable for ColumnIndex {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<bool> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_8 = i_prot.read_bool()?;
- val.push(list_elem_8);
+ let list_elem_11 = i_prot.read_bool()?;
+ val.push(list_elem_11);
}
i_prot.read_list_end()?;
f_1 = Some(val);
@@ -4547,8 +4778,8 @@ impl crate::thrift::TSerializable for ColumnIndex {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<Vec<u8>> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_9 = i_prot.read_bytes()?;
- val.push(list_elem_9);
+ let list_elem_12 = i_prot.read_bytes()?;
+ val.push(list_elem_12);
}
i_prot.read_list_end()?;
f_2 = Some(val);
@@ -4557,8 +4788,8 @@ impl crate::thrift::TSerializable for ColumnIndex {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<Vec<u8>> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_10 = i_prot.read_bytes()?;
- val.push(list_elem_10);
+ let list_elem_13 = i_prot.read_bytes()?;
+ val.push(list_elem_13);
}
i_prot.read_list_end()?;
f_3 = Some(val);
@@ -4571,12 +4802,32 @@ impl crate::thrift::TSerializable for ColumnIndex {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<i64> = Vec::with_capacity(list_ident.size as usize);
for _ in 0..list_ident.size {
- let list_elem_11 = i_prot.read_i64()?;
- val.push(list_elem_11);
+ let list_elem_14 = i_prot.read_i64()?;
+ val.push(list_elem_14);
}
i_prot.read_list_end()?;
f_5 = Some(val);
},
+ 6 => {
+ let list_ident = i_prot.read_list_begin()?;
+ let mut val: Vec<i64> = Vec::with_capacity(list_ident.size as usize);
+ for _ in 0..list_ident.size {
+ let list_elem_15 = i_prot.read_i64()?;
+ val.push(list_elem_15);
+ }
+ i_prot.read_list_end()?;
+ f_6 = Some(val);
+ },
+ 7 => {
+ let list_ident = i_prot.read_list_begin()?;
+ let mut val: Vec<i64> = Vec::with_capacity(list_ident.size as usize);
+ for _ in 0..list_ident.size {
+ let list_elem_16 = i_prot.read_i64()?;
+ val.push(list_elem_16);
+ }
+ i_prot.read_list_end()?;
+ f_7 = Some(val);
+ },
_ => {
i_prot.skip(field_ident.field_type)?;
},
@@ -4594,6 +4845,8 @@ impl crate::thrift::TSerializable for ColumnIndex {
max_values: f_3.expect("auto-generated code should have checked for
presence of required fields"),
boundary_order: f_4.expect("auto-generated code should have checked for
presence of required fields"),
null_counts: f_5,
+ repetition_level_histograms: f_6,
+ definition_level_histograms: f_7,
};
Ok(ret)
}
@@ -4633,6 +4886,24 @@ impl crate::thrift::TSerializable for ColumnIndex {
o_prot.write_list_end()?;
o_prot.write_field_end()?
}
+ if let Some(ref fld_var) = self.repetition_level_histograms {
+
o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histograms",
TType::List, 6))?;
+ o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len()
as i32))?;
+ for e in fld_var {
+ o_prot.write_i64(*e)?;
+ }
+ o_prot.write_list_end()?;
+ o_prot.write_field_end()?
+ }
+ if let Some(ref fld_var) = self.definition_level_histograms {
+
o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histograms",
TType::List, 7))?;
+ o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len()
as i32))?;
+ for e in fld_var {
+ o_prot.write_i64(*e)?;
+ }
+ o_prot.write_list_end()?;
+ o_prot.write_field_end()?
+ }
o_prot.write_field_stop()?;
o_prot.write_struct_end()
}
@@ -4992,8 +5263,8 @@ impl crate::thrift::TSerializable for FileMetaData {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<SchemaElement> = Vec::with_capacity(list_ident.size
as usize);
for _ in 0..list_ident.size {
- let list_elem_12 = SchemaElement::read_from_in_protocol(i_prot)?;
- val.push(list_elem_12);
+ let list_elem_17 = SchemaElement::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_17);
}
i_prot.read_list_end()?;
f_2 = Some(val);
@@ -5006,8 +5277,8 @@ impl crate::thrift::TSerializable for FileMetaData {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<RowGroup> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_13 = RowGroup::read_from_in_protocol(i_prot)?;
- val.push(list_elem_13);
+ let list_elem_18 = RowGroup::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_18);
}
i_prot.read_list_end()?;
f_4 = Some(val);
@@ -5016,8 +5287,8 @@ impl crate::thrift::TSerializable for FileMetaData {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<KeyValue> = Vec::with_capacity(list_ident.size as
usize);
for _ in 0..list_ident.size {
- let list_elem_14 = KeyValue::read_from_in_protocol(i_prot)?;
- val.push(list_elem_14);
+ let list_elem_19 = KeyValue::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_19);
}
i_prot.read_list_end()?;
f_5 = Some(val);
@@ -5030,8 +5301,8 @@ impl crate::thrift::TSerializable for FileMetaData {
let list_ident = i_prot.read_list_begin()?;
let mut val: Vec<ColumnOrder> = Vec::with_capacity(list_ident.size
as usize);
for _ in 0..list_ident.size {
- let list_elem_15 = ColumnOrder::read_from_in_protocol(i_prot)?;
- val.push(list_elem_15);
+ let list_elem_20 = ColumnOrder::read_from_in_protocol(i_prot)?;
+ val.push(list_elem_20);
}
i_prot.read_list_end()?;
f_7 = Some(val);