tustvold commented on code in PR #1746: URL: https://github.com/apache/arrow-rs/pull/1746#discussion_r883028020
########## parquet/src/arrow/levels.rs: ########## @@ -40,114 +40,32 @@ //! //! \[1\] [parquet-format#nested-encoding](https://github.com/apache/parquet-format#nested-encoding) -use arrow::array::{make_array, Array, ArrayRef, MapArray, StructArray}; +use crate::errors::{ParquetError, Result}; +use arrow::array::{ + make_array, Array, ArrayData, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, + StructArray, +}; use arrow::datatypes::{DataType, Field}; - -/// Keeps track of the level information per array that is needed to write an Arrow array to Parquet. -/// -/// When a nested schema is traversed, intermediate [LevelInfo] structs are created to track -/// the state of parent arrays. When a primitive Arrow array is encountered, a final [LevelInfo] -/// is created, and this is what is used to index into the array when writing data to Parquet. -#[derive(Debug, Eq, PartialEq, Clone)] -pub(crate) struct LevelInfo { - /// Array's definition levels - pub definition: Vec<i16>, - /// Array's optional repetition levels - pub repetition: Option<Vec<i16>>, - /// Array's offsets, 64-bit is used to accommodate large offset arrays - pub array_offsets: Vec<i64>, - // TODO: Convert to an Arrow Buffer after ARROW-10766 is merged. - /// Array's logical validity mask, whcih gets unpacked for list children. - /// If the parent of an array is null, all children are logically treated as - /// null. This mask keeps track of that. - /// - pub array_mask: Vec<bool>, - /// The maximum definition at this level, 0 at the record batch - pub max_definition: i16, - /// The type of array represented by this level info - pub level_type: LevelType, - /// The offset of the current level's array - pub offset: usize, - /// The length of the current level's array - pub length: usize, -} - -/// LevelType defines the type of level, and whether it is nullable or not -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub(crate) enum LevelType { - Root, - List(bool), - Struct(bool), - Primitive(bool), -} - -impl LevelType { - #[inline] - const fn level_increment(&self) -> i16 { - match self { - LevelType::Root => 0, - // List repetition adds a constant 1 - LevelType::List(is_nullable) => 1 + *is_nullable as i16, - LevelType::Struct(is_nullable) | LevelType::Primitive(is_nullable) => { - *is_nullable as i16 - } - } - } +use std::ops::Range; + +/// Performs a depth-first scan of the children of `array`, constructing [`LevelInfo`] +/// for each leaf column encountered +pub(crate) fn calculate_array_levels( + array: &ArrayRef, + field: &Field, +) -> Result<Vec<LevelInfo>> { + let mut builder = LevelInfoBuilder::try_new(field, Default::default())?; + builder.write(array, 0..array.len()); + Ok(builder.finish()) } -impl LevelInfo { - /// Create a new [LevelInfo] by filling `length` slots, and setting an initial offset. - /// - /// This is a convenience function to populate the starting point of the traversal. - pub(crate) fn new(offset: usize, length: usize) -> Self { - Self { - // a batch has no definition level yet - definition: vec![0; length], - // a batch has no repetition as it is not a list - repetition: None, - // a batch has sequential offsets, should be num_rows + 1 - array_offsets: (0..=(length as i64)).collect(), - // all values at a batch-level are non-null - array_mask: vec![true; length], - max_definition: 0, - level_type: LevelType::Root, - offset, - length, - } - } - - /// Compute nested levels of the Arrow array, recursing into lists and structs. - /// - /// Returns a list of `LevelInfo`, where each level is for nested primitive arrays. - /// - /// The parent struct's nullness is tracked, as it determines whether the child - /// max_definition should be incremented. - /// The 'is_parent_struct' variable asks "is this field's parent a struct?". - /// * If we are starting at a [RecordBatch](arrow::record_batch::RecordBatch), this is `false`. - /// * If we are calculating a list's child, this is `false`. - /// * If we are calculating a struct (i.e. `field.data_type90 == Struct`), - /// this depends on whether the struct is a child of a struct. - /// * If we are calculating a field inside a [StructArray], this is 'true'. - pub(crate) fn calculate_array_levels( - &self, - array: &ArrayRef, - field: &Field, - ) -> Vec<Self> { - let (array_offsets, array_mask) = - Self::get_array_offsets_and_masks(array, self.offset, self.length); - match array.data_type() { - DataType::Null => vec![Self { - definition: self.definition.clone(), - repetition: self.repetition.clone(), - array_offsets, - array_mask, - max_definition: self.max_definition.max(1), - // Null type is always nullable - level_type: LevelType::Primitive(true), - offset: self.offset, - length: self.length, - }], - DataType::Boolean +/// Returns true if the DataType can be represented as a primitive parquet column, +/// i.e. a leaf array with no children +fn is_leaf(data_type: &DataType) -> bool { + matches!( + data_type, Review Comment: I'm not sure I follow? The intent is that we can match just dictionaries where the value type is a leaf, as this can be handled transparently. We don't support complex dictionary value types (and for what it is worth neither does Arrow C++). We used to simply assume that dictionary types were primitive, this is completing a TODO I encountered in the code - https://github.com/apache/arrow-rs/pull/1746/files/b104aba76a6868e4296eb0f5a7b4fd0b8960eb62#diff-4b715628e2e0ae6f66e590227d9587cd5f2155055a59ad6b7b0dc7b1914ad8edL316 Edit: Oh I see what you're saying, I think if a new arrow type were added we would likely need additional work in the parquet writer to support it, and so I think not automatically supporting it is probably safer. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
