tustvold commented on code in PR #1746:
URL: https://github.com/apache/arrow-rs/pull/1746#discussion_r883028020


##########
parquet/src/arrow/levels.rs:
##########
@@ -40,114 +40,32 @@
 //!
 //! [1] [parquet-format#nested-encoding](https://github.com/apache/parquet-format#nested-encoding)
 
-use arrow::array::{make_array, Array, ArrayRef, MapArray, StructArray};
+use crate::errors::{ParquetError, Result};
+use arrow::array::{
+    make_array, Array, ArrayData, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait,
+    StructArray,
+};
 use arrow::datatypes::{DataType, Field};
-
-/// Keeps track of the level information per array that is needed to write an Arrow array to Parquet.
-///
-/// When a nested schema is traversed, intermediate [LevelInfo] structs are created to track
-/// the state of parent arrays. When a primitive Arrow array is encountered, a final [LevelInfo]
-/// is created, and this is what is used to index into the array when writing data to Parquet.
-#[derive(Debug, Eq, PartialEq, Clone)]
-pub(crate) struct LevelInfo {
-    /// Array's definition levels
-    pub definition: Vec<i16>,
-    /// Array's optional repetition levels
-    pub repetition: Option<Vec<i16>>,
-    /// Array's offsets, 64-bit is used to accommodate large offset arrays
-    pub array_offsets: Vec<i64>,
-    // TODO: Convert to an Arrow Buffer after ARROW-10766 is merged.
-    /// Array's logical validity mask, whcih gets unpacked for list children.
-    /// If the parent of an array is null, all children are logically treated as
-    /// null. This mask keeps track of that.
-    ///
-    pub array_mask: Vec<bool>,
-    /// The maximum definition at this level, 0 at the record batch
-    pub max_definition: i16,
-    /// The type of array represented by this level info
-    pub level_type: LevelType,
-    /// The offset of the current level's array
-    pub offset: usize,
-    /// The length of the current level's array
-    pub length: usize,
-}
-
-/// LevelType defines the type of level, and whether it is nullable or not
-#[derive(Debug, Eq, PartialEq, Clone, Copy)]
-pub(crate) enum LevelType {
-    Root,
-    List(bool),
-    Struct(bool),
-    Primitive(bool),
-}
-
-impl LevelType {
-    #[inline]
-    const fn level_increment(&self) -> i16 {
-        match self {
-            LevelType::Root => 0,
-            // List repetition adds a constant 1
-            LevelType::List(is_nullable) => 1 + *is_nullable as i16,
-            LevelType::Struct(is_nullable) | LevelType::Primitive(is_nullable) => {
-                *is_nullable as i16
-            }
-        }
-    }
+use std::ops::Range;
+
+/// Performs a depth-first scan of the children of `array`, constructing [`LevelInfo`]
+/// for each leaf column encountered
+pub(crate) fn calculate_array_levels(
+    array: &ArrayRef,
+    field: &Field,
+) -> Result<Vec<LevelInfo>> {
+    let mut builder = LevelInfoBuilder::try_new(field, Default::default())?;
+    builder.write(array, 0..array.len());
+    Ok(builder.finish())
 }
 
-impl LevelInfo {
-    /// Create a new [LevelInfo] by filling `length` slots, and setting an initial offset.
-    ///
-    /// This is a convenience function to populate the starting point of the traversal.
-    pub(crate) fn new(offset: usize, length: usize) -> Self {
-        Self {
-            // a batch has no definition level yet
-            definition: vec![0; length],
-            // a batch has no repetition as it is not a list
-            repetition: None,
-            // a batch has sequential offsets, should be num_rows + 1
-            array_offsets: (0..=(length as i64)).collect(),
-            // all values at a batch-level are non-null
-            array_mask: vec![true; length],
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset,
-            length,
-        }
-    }
-
-    /// Compute nested levels of the Arrow array, recursing into lists and structs.
-    ///
-    /// Returns a list of `LevelInfo`, where each level is for nested primitive arrays.
-    ///
-    /// The parent struct's nullness is tracked, as it determines whether the child
-    /// max_definition should be incremented.
-    /// The 'is_parent_struct' variable asks "is this field's parent a struct?".
-    /// * If we are starting at a [RecordBatch](arrow::record_batch::RecordBatch), this is `false`.
-    /// * If we are calculating a list's child, this is `false`.
-    /// * If we are calculating a struct (i.e. `field.data_type90 == Struct`),
-    /// this depends on whether the struct is a child of a struct.
-    /// * If we are calculating a field inside a [StructArray], this is 'true'.
-    pub(crate) fn calculate_array_levels(
-        &self,
-        array: &ArrayRef,
-        field: &Field,
-    ) -> Vec<Self> {
-        let (array_offsets, array_mask) =
-            Self::get_array_offsets_and_masks(array, self.offset, self.length);
-        match array.data_type() {
-            DataType::Null => vec![Self {
-                definition: self.definition.clone(),
-                repetition: self.repetition.clone(),
-                array_offsets,
-                array_mask,
-                max_definition: self.max_definition.max(1),
-                // Null type is always nullable
-                level_type: LevelType::Primitive(true),
-                offset: self.offset,
-                length: self.length,
-            }],
-            DataType::Boolean
+/// Returns true if the DataType can be represented as a primitive parquet column,
+/// i.e. a leaf array with no children
+fn is_leaf(data_type: &DataType) -> bool {
+    matches!(
+        data_type,

Review Comment:
   I'm not sure I follow? The intent is to match only dictionaries whose value type is a leaf, as these can be handled transparently. We don't support complex dictionary value types (and, for what it's worth, neither does Arrow C++). We used to simply assume that dictionary types were primitive; this completes a TODO I encountered in the code - https://github.com/apache/arrow-rs/pull/1746/files/b104aba76a6868e4296eb0f5a7b4fd0b8960eb62#diff-4b715628e2e0ae6f66e590227d9587cd5f2155055a59ad6b7b0dc7b1914ad8edL316
   
   Edit: Oh, I see what you're saying. If a new arrow type were added we would likely need additional work in the parquet writer to support it, so I think not automatically supporting it is probably safer.
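   
   A minimal sketch of that intent (hypothetical helper names, not the PR's code; the set of leaf variants shown is abbreviated): a dictionary is accepted only when its value type is itself a leaf, so complex dictionary value types are rejected rather than silently assumed to be primitive.
   
   ```rust
   use arrow::datatypes::DataType;
   
   /// Hypothetical helper illustrating the check discussed above: a dictionary
   /// is treated as writable only when its value type is itself a leaf.
   fn is_leaf_or_leaf_dictionary(data_type: &DataType) -> bool {
       match data_type {
           // Dictionaries are written via their value type, so delegate the check
           DataType::Dictionary(_, value_type) => is_leaf(value_type),
           other => is_leaf(other),
       }
   }
   
   /// Abbreviated stand-in for the PR's `is_leaf`: true for types that map to a
   /// primitive parquet column (the real match covers many more variants)
   fn is_leaf(data_type: &DataType) -> bool {
       matches!(
           data_type,
           DataType::Boolean
               | DataType::Int32
               | DataType::Int64
               | DataType::Float64
               | DataType::Utf8
               | DataType::Binary
       )
   }
   
   fn main() {
       // A dictionary of strings keyed by Int32 maps to a primitive parquet column
       let dict = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
       assert!(is_leaf_or_leaf_dictionary(&dict));
       // A dictionary whose value type is nested (e.g. a struct) would return false
   }
   ```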


