[GitHub] [arrow-rs] tustvold commented on a diff in pull request #1588: Add support for nested list arrays (#993)

GitBox Wed, 04 May 2022 09:38:31 -0700


tustvold commented on code in PR #1588:
URL: https://github.com/apache/arrow-rs/pull/1588#discussion_r865045026



##########
parquet/src/arrow/array_reader/list_array.rs:
##########
@@ -88,97 +84,153 @@ impl<OffsetSize: OffsetSizeTrait> ArrayReader for 
ListArrayReader<OffsetSize> {
         if next_batch_array.len() == 0 {
             return Ok(new_empty_array(&self.data_type));
         }
+
         let def_levels = self
             .item_reader
             .get_def_levels()
-            .ok_or_else(|| ArrowError("item_reader def levels are 
None.".to_string()))?;
+            .ok_or_else(|| general_err!("item_reader def levels are None."))?;
+
         let rep_levels = self
             .item_reader
             .get_rep_levels()
-            .ok_or_else(|| ArrowError("item_reader rep levels are 
None.".to_string()))?;
+            .ok_or_else(|| general_err!("item_reader rep levels are None."))?;
 
-        if !((def_levels.len() == rep_levels.len())
-            && (rep_levels.len() == next_batch_array.len()))
-        {
-            return Err(ArrowError(
-                format!("Expected item_reader def_levels {} and rep_levels {} 
to be same length as batch {}", def_levels.len(), rep_levels.len(), 
next_batch_array.len()),
+        if OffsetSize::from_usize(next_batch_array.len()).is_none() {
+            return Err(general_err!(
+                "offset of {} would overflow list array",
+                next_batch_array.len()
             ));
         }
 
-        // List definitions can be encoded as 4 values:
-        // - n + 0: the list slot is null
-        // - n + 1: the list slot is not null, but is empty (i.e. [])
-        // - n + 2: the list slot is not null, but its child is empty (i.e. [ 
null ])
-        // - n + 3: the list slot is not null, and its child is not empty
-        // Where n is the max definition level of the list's parent.
-        // If a Parquet schema's only leaf is the list, then n = 0.
-
-        // If the list index is at empty definition, the child slot is null
-        let non_null_list_indices =
-            def_levels.iter().enumerate().filter_map(|(index, def)| {
-                (*def > self.list_empty_def_level).then(|| index as u32)
-            });
-        let indices = UInt32Array::from_iter_values(non_null_list_indices);
-        let batch_values =
-            arrow::compute::take(&*next_batch_array.clone(), &indices, None)?;
-
-        // first item in each list has rep_level = 0, subsequent items have 
rep_level = 1
-        let mut offsets: Vec<OffsetSize> = Vec::new();
-        let mut cur_offset = OffsetSize::zero();
-        def_levels.iter().zip(rep_levels).for_each(|(d, r)| {
-            if *r == 0 || d == &self.list_empty_def_level {
-                offsets.push(cur_offset);
-            }
-            if d > &self.list_empty_def_level {
-                cur_offset += OffsetSize::one();
-            }
-        });
-
-        offsets.push(cur_offset);
-
-        let num_bytes = bit_util::ceil(offsets.len(), 8);
-        // TODO: A useful optimization is to use the null count to fill with
-        // 0 or null, to reduce individual bits set in a loop.
-        // To favour dense data, set every slot to true, then unset
-        let mut null_buf = 
MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
-        let null_slice = null_buf.as_slice_mut();
-        let mut list_index = 0;
-        for i in 0..rep_levels.len() {
-            // If the level is lower than empty, then the slot is null.
-            // When a list is non-nullable, its empty level = null level,
-            // so this automatically factors that in.
-            if rep_levels[i] == 0 && def_levels[i] < self.list_empty_def_level 
{
-                bit_util::unset_bit(null_slice, list_index);
+        // A non-nullable list has a single definition level indicating if the 
list is empty
+        //
+        // A nullable list has two definition levels associated with it:
+        //
+        // The first identifies if the list is null
+        // The second identifies if the list is empty
+        //
+        // The child data returned above is padded with a value for each 
not-fully defined level.
+        // Therefore null and empty lists will correspond to a value in the 
child array.
+        //
+        // Whilst nulls may have a non-zero slice in the offsets array, empty 
lists must
+        // be of zero length. As a result we MUST filter out values 
corresponding to empty
+        // lists, and for consistency we do the same for nulls.
+
+        // The output offsets for the computed ListArray
+        let mut list_offsets: Vec<OffsetSize> =
+            Vec::with_capacity(next_batch_array.len());
+
+        // The validity mask of the computed ListArray if nullable
+        let mut validity = self
+            .nullable
+            .then(|| BooleanBufferBuilder::new(next_batch_array.len()));
+
+        // The offset into the filtered child data of the current level being 
considered
+        let mut cur_offset = 0;
+
+        // Identifies the start of a run of values to copy from the source 
child data
+        let mut filter_start = None;
+
+        // The number of child values skipped due to empty lists or nulls
+        let mut skipped = 0;
+
+        // Builder used to construct the filtered child data, skipping empty 
lists and nulls
+        let mut child_data_builder = MutableArrayData::new(

Review Comment:
   I switched to using this over the take kernel as I think it makes it easier 
to follow what is going on, it should also be significantly faster where nulls 
and empty lists are rare



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-rs] tustvold commented on a diff in pull request #1588: Add support for nested list arrays (#993)

Reply via email to