tustvold commented on code in PR #1588:
URL: https://github.com/apache/arrow-rs/pull/1588#discussion_r852471853
##########
parquet/src/arrow/array_reader/list_array.rs:
##########
@@ -88,97 +84,85 @@ impl<OffsetSize: OffsetSizeTrait> ArrayReader for
ListArrayReader<OffsetSize> {
if next_batch_array.len() == 0 {
return Ok(new_empty_array(&self.data_type));
}
+
let def_levels = self
.item_reader
.get_def_levels()
- .ok_or_else(|| ArrowError("item_reader def levels are
None.".to_string()))?;
+ .ok_or_else(|| general_err!("item_reader def levels are None."))?;
+
let rep_levels = self
.item_reader
.get_rep_levels()
- .ok_or_else(|| ArrowError("item_reader rep levels are
None.".to_string()))?;
-
- if !((def_levels.len() == rep_levels.len())
- && (rep_levels.len() == next_batch_array.len()))
- {
- return Err(ArrowError(
- format!("Expected item_reader def_levels {} and rep_levels {}
to be same length as batch {}", def_levels.len(), rep_levels.len(),
next_batch_array.len()),
- ));
- }
+ .ok_or_else(|| general_err!("item_reader rep levels are None."))?;
- // List definitions can be encoded as 4 values:
- // - n + 0: the list slot is null
- // - n + 1: the list slot is not null, but is empty (i.e. [])
- // - n + 2: the list slot is not null, but its child is empty (i.e. [
null ])
- // - n + 3: the list slot is not null, and its child is not empty
- // Where n is the max definition level of the list's parent.
- // If a Parquet schema's only leaf is the list, then n = 0.
-
- // If the list index is at empty definition, the child slot is null
- let non_null_list_indices =
- def_levels.iter().enumerate().filter_map(|(index, def)| {
- (*def > self.list_empty_def_level).then(|| index as u32)
- });
- let indices = UInt32Array::from_iter_values(non_null_list_indices);
- let batch_values =
- arrow::compute::take(&*next_batch_array.clone(), &indices, None)?;
Review Comment:
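We no longer do this (i.e. compact the child array with `take` so that only
entries whose definition level is above `list_empty_def_level` are kept), which
is likely better for performance, but comes with the caveat that nulls take up
more space than before.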
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]