HippoBaro commented on code in PR #9848:
URL: https://github.com/apache/arrow-rs/pull/9848#discussion_r3494341663
##########
parquet/src/arrow/array_reader/struct_array.rs:
##########
@@ -82,107 +88,68 @@ impl ArrayReader for StructArrayReader {
Ok(read.unwrap_or(0))
}
- /// Consume struct records.
- ///
- /// Definition levels of struct array is calculated as following:
- /// ```ignore
- /// def_levels[i] = min(child1_def_levels[i], child2_def_levels[i], ...,
- /// childn_def_levels[i]);
- /// ```
- ///
- /// Repetition levels of struct array is calculated as following:
- /// ```ignore
- /// rep_levels[i] = child1_rep_levels[i];
- /// ```
- ///
- /// The null bitmap of struct array is calculated from def_levels:
- /// ```ignore
- /// null_bitmap[i] = (def_levels[i] >= self.def_level);
- /// ```
- ///
fn consume_batch(&mut self) -> Result<ArrayRef> {
if self.children.is_empty() {
return Ok(Arc::new(StructArray::from(Vec::new())));
}
- let children_array = self
+ let children_arrays = self
.children
.iter_mut()
.map(|reader| reader.consume_batch())
.collect::<Result<Vec<_>>>()?;
- // check that array child data has same size
- let children_array_len = children_array
- .first()
- .map(|arr| arr.len())
- .ok_or_else(|| general_err!("Struct array reader should have at least one child!"))?;
-
- let all_children_len_eq = children_array
- .iter()
- .all(|arr| arr.len() == children_array_len);
- if !all_children_len_eq {
- return Err(general_err!("Not all children array length are the same!"));
- }
-
let DataType::Struct(fields) = &self.data_type else {
return Err(general_err!(
"Internal: StructArrayReader must have struct data type, got {:?}",
self.data_type
));
};
- let fields = fields.clone(); // cloning Fields is cheap (Arc internally)
+ let fields = fields.clone();
- let mut nulls = None;
- if self.nullable {
- // calculate struct def level data
+ let item_count = children_arrays.first().map(|a| a.len()).unwrap_or(0);
- // children should have consistent view of parent, only need to inspect first child
+ if !children_arrays.windows(2).all(|w| w[0].len() == w[1].len()) {
+ return Err(general_err!("Not all children array length are the same!"));
+ }
+
+ // Build struct null bitmap if the struct is nullable.
+ // We iterate def/rep levels and select entries that correspond to
+ // struct rows: skip parent-level padding (d < threshold) and
+ // inner-list continuations (r > struct_rep_level).
+ let nulls = if self.nullable {
let def_levels = self.children[0]
.get_def_levels()
- .expect("child with nullable parents must have definition level");
-
- // calculate bitmap for current array
- let mut bitmap_builder = BooleanBufferBuilder::new(children_array_len);
-
- match self.children[0].get_rep_levels() {
- Some(rep_levels) => {
- // Sanity check
- assert_eq!(rep_levels.len(), def_levels.len());
-
- for (rep_level, def_level) in rep_levels.iter().zip(def_levels) {
- if rep_level > &self.struct_rep_level {
- // Already handled by inner list - SKIP
- continue;
- }
- bitmap_builder.append(*def_level >= self.struct_def_level)
- }
- }
- None => {
- // Safety: slice iterator has a trusted length
- unsafe {
- bitmap_builder.extend_trusted_len(
- def_levels
- .iter()
- .map(|level| *level >= self.struct_def_level),
- )
- }
- }
+ .ok_or_else(|| general_err!("child def levels are None"))?;
+ let rep_levels = self.children[0].get_rep_levels();
+
+ let mut bitmap = BooleanBufferBuilder::new(item_count);
+
crate::arrow::record_reader::definition_levels::build_filtered_validity_bitmap(
Review Comment:
Done, thanks!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]