This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 85edafc fix: Fix a bug in how filter indices are calculated (#1185)
85edafc is described below
commit 85edafcda90051b25cbad13ce6d6321006f65282
Author: Helgi Kristvin Sigurbjarnarson <[email protected]>
AuthorDate: Mon Jan 17 07:49:14 2022 -0800
fix: Fix a bug in how filter indices are calculated (#1185)
* fix: Fix a bug in how filter indices are calculated
Using the definition level and the nullability of the column only
produces the correct indices if max_definition - 1 is the list level.
For deeper nesting (struct in a list) this produces incorrect indices,
silently causing incorrect data to be written.
This fix uses the array offsets to compute the indices instead.
* add assertions
---
parquet/src/arrow/levels.rs | 75 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 61 insertions(+), 14 deletions(-)
diff --git a/parquet/src/arrow/levels.rs b/parquet/src/arrow/levels.rs
index 0746088..ea1f212 100644
--- a/parquet/src/arrow/levels.rs
+++ b/parquet/src/arrow/levels.rs
@@ -758,14 +758,14 @@ impl LevelInfo {
/// Given a level's information, calculate the offsets required to index
an array correctly.
pub(crate) fn filter_array_indices(&self) -> Vec<usize> {
- // happy path if not dealing with lists
- let is_nullable = match self.level_type {
- LevelType::Primitive(is_nullable) => is_nullable,
- _ => panic!(
+ if !matches!(self.level_type, LevelType::Primitive(_)) {
+ panic!(
"Cannot filter indices on a non-primitive array, found {:?}",
self.level_type
- ),
- };
+ );
+ }
+
+ // happy path if not dealing with lists
if self.repetition.is_none() {
return self
.definition
@@ -780,17 +780,26 @@ impl LevelInfo {
})
.collect();
}
+
let mut filtered = vec![];
- // remove slots that are false from definition_mask
+ let mut definition_levels = self.definition.iter();
let mut index = 0;
- self.definition.iter().for_each(|def| {
- if *def == self.max_definition {
- filtered.push(index);
- }
- if *def >= self.max_definition - is_nullable as i16 {
- index += 1;
+
+ for len in self.array_offsets.windows(2).map(|s| s[1] - s[0]) {
+ if len == 0 {
+ // Skip this definition level--the iterator should not be
empty, and the definition
+ // level be less than max_definition, i.e., a null value)
+ assert!(*definition_levels.next().unwrap() <
self.max_definition);
+ } else {
+ for (_, def) in (0..len).zip(&mut definition_levels) {
+ if *def == self.max_definition {
+ filtered.push(index);
+ }
+ index += 1;
+ }
}
- });
+ }
+
filtered
}
}
@@ -1773,4 +1782,42 @@ mod tests {
assert_eq!(list_level, &expected_level);
}
+
+ #[test]
+ fn test_nested_indices() {
+ // Given a buffer like
+ // [0, null, null, 1, 2]
+ //
+ // The two level infos below might represent the two structures
+ // 1: [{a: 0}], [], null, [null, null], [{a: 1}], [{a: 2}]
+ // 2: [0], [], null, [null, null], [1], [2]
+ //
+ // (That is, their only difference is that the leaf values are nested
one level deeper in a
+ // struct).
+
+ let level1 = LevelInfo {
+ definition: vec![4, 1, 0, 2, 2, 4, 4],
+ repetition: Some(vec![0, 0, 0, 0, 1, 0, 0]),
+ array_offsets: vec![0, 1, 1, 1, 3, 4, 5],
+ array_mask: vec![true, true, false, false, false, false, true],
+ max_definition: 4,
+ level_type: LevelType::Primitive(true),
+ offset: 0,
+ length: 5,
+ };
+
+ let level2 = LevelInfo {
+ definition: vec![3, 1, 0, 2, 2, 3, 3],
+ repetition: Some(vec![0, 0, 0, 0, 1, 0, 0]),
+ array_offsets: vec![0, 1, 1, 1, 3, 4, 5],
+ array_mask: vec![true, true, false, false, false, false, true],
+ max_definition: 3,
+ level_type: LevelType::Primitive(true),
+ offset: 0,
+ length: 5,
+ };
+
+ // filter_array_indices should return the same indices in this case.
+ assert_eq!(level1.filter_array_indices(),
level2.filter_array_indices());
+ }
}