nevi-me commented on code in PR #1746:
URL: https://github.com/apache/arrow-rs/pull/1746#discussion_r883522239


##########
parquet/src/arrow/levels.rs:
##########
@@ -825,424 +477,270 @@ mod tests {
 
     use arrow::array::*;
     use arrow::buffer::Buffer;
-    use arrow::datatypes::{Schema, ToByteSlice};
+    use arrow::datatypes::{Int32Type, Schema, ToByteSlice};
     use arrow::record_batch::RecordBatch;
+    use arrow::util::pretty::pretty_format_columns;
 
     #[test]
     fn test_calculate_array_levels_twitter_example() {
         // based on the example at 
https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html
         // [[a, b, c], [d, e, f, g]], [[h], [i,j]]
-        let parent_levels = LevelInfo {
-            definition: vec![0, 0],
-            repetition: None,
-            array_offsets: vec![0, 1, 2], // 2 records, root offsets always 
sequential
-            array_mask: vec![true, true], // both lists defined
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset: 0,
-            length: 2,
-        };
-        // offset into array, each level1 has 2 values
-        let array_offsets = vec![0, 2, 4];
-        let array_mask = vec![true, true];
-
-        // calculate level1 levels
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask,
-            LevelType::List(false),
-        );
-        //
-        let expected_levels = LevelInfo {
-            definition: vec![1, 1, 1, 1],
-            repetition: Some(vec![0, 1, 0, 1]),
-            array_offsets,
-            array_mask: vec![true, true, true, true],
-            max_definition: 1,
-            level_type: LevelType::List(false),
-            offset: 0,
-            length: 4,
-        };
-        // the separate asserts make it easier to see what's failing
-        assert_eq!(&levels.definition, &expected_levels.definition);
-        assert_eq!(&levels.repetition, &expected_levels.repetition);
-        assert_eq!(&levels.array_mask, &expected_levels.array_mask);
-        assert_eq!(&levels.array_offsets, &expected_levels.array_offsets);
-        assert_eq!(&levels.max_definition, &expected_levels.max_definition);
-        assert_eq!(&levels.level_type, &expected_levels.level_type);
-        // this assert is to help if there are more variables added to the 
struct
-        assert_eq!(&levels, &expected_levels);
-
-        // level2
-        let parent_levels = levels;
-        let array_offsets = vec![0, 3, 7, 8, 10];
-        let array_mask = vec![true, true, true, true];
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask,
-            LevelType::List(false),
-        );
-        let expected_levels = LevelInfo {
-            definition: vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
-            repetition: Some(vec![0, 2, 2, 1, 2, 2, 2, 0, 1, 2]),
-            array_offsets,
-            array_mask: vec![true; 10],
-            max_definition: 2,
-            level_type: LevelType::List(false),
-            offset: 0,
-            length: 10,
+
+        let leaf_type = Field::new("item", DataType::Int32, false);
+        let inner_type = DataType::List(Box::new(leaf_type));
+        let inner_field = Field::new("l2", inner_type.clone(), false);
+        let outer_type = DataType::List(Box::new(inner_field));
+        let outer_field = Field::new("l1", outer_type.clone(), false);
+
+        let primitives = Int32Array::from_iter(0..10);
+
+        // Cannot use from_iter_primitive as always infers nullable
+        let offsets = Buffer::from_iter([0_i32, 3, 7, 8, 10]);
+        let inner_list = ArrayDataBuilder::new(inner_type)
+            .len(4)
+            .add_buffer(offsets)
+            .add_child_data(primitives.data().clone())
+            .build()
+            .unwrap();
+
+        let offsets = Buffer::from_iter([0_i32, 2, 4]);
+        let outer_list = ArrayDataBuilder::new(outer_type)
+            .len(2)
+            .add_buffer(offsets)
+            .add_child_data(inner_list)
+            .build()
+            .unwrap();
+        let outer_list = make_array(outer_list);
+
+        let levels = calculate_array_levels(&outer_list, 
&outer_field).unwrap();
+        assert_eq!(levels.len(), 1);
+
+        let expected = LevelInfo {
+            def_levels: Some(vec![2; 10]),
+            rep_levels: Some(vec![0, 2, 2, 1, 2, 2, 2, 0, 1, 2]),
+            non_null_indices: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            max_def_level: 2,
+            max_rep_level: 2,
         };
-        assert_eq!(&levels.definition, &expected_levels.definition);
-        assert_eq!(&levels.repetition, &expected_levels.repetition);
-        assert_eq!(&levels.array_mask, &expected_levels.array_mask);
-        assert_eq!(&levels.max_definition, &expected_levels.max_definition);
-        assert_eq!(&levels.array_offsets, &expected_levels.array_offsets);
-        assert_eq!(&levels.level_type, &expected_levels.level_type);
-        assert_eq!(&levels, &expected_levels);
+        assert_eq!(&levels[0], &expected);
     }
 
     #[test]
     fn test_calculate_one_level_1() {
         // This test calculates the levels for a non-null primitive array
-        let parent_levels = LevelInfo {
-            definition: vec![0; 10],
-            repetition: None,
-            array_offsets: (0..=10).collect(),
-            array_mask: vec![true; 10],
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset: 0,
-            length: 10,
-        };
-        let array_offsets: Vec<i64> = (0..=10).collect();
-        let array_mask = vec![true; 10];
+        let array = Arc::new(Int32Array::from_iter(0..10)) as ArrayRef;
+        let field = Field::new("item", DataType::Int32, false);
+
+        let levels = calculate_array_levels(&array, &field).unwrap();
+        assert_eq!(levels.len(), 1);
 
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask.clone(),
-            LevelType::Primitive(false),
-        );
         let expected_levels = LevelInfo {
-            // As it is non-null, definitions can be omitted
-            definition: vec![0; 10],
-            repetition: None,
-            array_offsets,
-            array_mask,
-            max_definition: 0,
-            level_type: LevelType::Primitive(false),
-            offset: 0,
-            length: 10,
+            def_levels: None,
+            rep_levels: None,
+            non_null_indices: (0..10).collect(),
+            max_def_level: 0,
+            max_rep_level: 0,
         };
-        assert_eq!(&levels, &expected_levels);
+        assert_eq!(&levels[0], &expected_levels);
     }
 
     #[test]
     fn test_calculate_one_level_2() {
-        // This test calculates the levels for a non-null primitive array
-        let parent_levels = LevelInfo {
-            definition: vec![0; 5],
-            repetition: None,
-            array_offsets: (0..=5).collect(),
-            array_mask: vec![true, true, true, true, true],
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset: 0,
-            length: 5,
-        };
-        let array_offsets: Vec<i64> = (0..=5).collect();
-        let array_mask = vec![true, false, true, true, false];
+        // This test calculates the levels for a nullable primitive array
+        let array = Arc::new(Int32Array::from_iter([
+            Some(0),
+            None,
+            Some(0),
+            Some(0),
+            None,
+        ])) as ArrayRef;
+        let field = Field::new("item", DataType::Int32, true);
+
+        let levels = calculate_array_levels(&array, &field).unwrap();
+        assert_eq!(levels.len(), 1);
 
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask.clone(),
-            LevelType::Primitive(true),
-        );
         let expected_levels = LevelInfo {
-            definition: vec![1, 0, 1, 1, 0],
-            repetition: None,
-            array_offsets,
-            array_mask,
-            max_definition: 1,
-            level_type: LevelType::Primitive(true),
-            offset: 0,
-            length: 5,
+            def_levels: Some(vec![1, 0, 1, 1, 0]),
+            rep_levels: None,
+            non_null_indices: vec![0, 2, 3],
+            max_def_level: 1,
+            max_rep_level: 0,
         };
-        assert_eq!(&levels, &expected_levels);
+        assert_eq!(&levels[0], &expected_levels);
     }
 
     #[test]
     fn test_calculate_array_levels_1() {
+        let leaf_field = Field::new("item", DataType::Int32, false);
+        let list_type = DataType::List(Box::new(leaf_field));
+
         // if all array values are defined (e.g. batch<list<_>>)
         // [[0], [1], [2], [3], [4]]
-        let parent_levels = LevelInfo {
-            definition: vec![0; 5],
-            repetition: None,
-            array_offsets: vec![0, 1, 2, 3, 4, 5],
-            array_mask: vec![true, true, true, true, true],
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset: 0,
-            length: 5,
+
+        let leaf_array = Int32Array::from_iter(0..5);
+        // Cannot use from_iter_primitive as always infers nullable
+        let offsets = Buffer::from_iter(0_i32..6);
+        let list = ArrayDataBuilder::new(list_type.clone())
+            .len(5)
+            .add_buffer(offsets)
+            .add_child_data(leaf_array.data().clone())
+            .build()
+            .unwrap();
+        let list = make_array(list);
+
+        let list_field = Field::new("list", list_type.clone(), false);
+        let levels = calculate_array_levels(&list, &list_field).unwrap();
+        assert_eq!(levels.len(), 1);
+
+        let expected_levels = LevelInfo {
+            def_levels: Some(vec![1; 5]),
+            rep_levels: Some(vec![0; 5]),
+            non_null_indices: (0..5).collect(),
+            max_def_level: 1,
+            max_rep_level: 1,
         };
-        let array_offsets = vec![0, 2, 2, 4, 8, 11];
-        let array_mask = vec![true, false, true, true, true];
+        assert_eq!(&levels[0], &expected_levels);
 
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask,
-            LevelType::List(true),
-        );
-        // array: [[0, 0], _1_, [2, 2], [3, 3, 3, 3], [4, 4, 4]]
+        // array: [[0, 0], NULL, [2, 2], [3, 3, 3, 3], [4, 4, 4]]
         // all values are defined as we do not have nulls on the root (batch)
         // repetition:
         //   0: 0, 1
-        //   1:
+        //   1: 0
         //   2: 0, 1
         //   3: 0, 1, 1, 1
         //   4: 0, 1, 1
-        let expected_levels = LevelInfo {
-            // The levels are normally 2 because we:
-            // - Calculate the level at the list
-            // - Calculate the level at the list's child
-            // We do not do this in these tests, thus the levels are 1 less.
-            definition: vec![2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2],
-            repetition: Some(vec![0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1]),
-            array_offsets,
-            array_mask: vec![
-                true, true, false, true, true, true, true, true, true, true, 
true, true,
-            ],
-            max_definition: 2,
-            level_type: LevelType::List(true),
-            offset: 0,
-            length: 11, // the child has 11 slots
-        };
-        assert_eq!(&levels.definition, &expected_levels.definition);
-        assert_eq!(&levels.repetition, &expected_levels.repetition);
-        assert_eq!(&levels.array_offsets, &expected_levels.array_offsets);
-        assert_eq!(&levels.max_definition, &expected_levels.max_definition);
-        assert_eq!(&levels.level_type, &expected_levels.level_type);
-        assert_eq!(&levels, &expected_levels);
-    }
+        let leaf_array = Int32Array::from_iter([0, 0, 2, 2, 3, 3, 3, 3, 4, 4, 
4]);
+        let offsets = Buffer::from_iter([0_i32, 2, 2, 4, 8, 11]);
+        let list = ArrayDataBuilder::new(list_type.clone())
+            .len(5)
+            .add_buffer(offsets)
+            .add_child_data(leaf_array.data().clone())
+            .null_bit_buffer(Buffer::from([0b00011101]))
+            .build()
+            .unwrap();
+        let list = make_array(list);
 
-    #[test]
-    fn test_calculate_array_levels_2() {
-        // If some values are null
-        //
-        // This emulates an array in the form: <struct<list<?>>
-        // with values:
-        // - 0: [0, 1], but is null because of the struct
-        // - 1: []
-        // - 2: [2, 3], but is null because of the struct
-        // - 3: [4, 5, 6, 7]
-        // - 4: [8, 9, 10]
-        //
-        // If the first values of a list are null due to a parent, we have to 
still account for them
-        // while indexing, because they would affect the way the child is 
indexed
-        // i.e. in the above example, we have to know that [0, 1] has to be 
skipped
-        let parent_levels = LevelInfo {
-            definition: vec![0, 1, 0, 1, 1],
-            repetition: None,
-            array_offsets: vec![0, 1, 2, 3, 4, 5],
-            array_mask: vec![false, true, false, true, true],
-            max_definition: 1,
-            level_type: LevelType::Struct(true),
-            offset: 0,
-            length: 5,
-        };
-        let array_offsets = vec![0, 2, 2, 4, 8, 11];
-        let array_mask = vec![true, false, true, true, true];
+        let list_field = Field::new("list", list_type.clone(), true);
+        let levels = calculate_array_levels(&list, &list_field).unwrap();
+        assert_eq!(levels.len(), 1);
 
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask,
-            LevelType::List(true),
-        );
-        let expected_levels = LevelInfo {
-            // 0 1 [2] are 0 (not defined at level 1)
-            // [2] is 1, but has 0 slots so is not populated (defined at level 
1 only)
-            // 2 3 [4] are 0
-            // 4 5 6 7 [8] are 1 (defined at level 1 only)
-            // 8 9 10 [11] are 2 (defined at both levels)
-            definition: vec![0, 0, 1, 0, 0, 3, 3, 3, 3, 3, 3, 3],

Review Comment:
   Yup, that's true. We also had a lot more logical-null bugs (esp slicing) at 
the time that I wrote the level logic. So it was harder to avoid the 
undesirable quirks at the time.
   
   If one will have to materialise data, they might as well remove data that's 
been made null due to masking, so the new approach is more desirable.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to