nevi-me commented on a change in pull request #8792:
URL: https://github.com/apache/arrow/pull/8792#discussion_r532034509



##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -176,87 +194,17 @@ fn write_leaves(
             }
             Ok(())
         }
-        ArrowDataType::Dictionary(key_type, value_type) => {
-            use arrow_array::{PrimitiveArray, StringArray};
-            use ArrowDataType::*;
-            use ColumnWriter::*;
+        ArrowDataType::Dictionary(_, value_type) => {
+            // cast dictionary to a primitive
+            let array = arrow::compute::cast(array, value_type)?;

Review comment:
       @alamb @carols10cents I removed a lot of the dictionary code, because casting the dictionary to a primitive and then writing that primitive is a simpler approach.
   
   I initially thought the removed code was meant to perform better than the cast, but I noticed that right before we write the dictionary, we manually cast it anyway by iterating over the key-value pairs to create an array. That convinced me that we could avoid all of that by casting from the outset.
   
   What are your thoughts? If you have any benchmarks on IOx, it'd be great if you could check whether this causes any regression for you. If it does, there's likely a bug in the materialization that we need to look at again.
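
   Roughly, the new branch boils down to the sketch below. This is my own minimal, standalone example (a string dictionary rather than the UInt8/UInt32 case in the removed code), assuming only `arrow::compute::cast` and the `FromIterator` impl on `DictionaryArray`; it is not the PR code itself.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, DictionaryArray, StringArray};
use arrow::compute::cast;
use arrow::datatypes::{DataType, Int8Type};
use arrow::error::Result;

fn main() -> Result<()> {
    // Dictionary-encoded strings: Int8 keys into a Utf8 values array.
    let dict: DictionaryArray<Int8Type> = vec!["a", "b", "a"].into_iter().collect();
    let array: ArrayRef = Arc::new(dict);

    // Cast the whole dictionary to its value type up front, then hand the
    // materialized array to the existing primitive column writer path.
    let materialized = cast(&array, &DataType::Utf8)?;
    let strings = materialized
        .as_any()
        .downcast_ref::<StringArray>()
        .expect("casting to Utf8 should yield a StringArray");
    assert_eq!(strings.value(2), "a");
    Ok(())
}
```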

##########
File path: rust/arrow/src/array/equal/mod.rs
##########
@@ -841,6 +842,53 @@ mod tests {
         test_equal(a.as_ref(), b.as_ref(), true);
     }
 
+    #[test]
+    fn test_struct_equal_null() {

Review comment:
       Oops, this is what I used to test the null struct inheritance, so I'll remove it.

##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -176,87 +194,17 @@ fn write_leaves(
             }
             Ok(())
         }
-        ArrowDataType::Dictionary(key_type, value_type) => {
-            use arrow_array::{PrimitiveArray, StringArray};
-            use ArrowDataType::*;
-            use ColumnWriter::*;
+        ArrowDataType::Dictionary(_, value_type) => {
+            // cast dictionary to a primitive
+            let array = arrow::compute::cast(array, value_type)?;
 
-            let array = &**array;
             let mut col_writer = get_col_writer(&mut row_group_writer)?;
-            let levels = levels.pop().expect("Levels exhausted");
-
-            macro_rules! dispatch_dictionary {
-                ($($kt: pat, $vt: pat, $w: ident => $kat: ty, $vat: ty,)*) => (
-                    match (&**key_type, &**value_type, &mut col_writer) {
-                        $(($kt, $vt, $w(writer)) => write_dict::<$kat, $vat, _>(array, writer, levels),)*
-                        (kt, vt, _) => unreachable!("Shouldn't be attempting to write dictionary of <{:?}, {:?}>", kt, vt),
-                    }
-                );
-            }
-
-            if let (UInt8, UInt32, Int32ColumnWriter(writer)) =
-                (&**key_type, &**value_type, &mut col_writer)
-            {
-                let typed_array = array
-                    .as_any()
-                    .downcast_ref::<arrow_array::UInt8DictionaryArray>()
-                    .expect("Unable to get dictionary array");
-
-                let keys = typed_array.keys();
-
-                let value_buffer = typed_array.values();
-                let value_array =
-                    arrow::compute::cast(&value_buffer, &ArrowDataType::Int32)?;
-
-                let values = value_array
-                    .as_any()
-                    .downcast_ref::<arrow_array::Int32Array>()
-                    .unwrap();
-
-                use std::convert::TryFrom;
-                // This removes NULL values from the keys, but
-                // they're encoded by the levels, so that's fine.
-                let materialized_values: Vec<_> = keys

Review comment:
       then here we iterate through the keys to create the underlying primitives ...
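
   As a plain illustration of that step (a hypothetical helper on plain slices, not the arrow API), the materialization amounts to indexing the values with the keys while carrying nulls through:

```rust
/// Hypothetical helper, not part of arrow: materialize dictionary-encoded
/// data by looking each key up in the values array, keeping null keys as None.
fn materialize(keys: &[Option<u8>], values: &[u32]) -> Vec<Option<u32>> {
    keys.iter()
        .map(|key| key.map(|k| values[k as usize]))
        .collect()
}

fn main() {
    // Keys reference positions in `values`; the None key is a null slot.
    let keys = [Some(0), None, Some(1), Some(0)];
    let values = [10u32, 20];
    assert_eq!(
        materialize(&keys, &values),
        vec![Some(10), None, Some(20), Some(10)]
    );
}
```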

##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -423,25 +313,64 @@ fn write_leaf(
     Ok(written as i64)
 }
 
-/// A struct that represents definition and repetition levels.
-/// Repetition levels are only populated if the parent or current leaf is repeated
-#[derive(Debug)]
-struct Levels {
-    definition: Vec<i16>,
-    repetition: Option<Vec<i16>>,
-}
-
 /// Compute nested levels of the Arrow array, recursing into lists and structs
-fn get_levels(
+/// Returns a list of `LevelInfo`, where each level is for nested primitive arrays.
+///
+/// The algorithm works by eagerly incrementing non-null values, and decrementing
+/// when a value is null.
+///
+/// *Examples:*
+///
+/// A record batch always starts at a populated definition = level 1.
+/// When a batch only has a primitive, i.e. `<batch<primitive[a]>>`, column `a`
+/// can only have a maximum level of 1 if it is not null.
+/// If it is null, we decrement by 1, such that the null slots will = level 0.
+///
+/// If a batch has nested arrays (list, struct, union, etc.), then the incrementing
+/// takes place.
+/// A `<batch<struct[a]<primitive[b]>>>` will have up to 2 levels (if nullable).
+/// When calculating levels for `a`, if the struct slot is not empty, we
+/// increment by 1, such that we'd have `[2, 2, 2]` if all 3 slots are not null.
+/// If there is an empty slot, we decrement, leaving us with `[2, 0, 2]` as the
+/// null slot effectively means that no record is populated for the row altogether.
+///
+/// *Lists*
+///
+/// TODO

Review comment:
       I'll fill this part in once I find a strategy for dealing with list arrays.
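
   For anyone skimming, here is a standalone sketch of the levels described in the doc comment above, for `<batch<struct[a]<primitive[b]>>>` with both levels nullable. It is my own illustration (the `def_levels` name is made up), not the PR's `LevelInfo` code:

```rust
/// Illustration only: definition levels for a nullable struct that contains a
/// nullable primitive. Max level is 2: a null struct slot drops to 0, a
/// present struct with a null primitive sits at 1, a populated value is 2.
fn def_levels(struct_valid: &[bool], child_valid: &[bool]) -> Vec<i16> {
    struct_valid
        .iter()
        .zip(child_valid)
        .map(|(s, c)| match (*s, *c) {
            (false, _) => 0,    // struct slot is null, nothing below it is defined
            (true, false) => 1, // struct is present, primitive is null
            (true, true) => 2,  // fully populated value
        })
        .collect()
}

fn main() {
    // Matches the doc comment: all-valid slots give [2, 2, 2] ...
    assert_eq!(def_levels(&[true; 3], &[true; 3]), vec![2, 2, 2]);
    // ... and a null struct slot in the middle gives [2, 0, 2].
    assert_eq!(
        def_levels(&[true, false, true], &[true, true, true]),
        vec![2, 0, 2]
    );
}
```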

##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -176,87 +194,17 @@ fn write_leaves(
             }
             Ok(())
         }
-        ArrowDataType::Dictionary(key_type, value_type) => {
-            use arrow_array::{PrimitiveArray, StringArray};
-            use ArrowDataType::*;
-            use ColumnWriter::*;
+        ArrowDataType::Dictionary(_, value_type) => {
+            // cast dictionary to a primitive
+            let array = arrow::compute::cast(array, value_type)?;
 
-            let array = &**array;
             let mut col_writer = get_col_writer(&mut row_group_writer)?;
-            let levels = levels.pop().expect("Levels exhausted");
-
-            macro_rules! dispatch_dictionary {
-                ($($kt: pat, $vt: pat, $w: ident => $kat: ty, $vat: ty,)*) => (
-                    match (&**key_type, &**value_type, &mut col_writer) {
-                        $(($kt, $vt, $w(writer)) => write_dict::<$kat, $vat, _>(array, writer, levels),)*
-                        (kt, vt, _) => unreachable!("Shouldn't be attempting to write dictionary of <{:?}, {:?}>", kt, vt),
-                    }
-                );
-            }
-
-            if let (UInt8, UInt32, Int32ColumnWriter(writer)) =
-                (&**key_type, &**value_type, &mut col_writer)
-            {
-                let typed_array = array
-                    .as_any()
-                    .downcast_ref::<arrow_array::UInt8DictionaryArray>()
-                    .expect("Unable to get dictionary array");
-
-                let keys = typed_array.keys();
-
-                let value_buffer = typed_array.values();
-                let value_array =

Review comment:
       Here we perform a cast of the values ...




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

