vertexclique commented on a change in pull request #8402:
URL: https://github.com/apache/arrow/pull/8402#discussion_r502831824



##########
File path: rust/parquet/src/arrow/converter.rs
##########
@@ -253,6 +256,34 @@ impl Converter<Vec<Option<ByteArray>>, LargeBinaryArray> 
for LargeBinaryArrayCon
     }
 }
 
+pub struct DictionaryArrayConverter {}
+
+impl<K: ArrowDictionaryKeyType> Converter<Vec<Option<ByteArray>>, 
DictionaryArray<K>>
+    for DictionaryArrayConverter
+{
+    fn convert(&self, source: Vec<Option<ByteArray>>) -> 
Result<DictionaryArray<K>> {
+        let data_size = source
+            .iter()
+            .map(|x| x.as_ref().map(|b| b.len()).unwrap_or(0))
+            .sum();
+
+        let keys_builder = PrimitiveBuilder::<K>::new(source.len());
+        let values_builder = StringBuilder::with_capacity(source.len(), 
data_size);
+
+        let mut builder = StringDictionaryBuilder::new(keys_builder, 
values_builder);
+        for v in source {
+            match v {
+                Some(array) => {
+                    builder.append(array.as_utf8()?)?;

Review comment:
       ```suggestion
                       let _ = builder.append(array.as_utf8()?)?;
   ```

##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -175,15 +175,61 @@ fn write_leaves(
             }
             Ok(())
         }
+        ArrowDataType::Dictionary(k, v) => {
+            // Materialize the packed dictionary and let the writer repack it
+            let any_array = array.as_any();
+            let (k2, v2) = match &**k {
+                ArrowDataType::Int32 => {
+                    let typed_array = any_array
+                        .downcast_ref::<arrow_array::Int32DictionaryArray>()
+                        .expect("Unable to get dictionary array");
+
+                    (typed_array.keys(), typed_array.values())
+                }
+                o => unimplemented!("Unknown key type {:?}", o),
+            };
+
+            let k3 = k2;
+            let v3 = v2
+                .as_any()
+                .downcast_ref::<arrow_array::StringArray>()
+                .unwrap();
+
+            // TODO: This removes NULL values; what _should_ be done?
+            // FIXME: Don't use `as`
+            let materialized: Vec<_> = k3
+                .flatten()
+                .map(|k| v3.value(k as usize))
+                .map(ByteArray::from)
+                .collect();
+            //

Review comment:
       It seems like it is ignored in the branch this PR opened against:
   
   ```
   /// Get the underlying numeric array slice, skipping any null values.
   /// If there are no null values, it might be quicker to get the slice 
directly instead of
   /// calling this function.
   fn get_numeric_array_slice<T, A>(array: &arrow_array::PrimitiveArray<A>) -> 
Vec<T::T>
   where
       T: DataType,
       A: arrow::datatypes::ArrowNumericType,
       T::T: From<A::Native>,
   {
       let mut values = Vec::with_capacity(array.len() - array.null_count());
       for i in 0..array.len() {
           if array.is_valid(i) {
               values.push(array.value(i).into())
           }
       }
       values
   }
   ```
   
   Thou other implementations are appending null values. We need some changes 
to adapt in Rust impl., which means ByteArray should have `From` implementation 
for `Vec<Option<u8>>` and `Option<&'a str>` then the aforementioned method 
should be adapted to nulls.
   
   Related impl. in Cxx: 
https://github.com/apache/arrow/blob/master/cpp/src/parquet/arrow/path_internal.cc#L51-L73

##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -175,15 +175,61 @@ fn write_leaves(
             }
             Ok(())
         }
+        ArrowDataType::Dictionary(k, v) => {
+            // Materialize the packed dictionary and let the writer repack it
+            let any_array = array.as_any();
+            let (k2, v2) = match &**k {
+                ArrowDataType::Int32 => {

Review comment:
       Can you write a macro for generating other key types implemented by 
`ArrowDictionaryKeyType`:
   https://docs.rs/arrow/1.0.1/arrow/datatypes/trait.ArrowDictionaryKeyType.html

##########
File path: rust/parquet/src/arrow/array_reader.rs
##########
@@ -894,6 +896,60 @@ impl<'a> ArrayReaderBuilder {
                         >::new(
                             page_iterator, column_desc, converter
                         )?))
+                    } else if let Some(ArrowType::Dictionary(index_type, _)) = 
arrow_type

Review comment:
       ```suggestion
                       } else if let Some(ArrowType::Dictionary(key_type, _)) = 
arrow_type
   ```
   The key type is a better name for dictionaries. Since they are not directly 
representing indices in the generic terminology. Even though other 
implementations are using this term for it.

##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -175,15 +175,61 @@ fn write_leaves(
             }
             Ok(())
         }
+        ArrowDataType::Dictionary(k, v) => {
+            // Materialize the packed dictionary and let the writer repack it
+            let any_array = array.as_any();
+            let (k2, v2) = match &**k {
+                ArrowDataType::Int32 => {
+                    let typed_array = any_array
+                        .downcast_ref::<arrow_array::Int32DictionaryArray>()
+                        .expect("Unable to get dictionary array");
+
+                    (typed_array.keys(), typed_array.values())
+                }
+                o => unimplemented!("Unknown key type {:?}", o),
+            };
+
+            let k3 = k2;
+            let v3 = v2
+                .as_any()
+                .downcast_ref::<arrow_array::StringArray>()
+                .unwrap();
+
+            // TODO: This removes NULL values; what _should_ be done?
+            // FIXME: Don't use `as`
+            let materialized: Vec<_> = k3
+                .flatten()
+                .map(|k| v3.value(k as usize))
+                .map(ByteArray::from)
+                .collect();
+            //
+
+            let mut col_writer = get_col_writer(&mut row_group_writer)?;
+            let levels = levels.pop().unwrap();

Review comment:
       ```suggestion
               let levels = levels.pop().expect("Levels exhausted");
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to