nevi-me commented on a change in pull request #8561:
URL: https://github.com/apache/arrow/pull/8561#discussion_r516923335



##########
File path: rust/arrow/src/array/array.rs
##########
@@ -2145,112 +2149,10 @@ pub struct DictionaryArray<K: ArrowPrimitiveType> {
     is_ordered: bool,
 }
 
-#[derive(Debug)]
-enum Draining {
-    Ready,
-    Iterating,
-    Finished,
-}
-
-#[derive(Debug)]
-pub struct NullableIter<'a, T> {
-    data: &'a ArrayDataRef, // TODO: Use a pointer to the null bitmap.
-    ptr: *const T,
-    i: usize,
-    len: usize,
-    draining: Draining,
-}
-
-impl<'a, T> std::iter::Iterator for NullableIter<'a, T>
-where
-    T: Clone,
-{
-    type Item = Option<T>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let i = self.i;
-        if i >= self.len {
-            None
-        } else if self.data.is_null(i) {
-            self.i += 1;
-            Some(None)
-        } else {
-            self.i += 1;
-            unsafe { Some(Some((&*self.ptr.add(i)).clone())) }
-        }
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (self.len, Some(self.len))
-    }
-
-    fn nth(&mut self, n: usize) -> Option<Self::Item> {
-        let i = self.i;
-        if i + n >= self.len {
-            self.i = self.len;
-            None
-        } else if self.data.is_null(i + n) {
-            self.i += n + 1;
-            Some(None)
-        } else {
-            self.i += n + 1;
-            unsafe { Some(Some((&*self.ptr.add(i + n)).clone())) }
-        }
-    }
-}
-
-impl<'a, T> std::iter::DoubleEndedIterator for NullableIter<'a, T>
-where
-    T: Clone,
-{
-    fn next_back(&mut self) -> Option<Self::Item> {
-        match self.draining {
-            Draining::Ready => {
-                self.draining = Draining::Iterating;
-                self.i = self.len - 1;
-                self.next_back()
-            }
-            Draining::Iterating => {
-                let i = self.i;
-                if i >= self.len {
-                    None
-                } else if self.data.is_null(i) {
-                    self.i = self.i.checked_sub(1).unwrap_or_else(|| {
-                        self.draining = Draining::Finished;
-                        0_usize
-                    });
-                    Some(None)
-                } else {
-                    match i.checked_sub(1) {
-                        Some(idx) => {
-                            self.i = idx;
-                            unsafe { Some(Some((&*self.ptr.add(i)).clone())) }
-                        }
-                        _ => {
-                            self.draining = Draining::Finished;
-                            unsafe { Some(Some((&*self.ptr).clone())) }
-                        }
-                    }
-                }
-            }
-            Draining::Finished => {
-                self.draining = Draining::Ready;
-                None
-            }
-        }
-    }
-}
-
 impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
     /// Return an iterator to the keys of this dictionary.
-    pub fn keys(&self) -> NullableIter<'_, K::Native> {
-        NullableIter::<'_, K::Native> {
-            data: &self.data,
-            ptr: unsafe { self.raw_values.get().add(self.data.offset()) },
-            i: 0,
-            len: self.data.len(),
-            draining: Draining::Ready,
-        }
+    pub fn keys(&self) -> &PrimitiveArray<K> {

Review comment:
       Thanks Jorge, I like this option more

##########
File path: rust/arrow/src/array/array.rs
##########
@@ -2398,30 +2311,23 @@ impl<T: ArrowPrimitiveType> Array for 
DictionaryArray<T> {
 
     /// Returns the total number of bytes of memory occupied by the buffers 
owned by this [DictionaryArray].
     fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size() + 
self.values().get_buffer_memory_size()
+        // Since both `keys` and `values` derive (are references from) `data`, 
we only account for `data`.
+        self.data.get_array_memory_size()
     }
 
     /// Returns the total number of bytes of memory occupied physically by 
this [DictionaryArray].
     fn get_array_memory_size(&self) -> usize {
-        self.data.get_array_memory_size()
-            + self.values().get_array_memory_size()
-            + mem::size_of_val(self)
+        // Since both `keys` and `values` derive (are references from) `data`, 
we only account for `data`.
+        self.data.get_array_memory_size() + mem::size_of_val(self)
     }
 }
 
 impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        const MAX_LEN: usize = 10;
-        let keys: Vec<_> = self.keys().take(MAX_LEN).collect();
-        let elipsis = if self.keys().count() > MAX_LEN {
-            "..."
-        } else {
-            ""
-        };
         writeln!(
             f,
-            "DictionaryArray {{keys: {:?}{} values: {:?}}}",
-            keys, elipsis, self.values
+            "DictionaryArray {{keys: {:?} values: {:?}}}",

Review comment:
       Nice, I'm assuming that we're relying on the `keys` formatting now that 
it's a `PrimitiveArray`

##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -283,48 +280,40 @@ trait Materialize<K, V> {
     fn materialize(&self) -> Vec<Self::Output>;
 }
 
-macro_rules! materialize_string {
-    ($($k:ty,)*) => {
-        $(impl Materialize<$k, arrow_array::StringArray> for dyn Array {
-            type Output = ByteArray;
-
-            fn materialize(&self) -> Vec<Self::Output> {
-                use std::convert::TryFrom;
-
-                let typed_array = self.as_any()
-                    .downcast_ref::<$k>()
-                    .expect("Unable to get dictionary array");
-
-                let keys = typed_array.keys();
-
-                let value_buffer = typed_array.values();
-                let values = value_buffer
-                    .as_any()
-                    .downcast_ref::<arrow_array::StringArray>()
-                    .unwrap();
-
-                // This removes NULL values from the NullableIter, but
-                // they're encoded by the levels, so that's fine.
-                keys
-                    .flatten()
-                    .map(|key| usize::try_from(key).unwrap_or_else(|k| 
panic!("key {} does not fit in usize", k)))
-                    .map(|key| values.value(key))
-                    .map(ByteArray::from)
-                    .collect()
-            }
-        })*
-    };
-}
+impl<K> Materialize<K, arrow_array::StringArray> for dyn Array
+where
+    K: arrow::datatypes::ArrowDictionaryKeyType,
+{
+    type Output = ByteArray;
+
+    fn materialize(&self) -> Vec<Self::Output> {
+        use arrow::datatypes::ArrowNativeType;
 
-materialize_string! {
-    arrow_array::Int8DictionaryArray,
-    arrow_array::Int16DictionaryArray,
-    arrow_array::Int32DictionaryArray,
-    arrow_array::Int64DictionaryArray,
-    arrow_array::UInt8DictionaryArray,
-    arrow_array::UInt16DictionaryArray,
-    arrow_array::UInt32DictionaryArray,
-    arrow_array::UInt64DictionaryArray,
+        let typed_array = self
+            .as_any()
+            .downcast_ref::<arrow_array::DictionaryArray<K>>()
+            .expect("Unable to get dictionary array");
+
+        let keys = typed_array.keys();
+
+        let value_buffer = typed_array.values();
+        let values = value_buffer
+            .as_any()
+            .downcast_ref::<arrow_array::StringArray>()
+            .unwrap();
+
+        // This removes NULL values from the keys, but
+        // they're encoded by the levels, so that's fine.
+        keys.into_iter()
+            .flatten()
+            .map(|key| {
+                key.to_usize()
+                    .unwrap_or_else(|| panic!("key {:?} does not fit in 
usize", key))
+            })
+            .map(|key| values.value(key))
+            .map(ByteArray::from)
+            .collect()

Review comment:
       I know Q's not for me, but I don't understand. Is the change not only 
removing the macro in favour of a generic impl?

##########
File path: rust/arrow/src/array/builder.rs
##########
@@ -3409,8 +3409,15 @@ mod tests {
         builder.append(22345678).unwrap();
         let array = builder.finish();
 
-        // Keys are strongly typed.
-        let aks: Vec<_> = array.keys().collect();
+        println!("{:?}", array.keys().data());

Review comment:
       nit: remove `println` (assuming you were using it for debug purposes)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to