alamb commented on a change in pull request #8346:
URL: https://github.com/apache/arrow/pull/8346#discussion_r501187806



##########
File path: rust/arrow/src/compute/kernels/cast.rs
##########
@@ -740,10 +769,203 @@ where
         .collect()
 }
 
+/// Attempts to cast an `ArrayDictionary` with index type K into
+/// `to_type` for supported types.
+///
+/// K is the key type
+fn dictionary_cast<K: ArrowDictionaryKeyType>(
+    array: &ArrayRef,
+    to_type: &DataType,
+) -> Result<ArrayRef> {
+    use DataType::*;
+
+    match to_type {
+        Dictionary(to_index_type, to_value_type) => {
+            let dict_array = array
+                .as_any()
+                .downcast_ref::<DictionaryArray<K>>()
+                .ok_or_else(|| {
+                    ArrowError::ComputeError(
+                        "Internal Error: Cannot cast dictionary to 
DictionaryArray of expected type".to_string(),
+                    )
+                })?;
+
+            let keys_array: ArrayRef = Arc::new(dict_array.keys_array());
+            let values_array: ArrayRef = dict_array.values();
+            let cast_keys = cast(&keys_array, to_index_type)?;
+            let cast_values = cast(&values_array, to_value_type)?;
+
+            // Failure to cast keys (because they don't fit in the
+            // target type) results in NULL values;
+            if cast_keys.null_count() > keys_array.null_count() {
+                return Err(ArrowError::ComputeError(format!(
+                    "Could not convert {} dictionary indexes from {:?} to 
{:?}",
+                    cast_keys.null_count() - keys_array.null_count(),
+                    keys_array.data_type(),
+                    to_index_type
+                )));
+            }
+
+            // keys are data, child_data is values (dictionary)
+            let data = Arc::new(ArrayData::new(
+                to_type.clone(),
+                cast_keys.len(),
+                Some(cast_keys.null_count()),
+                cast_keys
+                    .data()
+                    .null_bitmap()
+                    .clone()
+                    .map(|bitmap| bitmap.bits),
+                cast_keys.data().offset(),
+                cast_keys.data().buffers().to_vec(),
+                vec![cast_values.data()],
+            ));
+
+            // create the appropriate array type
+            let new_array: ArrayRef = match **to_index_type {
+                Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
+                Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
+                Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
+                Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
+                UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
+                UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
+                UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
+                UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
+                _ => {
+                    return Err(ArrowError::ComputeError(format!(
+                        "Unsupported type {:?} for dictionary index",
+                        to_index_type
+                    )))
+                }
+            };
+
+            Ok(new_array)
+        }
+        _ => unpack_dictionary::<K>(array, to_type),
+    }
+}
+
+// Unpack a dictionary where the keys are of type <K> into a flattened array 
of type to_type
+fn unpack_dictionary<K>(array: &ArrayRef, to_type: &DataType) -> 
Result<ArrayRef>

Review comment:
       This code uses the `take` kernel, as suggested by @nevi-me. That means 
both less code and support for unpacking a larger range of data types (any type 
that the dictionary can be unpacked to). 👍 👍 

##########
File path: rust/arrow/src/compute/kernels/cast.rs
##########
@@ -740,10 +769,203 @@ where
         .collect()
 }
 
+/// Attempts to cast an `ArrayDictionary` with index type K into
+/// `to_type` for supported types.
+///
+/// K is the key type
+fn dictionary_cast<K: ArrowDictionaryKeyType>(
+    array: &ArrayRef,
+    to_type: &DataType,
+) -> Result<ArrayRef> {
+    use DataType::*;
+
+    match to_type {
+        Dictionary(to_index_type, to_value_type) => {
+            let dict_array = array
+                .as_any()
+                .downcast_ref::<DictionaryArray<K>>()
+                .ok_or_else(|| {
+                    ArrowError::ComputeError(
+                        "Internal Error: Cannot cast dictionary to 
DictionaryArray of expected type".to_string(),
+                    )
+                })?;
+
+            let keys_array: ArrayRef = Arc::new(dict_array.keys_array());
+            let values_array: ArrayRef = dict_array.values();
+            let cast_keys = cast(&keys_array, to_index_type)?;
+            let cast_values = cast(&values_array, to_value_type)?;
+
+            // Failure to cast keys (because they don't fit in the
+            // target type) results in NULL values;
+            if cast_keys.null_count() > keys_array.null_count() {
+                return Err(ArrowError::ComputeError(format!(
+                    "Could not convert {} dictionary indexes from {:?} to 
{:?}",
+                    cast_keys.null_count() - keys_array.null_count(),
+                    keys_array.data_type(),
+                    to_index_type
+                )));
+            }
+
+            // keys are data, child_data is values (dictionary)
+            let data = Arc::new(ArrayData::new(
+                to_type.clone(),
+                cast_keys.len(),
+                Some(cast_keys.null_count()),
+                cast_keys
+                    .data()
+                    .null_bitmap()
+                    .clone()
+                    .map(|bitmap| bitmap.bits),
+                cast_keys.data().offset(),
+                cast_keys.data().buffers().to_vec(),
+                vec![cast_values.data()],
+            ));
+
+            // create the appropriate array type
+            let new_array: ArrayRef = match **to_index_type {
+                Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
+                Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
+                Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
+                Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
+                UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
+                UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
+                UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
+                UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
+                _ => {
+                    return Err(ArrowError::ComputeError(format!(
+                        "Unsupported type {:?} for dictionary index",
+                        to_index_type
+                    )))
+                }
+            };
+
+            Ok(new_array)
+        }
+        _ => unpack_dictionary::<K>(array, to_type),
+    }
+}
+
+// Unpack a dictionary where the keys are of type <K> into a flattened array 
of type to_type
+fn unpack_dictionary<K>(array: &ArrayRef, to_type: &DataType) -> 
Result<ArrayRef>
+where
+    K: ArrowDictionaryKeyType,
+{
+    let dict_array = array
+        .as_any()
+        .downcast_ref::<DictionaryArray<K>>()
+        .ok_or_else(|| {
+            ArrowError::ComputeError(
+                "Internal Error: Cannot cast dictionary to DictionaryArray of 
expected type".to_string(),
+            )
+        })?;
+
+    // attempt to cast the dict values to the target type
+    // use the take kernel to expand out the dictionary
+    let cast_dict_values = cast(&dict_array.values(), to_type)?;
+
+    // Note take requires first casting the indicies to u32
+    let keys_array: ArrayRef = Arc::new(dict_array.keys_array());
+    let indicies = cast(&keys_array, &DataType::UInt32)?;
+    let u32_indicies =
+        indicies
+            .as_any()
+            .downcast_ref::<UInt32Array>()
+            .ok_or_else(|| {
+                ArrowError::ComputeError(
+                    "Internal Error: Cannot cast dict indicies to 
UInt32".to_string(),
+                )
+            })?;
+
+    take(&cast_dict_values, u32_indicies, None)
+}
+
+/// Attempts to encode an array into an `ArrayDictionary` with index
+/// type K and value (dictionary) type value_type
+///
+/// K is the key type
+fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
+    array: &ArrayRef,
+    dict_value_type: &DataType,
+) -> Result<ArrayRef> {
+    use DataType::*;
+
+    match *dict_value_type {
+        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, 
dict_value_type),
+        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, 
dict_value_type),
+        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, 
dict_value_type),
+        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, 
dict_value_type),
+        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, 
dict_value_type),
+        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, 
dict_value_type),
+        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, 
dict_value_type),
+        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, 
dict_value_type),
+        Utf8 => pack_string_to_dictionary::<K>(array),
+        _ => Err(ArrowError::ComputeError(format!(
+            "Internal Error: Unsupported output type for dictionary packing: 
{:?}",
+            dict_value_type
+        ))),
+    }
+}
+
+// Packs the data from the primitive array of type <V> to a
+// DictionaryArray with keys of type K and values of value_type V
+fn pack_numeric_to_dictionary<K, V>(
+    array: &ArrayRef,
+    dict_value_type: &DataType,
+) -> Result<ArrayRef>
+where
+    K: ArrowDictionaryKeyType,
+    V: ArrowNumericType,
+{
+    // attempt to cast the source array values to the target value type (the 
dictionary values type)
+    let cast_values = cast(array, &dict_value_type)?;
+    let values = cast_values
+        .as_any()
+        .downcast_ref::<PrimitiveArray<V>>()
+        .unwrap();
+
+    let keys_builder = PrimitiveBuilder::<K>::new(values.len());
+    let values_builder = PrimitiveBuilder::<V>::new(values.len());
+    let mut b = PrimitiveDictionaryBuilder::new(keys_builder, values_builder);
+
+    // copy each element one at a time
+    for i in 0..values.len() {
+        if values.is_null(i) {
+            b.append_null()?;
+        } else {
+            b.append(values.value(i))?;
+        }
+    }
+    Ok(Arc::new(b.finish()))
+}
+
+// Packs the data as a StringDictionaryArray, if possible, with the
+// key types of K
+fn pack_string_to_dictionary<K>(array: &ArrayRef) -> Result<ArrayRef>

Review comment:
       Maybe there is a fancier way to implement packing for arrays other than 
`PrimitiveType`, but I didn't see anything obvious (maybe a `macro!`?).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to