davidhewitt commented on code in PR #7716:
URL: https://github.com/apache/arrow-rs/pull/7716#discussion_r2157040092
##########
arrow-select/src/dictionary.rs:
##########
@@ -23,10 +29,76 @@ use arrow_array::types::{
LargeUtf8Type, Utf8Type,
};
use arrow_array::{cast::AsArray, downcast_primitive};
-use arrow_array::{Array, ArrayRef, DictionaryArray, GenericByteArray,
PrimitiveArray};
-use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer, ToByteSlice};
+use arrow_array::{
+ downcast_dictionary_array, AnyDictionaryArray, Array, ArrayRef,
ArrowNativeTypeOp,
+ BooleanArray, DictionaryArray, GenericByteArray, PrimitiveArray,
+};
+use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer,
ScalarBuffer, ToByteSlice};
use arrow_schema::{ArrowError, DataType};
+/// Garbage collects a [DictionaryArray] by removing unreferenced values.
+pub fn garbage_collect_dictionary<K: ArrowDictionaryKeyType>(
+ dictionary: &DictionaryArray<K>,
+) -> Result<DictionaryArray<K>, ArrowError> {
+ let keys = dictionary.keys();
+ let values = dictionary.values();
+
+ let mut mask_builder =
+
BooleanBufferBuilder::new_from_buffer(MutableBuffer::new_null(values.len()),
values.len());
+
+ for key in keys {
+ if let Some(key) = key {
+ mask_builder.set_bit(key.as_usize(), true);
+ }
+ }
+
+ let mask = mask_builder.finish();
+
+ // If no work to do, return the original dictionary
+ if mask.count_set_bits() == values.len() {
+ return Ok(dictionary.clone());
+ }
+
+ // Remap the keys to new indices based on the set bits in the mask
+ let key_remap: HashMap<usize, usize> = mask
+ .set_indices()
+ .enumerate()
+ .map(|(new, old)| (old, new))
+ .collect();
+
+ let new_keys = keys
+ .iter()
+ .map(|key| {
+ key.map_or(<K::Native as ArrowNativeTypeOp>::ZERO, |k| {
+ K::Native::from_usize(
+ key_remap
+ .get(&k.as_usize())
+ .copied()
+ .expect("key should be in remap"),
+ )
+ .expect("key remap should always be in range of K")
+ })
+ })
+ .collect::<ScalarBuffer<K::Native>>();
+
+ let new_keys = PrimitiveArray::new(new_keys, keys.nulls().cloned());
+
+ // Create a new values array with the masked values
+ let values = filter(dictionary.values(), &BooleanArray::new(mask, None))?;
Review Comment:
This use of `filter` is how I avoid casting to a concrete value type.
There is also an `Interner` which is used in other kernels for dictionary
deduplication, but:
- it doesn't support all data types, and
- this implementation is more consistent with `GenericByteViewArray::gc`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]