tustvold commented on code in PR #7716:
URL: https://github.com/apache/arrow-rs/pull/7716#discussion_r2157191641
##########
arrow-select/src/dictionary.rs:
##########
@@ -23,10 +29,76 @@ use arrow_array::types::{
LargeUtf8Type, Utf8Type,
};
use arrow_array::{cast::AsArray, downcast_primitive};
-use arrow_array::{Array, ArrayRef, DictionaryArray, GenericByteArray, PrimitiveArray};
-use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer, ToByteSlice};
+use arrow_array::{
+ downcast_dictionary_array, AnyDictionaryArray, Array, ArrayRef, ArrowNativeTypeOp,
+ BooleanArray, DictionaryArray, GenericByteArray, PrimitiveArray,
+};
+use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer, ScalarBuffer, ToByteSlice};
use arrow_schema::{ArrowError, DataType};
+/// Garbage collects a [DictionaryArray] by removing unreferenced values.
+pub fn garbage_collect_dictionary<K: ArrowDictionaryKeyType>(
+ dictionary: &DictionaryArray<K>,
+) -> Result<DictionaryArray<K>, ArrowError> {
+ let keys = dictionary.keys();
+ let values = dictionary.values();
+
+ let mut mask_builder =
+ BooleanBufferBuilder::new_from_buffer(MutableBuffer::new_null(values.len()), values.len());
+
+ for key in keys {
+ if let Some(key) = key {
+ mask_builder.set_bit(key.as_usize(), true);
+ }
+ }
+
+ let mask = mask_builder.finish();
+
+ // If no work to do, return the original dictionary
+ if mask.count_set_bits() == values.len() {
+ return Ok(dictionary.clone());
+ }
+
+ // Remap the keys to new indices based on the set bits in the mask
+ let key_remap: HashMap<usize, usize> = mask
Review Comment:
It would probably be significantly faster to use a `Vec` indexed by the old
key instead of a `HashMap`, and leave the entries for pruned indices set to 0
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org