Re: [PR] Optimize Dictionary groupings [datafusion]

via GitHub Tue, 23 Jun 2026 08:58:58 -0700


Rich-T-kid commented on code in PR #21765:
URL: https://github.com/apache/datafusion/pull/21765#discussion_r3454836051



##########
datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs:
##########
@@ -0,0 +1,1003 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::aggregates::group_values::GroupValues;
+use crate::hash_utils::RandomState;
+use arrow::array::{
+    Array, ArrayRef, AsArray, DictionaryArray, LargeStringArray, 
LargeStringBuilder,
+    ListArray, ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, 
StringBuilder,
+    StringViewArray, StringViewBuilder,
+};
+use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType};
+use datafusion_common::DataFusionError::{Internal, NotImplemented};
+use datafusion_common::Result;
+use datafusion_common::hash_utils::create_hashes;
+use datafusion_expr::EmitTo;
+use hashbrown::HashTable;
+use std::borrow::Cow;
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// Heuristic for sizing the values buffer of string builders during emit:
+/// dictionary-encoded values are short by design (categorical strings, short
+/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common
+/// case while keeping over-allocation cheap when values are smaller.
+const AVG_BYTES_PER_DICT_VALUE: usize = 16;
+
+macro_rules! decode_list {
+    ($raw:expr, $builder:expr) => {{
+        let mut builder = $builder;
+        for raw_bytes in $raw {
+            match raw_bytes {
+                None => builder.append_null(),
+                Some(raw_vector) => {
+                    let mut offset = 0;
+                    while offset < raw_vector.len() {
+                        let len = i64::from_ne_bytes(
+                            raw_vector[offset..offset + 8]
+                                .try_into()
+                                .expect("slice of length 8"),
+                        );
+                        offset += 8;
+                        if len == -1 {
+                            builder.values().append_null();
+                        } else {
+                            let s = unsafe {
+                                std::str::from_utf8_unchecked(
+                                    &raw_vector[offset..offset + len as usize],
+                                )
+                            };
+                            builder.values().append_value(s);
+                            offset += len as usize;
+                        }
+                    }
+                    builder.append(true);
+                }
+            }
+        }
+        Ok(Arc::new(builder.finish()) as ArrayRef)
+    }};
+}
+macro_rules! decode_scalar_string {
+    ($raw:expr, $builder:expr) => {{
+        let mut builder = $builder;
+        for raw_bytes in $raw {
+            match raw_bytes {
+                Some(raw_vector) => {
+                    let s = unsafe { std::str::from_utf8_unchecked(raw_vector) 
};
+                    builder.append_value(s);
+                }
+                None => builder.append_null(),
+            }
+        }
+        Ok(Arc::new(builder.finish()) as ArrayRef)
+    }};
+}
+struct DictEntry {
+    hash: u64,
+    group_id: usize,
+    offset: usize,
+    len: usize,
+}
+
+pub struct GroupValuesDictionary<K: ArrowDictionaryKeyType + Send> {
+    /// Packed byte storage for all group values.
+    row_buffer: Vec<u8>,
+    /// `row_offsets[g]` = start of group `g` in `row_buffer`;
+    row_offsets: Vec<usize>,
+    value_dt: DataType,
+    _phantom: PhantomData<K>,
+    // keeps track of which values weve already seen, keyed by raw value hash.
+    unique_dict_value_mapping: HashTable<DictEntry>,
+
+    random_state: RandomState,
+
+    // cache the group id for nulls since they all map to the same group
+    null_group_id: Option<usize>,
+    // key to group vector scratch space, used to avoid re-allocating a new 
vector on each call to intern
+    key_to_group: Vec<Option<usize>>,
+    // 0. cache pointer of arrays, this avoids having to re-compute hashing 
for arrays weve already seen on past iterations
+    // 1. avoid re-allocating buffer inbetween calls, instead of allocating a 
new vector each time re-use inbetween calls
+    values_cache: (Option<ArrayRef>, Vec<u64>),
+}
+
+impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> {
+    pub fn new(data_type: &DataType) -> Self {
+        Self {
+            row_buffer: Vec::new(),
+            row_offsets: Vec::new(),
+            unique_dict_value_mapping: HashTable::new(),
+            value_dt: data_type.clone(),
+            _phantom: PhantomData,
+            random_state: RandomState::with_seed(0),
+            null_group_id: None,
+            key_to_group: Vec::new(),
+            values_cache: (None, Vec::new()),
+        }
+    }
+
+    fn lookup_or_insert_in_table(&mut self, hash: u64, raw: &[u8]) -> usize {
+        let row_buffer = &self.row_buffer;
+        if let Some(e) = self.unique_dict_value_mapping.find(hash, |e| {
+            e.hash == hash && &row_buffer[e.offset..e.offset + e.len] == raw
+        }) {
+            return e.group_id;
+        }
+        let new_group_id = self.row_offsets.len();
+        let offset = self.row_buffer.len();
+        self.row_offsets.push(offset);
+        self.row_buffer.extend_from_slice(raw);
+        self.unique_dict_value_mapping.insert_unique(
+            hash,
+            DictEntry {
+                hash,
+                group_id: new_group_id,
+                offset,
+                len: raw.len(),
+            },
+            |e| e.hash,
+        );
+        new_group_id
+    }
+    fn compute_value_hashes(&mut self, values: &ArrayRef) -> Result<()> {
+        self.values_cache.1.clear();
+        self.values_cache.1.resize(values.len(), 0);
+        create_hashes(
+            [Arc::clone(values)],
+            &self.random_state,
+            &mut self.values_cache.1,
+        )?;
+        Ok(())
+        //Ok(hashes)
+    }
+
+    fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> {
+        match values.data_type() {
+            DataType::Utf8 => Cow::Borrowed(
+                values
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .expect("Expected StringArray")
+                    .value(index)
+                    .as_bytes(),
+            ),
+            DataType::LargeUtf8 => Cow::Borrowed(
+                values
+                    .as_any()
+                    .downcast_ref::<LargeStringArray>()
+                    .expect("Expected LargeStringArray")
+                    .value(index)
+                    .as_bytes(),
+            ),
+            DataType::Utf8View => Cow::Borrowed(
+                values
+                    .as_any()
+                    .downcast_ref::<StringViewArray>()
+                    .expect("Expected StringViewArray")
+                    .value(index)
+                    .as_bytes(),
+            ),
+            DataType::List(_) => {
+                let list_array = values
+                    .as_any()
+                    .downcast_ref::<ListArray>()
+                    .expect("Expected ListArray");
+
+                debug_assert!(!list_array.is_null(index));
+
+                let start = list_array.value_offsets()[index] as usize;
+                let end = list_array.value_offsets()[index + 1] as usize;
+                let child = list_array.values();
+
+                let mut bytes = Vec::new();
+                for i in start..end {
+                    if child.is_null(i) {
+                        // acts as a marker for transform_into_array to write 
a null
+                        bytes.extend_from_slice(&(-1i64).to_ne_bytes());
+                    } else {
+                        let raw = Self::get_raw_bytes(child, i);
+                        bytes.extend_from_slice(&(raw.len() as 
i64).to_ne_bytes());
+                        bytes.extend_from_slice(&raw);
+                    }
+                }
+                Cow::Owned(bytes)
+            }
+            other => unimplemented!("get_raw_bytes not implemented for 
{other:?}"),
+        }
+    }
+
+    #[inline]
+    fn get_null_group_id(&mut self) -> usize {
+        if let Some(group_id) = self.null_group_id {
+            group_id
+        } else {
+            let new_group_id = self.row_offsets.len();
+            self.row_offsets.push(self.row_buffer.len()); // null: empty span
+            self.null_group_id = Some(new_group_id);
+            new_group_id
+        }
+    }
+
+    fn transform_into_array(
+        &self,
+        n: usize,
+        null_group_id: Option<usize>,
+    ) -> Result<ArrayRef> {
+        let data_capacity = n * AVG_BYTES_PER_DICT_VALUE;
+        let raw = (0..n).map(|i| {
+            if Some(i) == null_group_id {
+                return None;
+            }
+            let start = self.row_offsets[i];
+            // last group has no i+1 entry; its end is the buffer tail.
+            let end = self
+                .row_offsets
+                .get(i + 1)
+                .copied()
+                .unwrap_or(self.row_buffer.len());
+            Some(&self.row_buffer[start..end])
+        });
+        match &self.value_dt {

Review Comment:
   Thank you for the review, @alamb.
   
   I agree on avoiding ballooning LOC: adding support for every data type 
becomes more trouble than the compute savings justify. That said, I think 
restricting support to `Dictionary<_, Utf8/Utf8View>` is the right call. It 
significantly simplifies the code paths while still delivering a large 
performance boost for the most common production data shapes (low-cardinality 
group-bys), and, as with other single-column group-by specializations, it 
leaves a viable fallback that loses nothing when the fast path isn't hit.
   
   With that in mind, @zhuqi-lucas's work looks very interesting -- I had a 
similar idea with #22891 and #21878. The issue was that multi-column group-bys 
on dictionaries fell through to `GroupValuesRows`, which was slower than we'd 
like. I'm actually working on a multi-column dictionary group by in #22983, but 
I'm running into similar issues around handling multiple distinct types. For 
the multi-column case it makes more sense to build on @zhuqi-lucas's work, 
since the number of types that need to be supported explodes and there are 
diminishing returns to exploiting dictionary properties across multiple columns 
and intern calls ([key space 
cache](https://github.com/apache/datafusion/pull/22983/changes#diff-84ceab5d992f30b340d52094b28685b5ecfd6f68f27867641ca1a9453ad279baR209),
 [arc ptr 
cache](https://github.com/apache/datafusion/pull/22983/changes#diff-84ceab5d992f30b340d52094b28685b5ecfd6f68f27867641ca1a9453ad279baR193),
 [compute keys once and reuse](https://github.com/apache/datafusion/pull/22983/changes#diff-84ceab5d992f30b340d52094b28685b5ecfd6f68f27867641ca1a9453ad279baR342)).
   
   As for this PR specifically, I do think it adds significant value. Would 
restricting the supported value types be a viable path forward? It would keep 
the type combinations from exploding while still letting the community benefit 
from the speedup.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Optimize Dictionary groupings [datafusion]

Reply via email to