2010YOUY01 commented on code in PR #12996:
URL: https://github.com/apache/datafusion/pull/12996#discussion_r1826582961


##########
datafusion/physical-plan/src/aggregates/group_values/column.rs:
##########
@@ -35,29 +37,113 @@ use datafusion_common::{not_impl_err, DataFusionError, 
Result};
 use datafusion_execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt};
 use datafusion_expr::EmitTo;
 use datafusion_physical_expr::binary_map::OutputType;
+
 use hashbrown::raw::RawTable;
-use std::mem::size_of;
 
-/// A [`GroupValues`] that stores multiple columns of group values.
+const NON_INLINED_FLAG: u64 = 0x8000000000000000;
+const VALUE_MASK: u64 = 0x7FFFFFFFFFFFFFFF;
+
+/// The view of indices pointing to the actual values in `GroupValues`
 ///
+/// If only single `group index` represented by view,
+/// value of view is just the `group index`, and we call it a `inlined view`.
 ///
-pub struct GroupValuesColumn {
+/// If multiple `group indices` represented by view,
+/// value of view is the actually the index pointing to `group indices`,
+/// and we call it `non-inlined view`.
+///
+/// The view(a u64) format is like:
+///   +---------------------+---------------------------------------------+
+///   | inlined flag(1bit)  | group index / index to group indices(63bit) |
+///   +---------------------+---------------------------------------------+
+///
+/// `inlined flag`: 1 represents `non-inlined`, and 0 represents `inlined`
+///
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+struct GroupIndexView(u64);
+
+impl GroupIndexView {
+    #[inline]
+    pub fn is_non_inlined(&self) -> bool {
+        (self.0 & NON_INLINED_FLAG) > 0
+    }
+
+    #[inline]
+    pub fn new_inlined(group_index: u64) -> Self {
+        Self(group_index)
+    }
+
+    #[inline]
+    pub fn new_non_inlined(list_offset: u64) -> Self {
+        let non_inlined_value = list_offset | NON_INLINED_FLAG;
+        Self(non_inlined_value)
+    }
+
+    #[inline]
+    pub fn value(&self) -> u64 {
+        self.0 & VALUE_MASK
+    }
+}
+
+/// A [`GroupValues`] that stores multiple columns of group values,
+/// and supports vectorized operators for them
+///
+pub struct VectorizedGroupValuesColumn {
     /// The output schema
     schema: SchemaRef,
 
     /// Logically maps group values to a group_index in
     /// [`Self::group_values`] and in each accumulator
     ///
-    /// Uses the raw API of hashbrown to avoid actually storing the
-    /// keys (group values) in the table
+    /// It is a `hashtable` based on `hashbrown`.
     ///
-    /// keys: u64 hashes of the GroupValue
-    /// values: (hash, group_index)
-    map: RawTable<(u64, usize)>,
+    /// Key and value in the `hashtable`:
+    ///   - The `key` is `hash value(u64)` of the `group value`
+    ///   - The `value` is the `group values` with the same `hash value`
+    ///
+    /// We don't really store the actual `group values` in `hashtable`,
+    /// instead we store the `group indices` pointing to values in 
`GroupValues`.
+    /// And we use [`GroupIndexView`] to represent such `group indices` in 
table.
+    ///
+    ///
+    map: RawTable<(u64, GroupIndexView)>,

Review Comment:
   Is it the case
   1. If group1 and group2 have exactly the same hash value, `GroupIndexView` 
will use chaining to resolve the collision
   2. If group1 and group2 have different hash values but map to the same slot 
in hash table, `hashbrown` will handle the collision for you with probing



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to