jayzhan211 commented on code in PR #12996:
URL: https://github.com/apache/datafusion/pull/12996#discussion_r1819950324


##########
datafusion/physical-plan/src/aggregates/group_values/column.rs:
##########
@@ -15,33 +15,107 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::ops::Sub;
+use std::{iter, mem, usize};
+
 use crate::aggregates::group_values::group_column::{
     ByteGroupValueBuilder, ByteViewGroupValueBuilder, GroupColumn,
     PrimitiveGroupValueBuilder,
 };
 use crate::aggregates::group_values::GroupValues;
 use ahash::RandomState;
-use arrow::compute::cast;
+use arrow::compute::{self, cast};
 use arrow::datatypes::{
     BinaryViewType, Date32Type, Date64Type, Float32Type, Float64Type, 
Int16Type,
     Int32Type, Int64Type, Int8Type, StringViewType, UInt16Type, UInt32Type, 
UInt64Type,
     UInt8Type,
 };
 use arrow::record_batch::RecordBatch;
-use arrow_array::{Array, ArrayRef};
-use arrow_schema::{DataType, Schema, SchemaRef};
+use arrow_array::{
+    Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
+    Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, 
Int32Array,
+    Int64Array, Int8Array, LargeStringArray, StringArray, StringViewArray,
+    TimestampMicrosecondArray, TimestampMillisecondArray, 
TimestampNanosecondArray,
+    TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+};
+use arrow_schema::{DataType, Schema, SchemaRef, TimeUnit};
 use datafusion_common::hash_utils::create_hashes;
 use datafusion_common::{not_impl_err, DataFusionError, Result};
 use datafusion_execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt};
 use datafusion_expr::EmitTo;
 use datafusion_physical_expr::binary_map::OutputType;
 
-use hashbrown::raw::RawTable;
+use datafusion_physical_expr_common::datum::compare_with_eq;
+use hashbrown::raw::{Bucket, RawTable};
 
-/// A [`GroupValues`] that stores multiple columns of group values.
+const NON_INLINED_FLAG: u64 = 0x8000000000000000;
+const VALUE_MASK: u64 = 0x7FFFFFFFFFFFFFFF;
+
+/// `GroupIndexView` is a packed struct
 ///
+/// ### Format:
 ///
-pub struct GroupValuesColumn {
+///   +---------------------+--------------------+
+///   | checking flag(1bit) | group index(63bit) |
+///   +---------------------+--------------------+
+///    
+/// ### Checking flag
+///
+///   It is possible that rows with the same hash value exist in `input cols`.
+///   And if we `vectorized_equal_to` and `vectorized append` them
+///   in the same round, some faulty cases will occur, especially when
+///   they are completely duplicated rows...
+///
+///   For example:
+///     - Two repeated rows exist in `input cols`.
+///
+///     - We found their hash values equal to one existing group
+///
+///     - We then perform `vectorized_equal_to` for them against the existing
+///       group, and found their values not equal to the existing one
+///
+///     - Finally, when performing `vectorized append`, we decide to build two
+///       separate new groups for them, even though we actually just need one
+///       new group...
+///
+///   So, to solve such cases simply, if some rows with the same hash value
+///   exist in `input cols`, we only allow one of them to be processed in a
+///   round, and this flag is used to indicate that one of them is being
+///   processed in the current round.
+///
+/// ### Group index
+///
+///     The group's index in group values
+///
+#[derive(Debug, Clone, Copy)]
+struct GroupIndexView(u64);
+
+impl GroupIndexView {
+    #[inline]
+    pub fn is_non_inlined(&self) -> bool {

Review Comment:
   `inline` is a bit confusing to understand at first. I believe the main idea
is to determine whether the group value already exists (is inlined) or not.
Would `exists` be a better name for this?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to