alchemist51 commented on code in PR #15591:
URL: https://github.com/apache/datafusion/pull/15591#discussion_r2726532346
##########
datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs:
##########
@@ -37,12 +47,10 @@ use datafusion_expr_common::groups_accumulator::EmitTo;
#[derive(Debug)]
pub enum SeenValues {
/// All groups seen so far have seen at least one non-null value
- All {
Review Comment:
probably unintended change
##########
datafusion/common/src/config.rs:
##########
@@ -647,6 +647,16 @@ config_namespace! {
/// the remote end point.
pub objectstore_writer_buffer_size: usize, default = 10 * 1024 * 1024
+ /// Should DataFusion use a blocked approach to manage grouping state.
+ /// By default, the blocked approach is used which
+ /// allocates capacity based on a predefined block size firstly.
+ /// When the block reaches its limit, we allocate a new block (also
with
+ /// the same predefined block size based capacity) instead of expanding
+ /// the current one and copying the data.
+ /// If `false`, a single allocation approach is used, where
+ /// values are managed within a single large memory block.
+ /// As this block grows, it often triggers numerous copies, resulting
in poor performance.
+ pub enable_aggregation_blocked_groups: bool, default = true
Review Comment:
It should be false by default for now?
##########
datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs:
##########
@@ -121,26 +136,29 @@ pub struct NullState {
///
/// If `seen_values` is `SeenValues::All`, all groups have seen at least
one non null value
seen_values: SeenValues,
-}
-impl Default for NullState {
- fn default() -> Self {
- Self::new()
- }
+ /// Size of one seen values block, can be None if only desire single block
+ block_size: Option<usize>,
+
+ /// phantom data for required type `<O>`
+ _phantom: PhantomData<O>,
}
-impl NullState {
- pub fn new() -> Self {
+impl<O: GroupIndexOperations> NullState<O> {
+ /// Create a new `NullState`
+ pub fn new(block_size: Option<usize>) -> Self {
Self {
seen_values: SeenValues::All { num_values: 0 },
+ block_size,
+ _phantom: PhantomData,
Review Comment:
Didn't quite get the phantom usage
##########
datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs:
##########
@@ -60,27 +68,34 @@ impl SeenValues {
///
/// The builder is then ensured to have at least `total_num_groups` length,
/// with any new entries initialized to false.
- fn get_builder(&mut self, total_num_groups: usize) -> &mut
BooleanBufferBuilder {
- match self {
+ fn get_big_enough_builder(
+ &mut self,
+ total_num_groups: usize,
+ block_size: Option<usize>,
+ ) -> &mut Blocks<BooleanBufferBuilder> {
+ // If `self` is `SeenValues::All`, transition it to `SeenValues::Some`
with `num_values trues` firstly,
+ // then return mutable reference to the builder.
+ // If `self` is `SeenValues::Some`, just directly return mutable
reference to the builder.
+ let new_block = |block_size: Option<usize>| {
Review Comment:
should we move it to SeenValues::All match call rather?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]