Rachelint commented on code in PR #12996: URL: https://github.com/apache/datafusion/pull/12996#discussion_r1826167201
########## datafusion/physical-plan/src/aggregates/group_values/column.rs: ########## @@ -356,3 +1048,461 @@ impl GroupValues for GroupValuesColumn { self.hashes_buffer.shrink_to(count); } } + +/// Returns true if [`GroupValuesColumn`] or [`VectorizedGroupValuesColumn`] +/// supported for the specified schema +pub fn supported_schema(schema: &Schema) -> bool { + schema + .fields() + .iter() + .map(|f| f.data_type()) + .all(supported_type) +} + +/// Returns true if the specified data type is supported by +/// [`GroupValuesColumn`] or [`VectorizedGroupValuesColumn`] +/// +/// In order to be supported, there must be a specialized implementation of +/// [`GroupColumn`] for the data type, instantiated in [`GroupValuesColumn::intern`] +/// or [`VectorizedGroupValuesColumn::intern`] +fn supported_type(data_type: &DataType) -> bool { + matches!( + *data_type, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float32 + | DataType::Float64 + | DataType::Utf8 + | DataType::LargeUtf8 + | DataType::Binary + | DataType::LargeBinary + | DataType::Date32 + | DataType::Date64 + | DataType::Utf8View + | DataType::BinaryView + ) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::{compute::concat_batches, util::pretty::pretty_format_batches}; + use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray, StringViewArray}; + use arrow_schema::{DataType, Field, Schema, SchemaRef}; + use datafusion_expr::EmitTo; + + use crate::aggregates::group_values::{ + column::VectorizedGroupValuesColumn, GroupValues, + }; + + #[test] + fn test_intern_for_vectorized_group_values() { + let data_set = VectorizedTestDataSet::new(); + let mut group_values = + VectorizedGroupValuesColumn::try_new(data_set.schema()).unwrap(); + + data_set.load_to_group_values(&mut group_values); + let actual_batch = group_values.emit(EmitTo::All).unwrap(); + let actual_batch = RecordBatch::try_new(data_set.schema(), actual_batch).unwrap(); + + check_result(&actual_batch, &data_set.expected_batch); + } + + #[test] + fn test_emit_first_n_for_vectorized_group_values() { Review Comment: https://github.com/apache/datafusion/pull/12996#discussion_r1824623183 has been finished, now we can totally avoid the unnecessary memory allocations for modifying `group_index_lists`! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org