alamb commented on code in PR #21586:
URL: https://github.com/apache/datafusion/pull/21586#discussion_r3106965696
##########
datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs:
##########
@@ -30,6 +30,12 @@ use std::sync::Arc;
const BYTE_VIEW_MAX_BLOCK_SIZE: usize = 2 * 1024 * 1024;
+#[derive(Clone, Copy)]
Review Comment:
I think it would help to add some comments explaining what this is and what it
is used for, so a future reader doesn't have to read the rest of the code to
figure it out.
##########
datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs:
##########
@@ -179,6 +181,133 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
}
}
+ fn vectorized_append_non_null_views(
+ &mut self,
+ array: &GenericByteViewArray<B>,
+ rows: &[usize],
+ ) {
+ let source_views = array.views();
+ self.views.reserve(rows.len());
+
+ if array.data_buffers().is_empty() {
+ self.views.extend(rows.iter().map(|&row| source_views[row]));
+ return;
+ }
+
+ let start_idx = self.views.len();
+ let mut pending = Vec::with_capacity(rows.len().saturating_add(1) / 2);
Review Comment:
You could probably save this allocation by using some sort of temp scratch
space on ByteViewGroupValueBuilder
Something like:
```rust
pub struct ByteViewGroupValueBuilder<B: ByteViewType> {
...
scratch: Vec<PendingByteViewCopy>,
}
```
So then it wouldn't reallocate on each batch
##########
datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs:
##########
@@ -179,6 +181,133 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
}
}
+ fn vectorized_append_non_null_views(
+ &mut self,
+ array: &GenericByteViewArray<B>,
+ rows: &[usize],
+ ) {
+ let source_views = array.views();
+ self.views.reserve(rows.len());
+
+ if array.data_buffers().is_empty() {
+ self.views.extend(rows.iter().map(|&row| source_views[row]));
+ return;
+ }
+
+ let start_idx = self.views.len();
+ let mut pending = Vec::with_capacity(rows.len().saturating_add(1) / 2);
+ for (idx, &row) in rows.iter().enumerate() {
+ let view = source_views[row];
+ self.views.push(view);
+ if (view as u32) > 12 {
+ pending.push(PendingByteViewCopy {
+ dest_index: start_idx + idx,
+ source: ByteView::from(view),
+ });
+ }
+ }
+
+ self.batch_copy_long_views(array.data_buffers(), &pending);
+ }
+
+ fn vectorized_append_views_with_nulls(
+ &mut self,
+ array: &GenericByteViewArray<B>,
+ rows: &[usize],
+ ) {
+ let source_views = array.views();
+ let mut pending = Vec::with_capacity(rows.len().saturating_add(1) / 2);
+ self.views.reserve(rows.len());
+
+ for &row in rows {
+ if array.is_null(row) {
+ self.nulls.append(true);
+ self.views.push(0);
+ continue;
+ }
+
+ self.nulls.append(false);
+
+ let view = source_views[row];
+ let dest_index = self.views.len();
+ self.views.push(view);
+
+ if (view as u32) > 12 {
+ pending.push(PendingByteViewCopy {
+ dest_index,
+ source: ByteView::from(view),
+ });
+ }
+ }
+
+ self.batch_copy_long_views(array.data_buffers(), &pending);
+ }
+
+ fn batch_copy_long_views(
Review Comment:
it would help me to understand this if it had some more comments explaining
the rationale for postponing the copy (to detect contiguous views?)
##########
datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs:
##########
@@ -179,6 +181,133 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
}
}
+ fn vectorized_append_non_null_views(
+ &mut self,
+ array: &GenericByteViewArray<B>,
+ rows: &[usize],
+ ) {
+ let source_views = array.views();
+ self.views.reserve(rows.len());
+
+ if array.data_buffers().is_empty() {
+ self.views.extend(rows.iter().map(|&row| source_views[row]));
+ return;
+ }
+
+ let start_idx = self.views.len();
+ let mut pending = Vec::with_capacity(rows.len().saturating_add(1) / 2);
+ for (idx, &row) in rows.iter().enumerate() {
+ let view = source_views[row];
+ self.views.push(view);
+ if (view as u32) > 12 {
Review Comment:
we could replace 12 with `MAX_INLINE_VIEW_LEN` maybe, to make it clearer what
is going on here
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]