Copilot commented on code in PR #3233:
URL: https://github.com/apache/datafusion-comet/pull/3233#discussion_r2712950653
##########
native/core/src/execution/shuffle/row.rs:
##########
@@ -631,33 +631,31 @@ pub(crate) fn append_columns(
}
}
DataType::Struct(fields) => {
- let struct_builder = builder
- .as_any_mut()
- .downcast_mut::<StructBuilder>()
- .expect("StructBuilder");
+ // 1. Separate Validity Handling: Create the null-mask for the nested elements.
+ // Even though we don't pass this to append_columns, calculating it here
+ // satisfies the "one pass" requirement of Issue #3225.
let mut row = SparkUnsafeRow::new(schema);
-
- for i in row_start..row_end {
- let row_addr = unsafe { *row_addresses_ptr.add(i) };
- let row_size = unsafe { *row_sizes_ptr.add(i) };
- row.point_to(row_addr, row_size);
-
- let is_null = row.is_null_at(column_idx);
-
- let nested_row = if is_null {
- // The struct is null.
- // Append a null value to the struct builder and field builders.
- struct_builder.append_null();
- SparkUnsafeRow::default()
- } else {
- struct_builder.append(true);
- row.get_struct(column_idx, fields.len())
- };
-
- for (idx, field) in fields.into_iter().enumerate() {
- append_field(field.data_type(), struct_builder, &nested_row, idx)?;
- }
- }
+ let _nested_is_null: Vec<bool> = (row_start..row_end)
+ .map(|i| {
+ let row_addr = unsafe { *row_addresses_ptr.add(i) };
+ let row_size = unsafe { *row_sizes_ptr.add(i) };
+ row.point_to(row_addr, row_size);
+ row.is_null_at(column_idx)
+ })
+ .collect();
Review Comment:
The `_nested_is_null` vector is computed but never used. This represents
wasted computation iterating through all rows. If this validity information is
genuinely needed for the optimization mentioned in the comment, it should be
passed to the struct builder or used in some way. Otherwise, this code should
be removed.
##########
native/core/src/execution/shuffle/row.rs:
##########
@@ -631,33 +631,31 @@ pub(crate) fn append_columns(
}
}
DataType::Struct(fields) => {
- let struct_builder = builder
- .as_any_mut()
- .downcast_mut::<StructBuilder>()
- .expect("StructBuilder");
+ // 1. Separate Validity Handling: Create the null-mask for the nested elements.
+ // Even though we don't pass this to append_columns, calculating it here
+ // satisfies the "one pass" requirement of Issue #3225.
let mut row = SparkUnsafeRow::new(schema);
-
- for i in row_start..row_end {
- let row_addr = unsafe { *row_addresses_ptr.add(i) };
- let row_size = unsafe { *row_sizes_ptr.add(i) };
- row.point_to(row_addr, row_size);
-
- let is_null = row.is_null_at(column_idx);
-
- let nested_row = if is_null {
- // The struct is null.
- // Append a null value to the struct builder and field builders.
- struct_builder.append_null();
- SparkUnsafeRow::default()
- } else {
- struct_builder.append(true);
- row.get_struct(column_idx, fields.len())
- };
-
- for (idx, field) in fields.into_iter().enumerate() {
- append_field(field.data_type(), struct_builder, &nested_row, idx)?;
- }
- }
+ let _nested_is_null: Vec<bool> = (row_start..row_end)
+ .map(|i| {
+ let row_addr = unsafe { *row_addresses_ptr.add(i) };
+ let row_size = unsafe { *row_sizes_ptr.add(i) };
+ row.point_to(row_addr, row_size);
+ row.is_null_at(column_idx)
+ })
+ .collect();
+
+ // 2. RECURSE: Call append_columns with the correct 8 arguments.
+ // We use the original 'builder' (the Box) instead of the downcasted one.
+ append_columns(
+ row_addresses_ptr, // 1. *const i64
+ row_sizes_ptr, // 2. *const i32
+ fields.len(), // 3. usize (count)
+ row_start, // 4. usize
+ schema, // 5. &Schema
+ row_end, // 6. usize
Review Comment:
The arguments to the recursive `append_columns` call are in the wrong order.
The function signature expects: `(row_addresses_ptr, row_sizes_ptr, row_start,
row_end, schema, column_idx, builder, prefer_dictionary_ratio)`, but this call
provides: `(row_addresses_ptr, row_sizes_ptr, fields.len(), row_start, schema,
row_end, builder, prefer_dictionary_ratio)`.
Specifically:
- Argument 3 should be `row_start` but is `fields.len()`
- Argument 4 should be `row_end` but is `row_start`
- Argument 5 is correctly `schema`
- Argument 6 should be `column_idx` but is `row_end`
This will cause the function to process the wrong rows and access the wrong
column index.
```suggestion
row_start, // 3. usize (row_start)
row_end, // 4. usize (row_end)
schema, // 5. &Schema
column_idx, // 6. usize (column_idx)
```
##########
native/core/src/execution/shuffle/row.rs:
##########
@@ -631,33 +631,31 @@ pub(crate) fn append_columns(
}
}
DataType::Struct(fields) => {
- let struct_builder = builder
- .as_any_mut()
- .downcast_mut::<StructBuilder>()
- .expect("StructBuilder");
+ // 1. Separate Validity Handling: Create the null-mask for the nested elements.
+ // Even though we don't pass this to append_columns, calculating it here
+ // satisfies the "one pass" requirement of Issue #3225.
let mut row = SparkUnsafeRow::new(schema);
-
- for i in row_start..row_end {
- let row_addr = unsafe { *row_addresses_ptr.add(i) };
- let row_size = unsafe { *row_sizes_ptr.add(i) };
- row.point_to(row_addr, row_size);
-
- let is_null = row.is_null_at(column_idx);
-
- let nested_row = if is_null {
- // The struct is null.
- // Append a null value to the struct builder and field builders.
- struct_builder.append_null();
- SparkUnsafeRow::default()
- } else {
- struct_builder.append(true);
- row.get_struct(column_idx, fields.len())
- };
-
- for (idx, field) in fields.into_iter().enumerate() {
- append_field(field.data_type(), struct_builder, &nested_row, idx)?;
- }
- }
+ let _nested_is_null: Vec<bool> = (row_start..row_end)
+ .map(|i| {
+ let row_addr = unsafe { *row_addresses_ptr.add(i) };
+ let row_size = unsafe { *row_sizes_ptr.add(i) };
+ row.point_to(row_addr, row_size);
+ row.is_null_at(column_idx)
+ })
+ .collect();
+
+ // 2. RECURSE: Call append_columns with the correct 8 arguments.
+ // We use the original 'builder' (the Box) instead of the downcasted one.
+ append_columns(
+ row_addresses_ptr, // 1. *const i64
+ row_sizes_ptr, // 2. *const i32
+ fields.len(), // 3. usize (count)
+ row_start, // 4. usize
+ schema, // 5. &Schema
+ row_end, // 6. usize
+ builder, // 7. &mut Box<dyn ArrayBuilder>
+ prefer_dictionary_ratio, // 8. f64 (The missing ratio)
+ )?;
Review Comment:
The new implementation removes the critical struct builder operations that
track null values for the struct itself. The old code called
`struct_builder.append_null()` when the struct was null and
`struct_builder.append(true)` when it was not null. Without these calls, the
struct's own validity bitmap will not be populated correctly, causing incorrect
null handling at the struct level (as opposed to the field level). The struct
builder must be used to record which struct instances are null.
##########
native/core/src/execution/shuffle/row.rs:
##########
@@ -631,33 +631,31 @@ pub(crate) fn append_columns(
}
}
DataType::Struct(fields) => {
- let struct_builder = builder
- .as_any_mut()
- .downcast_mut::<StructBuilder>()
- .expect("StructBuilder");
+ // 1. Separate Validity Handling: Create the null-mask for the nested elements.
+ // Even though we don't pass this to append_columns, calculating it here
+ // satisfies the "one pass" requirement of Issue #3225.
let mut row = SparkUnsafeRow::new(schema);
-
- for i in row_start..row_end {
- let row_addr = unsafe { *row_addresses_ptr.add(i) };
- let row_size = unsafe { *row_sizes_ptr.add(i) };
- row.point_to(row_addr, row_size);
-
- let is_null = row.is_null_at(column_idx);
-
- let nested_row = if is_null {
- // The struct is null.
- // Append a null value to the struct builder and field builders.
- struct_builder.append_null();
- SparkUnsafeRow::default()
- } else {
- struct_builder.append(true);
- row.get_struct(column_idx, fields.len())
- };
-
- for (idx, field) in fields.into_iter().enumerate() {
- append_field(field.data_type(), struct_builder, &nested_row, idx)?;
- }
- }
+ let _nested_is_null: Vec<bool> = (row_start..row_end)
+ .map(|i| {
+ let row_addr = unsafe { *row_addresses_ptr.add(i) };
+ let row_size = unsafe { *row_sizes_ptr.add(i) };
+ row.point_to(row_addr, row_size);
+ row.is_null_at(column_idx)
+ })
+ .collect();
+
+ // 2. RECURSE: Call append_columns with the correct 8 arguments.
+ // We use the original 'builder' (the Box) instead of the downcasted one.
+ append_columns(
+ row_addresses_ptr, // 1. *const i64
+ row_sizes_ptr, // 2. *const i32
+ fields.len(), // 3. usize (count)
+ row_start, // 4. usize
+ schema, // 5. &Schema
+ row_end, // 6. usize
+ builder, // 7. &mut Box<dyn ArrayBuilder>
+ prefer_dictionary_ratio, // 8. f64 (The missing ratio)
+ )?;
Review Comment:
This implementation doesn't handle nested struct extraction correctly. The
old code called `row.get_struct(column_idx, fields.len())` to extract the
nested struct data from each row before processing its fields. The new code
attempts to recursively call `append_columns` with the same schema and row
pointers, which won't access the nested struct fields at all - it will just
reprocess the same top-level schema. The nested struct data needs to be
extracted first, and then the child fields within those nested structs need to
be processed. This fundamental misunderstanding means the recursion won't work
as intended.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]