This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new d88a8f5cb fix: Update FuzzDataGenerator to produce dictionary-encoded
string arrays & fix bugs that this exposes (#2635)
d88a8f5cb is described below
commit d88a8f5cb3c63b83ee4c374623659ce1b3f63d95
Author: Andy Grove <[email protected]>
AuthorDate: Thu Oct 23 13:58:40 2025 -0600
fix: Update FuzzDataGenerator to produce dictionary-encoded string arrays &
fix bugs that this exposes (#2635)
---
native/core/src/execution/operators/scan.rs | 27 ++++++----------------
native/core/src/execution/planner.rs | 7 ++----
.../apache/comet/testing/FuzzDataGenerator.scala | 3 +++
3 files changed, 12 insertions(+), 25 deletions(-)
diff --git a/native/core/src/execution/operators/scan.rs
b/native/core/src/execution/operators/scan.rs
index 502fa0bd1..fef4a5b9e 100644
--- a/native/core/src/execution/operators/scan.rs
+++ b/native/core/src/execution/operators/scan.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use crate::execution::operators::copy_array;
+use crate::execution::operators::{copy_array, copy_or_unpack_array, CopyMode};
use crate::{
errors::CometError,
execution::{
@@ -147,27 +147,13 @@ impl ScanExec {
})
}
- /// Checks if the input data type `dt` is a dictionary type with primitive
value type.
- /// If so, unpacks it and returns the primitive value type.
- ///
- /// Otherwise, this returns the original data type.
- ///
- /// This is necessary since DataFusion doesn't handle dictionary array
with values
- /// being primitive type.
- ///
- /// TODO: revisit this once DF has improved its dictionary type support.
Ideally we shouldn't
- /// do this in Comet but rather let DF to handle it for us.
+ /// Unpack all dictionary types because some DataFusion operators
+ /// and expressions do not support dictionary types
fn unpack_dictionary_type(dt: &DataType) -> DataType {
if let DataType::Dictionary(_, vt) = dt {
- if !matches!(
- vt.as_ref(),
- DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary |
DataType::LargeBinary
- ) {
- // return the underlying data type
- return vt.as_ref().clone();
- }
+ // return the underlying data type
+ return vt.as_ref().clone();
}
-
dt.clone()
}
@@ -280,7 +266,8 @@ impl ScanExec {
let array = if arrow_ffi_safe {
// ownership of this array has been transferred to native
- array
+ // but we still need to unpack dictionary arrays
+ copy_or_unpack_array(&array, &CopyMode::UnpackOrClone)?
} else {
// it is necessary to copy the array because the contents may
be
// overwritten on the JVM side in the future
diff --git a/native/core/src/execution/planner.rs
b/native/core/src/execution/planner.rs
index 1550efd79..da56e01bb 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -3120,11 +3120,8 @@ mod tests {
assert!(batch.is_ok(), "got error {}",
batch.unwrap_err());
let batch = batch.unwrap();
assert_eq!(batch.num_rows(), row_count / 4);
- // string/binary should still be packed with dictionary
- assert!(matches!(
- batch.column(0).data_type(),
- DataType::Dictionary(_, _)
- ));
+ // string/binary should no longer be packed with
dictionary
+ assert!(matches!(batch.column(0).data_type(),
DataType::Utf8));
}
Poll::Ready(None) => {
break;
diff --git
a/spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala
b/spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala
index 087221e1a..188da1d79 100644
--- a/spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala
+++ b/spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala
@@ -195,6 +195,9 @@ object FuzzDataGenerator {
case 2 => r.nextLong().toString
case 3 => r.nextDouble().toString
case 4 => RandomStringUtils.randomAlphabetic(8)
+ case 5 =>
+ // use a constant value to trigger dictionary encoding
+ "dict_encode_me!"
case _ => r.nextString(8)
}
})
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]