This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git


The following commit(s) were added to refs/heads/main by this push:
     new d88a8f5cb fix: Update FuzzDataGenerator to produce dictionary-encoded 
string arrays & fix bugs that this exposes (#2635)
d88a8f5cb is described below

commit d88a8f5cb3c63b83ee4c374623659ce1b3f63d95
Author: Andy Grove <[email protected]>
AuthorDate: Thu Oct 23 13:58:40 2025 -0600

    fix: Update FuzzDataGenerator to produce dictionary-encoded string arrays & 
fix bugs that this exposes (#2635)
---
 native/core/src/execution/operators/scan.rs        | 27 ++++++----------------
 native/core/src/execution/planner.rs               |  7 ++----
 .../apache/comet/testing/FuzzDataGenerator.scala   |  3 +++
 3 files changed, 12 insertions(+), 25 deletions(-)

diff --git a/native/core/src/execution/operators/scan.rs 
b/native/core/src/execution/operators/scan.rs
index 502fa0bd1..fef4a5b9e 100644
--- a/native/core/src/execution/operators/scan.rs
+++ b/native/core/src/execution/operators/scan.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::execution::operators::copy_array;
+use crate::execution::operators::{copy_array, copy_or_unpack_array, CopyMode};
 use crate::{
     errors::CometError,
     execution::{
@@ -147,27 +147,13 @@ impl ScanExec {
         })
     }
 
-    /// Checks if the input data type `dt` is a dictionary type with primitive 
value type.
-    /// If so, unpacks it and returns the primitive value type.
-    ///
-    /// Otherwise, this returns the original data type.
-    ///
-    /// This is necessary since DataFusion doesn't handle dictionary array 
with values
-    /// being primitive type.
-    ///
-    /// TODO: revisit this once DF has improved its dictionary type support. 
Ideally we shouldn't
-    ///   do this in Comet but rather let DF to handle it for us.
+    /// Unpack all dictionary types because some DataFusion operators
+    /// and expressions do not support dictionary types
     fn unpack_dictionary_type(dt: &DataType) -> DataType {
         if let DataType::Dictionary(_, vt) = dt {
-            if !matches!(
-                vt.as_ref(),
-                DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | 
DataType::LargeBinary
-            ) {
-                // return the underlying data type
-                return vt.as_ref().clone();
-            }
+            // return the underlying data type
+            return vt.as_ref().clone();
         }
-
         dt.clone()
     }
 
@@ -280,7 +266,8 @@ impl ScanExec {
 
             let array = if arrow_ffi_safe {
                 // ownership of this array has been transferred to native
-                array
+                // but we still need to unpack dictionary arrays
+                copy_or_unpack_array(&array, &CopyMode::UnpackOrClone)?
             } else {
                 // it is necessary to copy the array because the contents may 
be
                 // overwritten on the JVM side in the future
diff --git a/native/core/src/execution/planner.rs 
b/native/core/src/execution/planner.rs
index 1550efd79..da56e01bb 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -3120,11 +3120,8 @@ mod tests {
                         assert!(batch.is_ok(), "got error {}", 
batch.unwrap_err());
                         let batch = batch.unwrap();
                         assert_eq!(batch.num_rows(), row_count / 4);
-                        // string/binary should still be packed with dictionary
-                        assert!(matches!(
-                            batch.column(0).data_type(),
-                            DataType::Dictionary(_, _)
-                        ));
+                        // string/binary should no longer be packed with 
dictionary
+                        assert!(matches!(batch.column(0).data_type(), 
DataType::Utf8));
                     }
                     Poll::Ready(None) => {
                         break;
diff --git 
a/spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala 
b/spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala
index 087221e1a..188da1d79 100644
--- a/spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala
+++ b/spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala
@@ -195,6 +195,9 @@ object FuzzDataGenerator {
             case 2 => r.nextLong().toString
             case 3 => r.nextDouble().toString
             case 4 => RandomStringUtils.randomAlphabetic(8)
+            case 5 =>
+              // use a constant value to trigger dictionary encoding
+              "dict_encode_me!"
             case _ => r.nextString(8)
           }
         })


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to