This is an automated email from the ASF dual-hosted git repository.

felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 2422994de0 GH-39463: [C++] Support cast kernel from large string, 
(large) binary to dictionary (#40017)
2422994de0 is described below

commit 2422994de04cf4f5a989fec0f00fabccad15b03f
Author: Hyunseok Seo <[email protected]>
AuthorDate: Thu Feb 15 01:16:16 2024 +0900

    GH-39463: [C++] Support cast kernel from large string, (large) binary to 
dictionary (#40017)
    
    
    
    ### Rationale for this change
    
    Support the `cast` kernel from large string (`large_utf8()`) and (large)
    binary (`binary()`, `large_binary()`) to `dictionary`.
    
    ### What changes are included in this PR?
    
    - Support `cast` kernel
      - from large string(`large_utf8()`) to `dictionary`
      - from binary(`binary()`) to `dictionary`
      - from large binary(`large_binary()`) to `dictionary`
    
    ### Are these changes tested?
    
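    Yes. They are covered by the existing `TestDictionaryScalar.Cast` test, which now iterates over the `utf8()`, `large_utf8()`, `binary()` and `large_binary()` value types.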
    
    ### Are there any user-facing changes?
    
    No.
    
    * Closes: #39463
    
    Authored-by: Hyunseok Seo <[email protected]>
    Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
---
 .../compute/kernels/scalar_cast_dictionary.cc      | 14 ++++--
 cpp/src/arrow/scalar_test.cc                       | 56 +++++++++++-----------
 2 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc 
b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
index f13aa26d96..ae88ef1cb7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
@@ -45,11 +45,12 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& 
batch, ExecResult* o
     return Status::OK();
   }
 
-  // If the input type is STRING, it is first encoded as a dictionary to 
facilitate
-  // processing. This approach allows the subsequent code to uniformly handle 
STRING
-  // inputs as if they were originally provided in dictionary format. Encoding 
as a
-  // dictionary helps in reusing the same logic for dictionary operations.
-  if (batch[0].type()->id() == Type::STRING) {
+  // If the input type is string or binary-like, it is first encoded as a 
dictionary to
+  // facilitate processing. This approach allows the subsequent code to 
uniformly handle
+  // string or binary-like inputs as if they were originally provided in 
dictionary
+  // format. Encoding as a dictionary helps in reusing the same logic for 
dictionary
+  // operations.
+  if (is_base_binary_like(in_array->type->id())) {
     in_array = DictionaryEncode(in_array)->array();
   }
   const auto& in_type = checked_cast<const DictionaryType&>(*in_array->type);
@@ -98,6 +99,9 @@ std::vector<std::shared_ptr<CastFunction>> 
GetDictionaryCasts() {
   AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dict.get());
   AddDictionaryCast<DictionaryType>(cast_dict.get());
   AddDictionaryCast<StringType>(cast_dict.get());
+  AddDictionaryCast<LargeStringType>(cast_dict.get());
+  AddDictionaryCast<BinaryType>(cast_dict.get());
+  AddDictionaryCast<LargeBinaryType>(cast_dict.get());
 
   return {cast_dict};
 }
diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc
index d9fb3feaee..09dfde3227 100644
--- a/cpp/src/arrow/scalar_test.cc
+++ b/cpp/src/arrow/scalar_test.cc
@@ -1482,33 +1482,35 @@ TEST(TestDictionaryScalar, ValidateErrors) {
 
 TEST(TestDictionaryScalar, Cast) {
   for (auto index_ty : all_dictionary_index_types()) {
-    auto ty = dictionary(index_ty, utf8());
-    auto dict = checked_pointer_cast<StringArray>(
-        ArrayFromJSON(utf8(), R"(["alpha", null, "gamma"])"));
-
-    for (int64_t i = 0; i < dict->length(); ++i) {
-      auto alpha =
-          dict->IsValid(i) ? MakeScalar(dict->GetString(i)) : 
MakeNullScalar(utf8());
-      // Cast string to dict(..., string)
-      ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty));
-      const auto& cast_alpha = cast_alpha_datum.scalar();
-      ASSERT_OK(cast_alpha->ValidateFull());
-      ASSERT_OK_AND_ASSIGN(
-          auto roundtripped_alpha,
-          checked_cast<const 
DictionaryScalar&>(*cast_alpha).GetEncodedValue());
-
-      ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i));
-      auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty);
-      ASSERT_OK(alpha_dict.ValidateFull());
-      ASSERT_OK_AND_ASSIGN(
-          auto encoded_alpha,
-          checked_cast<const DictionaryScalar&>(alpha_dict).GetEncodedValue());
-
-      AssertScalarsEqual(*alpha, *roundtripped_alpha);
-      AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha);
-
-      // dictionaries differ, though encoded values are identical
-      ASSERT_FALSE(alpha_dict.Equals(*cast_alpha));
+    for (auto value_ty : {utf8(), large_utf8(), binary(), large_binary()}) {
+      auto ty = dictionary(index_ty, value_ty);
+      auto dict = ArrayFromJSON(value_ty, R"(["alpha", null, "gamma"])");
+      ASSERT_OK(dict->ValidateFull());
+
+      for (int64_t i = 0; i < dict->length(); ++i) {
+        ASSERT_OK_AND_ASSIGN(auto alpha, dict->GetScalar(i));
+
+        // Cast string to dict(..., string)
+        ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty));
+        const auto& cast_alpha = cast_alpha_datum.scalar();
+        ASSERT_OK(cast_alpha->ValidateFull());
+        ASSERT_OK_AND_ASSIGN(
+            auto roundtripped_alpha,
+            checked_cast<const 
DictionaryScalar&>(*cast_alpha).GetEncodedValue());
+
+        ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i));
+        auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty);
+        ASSERT_OK(alpha_dict.ValidateFull());
+        ASSERT_OK_AND_ASSIGN(
+            auto encoded_alpha,
+            checked_cast<const 
DictionaryScalar&>(alpha_dict).GetEncodedValue());
+
+        AssertScalarsEqual(*alpha, *roundtripped_alpha);
+        AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha);
+
+        // dictionaries differ, though encoded values are identical
+        ASSERT_FALSE(alpha_dict.Equals(*cast_alpha));
+      }
     }
   }
 }

Reply via email to