This is an automated email from the ASF dual-hosted git repository.
felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2422994de0 GH-39463: [C++] Support cast kernel from large string,
(large) binary to dictionary (#40017)
2422994de0 is described below
commit 2422994de04cf4f5a989fec0f00fabccad15b03f
Author: Hyunseok Seo <[email protected]>
AuthorDate: Thu Feb 15 01:16:16 2024 +0900
GH-39463: [C++] Support cast kernel from large string, (large) binary to
dictionary (#40017)
### Rationale for this change
Support the `cast` kernel from large string (`large_utf8()`) and (large)
binary (`binary()`, `large_binary()`) to `dictionary`
### What changes are included in this PR?
- Support `cast` kernel
- from large string(`large_utf8()`) to `dictionary`
- from binary(`binary()`) to `dictionary`
- from large binary(`large_binary()`) to `dictionary`
### Are these changes tested?
Yes. It is covered by the existing test cases.
### Are there any user-facing changes?
No.
* Closes: #39463
Authored-by: Hyunseok Seo <[email protected]>
Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
---
.../compute/kernels/scalar_cast_dictionary.cc | 14 ++++--
cpp/src/arrow/scalar_test.cc | 56 +++++++++++-----------
2 files changed, 38 insertions(+), 32 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
index f13aa26d96..ae88ef1cb7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
@@ -45,11 +45,12 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan&
batch, ExecResult* o
return Status::OK();
}
- // If the input type is STRING, it is first encoded as a dictionary to
facilitate
- // processing. This approach allows the subsequent code to uniformly handle
STRING
- // inputs as if they were originally provided in dictionary format. Encoding
as a
- // dictionary helps in reusing the same logic for dictionary operations.
- if (batch[0].type()->id() == Type::STRING) {
+ // If the input type is string or binary-like, it is first encoded as a
dictionary to
+ // facilitate processing. This approach allows the subsequent code to
uniformly handle
+ // string or binary-like inputs as if they were originally provided in
dictionary
+ // format. Encoding as a dictionary helps in reusing the same logic for
dictionary
+ // operations.
+ if (is_base_binary_like(in_array->type->id())) {
in_array = DictionaryEncode(in_array)->array();
}
const auto& in_type = checked_cast<const DictionaryType&>(*in_array->type);
@@ -98,6 +99,9 @@ std::vector<std::shared_ptr<CastFunction>>
GetDictionaryCasts() {
AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dict.get());
AddDictionaryCast<DictionaryType>(cast_dict.get());
AddDictionaryCast<StringType>(cast_dict.get());
+ AddDictionaryCast<LargeStringType>(cast_dict.get());
+ AddDictionaryCast<BinaryType>(cast_dict.get());
+ AddDictionaryCast<LargeBinaryType>(cast_dict.get());
return {cast_dict};
}
diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc
index d9fb3feaee..09dfde3227 100644
--- a/cpp/src/arrow/scalar_test.cc
+++ b/cpp/src/arrow/scalar_test.cc
@@ -1482,33 +1482,35 @@ TEST(TestDictionaryScalar, ValidateErrors) {
TEST(TestDictionaryScalar, Cast) {
for (auto index_ty : all_dictionary_index_types()) {
- auto ty = dictionary(index_ty, utf8());
- auto dict = checked_pointer_cast<StringArray>(
- ArrayFromJSON(utf8(), R"(["alpha", null, "gamma"])"));
-
- for (int64_t i = 0; i < dict->length(); ++i) {
- auto alpha =
- dict->IsValid(i) ? MakeScalar(dict->GetString(i)) :
MakeNullScalar(utf8());
- // Cast string to dict(..., string)
- ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty));
- const auto& cast_alpha = cast_alpha_datum.scalar();
- ASSERT_OK(cast_alpha->ValidateFull());
- ASSERT_OK_AND_ASSIGN(
- auto roundtripped_alpha,
- checked_cast<const
DictionaryScalar&>(*cast_alpha).GetEncodedValue());
-
- ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i));
- auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty);
- ASSERT_OK(alpha_dict.ValidateFull());
- ASSERT_OK_AND_ASSIGN(
- auto encoded_alpha,
- checked_cast<const DictionaryScalar&>(alpha_dict).GetEncodedValue());
-
- AssertScalarsEqual(*alpha, *roundtripped_alpha);
- AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha);
-
- // dictionaries differ, though encoded values are identical
- ASSERT_FALSE(alpha_dict.Equals(*cast_alpha));
+ for (auto value_ty : {utf8(), large_utf8(), binary(), large_binary()}) {
+ auto ty = dictionary(index_ty, value_ty);
+ auto dict = ArrayFromJSON(value_ty, R"(["alpha", null, "gamma"])");
+ ASSERT_OK(dict->ValidateFull());
+
+ for (int64_t i = 0; i < dict->length(); ++i) {
+ ASSERT_OK_AND_ASSIGN(auto alpha, dict->GetScalar(i));
+
+ // Cast string to dict(..., string)
+ ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty));
+ const auto& cast_alpha = cast_alpha_datum.scalar();
+ ASSERT_OK(cast_alpha->ValidateFull());
+ ASSERT_OK_AND_ASSIGN(
+ auto roundtripped_alpha,
+ checked_cast<const
DictionaryScalar&>(*cast_alpha).GetEncodedValue());
+
+ ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i));
+ auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty);
+ ASSERT_OK(alpha_dict.ValidateFull());
+ ASSERT_OK_AND_ASSIGN(
+ auto encoded_alpha,
+ checked_cast<const
DictionaryScalar&>(alpha_dict).GetEncodedValue());
+
+ AssertScalarsEqual(*alpha, *roundtripped_alpha);
+ AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha);
+
+ // dictionaries differ, though encoded values are identical
+ ASSERT_FALSE(alpha_dict.Equals(*cast_alpha));
+ }
}
}
}