This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new e502728d17 GH-34890: [C++][Python] Add a no-op kernel for
dictionary_encode(dictionary) (#38349)
e502728d17 is described below
commit e502728d174e3b9170abe2ec4db6845c2e8eab01
Author: Jin Shang <[email protected]>
AuthorDate: Tue Dec 12 01:01:15 2023 +0800
GH-34890: [C++][Python] Add a no-op kernel for
dictionary_encode(dictionary) (#38349)
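Added a no-op kernel so that calling dictionary_encode on input that is already a dictionary array returns it unchanged, for convenience, as discussed in the issue.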
* Closes: #34890
Lead-authored-by: Jin Shang <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Benjamin Kietzman <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/compute/kernels/vector_hash.cc | 13 ++++++++-----
cpp/src/arrow/compute/kernels/vector_hash_test.cc | 9 +++++++++
docs/source/cpp/compute.rst | 3 ++-
python/pyarrow/tests/test_compute.py | 1 +
4 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc
b/cpp/src/arrow/compute/kernels/vector_hash.cc
index 5426dc4054..65e59d1a2e 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -718,8 +718,9 @@ const DictionaryEncodeOptions*
GetDefaultDictionaryEncodeOptions() {
const FunctionDoc dictionary_encode_doc(
"Dictionary-encode array",
- ("Return a dictionary-encoded version of the input array."), {"array"},
- "DictionaryEncodeOptions");
+ ("Return a dictionary-encoded version of the input array.\n"
+ "This function does nothing if the input is already a dictionary array."),
+ {"array"}, "DictionaryEncodeOptions");
// ----------------------------------------------------------------------
// This function does not use any hashing utilities
@@ -803,9 +804,11 @@ void RegisterVectorHash(FunctionRegistry* registry) {
GetDefaultDictionaryEncodeOptions());
AddHashKernels<DictEncodeAction>(dict_encode.get(), base, DictEncodeOutput);
- // Calling dictionary_encode on dictionary input not supported, but if it
- // ends up being needed (or convenience), a kernel could be added to make it
- // a no-op
+ auto no_op = [](KernelContext*, const ExecSpan& span, ExecResult* out) {
+ out->value = span[0].array.ToArrayData();
+ return Status::OK();
+ };
+ DCHECK_OK(dict_encode->AddKernel({Type::DICTIONARY}, OutputType(FirstType),
no_op));
DCHECK_OK(registry->AddFunction(std::move(dict_encode)));
}
diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc
b/cpp/src/arrow/compute/kernels/vector_hash_test.cc
index 7b713362f6..c4ec74fbaa 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc
@@ -687,6 +687,15 @@ TEST_F(TestHashKernel, DictEncodeIntervalMonth) {
{0, 0, 1, 0, 2});
}
+TEST_F(TestHashKernel, DictEncodeDictInput) {
+ // Dictionary-encoding a dictionary array is a no-op
+ auto dict_ty = dictionary(int32(), utf8());
+ auto dict = ArrayFromJSON(utf8(), R"(["a", "b", "c"])");
+ auto indices = ArrayFromJSON(int32(), "[0, 1, 2, 0, 1, 2, 0, 1, 2]");
+ auto input = std::make_shared<DictionaryArray>(dict_ty, indices, dict);
+ CheckDictEncode(input, dict, indices);
+}
+
TEST_F(TestHashKernel, DictionaryUniqueAndValueCounts) {
auto dict_json = "[10, 20, 30, 40]";
auto dict = ArrayFromJSON(int64(), dict_json);
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 47af976415..17d003b261 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1675,7 +1675,8 @@ Associative transforms
| | | Temporal, Binary- and String-like |
| |
+-------------------+-------+-----------------------------------+-------------+-------+
-* \(1) Output is ``Dictionary(Int32, input type)``.
+* \(1) Output is ``Dictionary(Int32, input type)``. It is a no-op if the input is
+ already a Dictionary array.
* \(2) Duplicates are removed from the output while the original order is
maintained.
diff --git a/python/pyarrow/tests/test_compute.py
b/python/pyarrow/tests/test_compute.py
index 067d96a821..7c5a134d33 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1781,6 +1781,7 @@ def test_dictionary_decode():
assert array == dictionary_array_decode
assert array == pc.dictionary_decode(array)
+ assert pc.dictionary_encode(dictionary_array) == dictionary_array
def test_cast():