This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new e502728d17 GH-34890: [C++][Python] Add a no-op kernel for dictionary_encode(dictionary) (#38349)
e502728d17 is described below

commit e502728d174e3b9170abe2ec4db6845c2e8eab01
Author: Jin Shang <[email protected]>
AuthorDate: Tue Dec 12 01:01:15 2023 +0800

    GH-34890: [C++][Python] Add a no-op kernel for dictionary_encode(dictionary) (#38349)
    
    Added a no-op kernel for convenience as discussed in the issue.
    * Closes: #34890
    
    Lead-authored-by: Jin Shang <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Co-authored-by: Benjamin Kietzman <[email protected]>
    Signed-off-by: Benjamin Kietzman <[email protected]>
---
 cpp/src/arrow/compute/kernels/vector_hash.cc      | 13 ++++++++-----
 cpp/src/arrow/compute/kernels/vector_hash_test.cc |  9 +++++++++
 docs/source/cpp/compute.rst                       |  3 ++-
 python/pyarrow/tests/test_compute.py              |  1 +
 4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc
index 5426dc4054..65e59d1a2e 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -718,8 +718,9 @@ const DictionaryEncodeOptions* GetDefaultDictionaryEncodeOptions() {
 
 const FunctionDoc dictionary_encode_doc(
     "Dictionary-encode array",
-    ("Return a dictionary-encoded version of the input array."), {"array"},
-    "DictionaryEncodeOptions");
+    ("Return a dictionary-encoded version of the input array.\n"
+     "This function does nothing if the input is already a dictionary array."),
+    {"array"}, "DictionaryEncodeOptions");
 
 // ----------------------------------------------------------------------
 // This function does not use any hashing utilities
@@ -803,9 +804,11 @@ void RegisterVectorHash(FunctionRegistry* registry) {
       GetDefaultDictionaryEncodeOptions());
   AddHashKernels<DictEncodeAction>(dict_encode.get(), base, DictEncodeOutput);
 
-  // Calling dictionary_encode on dictionary input not supported, but if it
-  // ends up being needed (or convenience), a kernel could be added to make it
-  // a no-op
+  auto no_op = [](KernelContext*, const ExecSpan& span, ExecResult* out) {
+    out->value = span[0].array.ToArrayData();
+    return Status::OK();
+  };
+  DCHECK_OK(dict_encode->AddKernel({Type::DICTIONARY}, OutputType(FirstType), no_op));
 
   DCHECK_OK(registry->AddFunction(std::move(dict_encode)));
 }
diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc
index 7b713362f6..c4ec74fbaa 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc
@@ -687,6 +687,15 @@ TEST_F(TestHashKernel, DictEncodeIntervalMonth) {
       {0, 0, 1, 0, 2});
 }
 
+TEST_F(TestHashKernel, DictEncodeDictInput) {
+  // Dictionary encode a dictionary is a no-op
+  auto dict_ty = dictionary(int32(), utf8());
+  auto dict = ArrayFromJSON(utf8(), R"(["a", "b", "c"])");
+  auto indices = ArrayFromJSON(int32(), "[0, 1, 2, 0, 1, 2, 0, 1, 2]");
+  auto input = std::make_shared<DictionaryArray>(dict_ty, indices, dict);
+  CheckDictEncode(input, dict, indices);
+}
+
 TEST_F(TestHashKernel, DictionaryUniqueAndValueCounts) {
   auto dict_json = "[10, 20, 30, 40]";
   auto dict = ArrayFromJSON(int64(), dict_json);
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 47af976415..17d003b261 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1675,7 +1675,8 @@ Associative transforms
 |                   |       | Temporal, Binary- and String-like |             |       |
 +-------------------+-------+-----------------------------------+-------------+-------+
 
-* \(1) Output is ``Dictionary(Int32, input type)``.
+* \(1) Output is ``Dictionary(Int32, input type)``. It is a no-op if input is
+  already a Dictionary array.
 
 * \(2) Duplicates are removed from the output while the original order is
   maintained.
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 067d96a821..7c5a134d33 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1781,6 +1781,7 @@ def test_dictionary_decode():
 
     assert array == dictionary_array_decode
     assert array == pc.dictionary_decode(array)
+    assert pc.dictionary_encode(dictionary_array) == dictionary_array
 
 
 def test_cast():

Reply via email to