This is an automated email from the ASF dual-hosted git repository.

westonpace pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c7741fb4e6 GH-34588:[C++][Python] Add a MetaFunction for "dictionary_decode" (#35356)
c7741fb4e6 is described below

commit c7741fb4e633222346269e80b35b5df48051b585
Author: Junming Chen <[email protected]>
AuthorDate: Wed Jul 19 07:41:07 2023 +0800

    GH-34588:[C++][Python] Add a MetaFunction for "dictionary_decode" (#35356)
    
    **Rationale for this change**
    This PR is for [Issue-34588](https://github.com/apache/arrow/issues/34588). After discussion with @ westonpace, a MetaFunction for "dictionary_decode" is implemented instead of adding a compute kernel.
    
    **What changes are included in this PR?**
    C++: Meta Function of dictionary_decode.
    Python: Test
    
    **Are these changes tested?**
    One test in tests/test_compute.py
    
    * Closes: #34588
    
    Lead-authored-by: Junming Chen <[email protected]>
    Co-authored-by: Alenka Frim <[email protected]>
    Co-authored-by: Weston Pace <[email protected]>
    Signed-off-by: Weston Pace <[email protected]>
---
 cpp/src/arrow/compute/kernels/vector_hash.cc | 37 ++++++++++++++++++++++++++++
 cpp/src/arrow/compute/registry.cc            |  1 +
 cpp/src/arrow/compute/registry_internal.h    |  1 +
 python/pyarrow/tests/test_compute.py         | 11 +++++++++
 4 files changed, 50 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc
index 2eab7ae8af..a7bb2d88c2 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -27,6 +27,7 @@
 #include "arrow/array/dict_internal.h"
 #include "arrow/array/util.h"
 #include "arrow/compute/api_vector.h"
+#include "arrow/compute/cast.h"
 #include "arrow/compute/kernels/common_internal.h"
 #include "arrow/result.h"
 #include "arrow/util/hashing.h"
@@ -762,6 +763,38 @@ const FunctionDoc dictionary_encode_doc(
     ("Return a dictionary-encoded version of the input array."), {"array"},
     "DictionaryEncodeOptions");
 
+// ----------------------------------------------------------------------
+// This function does not use any hashing utilities
+// but is kept in this file to be near dictionary_encode
+// Dictionary decode implementation
+
+const FunctionDoc dictionary_decode_doc{
+    "Decodes a DictionaryArray to an Array",
+    ("Return a plain-encoded version of the array input\n"
+     "This function does nothing if the input is not a dictionary."),
+    {"dictionary_array"}};
+
+class DictionaryDecodeMetaFunction : public MetaFunction {
+ public:
+  DictionaryDecodeMetaFunction()
+      : MetaFunction("dictionary_decode", Arity::Unary(), dictionary_decode_doc) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options,
+                            ExecContext* ctx) const override {
+    if (args[0].type() == nullptr || args[0].type()->id() != Type::DICTIONARY) {
+      return args[0];
+    }
+
+    if (args[0].is_array() || args[0].is_chunked_array()) {
+      DictionaryType* dict_type = checked_cast<DictionaryType*>(args[0].type().get());
+      CastOptions cast_options = CastOptions::Safe(dict_type->value_type());
+      return CallFunction("cast", args, &cast_options, ctx);
+    } else {
+      return Status::TypeError("Expected an Array or a Chunked Array");
+    }
+  }
+};
 }  // namespace
 
 void RegisterVectorHash(FunctionRegistry* registry) {
@@ -819,6 +852,10 @@ void RegisterVectorHash(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(dict_encode)));
 }
 
+void RegisterDictionaryDecode(FunctionRegistry* registry) {
+  DCHECK_OK(registry->AddFunction(std::make_shared<DictionaryDecodeMetaFunction>()));
+}
+
 }  // namespace internal
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc
index a4b484a206..7a54f78a03 100644
--- a/cpp/src/arrow/compute/registry.cc
+++ b/cpp/src/arrow/compute/registry.cc
@@ -275,6 +275,7 @@ static std::unique_ptr<FunctionRegistry> CreateBuiltInRegistry() {
 
   // Register core kernels
   RegisterScalarCast(registry.get());
+  RegisterDictionaryDecode(registry.get());
   RegisterVectorHash(registry.get());
   RegisterVectorSelection(registry.get());
 
diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h
index b4239701d9..cdc9f804e7 100644
--- a/cpp/src/arrow/compute/registry_internal.h
+++ b/cpp/src/arrow/compute/registry_internal.h
@@ -28,6 +28,7 @@ namespace internal {
 void RegisterScalarArithmetic(FunctionRegistry* registry);
 void RegisterScalarBoolean(FunctionRegistry* registry);
 void RegisterScalarCast(FunctionRegistry* registry);
+void RegisterDictionaryDecode(FunctionRegistry* registry);
 void RegisterScalarComparison(FunctionRegistry* registry);
 void RegisterScalarIfElse(FunctionRegistry* registry);
 void RegisterScalarNested(FunctionRegistry* registry);
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index e47e5d3f3e..98ab84c039 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1756,6 +1756,17 @@ def test_logical():
     assert pc.invert(a) == pa.array([False, True, True, None])
 
 
+def test_dictionary_decode():
+    array = pa.array(["a", "a", "b", "c", "b"])
+    dictionary_array = array.dictionary_encode()
+    dictionary_array_decode = pc.dictionary_decode(dictionary_array)
+
+    assert array != dictionary_array
+
+    assert array == dictionary_array_decode
+    assert array == pc.dictionary_decode(array)
+
+
 def test_cast():
     arr = pa.array([1, 2, 3, 4], type='int64')
     options = pc.CastOptions(pa.int8())

Reply via email to