This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f0ed8e2  ARROW-8922: [C++] Add illustrative "ascii_upper" and 
"ascii_length" scalar string functions valid for Array and Scalar inputs
f0ed8e2 is described below

commit f0ed8e2343569047204c5c1a0a24e379c0d23c30
Author: Wes McKinney <[email protected]>
AuthorDate: Sun May 31 07:55:13 2020 -0500

    ARROW-8922: [C++] Add illustrative "ascii_upper" and "ascii_length" scalar 
string functions valid for Array and Scalar inputs
    
    There's some new code generation machinery here (that will be worth ongoing 
iteration), but the relevant implementation / "developer UX" is what's in 
scalar_string_ascii.cc; take a look.
    
    Note: the implementation of `ascii_upper` is far from optimal. 
`std::toupper` does more than convert ASCII to uppercase (its result depends on 
the current C locale), so it would likely be faster to replace it with a bespoke 
implementation that only deals with the ASCII alphabetic character space.
    
    ```
    In [1]: import pyarrow as pa; import pyarrow.compute as pc
    
    In [2]: arr = pa.array(['aaa', 'bbbbbb', None, ''])
    
    In [3]: pc.ascii_upper(arr)
    Out[3]:
    <pyarrow.lib.StringArray object at 0x7f7044003e50>
    [
      "AAA",
      "BBBBBB",
      null,
      ""
    ]
    
    In [4]: pc.ascii_length(arr)
    Out[4]:
    <pyarrow.lib.Int32Array object at 0x7f7044003910>
    [
      3,
      6,
      null,
      0
    ]
    ```
    
    int64 offsets are respected with LargeString (`large_utf8`) input, so 
`ascii_length` returns an Int64Array in that case:
    
    ```
    In [5]: arr = pa.array(['aaa', 'bbbbbb', None, ''], type='large_utf8')
    
    In [6]: pc.ascii_length(arr)
    Out[6]:
    <pyarrow.lib.Int64Array object at 0x7f703c74cbb0>
    [
      3,
      6,
      null,
      0
    ]
    ```
    
    Closes #7278 from wesm/ARROW-8922
    
    Authored-by: Wes McKinney <[email protected]>
    Signed-off-by: Wes McKinney <[email protected]>
---
 cpp/src/arrow/CMakeLists.txt                       |  1 +
 cpp/src/arrow/compare.cc                           |  2 +-
 cpp/src/arrow/compute/kernels/CMakeLists.txt       |  3 +-
 cpp/src/arrow/compute/kernels/codegen_internal.cc  |  5 ++
 cpp/src/arrow/compute/kernels/codegen_internal.h   | 98 +++++++++++++++++-----
 cpp/src/arrow/compute/kernels/scalar_set_lookup.cc |  6 +-
 .../arrow/compute/kernels/scalar_string_ascii.cc   | 67 +++++++++++++++
 .../scalar_string_internal.h}                      | 37 ++++----
 .../arrow/compute/kernels/scalar_string_test.cc    | 73 ++++++++++++++++
 cpp/src/arrow/compute/registry.cc                  |  1 +
 cpp/src/arrow/compute/registry_internal.h          |  1 +
 cpp/src/arrow/scalar.cc                            |  6 ++
 cpp/src/arrow/scalar.h                             |  2 +
 cpp/src/arrow/testing/gtest_util.cc                | 16 ++++
 cpp/src/arrow/testing/gtest_util.h                 |  3 +
 cpp/src/arrow/type_traits.h                        |  2 +
 python/pyarrow/compute.py                          | 10 +++
 17 files changed, 290 insertions(+), 43 deletions(-)

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 30c4c73..031ae5d 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -340,6 +340,7 @@ if(ARROW_COMPUTE)
               compute/kernels/scalar_cast_temporal.cc
               compute/kernels/scalar_compare.cc
               compute/kernels/scalar_set_lookup.cc
+              compute/kernels/scalar_string_ascii.cc
               compute/kernels/vector_filter.cc
               compute/kernels/vector_hash.cc
               compute/kernels/vector_sort.cc
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 351a42f..7c19a6f 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -850,7 +850,7 @@ class ScalarEqualsVisitor {
   template <typename T>
   typename std::enable_if<std::is_base_of<BaseBinaryScalar, T>::value, 
Status>::type
   Visit(const T& left) {
-    const auto& right = checked_cast<const BinaryScalar&>(right_);
+    const auto& right = checked_cast<const BaseBinaryScalar&>(right_);
     result_ = internal::SharedPtrEquals(left.value, right.value);
     return Status::OK();
   }
diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt 
b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index 361e24b..74493a8 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -24,7 +24,8 @@ add_arrow_compute_test(scalar_test
                        scalar_boolean_test.cc
                        scalar_cast_test.cc
                        scalar_compare_test.cc
-                       scalar_set_lookup_test.cc)
+                       scalar_set_lookup_test.cc
+                       scalar_string_test.cc)
 
 add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute")
 
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc 
b/cpp/src/arrow/compute/kernels/codegen_internal.cc
index 2771b6a..5db4c92 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -102,6 +102,11 @@ const std::vector<std::shared_ptr<DataType>>& 
BaseBinaryTypes() {
   return g_base_binary_types;
 }
 
+const std::vector<std::shared_ptr<DataType>>& StringTypes() {
+  static DataTypeVector types = {utf8(), large_utf8()};
+  return types;
+}
+
 const std::vector<std::shared_ptr<DataType>>& SignedIntTypes() {
   std::call_once(codegen_static_initialized, InitStaticData);
   return g_signed_int_types;
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h 
b/cpp/src/arrow/compute/kernels/codegen_internal.h
index bf504a3..512f2a0 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -122,20 +122,55 @@ struct UnboxScalar<Type, enable_if_base_binary<Type>> {
 };
 
 template <typename Type, typename Enable = void>
-struct GetValueType;
+struct GetViewType;
 
 template <typename Type>
-struct GetValueType<Type, enable_if_has_c_type<Type>> {
+struct GetViewType<Type, enable_if_has_c_type<Type>> {
   using T = typename Type::c_type;
 };
 
 template <typename Type>
-struct GetValueType<
+struct GetViewType<
     Type, enable_if_t<is_base_binary_type<Type>::value || 
is_decimal_type<Type>::value ||
                       is_fixed_size_binary_type<Type>::value>> {
   using T = util::string_view;
 };
 
+template <typename Type, typename Enable = void>
+struct GetOutputType;
+
+template <typename Type>
+struct GetOutputType<Type, enable_if_has_c_type<Type>> {
+  using T = typename Type::c_type;
+};
+
+template <typename Type>
+struct GetOutputType<
+    Type, enable_if_t<is_string_like_type<Type>::value>> {
+  using T = std::string;
+};
+
+template <typename Type, typename Enable = void>
+struct BoxScalar;
+
+template <typename Type>
+struct BoxScalar<Type, enable_if_has_c_type<Type>> {
+  using T = typename GetOutputType<Type>::T;
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  static std::shared_ptr<Scalar> Box(T val, const std::shared_ptr<DataType>& 
type) {
+    return std::make_shared<ScalarType>(val, type);
+  }
+};
+
+template <typename Type>
+struct BoxScalar<Type, enable_if_base_binary<Type>> {
+  using T = typename GetOutputType<Type>::T;
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  static std::shared_ptr<Scalar> Box(T val, const std::shared_ptr<DataType>&) {
+    return std::make_shared<ScalarType>(val);
+  }
+};
+
 // ----------------------------------------------------------------------
 // Reusable type resolvers
 
@@ -154,6 +189,7 @@ void BinaryExecFlipped(KernelContext* ctx, ArrayKernelExec 
exec,
 // functions
 
 const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes();
+const std::vector<std::shared_ptr<DataType>>& StringTypes();
 const std::vector<std::shared_ptr<DataType>>& SignedIntTypes();
 const std::vector<std::shared_ptr<DataType>>& UnsignedIntTypes();
 const std::vector<std::shared_ptr<DataType>>& IntTypes();
@@ -327,10 +363,8 @@ struct OutputAdapter<Type, enable_if_base_binary<Type>> {
 // };
 template <typename OutType, typename Arg0Type, typename Op>
 struct ScalarUnary {
-  using OutScalar = typename TypeTraits<OutType>::ScalarType;
-
-  using OUT = typename GetValueType<OutType>::T;
-  using ARG0 = typename GetValueType<Arg0Type>::T;
+  using OUT = typename GetOutputType<OutType>::T;
+  using ARG0 = typename GetViewType<Arg0Type>::T;
 
   static void Array(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     ArrayIterator<Arg0Type> arg0(*batch[0].array());
@@ -342,8 +376,9 @@ struct ScalarUnary {
   static void Scalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     if (batch[0].scalar()->is_valid) {
       ARG0 arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
-      out->value = std::make_shared<OutScalar>(Op::template Call<OUT, 
ARG0>(ctx, arg0),
-                                               out->type());
+      out->value = BoxScalar<OutType>::Box(
+          Op::template Call<OUT, ARG0>(ctx, arg0),
+          out->type());
     } else {
       out->value = MakeNullScalar(batch[0].type());
     }
@@ -363,9 +398,8 @@ struct ScalarUnary {
 template <typename OutType, typename Arg0Type, typename Op>
 struct ScalarUnaryNotNullStateful {
   using ThisType = ScalarUnaryNotNullStateful<OutType, Arg0Type, Op>;
-  using OutScalar = typename TypeTraits<OutType>::ScalarType;
-  using OUT = typename GetValueType<OutType>::T;
-  using ARG0 = typename GetValueType<Arg0Type>::T;
+  using OUT = typename GetOutputType<OutType>::T;
+  using ARG0 = typename GetViewType<Arg0Type>::T;
 
   Op op;
   ScalarUnaryNotNullStateful(Op op) : op(std::move(op)) {}
@@ -395,6 +429,30 @@ struct ScalarUnaryNotNullStateful {
   };
 
   template <typename Type>
+  struct ArrayExec<Type, enable_if_string_like<Type>> {
+    static void Exec(const ThisType& functor, KernelContext* ctx, const 
ExecBatch& batch,
+                     Datum* out) {
+      typename TypeTraits<Type>::BuilderType builder;
+      Status s = VisitArrayDataInline<Arg0Type>(
+          *batch[0].array(), [&](util::optional<ARG0> v) -> Status {
+          if (v.has_value()) {
+            return builder.Append(functor.op.Call(ctx, *v));
+          } else {
+            return builder.AppendNull();
+          }
+        });
+      if (!s.ok()) {
+        ctx->SetStatus(s);
+        return;
+      } else {
+        std::shared_ptr<ArrayData> result;
+        ctx->SetStatus(builder.FinishInternal(&result));
+        out->value = std::move(result);
+      }
+    }
+  };
+
+  template <typename Type>
   struct ArrayExec<Type, enable_if_t<is_boolean_type<Type>::value>> {
     static void Exec(const ThisType& functor, KernelContext* ctx, const 
ExecBatch& batch,
                      Datum* out) {
@@ -416,7 +474,7 @@ struct ScalarUnaryNotNullStateful {
   void Scalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     if (batch[0].scalar()->is_valid) {
       ARG0 arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
-      out->value = std::make_shared<OutScalar>(
+      out->value = BoxScalar<OutType>::Box(
           this->op.template Call<OUT, ARG0>(ctx, arg0),
           out->type());
     } else {
@@ -438,6 +496,9 @@ struct ScalarUnaryNotNullStateful {
 // operator requires some initialization use ScalarUnaryNotNullStateful
 template <typename OutType, typename Arg0Type, typename Op>
 struct ScalarUnaryNotNull {
+  using OUT = typename GetOutputType<OutType>::T;
+  using ARG0 = typename GetViewType<Arg0Type>::T;
+
   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     // Seed kernel with dummy state
     ScalarUnaryNotNullStateful<OutType, Arg0Type, Op> kernel({});
@@ -464,11 +525,9 @@ struct ScalarUnaryNotNull {
 template <typename OutType, typename Arg0Type, typename Arg1Type, typename Op,
           typename FlippedOp = Op>
 struct ScalarBinary {
-  using OutScalarType = typename TypeTraits<OutType>::ScalarType;
-
-  using OUT = typename GetValueType<OutType>::T;
-  using ARG0 = typename GetValueType<Arg0Type>::T;
-  using ARG1 = typename GetValueType<Arg1Type>::T;
+  using OUT = typename GetOutputType<OutType>::T;
+  using ARG0 = typename GetViewType<Arg0Type>::T;
+  using ARG1 = typename GetViewType<Arg1Type>::T;
 
   template <typename ChosenOp>
   static void ArrayArray(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
@@ -492,7 +551,8 @@ struct ScalarBinary {
   static void ScalarScalar(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
     auto arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
     auto arg1 = UnboxScalar<Arg1Type>::Unbox(batch[1]);
-    out->value = std::make_shared<OutScalarType>(ChosenOp::template Call(ctx, 
arg0, arg1));
+    out->value = BoxScalar<OutType>::Box(ChosenOp::template Call(ctx, arg0, 
arg1),
+                                         out->type());
   }
 
   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc 
b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
index 956d9e5..502fba2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
@@ -42,7 +42,7 @@ struct SetLookupState : public KernelState {
       : lookup_table(pool, 0), lookup_null_count(0) {}
 
   Status Init(const SetLookupOptions& options) {
-    using T = typename GetValueType<Type>::T;
+    using T = typename GetViewType<Type>::T;
     auto insert_value = [&](util::optional<T> v) {
       if (v.has_value()) {
         int32_t unused_memo_index;
@@ -147,7 +147,7 @@ struct MatchVisitor {
 
   template <typename Type>
   enable_if_supports_set_lookup<Type, Status> Visit(const Type&) {
-    using T = typename GetValueType<Type>::T;
+    using T = typename GetViewType<Type>::T;
 
     const auto& state = checked_cast<const 
SetLookupState<Type>&>(*ctx->state());
 
@@ -222,7 +222,7 @@ struct IsInVisitor {
 
   template <typename Type>
   enable_if_supports_set_lookup<Type, Status> Visit(const Type&) {
-    using T = typename GetValueType<Type>::T;
+    using T = typename GetViewType<Type>::T;
     const auto& state = checked_cast<const 
SetLookupState<Type>&>(*ctx->state());
     ArrayData* output = out->mutable_array();
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc 
b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
new file mode 100644
index 0000000..19eaf84
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/scalar_string_internal.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+// TODO: optional ascii validation
+
+struct AsciiLength {
+  template <typename OUT, typename ARG0 = util::string_view>
+  static OUT Call(KernelContext*, ARG0 val) {
+    return static_cast<OUT>(val.size());
+  }
+};
+
+struct AsciiUpper {
+  // XXX: the Scalar codegen path passes template arguments that are unused
+  template <typename... Ignored>
+  static std::string Call(KernelContext*, const util::string_view& val) {
+    std::string result = val.to_string();
+    std::transform(result.begin(), result.end(), result.begin(),
+                   [](unsigned char c) { return std::toupper(c); });
+    return result;
+  }
+};
+
+void AddAsciiLength(FunctionRegistry* registry) {
+  auto func = std::make_shared<ScalarFunction>("ascii_length", Arity::Unary());
+  ArrayKernelExec exec_offset_32 =
+      codegen::ScalarUnaryNotNull<Int32Type, StringType, AsciiLength>::Exec;
+  ArrayKernelExec exec_offset_64 =
+      codegen::ScalarUnaryNotNull<Int64Type, LargeStringType, 
AsciiLength>::Exec;
+  DCHECK_OK(func->AddKernel({utf8()}, int32(), exec_offset_32));
+  DCHECK_OK(func->AddKernel({large_utf8()}, int64(), exec_offset_64));
+  DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+void RegisterScalarStringAscii(FunctionRegistry* registry) {
+  MakeUnaryStringToString<AsciiUpper>("ascii_upper", registry);
+  AddAsciiLength(registry);
+}
+
+}  // namespace internal
+}  // namespace compute
+}  // namespace arrow
diff --git a/cpp/src/arrow/compute/registry_internal.h 
b/cpp/src/arrow/compute/kernels/scalar_string_internal.h
similarity index 54%
copy from cpp/src/arrow/compute/registry_internal.h
copy to cpp/src/arrow/compute/kernels/scalar_string_internal.h
index 5969981..dc71a04 100644
--- a/cpp/src/arrow/compute/registry_internal.h
+++ b/cpp/src/arrow/compute/kernels/scalar_string_internal.h
@@ -15,30 +15,29 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#pragma once
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/compute/kernels/common.h"
 
 namespace arrow {
 namespace compute {
-
-class FunctionRegistry;
-
 namespace internal {
 
-// Built-in scalar / elementwise functions
-void RegisterScalarArithmetic(FunctionRegistry* registry);
-void RegisterScalarBoolean(FunctionRegistry* registry);
-void RegisterScalarCast(FunctionRegistry* registry);
-void RegisterScalarComparison(FunctionRegistry* registry);
-void RegisterScalarSetLookup(FunctionRegistry* registry);
-
-// Vector functions
-void RegisterVectorFilter(FunctionRegistry* registry);
-void RegisterVectorHash(FunctionRegistry* registry);
-void RegisterVectorSort(FunctionRegistry* registry);
-void RegisterVectorTake(FunctionRegistry* registry);
-
-// Aggregate functions
-void RegisterScalarAggregateBasic(FunctionRegistry* registry);
+// Register a unary function applying Op to each string; output type matches input
+template <typename Op>
+void MakeUnaryStringToString(std::string name, FunctionRegistry* registry) {
+  auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
+  ArrayKernelExec exec_offset_32 =
+      codegen::ScalarUnaryNotNull<StringType, StringType, Op>::Exec;
+  ArrayKernelExec exec_offset_64 =
+      codegen::ScalarUnaryNotNull<LargeStringType, LargeStringType, Op>::Exec;
+  DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_offset_32));
+  DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_offset_64));
+  DCHECK_OK(registry->AddFunction(std::move(func)));
+}
 
 }  // namespace internal
 }  // namespace compute
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
new file mode 100644
index 0000000..fba9a21
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+namespace compute {
+
+typedef ::testing::Types<StringType, LargeStringType> StringTypes;
+
+template <typename TestType>
+class TestStringKernels : public ::testing::Test {
+ protected:
+  using OffsetType = typename TypeTraits<TestType>::OffsetType;
+
+  void CheckUnary(std::string func_name, std::string json_input,
+                  std::shared_ptr<DataType> out_ty, std::string json_expected) 
{
+    auto input = ArrayFromJSON(string_type(), json_input);
+    auto expected = ArrayFromJSON(out_ty, json_expected);
+    ASSERT_OK_AND_ASSIGN(Datum out, CallFunction(func_name, {input}));
+    AssertArraysEqual(*expected, *out.make_array(), /*verbose=*/true);
+
+    // Check all the scalars
+    for (int64_t i = 0; i < input->length(); ++i) {
+      ASSERT_OK_AND_ASSIGN(auto val, input->GetScalar(i));
+      ASSERT_OK_AND_ASSIGN(auto ex_val, expected->GetScalar(i));
+      ASSERT_OK_AND_ASSIGN(Datum out, CallFunction(func_name, {val}));
+      AssertScalarsEqual(*ex_val, *out.scalar(), /*verbose=*/true);
+    }
+  }
+
+  std::shared_ptr<DataType> string_type() {
+    return TypeTraits<TestType>::type_singleton();
+  }
+
+  std::shared_ptr<DataType> offset_type() {
+    return TypeTraits<OffsetType>::type_singleton();
+  }
+};
+
+TYPED_TEST_SUITE(TestStringKernels, StringTypes);
+
+TYPED_TEST(TestStringKernels, AsciiLength) {
+  this->CheckUnary("ascii_length", "[\"aaa\", null, \"\", \"b\"]", 
this->offset_type(),
+                   "[3, null, 0, 1]");
+}
+
+TYPED_TEST(TestStringKernels, AsciiUpper) {
+  this->CheckUnary("ascii_upper", "[\"aAa&\", null, \"\", \"b\"]", 
this->string_type(),
+                   "[\"AAA&\", null, \"\", \"B\"]");
+}
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/cpp/src/arrow/compute/registry.cc 
b/cpp/src/arrow/compute/registry.cc
index c2f1ffd..1ef61d2 100644
--- a/cpp/src/arrow/compute/registry.cc
+++ b/cpp/src/arrow/compute/registry.cc
@@ -103,6 +103,7 @@ static std::unique_ptr<FunctionRegistry> 
CreateBuiltInRegistry() {
   RegisterScalarCast(registry.get());
   RegisterScalarComparison(registry.get());
   RegisterScalarSetLookup(registry.get());
+  RegisterScalarStringAscii(registry.get());
 
   // Aggregate functions
   RegisterScalarAggregateBasic(registry.get());
diff --git a/cpp/src/arrow/compute/registry_internal.h 
b/cpp/src/arrow/compute/registry_internal.h
index 5969981..2c3a5e3 100644
--- a/cpp/src/arrow/compute/registry_internal.h
+++ b/cpp/src/arrow/compute/registry_internal.h
@@ -30,6 +30,7 @@ void RegisterScalarBoolean(FunctionRegistry* registry);
 void RegisterScalarCast(FunctionRegistry* registry);
 void RegisterScalarComparison(FunctionRegistry* registry);
 void RegisterScalarSetLookup(FunctionRegistry* registry);
+void RegisterScalarStringAscii(FunctionRegistry* registry);
 
 // Vector functions
 void RegisterVectorFilter(FunctionRegistry* registry);
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index c3e2811..cc32d8a 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -127,6 +127,9 @@ size_t Scalar::Hash::hash(const Scalar& scalar) { return 
ScalarHashImpl(scalar).
 StringScalar::StringScalar(std::string s)
     : StringScalar(Buffer::FromString(std::move(s))) {}
 
+LargeStringScalar::LargeStringScalar(std::string s)
+    : LargeStringScalar(Buffer::FromString(std::move(s))) {}
+
 FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr<Buffer> value,
                                              std::shared_ptr<DataType> type)
     : BinaryScalar(std::move(value), std::move(type)) {
@@ -212,6 +215,9 @@ std::shared_ptr<Scalar> 
MakeNullScalar(std::shared_ptr<DataType> type) {
 }
 
 std::string Scalar::ToString() const {
+  if (!this->is_valid) {
+    return "null";
+  }
   auto maybe_repr = CastTo(utf8());
   if (maybe_repr.ok()) {
     return checked_cast<const 
StringScalar&>(*maybe_repr.ValueOrDie()).value->ToString();
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index ac4e3db..5caf04d 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -224,6 +224,8 @@ struct ARROW_EXPORT LargeStringScalar : public 
LargeBinaryScalar {
   explicit LargeStringScalar(std::shared_ptr<Buffer> value)
       : LargeStringScalar(std::move(value), large_utf8()) {}
 
+  explicit LargeStringScalar(std::string s);
+
   LargeStringScalar() : LargeStringScalar(large_utf8()) {}
 };
 
diff --git a/cpp/src/arrow/testing/gtest_util.cc 
b/cpp/src/arrow/testing/gtest_util.cc
index 894e3fb..a91fc83 100644
--- a/cpp/src/arrow/testing/gtest_util.cc
+++ b/cpp/src/arrow/testing/gtest_util.cc
@@ -82,6 +82,22 @@ void AssertArraysEqual(const Array& expected, const Array& 
actual, bool verbose)
   }
 }
 
+void AssertScalarsEqual(const Scalar& expected, const Scalar& actual, bool 
verbose) {
+  std::stringstream diff;
+  // ARROW-8956, ScalarEquals returns false when both are null
+  if (!expected.is_valid && !actual.is_valid) {
+    // We consider both being null to be equal in this function
+    return;
+  }
+  if (!expected.Equals(actual)) {
+    if (verbose) {
+      diff << "Expected:\n" << expected.ToString();
+      diff << "\nActual:\n" << actual.ToString();
+    }
+    FAIL() << diff.str();
+  }
+}
+
 void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual,
                         bool check_metadata) {
   AssertTsEqual(expected, actual, check_metadata);
diff --git a/cpp/src/arrow/testing/gtest_util.h 
b/cpp/src/arrow/testing/gtest_util.h
index 846e30d..d84db73 100644
--- a/cpp/src/arrow/testing/gtest_util.h
+++ b/cpp/src/arrow/testing/gtest_util.h
@@ -167,6 +167,9 @@ struct Datum;
 // If verbose is true, then the arrays will be pretty printed
 ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual,
                                     bool verbose = false);
+// Two null scalars are treated as equal (the assertion passes when both are null)
+ARROW_EXPORT void AssertScalarsEqual(const Scalar& expected, const Scalar& 
actual,
+                                     bool verbose = false);
 ARROW_EXPORT void AssertBatchesEqual(const RecordBatch& expected,
                                      const RecordBatch& actual,
                                      bool check_metadata = false);
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index f5e32ba..f61b690 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -267,6 +267,7 @@ struct TypeTraits<StringType> {
   using ArrayType = StringArray;
   using BuilderType = StringBuilder;
   using ScalarType = StringScalar;
+  using OffsetType = Int32Type;
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return utf8(); }
 };
@@ -276,6 +277,7 @@ struct TypeTraits<LargeStringType> {
   using ArrayType = LargeStringArray;
   using BuilderType = LargeStringBuilder;
   using ScalarType = LargeStringScalar;
+  using OffsetType = Int64Type;
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return 
large_utf8(); }
 };
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 6e3628f..d7c063a 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -84,6 +84,16 @@ def cast(arr, target_type, safe=True):
     return call_function("cast", [arr], options)
 
 
+def _simple_unary_function(name):
+    def func(arg):
+        return call_function(name, [arg])
+    return func
+
+
+ascii_length = _simple_unary_function('ascii_length')
+ascii_upper = _simple_unary_function('ascii_upper')
+
+
 def sum(array):
     """
     Sum the values in a numerical (chunked) array.

Reply via email to