This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f0ed8e2 ARROW-8922: [C++] Add illustrative "ascii_upper" and
"ascii_length" scalar string functions valid for Array and Scalar inputs
f0ed8e2 is described below
commit f0ed8e2343569047204c5c1a0a24e379c0d23c30
Author: Wes McKinney <[email protected]>
AuthorDate: Sun May 31 07:55:13 2020 -0500
ARROW-8922: [C++] Add illustrative "ascii_upper" and "ascii_length" scalar
string functions valid for Array and Scalar inputs
There's some new code generation machinery here (that will be worth ongoing
iteration), but the relevant implementation / "developer UX" is what's in
scalar_string_ascii.cc; take a look.
Note: the implementation of `ascii_upper` is far from optimal.
`std::toupper` does more than convert ASCII to uppercase (its behavior depends
on the current C locale), so it would likely be faster to replace it with a
bespoke implementation that only deals with the ASCII alphabetic character
space.
```
In [1]: import pyarrow as pa; import pyarrow.compute as pc
In [2]: arr = pa.array(['aaa', 'bbbbbb', None, ''])
In [3]: pc.ascii_upper(arr)
Out[3]:
<pyarrow.lib.StringArray object at 0x7f7044003e50>
[
"AAA",
"BBBBBB",
null,
""
]
In [4]: pc.ascii_length(arr)
Out[4]:
<pyarrow.lib.Int32Array object at 0x7f7044003910>
[
3,
6,
null,
0
]
```
int64 offsets are respected with LargeString input, so `ascii_length` returns
an Int64Array in that case:
```
In [5]: arr = pa.array(['aaa', 'bbbbbb', None, ''], type='large_utf8')
In [6]: pc.ascii_length(arr)
Out[6]:
<pyarrow.lib.Int64Array object at 0x7f703c74cbb0>
[
3,
6,
null,
0
]
```
Closes #7278 from wesm/ARROW-8922
Authored-by: Wes McKinney <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
cpp/src/arrow/CMakeLists.txt | 1 +
cpp/src/arrow/compare.cc | 2 +-
cpp/src/arrow/compute/kernels/CMakeLists.txt | 3 +-
cpp/src/arrow/compute/kernels/codegen_internal.cc | 5 ++
cpp/src/arrow/compute/kernels/codegen_internal.h | 98 +++++++++++++++++-----
cpp/src/arrow/compute/kernels/scalar_set_lookup.cc | 6 +-
.../arrow/compute/kernels/scalar_string_ascii.cc | 67 +++++++++++++++
.../scalar_string_internal.h} | 37 ++++----
.../arrow/compute/kernels/scalar_string_test.cc | 73 ++++++++++++++++
cpp/src/arrow/compute/registry.cc | 1 +
cpp/src/arrow/compute/registry_internal.h | 1 +
cpp/src/arrow/scalar.cc | 6 ++
cpp/src/arrow/scalar.h | 2 +
cpp/src/arrow/testing/gtest_util.cc | 16 ++++
cpp/src/arrow/testing/gtest_util.h | 3 +
cpp/src/arrow/type_traits.h | 2 +
python/pyarrow/compute.py | 10 +++
17 files changed, 290 insertions(+), 43 deletions(-)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 30c4c73..031ae5d 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -340,6 +340,7 @@ if(ARROW_COMPUTE)
compute/kernels/scalar_cast_temporal.cc
compute/kernels/scalar_compare.cc
compute/kernels/scalar_set_lookup.cc
+ compute/kernels/scalar_string_ascii.cc
compute/kernels/vector_filter.cc
compute/kernels/vector_hash.cc
compute/kernels/vector_sort.cc
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 351a42f..7c19a6f 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -850,7 +850,7 @@ class ScalarEqualsVisitor {
template <typename T>
typename std::enable_if<std::is_base_of<BaseBinaryScalar, T>::value,
Status>::type
Visit(const T& left) {
- const auto& right = checked_cast<const BinaryScalar&>(right_);
+ const auto& right = checked_cast<const BaseBinaryScalar&>(right_);
result_ = internal::SharedPtrEquals(left.value, right.value);
return Status::OK();
}
diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt
b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index 361e24b..74493a8 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -24,7 +24,8 @@ add_arrow_compute_test(scalar_test
scalar_boolean_test.cc
scalar_cast_test.cc
scalar_compare_test.cc
- scalar_set_lookup_test.cc)
+ scalar_set_lookup_test.cc
+ scalar_string_test.cc)
add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute")
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc
b/cpp/src/arrow/compute/kernels/codegen_internal.cc
index 2771b6a..5db4c92 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -102,6 +102,11 @@ const std::vector<std::shared_ptr<DataType>>&
BaseBinaryTypes() {
return g_base_binary_types;
}
+const std::vector<std::shared_ptr<DataType>>& StringTypes() {
+ static DataTypeVector types = {utf8(), large_utf8()};
+ return types;
+}
+
const std::vector<std::shared_ptr<DataType>>& SignedIntTypes() {
std::call_once(codegen_static_initialized, InitStaticData);
return g_signed_int_types;
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h
b/cpp/src/arrow/compute/kernels/codegen_internal.h
index bf504a3..512f2a0 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -122,20 +122,55 @@ struct UnboxScalar<Type, enable_if_base_binary<Type>> {
};
template <typename Type, typename Enable = void>
-struct GetValueType;
+struct GetViewType;
template <typename Type>
-struct GetValueType<Type, enable_if_has_c_type<Type>> {
+struct GetViewType<Type, enable_if_has_c_type<Type>> {
using T = typename Type::c_type;
};
template <typename Type>
-struct GetValueType<
+struct GetViewType<
Type, enable_if_t<is_base_binary_type<Type>::value ||
is_decimal_type<Type>::value ||
is_fixed_size_binary_type<Type>::value>> {
using T = util::string_view;
};
+template <typename Type, typename Enable = void>
+struct GetOutputType;
+
+template <typename Type>
+struct GetOutputType<Type, enable_if_has_c_type<Type>> {
+ using T = typename Type::c_type;
+};
+
+template <typename Type>
+struct GetOutputType<
+ Type, enable_if_t<is_string_like_type<Type>::value>> {
+ using T = std::string;
+};
+
+template <typename Type, typename Enable = void>
+struct BoxScalar;
+
+template <typename Type>
+struct BoxScalar<Type, enable_if_has_c_type<Type>> {
+ using T = typename GetOutputType<Type>::T;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static std::shared_ptr<Scalar> Box(T val, const std::shared_ptr<DataType>&
type) {
+ return std::make_shared<ScalarType>(val, type);
+ }
+};
+
+template <typename Type>
+struct BoxScalar<Type, enable_if_base_binary<Type>> {
+ using T = typename GetOutputType<Type>::T;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static std::shared_ptr<Scalar> Box(T val, const std::shared_ptr<DataType>&) {
+ return std::make_shared<ScalarType>(val);
+ }
+};
+
// ----------------------------------------------------------------------
// Reusable type resolvers
@@ -154,6 +189,7 @@ void BinaryExecFlipped(KernelContext* ctx, ArrayKernelExec
exec,
// functions
const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes();
+const std::vector<std::shared_ptr<DataType>>& StringTypes();
const std::vector<std::shared_ptr<DataType>>& SignedIntTypes();
const std::vector<std::shared_ptr<DataType>>& UnsignedIntTypes();
const std::vector<std::shared_ptr<DataType>>& IntTypes();
@@ -327,10 +363,8 @@ struct OutputAdapter<Type, enable_if_base_binary<Type>> {
// };
template <typename OutType, typename Arg0Type, typename Op>
struct ScalarUnary {
- using OutScalar = typename TypeTraits<OutType>::ScalarType;
-
- using OUT = typename GetValueType<OutType>::T;
- using ARG0 = typename GetValueType<Arg0Type>::T;
+ using OUT = typename GetOutputType<OutType>::T;
+ using ARG0 = typename GetViewType<Arg0Type>::T;
static void Array(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ArrayIterator<Arg0Type> arg0(*batch[0].array());
@@ -342,8 +376,9 @@ struct ScalarUnary {
static void Scalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].scalar()->is_valid) {
ARG0 arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
- out->value = std::make_shared<OutScalar>(Op::template Call<OUT,
ARG0>(ctx, arg0),
- out->type());
+ out->value = BoxScalar<OutType>::Box(
+ Op::template Call<OUT, ARG0>(ctx, arg0),
+ out->type());
} else {
out->value = MakeNullScalar(batch[0].type());
}
@@ -363,9 +398,8 @@ struct ScalarUnary {
template <typename OutType, typename Arg0Type, typename Op>
struct ScalarUnaryNotNullStateful {
using ThisType = ScalarUnaryNotNullStateful<OutType, Arg0Type, Op>;
- using OutScalar = typename TypeTraits<OutType>::ScalarType;
- using OUT = typename GetValueType<OutType>::T;
- using ARG0 = typename GetValueType<Arg0Type>::T;
+ using OUT = typename GetOutputType<OutType>::T;
+ using ARG0 = typename GetViewType<Arg0Type>::T;
Op op;
ScalarUnaryNotNullStateful(Op op) : op(std::move(op)) {}
@@ -395,6 +429,30 @@ struct ScalarUnaryNotNullStateful {
};
template <typename Type>
+ struct ArrayExec<Type, enable_if_string_like<Type>> {
+ static void Exec(const ThisType& functor, KernelContext* ctx, const
ExecBatch& batch,
+ Datum* out) {
+ typename TypeTraits<Type>::BuilderType builder;
+ Status s = VisitArrayDataInline<Arg0Type>(
+ *batch[0].array(), [&](util::optional<ARG0> v) -> Status {
+ if (v.has_value()) {
+ return builder.Append(functor.op.Call(ctx, *v));
+ } else {
+ return builder.AppendNull();
+ }
+ });
+ if (!s.ok()) {
+ ctx->SetStatus(s);
+ return;
+ } else {
+ std::shared_ptr<ArrayData> result;
+ ctx->SetStatus(builder.FinishInternal(&result));
+ out->value = std::move(result);
+ }
+ }
+ };
+
+ template <typename Type>
struct ArrayExec<Type, enable_if_t<is_boolean_type<Type>::value>> {
static void Exec(const ThisType& functor, KernelContext* ctx, const
ExecBatch& batch,
Datum* out) {
@@ -416,7 +474,7 @@ struct ScalarUnaryNotNullStateful {
void Scalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].scalar()->is_valid) {
ARG0 arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
- out->value = std::make_shared<OutScalar>(
+ out->value = BoxScalar<OutType>::Box(
this->op.template Call<OUT, ARG0>(ctx, arg0),
out->type());
} else {
@@ -438,6 +496,9 @@ struct ScalarUnaryNotNullStateful {
// operator requires some initialization use ScalarUnaryNotNullStateful
template <typename OutType, typename Arg0Type, typename Op>
struct ScalarUnaryNotNull {
+ using OUT = typename GetOutputType<OutType>::T;
+ using ARG0 = typename GetViewType<Arg0Type>::T;
+
static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Seed kernel with dummy state
ScalarUnaryNotNullStateful<OutType, Arg0Type, Op> kernel({});
@@ -464,11 +525,9 @@ struct ScalarUnaryNotNull {
template <typename OutType, typename Arg0Type, typename Arg1Type, typename Op,
typename FlippedOp = Op>
struct ScalarBinary {
- using OutScalarType = typename TypeTraits<OutType>::ScalarType;
-
- using OUT = typename GetValueType<OutType>::T;
- using ARG0 = typename GetValueType<Arg0Type>::T;
- using ARG1 = typename GetValueType<Arg1Type>::T;
+ using OUT = typename GetOutputType<OutType>::T;
+ using ARG0 = typename GetViewType<Arg0Type>::T;
+ using ARG1 = typename GetViewType<Arg1Type>::T;
template <typename ChosenOp>
static void ArrayArray(KernelContext* ctx, const ExecBatch& batch, Datum*
out) {
@@ -492,7 +551,8 @@ struct ScalarBinary {
static void ScalarScalar(KernelContext* ctx, const ExecBatch& batch, Datum*
out) {
auto arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
auto arg1 = UnboxScalar<Arg1Type>::Unbox(batch[1]);
- out->value = std::make_shared<OutScalarType>(ChosenOp::template Call(ctx,
arg0, arg1));
+ out->value = BoxScalar<OutType>::Box(ChosenOp::template Call(ctx, arg0,
arg1),
+ out->type());
}
static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
index 956d9e5..502fba2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
@@ -42,7 +42,7 @@ struct SetLookupState : public KernelState {
: lookup_table(pool, 0), lookup_null_count(0) {}
Status Init(const SetLookupOptions& options) {
- using T = typename GetValueType<Type>::T;
+ using T = typename GetViewType<Type>::T;
auto insert_value = [&](util::optional<T> v) {
if (v.has_value()) {
int32_t unused_memo_index;
@@ -147,7 +147,7 @@ struct MatchVisitor {
template <typename Type>
enable_if_supports_set_lookup<Type, Status> Visit(const Type&) {
- using T = typename GetValueType<Type>::T;
+ using T = typename GetViewType<Type>::T;
const auto& state = checked_cast<const
SetLookupState<Type>&>(*ctx->state());
@@ -222,7 +222,7 @@ struct IsInVisitor {
template <typename Type>
enable_if_supports_set_lookup<Type, Status> Visit(const Type&) {
- using T = typename GetValueType<Type>::T;
+ using T = typename GetViewType<Type>::T;
const auto& state = checked_cast<const
SetLookupState<Type>&>(*ctx->state());
ArrayData* output = out->mutable_array();
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
new file mode 100644
index 0000000..19eaf84
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/scalar_string_internal.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+// TODO: optional ascii validation
+
+struct AsciiLength {
+ template <typename OUT, typename ARG0 = util::string_view>
+ static OUT Call(KernelContext*, ARG0 val) {
+ return static_cast<OUT>(val.size());
+ }
+};
+
+struct AsciiUpper {
+ // XXX: the Scalar codegen path passes template arguments that are unused
+ template <typename... Ignored>
+ static std::string Call(KernelContext*, const util::string_view& val) {
+ std::string result = val.to_string();
+ std::transform(result.begin(), result.end(), result.begin(),
+ [](unsigned char c) { return std::toupper(c); });
+ return result;
+ }
+};
+
+void AddAsciiLength(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("ascii_length", Arity::Unary());
+ ArrayKernelExec exec_offset_32 =
+ codegen::ScalarUnaryNotNull<Int32Type, StringType, AsciiLength>::Exec;
+ ArrayKernelExec exec_offset_64 =
+ codegen::ScalarUnaryNotNull<Int64Type, LargeStringType,
AsciiLength>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, int32(), exec_offset_32));
+ DCHECK_OK(func->AddKernel({large_utf8()}, int64(), exec_offset_64));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+void RegisterScalarStringAscii(FunctionRegistry* registry) {
+ MakeUnaryStringToString<AsciiUpper>("ascii_upper", registry);
+ AddAsciiLength(registry);
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/cpp/src/arrow/compute/registry_internal.h
b/cpp/src/arrow/compute/kernels/scalar_string_internal.h
similarity index 54%
copy from cpp/src/arrow/compute/registry_internal.h
copy to cpp/src/arrow/compute/kernels/scalar_string_internal.h
index 5969981..dc71a04 100644
--- a/cpp/src/arrow/compute/registry_internal.h
+++ b/cpp/src/arrow/compute/kernels/scalar_string_internal.h
@@ -15,30 +15,29 @@
// specific language governing permissions and limitations
// under the License.
-#pragma once
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/compute/kernels/common.h"
namespace arrow {
namespace compute {
-
-class FunctionRegistry;
-
namespace internal {
-// Built-in scalar / elementwise functions
-void RegisterScalarArithmetic(FunctionRegistry* registry);
-void RegisterScalarBoolean(FunctionRegistry* registry);
-void RegisterScalarCast(FunctionRegistry* registry);
-void RegisterScalarComparison(FunctionRegistry* registry);
-void RegisterScalarSetLookup(FunctionRegistry* registry);
-
-// Vector functions
-void RegisterVectorFilter(FunctionRegistry* registry);
-void RegisterVectorHash(FunctionRegistry* registry);
-void RegisterVectorSort(FunctionRegistry* registry);
-void RegisterVectorTake(FunctionRegistry* registry);
-
-// Aggregate functions
-void RegisterScalarAggregateBasic(FunctionRegistry* registry);
+// Apply a scalar function to each string and yield same output type
+template <typename Op>
+void MakeUnaryStringToString(std::string name, FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
+ ArrayKernelExec exec_offset_32 =
+ codegen::ScalarUnaryNotNull<StringType, StringType, Op>::Exec;
+ ArrayKernelExec exec_offset_64 =
+ codegen::ScalarUnaryNotNull<LargeStringType, LargeStringType, Op>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_offset_32));
+ DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_offset_64));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
} // namespace internal
} // namespace compute
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
new file mode 100644
index 0000000..fba9a21
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+namespace compute {
+
+typedef ::testing::Types<StringType, LargeStringType> StringTypes;
+
+template <typename TestType>
+class TestStringKernels : public ::testing::Test {
+ protected:
+ using OffsetType = typename TypeTraits<TestType>::OffsetType;
+
+ void CheckUnary(std::string func_name, std::string json_input,
+ std::shared_ptr<DataType> out_ty, std::string json_expected)
{
+ auto input = ArrayFromJSON(string_type(), json_input);
+ auto expected = ArrayFromJSON(out_ty, json_expected);
+ ASSERT_OK_AND_ASSIGN(Datum out, CallFunction(func_name, {input}));
+ AssertArraysEqual(*expected, *out.make_array(), /*verbose=*/true);
+
+ // Check all the scalars
+ for (int64_t i = 0; i < input->length(); ++i) {
+ ASSERT_OK_AND_ASSIGN(auto val, input->GetScalar(i));
+ ASSERT_OK_AND_ASSIGN(auto ex_val, expected->GetScalar(i));
+ ASSERT_OK_AND_ASSIGN(Datum out, CallFunction(func_name, {val}));
+ AssertScalarsEqual(*ex_val, *out.scalar(), /*verbose=*/true);
+ }
+ }
+
+ std::shared_ptr<DataType> string_type() {
+ return TypeTraits<TestType>::type_singleton();
+ }
+
+ std::shared_ptr<DataType> offset_type() {
+ return TypeTraits<OffsetType>::type_singleton();
+ }
+};
+
+TYPED_TEST_SUITE(TestStringKernels, StringTypes);
+
+TYPED_TEST(TestStringKernels, AsciiLength) {
+ this->CheckUnary("ascii_length", "[\"aaa\", null, \"\", \"b\"]",
this->offset_type(),
+ "[3, null, 0, 1]");
+}
+
+TYPED_TEST(TestStringKernels, AsciiUpper) {
+ this->CheckUnary("ascii_upper", "[\"aAa&\", null, \"\", \"b\"]",
this->string_type(),
+ "[\"AAA&\", null, \"\", \"B\"]");
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/cpp/src/arrow/compute/registry.cc
b/cpp/src/arrow/compute/registry.cc
index c2f1ffd..1ef61d2 100644
--- a/cpp/src/arrow/compute/registry.cc
+++ b/cpp/src/arrow/compute/registry.cc
@@ -103,6 +103,7 @@ static std::unique_ptr<FunctionRegistry>
CreateBuiltInRegistry() {
RegisterScalarCast(registry.get());
RegisterScalarComparison(registry.get());
RegisterScalarSetLookup(registry.get());
+ RegisterScalarStringAscii(registry.get());
// Aggregate functions
RegisterScalarAggregateBasic(registry.get());
diff --git a/cpp/src/arrow/compute/registry_internal.h
b/cpp/src/arrow/compute/registry_internal.h
index 5969981..2c3a5e3 100644
--- a/cpp/src/arrow/compute/registry_internal.h
+++ b/cpp/src/arrow/compute/registry_internal.h
@@ -30,6 +30,7 @@ void RegisterScalarBoolean(FunctionRegistry* registry);
void RegisterScalarCast(FunctionRegistry* registry);
void RegisterScalarComparison(FunctionRegistry* registry);
void RegisterScalarSetLookup(FunctionRegistry* registry);
+void RegisterScalarStringAscii(FunctionRegistry* registry);
// Vector functions
void RegisterVectorFilter(FunctionRegistry* registry);
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index c3e2811..cc32d8a 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -127,6 +127,9 @@ size_t Scalar::Hash::hash(const Scalar& scalar) { return
ScalarHashImpl(scalar).
StringScalar::StringScalar(std::string s)
: StringScalar(Buffer::FromString(std::move(s))) {}
+LargeStringScalar::LargeStringScalar(std::string s)
+ : LargeStringScalar(Buffer::FromString(std::move(s))) {}
+
FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr<Buffer> value,
std::shared_ptr<DataType> type)
: BinaryScalar(std::move(value), std::move(type)) {
@@ -212,6 +215,9 @@ std::shared_ptr<Scalar>
MakeNullScalar(std::shared_ptr<DataType> type) {
}
std::string Scalar::ToString() const {
+ if (!this->is_valid) {
+ return "null";
+ }
auto maybe_repr = CastTo(utf8());
if (maybe_repr.ok()) {
return checked_cast<const
StringScalar&>(*maybe_repr.ValueOrDie()).value->ToString();
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index ac4e3db..5caf04d 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -224,6 +224,8 @@ struct ARROW_EXPORT LargeStringScalar : public
LargeBinaryScalar {
explicit LargeStringScalar(std::shared_ptr<Buffer> value)
: LargeStringScalar(std::move(value), large_utf8()) {}
+ explicit LargeStringScalar(std::string s);
+
LargeStringScalar() : LargeStringScalar(large_utf8()) {}
};
diff --git a/cpp/src/arrow/testing/gtest_util.cc
b/cpp/src/arrow/testing/gtest_util.cc
index 894e3fb..a91fc83 100644
--- a/cpp/src/arrow/testing/gtest_util.cc
+++ b/cpp/src/arrow/testing/gtest_util.cc
@@ -82,6 +82,22 @@ void AssertArraysEqual(const Array& expected, const Array&
actual, bool verbose)
}
}
+void AssertScalarsEqual(const Scalar& expected, const Scalar& actual, bool
verbose) {
+ std::stringstream diff;
+ // ARROW-8956, ScalarEquals returns false when both are null
+ if (!expected.is_valid && !actual.is_valid) {
+ // We consider both being null to be equal in this function
+ return;
+ }
+ if (!expected.Equals(actual)) {
+ if (verbose) {
+ diff << "Expected:\n" << expected.ToString();
+ diff << "\nActual:\n" << actual.ToString();
+ }
+ FAIL() << diff.str();
+ }
+}
+
void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual,
bool check_metadata) {
AssertTsEqual(expected, actual, check_metadata);
diff --git a/cpp/src/arrow/testing/gtest_util.h
b/cpp/src/arrow/testing/gtest_util.h
index 846e30d..d84db73 100644
--- a/cpp/src/arrow/testing/gtest_util.h
+++ b/cpp/src/arrow/testing/gtest_util.h
@@ -167,6 +167,9 @@ struct Datum;
// If verbose is true, then the arrays will be pretty printed
ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual,
bool verbose = false);
+// Does not fail when both values are null (they are treated as equal)
+ARROW_EXPORT void AssertScalarsEqual(const Scalar& expected, const Scalar&
actual,
+ bool verbose = false);
ARROW_EXPORT void AssertBatchesEqual(const RecordBatch& expected,
const RecordBatch& actual,
bool check_metadata = false);
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index f5e32ba..f61b690 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -267,6 +267,7 @@ struct TypeTraits<StringType> {
using ArrayType = StringArray;
using BuilderType = StringBuilder;
using ScalarType = StringScalar;
+ using OffsetType = Int32Type;
constexpr static bool is_parameter_free = true;
static inline std::shared_ptr<DataType> type_singleton() { return utf8(); }
};
@@ -276,6 +277,7 @@ struct TypeTraits<LargeStringType> {
using ArrayType = LargeStringArray;
using BuilderType = LargeStringBuilder;
using ScalarType = LargeStringScalar;
+ using OffsetType = Int64Type;
constexpr static bool is_parameter_free = true;
static inline std::shared_ptr<DataType> type_singleton() { return
large_utf8(); }
};
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 6e3628f..d7c063a 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -84,6 +84,16 @@ def cast(arr, target_type, safe=True):
return call_function("cast", [arr], options)
+def _simple_unary_function(name):
+ def func(arg):
+ return call_function(name, [arg])
+ return func
+
+
+ascii_length = _simple_unary_function('ascii_length')
+ascii_upper = _simple_unary_function('ascii_upper')
+
+
def sum(array):
"""
Sum the values in a numerical (chunked) array.