This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e1c3334 ARROW-6775: [C++][Python] Implement list_value_lengths and
list_parent_indices functions
e1c3334 is described below
commit e1c3334d548f0eb8cbf5c6eca84737dede8ee271
Author: Wes McKinney <[email protected]>
AuthorDate: Tue Jul 7 13:23:37 2020 +0200
ARROW-6775: [C++][Python] Implement list_value_lengths and
list_parent_indices functions
This adds two functions that operate on list types:
* list_value_lengths: returns an int32 (for List) or int64 (for LargeList)
array with the number of elements in each list value slot
* list_parent_indices: returns an int32/int64 array with the same length as
the child values array of a List type where each value is the index of the list
"slot" containing each child value
Closes #7632 from wesm/some-list-functions
Lead-authored-by: Wes McKinney <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/CMakeLists.txt | 1 +
cpp/src/arrow/compute/kernels/CMakeLists.txt | 1 +
cpp/src/arrow/compute/kernels/scalar_nested.cc | 69 ++++++++++++++++++++++
...vector_nested_test.cc => scalar_nested_test.cc} | 12 ++--
cpp/src/arrow/compute/kernels/vector_nested.cc | 43 ++++++++++++--
.../arrow/compute/kernels/vector_nested_test.cc | 20 +++++++
cpp/src/arrow/compute/registry.cc | 1 +
cpp/src/arrow/compute/registry_internal.h | 1 +
cpp/src/arrow/type_traits.h | 2 +
python/pyarrow/array.pxi | 41 +++++++++++++
python/pyarrow/compute.py | 2 +
python/pyarrow/tests/test_array.py | 28 +++++++++
12 files changed, 211 insertions(+), 10 deletions(-)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 345f9b9..2682471 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -359,6 +359,7 @@ if(ARROW_COMPUTE)
compute/kernels/scalar_cast_string.cc
compute/kernels/scalar_cast_temporal.cc
compute/kernels/scalar_compare.cc
+ compute/kernels/scalar_nested.cc
compute/kernels/scalar_set_lookup.cc
compute/kernels/scalar_string.cc
compute/kernels/scalar_validity.cc
diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt
b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index 64debf8..e693a41 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -24,6 +24,7 @@ add_arrow_compute_test(scalar_test
scalar_boolean_test.cc
scalar_cast_test.cc
scalar_compare_test.cc
+ scalar_nested_test.cc
scalar_set_lookup_test.cc
scalar_string_test.cc
scalar_validity_test.cc
diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc
b/cpp/src/arrow/compute/kernels/scalar_nested.cc
new file mode 100644
index 0000000..7c61aa3
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Vector kernels involving nested types
+
+#include "arrow/array/array_base.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/result.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+namespace {
+
+template <typename Type, typename offset_type = typename Type::offset_type>
+void ListValueLengths(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using OffsetScalarType = typename TypeTraits<Type>::OffsetScalarType;
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ typename TypeTraits<Type>::ArrayType list(batch[0].array());
+ ArrayData* out_arr = out->mutable_array();
+ auto out_values = out_arr->GetMutableValues<offset_type>(1);
+ const offset_type* offsets = list.raw_value_offsets();
+ ::arrow::internal::detail::VisitBitBlocksVoid(
+ list.data()->buffers[0], list.offset(), list.length(),
+ [&](int64_t position) {
+ *out_values++ = offsets[position + 1] - offsets[position];
+ },
+ [&]() { *out_values++ = 0; });
+ } else {
+ const auto& arg0 = batch[0].scalar_as<ScalarType>();
+ if (arg0.is_valid) {
+ checked_cast<OffsetScalarType*>(out->scalar().get())->value =
+ static_cast<offset_type>(arg0.value->length());
+ }
+ }
+}
+
+} // namespace
+
+void RegisterScalarNested(FunctionRegistry* registry) {
+ auto list_value_lengths =
+ std::make_shared<ScalarFunction>("list_value_lengths", Arity::Unary());
+ DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LIST)}, int32(),
+ ListValueLengths<ListType>));
+ DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LARGE_LIST)},
int64(),
+ ListValueLengths<LargeListType>));
+ DCHECK_OK(registry->AddFunction(std::move(list_value_lengths)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc
b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
similarity index 78%
copy from cpp/src/arrow/compute/kernels/vector_nested_test.cc
copy to cpp/src/arrow/compute/kernels/scalar_nested_test.cc
index 61c2b77..4657c41 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
@@ -25,12 +25,14 @@
namespace arrow {
namespace compute {
-TEST(TestVectorNested, ListFlatten) {
+static std::shared_ptr<DataType> GetOffsetType(const DataType& type) {
+ return type.id() == Type::LIST ? int32() : int64();
+}
+
+TEST(TestScalarNested, ListValueLengths) {
for (auto ty : {list(int32()), large_list(int32())}) {
- auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]");
- auto expected = ArrayFromJSON(int32(), "[0, null, 1, 2, 3]");
- ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_flatten", {input}));
- AssertArraysEqual(*expected, *out.make_array());
+ CheckScalarUnary("list_value_lengths", ty, "[[0, null, 1], null, [2, 3],
[]]",
+ GetOffsetType(*ty), "[3, null, 2, 0]");
}
}
diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc
b/cpp/src/arrow/compute/kernels/vector_nested.cc
index e35bc58..eeb58d3 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested.cc
+++ b/cpp/src/arrow/compute/kernels/vector_nested.cc
@@ -24,6 +24,7 @@
namespace arrow {
namespace compute {
namespace internal {
+namespace {
template <typename Type>
void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
@@ -36,19 +37,51 @@ void ListFlatten(KernelContext* ctx, const ExecBatch&
batch, Datum* out) {
out->value = (*result)->data();
}
-static Result<ValueDescr> ValuesType(KernelContext*,
- const std::vector<ValueDescr>& args) {
+template <typename Type, typename offset_type = typename Type::offset_type>
+void ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out)
{
+ typename TypeTraits<Type>::ArrayType list(batch[0].array());
+ ArrayData* out_arr = out->mutable_array();
+
+ const offset_type* offsets = list.raw_value_offsets();
+ offset_type values_length = offsets[list.length()] - offsets[0];
+
+ out_arr->length = values_length;
+ out_arr->null_count = 0;
+ KERNEL_ASSIGN_OR_RAISE(out_arr->buffers[1], ctx,
+ ctx->Allocate(values_length * sizeof(offset_type)));
+ auto out_indices =
reinterpret_cast<offset_type*>(out_arr->buffers[1]->mutable_data());
+ for (int64_t i = 0; i < list.length(); ++i) {
+ // Note: In most cases, null slots are empty, but when they are non-empty
+ // we write out the indices so make sure they are accounted for. This
+ // behavior could be changed if needed in the future.
+ for (offset_type j = offsets[i]; j < offsets[i + 1]; ++j) {
+ *out_indices++ = static_cast<offset_type>(i);
+ }
+ }
+}
+
+Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>&
args) {
const auto& list_type = checked_cast<const BaseListType&>(*args[0].type);
return ValueDescr::Array(list_type.value_type());
}
+} // namespace
+
void RegisterVectorNested(FunctionRegistry* registry) {
auto flatten = std::make_shared<VectorFunction>("list_flatten",
Arity::Unary());
- DCHECK_OK(flatten->AddKernel({InputType(Type::LIST)}, OutputType(ValuesType),
+ DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)},
OutputType(ValuesType),
ListFlatten<ListType>));
- DCHECK_OK(flatten->AddKernel({InputType(Type::LARGE_LIST)},
OutputType(ValuesType),
- ListFlatten<LargeListType>));
+ DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
+ OutputType(ValuesType),
ListFlatten<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(flatten)));
+
+ auto list_parent_indices =
+ std::make_shared<VectorFunction>("list_parent_indices", Arity::Unary());
+ DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LIST)},
int32(),
+ ListParentIndices<ListType>));
+
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LARGE_LIST)},
int64(),
+ ListParentIndices<LargeListType>));
+ DCHECK_OK(registry->AddFunction(std::move(list_parent_indices)));
}
} // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc
b/cpp/src/arrow/compute/kernels/vector_nested_test.cc
index 61c2b77..906772a 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc
@@ -34,5 +34,25 @@ TEST(TestVectorNested, ListFlatten) {
}
}
+TEST(TestVectorNested, ListParentIndices) {
+ for (auto ty : {list(int32()), large_list(int32())}) {
+ auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]");
+
+ auto out_ty = ty->id() == Type::LIST ? int32() : int64();
+ auto expected = ArrayFromJSON(out_ty, "[0, 0, 0, 2, 2, 4, 4]");
+ ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices",
{input}));
+ AssertArraysEqual(*expected, *out.make_array());
+ }
+
+ // Construct a list with non-empty null slots
+ auto input = ArrayFromJSON(list(int32()), "[[0, null, 1], [0, 0], [2, 3],
[], [4, 5]]");
+ std::shared_ptr<ArrayData> data = input->data()->Copy();
+ data->buffers[0] =
+ (ArrayFromJSON(boolean(), "[true, false, true, true,
true]")->data()->buffers[1]);
+ auto expected = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]");
+ ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", {data}));
+ AssertArraysEqual(*expected, *out.make_array());
+}
+
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/registry.cc
b/cpp/src/arrow/compute/registry.cc
index 061c01c..4b1adf3 100644
--- a/cpp/src/arrow/compute/registry.cc
+++ b/cpp/src/arrow/compute/registry.cc
@@ -102,6 +102,7 @@ static std::unique_ptr<FunctionRegistry>
CreateBuiltInRegistry() {
RegisterScalarBoolean(registry.get());
RegisterScalarCast(registry.get());
RegisterScalarComparison(registry.get());
+ RegisterScalarNested(registry.get());
RegisterScalarSetLookup(registry.get());
RegisterScalarStringAscii(registry.get());
RegisterScalarValidity(registry.get());
diff --git a/cpp/src/arrow/compute/registry_internal.h
b/cpp/src/arrow/compute/registry_internal.h
index 5d22162..1e1a7b0 100644
--- a/cpp/src/arrow/compute/registry_internal.h
+++ b/cpp/src/arrow/compute/registry_internal.h
@@ -29,6 +29,7 @@ void RegisterScalarArithmetic(FunctionRegistry* registry);
void RegisterScalarBoolean(FunctionRegistry* registry);
void RegisterScalarCast(FunctionRegistry* registry);
void RegisterScalarComparison(FunctionRegistry* registry);
+void RegisterScalarNested(FunctionRegistry* registry);
void RegisterScalarSetLookup(FunctionRegistry* registry);
void RegisterScalarStringAscii(FunctionRegistry* registry);
void RegisterScalarValidity(FunctionRegistry* registry);
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index dbe5844..eea4e86 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -307,6 +307,7 @@ struct TypeTraits<ListType> {
using OffsetType = Int32Type;
using OffsetArrayType = Int32Array;
using OffsetBuilderType = Int32Builder;
+ using OffsetScalarType = Int32Scalar;
constexpr static bool is_parameter_free = false;
};
@@ -318,6 +319,7 @@ struct TypeTraits<LargeListType> {
using OffsetType = Int64Type;
using OffsetArrayType = Int64Array;
using OffsetBuilderType = Int64Builder;
+ using OffsetScalarType = Int64Scalar;
constexpr static bool is_parameter_free = false;
};
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 5f8c851..50398b7 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1361,6 +1361,47 @@ cdef class BaseListArray(Array):
"""
return _pc().list_flatten(self)
+ def value_parent_indices(self):
+ """
+ Return array of same length as list child values array where each
+ output value is the index of the parent list array slot containing each
+ child value.
+
+ Examples
+ --------
+ >>> arr = pa.array([[1, 2, 3], [], None, [4]],
+ ... type=pa.list_(pa.int32()))
+ >>> arr.value_parent_indices()
+ <pyarrow.lib.Int32Array object at 0x7efc5db958a0>
+ [
+ 0,
+ 0,
+ 0,
+ 3
+ ]
+ """
+ return _pc().list_parent_indices(self)
+
+ def value_lengths(self):
+ """
+ Return integers array with values equal to the respective length of
+ each list element. Null list values are null in the output.
+
+ Examples
+ --------
+ >>> arr = pa.array([[1, 2, 3], [], None, [4]],
+ ... type=pa.list_(pa.int32()))
+ >>> arr.value_lengths()
+ <pyarrow.lib.Int32Array object at 0x7efc5db95910>
+ [
+ 3,
+ 0,
+ null,
+ 1
+ ]
+ """
+ return _pc().list_value_lengths(self)
+
cdef class ListArray(BaseListArray):
"""
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index babab27..ae7dae8 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -107,6 +107,8 @@ is_valid = _simple_unary_function('is_valid')
is_null = _simple_unary_function('is_null')
list_flatten = _simple_unary_function('list_flatten')
+list_parent_indices = _simple_unary_function('list_parent_indices')
+list_value_lengths = _simple_unary_function('list_value_lengths')
add = _simple_binary_function('add')
subtract = _simple_binary_function('subtract')
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index 18c9a1b..bd63ba7 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2109,6 +2109,34 @@ def test_list_array_flatten(offset_type,
list_type_factory):
assert arr2.values.values.equals(arr0)
[email protected](('offset_type', 'list_type_factory'),
+ [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
+def test_list_value_parent_indices(offset_type, list_type_factory):
+ arr = pa.array(
+ [
+ [0, 1, 2],
+ None,
+ [],
+ [3, 4]
+ ], type=list_type_factory(pa.int32()))
+ expected = pa.array([0, 0, 0, 3, 3], type=offset_type)
+ assert arr.value_parent_indices().equals(expected)
+
+
[email protected](('offset_type', 'list_type_factory'),
+ [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
+def test_list_value_lengths(offset_type, list_type_factory):
+ arr = pa.array(
+ [
+ [0, 1, 2],
+ None,
+ [],
+ [3, 4]
+ ], type=list_type_factory(pa.int32()))
+ expected = pa.array([3, None, 0, 2], type=offset_type)
+ assert arr.value_lengths().equals(expected)
+
+
@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
def test_list_array_flatten_non_canonical(list_type_factory):
# Non-canonical list array (null elements backed by non-empty sublists)