This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e1c3334  ARROW-6775: [C++][Python] Implement list_value_lengths and 
list_parent_indices functions
e1c3334 is described below

commit e1c3334d548f0eb8cbf5c6eca84737dede8ee271
Author: Wes McKinney <[email protected]>
AuthorDate: Tue Jul 7 13:23:37 2020 +0200

    ARROW-6775: [C++][Python] Implement list_value_lengths and 
list_parent_indices functions
    
    This adds two functions that operate on list types:
    
    * list_value_lengths: returns an int32 (for List) or int64 (for LargeList) 
array with the number of elements in each list value slot
    * list_parent_indices: returns an int32/int64 array with the same length as 
the child values array of a List type where each value is the index of the list 
"slot" containing each child value
    
    Closes #7632 from wesm/some-list-functions
    
    Lead-authored-by: Wes McKinney <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/CMakeLists.txt                       |  1 +
 cpp/src/arrow/compute/kernels/CMakeLists.txt       |  1 +
 cpp/src/arrow/compute/kernels/scalar_nested.cc     | 69 ++++++++++++++++++++++
 ...vector_nested_test.cc => scalar_nested_test.cc} | 12 ++--
 cpp/src/arrow/compute/kernels/vector_nested.cc     | 43 ++++++++++++--
 .../arrow/compute/kernels/vector_nested_test.cc    | 20 +++++++
 cpp/src/arrow/compute/registry.cc                  |  1 +
 cpp/src/arrow/compute/registry_internal.h          |  1 +
 cpp/src/arrow/type_traits.h                        |  2 +
 python/pyarrow/array.pxi                           | 41 +++++++++++++
 python/pyarrow/compute.py                          |  2 +
 python/pyarrow/tests/test_array.py                 | 28 +++++++++
 12 files changed, 211 insertions(+), 10 deletions(-)

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 345f9b9..2682471 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -359,6 +359,7 @@ if(ARROW_COMPUTE)
               compute/kernels/scalar_cast_string.cc
               compute/kernels/scalar_cast_temporal.cc
               compute/kernels/scalar_compare.cc
+              compute/kernels/scalar_nested.cc
               compute/kernels/scalar_set_lookup.cc
               compute/kernels/scalar_string.cc
               compute/kernels/scalar_validity.cc
diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt 
b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index 64debf8..e693a41 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -24,6 +24,7 @@ add_arrow_compute_test(scalar_test
                        scalar_boolean_test.cc
                        scalar_cast_test.cc
                        scalar_compare_test.cc
+                       scalar_nested_test.cc
                        scalar_set_lookup_test.cc
                        scalar_string_test.cc
                        scalar_validity_test.cc
diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc 
b/cpp/src/arrow/compute/kernels/scalar_nested.cc
new file mode 100644
index 0000000..7c61aa3
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Vector kernels involving nested types
+
+#include "arrow/array/array_base.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/result.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+namespace {
+
+template <typename Type, typename offset_type = typename Type::offset_type>
+void ListValueLengths(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  using OffsetScalarType = typename TypeTraits<Type>::OffsetScalarType;
+
+  if (batch[0].kind() == Datum::ARRAY) {
+    typename TypeTraits<Type>::ArrayType list(batch[0].array());
+    ArrayData* out_arr = out->mutable_array();
+    auto out_values = out_arr->GetMutableValues<offset_type>(1);
+    const offset_type* offsets = list.raw_value_offsets();
+    ::arrow::internal::detail::VisitBitBlocksVoid(
+        list.data()->buffers[0], list.offset(), list.length(),
+        [&](int64_t position) {
+          *out_values++ = offsets[position + 1] - offsets[position];
+        },
+        [&]() { *out_values++ = 0; });
+  } else {
+    const auto& arg0 = batch[0].scalar_as<ScalarType>();
+    if (arg0.is_valid) {
+      checked_cast<OffsetScalarType*>(out->scalar().get())->value =
+          static_cast<offset_type>(arg0.value->length());
+    }
+  }
+}
+
+}  // namespace
+
+void RegisterScalarNested(FunctionRegistry* registry) {
+  auto list_value_lengths =
+      std::make_shared<ScalarFunction>("list_value_lengths", Arity::Unary());
+  DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LIST)}, int32(),
+                                          ListValueLengths<ListType>));
+  DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LARGE_LIST)}, 
int64(),
+                                          ListValueLengths<LargeListType>));
+  DCHECK_OK(registry->AddFunction(std::move(list_value_lengths)));
+}
+
+}  // namespace internal
+}  // namespace compute
+}  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
similarity index 78%
copy from cpp/src/arrow/compute/kernels/vector_nested_test.cc
copy to cpp/src/arrow/compute/kernels/scalar_nested_test.cc
index 61c2b77..4657c41 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
@@ -25,12 +25,14 @@
 namespace arrow {
 namespace compute {
 
-TEST(TestVectorNested, ListFlatten) {
+static std::shared_ptr<DataType> GetOffsetType(const DataType& type) {
+  return type.id() == Type::LIST ? int32() : int64();
+}
+
+TEST(TestScalarNested, ListValueLengths) {
   for (auto ty : {list(int32()), large_list(int32())}) {
-    auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]");
-    auto expected = ArrayFromJSON(int32(), "[0, null, 1, 2, 3]");
-    ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_flatten", {input}));
-    AssertArraysEqual(*expected, *out.make_array());
+    CheckScalarUnary("list_value_lengths", ty, "[[0, null, 1], null, [2, 3], 
[]]",
+                     GetOffsetType(*ty), "[3, null, 2, 0]");
   }
 }
 
diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc 
b/cpp/src/arrow/compute/kernels/vector_nested.cc
index e35bc58..eeb58d3 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested.cc
+++ b/cpp/src/arrow/compute/kernels/vector_nested.cc
@@ -24,6 +24,7 @@
 namespace arrow {
 namespace compute {
 namespace internal {
+namespace {
 
 template <typename Type>
 void ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
@@ -36,19 +37,51 @@ void ListFlatten(KernelContext* ctx, const ExecBatch& 
batch, Datum* out) {
   out->value = (*result)->data();
 }
 
-static Result<ValueDescr> ValuesType(KernelContext*,
-                                     const std::vector<ValueDescr>& args) {
+template <typename Type, typename offset_type = typename Type::offset_type>
+void ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) 
{
+  typename TypeTraits<Type>::ArrayType list(batch[0].array());
+  ArrayData* out_arr = out->mutable_array();
+
+  const offset_type* offsets = list.raw_value_offsets();
+  offset_type values_length = offsets[list.length()] - offsets[0];
+
+  out_arr->length = values_length;
+  out_arr->null_count = 0;
+  KERNEL_ASSIGN_OR_RAISE(out_arr->buffers[1], ctx,
+                         ctx->Allocate(values_length * sizeof(offset_type)));
+  auto out_indices = 
reinterpret_cast<offset_type*>(out_arr->buffers[1]->mutable_data());
+  for (int64_t i = 0; i < list.length(); ++i) {
+    // Note: In most cases, null slots are empty, but when they are non-empty
+    // we write out the indices so make sure they are accounted for. This
+    // behavior could be changed if needed in the future.
+    for (offset_type j = offsets[i]; j < offsets[i + 1]; ++j) {
+      *out_indices++ = static_cast<offset_type>(i);
+    }
+  }
+}
+
+Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>& 
args) {
   const auto& list_type = checked_cast<const BaseListType&>(*args[0].type);
   return ValueDescr::Array(list_type.value_type());
 }
 
+}  // namespace
+
 void RegisterVectorNested(FunctionRegistry* registry) {
   auto flatten = std::make_shared<VectorFunction>("list_flatten", 
Arity::Unary());
-  DCHECK_OK(flatten->AddKernel({InputType(Type::LIST)}, OutputType(ValuesType),
+  DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, 
OutputType(ValuesType),
                                ListFlatten<ListType>));
-  DCHECK_OK(flatten->AddKernel({InputType(Type::LARGE_LIST)}, 
OutputType(ValuesType),
-                               ListFlatten<LargeListType>));
+  DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
+                               OutputType(ValuesType), 
ListFlatten<LargeListType>));
   DCHECK_OK(registry->AddFunction(std::move(flatten)));
+
+  auto list_parent_indices =
+      std::make_shared<VectorFunction>("list_parent_indices", Arity::Unary());
+  DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LIST)}, 
int32(),
+                                           ListParentIndices<ListType>));
+  
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LARGE_LIST)}, 
int64(),
+                                           ListParentIndices<LargeListType>));
+  DCHECK_OK(registry->AddFunction(std::move(list_parent_indices)));
 }
 
 }  // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc 
b/cpp/src/arrow/compute/kernels/vector_nested_test.cc
index 61c2b77..906772a 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc
@@ -34,5 +34,25 @@ TEST(TestVectorNested, ListFlatten) {
   }
 }
 
+TEST(TestVectorNested, ListParentIndices) {
+  for (auto ty : {list(int32()), large_list(int32())}) {
+    auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]");
+
+    auto out_ty = ty->id() == Type::LIST ? int32() : int64();
+    auto expected = ArrayFromJSON(out_ty, "[0, 0, 0, 2, 2, 4, 4]");
+    ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", 
{input}));
+    AssertArraysEqual(*expected, *out.make_array());
+  }
+
+  // Construct a list with non-empty null slots
+  auto input = ArrayFromJSON(list(int32()), "[[0, null, 1], [0, 0], [2, 3], 
[], [4, 5]]");
+  std::shared_ptr<ArrayData> data = input->data()->Copy();
+  data->buffers[0] =
+      (ArrayFromJSON(boolean(), "[true, false, true, true, 
true]")->data()->buffers[1]);
+  auto expected = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]");
+  ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("list_parent_indices", {data}));
+  AssertArraysEqual(*expected, *out.make_array());
+}
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/registry.cc 
b/cpp/src/arrow/compute/registry.cc
index 061c01c..4b1adf3 100644
--- a/cpp/src/arrow/compute/registry.cc
+++ b/cpp/src/arrow/compute/registry.cc
@@ -102,6 +102,7 @@ static std::unique_ptr<FunctionRegistry> 
CreateBuiltInRegistry() {
   RegisterScalarBoolean(registry.get());
   RegisterScalarCast(registry.get());
   RegisterScalarComparison(registry.get());
+  RegisterScalarNested(registry.get());
   RegisterScalarSetLookup(registry.get());
   RegisterScalarStringAscii(registry.get());
   RegisterScalarValidity(registry.get());
diff --git a/cpp/src/arrow/compute/registry_internal.h 
b/cpp/src/arrow/compute/registry_internal.h
index 5d22162..1e1a7b0 100644
--- a/cpp/src/arrow/compute/registry_internal.h
+++ b/cpp/src/arrow/compute/registry_internal.h
@@ -29,6 +29,7 @@ void RegisterScalarArithmetic(FunctionRegistry* registry);
 void RegisterScalarBoolean(FunctionRegistry* registry);
 void RegisterScalarCast(FunctionRegistry* registry);
 void RegisterScalarComparison(FunctionRegistry* registry);
+void RegisterScalarNested(FunctionRegistry* registry);
 void RegisterScalarSetLookup(FunctionRegistry* registry);
 void RegisterScalarStringAscii(FunctionRegistry* registry);
 void RegisterScalarValidity(FunctionRegistry* registry);
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index dbe5844..eea4e86 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -307,6 +307,7 @@ struct TypeTraits<ListType> {
   using OffsetType = Int32Type;
   using OffsetArrayType = Int32Array;
   using OffsetBuilderType = Int32Builder;
+  using OffsetScalarType = Int32Scalar;
   constexpr static bool is_parameter_free = false;
 };
 
@@ -318,6 +319,7 @@ struct TypeTraits<LargeListType> {
   using OffsetType = Int64Type;
   using OffsetArrayType = Int64Array;
   using OffsetBuilderType = Int64Builder;
+  using OffsetScalarType = Int64Scalar;
   constexpr static bool is_parameter_free = false;
 };
 
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 5f8c851..50398b7 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1361,6 +1361,47 @@ cdef class BaseListArray(Array):
         """
         return _pc().list_flatten(self)
 
+    def value_parent_indices(self):
+        """
+        Return array of same length as list child values array where each
+        output value is the index of the parent list array slot containing each
+        child value.
+
+        Examples
+        --------
+        >>> arr = pa.array([[1, 2, 3], [], None, [4]],
+        ...                type=pa.list_(pa.int32()))
+        >>> arr.value_parent_indices()
+        <pyarrow.lib.Int32Array object at 0x7efc5db958a0>
+        [
+          0,
+          0,
+          0,
+          3
+        ]
+        """
+        return _pc().list_parent_indices(self)
+
+    def value_lengths(self):
+        """
+        Return integers array with values equal to the respective length of
+        each list element. Null list values are null in the output.
+
+        Examples
+        --------
+        >>> arr = pa.array([[1, 2, 3], [], None, [4]],
+        ...                type=pa.list_(pa.int32()))
+        >>> arr.value_lengths()
+        <pyarrow.lib.Int32Array object at 0x7efc5db95910>
+        [
+          3,
+          0,
+          null,
+          1
+        ]
+        """
+        return _pc().list_value_lengths(self)
+
 
 cdef class ListArray(BaseListArray):
     """
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index babab27..ae7dae8 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -107,6 +107,8 @@ is_valid = _simple_unary_function('is_valid')
 is_null = _simple_unary_function('is_null')
 
 list_flatten = _simple_unary_function('list_flatten')
+list_parent_indices = _simple_unary_function('list_parent_indices')
+list_value_lengths = _simple_unary_function('list_value_lengths')
 
 add = _simple_binary_function('add')
 subtract = _simple_binary_function('subtract')
diff --git a/python/pyarrow/tests/test_array.py 
b/python/pyarrow/tests/test_array.py
index 18c9a1b..bd63ba7 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2109,6 +2109,34 @@ def test_list_array_flatten(offset_type, 
list_type_factory):
     assert arr2.values.values.equals(arr0)
 
 
[email protected](('offset_type', 'list_type_factory'),
+                         [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
+def test_list_value_parent_indices(offset_type, list_type_factory):
+    arr = pa.array(
+        [
+            [0, 1, 2],
+            None,
+            [],
+            [3, 4]
+        ], type=list_type_factory(pa.int32()))
+    expected = pa.array([0, 0, 0, 3, 3], type=offset_type)
+    assert arr.value_parent_indices().equals(expected)
+
+
[email protected](('offset_type', 'list_type_factory'),
+                         [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
+def test_list_value_lengths(offset_type, list_type_factory):
+    arr = pa.array(
+        [
+            [0, 1, 2],
+            None,
+            [],
+            [3, 4]
+        ], type=list_type_factory(pa.int32()))
+    expected = pa.array([3, None, 0, 2], type=offset_type)
+    assert arr.value_lengths().equals(expected)
+
+
 @pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
 def test_list_array_flatten_non_canonical(list_type_factory):
     # Non-canonical list array (null elements backed by non-empty sublists)

Reply via email to