pitrou commented on code in PR #42067:
URL: https://github.com/apache/arrow/pull/42067#discussion_r1635028195


##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -28,6 +28,7 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_generate.h"
 #include "arrow/util/string.h"
+#include "arrow/util/unreachable.h"

Review Comment:
   Do you intend to replace it with your proposed macro at some point?



##########
cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc:
##########
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/exec.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/util/benchmark_util.h"
+#include "benchmark/benchmark.h"
+
+namespace arrow::compute {
+
+constexpr auto kSeed = 0x94378165;
+
+const auto kSliceStart = 2;
+const auto kSliceStop = 10;
+
+template <typename InListType, typename ValueType, int32_t kFixedListSize = 0>
+static void BenchmarkListSlice(benchmark::State& state, const 
ListSliceOptions& opts) {
+  RegressionArgs args(state, /*size_is_bytes=*/false);
+  auto value_ty = TypeTraits<ValueType>::type_singleton();
+  std::shared_ptr<DataType> list_ty;
+  if constexpr (std::is_same_v<InListType, FixedSizeListType>) {
+    list_ty = std::make_shared<FixedSizeListType>(std::move(value_ty), 
kFixedListSize);
+  } else {
+    list_ty = std::make_shared<InListType>(std::move(value_ty));
+  }
+  auto rand = random::RandomArrayGenerator(kSeed);
+  auto array = rand.ArrayOf(list_ty, args.size, args.null_proportion);
+  auto ctx = default_exec_context();
+  std::vector<Datum> input_args = {std::move(array)};
+  for (auto _ : state) {
+    ABORT_NOT_OK(CallFunction("list_slice", input_args, &opts, ctx).status());
+  }
+}
+
+template <typename InListType = ListType>
+static void ListSliceInt64List(benchmark::State& state) {
+  ListSliceOptions opts;
+  opts.start = kSliceStart;
+  BenchmarkListSlice<InListType, Int64Type>(state, opts);
+}
+
+template <typename InListType = ListType>
+static void ListSliceStringList(benchmark::State& state) {
+  ListSliceOptions opts;
+  opts.start = kSliceStart;
+  BenchmarkListSlice<InListType, StringType>(state, opts);
+}
+
+template <typename InListType = ListType>
+static void ListSliceInt64ListWithStop(benchmark::State& state) {
+  ListSliceOptions opts;
+  opts.start = kSliceStart;
+  opts.stop = kSliceStop;
+  BenchmarkListSlice<InListType, Int64Type>(state, opts);

Review Comment:
   Does slicing use a different algorithm when a stop is also given? Otherwise, 
this benchmark variant doesn't seem useful.



##########
cpp/src/arrow/compute/kernels/scalar_nested_test.cc:
##########
@@ -264,22 +276,25 @@ TEST(TestScalarNested, ListSliceChildArrayOffset) {
   ASSERT_EQ(input->offset(), 0);
   ASSERT_EQ(input->values()->offset(), 2);
 
-  ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+  ListSliceOptions args(/*start=*/0, /*stop=*/3, /*step=*/1,

Review Comment:
   I'm curious why you changed this?



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -148,128 +191,197 @@ struct ListSlice {
       return Status::Invalid("`step` must be >= 1, got: ", opts.step);
     }
 
-    const ArraySpan& list_array = batch[0].array;
-    const Type* list_type = checked_cast<const Type*>(list_array.type);
-    const auto value_type = list_type->field(0);
-    const auto return_fixed_size_list = opts.return_fixed_size_list.value_or(
-        list_type->id() == arrow::Type::FIXED_SIZE_LIST);
-    std::unique_ptr<ArrayBuilder> builder;
-
-    // should have been checked in resolver
-    // if stop not set, then cannot return fixed size list without input being 
fixed size
-    // list b/c we cannot determine the max list element in type resolving.
-    DCHECK(opts.stop.has_value() ||
-           (!opts.stop.has_value() && (!return_fixed_size_list ||
-                                       list_type->id() == 
arrow::Type::FIXED_SIZE_LIST)));
-
-    // construct array values
-    if (return_fixed_size_list) {
-      int32_t stop;
-      if (opts.stop.has_value()) {
-        stop = static_cast<int32_t>(opts.stop.value());
-      } else {
-        DCHECK_EQ(list_type->id(), arrow::Type::FIXED_SIZE_LIST);
-        stop = reinterpret_cast<const 
FixedSizeListType*>(list_type)->list_size();
-      }
-      const auto size = std::max(stop - static_cast<int32_t>(opts.start), 0);
-      const auto length = bit_util::CeilDiv(size, opts.step);
-      RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
-                                fixed_size_list(value_type, 
static_cast<int32_t>(length)),
-                                &builder));
-      RETURN_NOT_OK(BuildArray<FixedSizeListBuilder>(batch, opts, *builder));
-    } else {
-      if constexpr (std::is_same_v<Type, LargeListType>) {
-        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), 
&builder));
-        RETURN_NOT_OK(BuildArray<LargeListBuilder>(batch, opts, *builder));
-      } else {
-        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), 
&builder));
-        RETURN_NOT_OK(BuildArray<ListBuilder>(batch, opts, *builder));
-      }
+    auto* pool = ctx->memory_pool();
+    ARROW_ASSIGN_OR_RAISE(auto output_type_holder, ListSliceOutputType(opts, 
*list_type));
+    constexpr auto kInputTypeId = InListType::type_id;
+    auto output_type = output_type_holder.GetSharedPtr();
+    switch (output_type->id()) {
+      case Type::LIST:
+        DCHECK(kInputTypeId == Type::LIST || kInputTypeId == 
Type::FIXED_SIZE_LIST);
+        if constexpr (kInputTypeId == Type::LIST ||
+                      kInputTypeId == Type::FIXED_SIZE_LIST) {
+          return BuildArray<ListBuilder>(pool, opts, batch, output_type, out);
+        }
+        break;
+      case Type::LARGE_LIST:
+        DCHECK_EQ(kInputTypeId, Type::LARGE_LIST);
+        if constexpr (kInputTypeId == Type::LARGE_LIST) {

Review Comment:
   Same question here, and below...



##########
cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc:
##########
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/exec.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/util/benchmark_util.h"
+#include "benchmark/benchmark.h"
+
+namespace arrow::compute {
+
+constexpr auto kSeed = 0x94378165;
+
+const auto kSliceStart = 2;
+const auto kSliceStop = 10;
+
+template <typename InListType, typename ValueType, int32_t kFixedListSize = 0>

Review Comment:
   Are the template parameters actually useful? Otherwise, using regular 
function args would cut down on code generation (I'm less worried about 
executable size than compile times here :-)).



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -148,128 +191,197 @@ struct ListSlice {
       return Status::Invalid("`step` must be >= 1, got: ", opts.step);
     }
 
-    const ArraySpan& list_array = batch[0].array;
-    const Type* list_type = checked_cast<const Type*>(list_array.type);
-    const auto value_type = list_type->field(0);
-    const auto return_fixed_size_list = opts.return_fixed_size_list.value_or(
-        list_type->id() == arrow::Type::FIXED_SIZE_LIST);
-    std::unique_ptr<ArrayBuilder> builder;
-
-    // should have been checked in resolver
-    // if stop not set, then cannot return fixed size list without input being 
fixed size
-    // list b/c we cannot determine the max list element in type resolving.
-    DCHECK(opts.stop.has_value() ||
-           (!opts.stop.has_value() && (!return_fixed_size_list ||
-                                       list_type->id() == 
arrow::Type::FIXED_SIZE_LIST)));
-
-    // construct array values
-    if (return_fixed_size_list) {
-      int32_t stop;
-      if (opts.stop.has_value()) {
-        stop = static_cast<int32_t>(opts.stop.value());
-      } else {
-        DCHECK_EQ(list_type->id(), arrow::Type::FIXED_SIZE_LIST);
-        stop = reinterpret_cast<const 
FixedSizeListType*>(list_type)->list_size();
-      }
-      const auto size = std::max(stop - static_cast<int32_t>(opts.start), 0);
-      const auto length = bit_util::CeilDiv(size, opts.step);
-      RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
-                                fixed_size_list(value_type, 
static_cast<int32_t>(length)),
-                                &builder));
-      RETURN_NOT_OK(BuildArray<FixedSizeListBuilder>(batch, opts, *builder));
-    } else {
-      if constexpr (std::is_same_v<Type, LargeListType>) {
-        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), 
&builder));
-        RETURN_NOT_OK(BuildArray<LargeListBuilder>(batch, opts, *builder));
-      } else {
-        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), 
&builder));
-        RETURN_NOT_OK(BuildArray<ListBuilder>(batch, opts, *builder));
-      }
+    auto* pool = ctx->memory_pool();
+    ARROW_ASSIGN_OR_RAISE(auto output_type_holder, ListSliceOutputType(opts, 
*list_type));
+    constexpr auto kInputTypeId = InListType::type_id;
+    auto output_type = output_type_holder.GetSharedPtr();
+    switch (output_type->id()) {
+      case Type::LIST:
+        DCHECK(kInputTypeId == Type::LIST || kInputTypeId == 
Type::FIXED_SIZE_LIST);
+        if constexpr (kInputTypeId == Type::LIST ||
+                      kInputTypeId == Type::FIXED_SIZE_LIST) {

Review Comment:
   Am I misreading, or is the `if` condition basically always true here?



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -130,14 +131,56 @@ std::string ToString(const std::optional<T>& o) {
   return o.has_value() ? ToChars(*o) : "(nullopt)";
 }
 
-template <typename Type>
+/// \param stop User-provided stop or the length of the input list
+int64_t ListSliceLength(int64_t start, int64_t step, int64_t stop) {
+  DCHECK_GE(step, 1);
+  const auto size = std::max(stop - start, static_cast<int64_t>(0));

Review Comment:
   FTR, you can also write `std::max<int64_t>(stop - start, 0)`, which, besides 
being slightly terser, makes sure that the return type is as desired.



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -148,128 +191,197 @@ struct ListSlice {
       return Status::Invalid("`step` must be >= 1, got: ", opts.step);
     }
 
-    const ArraySpan& list_array = batch[0].array;
-    const Type* list_type = checked_cast<const Type*>(list_array.type);
-    const auto value_type = list_type->field(0);
-    const auto return_fixed_size_list = opts.return_fixed_size_list.value_or(
-        list_type->id() == arrow::Type::FIXED_SIZE_LIST);
-    std::unique_ptr<ArrayBuilder> builder;
-
-    // should have been checked in resolver
-    // if stop not set, then cannot return fixed size list without input being 
fixed size
-    // list b/c we cannot determine the max list element in type resolving.
-    DCHECK(opts.stop.has_value() ||
-           (!opts.stop.has_value() && (!return_fixed_size_list ||
-                                       list_type->id() == 
arrow::Type::FIXED_SIZE_LIST)));
-
-    // construct array values
-    if (return_fixed_size_list) {
-      int32_t stop;
-      if (opts.stop.has_value()) {
-        stop = static_cast<int32_t>(opts.stop.value());
-      } else {
-        DCHECK_EQ(list_type->id(), arrow::Type::FIXED_SIZE_LIST);
-        stop = reinterpret_cast<const 
FixedSizeListType*>(list_type)->list_size();
-      }
-      const auto size = std::max(stop - static_cast<int32_t>(opts.start), 0);
-      const auto length = bit_util::CeilDiv(size, opts.step);
-      RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
-                                fixed_size_list(value_type, 
static_cast<int32_t>(length)),
-                                &builder));
-      RETURN_NOT_OK(BuildArray<FixedSizeListBuilder>(batch, opts, *builder));
-    } else {
-      if constexpr (std::is_same_v<Type, LargeListType>) {
-        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), 
&builder));
-        RETURN_NOT_OK(BuildArray<LargeListBuilder>(batch, opts, *builder));
-      } else {
-        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), 
&builder));
-        RETURN_NOT_OK(BuildArray<ListBuilder>(batch, opts, *builder));
-      }
+    auto* pool = ctx->memory_pool();
+    ARROW_ASSIGN_OR_RAISE(auto output_type_holder, ListSliceOutputType(opts, 
*list_type));
+    constexpr auto kInputTypeId = InListType::type_id;
+    auto output_type = output_type_holder.GetSharedPtr();
+    switch (output_type->id()) {
+      case Type::LIST:
+        DCHECK(kInputTypeId == Type::LIST || kInputTypeId == 
Type::FIXED_SIZE_LIST);
+        if constexpr (kInputTypeId == Type::LIST ||
+                      kInputTypeId == Type::FIXED_SIZE_LIST) {
+          return BuildArray<ListBuilder>(pool, opts, batch, output_type, out);
+        }
+        break;
+      case Type::LARGE_LIST:
+        DCHECK_EQ(kInputTypeId, Type::LARGE_LIST);
+        if constexpr (kInputTypeId == Type::LARGE_LIST) {
+          return BuildArray<LargeListBuilder>(pool, opts, batch, output_type, 
out);
+        }
+        break;
+      case Type::FIXED_SIZE_LIST:
+        // A fixed-size list can be produced from any list-like input
+        // if ListSliceOptions::return_fixed_size_list is set to true
+        return BuildArray<FixedSizeListBuilder>(pool, opts, batch, 
output_type, out);
+      case Type::LIST_VIEW:
+        DCHECK_EQ(kInputTypeId, Type::LIST_VIEW);
+        if constexpr (kInputTypeId == Type::LIST_VIEW) {
+          return BuildArray<ListViewBuilder>(pool, opts, batch, output_type, 
out);
+        }
+        break;
+      case Type::LARGE_LIST_VIEW:
+        DCHECK_EQ(kInputTypeId, Type::LARGE_LIST_VIEW);
+        if constexpr (kInputTypeId == Type::LARGE_LIST_VIEW) {
+          return BuildArray<LargeListViewBuilder>(pool, opts, batch, 
output_type, out);
+        }
+        break;
+      default:
+        break;
     }
-
-    // build output arrays and set result
-    ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
-    out->value = std::move(result->data());
+    Unreachable();
     return Status::OK();
   }
 
+  /// \brief Builds the array of list slices from the input list array
   template <typename BuilderType>
-  static Status BuildArray(const ExecSpan& batch, const ListSliceOptions& opts,
-                           ArrayBuilder& builder) {
-    if constexpr (std::is_same_v<Type, FixedSizeListType>) {
-      RETURN_NOT_OK(BuildArrayFromFixedSizeListType<BuilderType>(batch, opts, 
builder));
+  static Status BuildArray(MemoryPool* pool, const ListSliceOptions& opts,
+                           const ExecSpan& batch,
+                           const std::shared_ptr<DataType>& output_type,
+                           ExecResult* out) {
+    std::unique_ptr<ArrayBuilder> builder;
+    RETURN_NOT_OK(MakeBuilder(pool, output_type, &builder));
+    auto* list_builder = checked_cast<BuilderType*>(builder.get());

Review Comment:
   Should we start by presizing the builder?



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -130,14 +131,56 @@ std::string ToString(const std::optional<T>& o) {
   return o.has_value() ? ToChars(*o) : "(nullopt)";
 }
 
-template <typename Type>
+/// \param stop User-provided stop or the length of the input list
+int64_t ListSliceLength(int64_t start, int64_t step, int64_t stop) {
+  DCHECK_GE(step, 1);
+  const auto size = std::max(stop - start, static_cast<int64_t>(0));
+  return bit_util::CeilDiv(size, step);
+}
+
+std::optional<int64_t> EffectiveSliceStop(const ListSliceOptions& opts,
+                                          const BaseListType& input_type) {
+  if (!opts.stop.has_value() && input_type.id() == Type::FIXED_SIZE_LIST) {
+    return checked_cast<const FixedSizeListType&>(input_type).list_size();
+  }
+  return opts.stop;
+}
+
+Result<TypeHolder> ListSliceOutputType(const ListSliceOptions& opts,
+                                       const BaseListType& input_list_type) {
+  const auto& value_type = input_list_type.field(0);
+  const bool is_fixed_size_list = input_list_type.id() == 
Type::FIXED_SIZE_LIST;
+  const auto return_fixed_size_list =
+      opts.return_fixed_size_list.value_or(is_fixed_size_list);
+  if (return_fixed_size_list) {
+    auto stop = EffectiveSliceStop(opts, input_list_type);
+    if (!stop.has_value()) {
+      return Status::NotImplemented(

Review Comment:
   This is not something we envision implementing at some point, since the 
inputs/options combination is invalid, right?
   ```suggestion
         return Status::Invalid(
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to