jorisvandenbossche commented on code in PR #14395:
URL: https://github.com/apache/arrow/pull/14395#discussion_r1006722884


##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,206 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
   return Status::OK();
 }
 
+template <typename Type, typename IndexType>
+struct ListSlice {
+  using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+    // Invariants
+    if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+      // TODO: support start == stop which should give empty lists
+      return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than 
`stop`(",
+                             opts.stop, ")");
+    }
+    if (opts.step != 1) {
+      // TODO: support step in slicing
+      return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got 
step=",
+          opts.step);
+    }
+    if (opts.stop == -1) {
+      // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in 
offsets
+      // For fixed size list, this would be the fixed size.
+      return Status::NotImplemented(
+          "Setting `stop==-1` to signify slicing to end, not yet 
implemented.");
+    }
+
+    const ArraySpan& list_ = batch[0].array;
+    const Type* list_type = checked_cast<const Type*>(list_.type);
+    const auto value_type = list_type->value_type();
+
+    // construct builder
+    std::unique_ptr<ArrayBuilder> builder;
+    if (opts.return_fixed_size_list) {
+      RETURN_NOT_OK(MakeBuilder(
+          ctx->memory_pool(),
+          fixed_size_list(value_type, static_cast<int32_t>(opts.stop - 
opts.start)),
+          &builder));
+    } else {
+      if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), 
&builder));
+      } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), 
&builder));
+      }
+    }
+    RETURN_NOT_OK(builder->Reserve(list_.length - 1));
+
+    // construct array values
+    if constexpr (std::is_same_v<Type, FixedSizeListType>) {
+      if (opts.return_fixed_size_list) {
+        RETURN_NOT_OK(
+            BuildArrayFromFixedSizeListType<FixedSizeListBuilder>(batch, opts, 
*builder));
+      } else {
+        return Status::Invalid(
+            "Requesting ListArray when slicing a FixedSizeList array would 
always "
+            "result in a FixedSizeList. Please set 
`return_fixed_size_list=true` when "
+            "slicing a FixedSizeList.");
+      }
+    } else {
+      if (opts.return_fixed_size_list) {
+        RETURN_NOT_OK(
+            BuildArrayFromListType<FixedSizeListBuilder>(batch, opts, 
*builder));
+      } else {
+        if (std::is_same_v<Type, LargeListType>) {
+          RETURN_NOT_OK(BuildArrayFromListType<LargeListBuilder>(batch, opts, 
*builder));
+        } else {
+          RETURN_NOT_OK(BuildArrayFromListType<ListBuilder>(batch, opts, 
*builder));
+        }
+      }
+    }
+
+    // build output arrays and set result
+    ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+    out->value = result->data();
+    return Status::OK();
+  }
+  template <typename BuilderType>
+  static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch,
+                                                const ListSliceOptions opts,
+                                                ArrayBuilder& builder) {
+    const auto list_size =
+        checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+    const ArraySpan& list_ = batch[0].array;
+    const ArraySpan& list_values = list_.child_data[0];
+    const offset_type n_offsets = list_values.length / list_size;
+
+    auto list_builder = checked_cast<BuilderType*>(&builder);
+    for (offset_type offset = 0; offset < n_offsets * list_size;
+         offset = offset + list_size) {
+      auto next_offset = offset + list_size;
+      if (list_values.IsNull(offset)) {

Review Comment:
   The top-level validity of the ListArray lives in `list_` and not 
`list_values`, so we should we check `list_` instead?



##########
cpp/src/arrow/compute/kernels/scalar_nested_test.cc:
##########
@@ -116,6 +117,100 @@ TEST(TestScalarNested, ListElementInvalid) {
               Raises(StatusCode::Invalid));
 }
 
+void CheckListSlice(std::shared_ptr<Array> input, std::shared_ptr<Array> 
expected,
+                    const ListSliceOptions& args) {
+  ASSERT_OK_AND_ASSIGN(auto result, CallFunction("list_slice", {input}, 
&args));
+  ASSERT_EQ(result, expected);
+}
+
+TEST(TestScalarNested, ListSliceVariableOutput) {

Review Comment:
   Can you also add a case with top-level nulls? (so not only with nulls inside 
the list elements)



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,206 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
   return Status::OK();
 }
 
+template <typename Type, typename IndexType>
+struct ListSlice {
+  using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+    // Invariants
+    if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+      // TODO: support start == stop which should give empty lists
+      return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than 
`stop`(",
+                             opts.stop, ")");
+    }
+    if (opts.step != 1) {
+      // TODO: support step in slicing
+      return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got 
step=",
+          opts.step);
+    }
+    if (opts.stop == -1) {
+      // TODO: Support slicing to arbitrary end

Review Comment:
   For the existing SliceOptions we use the max int64 value for this, at least 
in the Python constructor of the options. But I suppose we don't have special 
handling for that on the C++ side, and it just works (because a string can 
never actually have that slice, so you always slice until the end in practice). 
   
   Of course, for supporting this for fixed size list output, we would need to 
special case this to replace it with the actual length of the input fixed size 
list? (and if we special case, it might be better to special case -1 instead of 
max int64?)



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,206 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
   return Status::OK();
 }
 
+template <typename Type, typename IndexType>
+struct ListSlice {
+  using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+    // Invariants
+    if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+      // TODO: support start == stop which should give empty lists
+      return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than 
`stop`(",
+                             opts.stop, ")");
+    }
+    if (opts.step != 1) {
+      // TODO: support step in slicing
+      return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got 
step=",
+          opts.step);
+    }
+    if (opts.stop == -1) {
+      // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in 
offsets
+      // For fixed size list, this would be the fixed size.
+      return Status::NotImplemented(
+          "Setting `stop==-1` to signify slicing to end, not yet 
implemented.");
+    }
+
+    const ArraySpan& list_ = batch[0].array;
+    const Type* list_type = checked_cast<const Type*>(list_.type);
+    const auto value_type = list_type->value_type();
+
+    // construct builder
+    std::unique_ptr<ArrayBuilder> builder;
+    if (opts.return_fixed_size_list) {
+      RETURN_NOT_OK(MakeBuilder(
+          ctx->memory_pool(),
+          fixed_size_list(value_type, static_cast<int32_t>(opts.stop - 
opts.start)),
+          &builder));
+    } else {
+      if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), 
&builder));
+      } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), 
&builder));
+      }
+    }
+    RETURN_NOT_OK(builder->Reserve(list_.length - 1));

Review Comment:
   Why the -1? (I would think that a builder starts with capacity of 0?)



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,206 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
   return Status::OK();
 }
 
+template <typename Type, typename IndexType>
+struct ListSlice {
+  using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+    // Invariants
+    if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+      // TODO: support start == stop which should give empty lists
+      return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than 
`stop`(",
+                             opts.stop, ")");
+    }
+    if (opts.step != 1) {
+      // TODO: support step in slicing
+      return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got 
step=",
+          opts.step);
+    }
+    if (opts.stop == -1) {
+      // TODO: Support slicing to arbitrary end

Review Comment:
   Although that also raises the question: for fixed size list input, if you 
slice until after the end, does it also get filled with nulls, or do we 
automatically slice until the end (like how Python slicing works)?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to