lidavidm commented on code in PR #14395:
URL: https://github.com/apache/arrow/pull/14395#discussion_r1009362001
##########
cpp/src/arrow/compute/api_scalar.h:
##########
@@ -346,6 +346,16 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions {
int64_t start, stop, step;
};
+class ARROW_EXPORT ListSliceOptions : public FunctionOptions {
+ public:
+ explicit ListSliceOptions(int64_t start,
+ int64_t stop = std::numeric_limits<int64_t>::max(),
+                            int64_t step = 1, bool return_fixed_size_list = true);
+ ListSliceOptions();
+ static constexpr char const kTypeName[] = "ListSliceOptions";
+ int64_t start, stop, step, return_fixed_size_list;
Review Comment:
nit: why declare the bool flag as an int64_t?
nit: docstrings for each member?
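For instance, a minimal sketch of what the nits ask for (illustrative, not the
PR's code): a plain bool for the flag and a doc comment per member.

    struct ListSliceOptionsSketch {
      /// Index to start slicing inner list elements at (inclusive).
      int64_t start;
      /// Index to stop slicing at (exclusive).
      int64_t stop;
      /// Slice step; only step == 1 is supported so far.
      int64_t step;
      /// If true, emit a FixedSizeListArray; otherwise a (Large)ListArray.
      bool return_fixed_size_list;
    };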
##########
python/pyarrow/tests/test_compute.py:
##########
@@ -2929,3 +2930,90 @@ def test_cast_table_raises():
with pytest.raises(pa.lib.ArrowInvalid):
pc.cast(table, pa.int64())
+
+
[email protected]("start,stop,expected", (
+ (0, 1, [[1], [4], [6], None]),
+ (0, 2, [[1, 2], [4, 5], [6, None], None]),
+ (1, 2, [[2], [5], [None], None]),
+ (2, 4, [[3, None], [None, None], [None, None], None])
+))
[email protected]("value_type", (pa.string, pa.int16, pa.float64))
[email protected]("list_type", (pa.list_, pa.large_list, "fixed"))
+def test_list_slice_output_fixed(start, stop, expected, value_type, list_type):
+ if list_type == "fixed":
+ arr = pa.array([[1, 2, 3], [4, 5, None], [6, None, None], None],
+ pa.list_(pa.int8(), 3)).cast(pa.list_(value_type(), 3))
+ else:
+ arr = pa.array([[1, 2, 3], [4, 5], [6], None],
+ pa.list_(pa.int8())).cast(list_type(value_type()))
+ result = pc.list_slice(arr, start, stop) # default is to return fixed size
+ pylist = result.cast(pa.list_(pa.int8(), stop-start)).to_pylist()
+ assert pylist == expected
+
+
[email protected]("start,stop", (
+ (0, 1,),
+ (0, 2,),
+ (1, 2,),
+ (2, 4,)
+))
[email protected]("value_type", (pa.string, pa.int16, pa.float64))
[email protected]("list_type", (pa.list_, pa.large_list, "fixed"))
+def test_list_slice_output_variable(start, stop, value_type, list_type):
+ if list_type == "fixed":
+ data = [[1, 2, 3], [4, 5, None], [6, None, None], None]
+ arr = pa.array(
+ data,
+ pa.list_(pa.int8(), 3)).cast(pa.list_(value_type(), 3))
+ else:
+ data = [[1, 2, 3], [4, 5], [6], None]
+ arr = pa.array(data,
+ pa.list_(pa.int8())).cast(list_type(value_type()))
+
+ # Gets same list type (ListArray vs LargeList)
+ if list_type == "fixed":
+ list_type = pa.list_ # non fixed output type
+
+ result = pc.list_slice(arr, start, stop, return_fixed_size_list=False)
+ assert result.type == list_type(value_type())
+
+ pylist = result.cast(pa.list_(pa.int8())).to_pylist()
+
+ # Variable output slicing follows Python's slice semantics
+ expected = [d[start:stop] if d is not None else None for d in data]
+ assert pylist == expected
+
+
+def test_list_slice_bad_parameters():
+ arr = pa.array([[1]], pa.list_(pa.int8(), 1))
+ msg = r"`start`(.*) should be greater than 0 and smaller than `stop`(.*)"
+ with pytest.raises(pa.ArrowInvalid, match=msg):
+ pc.list_slice(arr, -1) # negative start?
+ with pytest.raises(pa.ArrowInvalid, match=msg):
+ pc.list_slice(arr, 2, 1) # start > stop?
+
+ # TODO: start==stop -> empty lists
+ with pytest.raises(pa.ArrowInvalid, match=msg):
+ pc.list_slice(arr, 0, 0) # start == stop?
+
+ # TODO: support step in slicing
+    msg = "Setting `step` to anything other than 1 is not supported; got step=2"
+ with pytest.raises(NotImplementedError, match=msg):
+ pc.list_slice(arr, 0, 1, step=2)
+
+ # TODO: support stop == -1; slice to end
+ msg = "Setting `stop==-1` to signify slicing to end, not yet implemented."
+ with pytest.raises(NotImplementedError, match=msg):
+ pc.list_slice(arr, 0, -1)
Review Comment:
Ah, now I see this…can we just implement it in Python?
And, do we want to use `-1` as "no stop", or perhaps None in Python? That
leaves it open for us to support Python-style negative indices in the future if
desired.
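A sketch of the direction this points at (names hypothetical, not the PR's
API): treat "no stop" as std::nullopt (Python's None) and normalize negative
indices the way Python slices do.

    #include <algorithm>
    #include <cstdint>
    #include <optional>

    // Resolve a Python-style slice index against a list length: nullopt means
    // "to the end", negative values count from the end, results are clamped.
    int64_t NormalizeSliceIndex(std::optional<int64_t> index, int64_t length) {
      if (!index.has_value()) return length;
      int64_t i = *index;
      if (i < 0) i += length;
      return std::min(std::max<int64_t>(i, 0), length);
    }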
##########
cpp/src/arrow/compute/kernels/scalar_nested_test.cc:
##########
@@ -116,6 +117,97 @@ TEST(TestScalarNested, ListElementInvalid) {
Raises(StatusCode::Invalid));
}
+void CheckListSlice(std::shared_ptr<Array> input, std::shared_ptr<Array> expected,
+ const ListSliceOptions& args) {
+  ASSERT_OK_AND_ASSIGN(auto result, CallFunction("list_slice", {input}, &args));
+ ASSERT_EQ(result, expected);
Review Comment:
Can we use the standard helper function for this instead
(`CheckScalarUnary`)? It'll check slices and such (but doesn't check the case
when nested arrays have their own offset, so we'll still need separate tests
for that)
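Roughly (a sketch assuming the JSON-based overload of CheckScalarUnary in
arrow/compute/kernels/test_util.h):

    ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
                          /*return_fixed_size_list=*/false);
    // Checks the kernel against the expected JSON, including sliced inputs.
    CheckScalarUnary("list_slice", list(int32()), "[[1, 2, 3], [4, 5], [6], null]",
                     list(int32()), "[[1, 2], [4, 5], [6], null]", &args);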
##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,206 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
return Status::OK();
}
+template <typename Type, typename IndexType>
+struct ListSlice {
+ using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+ const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+ // Invariants
+ if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+ // TODO: support start == stop which should give empty lists
+ return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than `stop`(",
+ opts.stop, ")");
+ }
+ if (opts.step != 1) {
+ // TODO: support step in slicing
+ return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got step=",
+ opts.step);
+ }
+ if (opts.stop == -1) {
+ // TODO: Support slicing to arbitrary end
Review Comment:
If this is all ambiguous, can we use `std::optional<int64_t>` for the slice
stop?
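A sketch of how that could look (an assumption, not the PR's code): nullopt
replaces the -1 sentinel and "to the end" resolves per list.

    #include <algorithm>
    #include <cstdint>
    #include <optional>

    struct ListSliceOptionsSketch {
      int64_t start = 0;
      std::optional<int64_t> stop;  // nullopt => slice to the end of each list
      int64_t step = 1;
      bool return_fixed_size_list = true;
    };

    // Effective exclusive stop for one list of the given length.
    int64_t EffectiveStop(const ListSliceOptionsSketch& opts, int64_t length) {
      return opts.stop.has_value() ? std::min(*opts.stop, length) : length;
    }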
##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,199 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
return Status::OK();
}
+template <typename Type, typename IndexType>
+struct ListSlice {
+ using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+ const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+ // Invariants
+ if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+ // TODO: support start == stop which should give empty lists
+ return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than `stop`(",
+ opts.stop, ")");
+ }
+ if (opts.step != 1) {
+ // TODO: support step in slicing
+ return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got step=",
+ opts.step);
+ }
+ if (opts.stop == -1) {
+ // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in offsets
+ // For fixed size list, this would be the fixed size.
+ return Status::NotImplemented(
+          "Setting `stop==-1` to signify slicing to end, not yet implemented.");
+ }
+
+ const ArraySpan& list_ = batch[0].array;
+ const Type* list_type = checked_cast<const Type*>(list_.type);
+ const auto value_type = list_type->value_type();
+
+ std::unique_ptr<ArrayBuilder> builder;
+
+ // construct array values
+ if (opts.return_fixed_size_list) {
+ RETURN_NOT_OK(MakeBuilder(
+ ctx->memory_pool(),
+          fixed_size_list(value_type, static_cast<int32_t>(opts.stop - opts.start)),
+ &builder));
+ RETURN_NOT_OK(BuildArray<FixedSizeListBuilder>(batch, opts, *builder));
+ } else {
+ if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), &builder));
+ RETURN_NOT_OK(BuildArray<LargeListBuilder>(batch, opts, *builder));
+ } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), &builder));
+ RETURN_NOT_OK(BuildArray<ListBuilder>(batch, opts, *builder));
+ }
+ }
+
+ // build output arrays and set result
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ out->value = result->data();
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ static Status BuildArray(const ExecSpan& batch, const ListSliceOptions& opts,
+ ArrayBuilder& builder) {
+ if constexpr (std::is_same_v<Type, FixedSizeListType>) {
+      RETURN_NOT_OK(BuildArrayFromFixedSizeListType<BuilderType>(batch, opts, builder));
+ } else {
+ RETURN_NOT_OK(BuildArrayFromListType<BuilderType>(batch, opts, builder));
+ }
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch,
+ const ListSliceOptions& opts,
+ ArrayBuilder& builder) {
+ const auto list_size =
+ checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+ const ArraySpan& list_ = batch[0].array;
+ const ArraySpan& list_values = list_.child_data[0];
+ const offset_type n_offsets = list_values.length / list_size;
+
+ auto list_builder = checked_cast<BuilderType*>(&builder);
+ for (offset_type offset = 0; offset < n_offsets * list_size;
+ offset = offset + list_size) {
+ auto next_offset = offset + list_size;
+ if (list_values.IsNull(offset)) {
+ RETURN_NOT_OK(list_builder->AppendNull());
+ } else {
+        RETURN_NOT_OK(SetValues<BuilderType>(list_builder, offset, next_offset, &opts,
+                                             &list_values));
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ static Status BuildArrayFromListType(const ExecSpan& batch,
+ const ListSliceOptions& opts,
+ ArrayBuilder& builder) {
+ const ArraySpan& list_ = batch[0].array;
+ const offset_type* offsets = list_.GetValues<offset_type>(1);
+ const auto n_offsets = static_cast<offset_type>(
+ static_cast<size_t>(list_.GetBuffer(1)->size()) / sizeof(offset_type));
+
+ // validity bitmap if set.
+ std::unique_ptr<arrow::internal::Bitmap> validity_bitmap;
+ if (list_.buffers[0].data != nullptr) {
+      arrow::internal::Bitmap bitmap{list_.buffers[0].data, list_.offset, list_.length};
+ validity_bitmap = std::make_unique<arrow::internal::Bitmap>(bitmap);
Review Comment:
Why are we adding an extra allocation/indirection here? If it's just to
track whether there was a bitmap or not, maybe just use a boolean flag instead?
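A sketch of the flag-based variant (assuming arrow::internal::Bitmap is
default-constructible and assignable; illustrative only):

    // Track presence with a plain bool; the Bitmap itself lives on the stack.
    const bool has_validity = list_.buffers[0].data != nullptr;
    arrow::internal::Bitmap validity_bitmap;
    if (has_validity) {
      validity_bitmap =
          arrow::internal::Bitmap(list_.buffers[0].data, list_.offset, list_.length);
    }
    // ...and in the per-list loop:
    // if (has_validity && !validity_bitmap.GetBit(i)) AppendNull();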
##########
cpp/src/arrow/compute/kernels/scalar_nested_test.cc:
##########
@@ -116,6 +117,97 @@ TEST(TestScalarNested, ListElementInvalid) {
Raises(StatusCode::Invalid));
}
+void CheckListSlice(std::shared_ptr<Array> input, std::shared_ptr<Array> expected,
+ const ListSliceOptions& args) {
+  ASSERT_OK_AND_ASSIGN(auto result, CallFunction("list_slice", {input}, &args));
+ ASSERT_EQ(result, expected);
+}
+
+TEST(TestScalarNested, ListSliceVariableOutput) {
+ const auto value_types = {float32(), int32()};
+ for (auto value_type : value_types) {
+    /* Variable size list output requires variable size list input. */
+    auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], null]")};
+ for (auto input : inputs) {
+ ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+ /*return_fixed_size_list=*/false);
+      auto expected = ArrayFromJSON(list(value_type), "[[1, 2], [4, 5], [6], null]");
+ CheckListSlice(input, expected, args);
+
+ args.start = 1;
+ expected = ArrayFromJSON(list(value_type), "[[2], [5], [], null]");
+ CheckListSlice(input, expected, args);
+
+ args.start = 2;
+ args.stop = 4;
+ expected = ArrayFromJSON(list(value_type), "[[3], [], [], null]");
+ CheckListSlice(input, expected, args);
+ }
+ }
+
+ // Verify passing `return_fixed_size_list=false` with fixed size input
+ // returns variable size even if stop is beyond list_size
+ ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+ /*return_fixed_size_list=*/false);
+ auto input = ArrayFromJSON(fixed_size_list(int32(), 1), "[[1]]");
+ auto expected = ArrayFromJSON(list(int32()), "[[1]]");
+ CheckListSlice(input, expected, args);
+}
+
+TEST(TestScalarNested, ListSliceFixedOutput) {
+ const auto value_types = {float32(), int32()};
+ for (auto value_type : value_types) {
+    auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], null]"),
+                   ArrayFromJSON(fixed_size_list(value_type, 3),
+                                 "[[1, 2, 3], [4, 5, null], [6, null, null], null]")};
+ for (auto input : inputs) {
+ ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+ /*return_fixed_size_list=*/true);
+ auto expected = ArrayFromJSON(fixed_size_list(value_type, 2),
+ "[[1, 2], [4, 5], [6, null], null]");
+ CheckListSlice(input, expected, args);
+
+ args.start = 1;
+ expected =
+          ArrayFromJSON(fixed_size_list(value_type, 1), "[[2], [5], [null], null]");
+ CheckListSlice(input, expected, args);
+
+ args.start = 2;
+ args.stop = 4;
+ expected = ArrayFromJSON(fixed_size_list(value_type, 2),
+                               "[[3, null], [null, null], [null, null], null]");
+ CheckListSlice(input, expected, args);
+ }
+ }
+}
+
+TEST(TestScalarNested, ListSliceBadParameters) {
+ auto input = ArrayFromJSON(list(int32()), "[[1]]");
+
+ // negative start
+  ListSliceOptions args(-1, 1);
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr(
+ "`start`(-1) should be greater than 0 and smaller than `stop`(1)"),
+ CallFunction("list_slice", {input}, &args));
+ // start greater than stop
+ args.start = 1;
+ args.stop = 0;
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr(
+ "`start`(1) should be greater than 0 and smaller than `stop`(0)"),
+ CallFunction("list_slice", {input}, &args));
+ // start same as stop
+ args.stop = args.start;
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr(
+ "`start`(1) should be greater than 0 and smaller than `stop`(1)"),
+ CallFunction("list_slice", {input}, &args));
Review Comment:
negative stop as well?
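For instance (a sketch; with the invariant check above, a negative stop other
than -1 falls into the same Invalid branch):

    // negative stop
    args.start = 0;
    args.stop = -2;
    EXPECT_RAISES_WITH_MESSAGE_THAT(
        Invalid,
        ::testing::HasSubstr(
            "`start`(0) should be greater than 0 and smaller than `stop`(-2)"),
        CallFunction("list_slice", {input}, &args));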
##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,206 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
return Status::OK();
}
+template <typename Type, typename IndexType>
+struct ListSlice {
+ using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+ const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+ // Invariants
+ if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+ // TODO: support start == stop which should give empty lists
+ return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than `stop`(",
+ opts.stop, ")");
+ }
+ if (opts.step != 1) {
+ // TODO: support step in slicing
+ return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got step=",
+ opts.step);
+ }
+ if (opts.stop == -1) {
+ // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in offsets
+ // For fixed size list, this would be the fixed size.
+ return Status::NotImplemented(
+          "Setting `stop==-1` to signify slicing to end, not yet implemented.");
+ }
+
+ const ArraySpan& list_ = batch[0].array;
+ const Type* list_type = checked_cast<const Type*>(list_.type);
+ const auto value_type = list_type->value_type();
+
+ // construct builder
+ std::unique_ptr<ArrayBuilder> builder;
+ if (opts.return_fixed_size_list) {
+ RETURN_NOT_OK(MakeBuilder(
+ ctx->memory_pool(),
+          fixed_size_list(value_type, static_cast<int32_t>(opts.stop - opts.start)),
+ &builder));
+ } else {
+ if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), &builder));
+ } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), &builder));
+ }
+ }
+ RETURN_NOT_OK(builder->Reserve(list_.length - 1));
+
+ // construct array values
+ if constexpr (std::is_same_v<Type, FixedSizeListType>) {
+ if (opts.return_fixed_size_list) {
+ RETURN_NOT_OK(
+          BuildArrayFromFixedSizeListType<FixedSizeListBuilder>(batch, opts, *builder));
+ } else {
+ return Status::Invalid(
+          "Requesting ListArray when slicing a FixedSizeList array would always "
+          "result in a FixedSizeList. Please set `return_fixed_size_list=true` when "
+ "slicing a FixedSizeList.");
+ }
+ } else {
+ if (opts.return_fixed_size_list) {
+ RETURN_NOT_OK(
+          BuildArrayFromListType<FixedSizeListBuilder>(batch, opts, *builder));
+ } else {
+ if (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(BuildArrayFromListType<LargeListBuilder>(batch, opts, *builder));
+ } else {
+        RETURN_NOT_OK(BuildArrayFromListType<ListBuilder>(batch, opts, *builder));
+ }
+ }
+ }
+
+ // build output arrays and set result
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ out->value = result->data();
+ return Status::OK();
+ }
+ template <typename BuilderType>
+ static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch,
+ const ListSliceOptions opts,
+ ArrayBuilder& builder) {
+ const auto list_size =
+ checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+ const ArraySpan& list_ = batch[0].array;
+ const ArraySpan& list_values = list_.child_data[0];
+ const offset_type n_offsets = list_values.length / list_size;
+
+ auto list_builder = checked_cast<BuilderType*>(&builder);
+ for (offset_type offset = 0; offset < n_offsets * list_size;
+ offset = offset + list_size) {
+ auto next_offset = offset + list_size;
+ if (list_values.IsNull(offset)) {
Review Comment:
Hmm, I agree something is off - we should still be taking into account the
FixedSizeListArray's validity bitmap
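A sketch of a loop that consults the parent array's validity per list index
(illustrative, not the PR's fix; assumes ArraySpan::IsNull):

    for (int64_t i = 0; i < list_.length; ++i) {
      if (list_.IsNull(i)) {  // the FixedSizeListArray's own validity bitmap
        RETURN_NOT_OK(list_builder->AppendNull());
        continue;
      }
      const offset_type offset = static_cast<offset_type>(i * list_size);
      RETURN_NOT_OK(SetValues<BuilderType>(list_builder, offset, offset + list_size,
                                           &opts, &list_values));
    }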
##########
cpp/src/arrow/compute/kernels/scalar_nested_test.cc:
##########
@@ -116,6 +117,97 @@ TEST(TestScalarNested, ListElementInvalid) {
Raises(StatusCode::Invalid));
}
+void CheckListSlice(std::shared_ptr<Array> input, std::shared_ptr<Array> expected,
Review Comment:
A common thing that breaks kernels is to give the child array its own
offset, in addition to the parent array; it'd be good to test that here.
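One way to get such an input in a test (a sketch; slicing the values array
gives the child a non-zero offset before it is wrapped):

    auto values = ArrayFromJSON(int32(), "[0, 1, 2, 3, 4, 5, 6]")->Slice(1);
    ASSERT_OK_AND_ASSIGN(auto input,
                         FixedSizeListArray::FromArrays(values, /*list_size=*/3));
    // input is [[1, 2, 3], [4, 5, 6]], with the child offset by one element.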
##########
python/pyarrow/_compute.pyx:
##########
@@ -1168,6 +1168,32 @@ class SliceOptions(_SliceOptions):
self._set_options(start, stop, step)
+cdef class _ListSliceOptions(FunctionOptions):
+    def _set_options(self, start, stop=-1, step=1, return_fixed_size_list=True):
+        self.wrapped.reset(new CListSliceOptions(start, stop, step, return_fixed_size_list))
+
+
+class ListSliceOptions(_ListSliceOptions):
+ """
+ Options for list array slicing.
+
+ Parameters
+ ----------
+ start : int
+ Index to start slicing inner list elements (inclusive).
+ stop : int, default -1
+ If given, index to stop slicing at (exclusive).
+ If not given, slicing will stop at the end.
+ step : int, default 1
+ Slice step.
+ return_fixed_size_list : bool, default True
+ Whether to return a FixedSizeListArray.
+ """
+
+ def __init__(self, start, stop=-1, step=1, return_fixed_size_list=True):
Review Comment:
The Python and C++ defaults differ, and it's not documented in either case
what negative indices do (and it seems to be different from Python itself?)
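One hedged way to reconcile them (an assumption, not the PR): default stop to
std::nullopt in C++ and map Python's None onto it, so both sides share a
single documented meaning of "no stop".

    #include <cstdint>
    #include <optional>

    struct ListSliceOptionsSketch {
      explicit ListSliceOptionsSketch(int64_t start,
                                      std::optional<int64_t> stop = std::nullopt,
                                      int64_t step = 1,
                                      bool return_fixed_size_list = true)
          : start(start), stop(stop), step(step),
            return_fixed_size_list(return_fixed_size_list) {}
      int64_t start;
      std::optional<int64_t> stop;  // nullopt == Python None == "to the end"
      int64_t step;
      bool return_fixed_size_list;
    };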
##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,199 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
return Status::OK();
}
+template <typename Type, typename IndexType>
+struct ListSlice {
+ using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+ const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+ // Invariants
+ if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+ // TODO: support start == stop which should give empty lists
+ return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than `stop`(",
+ opts.stop, ")");
+ }
+ if (opts.step != 1) {
+ // TODO: support step in slicing
+ return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got step=",
+ opts.step);
+ }
+ if (opts.stop == -1) {
+ // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in offsets
+ // For fixed size list, this would be the fixed size.
+ return Status::NotImplemented(
+          "Setting `stop==-1` to signify slicing to end, not yet implemented.");
+ }
+
+ const ArraySpan& list_ = batch[0].array;
+ const Type* list_type = checked_cast<const Type*>(list_.type);
+ const auto value_type = list_type->value_type();
+
+ std::unique_ptr<ArrayBuilder> builder;
+
+ // construct array values
+ if (opts.return_fixed_size_list) {
+ RETURN_NOT_OK(MakeBuilder(
+ ctx->memory_pool(),
+          fixed_size_list(value_type, static_cast<int32_t>(opts.stop - opts.start)),
+ &builder));
+ RETURN_NOT_OK(BuildArray<FixedSizeListBuilder>(batch, opts, *builder));
+ } else {
+ if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), &builder));
+ RETURN_NOT_OK(BuildArray<LargeListBuilder>(batch, opts, *builder));
+ } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), &builder));
+ RETURN_NOT_OK(BuildArray<ListBuilder>(batch, opts, *builder));
+ }
+ }
+
+ // build output arrays and set result
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ out->value = result->data();
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ static Status BuildArray(const ExecSpan& batch, const ListSliceOptions& opts,
+ ArrayBuilder& builder) {
+ if constexpr (std::is_same_v<Type, FixedSizeListType>) {
+      RETURN_NOT_OK(BuildArrayFromFixedSizeListType<BuilderType>(batch, opts, builder));
+ } else {
+ RETURN_NOT_OK(BuildArrayFromListType<BuilderType>(batch, opts, builder));
+ }
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch,
+ const ListSliceOptions& opts,
+ ArrayBuilder& builder) {
+ const auto list_size =
+ checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+ const ArraySpan& list_ = batch[0].array;
+ const ArraySpan& list_values = list_.child_data[0];
+ const offset_type n_offsets = list_values.length / list_size;
+
+ auto list_builder = checked_cast<BuilderType*>(&builder);
+ for (offset_type offset = 0; offset < n_offsets * list_size;
+ offset = offset + list_size) {
+ auto next_offset = offset + list_size;
+ if (list_values.IsNull(offset)) {
+ RETURN_NOT_OK(list_builder->AppendNull());
+ } else {
+        RETURN_NOT_OK(SetValues<BuilderType>(list_builder, offset, next_offset, &opts,
+                                             &list_values));
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ static Status BuildArrayFromListType(const ExecSpan& batch,
+ const ListSliceOptions& opts,
+ ArrayBuilder& builder) {
+ const ArraySpan& list_ = batch[0].array;
+ const offset_type* offsets = list_.GetValues<offset_type>(1);
+ const auto n_offsets = static_cast<offset_type>(
+ static_cast<size_t>(list_.GetBuffer(1)->size()) / sizeof(offset_type));
+
+ // validity bitmap if set.
+ std::unique_ptr<arrow::internal::Bitmap> validity_bitmap;
+ if (list_.buffers[0].data != nullptr) {
+      arrow::internal::Bitmap bitmap{list_.buffers[0].data, list_.offset, list_.length};
+ validity_bitmap = std::make_unique<arrow::internal::Bitmap>(bitmap);
+ }
+ const ArraySpan& list_values = list_.child_data[0];
+
+ auto list_builder = checked_cast<BuilderType*>(&builder);
+ for (auto i = 0; i < n_offsets - 1; ++i) {
+ const offset_type offset = offsets[i];
+ const offset_type next_offset = offsets[i + 1];
+ if (validity_bitmap != nullptr && !validity_bitmap->GetBit(i)) {
+ RETURN_NOT_OK(list_builder->AppendNull());
+ } else {
+        RETURN_NOT_OK(SetValues<BuilderType>(list_builder, offset, next_offset, &opts,
+                                             &list_values));
+ }
+ }
+ return Status::OK();
+ }
+ template <typename BuilderType>
+ static Status SetValues(BuilderType* list_builder, const offset_type offset,
+                          const offset_type next_offset, const ListSliceOptions* opts,
+ const ArraySpan* list_values) {
+ auto value_builder = list_builder->value_builder();
+ auto cursor = offset;
+
+ RETURN_NOT_OK(list_builder->Append());
+ while (cursor < offset + (opts->stop - opts->start)) {
+ if (cursor + opts->start >= next_offset) {
+ if constexpr (!std::is_same_v<BuilderType, FixedSizeListBuilder>) {
+ break; // don't pad nulls for variable sized list output
+ }
+ RETURN_NOT_OK(value_builder->AppendNull());
+ } else {
+ RETURN_NOT_OK(
+            value_builder->AppendArraySlice(*list_values, cursor + opts->start, 1));
+ }
+ ++cursor;
+ }
+ return Status::OK();
+ }
+};
+
+Result<TypeHolder> MakeListSliceResolve(KernelContext* ctx,
+ const std::vector<TypeHolder>& types) {
+ const auto start = OptionsWrapper<ListSliceOptions>::Get(ctx).start;
+ const auto stop = OptionsWrapper<ListSliceOptions>::Get(ctx).stop;
+ const auto return_fixed_size_list =
+ OptionsWrapper<ListSliceOptions>::Get(ctx).return_fixed_size_list;
+ const auto list_type = checked_cast<const BaseListType*>(types[0].type);
+ if (return_fixed_size_list) {
+ return TypeHolder(
+        fixed_size_list(list_type->value_type(), static_cast<int32_t>(stop - start)));
+ } else {
+    // Returning large list if that's what we got in and didn't ask for fixed size
+ if (list_type->id() == Type::LARGE_LIST) {
+ return TypeHolder(large_list(list_type->value_type()));
Review Comment:
Do we care about preserving field name here?
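If so, a sketch using the list types' value_field() accessor, which carries
the field name and metadata, instead of value_type():

    if (list_type->id() == Type::LARGE_LIST) {
      return TypeHolder(large_list(list_type->value_field()));
    } else {
      return TypeHolder(list(list_type->value_field()));
    }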
##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,206 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
return Status::OK();
}
+template <typename Type, typename IndexType>
+struct ListSlice {
+ using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+ const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+ // Invariants
+ if (opts.start < 0 || (opts.start >= opts.stop && opts.stop != -1)) {
+ // TODO: support start == stop which should give empty lists
+ return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than `stop`(",
+ opts.stop, ")");
+ }
+ if (opts.step != 1) {
+ // TODO: support step in slicing
+ return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got step=",
+ opts.step);
+ }
+ if (opts.stop == -1) {
+ // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in offsets
+ // For fixed size list, this would be the fixed size.
+ return Status::NotImplemented(
+          "Setting `stop==-1` to signify slicing to end, not yet implemented.");
+ }
+
+ const ArraySpan& list_ = batch[0].array;
+ const Type* list_type = checked_cast<const Type*>(list_.type);
+ const auto value_type = list_type->value_type();
+
+ // construct builder
+ std::unique_ptr<ArrayBuilder> builder;
+ if (opts.return_fixed_size_list) {
+ RETURN_NOT_OK(MakeBuilder(
+ ctx->memory_pool(),
+          fixed_size_list(value_type, static_cast<int32_t>(opts.stop - opts.start)),
+ &builder));
+ } else {
+ if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), &builder));
+ } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), &builder));
+ }
+ }
+ RETURN_NOT_OK(builder->Reserve(list_.length - 1));
+
+ // construct array values
+ if constexpr (std::is_same_v<Type, FixedSizeListType>) {
+ if (opts.return_fixed_size_list) {
+ RETURN_NOT_OK(
+          BuildArrayFromFixedSizeListType<FixedSizeListBuilder>(batch, opts, *builder));
+ } else {
+ return Status::Invalid(
+          "Requesting ListArray when slicing a FixedSizeList array would always "
+          "result in a FixedSizeList. Please set `return_fixed_size_list=true` when "
+ "slicing a FixedSizeList.");
+ }
+ } else {
+ if (opts.return_fixed_size_list) {
+ RETURN_NOT_OK(
+          BuildArrayFromListType<FixedSizeListBuilder>(batch, opts, *builder));
+ } else {
+ if (std::is_same_v<Type, LargeListType>) {
+          RETURN_NOT_OK(BuildArrayFromListType<LargeListBuilder>(batch, opts, *builder));
+ } else {
+          RETURN_NOT_OK(BuildArrayFromListType<ListBuilder>(batch, opts, *builder));
+ }
+ }
+ }
+
+ // build output arrays and set result
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ out->value = result->data();
+ return Status::OK();
+ }
+ template <typename BuilderType>
+ static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch,
+ const ListSliceOptions opts,
+ ArrayBuilder& builder) {
+ const auto list_size =
+ checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+ const ArraySpan& list_ = batch[0].array;
+ const ArraySpan& list_values = list_.child_data[0];
+ const offset_type n_offsets = list_values.length / list_size;
+
+ auto list_builder = checked_cast<BuilderType*>(&builder);
+ for (offset_type offset = 0; offset < n_offsets * list_size;
+ offset = offset + list_size) {
+ auto next_offset = offset + list_size;
+ if (list_values.IsNull(offset)) {
Review Comment:
The offset for that check needs to be `offset // list_size`
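That is, something like (a sketch, folding in the earlier comment about the
parent's validity bitmap):

    // The validity check indexes lists, not values: divide by list_size.
    if (list_.IsNull(offset / list_size)) {
      RETURN_NOT_OK(list_builder->AppendNull());
    }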
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]