mapleFU commented on code in PR #35345:
URL: https://github.com/apache/arrow/pull/35345#discussion_r1359475184
##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -189,11 +260,113 @@ Result<std::shared_ptr<Array>> FlattenListArray(const
ListArrayT& list_array,
return Concatenate(non_null_fragments, memory_pool);
}
+template <typename ListViewArrayT>
+Result<std::shared_ptr<Array>> FlattenListViewArray(const ListViewArrayT&
list_view_array,
+ MemoryPool* memory_pool) {
+ using offset_type = typename ListViewArrayT::offset_type;
+ const int64_t list_view_array_length = list_view_array.length();
+ std::shared_ptr<arrow::Array> value_array = list_view_array.values();
+
+ if (list_view_array_length == 0) {
+ return SliceArrayWithOffsets(*value_array, 0, 0);
+ }
+
+ // If the list array is *all* nulls, then just return an empty array.
+ if (list_view_array.null_count() == list_view_array.length()) {
+ return MakeEmptyArray(value_array->type(), memory_pool);
+ }
+
+ const auto* validity = list_view_array.data()->template
GetValues<uint8_t>(0, 0);
+ const auto* offsets = list_view_array.data()->template
GetValues<offset_type>(1);
+ const auto* sizes = list_view_array.data()->template
GetValues<offset_type>(2);
+
+ // If a ListViewArray:
+ //
+ // 1) does not contain nulls
+ // 2) has sorted offsets
+ // 3) has disjoint views which completely cover the values array
+ //
+ // then simply slice its value array with the first offset and end of the
last list
+ // view.
+ if (list_view_array.null_count() == 0) {
+ bool sorted_and_disjoint = true;
+ for (int64_t i = 1; sorted_and_disjoint && i < list_view_array_length;
++i) {
+ sorted_and_disjoint &=
+ sizes[i - 1] == 0 || offsets[i] - offsets[i - 1] == sizes[i - 1];
+ }
+
+ if (sorted_and_disjoint) {
+ const auto begin_offset = list_view_array.value_offset(0);
+ const auto end_offset =
list_view_array.value_offset(list_view_array_length - 1) +
+
list_view_array.value_length(list_view_array_length - 1);
+ return SliceArrayWithOffsets(*value_array, begin_offset, end_offset);
+ }
+ }
+
+ auto is_null_or_empty = [&](int64_t i) {
+ return (validity && !bit_util::GetBit(validity, list_view_array.offset() +
i)) ||
+ sizes[i] == 0;
+ };
+
+ std::vector<std::shared_ptr<Array>> non_null_fragments;
+ // Index of first valid, non-empty list-view and last offset
+ // of the current contiguous fragment in values.
+ constexpr int64_t kUninitialized = -1;
+ int64_t first_i = kUninitialized;
+ offset_type end_offset;
+ int64_t i = 0;
+ for (; i < list_view_array_length; i++) {
+ if (is_null_or_empty(i)) continue;
+
+ first_i = i;
+ end_offset = offsets[i] + sizes[i];
+ break;
+ }
+ i += 1;
+ for (; i < list_view_array_length; i++) {
+ if (is_null_or_empty(i)) continue;
+
+ if (offsets[i] == end_offset) {
+ end_offset += sizes[i];
+ continue;
+ }
+ non_null_fragments.push_back(
+ SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset));
+ first_i = i;
+ end_offset = offsets[i] + sizes[i];
+ }
+ if (first_i != kUninitialized) {
+ non_null_fragments.push_back(
+ SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset));
+ }
+
+ // Final attempt to avoid invoking Concatenate().
+ if (non_null_fragments.size() == 1) {
+ return non_null_fragments[0];
+ } else if (non_null_fragments.size() == 0) {
+ return MakeEmptyArray(value_array->type(), memory_pool);
+ }
+
+ return Concatenate(non_null_fragments, memory_pool);
+}
+
std::shared_ptr<Array> BoxOffsets(const std::shared_ptr<DataType>& boxed_type,
const ArrayData& data) {
+ const int64_t num_offsets =
+ is_list_view(data.type->id()) ? data.length : data.length + 1;
std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, data.buffers[1]};
auto offsets_data =
- std::make_shared<ArrayData>(boxed_type, data.length + 1,
std::move(buffers),
+ std::make_shared<ArrayData>(boxed_type, /*length=*/num_offsets,
std::move(buffers),
+ /*null_count=*/0, data.offset);
+ return MakeArray(offsets_data);
+}
+
+std::shared_ptr<Array> BoxSizes(const std::shared_ptr<DataType>& boxed_type,
+ const ArrayData& data) {
+ DCHECK(is_list_view(data.type->id()));
+ std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, data.buffers[2]};
+ auto offsets_data =
Review Comment:
Rename to `sizes_data`?
##########
cpp/src/arrow/array/builder_nested.h:
##########
@@ -80,100 +89,91 @@ class BaseListBuilder : public ArrayBuilder {
value_builder_->Reset();
}
- /// \brief Vector append
- ///
- /// If passed, valid_bytes is of equal length to values, and any zero byte
- /// will be considered as a null for that slot
- Status AppendValues(const offset_type* offsets, int64_t length,
- const uint8_t* valid_bytes = NULLPTR) {
- ARROW_RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(valid_bytes, length);
- offsets_builder_.UnsafeAppend(offsets, length);
- return Status::OK();
- }
-
/// \brief Start a new variable-length list slot
///
/// This function should be called before beginning to append elements to the
- /// value builder
- Status Append(bool is_valid = true) {
+ /// value builder.
+ ///
+ /// \pre if is_valid is false, list_length MUST be 0
+ /// \param is_valid Whether the new list slot is valid
+ /// \param list_length The number of elements in the list
+ Status Append(bool is_valid, int64_t list_length) {
Review Comment:
This would be useful. Can we add this to the comment, since the interface
is a bit tricky?
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -699,57 +713,147 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ private:
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsets(int64_t offset_limit) {
Review Comment:
So this is for List rather than ListView?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]