bkietz commented on code in PR #35345:
URL: https://github.com/apache/arrow/pull/35345#discussion_r1317796574
##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -189,11 +261,109 @@ Result<std::shared_ptr<Array>> FlattenListArray(const
ListArrayT& list_array,
return Concatenate(non_null_fragments, memory_pool);
}
+template <typename ListViewArrayT>
+Result<std::shared_ptr<Array>> FlattenListViewArray(const ListViewArrayT&
list_view_array,
+ MemoryPool* memory_pool) {
+ using offset_type = typename ListViewArrayT::offset_type;
+ const int64_t list_view_array_length = list_view_array.length();
+ std::shared_ptr<arrow::Array> value_array = list_view_array.values();
+
+ if (list_view_array_length == 0) {
+ return SliceArrayWithOffsets(*value_array, 0, 0);
+ }
+
+ // If the list array is *all* nulls, then just return an empty array.
+ if (list_view_array.null_count() == list_view_array.length()) {
+ return MakeEmptyArray(value_array->type(), memory_pool);
+ }
+
+ const auto* validity = list_view_array.data()->template
GetValues<uint8_t>(0);
+ const auto* offsets = list_view_array.data()->template
GetValues<offset_type>(1);
+ const auto* sizes = list_view_array.data()->template
GetValues<offset_type>(2);
+
+ // If a ListViewArray:
+ //
+ // 1) does not contain nulls
+ // 2) has sorted offsets
+ // 3) every view is disjoint
+ //
+ // then simply slice its value array with the first offset and end of the
last list
+ // view.
+ if (list_view_array.null_count() == 0) {
+ bool sorted_and_disjoint = true;
+ for (int64_t i = 1; sorted_and_disjoint && i < list_view_array_length;
++i) {
+ sorted_and_disjoint &=
+ sizes[i - 1] == 0 || offsets[i] - offsets[i - 1] == sizes[i - 1];
+ }
+
+ if (sorted_and_disjoint) {
+ const auto begin_offset = list_view_array.value_offset(0);
+ const auto end_offset =
list_view_array.value_offset(list_view_array_length - 1) +
+
list_view_array.value_length(list_view_array_length - 1);
+ return SliceArrayWithOffsets(*value_array, begin_offset, end_offset);
+ }
+ }
+
+ std::vector<std::shared_ptr<Array>> non_null_fragments;
+ // Index of first valid list-view and last offset
Review Comment:
```suggestion
// Index of first valid, non-empty list-view and last offset
```
##########
cpp/src/arrow/util/list_util_test.cc:
##########
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array/builder_nested.h"
+#include "arrow/util/list_util.h"
+
+#include "arrow/testing/builder.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+using ListAndListViewTypes =
+ ::testing::Types<ListType, LargeListType, ListViewType, LargeListViewType>;
+
+template <typename T>
+class TestListUtils : public ::testing::Test {
+ public:
+ using TypeClass = T;
+ using offset_type = typename TypeClass::offset_type;
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
+
+ void SetUp() override {
+ value_type_ = int16();
+ type_ = std::make_shared<T>(value_type_);
+
+ std::unique_ptr<ArrayBuilder> tmp;
+ ASSERT_OK(MakeBuilder(pool_, type_, &tmp));
+ builder_.reset(checked_cast<BuilderType*>(tmp.release()));
+ }
+
+ void TestRangeOfValuesUsed() {
+ std::shared_ptr<ArrayData> result;
+
+ // Empty list-like array
+ ASSERT_OK(builder_->FinishInternal(&result));
Review Comment:
Why use a builder here instead of ArrayFromJSON?
##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -189,11 +261,109 @@ Result<std::shared_ptr<Array>> FlattenListArray(const
ListArrayT& list_array,
return Concatenate(non_null_fragments, memory_pool);
}
+template <typename ListViewArrayT>
+Result<std::shared_ptr<Array>> FlattenListViewArray(const ListViewArrayT&
list_view_array,
+ MemoryPool* memory_pool) {
+ using offset_type = typename ListViewArrayT::offset_type;
+ const int64_t list_view_array_length = list_view_array.length();
+ std::shared_ptr<arrow::Array> value_array = list_view_array.values();
+
+ if (list_view_array_length == 0) {
+ return SliceArrayWithOffsets(*value_array, 0, 0);
+ }
+
+ // If the list array is *all* nulls, then just return an empty array.
+ if (list_view_array.null_count() == list_view_array.length()) {
+ return MakeEmptyArray(value_array->type(), memory_pool);
+ }
+
+ const auto* validity = list_view_array.data()->template
GetValues<uint8_t>(0);
+ const auto* offsets = list_view_array.data()->template
GetValues<offset_type>(1);
+ const auto* sizes = list_view_array.data()->template
GetValues<offset_type>(2);
+
+ // If a ListViewArray:
+ //
+ // 1) does not contain nulls
+ // 2) has sorted offsets
+ // 3) every view is disjoint
Review Comment:
nit: "disjoint" by itself admits a list view array with views from 0->3 and
5->7, which wouldn't be compatible with this optimization. The code correctly
checks for full cover
```suggestion
// 3) has disjoint views which completely cover the values array
```
##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -321,6 +497,123 @@ std::shared_ptr<Array> LargeListArray::offsets() const {
return BoxOffsets(int64(), *data_);
}
+// ----------------------------------------------------------------------
+// ListViewArray
+
+ListViewArray::ListViewArray(std::shared_ptr<ArrayData> data) {
+ ListViewArray::SetData(std::move(data));
+}
+
+ListViewArray::ListViewArray(std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<Buffer> value_offsets,
+ std::shared_ptr<Buffer> value_sizes,
+ std::shared_ptr<Array> values,
+ std::shared_ptr<Buffer> null_bitmap, int64_t
null_count,
+ int64_t offset) {
+ ListViewArray::SetData(ArrayData::Make(
+ type, length,
Review Comment:
```suggestion
std::move(type), length,
```
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -699,55 +713,188 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ private:
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsets(int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ auto prev_offset = offsets[0];
+ if (prev_offset < 0) {
+ return Status::Invalid("Offset invariant failure: array starts at
negative offset ",
+ prev_offset);
+ }
+ for (int64_t i = 1; i <= data.length; ++i) {
+ const auto current_offset = offsets[i];
+ if (current_offset < prev_offset) {
+ return Status::Invalid("Offset invariant failure: non-monotonic offset
at slot ",
+ i, ": ", current_offset, " < ", prev_offset);
+ }
+ if (current_offset > offset_limit) {
+ return Status::Invalid("Offset invariant failure: offset for slot ", i,
+ " out of bounds: ", current_offset, " > ",
offset_limit);
+ }
+ prev_offset = current_offset;
+ }
+ return Status::OK();
+ }
+
+ enum ListViewValidationError {
+ kOk = 0,
+ kOutOfBoundsOffset = 1,
+ kOutOfBoundsSize = 2,
+ };
Review Comment:
Is there a reason to do this instead of providing factories for the
corresponding failure Status?
```
static Status OutOfBoundsListViewOffset(int64_t offset, int64_t slot) {
// ...
}
static Status OutOfBoundsListViewSize(int64_t size, int64_t slot) {
// ...
}
```
##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -189,11 +261,109 @@ Result<std::shared_ptr<Array>> FlattenListArray(const
ListArrayT& list_array,
return Concatenate(non_null_fragments, memory_pool);
}
+template <typename ListViewArrayT>
+Result<std::shared_ptr<Array>> FlattenListViewArray(const ListViewArrayT&
list_view_array,
+ MemoryPool* memory_pool) {
+ using offset_type = typename ListViewArrayT::offset_type;
+ const int64_t list_view_array_length = list_view_array.length();
+ std::shared_ptr<arrow::Array> value_array = list_view_array.values();
+
+ if (list_view_array_length == 0) {
+ return SliceArrayWithOffsets(*value_array, 0, 0);
+ }
+
+ // If the list array is *all* nulls, then just return an empty array.
+ if (list_view_array.null_count() == list_view_array.length()) {
+ return MakeEmptyArray(value_array->type(), memory_pool);
+ }
+
+ const auto* validity = list_view_array.data()->template
GetValues<uint8_t>(0);
+ const auto* offsets = list_view_array.data()->template
GetValues<offset_type>(1);
+ const auto* sizes = list_view_array.data()->template
GetValues<offset_type>(2);
+
+ // If a ListViewArray:
+ //
+ // 1) does not contain nulls
+ // 2) has sorted offsets
+ // 3) every view is disjoint
+ //
+ // then simply slice its value array with the first offset and end of the
last list
+ // view.
+ if (list_view_array.null_count() == 0) {
+ bool sorted_and_disjoint = true;
+ for (int64_t i = 1; sorted_and_disjoint && i < list_view_array_length;
++i) {
+ sorted_and_disjoint &=
+ sizes[i - 1] == 0 || offsets[i] - offsets[i - 1] == sizes[i - 1];
+ }
+
+ if (sorted_and_disjoint) {
+ const auto begin_offset = list_view_array.value_offset(0);
+ const auto end_offset =
list_view_array.value_offset(list_view_array_length - 1) +
+
list_view_array.value_length(list_view_array_length - 1);
+ return SliceArrayWithOffsets(*value_array, begin_offset, end_offset);
+ }
+ }
+
+ std::vector<std::shared_ptr<Array>> non_null_fragments;
+ // Index of first valid list-view and last offset
+ // of the current contiguous fragment in values.
+ int64_t first_i = -1;
+ offset_type end_offset = -1;
+ int64_t i = 0;
+ for (; i < list_view_array_length; i++) {
+ if ((validity && !bit_util::GetBit(validity, i)) || sizes[i] == 0) {
+ continue;
+ }
+ first_i = i;
+ end_offset = offsets[i] + sizes[i];
+ break;
+ }
+ i += 1;
+ for (; i < list_view_array_length; i++) {
+ if ((validity && !bit_util::GetBit(validity, i)) || sizes[i] == 0) {
+ continue;
+ }
+ if (offsets[i] == end_offset) {
+ end_offset += sizes[i];
+ } else {
+ non_null_fragments.push_back(
+ SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset));
+ first_i = i;
+ end_offset = offsets[i] + sizes[i];
+ }
+ }
+ if (first_i >= 0) {
+ non_null_fragments.push_back(
+ SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset));
+ }
+
+ // Final attempt to avoid invoking Concatenate().
+ if (non_null_fragments.size() == 1) {
+ return non_null_fragments[0];
+ } else if (non_null_fragments.size() == 0) {
+ return MakeEmptyArray(value_array->type(), memory_pool);
+ }
+
+ return Concatenate(non_null_fragments, memory_pool);
Review Comment:
Would you mind moving the single chunk special case into Concatenate? It's a
cheap fast path which is useful for data of any type
```suggestion
// Concatenate() doesn't support an empty vector
if (non_null_fragments.size() == 0) {
return MakeEmptyArray(value_array->type(), memory_pool);
}
return Concatenate(non_null_fragments, memory_pool);
```
##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -137,6 +137,78 @@ Result<std::shared_ptr<typename
TypeTraits<TYPE>::ArrayType>> ListArrayFromArray
return std::make_shared<ArrayType>(std::move(data));
}
+template <typename TYPE>
+Result<std::shared_ptr<typename TypeTraits<TYPE>::ArrayType>>
ListViewArrayFromArrays(
+ std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
+ const Array& values, MemoryPool* pool, std::shared_ptr<Buffer> null_bitmap
= NULLPTR,
+ int64_t null_count = kUnknownNullCount) {
+ using offset_type = typename TYPE::offset_type;
+ using ArrayType = typename TypeTraits<TYPE>::ArrayType;
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
+
+ if (offsets.type_id() != OffsetArrowType::type_id) {
+ return Status::TypeError("List offsets must be ",
OffsetArrowType::type_name());
+ }
+
+ if (sizes.length() != offsets.length() && sizes.length() != offsets.length()
- 1) {
+ return Status::Invalid(
+ "List sizes must have the same length as offsets or one less than
offsets");
+ }
+ if (sizes.type_id() != OffsetArrowType::type_id) {
+ return Status::TypeError("List sizes must be ",
OffsetArrowType::type_name());
+ }
+
+ if (offsets.offset() != sizes.offset()) {
+ return Status::Invalid("List offsets and sizes must have the same offset");
+ }
+ const int64_t offset = sizes.offset();
+
+ if (null_bitmap) {
+ if (offsets.null_count() > 0 || sizes.null_count() > 0) {
+ return Status::Invalid(
+ "Ambiguous to specify both validity map and offsets or sizes with
nulls");
+ }
+ if (offset != 0) {
+ return Status::Invalid(
+ "List offsets and sizes must not be slices if a validity map is
specified");
+ }
+ } else {
+ if (offsets.null_count() > 0 && sizes.null_count() > 0) {
+ return Status::Invalid("Ambiguous to specify both offsets and sizes with
nulls");
+ }
+ }
+
+ DCHECK(offsets.length() == sizes.length() || offsets.length() - 1 ==
sizes.length());
+
+ using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
+ const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);
+ const auto& typed_sizes = checked_cast<const OffsetArrayType&>(sizes);
+
+ auto derived_validity_buffer = std::move(null_bitmap);
+ int64_t null_count_ = null_count;
+ if (offsets.null_count() > 0) {
+ derived_validity_buffer = offsets.null_bitmap();
+ null_count_ = offsets.null_count();
+ // We allow construction from an offsets array containing one extra value.
+ // If that is the case, we might need to discount one null from
out_null_count.
+ if (offsets.length() - 1 == sizes.length() &&
!offsets.IsValid(sizes.length())) {
+ null_count_ -= 1;
Review Comment:
Why introduce another variable?
```suggestion
if (offsets.null_count() > 0) {
derived_validity_buffer = offsets.null_bitmap();
null_count = offsets.null_count();
// We allow construction from an offsets array containing one extra
value.
// If that is the case, we might need to discount one null from
out_null_count.
if (offsets.length() - 1 == sizes.length() &&
!offsets.IsValid(sizes.length())) {
null_count -= 1;
```
##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -189,11 +261,109 @@ Result<std::shared_ptr<Array>> FlattenListArray(const
ListArrayT& list_array,
return Concatenate(non_null_fragments, memory_pool);
}
+template <typename ListViewArrayT>
+Result<std::shared_ptr<Array>> FlattenListViewArray(const ListViewArrayT&
list_view_array,
+ MemoryPool* memory_pool) {
+ using offset_type = typename ListViewArrayT::offset_type;
+ const int64_t list_view_array_length = list_view_array.length();
+ std::shared_ptr<arrow::Array> value_array = list_view_array.values();
+
+ if (list_view_array_length == 0) {
+ return SliceArrayWithOffsets(*value_array, 0, 0);
+ }
+
+ // If the list array is *all* nulls, then just return an empty array.
+ if (list_view_array.null_count() == list_view_array.length()) {
+ return MakeEmptyArray(value_array->type(), memory_pool);
+ }
+
+ const auto* validity = list_view_array.data()->template
GetValues<uint8_t>(0);
+ const auto* offsets = list_view_array.data()->template
GetValues<offset_type>(1);
+ const auto* sizes = list_view_array.data()->template
GetValues<offset_type>(2);
+
+ // If a ListViewArray:
+ //
+ // 1) does not contain nulls
+ // 2) has sorted offsets
+ // 3) every view is disjoint
+ //
+ // then simply slice its value array with the first offset and end of the
last list
+ // view.
+ if (list_view_array.null_count() == 0) {
+ bool sorted_and_disjoint = true;
+ for (int64_t i = 1; sorted_and_disjoint && i < list_view_array_length;
++i) {
+ sorted_and_disjoint &=
+ sizes[i - 1] == 0 || offsets[i] - offsets[i - 1] == sizes[i - 1];
+ }
+
+ if (sorted_and_disjoint) {
+ const auto begin_offset = list_view_array.value_offset(0);
+ const auto end_offset =
list_view_array.value_offset(list_view_array_length - 1) +
+
list_view_array.value_length(list_view_array_length - 1);
+ return SliceArrayWithOffsets(*value_array, begin_offset, end_offset);
+ }
+ }
+
+ std::vector<std::shared_ptr<Array>> non_null_fragments;
+ // Index of first valid list-view and last offset
+ // of the current contiguous fragment in values.
+ int64_t first_i = -1;
+ offset_type end_offset = -1;
+ int64_t i = 0;
+ for (; i < list_view_array_length; i++) {
+ if ((validity && !bit_util::GetBit(validity, i)) || sizes[i] == 0) {
+ continue;
+ }
+ first_i = i;
+ end_offset = offsets[i] + sizes[i];
+ break;
+ }
+ i += 1;
+ for (; i < list_view_array_length; i++) {
Review Comment:
Is it a worthwhile special case to search for the first list with a
specialized loop? It'd be easier to read if the two loops were consolidated,
and the special case I'm used to seeing is explicitly hoisting a no-nulls case
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -699,55 +713,188 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ private:
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsets(int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ auto prev_offset = offsets[0];
+ if (prev_offset < 0) {
+ return Status::Invalid("Offset invariant failure: array starts at
negative offset ",
+ prev_offset);
+ }
+ for (int64_t i = 1; i <= data.length; ++i) {
+ const auto current_offset = offsets[i];
+ if (current_offset < prev_offset) {
+ return Status::Invalid("Offset invariant failure: non-monotonic offset
at slot ",
+ i, ": ", current_offset, " < ", prev_offset);
+ }
+ if (current_offset > offset_limit) {
+ return Status::Invalid("Offset invariant failure: offset for slot ", i,
+ " out of bounds: ", current_offset, " > ",
offset_limit);
+ }
+ prev_offset = current_offset;
+ }
+ return Status::OK();
+ }
+
+ enum ListViewValidationError {
+ kOk = 0,
+ kOutOfBoundsOffset = 1,
+ kOutOfBoundsSize = 2,
+ };
+
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ std::pair<ListViewValidationError, int64_t> DoFullyValidateOffsetsAndSizes(
+ int64_t offset_limit) {
+ const auto* validity = data.GetValues<uint8_t>(0, 0);
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto* sizes = data.GetValues<offset_type>(2);
+
+ int64_t slot = 0;
+ if (validity) {
+ internal::BitBlockCounter counter(validity, data.offset, data.length);
+ internal::BitBlockCount block;
+ for (int64_t i = 0; i < data.length; i += block.length) {
+ block = counter.NextWord();
+ if (block.NoneSet()) {
+ continue;
+ }
+ const bool all_set = block.AllSet();
+ for (int j = 0; j < block.length; j++) {
+ slot = i + j;
+ const bool valid = all_set || bit_util::GetBit(validity, data.offset
+ slot);
+ if (valid) {
+ const auto size = sizes[slot];
+ if (size > 0) {
+ const auto offset = offsets[slot];
+ if (offset < 0 || offset > offset_limit) {
+ return {kOutOfBoundsOffset, slot};
+ }
+ if (offset + size > offset_limit) {
Review Comment:
Since we haven't checked what `size` is yet, it might contain
`offset_type::MAX` or something else which would provoke overflow
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -699,55 +713,188 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ private:
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsets(int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ auto prev_offset = offsets[0];
+ if (prev_offset < 0) {
+ return Status::Invalid("Offset invariant failure: array starts at
negative offset ",
+ prev_offset);
+ }
+ for (int64_t i = 1; i <= data.length; ++i) {
+ const auto current_offset = offsets[i];
+ if (current_offset < prev_offset) {
+ return Status::Invalid("Offset invariant failure: non-monotonic offset
at slot ",
+ i, ": ", current_offset, " < ", prev_offset);
+ }
+ if (current_offset > offset_limit) {
+ return Status::Invalid("Offset invariant failure: offset for slot ", i,
+ " out of bounds: ", current_offset, " > ",
offset_limit);
+ }
+ prev_offset = current_offset;
+ }
+ return Status::OK();
+ }
+
+ enum ListViewValidationError {
+ kOk = 0,
+ kOutOfBoundsOffset = 1,
+ kOutOfBoundsSize = 2,
+ };
+
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ std::pair<ListViewValidationError, int64_t> DoFullyValidateOffsetsAndSizes(
+ int64_t offset_limit) {
+ const auto* validity = data.GetValues<uint8_t>(0, 0);
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto* sizes = data.GetValues<offset_type>(2);
+
+ int64_t slot = 0;
+ if (validity) {
+ internal::BitBlockCounter counter(validity, data.offset, data.length);
+ internal::BitBlockCount block;
+ for (int64_t i = 0; i < data.length; i += block.length) {
+ block = counter.NextWord();
+ if (block.NoneSet()) {
+ continue;
+ }
+ const bool all_set = block.AllSet();
+ for (int j = 0; j < block.length; j++) {
+ slot = i + j;
+ const bool valid = all_set || bit_util::GetBit(validity, data.offset
+ slot);
+ if (valid) {
+ const auto size = sizes[slot];
+ if (size > 0) {
+ const auto offset = offsets[slot];
+ if (offset < 0 || offset > offset_limit) {
+ return {kOutOfBoundsOffset, slot};
+ }
+ if (offset + size > offset_limit) {
+ return {kOutOfBoundsSize, slot};
+ }
+ } else if (size < 0) {
+ return {kOutOfBoundsSize, slot};
+ }
+ }
+ }
+ }
+ } else {
+ for (; slot < data.length; slot++) {
+ const auto size = sizes[slot];
+ if (size > 0) {
+ const auto offset = offsets[slot];
+ if (offset < 0 || offset > offset_limit) {
+ return {kOutOfBoundsOffset, slot};
+ }
+ if (offset + size > offset_limit) {
+ return {kOutOfBoundsSize, slot};
+ }
+ } else if (size < 0) {
+ return {kOutOfBoundsSize, slot};
+ }
+ }
+ }
+
+ return {kOk, slot};
+ }
+
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsetsAndSizes(int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto* sizes = data.GetValues<offset_type>(2);
+
+ auto [result, slot] =
DoFullyValidateOffsetsAndSizes<offset_type>(offset_limit);
+ switch (result) {
+ case kOk:
+ break;
+ case kOutOfBoundsOffset: {
+ const auto offset = offsets[slot];
+ return Status::Invalid("Offset invariant failure: offset for slot ",
slot,
+ " out of bounds: ", offset,
+ offset < 0 ? " < 0" : " > offset_limit");
Review Comment:
```suggestion
" out of bounds. Expected ", offset, " to be
at least ",
"0 and less than ", offset_limit);
```
##########
cpp/src/arrow/builder.cc:
##########
@@ -219,6 +219,20 @@ struct MakeBuilderImpl {
return Status::OK();
}
+ Status Visit(const ListViewType& list_view_type) {
+ std::shared_ptr<DataType> value_type = list_view_type.value_type();
+ ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type));
+ out.reset(new ListViewBuilder(pool, std::move(value_builder), type));
Review Comment:
```suggestion
out.reset(new ListViewBuilder(pool, std::move(value_builder),
std::move(type)));
```
##########
cpp/src/arrow/array/builder_base.cc:
##########
@@ -319,10 +324,30 @@ struct DerefConstIterator {
pointer operator->() const { return &(**it); }
};
+/// If A and B are equivalent types, a builder of type A can receive
+/// scalar values of type B and a builder of type B can receive
+/// scalar values of type A.
+///
+/// \param a Type A.
+/// \param b Type B.
+bool AreScalarTypesEquivalent(const DataType& a, const DataType& b) {
+ if (a.Equals(b)) {
+ return true;
+ }
+ if ((a.id() == Type::LIST && b.id() == Type::LIST_VIEW) ||
+ (a.id() == Type::LIST_VIEW && b.id() == Type::LIST) ||
+ (a.id() == Type::LARGE_LIST && b.id() == Type::LARGE_LIST_VIEW) ||
+ (a.id() == Type::LARGE_LIST_VIEW && b.id() == Type::LARGE_LIST)) {
+ return checked_cast<const BaseListType&>(a).value_type()->Equals(
+ *checked_cast<const BaseListType&>(b).value_type());
+ }
+ return false;
+}
+
} // namespace
Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
- if (!scalar.type->Equals(type())) {
+ if (!AreScalarTypesEquivalent(*scalar.type, *type())) {
Review Comment:
Where is this used? I'm not sure I like the idea of introducing implicit
conversions. For one thing, what if the builder is `list(field("item", int8(),
/*nullable=*/false))` but the appended scalar is `list(int8())`? The implicit
conversion would drop the guarantee of no nulls from the built array
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]