pitrou commented on code in PR #35345:
URL: https://github.com/apache/arrow/pull/35345#discussion_r1388146086
##########
cpp/src/arrow/array/array_nested.h:
##########
@@ -216,6 +241,183 @@ class ARROW_EXPORT LargeListArray : public
BaseListArray<LargeListType> {
void SetData(const std::shared_ptr<ArrayData>& data);
};
+// ----------------------------------------------------------------------
+// ListViewArray / LargeListViewArray
+
+template <typename TYPE>
+class BaseListViewArray : public VarLengthListLikeArray<TYPE> {
+ public:
+ using TypeClass = TYPE;
+ using offset_type = typename TYPE::offset_type;
+
+ const TypeClass* list_view_type() const { return
this->var_length_list_like_type(); }
+
+ /// Note that this buffer does not account for any slice offset or length.
+ const std::shared_ptr<Buffer>& value_sizes() const { return
this->data_->buffers[2]; }
+
+ /// Return pointer to raw value offsets accounting for any slice offset
+ const offset_type* raw_value_sizes() const {
+ return raw_value_sizes_ + this->data_->offset;
+ }
+
+ offset_type value_length(int64_t i) const final {
+ return this->raw_value_sizes_[i + this->data_->offset];
+ }
+
+ protected:
+ const offset_type* raw_value_sizes_ = NULLPTR;
+};
+
+/// \brief Concrete Array class for list-view data
+class ARROW_EXPORT ListViewArray : public BaseListViewArray<ListViewType> {
+ public:
+ explicit ListViewArray(std::shared_ptr<ArrayData> data);
+
+ ListViewArray(std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<Buffer> value_offsets,
+ std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array>
values,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Construct ListViewArray from array of offsets, sizes, and child
+ /// value array
+ ///
+ /// Construct a ListViewArray using buffers from offsets and sizes arrays
+ /// that project views into the child values array.
+ ///
+ /// This function does the bare minimum of validation of the offsets/sizes
and
+ /// input types. The offset and length of the offsets and sizes arrays must
+ /// match and that will be checked, but their contents will be assumed to be
+ /// well-formed.
+ ///
+ /// If a null_bitmap is not provided, the nulls will be inferred from the
+ /// offsets's null bitmap. But if a null_bitmap is provided, the offsets
array
+ /// can't have nulls.
+ ///
+ /// If a null_bitmap is provided, the offsets array can't be a slice (i.e. an
+ /// array with offset() > 0).
Review Comment:
But the sizes can?
##########
cpp/src/arrow/array/array_nested.h:
##########
@@ -181,7 +199,14 @@ class ARROW_EXPORT LargeListArray : public
BaseListArray<LargeListType> {
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
- /// are assumed to be well-formed
+ /// are assumed to be well-formed.
+ ///
+ /// If a null_bitmap is not provided, the nulls will be inferred from the
+ /// offsets's null bitmap. But if a null_bitmap is provided, the offsets
array
+ /// can't have nulls.
+ ///
+ /// If a null_bitmap is provided, the offsets array can't be a slice (i.e. an
+ /// array with offset() > 0).
Review Comment:
It's weird that this docstring snippet is different from the one in
`ListArray` below. Is it an overlook?
##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -160,16 +166,69 @@ Status PutOffsets(const std::shared_ptr<Buffer>& src,
Offset first_offset, Offse
// Write offsets into dst, ensuring that the first offset written is
// first_offset
- auto adjustment = first_offset - src_begin[0];
+ auto displacement = first_offset - src_begin[0];
// NOTE: Concatenate can be called during IPC reads to append delta
dictionaries.
// Avoid UB on non-validated input by doing the addition in the unsigned
domain.
// (the result can later be validated using Array::ValidateFull)
- std::transform(src_begin, src_end, dst, [adjustment](Offset offset) {
- return SafeSignedAdd(offset, adjustment);
+ std::transform(src_begin, src_end, dst, [displacement](Offset offset) {
+ return SafeSignedAdd(offset, displacement);
});
return Status::OK();
}
+template <typename offset_type>
+void PutListViewOffsets(const Buffer& src, offset_type displacement,
offset_type* dst);
+
+// Concatenate buffers holding list-view offsets into a single buffer of
offsets
+//
+// value_ranges contains the relevant ranges of values in the child array
actually
+// referenced to by the views. Most commonly, these ranges will start from 0,
+// but when that is not the case, we need to adjust the displacement of
offsets.
+// The concatenated child array does not contain values from the beginning
+// if they are not referenced to by any view.
+template <typename offset_type>
+Status ConcatenateListViewOffsets(const BufferVector& buffers,
+ const std::vector<Range>& value_ranges,
+ MemoryPool* pool, std::shared_ptr<Buffer>*
out) {
+ const int64_t out_size_in_bytes = SumBufferSizesInBytes(buffers);
+ ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size_in_bytes, pool));
+ auto* out_data = (*out)->mutable_data_as<offset_type>();
+
+ int64_t num_child_values = 0;
+ int64_t elements_length = 0;
+ for (size_t i = 0; i < buffers.size(); ++i) {
+ const auto displacement =
+ static_cast<offset_type>(num_child_values - value_ranges[i].offset);
+ PutListViewOffsets(/*src=*/*buffers[i],
static_cast<offset_type>(displacement),
+ /*dst=*/out_data + elements_length);
+ elements_length += buffers[i]->size() / sizeof(offset_type);
Review Comment:
Perhaps `PutListViewOffsets` should return a pointer after the last written
offset? (just a suggestion though)
##########
cpp/src/arrow/array/concatenate_test.cc:
##########
@@ -187,33 +189,89 @@ TEST_F(ConcatenateTest, FixedSizeListType) {
});
}
-TEST_F(ConcatenateTest, ListType) {
- Check([this](int32_t size, double null_probability, std::shared_ptr<Array>*
out) {
+template <typename ListType>
+struct ListConcatenationChecker {
+ using offset_type = typename ListType::offset_type;
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
+ using ListArrayType = typename TypeTraits<ListType>::ArrayType;
+
+ template <typename Self>
+ static void Check(Self& self, int32_t size, double null_probability,
+ std::shared_ptr<Array>* out) {
auto values_size = size * 4;
- auto values = this->GeneratePrimitive<Int8Type>(values_size,
null_probability);
- auto offsets_vector = this->Offsets<int32_t>(values_size, size);
+ auto values =
+ self.template GeneratePrimitive<Int8Type>(values_size,
null_probability);
+ auto offsets_vector = self.template Offsets<offset_type>(values_size,
size);
// Ensure first and last offsets encompass the whole values array
offsets_vector.front() = 0;
- offsets_vector.back() = static_cast<int32_t>(values_size);
+ offsets_vector.back() = static_cast<offset_type>(values_size);
std::shared_ptr<Array> offsets;
- ArrayFromVector<Int32Type>(offsets_vector, &offsets);
- ASSERT_OK_AND_ASSIGN(*out, ListArray::FromArrays(*offsets, *values));
+ ArrayFromVector<OffsetArrowType>(offsets_vector, &offsets);
+ ASSERT_OK_AND_ASSIGN(*out, ListArrayType::FromArrays(*offsets, *values));
ASSERT_OK((**out).ValidateFull());
+ }
+};
+
+TEST_F(ConcatenateTest, ListType) {
+ Check([this](int32_t size, double null_probability, std::shared_ptr<Array>*
out) {
+ ListConcatenationChecker<ListType>::Check(*this, size, null_probability,
out);
});
}
TEST_F(ConcatenateTest, LargeListType) {
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>*
out) {
- auto values_size = size * 4;
- auto values = this->GeneratePrimitive<Int8Type>(values_size,
null_probability);
- auto offsets_vector = this->Offsets<int64_t>(values_size, size);
- // Ensure first and last offsets encompass the whole values array
- offsets_vector.front() = 0;
- offsets_vector.back() = static_cast<int64_t>(values_size);
+ ListConcatenationChecker<LargeListType>::Check(*this, size,
null_probability, out);
+ });
+}
+
+template <typename ListViewType>
+struct ListViewConcatenationChecker {
+ using offset_type = typename ListViewType::offset_type;
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
+ using ListViewArrayType = typename TypeTraits<ListViewType>::ArrayType;
+
+ template <typename Self>
+ static void Check(Self& self, int32_t num_list_views, double
null_probability,
+ std::shared_ptr<Array>* out) {
+ auto values_size = 4 * num_list_views;
+ auto values =
+ self.template GeneratePrimitive<Int8Type>(values_size,
null_probability);
+
std::shared_ptr<Array> offsets;
- ArrayFromVector<Int64Type>(offsets_vector, &offsets);
- ASSERT_OK_AND_ASSIGN(*out, LargeListArray::FromArrays(*offsets, *values));
+ auto offsets_vector = self.template Offsets<offset_type>(values_size,
num_list_views);
+ offsets_vector.front() = 0;
Review Comment:
Do we want to enforce this? The test should be more interesting is the
encompassed value range does not start at offset 0...
##########
cpp/src/arrow/array/array_nested.h:
##########
@@ -216,6 +241,183 @@ class ARROW_EXPORT LargeListArray : public
BaseListArray<LargeListType> {
void SetData(const std::shared_ptr<ArrayData>& data);
};
+// ----------------------------------------------------------------------
+// ListViewArray / LargeListViewArray
+
+template <typename TYPE>
+class BaseListViewArray : public VarLengthListLikeArray<TYPE> {
+ public:
+ using TypeClass = TYPE;
+ using offset_type = typename TYPE::offset_type;
+
+ const TypeClass* list_view_type() const { return
this->var_length_list_like_type(); }
+
+ /// Note that this buffer does not account for any slice offset or length.
+ const std::shared_ptr<Buffer>& value_sizes() const { return
this->data_->buffers[2]; }
+
+ /// Return pointer to raw value offsets accounting for any slice offset
+ const offset_type* raw_value_sizes() const {
+ return raw_value_sizes_ + this->data_->offset;
+ }
+
+ offset_type value_length(int64_t i) const final {
+ return this->raw_value_sizes_[i + this->data_->offset];
+ }
+
+ protected:
+ const offset_type* raw_value_sizes_ = NULLPTR;
+};
+
+/// \brief Concrete Array class for list-view data
+class ARROW_EXPORT ListViewArray : public BaseListViewArray<ListViewType> {
+ public:
+ explicit ListViewArray(std::shared_ptr<ArrayData> data);
+
+ ListViewArray(std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<Buffer> value_offsets,
+ std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array>
values,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Construct ListViewArray from array of offsets, sizes, and child
+ /// value array
+ ///
+ /// Construct a ListViewArray using buffers from offsets and sizes arrays
+ /// that project views into the child values array.
+ ///
+ /// This function does the bare minimum of validation of the offsets/sizes
and
+ /// input types. The offset and length of the offsets and sizes arrays must
+ /// match and that will be checked, but their contents will be assumed to be
+ /// well-formed.
+ ///
+ /// If a null_bitmap is not provided, the nulls will be inferred from the
+ /// offsets's null bitmap. But if a null_bitmap is provided, the offsets
array
+ /// can't have nulls.
+ ///
+ /// If a null_bitmap is provided, the offsets array can't be a slice (i.e. an
+ /// array with offset() > 0).
+ ///
+ /// \param[in] offsets An array of int32 offsets into the values array. NULL
values are
+ /// supported if the corresponding values in sizes is NULL or 0.
+ /// \param[in] sizes An array containing the int32 sizes of every view. NULL
values are
+ /// taken to represent a NULL list-view in the array being created.
+ /// \param[in] values Array containing list values
+ /// \param[in] pool MemoryPool
+ /// \param[in] null_bitmap Optional validity bitmap
+ /// \param[in] null_count Optional null count in null_bitmap
+ static Result<std::shared_ptr<ListViewArray>> FromArrays(
+ const Array& offsets, const Array& sizes, const Array& values,
+ MemoryPool* pool = default_memory_pool(),
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount);
+
+ static Result<std::shared_ptr<ListViewArray>> FromArrays(
+ std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
+ const Array& values, MemoryPool* pool = default_memory_pool(),
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount);
+
+ /// \brief Return an Array that is a concatenation of the list-views in this
array.
+ ///
+ /// Note that it's different from `values()` in that it takes into
+ /// consideration this array's offsets (which can be in any order)
+ /// and sizes. Nulls are skipped.
+ Result<std::shared_ptr<Array>> Flatten(
+ MemoryPool* memory_pool = default_memory_pool()) const;
+
+ /// \brief Return list-view offsets as an Int32Array
+ ///
+ /// The returned array will not have a validity bitmap, so you cannot expect
+ /// to pass it to ListArray::FromArrays() and get back the same list array
+ /// if the original one has nulls.
+ std::shared_ptr<Array> offsets() const;
+
+ /// \brief Return list-view sizes as an Int32Array
+ ///
+ /// The returned array will not have a validity bitmap, so you cannot expect
+ /// to pass it to ListArray::FromArrays() and get back the same list array
+ /// if the original one has nulls.
+ std::shared_ptr<Array> sizes() const;
+
+ protected:
+ // This constructor defers SetData to a derived array class
+ ListViewArray() = default;
+
+ void SetData(const std::shared_ptr<ArrayData>& data);
+};
+
+/// \brief Concrete Array class for large list-view data (with 64-bit offsets
+/// and sizes)
+class ARROW_EXPORT LargeListViewArray : public
BaseListViewArray<LargeListViewType> {
+ public:
+ explicit LargeListViewArray(std::shared_ptr<ArrayData> data);
+
+ LargeListViewArray(std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<Buffer> value_offsets,
+ std::shared_ptr<Buffer> value_sizes,
std::shared_ptr<Array> values,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset =
0);
+
+ /// \brief Construct LargeListViewArray from array of offsets, sizes, and
child
+ /// value array
+ ///
+ /// Construct an LargeListViewArray using buffers from offsets and sizes
arrays
+ /// that project views into the values array.
+ ///
+ /// This function does the bare minimum of validation of the offsets/sizes
and
+ /// input types. The offset and length of the offsets and sizes arrays must
+ /// match and that will be checked, but their contents will be assumed to be
+ /// well-formed.
+ ///
+ /// If a null_bitmap is not provided, the nulls will be inferred from the
offsets' or
+ /// sizes' null bitmap. Only one of these two is allowed to have a null
bitmap. But if a
+ /// null_bitmap is provided, the offsets array and the sizes array can't
have nulls.
+ ///
+ /// And when a null_bitmap is provided, neither the offsets or sizes array
can be a
+ /// slice (i.e. an array with offset() > 0).
+ ///
+ /// \param[in] offsets An array of int64 offsets into the values array. NULL
values are
+ /// supported if the corresponding values in sizes is NULL or 0.
+ /// \param[in] sizes An array containing the int64 sizes of every view. NULL
values are
+ /// taken to represent a NULL list-view in the array being created.
+ /// \param[in] values Array containing list values
+ /// \param[in] pool MemoryPool
+ /// \param[in] null_bitmap Optional validity bitmap
+ /// \param[in] null_count Optional null count in null_bitmap
+ static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
+ const Array& offsets, const Array& sizes, const Array& values,
+ MemoryPool* pool = default_memory_pool(),
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount);
+
+ static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
+ std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
+ const Array& values, MemoryPool* pool = default_memory_pool(),
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount);
+
+ /// \brief Return an Array that is a concatenation of the large list-views
in this
+ /// array.
+ ///
+ /// Note that it's different from `values()` in that it takes into
+ /// consideration this array's offsets (which can be in any order)
+ /// and sizes. Nulls are skipped.
+ Result<std::shared_ptr<Array>> Flatten(
+ MemoryPool* memory_pool = default_memory_pool()) const;
+
+ /// \brief Return list-view offsets as an Int64Array
Review Comment:
Would recommend adding the same docstring description as in `ListViewArray`
above.
##########
cpp/src/arrow/array/array_nested.h:
##########
@@ -105,6 +103,23 @@ class BaseListArray : public Array {
const offset_type* raw_value_offsets_ = NULLPTR;
};
+// ----------------------------------------------------------------------
+// ListArray / LargeListArray
+
+template <typename TYPE>
+class BaseListArray : public VarLengthListLikeArray<TYPE> {
+ public:
+ using TypeClass = TYPE;
+ using offset_type = typename TYPE::offset_type;
+
+ const TypeClass* list_type() const { return
this->var_length_list_like_type(); }
+
+ offset_type value_length(int64_t i) const final {
Review Comment:
So, just to be sure, `final` will allow devirtualizing this method if the
object is known to be a `BaseListArray` or subclass thereof?
##########
cpp/src/arrow/ipc/test_common.cc:
##########
@@ -189,6 +189,30 @@ Status MakeRandomListArray(const std::shared_ptr<Array>&
child_array, int num_li
return MakeListArray<ListType>(child_array, num_lists, include_nulls, pool,
out);
}
+Status MakeRandomListViewArray(const std::shared_ptr<Array>& child_array, int
num_lists,
+ bool include_nulls, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ const auto seed = static_cast<uint32_t>(child_array->length());
+ random::RandomArrayGenerator rand(seed);
+
+ const double null_probability = include_nulls ? 0.5 : 0.0;
+ *out = rand.ListView(*child_array, num_lists, null_probability, false, 0.9,
+ kDefaultBufferAlignment, pool);
+ return Status::OK();
+}
+
+Status MakeRandomLargeListViewArray(const std::shared_ptr<Array>& child_array,
+ int num_lists, bool include_nulls,
MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ const auto seed = static_cast<uint32_t>(child_array->length());
+ random::RandomArrayGenerator rand(seed);
+
+ const double null_probability = include_nulls ? 0.5 : 0.0;
+ *out = rand.LargeListView(*child_array, num_lists, null_probability, false,
0.9,
Review Comment:
Same here.
##########
cpp/src/arrow/ipc/test_common.cc:
##########
@@ -189,6 +189,30 @@ Status MakeRandomListArray(const std::shared_ptr<Array>&
child_array, int num_li
return MakeListArray<ListType>(child_array, num_lists, include_nulls, pool,
out);
}
+Status MakeRandomListViewArray(const std::shared_ptr<Array>& child_array, int
num_lists,
+ bool include_nulls, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ const auto seed = static_cast<uint32_t>(child_array->length());
+ random::RandomArrayGenerator rand(seed);
+
+ const double null_probability = include_nulls ? 0.5 : 0.0;
+ *out = rand.ListView(*child_array, num_lists, null_probability, false, 0.9,
Review Comment:
Can you mention argument names when they would otherwise be difficult to
make out?
```suggestion
*out = rand.ListView(*child_array, num_lists, null_probability, /*xxx=*/
false, /*yyy=*/ 0.9,
```
##########
cpp/src/arrow/ipc/writer.cc:
##########
@@ -442,6 +502,37 @@ class RecordBatchSerializer {
// Must also slice the values
values = values->Slice(values_offset, values_length);
}
+ --max_recursion_depth_;
+ RETURN_NOT_OK(VisitArray(*values));
+ ++max_recursion_depth_;
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_list_view<typename T::TypeClass, Status> Visit(const T& array) {
+ using offset_type = typename T::offset_type;
+
+ offset_type min_offset = 0;
+ offset_type max_end = 0;
+ {
+ std::shared_ptr<Buffer> value_offsets;
+ RETURN_NOT_OK(
+ GetZeroBasedListViewOffsets<T>(array, &value_offsets, &min_offset,
&max_end));
Review Comment:
Does it mean we're attempting to compact the values array before sending it?
This doesn't seem required, is it?
##########
cpp/src/arrow/testing/random.cc:
##########
@@ -811,6 +1021,12 @@ std::shared_ptr<Array>
RandomArrayGenerator::ArrayOf(const Field& field, int64_t
return *ARRAY_TYPE::FromArrays(field.type(), *offsets, *values);
\
}
+#define GENERATE_LIST_VIEW_CASE(ARRAY_TYPE)
\
Review Comment:
Can you `#undef` it afterwards?
##########
cpp/src/arrow/testing/random.cc:
##########
@@ -608,6 +609,218 @@ std::shared_ptr<Array>
OffsetsFromLengthsArray(OffsetArrayType* lengths,
std::make_shared<typename OffsetArrayType::TypeClass>(), size, buffers,
null_count);
return std::make_shared<OffsetArrayType>(array_data);
}
+
+// Helper for RandomArrayGenerator::ArrayOf: extract some C value from
+// a given metadata key.
+template <typename T, typename ArrowType = typename CTypeTraits<T>::ArrowType>
+enable_if_parameter_free<ArrowType, T> GetMetadata(const KeyValueMetadata*
metadata,
+ const std::string& key,
+ T default_value) {
+ if (!metadata) return default_value;
+ const auto index = metadata->FindKey(key);
+ if (index < 0) return default_value;
+ const auto& value = metadata->value(index);
+ T output{};
+ if (!internal::ParseValue<ArrowType>(value.data(), value.length(), &output))
{
+ ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value, " as ",
+ ArrowType::type_name()));
+ }
+ return output;
+}
+
+/// \brief Shuffle a list-view array in place using the Fisher–Yates algorithm
[1].
+///
+/// [1]
https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm
+///
+/// \param[in] seed The seed for the random number generator
+/// \param[in,out] data The array to shuffle
+template <typename ListViewType>
+void ShuffleListViewDataInPlace(SeedType seed, ArrayData& data) {
+ DCHECK_EQ(data.type->id(), ListViewType::type_id);
+ using offset_type = typename ListViewType::offset_type;
+
+ auto* validity = data.GetMutableValues<uint8_t>(0, 0);
+ auto* offsets = data.GetMutableValues<offset_type>(1);
+ auto* sizes = data.GetMutableValues<offset_type>(2);
+
+ pcg32_fast rng(seed);
+ using UniformDist = std::uniform_int_distribution<int64_t>;
+ UniformDist dist;
+ for (int64_t i = data.length - 1; i > 0; --i) {
+ const auto j = dist(rng, UniformDist::param_type(0, i));
+ if (ARROW_PREDICT_TRUE(i != j)) {
+ // Swap validity bits
+ if (validity) {
+ const bool valid_i = bit_util::GetBit(validity, data.offset + i);
+ const bool valid_j = bit_util::GetBit(validity, data.offset + i);
+ if (valid_i != valid_j) {
+ bit_util::SetBitTo(validity, data.offset + i, valid_j);
+ bit_util::SetBitTo(validity, data.offset + j, valid_i);
+ }
+ }
+ // Swap offsets and sizes
+ std::swap(offsets[i], offsets[j]);
+ std::swap(sizes[i], sizes[j]);
+ }
+ }
+}
+
+/// \brief Generate the list-view offsets based on a random buffer of sizes.
+///
+/// The sizes buffer is an input of this function, but when force_empty_nulls
is true,
+/// some values on the sizes buffer can be set to 0.
+///
+/// When sparsity is 0.0, the list-view spans are perfectly packed one after
the
+/// other. If sparsity is greater than 0.0, the list-view spans are set apart
+/// from each other in proportion to the sparsity value and size of each
+/// list-view. A negative sparsity means each list-view shares a fraction of
the
+/// values used by the previous list-view.
+///
+/// For instance, a sparsity of -1.0 means the values array will only need
enough values
+/// for the largest list-view with all the other list-views spanning some of
these same
+/// values.
+///
+/// \param[in] seed The seed for the random number generator
+/// \param[in,out] mutable_sizes_array The array of sizes to use
+/// \param[in] force_empty_nulls Whether to force null list-view sizes to be 0
+/// \param[in] zero_undefined_offsets Whether to zero the offsets of
list-views that have
+/// 0 set as the size
+/// \param[in] sparsity The sparsity of the generated list-view offsets
+/// \param[out] out_max_view_end The maximum value of the end of a list-view
+template <typename OffsetArrayType, typename offset_type>
+std::shared_ptr<Array> ViewOffsetsFromLengthsArray(
+ SeedType seed, OffsetArrayType& mutable_sizes_array, bool
force_empty_nulls,
+ bool zero_undefined_offsets, double sparsity, int64_t* out_max_view_end,
+ int64_t alignment, MemoryPool* memory_pool) {
+ using TypeClass = typename OffsetArrayType::TypeClass;
+
+ auto* sizes = mutable_sizes_array.data()->template
GetMutableValues<offset_type>(1);
+
+ BufferVector buffers{2};
+ buffers[0] = NULLPTR; // sizes can have nulls, offsets don't have to
+ buffers[1] = *AllocateBuffer(sizeof(offset_type) *
mutable_sizes_array.length(),
+ alignment, memory_pool);
+ auto offsets = buffers[1]->mutable_data_as<offset_type>();
+
+ double offset_base = 0.0;
+ offset_type max_view_end = 0;
+ for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) {
+ const auto offset = static_cast<offset_type>(std::llround(offset_base));
+ if (mutable_sizes_array.IsNull(i)) {
+ if (force_empty_nulls) {
+ sizes[i] = 0;
+ }
+ offsets[i] = zero_undefined_offsets ? 0 : offset;
+ } else {
+ if (sizes[i] == 0) {
+ offsets[i] = zero_undefined_offsets ? 0 : offset;
+ } else {
+ offsets[i] = offset;
+ DCHECK_LT(offset, std::numeric_limits<offset_type>::max() - sizes[i]);
+ offset_base = std::max(0.0, offset_base + (sparsity * sizes[i]));
+ }
+ }
+ max_view_end = std::max(max_view_end, offsets[i] + sizes[i]);
+ }
+ *out_max_view_end = max_view_end;
+
+ auto array_data =
+ ArrayData::Make(TypeTraits<TypeClass>::type_singleton(),
+ mutable_sizes_array.length(), std::move(buffers),
/*null_count=*/0);
+ return std::make_shared<OffsetArrayType>(std::move(array_data));
+}
+
+template <typename ArrayType, typename RAG>
+Result<std::shared_ptr<Array>> ArrayOfListView(RAG& self, const Field& field,
Review Comment:
Hmm... why do `RandomListView` and `ArrayOfListView` use two different
approaches? This is more code to maintain and understand.
(I'm not sure which approach is better, but ideally we would decide on a
common one)
##########
cpp/src/arrow/array/builder_nested.h:
##########
@@ -40,37 +40,46 @@ namespace arrow {
/// @{
// ----------------------------------------------------------------------
-// List builder
+// VarLengthListLikeBuilder
template <typename TYPE>
-class BaseListBuilder : public ArrayBuilder {
+class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder {
Review Comment:
Fair enough, let's not bikeshed this :-)
##########
cpp/src/arrow/util/list_util.cc:
##########
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/list_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string.h"
+
+namespace arrow::list_util {
+
+namespace internal {
+
+namespace {
+
+using arrow::internal::checked_cast;
+using arrow::internal::ReverseSetBitRunReader;
+using arrow::internal::SetBitRunReader;
+
+/// \pre input.length() > 0 && input.null_count() != input.length()
+/// \param input A LIST_VIEW or LARGE_LIST_VIEW array
+template <typename offset_type>
+std::optional<int64_t> MinViewOffset(const ArraySpan& input) {
+ const uint8_t* validity = input.buffers[0].data;
+ const auto* offsets = input.GetValues<offset_type>(1);
+ const auto* sizes = input.GetValues<offset_type>(2);
+
+ // Make an access to the sizes buffer only when strictly necessary.
+#define MINIMIZE_MIN_VIEW_OFFSET(i) \
+ auto offset = offsets[i]; \
+ if (min_offset.has_value()) { \
+ if (offset < *min_offset && sizes[i] > 0) { \
+ if (offset == 0) { \
+ return 0; \
+ } \
+ min_offset = offset; \
+ } \
+ } else { \
+ if (sizes[i] > 0) { \
+ if (offset == 0) { \
+ return 0; \
+ } \
+ min_offset = offset; \
+ } \
+ }
+
+ std::optional<offset_type> min_offset;
+ if (validity == nullptr) {
+ for (int64_t i = 0; i < input.length; i++) {
+ MINIMIZE_MIN_VIEW_OFFSET(i);
+ }
+ } else {
+ SetBitRunReader reader(validity, input.offset, input.length);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ for (int64_t i = run.position; i < run.position + run.length; ++i) {
+ MINIMIZE_MIN_VIEW_OFFSET(i);
+ }
+ }
+ }
+ return min_offset;
+
+#undef MINIMIZE_MIN_VIEW_OFFSET
+}
+
+/// \pre input.length() > 0 && input.null_count() != input.length()
+/// \param input A LIST_VIEW or LARGE_LIST_VIEW array
+template <typename offset_type>
+int64_t MaxViewEnd(const ArraySpan& input) {
+ constexpr auto kInt64Max = std::numeric_limits<int64_t>::max();
+ const auto values_length = input.child_data[0].length;
+
+ const uint8_t* validity = input.buffers[0].data;
+ const auto* offsets = input.GetValues<offset_type>(1);
+ const auto* sizes = input.GetValues<offset_type>(2);
+
+ // Early-exit: 64-bit overflow detected. This is not possible on a valid
list-view,
+ // but we return the maximum possible value to avoid undefined behavior.
Review Comment:
Unless there's a specific condition where this can be called on invalid data
(for example when reading IPC data), we don't care that UB would happen if the
list-view is invalid.
##########
cpp/src/arrow/array/util.cc:
##########
@@ -383,6 +393,12 @@ class NullArrayFactory {
return Status::OK();
}
+ template <typename T>
+ enable_if_list_view<T, Status> Visit(const T&) {
+ buffer_length_ = length_ * sizeof(typename T::offset_type);
Review Comment:
Also remember to recurse on the child array as in the var_size_list case
above (though it seems it should call `GetBufferLength` with length 0, not
`length_`... @bkietz WDYT ?).
##########
cpp/src/arrow/array/builder_nested.h:
##########
@@ -191,20 +191,129 @@ class BaseListBuilder : public ArrayBuilder {
return
std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
}
+ private:
+ static constexpr const char* type_name() {
+ if constexpr (is_list_view(TYPE::type_id)) {
+ return "ListView";
+ } else {
+ return "List";
+ }
+ }
+
protected:
+ /// \brief Append dimensions for num_values empty list slots.
+ ///
+ /// ListViewBuilder overrides this to also append the sizes.
+ virtual void UnsafeAppendEmptyDimensions(int64_t num_values) {
+ const int64_t offset = value_builder_->length();
+ for (int64_t i = 0; i < num_values; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+ }
+ }
+
+ /// \brief Append dimensions for a single list slot.
+ ///
+ /// ListViewBuilder overrides this to also append the size.
+ virtual void UnsafeAppendDimensions(int64_t offset, int64_t size) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+ }
+
TypedBufferBuilder<offset_type> offsets_builder_;
std::shared_ptr<ArrayBuilder> value_builder_;
std::shared_ptr<Field> value_field_;
+};
+
+// ----------------------------------------------------------------------
+// ListBuilder / LargeListBuilder
+
+template <typename TYPE>
+class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder<TYPE> {
+ private:
+ using BASE = VarLengthListLikeBuilder<TYPE>;
+
+ public:
+ using TypeClass = TYPE;
+ using offset_type = typename BASE::offset_type;
+
+ using BASE::BASE;
+
+ using BASE::Append;
+
+ ~BaseListBuilder() override = default;
+
+ /// \brief Start a new variable-length list slot
+ ///
+ /// This function should be called before beginning to append elements to the
+ /// value builder
+ ///
+ /// Prefer Append(is_valid, 0) as that works correctly for list-view types
+ /// as well as list types.
+ Status Append(bool is_valid = true) { return BASE::Append(is_valid, 0); }
+
+ /// \brief Vector append
+ ///
+ /// If passed, valid_bytes is of equal length to values, and any zero byte
+ /// will be considered as a null for that slot
+ Status AppendValues(const offset_type* offsets, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ ARROW_RETURN_NOT_OK(this->Reserve(length));
+ this->UnsafeAppendToBitmap(valid_bytes, length);
+ this->offsets_builder_.UnsafeAppend(offsets, length);
+ return Status::OK();
+ }
+
+ Status AppendValues(const offset_type* offsets, const offset_type* sizes,
+ int64_t length, const uint8_t* valid_bytes) final {
+ // offsets are assumed to be valid, but the first lenght-1 sizes have to be
+ // consistent with the offsets to rule out the possibility that the caller
+ // is passing sizes that could work if building a list-view, but don't work
+ // on building a list that requires offsets to be non-decreasing.
+ if (sizes) {
Review Comment:
Hmm, I'm coming back to this discussion and I'm struggling to understand
this again.
This seems to hint that List builders and ListView builders are different
enough that they should probably be different beasts entirely instead of trying
to maximize API and code sharing. What do you think?
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -797,57 +811,147 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ private:
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsets(int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ auto prev_offset = offsets[0];
+ if (prev_offset < 0) {
+ return Status::Invalid("Offset invariant failure: array starts at
negative offset ",
+ prev_offset);
+ }
+ for (int64_t i = 1; i <= data.length; ++i) {
+ const auto current_offset = offsets[i];
+ if (current_offset < prev_offset) {
+ return Status::Invalid("Offset invariant failure: non-monotonic offset
at slot ",
+ i, ": ", current_offset, " < ", prev_offset);
+ }
+ if (current_offset > offset_limit) {
+ return Status::Invalid("Offset invariant failure: offset for slot ", i,
+ " out of bounds: ", current_offset, " > ",
offset_limit);
+ }
+ prev_offset = current_offset;
+ }
+ return Status::OK();
+ }
+
+ template <typename offset_type>
+ Status OutOfBoundsListViewOffset(int64_t slot, int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto offset = offsets[slot];
+ return Status::Invalid("Offset invariant failure: offset for slot ", slot,
+ " out of bounds. Expected ", offset,
+ " to be at least 0 and less than ", offset_limit);
+ }
+
+ template <typename offset_type>
+ Status OutOfBoundsListViewSize(int64_t slot, int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto* sizes = data.GetValues<offset_type>(2);
+ const auto size = sizes[slot];
+ if (size < 0) {
+ return Status::Invalid("Offset invariant failure: size for slot ", slot,
+ " out of bounds: ", size, " < 0");
+ } else {
+ const auto offset = offsets[slot];
+ return Status::Invalid("Offset invariant failure: size for slot ", slot,
+ " out of bounds: ", offset, " + ", size, " > ",
+ offset_limit);
+ }
+ }
+
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsetsAndSizes(int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto* sizes = data.GetValues<offset_type>(2);
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ const auto size = sizes[i];
+ if (size >= 0) {
+ const auto offset = offsets[i];
+ if (offset < 0 || offset > offset_limit) {
+ return OutOfBoundsListViewOffset<offset_type>(i, offset_limit);
+ }
+ if (size > offset_limit - offset) {
+ return OutOfBoundsListViewSize<offset_type>(i, offset_limit);
+ }
+ } else {
+ return OutOfBoundsListViewSize<offset_type>(i, offset_limit);
+ }
+ }
+
+ return Status::OK();
+ }
+
template <typename TypeClass>
- Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
+ Status ValidateOffsetsAndMaybeSizes(const TypeClass&, int64_t offset_limit) {
using offset_type = typename TypeClass::offset_type;
+ constexpr bool is_list_view = is_list_view_type<TypeClass>::value;
- if (!IsBufferValid(1)) {
- // For length 0, an empty offsets buffer seems accepted as a special case
- // (ARROW-544)
- if (data.length > 0) {
- return Status::Invalid("Non-empty array but offsets are null");
+ const bool non_empty = data.length > 0;
+ if constexpr (is_list_view) {
+ if (!IsBufferValid(1)) {
+ // For length 0, an empty offsets buffer is accepted (ARROW-544).
Review Comment:
Remove this comment?
##########
cpp/src/arrow/array/util.cc:
##########
@@ -383,6 +393,12 @@ class NullArrayFactory {
return Status::OK();
}
+ template <typename T>
+ enable_if_list_view<T, Status> Visit(const T&) {
+ buffer_length_ = length_ * sizeof(typename T::offset_type);
Review Comment:
Shouldn't you call `MaxOf` instead of setting `buffer_length_` directly?
##########
cpp/src/arrow/ipc/writer.cc:
##########
@@ -350,6 +350,67 @@ class RecordBatchSerializer {
return Status::OK();
}
+ template <typename ArrayType, typename offset_type = typename
ArrayType::offset_type>
+ Status GetZeroBasedListViewOffsets(const ArrayType& array,
+ std::shared_ptr<Buffer>*
out_value_offsets,
+ offset_type* out_min_offset,
+ offset_type* out_max_end) {
+ auto offsets = array.value_offsets();
+ auto sizes = array.value_sizes();
+
+ const int64_t required_bytes = sizeof(offset_type) * array.length();
+ if (array.offset() != 0) {
+ // If we have a non-zero offset, it's likely that the smallest offset is
+ // not zero. We must a) create a new offsets array with shifted offsets
and
+ // b) slice the values array accordingly.
+
+ ARROW_ASSIGN_OR_RAISE(auto shifted_offsets,
+ AllocateBuffer(required_bytes,
options_.memory_pool));
+ offset_type min_offset = 0;
+ offset_type max_end = 0;
+ if (array.length() > 0) {
+ min_offset = std::numeric_limits<offset_type>::max();
+ for (int i = 0; i < array.length(); ++i) {
+ min_offset = std::min(min_offset, array.value_offset(i));
+ max_end = std::max(max_end, array.value_offset(i) +
array.value_length(i));
+ }
+ }
+
+ auto* dest_offsets = shifted_offsets->mutable_data_as<offset_type>();
+
+ for (int i = 0; i < array.length(); ++i) {
+ dest_offsets[i] = array.value_offset(i) - min_offset;
+ }
+ *out_min_offset = min_offset;
+ *out_max_end = max_end;
+ offsets = std::move(shifted_offsets);
+ } else {
+ // ARROW-6046: Slice offsets to used extent, in case we have a truncated
+ // slice
+ if (offsets != nullptr && offsets->size() > required_bytes) {
+ offsets = SliceBuffer(offsets, 0, required_bytes);
+ }
+ *out_min_offset = 0;
+ *out_max_end = static_cast<offset_type>(array.values()->length());
+ }
+ *out_value_offsets = std::move(offsets);
+ return Status::OK();
+ }
+
+ template <typename ArrayType, typename offset_type = typename
ArrayType::offset_type>
+ Status GetListViewSizes(const ArrayType& array,
+ std::shared_ptr<Buffer>* out_value_sizes) {
+ const int64_t required_bytes = sizeof(offset_type) * array.length();
+ auto sizes = array.value_sizes();
+ if (sizes != nullptr && (array.offset() != 0 || sizes->size() >
required_bytes)) {
Review Comment:
Why would `sizes` be null? Also, the second condition seems a bit pedantic,
since slicing is cheap.
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -797,57 +811,147 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ private:
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsets(int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ auto prev_offset = offsets[0];
+ if (prev_offset < 0) {
+ return Status::Invalid("Offset invariant failure: array starts at
negative offset ",
+ prev_offset);
+ }
+ for (int64_t i = 1; i <= data.length; ++i) {
+ const auto current_offset = offsets[i];
+ if (current_offset < prev_offset) {
+ return Status::Invalid("Offset invariant failure: non-monotonic offset
at slot ",
+ i, ": ", current_offset, " < ", prev_offset);
+ }
+ if (current_offset > offset_limit) {
+ return Status::Invalid("Offset invariant failure: offset for slot ", i,
+ " out of bounds: ", current_offset, " > ",
offset_limit);
+ }
+ prev_offset = current_offset;
+ }
+ return Status::OK();
+ }
+
+ template <typename offset_type>
+ Status OutOfBoundsListViewOffset(int64_t slot, int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto offset = offsets[slot];
+ return Status::Invalid("Offset invariant failure: offset for slot ", slot,
+ " out of bounds. Expected ", offset,
+ " to be at least 0 and less than ", offset_limit);
+ }
+
+ template <typename offset_type>
+ Status OutOfBoundsListViewSize(int64_t slot, int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto* sizes = data.GetValues<offset_type>(2);
+ const auto size = sizes[slot];
+ if (size < 0) {
+ return Status::Invalid("Offset invariant failure: size for slot ", slot,
+ " out of bounds: ", size, " < 0");
+ } else {
+ const auto offset = offsets[slot];
+ return Status::Invalid("Offset invariant failure: size for slot ", slot,
+ " out of bounds: ", offset, " + ", size, " > ",
+ offset_limit);
+ }
+ }
+
+ /// \pre basic validation has already been performed
+ template <typename offset_type>
+ Status FullyValidateOffsetsAndSizes(int64_t offset_limit) {
+ const auto* offsets = data.GetValues<offset_type>(1);
+ const auto* sizes = data.GetValues<offset_type>(2);
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ const auto size = sizes[i];
+ if (size >= 0) {
+ const auto offset = offsets[i];
+ if (offset < 0 || offset > offset_limit) {
+ return OutOfBoundsListViewOffset<offset_type>(i, offset_limit);
+ }
+ if (size > offset_limit - offset) {
+ return OutOfBoundsListViewSize<offset_type>(i, offset_limit);
+ }
+ } else {
+ return OutOfBoundsListViewSize<offset_type>(i, offset_limit);
+ }
+ }
+
+ return Status::OK();
+ }
+
template <typename TypeClass>
- Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
+ Status ValidateOffsetsAndMaybeSizes(const TypeClass&, int64_t offset_limit) {
using offset_type = typename TypeClass::offset_type;
+ constexpr bool is_list_view = is_list_view_type<TypeClass>::value;
- if (!IsBufferValid(1)) {
- // For length 0, an empty offsets buffer seems accepted as a special case
- // (ARROW-544)
- if (data.length > 0) {
- return Status::Invalid("Non-empty array but offsets are null");
+ const bool non_empty = data.length > 0;
+ if constexpr (is_list_view) {
+ if (!IsBufferValid(1)) {
+ // For length 0, an empty offsets buffer is accepted (ARROW-544).
+ return Status::Invalid("offsets buffer is null");
+ }
+ if (!IsBufferValid(2)) {
+ return Status::Invalid("sizes buffer is null");
+ }
+ } else {
+ if (!IsBufferValid(1)) {
+ // For length 0, an empty offsets buffer is accepted (ARROW-544).
+ return non_empty ? Status::Invalid("Non-empty array but offsets are
null")
+ : Status::OK();
}
- return Status::OK();
}
- // An empty list array can have 0 offsets
const auto offsets_byte_size = data.buffers[1]->size();
const auto required_offsets = ((data.length > 0) || (offsets_byte_size >
0))
- ? data.length + data.offset + 1
+ ? data.length + data.offset +
(is_list_view ? 0 : 1)
: 0;
if (offsets_byte_size / static_cast<int32_t>(sizeof(offset_type)) <
required_offsets) {
return Status::Invalid("Offsets buffer size (bytes): ",
offsets_byte_size,
" isn't large enough for length: ", data.length,
" and offset: ", data.offset);
}
+ if constexpr (is_list_view) {
+ const auto required_sizes = data.length + data.offset;
+ const auto sizes_bytes_size = data.buffers[2]->size();
+ if (sizes_bytes_size / static_cast<int32_t>(sizeof(offset_type)) <
required_sizes) {
+ return Status::Invalid("Sizes buffer size (bytes): ", sizes_bytes_size,
+ " isn't large enough for length: ", data.length,
+ " and offset: ", data.offset);
+ }
+ }
if (full_validation && required_offsets > 0) {
- // Validate all offset values
- const offset_type* offsets = data.GetValues<offset_type>(1);
-
- auto prev_offset = offsets[0];
- if (prev_offset < 0) {
- return Status::Invalid(
- "Offset invariant failure: array starts at negative offset ",
prev_offset);
- }
- for (int64_t i = 1; i <= data.length; ++i) {
- const auto current_offset = offsets[i];
- if (current_offset < prev_offset) {
- return Status::Invalid(
- "Offset invariant failure: non-monotonic offset at slot ", i, ":
",
- current_offset, " < ", prev_offset);
- }
- if (current_offset > offset_limit) {
- return Status::Invalid("Offset invariant failure: offset for slot ",
i,
- " out of bounds: ", current_offset, " > ",
offset_limit);
- }
- prev_offset = current_offset;
+ if constexpr (is_list_view) {
+ return FullyValidateOffsetsAndSizes<offset_type>(offset_limit);
+ } else {
+ return FullyValidateOffsets<offset_type>(offset_limit);
}
}
return Status::OK();
}
+ public:
+ template <typename TypeClass>
+ enable_if_list_view<TypeClass, Status> ValidateOffsetsAndSizes(const
TypeClass& type,
+ int64_t
offset_limit) {
+ return ValidateOffsetsAndMaybeSizes<TypeClass>(type, offset_limit);
+ }
+
+ template <typename TypeClass>
+ std::enable_if_t<is_var_length_list_type<TypeClass>::value ||
+ is_base_binary_like(TypeClass::type_id),
+ Status>
+ ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
+ return ValidateOffsetsAndMaybeSizes<TypeClass>(type, offset_limit);
+ }
Review Comment:
I'm not sure I understand the point of these two indirections.
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -23,7 +23,7 @@
#include "arrow/extension_type.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_run_reader.h"
Review Comment:
Was it a problem in the existing code?
##########
cpp/src/arrow/array/util.cc:
##########
@@ -853,6 +885,13 @@ class RepeatedArrayFactory {
return builder.Finish(out);
}
+ template <typename IntType>
+ Status CreateIntBuffer(IntType value, std::shared_ptr<Buffer>* out) {
Review Comment:
Can return `Result<std::shared_ptr<Buffer>>` for more convenient usage.
##########
cpp/src/arrow/c/bridge_test.cc:
##########
@@ -1930,6 +2004,33 @@ TEST_F(TestSchemaImport, NestedList) {
CheckImport(list(fixed_size_list(int8(), 3)));
}
+TEST_F(TestSchemaImport, ListView) {
+ FillPrimitive(AddChild(), "c");
+ FillListLike("+vl");
+ CheckImport(list_view(int8()));
+
+ FillPrimitive(AddChild(), "s", "item", 0);
+ FillListLike("+vl");
+ CheckImport(list_view(field("item", int16(), /*nullable=*/false)));
+
+ // Large list-view
+ FillPrimitive(AddChild(), "s");
+ FillListLike("+vL");
+ CheckImport(large_list_view(int16()));
+}
+
+TEST_F(TestSchemaImport, NestedListView) {
+ FillPrimitive(AddChild(), "c");
+ FillListLike(AddChild(), "+vl");
+ FillListLike("+vL");
+ CheckImport(large_list_view(list_view(int8())));
+
+ FillPrimitive(AddChild(), "c");
+ FillListLike(AddChild(), "+w:3");
+ FillListLike("+vl");
+ CheckImport(list_view(fixed_size_list(int8(), 3)));
+}
+
Review Comment:
Are we missing tests for array import and roundtrip?
##########
cpp/src/arrow/util/list_util.h:
##########
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace list_util {
+
+/// \brief Get the child array holding the values from a List or ListView array
+inline const ArraySpan& ValuesArray(const ArraySpan& span) { return
span.child_data[0]; }
+
+namespace internal {
+
+/// \brief Calculate the smallest continuous range of values used by the
+/// var-length list-like input (list, map and list-view types).
+///
+/// \param input The input array such that is_var_length_list_like(input.type)
+/// is true
+/// \return A pair of (offset, length) describing the range
+ARROW_EXPORT Result<std::pair<int64_t, int64_t>> RangeOfValuesUsed(
+ const ArraySpan& input);
+
+/// \brief Calculate the sum of the sizes of all valid lists or list-views
+///
+/// This is usally the same as the length of the RangeOfValuesUsed() range, but
+/// it can be:
+/// - Smaller: when the child array constains many values that are not
+/// referenced by the lists or list-views in the parent array
+/// - Greater: when the list-views share child array ranges
+///
+/// \param input The input array such that is_var_length_list_like(input.type)
+/// is true
+/// \return The sum of all list or list-view sizes
+ARROW_EXPORT Result<int64_t> SumOfLogicalListSizes(const ArraySpan& input);
+
+/// \brief Build a ListViewArray from a ListArray
+ARROW_EXPORT Result<std::shared_ptr<ListViewArray>> ListViewFromList(
+ const ListArray& source, MemoryPool* pool);
+
+/// \brief Build a LargeListViewArray from a LargeListArray
+ARROW_EXPORT Result<std::shared_ptr<LargeListViewArray>> ListViewFromList(
+ const LargeListArray& source, MemoryPool* pool);
+
+/// \brief Build a ListArray from a ListViewArray
+ARROW_EXPORT Result<std::shared_ptr<ListArray>> ListFromListView(
+ const ListViewArray& source, MemoryPool* pool);
+
+/// \brief Build a LargeListArray from a LargeListViewArray
+ARROW_EXPORT Result<std::shared_ptr<LargeListArray>> ListFromListView(
+ const LargeListViewArray& source, MemoryPool* pool);
Review Comment:
It would seem more logical to me to have these as static methods of
`ListArray` and `ListViewArray`.
##########
cpp/src/arrow/util/list_util.h:
##########
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace list_util {
+
+/// \brief Get the child array holding the values from a List or ListView array
+inline const ArraySpan& ValuesArray(const ArraySpan& span) { return
span.child_data[0]; }
Review Comment:
This honestly looks like a pointless indirection to me, though I wouldn't
strongly oppose it either.
(also, should it be `constexpr`?)
##########
cpp/src/arrow/testing/random.cc:
##########
@@ -608,6 +609,218 @@ std::shared_ptr<Array>
OffsetsFromLengthsArray(OffsetArrayType* lengths,
std::make_shared<typename OffsetArrayType::TypeClass>(), size, buffers,
null_count);
return std::make_shared<OffsetArrayType>(array_data);
}
+
+// Helper for RandomArrayGenerator::ArrayOf: extract some C value from
+// a given metadata key.
+template <typename T, typename ArrowType = typename CTypeTraits<T>::ArrowType>
+enable_if_parameter_free<ArrowType, T> GetMetadata(const KeyValueMetadata*
metadata,
+ const std::string& key,
+ T default_value) {
+ if (!metadata) return default_value;
+ const auto index = metadata->FindKey(key);
+ if (index < 0) return default_value;
+ const auto& value = metadata->value(index);
+ T output{};
+ if (!internal::ParseValue<ArrowType>(value.data(), value.length(), &output))
{
+ ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value, " as ",
+ ArrowType::type_name()));
+ }
+ return output;
+}
+
+/// \brief Shuffle a list-view array in place using the Fisher–Yates algorithm
[1].
+///
+/// [1]
https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm
+///
+/// \param[in] seed The seed for the random number generator
+/// \param[in,out] data The array to shuffle
+template <typename ListViewType>
+void ShuffleListViewDataInPlace(SeedType seed, ArrayData& data) {
+ DCHECK_EQ(data.type->id(), ListViewType::type_id);
+ using offset_type = typename ListViewType::offset_type;
+
+ auto* validity = data.GetMutableValues<uint8_t>(0, 0);
+ auto* offsets = data.GetMutableValues<offset_type>(1);
+ auto* sizes = data.GetMutableValues<offset_type>(2);
+
+ pcg32_fast rng(seed);
+ using UniformDist = std::uniform_int_distribution<int64_t>;
+ UniformDist dist;
+ for (int64_t i = data.length - 1; i > 0; --i) {
+ const auto j = dist(rng, UniformDist::param_type(0, i));
Review Comment:
I didn't know this two-operand form :-o
##########
cpp/src/arrow/util/list_util.cc:
##########
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/list_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string.h"
+
+namespace arrow::list_util {
+
+namespace internal {
+
+namespace {
+
+using arrow::internal::checked_cast;
+using arrow::internal::ReverseSetBitRunReader;
+using arrow::internal::SetBitRunReader;
+
+/// \pre input.length() > 0 && input.null_count() != input.length()
+/// \param input A LIST_VIEW or LARGE_LIST_VIEW array
+template <typename offset_type>
+std::optional<int64_t> MinViewOffset(const ArraySpan& input) {
+ const uint8_t* validity = input.buffers[0].data;
+ const auto* offsets = input.GetValues<offset_type>(1);
+ const auto* sizes = input.GetValues<offset_type>(2);
+
+ // Make an access to the sizes buffer only when strictly necessary.
+#define MINIMIZE_MIN_VIEW_OFFSET(i) \
Review Comment:
We should ideally keep macros trivial. Here, it can be replaced with a
capturing lambda.
##########
cpp/src/arrow/util/list_util.cc:
##########
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/list_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string.h"
+
+namespace arrow::list_util {
+
+namespace internal {
+
+namespace {
+
+using arrow::internal::checked_cast;
+using arrow::internal::ReverseSetBitRunReader;
+using arrow::internal::SetBitRunReader;
+
+/// \pre input.length() > 0 && input.null_count() != input.length()
+/// \param input A LIST_VIEW or LARGE_LIST_VIEW array
+template <typename offset_type>
+std::optional<int64_t> MinViewOffset(const ArraySpan& input) {
+ const uint8_t* validity = input.buffers[0].data;
+ const auto* offsets = input.GetValues<offset_type>(1);
+ const auto* sizes = input.GetValues<offset_type>(2);
+
+ // Make an access to the sizes buffer only when strictly necessary.
+#define MINIMIZE_MIN_VIEW_OFFSET(i) \
+ auto offset = offsets[i]; \
+ if (min_offset.has_value()) { \
+ if (offset < *min_offset && sizes[i] > 0) { \
+ if (offset == 0) { \
+ return 0; \
+ } \
+ min_offset = offset; \
+ } \
+ } else { \
+ if (sizes[i] > 0) { \
+ if (offset == 0) { \
+ return 0; \
+ } \
+ min_offset = offset; \
+ } \
+ }
+
+ std::optional<offset_type> min_offset;
+ if (validity == nullptr) {
Review Comment:
Can simply use `VisitSetBitRunsVoid` which automates this scaffolding:
```c++
VisitSetBitRunsVoid(validity, input.offset, input.length(), [&](int64_t
start, int64t_ length) { ... });
```
##########
cpp/src/arrow/testing/random.cc:
##########
@@ -608,6 +609,218 @@ std::shared_ptr<Array>
OffsetsFromLengthsArray(OffsetArrayType* lengths,
std::make_shared<typename OffsetArrayType::TypeClass>(), size, buffers,
null_count);
return std::make_shared<OffsetArrayType>(array_data);
}
+
+// Helper for RandomArrayGenerator::ArrayOf: extract some C value from
+// a given metadata key.
+template <typename T, typename ArrowType = typename CTypeTraits<T>::ArrowType>
+enable_if_parameter_free<ArrowType, T> GetMetadata(const KeyValueMetadata*
metadata,
+ const std::string& key,
+ T default_value) {
+ if (!metadata) return default_value;
+ const auto index = metadata->FindKey(key);
+ if (index < 0) return default_value;
+ const auto& value = metadata->value(index);
+ T output{};
+ if (!internal::ParseValue<ArrowType>(value.data(), value.length(), &output))
{
+ ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value, " as ",
+ ArrowType::type_name()));
+ }
+ return output;
+}
+
+/// \brief Shuffle a list-view array in place using the Fisher–Yates algorithm
[1].
+///
+/// [1]
https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm
+///
+/// \param[in] seed The seed for the random number generator
+/// \param[in,out] data The array to shuffle
+template <typename ListViewType>
+void ShuffleListViewDataInPlace(SeedType seed, ArrayData& data) {
Review Comment:
Ideally we would avoid mutable references.
```suggestion
void ShuffleListViewDataInPlace(SeedType seed, ArrayData* data) {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]