pitrou commented on code in PR #35345:
URL: https://github.com/apache/arrow/pull/35345#discussion_r1394386948


##########
cpp/src/arrow/array/util.cc:
##########
@@ -379,6 +389,15 @@ class NullArrayFactory {
     enable_if_var_size_list<T, Status> Visit(const T& type) {
       // values array may be empty, but there must be at least one offset of 0
       RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1)));
+      // XXX(felipec): reviewers, is this correct?
+      RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_)));
+      return Status::OK();
+    }
+
+    template <typename T>
+    enable_if_list_view<T, Status> Visit(const T& type) {
+      RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * length_));
+      // XXX(felipec): reviewers, is this correct?

Review Comment:
   I think it's correct and the above as well. @bkietz Perhaps you want to 
check this?



##########
cpp/src/arrow/array/builder_nested.h:
##########
@@ -80,100 +89,123 @@ class BaseListBuilder : public ArrayBuilder {
     value_builder_->Reset();
   }
 
-  /// \brief Vector append
-  ///
-  /// If passed, valid_bytes is of equal length to values, and any zero byte
-  /// will be considered as a null for that slot
-  Status AppendValues(const offset_type* offsets, int64_t length,
-                      const uint8_t* valid_bytes = NULLPTR) {
-    ARROW_RETURN_NOT_OK(Reserve(length));
-    UnsafeAppendToBitmap(valid_bytes, length);
-    offsets_builder_.UnsafeAppend(offsets, length);
-    return Status::OK();
-  }
-
   /// \brief Start a new variable-length list slot
   ///
-  /// This function should be called before beginning to append elements to the
-  /// value builder
-  Status Append(bool is_valid = true) {
+  /// This function should be called before appending elements to the
+  /// value builder. Elements appended to the value builder before this 
function
+  /// is called for the first time, will not be members of any list value.
+  ///
+  /// After this function is called, list_length elements SHOULD be appended to
+  /// the values builder. If this contract is violated, the behavior is 
defined by
+  /// the concrete builder implementation and SHOULD NOT be relied upon unless
+  /// the caller is specifically building a [Large]List or [Large]ListView 
array.
+  ///
+  /// For [Large]List arrays, the list slot length will be the number of 
elements
+  /// appended to the values builder before the next call to Append* or 
Finish. For
+  /// [Large]ListView arrays, the list slot length will be exactly 
list_length, but if
+  /// Append* is called before at least list_length elements are appended to 
the values
+  /// builder, the current list slot will share elements with the next list
+  /// slots or an invalid [Large]ListView array will be generated because there
+  /// aren't enough elements in the values builder to fill the list slots.
+  ///

Review Comment:
   ```suggestion
     ///
     /// If you're building a [Large]List and don't need to be compatible
     /// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)`
     /// is a simpler API.
     ///
   ```



##########
cpp/src/arrow/array/util.cc:
##########
@@ -379,6 +389,15 @@ class NullArrayFactory {
     enable_if_var_size_list<T, Status> Visit(const T& type) {
       // values array may be empty, but there must be at least one offset of 0
       RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1)));
+      // XXX(felipec): reviewers, is this correct?
+      RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_)));
+      return Status::OK();
+    }
+
+    template <typename T>
+    enable_if_list_view<T, Status> Visit(const T& type) {
+      RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * length_));
+      // XXX(felipec): reviewers, is this correct?
       RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_)));

Review Comment:
   Here as well, can pass 0 as a length.
   
   This will be exercised if there are nested list cases in the tests (such as 
list(list_view(...)), list(list(...)), etc.).



##########
cpp/src/arrow/c/bridge_test.cc:
##########
@@ -3631,6 +3824,13 @@ TEST_F(TestArrayRoundtrip, List) {
   TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, 
null]]");
 }
 
+TEST_F(TestArrayRoundtrip, ListView) {
+  TestWithJSON(list_view(int32()), "[]");
+  TestWithJSON(list_view(int32()), "[[4, 5], [6, null], null]");

Review Comment:
   Can we also check that non trivial list views (with overlapping or disjoint 
elements) roundtrip as well?



##########
cpp/src/arrow/array/builder_nested.h:
##########
@@ -191,20 +223,130 @@ class BaseListBuilder : public ArrayBuilder {
     return 
std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
   }
 
+ private:
+  static constexpr const char* type_name() {
+    if constexpr (is_list_view(TYPE::type_id)) {
+      return "ListView";
+    } else {
+      return "List";
+    }
+  }
+
  protected:
+  /// \brief Append dimensions for num_values empty list slots.
+  ///
+  /// ListViewBuilder overrides this to also append the sizes.
+  virtual void UnsafeAppendEmptyDimensions(int64_t num_values) {
+    const int64_t offset = value_builder_->length();
+    for (int64_t i = 0; i < num_values; ++i) {
+      offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+    }
+  }
+
+  /// \brief Append dimensions for a single list slot.
+  ///
+  /// ListViewBuilder overrides this to also append the size.
+  virtual void UnsafeAppendDimensions(int64_t offset, int64_t size) {
+    offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+  }
+
   TypedBufferBuilder<offset_type> offsets_builder_;
   std::shared_ptr<ArrayBuilder> value_builder_;
   std::shared_ptr<Field> value_field_;
+};
+
+// ----------------------------------------------------------------------
+// ListBuilder / LargeListBuilder
+
+template <typename TYPE>
+class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder<TYPE> {
+ private:
+  using BASE = VarLengthListLikeBuilder<TYPE>;
+
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename BASE::offset_type;
+
+  using BASE::BASE;
+
+  using BASE::Append;
+
+  ~BaseListBuilder() override = default;
+
+  /// \brief Start a new variable-length list slot
+  ///
+  /// This function should be called before beginning to append elements to the
+  /// value builder
+  Status Append(bool is_valid = true) {
+    // The value_length parameter to BASE::Append(bool, int64_t) is ignored 
when
+    // building a list array, so we can pass 0 here.
+    return BASE::Append(is_valid, 0);
+  }
+
+  /// \brief Vector append
+  ///
+  /// If passed, valid_bytes is of equal length to values, and any zero byte
+  /// will be considered as a null for that slot
+  Status AppendValues(const offset_type* offsets, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR) {
+    ARROW_RETURN_NOT_OK(this->Reserve(length));
+    this->UnsafeAppendToBitmap(valid_bytes, length);
+    this->offsets_builder_.UnsafeAppend(offsets, length);
+    return Status::OK();
+  }
+
+  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
+                      int64_t length, const uint8_t* valid_bytes) final {
+    // offsets are assumed to be valid, but the first length-1 sizes have to be
+    // consistent with the offsets to rule out the possibility that the caller
+    // is passing sizes that could work if building a list-view, but don't work
+    // on building a list that requires offsets to be non-decreasing.

Review Comment:
   ```suggestion
       // on building a list that requires offsets to be non-decreasing.
       // CAUTION: the last size element (`sizes[length - 1]`) is not
       // validated and could be inconsistent with the offsets given in a
       // subsequent call to AppendValues.
   ```



##########
cpp/src/arrow/array/util.cc:
##########
@@ -379,6 +389,15 @@ class NullArrayFactory {
     enable_if_var_size_list<T, Status> Visit(const T& type) {
       // values array may be empty, but there must be at least one offset of 0
       RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1)));
+      // XXX(felipec): reviewers, is this correct?
+      RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_)));

Review Comment:
   The list child array can be 0-length since all list elements will have 
length 0. This matches `NullArrayFactory::Visit`.
   ```suggestion
         RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/ 
0)));
   ```



##########
cpp/src/arrow/array/array_list_test.cc:
##########
@@ -434,16 +622,18 @@ class TestListArray : public ::testing::Test {
 
   void TestBulkAppendInvalid() {
     std::vector<int16_t> values = {0, 1, 2, 3, 4, 5, 6};
-    std::vector<int> lengths = {3, 0, 4};
     std::vector<uint8_t> is_valid = {1, 0, 1};
     // Should be {0, 3, 3} given the is_valid array

Review Comment:
   Can we fix this comment?



##########
cpp/src/arrow/array/builder_nested.h:
##########
@@ -191,20 +191,129 @@ class BaseListBuilder : public ArrayBuilder {
     return 
std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
   }
 
+ private:
+  static constexpr const char* type_name() {
+    if constexpr (is_list_view(TYPE::type_id)) {
+      return "ListView";
+    } else {
+      return "List";
+    }
+  }
+
  protected:
+  /// \brief Append dimensions for num_values empty list slots.
+  ///
+  /// ListViewBuilder overrides this to also append the sizes.
+  virtual void UnsafeAppendEmptyDimensions(int64_t num_values) {
+    const int64_t offset = value_builder_->length();
+    for (int64_t i = 0; i < num_values; ++i) {
+      offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+    }
+  }
+
+  /// \brief Append dimensions for a single list slot.
+  ///
+  /// ListViewBuilder overrides this to also append the size.
+  virtual void UnsafeAppendDimensions(int64_t offset, int64_t size) {
+    offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+  }
+
   TypedBufferBuilder<offset_type> offsets_builder_;
   std::shared_ptr<ArrayBuilder> value_builder_;
   std::shared_ptr<Field> value_field_;
+};
+
+// ----------------------------------------------------------------------
+// ListBuilder / LargeListBuilder
+
+template <typename TYPE>
+class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder<TYPE> {
+ private:
+  using BASE = VarLengthListLikeBuilder<TYPE>;
+
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename BASE::offset_type;
+
+  using BASE::BASE;
+
+  using BASE::Append;
+
+  ~BaseListBuilder() override = default;
+
+  /// \brief Start a new variable-length list slot
+  ///
+  /// This function should be called before beginning to append elements to the
+  /// value builder
+  ///
+  /// Prefer Append(is_valid, 0) as that works correctly for list-view types
+  /// as well as list types.
+  Status Append(bool is_valid = true) { return BASE::Append(is_valid, 0); }
+
+  /// \brief Vector append
+  ///
+  /// If passed, valid_bytes is of equal length to values, and any zero byte
+  /// will be considered as a null for that slot
+  Status AppendValues(const offset_type* offsets, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR) {
+    ARROW_RETURN_NOT_OK(this->Reserve(length));
+    this->UnsafeAppendToBitmap(valid_bytes, length);
+    this->offsets_builder_.UnsafeAppend(offsets, length);
+    return Status::OK();
+  }
+
+  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
+                      int64_t length, const uint8_t* valid_bytes) final {
+    // offsets are assumed to be valid, but the first lenght-1 sizes have to be
+    // consistent with the offsets to rule out the possibility that the caller
+    // is passing sizes that could work if building a list-view, but don't work
+    // on building a list that requires offsets to be non-decreasing.
+    if (sizes) {

Review Comment:
   I understand the concern, but this means that:
   1. the caller makes an effort to generate an array of sizes
   2. the callee validates that the array of sizes is "correct"
   3. the callee then proceeds to ignore the array of sizes
   
   (this even happens in optimized builds)
   
   I'll also note that `sizes[length - 1]` is not validated, so if I call in 
sequence:
   1. `Append(offsets = [0, 4, 5], sizes = [4, 1, 3], length = 3, valid_bytes = 
[1, 1, 1])`
   2. `Append(offsets = [6, 8], sizes = [2, 1], length = 2, valid_bytes = [1, 
1])`
   
   ... then the third size (3) is incorrect yet silently accepted.
   
   Can we perhaps guard the validation with `#ifndef NDEBUG`?



##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -189,23 +260,126 @@ Result<std::shared_ptr<Array>> FlattenListArray(const 
ListArrayT& list_array,
   return Concatenate(non_null_fragments, memory_pool);
 }
 
+template <typename ListViewArrayT, bool HasNulls>
+Result<std::shared_ptr<Array>> FlattenListViewArray(const ListViewArrayT& 
list_view_array,
+                                                    MemoryPool* memory_pool) {
+  using offset_type = typename ListViewArrayT::offset_type;
+  const int64_t list_view_array_offset = list_view_array.offset();
+  const int64_t list_view_array_length = list_view_array.length();
+  std::shared_ptr<arrow::Array> value_array = list_view_array.values();
+
+  if (list_view_array_length == 0) {
+    return SliceArrayWithOffsets(*value_array, 0, 0);
+  }
+
+  // If the list array is *all* nulls, then just return an empty array.
+  if constexpr (HasNulls) {
+    if (list_view_array.null_count() == list_view_array.length()) {
+      return MakeEmptyArray(value_array->type(), memory_pool);
+    }
+  }
+
+  const auto* validity = list_view_array.data()->template 
GetValues<uint8_t>(0, 0);
+  const auto* offsets = list_view_array.data()->template 
GetValues<offset_type>(1);
+  const auto* sizes = list_view_array.data()->template 
GetValues<offset_type>(2);
+
+  auto is_null_or_empty = [&](int64_t i) {
+    if constexpr (HasNulls) {
+      if (!bit_util::GetBit(validity, list_view_array_offset + i)) {
+        return true;
+      }
+    }
+    return sizes[i] == 0;
+  };
+
+  // Index of the first valid, non-empty list-view.
+  int64_t first_i = 0;
+  for (; first_i < list_view_array_length; first_i++) {
+    if (!is_null_or_empty(first_i)) {
+      break;
+    }
+  }
+  // If all list-views are empty, return an empty array.
+  if (first_i == list_view_array_length) {
+    return MakeEmptyArray(value_array->type(), memory_pool);
+  }
+
+  std::vector<std::shared_ptr<Array>> slices;
+  {

Review Comment:
   This algorithm looks fine. It will be quite efficient if the list-view array 
is mostly contiguous and ascending, much less if the list view elements are all 
over the place :-)
   
   Would you mind adding a comment about the perf expectations here?
   
   (sidenote: perhaps we need a `Concatenate` API variant that takes a 
`vector<ArraySpan>`...)
   



##########
cpp/src/arrow/util/list_util.cc:
##########
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/list_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string.h"
+
+namespace arrow::list_util {
+
+namespace internal {
+
+namespace {
+
+using arrow::internal::checked_cast;
+using arrow::internal::ReverseSetBitRunReader;
+using arrow::internal::SetBitRunReader;
+
+/// \pre input.length() > 0 && input.null_count() != input.length()
+/// \param input A LIST_VIEW or LARGE_LIST_VIEW array
+template <typename offset_type>
+std::optional<int64_t> MinViewOffset(const ArraySpan& input) {
+  const uint8_t* validity = input.buffers[0].data;
+  const auto* offsets = input.GetValues<offset_type>(1);
+  const auto* sizes = input.GetValues<offset_type>(2);
+
+  // Make an access to the sizes buffer only when strictly necessary.
+#define MINIMIZE_MIN_VIEW_OFFSET(i)             \
+  auto offset = offsets[i];                     \
+  if (min_offset.has_value()) {                 \
+    if (offset < *min_offset && sizes[i] > 0) { \
+      if (offset == 0) {                        \
+        return 0;                               \
+      }                                         \
+      min_offset = offset;                      \
+    }                                           \
+  } else {                                      \
+    if (sizes[i] > 0) {                         \
+      if (offset == 0) {                        \
+        return 0;                               \
+      }                                         \
+      min_offset = offset;                      \
+    }                                           \
+  }
+
+  std::optional<offset_type> min_offset;
+  if (validity == nullptr) {

Review Comment:
   Ah, sorry, I had missed the early return. Scratch that :-(



##########
cpp/src/arrow/util/list_util.h:
##########
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace list_util {
+
+/// \brief Get the child array holding the values from a List or ListView array
+inline const ArraySpan& ValuesArray(const ArraySpan& span) { return 
span.child_data[0]; }
+
+namespace internal {
+
+/// \brief Calculate the smallest continuous range of values used by the
+/// var-length list-like input (list, map and list-view types).
+///
+/// \param input The input array such that is_var_length_list_like(input.type)
+/// is true
+/// \return A pair of (offset, length) describing the range
+ARROW_EXPORT Result<std::pair<int64_t, int64_t>> RangeOfValuesUsed(
+    const ArraySpan& input);
+
+/// \brief Calculate the sum of the sizes of all valid lists or list-views
+///
+/// This is usally the same as the length of the RangeOfValuesUsed() range, but
+/// it can be:
+/// - Smaller: when the child array constains many values that are not
+/// referenced by the lists or list-views in the parent array
+/// - Greater: when the list-views share child array ranges
+///
+/// \param input The input array such that is_var_length_list_like(input.type)
+/// is true
+/// \return The sum of all list or list-view sizes
+ARROW_EXPORT Result<int64_t> SumOfLogicalListSizes(const ArraySpan& input);
+
+/// \brief Build a ListViewArray from a ListArray
+ARROW_EXPORT Result<std::shared_ptr<ListViewArray>> ListViewFromList(
+    const ListArray& source, MemoryPool* pool);
+
+/// \brief Build a LargeListViewArray from a LargeListArray
+ARROW_EXPORT Result<std::shared_ptr<LargeListViewArray>> ListViewFromList(
+    const LargeListArray& source, MemoryPool* pool);
+
+/// \brief Build a ListArray from a ListViewArray
+ARROW_EXPORT Result<std::shared_ptr<ListArray>> ListFromListView(
+    const ListViewArray& source, MemoryPool* pool);
+
+/// \brief Build a LargeListArray from a LargeListViewArray
+ARROW_EXPORT Result<std::shared_ptr<LargeListArray>> ListFromListView(
+    const LargeListViewArray& source, MemoryPool* pool);

Review Comment:
   I don't think it's a problem to use a Builder in `array_nested.cc`. It would 
be a problem to start including builder APIs from `array_nested.h`.



##########
cpp/src/arrow/testing/random.cc:
##########
@@ -608,6 +609,218 @@ std::shared_ptr<Array> 
OffsetsFromLengthsArray(OffsetArrayType* lengths,
       std::make_shared<typename OffsetArrayType::TypeClass>(), size, buffers, 
null_count);
   return std::make_shared<OffsetArrayType>(array_data);
 }
+
+// Helper for RandomArrayGenerator::ArrayOf: extract some C value from
+// a given metadata key.
+template <typename T, typename ArrowType = typename CTypeTraits<T>::ArrowType>
+enable_if_parameter_free<ArrowType, T> GetMetadata(const KeyValueMetadata* 
metadata,
+                                                   const std::string& key,
+                                                   T default_value) {
+  if (!metadata) return default_value;
+  const auto index = metadata->FindKey(key);
+  if (index < 0) return default_value;
+  const auto& value = metadata->value(index);
+  T output{};
+  if (!internal::ParseValue<ArrowType>(value.data(), value.length(), &output)) 
{
+    ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value, " as ",
+                                 ArrowType::type_name()));
+  }
+  return output;
+}
+
+/// \brief Shuffle a list-view array in place using the Fisher–Yates algorithm 
[1].
+///
+/// [1] 
https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm
+///
+/// \param[in] seed The seed for the random number generator
+/// \param[in,out] data The array to shuffle
+template <typename ListViewType>
+void ShuffleListViewDataInPlace(SeedType seed, ArrayData& data) {
+  DCHECK_EQ(data.type->id(), ListViewType::type_id);
+  using offset_type = typename ListViewType::offset_type;
+
+  auto* validity = data.GetMutableValues<uint8_t>(0, 0);
+  auto* offsets = data.GetMutableValues<offset_type>(1);
+  auto* sizes = data.GetMutableValues<offset_type>(2);
+
+  pcg32_fast rng(seed);
+  using UniformDist = std::uniform_int_distribution<int64_t>;
+  UniformDist dist;
+  for (int64_t i = data.length - 1; i > 0; --i) {
+    const auto j = dist(rng, UniformDist::param_type(0, i));
+    if (ARROW_PREDICT_TRUE(i != j)) {
+      // Swap validity bits
+      if (validity) {
+        const bool valid_i = bit_util::GetBit(validity, data.offset + i);
+        const bool valid_j = bit_util::GetBit(validity, data.offset + i);
+        if (valid_i != valid_j) {
+          bit_util::SetBitTo(validity, data.offset + i, valid_j);
+          bit_util::SetBitTo(validity, data.offset + j, valid_i);
+        }
+      }
+      // Swap offsets and sizes
+      std::swap(offsets[i], offsets[j]);
+      std::swap(sizes[i], sizes[j]);
+    }
+  }
+}
+
+/// \brief Generate the list-view offsets based on a random buffer of sizes.
+///
+/// The sizes buffer is an input of this function, but when force_empty_nulls 
is true,
+/// some values on the sizes buffer can be set to 0.
+///
+/// When sparsity is 0.0, the list-view spans are perfectly packed one after 
the
+/// other. If sparsity is greater than 0.0, the list-view spans are set apart
+/// from each other in proportion to the sparsity value and size of each
+/// list-view. A negative sparsity means each list-view shares a fraction of 
the
+/// values used by the previous list-view.
+///
+/// For instance, a sparsity of -1.0 means the values array will only need 
enough values
+/// for the largest list-view with all the other list-views spanning some of 
these same
+/// values.
+///
+/// \param[in] seed The seed for the random number generator
+/// \param[in,out] mutable_sizes_array The array of sizes to use
+/// \param[in] force_empty_nulls Whether to force null list-view sizes to be 0
+/// \param[in] zero_undefined_offsets Whether to zero the offsets of 
list-views that have
+/// 0 set as the size
+/// \param[in] sparsity The sparsity of the generated list-view offsets
+/// \param[out] out_max_view_end The maximum value of the end of a list-view
+template <typename OffsetArrayType, typename offset_type>
+std::shared_ptr<Array> ViewOffsetsFromLengthsArray(
+    SeedType seed, OffsetArrayType& mutable_sizes_array, bool 
force_empty_nulls,
+    bool zero_undefined_offsets, double sparsity, int64_t* out_max_view_end,
+    int64_t alignment, MemoryPool* memory_pool) {
+  using TypeClass = typename OffsetArrayType::TypeClass;
+
+  auto* sizes = mutable_sizes_array.data()->template 
GetMutableValues<offset_type>(1);
+
+  BufferVector buffers{2};
+  buffers[0] = NULLPTR;  // sizes can have nulls, offsets don't have to
+  buffers[1] = *AllocateBuffer(sizeof(offset_type) * 
mutable_sizes_array.length(),
+                               alignment, memory_pool);
+  auto offsets = buffers[1]->mutable_data_as<offset_type>();
+
+  double offset_base = 0.0;
+  offset_type max_view_end = 0;
+  for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) {
+    const auto offset = static_cast<offset_type>(std::llround(offset_base));
+    if (mutable_sizes_array.IsNull(i)) {
+      if (force_empty_nulls) {
+        sizes[i] = 0;
+      }
+      offsets[i] = zero_undefined_offsets ? 0 : offset;
+    } else {
+      if (sizes[i] == 0) {
+        offsets[i] = zero_undefined_offsets ? 0 : offset;
+      } else {
+        offsets[i] = offset;
+        DCHECK_LT(offset, std::numeric_limits<offset_type>::max() - sizes[i]);
+        offset_base = std::max(0.0, offset_base + (sparsity * sizes[i]));
+      }
+    }
+    max_view_end = std::max(max_view_end, offsets[i] + sizes[i]);
+  }
+  *out_max_view_end = max_view_end;
+
+  auto array_data =
+      ArrayData::Make(TypeTraits<TypeClass>::type_singleton(),
+                      mutable_sizes_array.length(), std::move(buffers), 
/*null_count=*/0);
+  return std::make_shared<OffsetArrayType>(std::move(array_data));
+}
+
+template <typename ArrayType, typename RAG>
+Result<std::shared_ptr<Array>> ArrayOfListView(RAG& self, const Field& field,

Review Comment:
   Could these at least use a similar `coverage` parameter? I don't think 
`sparsity` brings much to the table, and the fact that its description needs 8 
lines of text probably means it won't get much use in practice.



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -797,57 +811,147 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+ private:
+  /// \pre basic validation has already been performed
+  template <typename offset_type>
+  Status FullyValidateOffsets(int64_t offset_limit) {
+    const auto* offsets = data.GetValues<offset_type>(1);
+    auto prev_offset = offsets[0];
+    if (prev_offset < 0) {
+      return Status::Invalid("Offset invariant failure: array starts at 
negative offset ",
+                             prev_offset);
+    }
+    for (int64_t i = 1; i <= data.length; ++i) {
+      const auto current_offset = offsets[i];
+      if (current_offset < prev_offset) {
+        return Status::Invalid("Offset invariant failure: non-monotonic offset 
at slot ",
+                               i, ": ", current_offset, " < ", prev_offset);
+      }
+      if (current_offset > offset_limit) {
+        return Status::Invalid("Offset invariant failure: offset for slot ", i,
+                               " out of bounds: ", current_offset, " > ", 
offset_limit);
+      }
+      prev_offset = current_offset;
+    }
+    return Status::OK();
+  }
+
+  template <typename offset_type>
+  Status OutOfBoundsListViewOffset(int64_t slot, int64_t offset_limit) {
+    const auto* offsets = data.GetValues<offset_type>(1);
+    const auto offset = offsets[slot];
+    return Status::Invalid("Offset invariant failure: offset for slot ", slot,
+                           " out of bounds. Expected ", offset,
+                           " to be at least 0 and less than ", offset_limit);
+  }
+
+  template <typename offset_type>
+  Status OutOfBoundsListViewSize(int64_t slot, int64_t offset_limit) {
+    const auto* offsets = data.GetValues<offset_type>(1);
+    const auto* sizes = data.GetValues<offset_type>(2);
+    const auto size = sizes[slot];
+    if (size < 0) {
+      return Status::Invalid("Offset invariant failure: size for slot ", slot,
+                             " out of bounds: ", size, " < 0");
+    } else {
+      const auto offset = offsets[slot];
+      return Status::Invalid("Offset invariant failure: size for slot ", slot,
+                             " out of bounds: ", offset, " + ", size, " > ",
+                             offset_limit);
+    }
+  }
+
+  /// \pre basic validation has already been performed
+  template <typename offset_type>
+  Status FullyValidateOffsetsAndSizes(int64_t offset_limit) {
+    const auto* offsets = data.GetValues<offset_type>(1);
+    const auto* sizes = data.GetValues<offset_type>(2);
+
+    for (int64_t i = 0; i < data.length; ++i) {
+      const auto size = sizes[i];
+      if (size >= 0) {
+        const auto offset = offsets[i];
+        if (offset < 0 || offset > offset_limit) {
+          return OutOfBoundsListViewOffset<offset_type>(i, offset_limit);
+        }
+        if (size > offset_limit - offset) {
+          return OutOfBoundsListViewSize<offset_type>(i, offset_limit);
+        }
+      } else {
+        return OutOfBoundsListViewSize<offset_type>(i, offset_limit);
+      }
+    }
+
+    return Status::OK();
+  }
+
   template <typename TypeClass>
-  Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
+  Status ValidateOffsetsAndMaybeSizes(const TypeClass&, int64_t offset_limit) {
     using offset_type = typename TypeClass::offset_type;
+    constexpr bool is_list_view = is_list_view_type<TypeClass>::value;
 
-    if (!IsBufferValid(1)) {
-      // For length 0, an empty offsets buffer seems accepted as a special case
-      // (ARROW-544)
-      if (data.length > 0) {
-        return Status::Invalid("Non-empty array but offsets are null");
+    const bool non_empty = data.length > 0;
+    if constexpr (is_list_view) {
+      if (!IsBufferValid(1)) {
+        // For length 0, an empty offsets buffer is accepted (ARROW-544).
+        return Status::Invalid("offsets buffer is null");
+      }
+      if (!IsBufferValid(2)) {
+        return Status::Invalid("sizes buffer is null");
+      }
+    } else {
+      if (!IsBufferValid(1)) {
+        // For length 0, an empty offsets buffer is accepted (ARROW-544).
+        return non_empty ? Status::Invalid("Non-empty array but offsets are 
null")
+                         : Status::OK();
       }
-      return Status::OK();
     }
 
-    // An empty list array can have 0 offsets
     const auto offsets_byte_size = data.buffers[1]->size();
     const auto required_offsets = ((data.length > 0) || (offsets_byte_size > 
0))
-                                      ? data.length + data.offset + 1
+                                      ? data.length + data.offset + 
(is_list_view ? 0 : 1)
                                       : 0;
     if (offsets_byte_size / static_cast<int32_t>(sizeof(offset_type)) <
         required_offsets) {
       return Status::Invalid("Offsets buffer size (bytes): ", 
offsets_byte_size,
                              " isn't large enough for length: ", data.length,
                              " and offset: ", data.offset);
     }
+    if constexpr (is_list_view) {
+      const auto required_sizes = data.length + data.offset;
+      const auto sizes_bytes_size = data.buffers[2]->size();
+      if (sizes_bytes_size / static_cast<int32_t>(sizeof(offset_type)) < 
required_sizes) {
+        return Status::Invalid("Sizes buffer size (bytes): ", sizes_bytes_size,
+                               " isn't large enough for length: ", data.length,
+                               " and offset: ", data.offset);
+      }
+    }
 
     if (full_validation && required_offsets > 0) {
-      // Validate all offset values
-      const offset_type* offsets = data.GetValues<offset_type>(1);
-
-      auto prev_offset = offsets[0];
-      if (prev_offset < 0) {
-        return Status::Invalid(
-            "Offset invariant failure: array starts at negative offset ", 
prev_offset);
-      }
-      for (int64_t i = 1; i <= data.length; ++i) {
-        const auto current_offset = offsets[i];
-        if (current_offset < prev_offset) {
-          return Status::Invalid(
-              "Offset invariant failure: non-monotonic offset at slot ", i, ": 
",
-              current_offset, " < ", prev_offset);
-        }
-        if (current_offset > offset_limit) {
-          return Status::Invalid("Offset invariant failure: offset for slot ", 
i,
-                                 " out of bounds: ", current_offset, " > ", 
offset_limit);
-        }
-        prev_offset = current_offset;
+      if constexpr (is_list_view) {
+        return FullyValidateOffsetsAndSizes<offset_type>(offset_limit);
+      } else {
+        return FullyValidateOffsets<offset_type>(offset_limit);
       }
     }
     return Status::OK();
   }
 
+ public:
+  template <typename TypeClass>
+  enable_if_list_view<TypeClass, Status> ValidateOffsetsAndSizes(const 
TypeClass& type,
+                                                                 int64_t 
offset_limit) {
+    return ValidateOffsetsAndMaybeSizes<TypeClass>(type, offset_limit);
+  }
+
+  template <typename TypeClass>
+  std::enable_if_t<is_var_length_list_type<TypeClass>::value ||
+                       is_base_binary_like(TypeClass::type_id),
+                   Status>
+  ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
+    return ValidateOffsetsAndMaybeSizes<TypeClass>(type, offset_limit);
+  }

Review Comment:
   But this class is just an implementation detail, it's not for outside use. 
   I don't entirely object to this but I found the indirection a bit tedious to 
follow when reading this code.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to