wgtmac commented on code in PR #50160:
URL: https://github.com/apache/arrow/pull/50160#discussion_r3411441523


##########
cpp/src/parquet/arrow/reader.cc:
##########
@@ -673,13 +673,39 @@ class ListReader : public ColumnReaderImpl {
 
   const std::shared_ptr<Field> field() override { return field_; }
 
- private:
+ protected:
   std::shared_ptr<ReaderContext> ctx_;
+
+ private:
   std::shared_ptr<Field> field_;
   ::parquet::internal::LevelInfo level_info_;
   std::unique_ptr<ColumnReaderImpl> item_reader_;
 };
 
+template <typename IndexType>
+class PARQUET_NO_EXPORT ListViewReader : public ListReader<IndexType> {
+ public:
+  using ListReader<IndexType>::ListReader;
+
+  ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
+      std::shared_ptr<ArrayData> data) final {
+    DCHECK_EQ(data->buffers.size(), 2);
+    const auto* offsets = reinterpret_cast<const 
IndexType*>(data->buffers[1]->data());
+    ARROW_ASSIGN_OR_RAISE(
+        auto sizes_buffer,
+        AllocateResizableBuffer(sizeof(IndexType) * data->length, 
this->ctx_->pool));
+    auto* sizes = reinterpret_cast<IndexType*>(sizes_buffer->mutable_data());
+    for (int64_t i = 0; i < data->length; ++i) {
+      sizes[i] = offsets[i + 1] - offsets[i];
+    }
+    data->buffers[1] = ::arrow::SliceBuffer(std::move(data->buffers[1]), 0,

Review Comment:
   Why do we need this? If it is really needed, let's add `/*offset=*/0` for 
better readability.



##########
cpp/src/parquet/arrow/reader.cc:
##########
@@ -673,13 +673,39 @@ class ListReader : public ColumnReaderImpl {
 
   const std::shared_ptr<Field> field() override { return field_; }
 
- private:
+ protected:
   std::shared_ptr<ReaderContext> ctx_;
+
+ private:
   std::shared_ptr<Field> field_;
   ::parquet::internal::LevelInfo level_info_;
   std::unique_ptr<ColumnReaderImpl> item_reader_;
 };
 
+template <typename IndexType>
+class PARQUET_NO_EXPORT ListViewReader : public ListReader<IndexType> {
+ public:
+  using ListReader<IndexType>::ListReader;
+
+  ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
+      std::shared_ptr<ArrayData> data) final {
+    DCHECK_EQ(data->buffers.size(), 2);

Review Comment:
   Do we want to call `DCHECK_EQ` on `field()->type()->id()` just in case?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3324,6 +3325,75 @@ TEST(ArrowReadWrite, LargeList) {
   }
 }
 
+TEST(ArrowReadWrite, ListView) {
+  auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]");
+  auto offsets = ArrayFromJSON(::arrow::int32(), "[3, 0, 5, 1]");
+  auto sizes = ArrayFromJSON(::arrow::int32(), "[2, 1, 0, 2]");
+  ASSERT_OK_AND_ASSIGN(auto array, ::arrow::ListViewArray::FromArrays(
+                                       ::arrow::list_view(::arrow::int32()), 
*offsets,
+                                       *sizes, *values, 
default_memory_pool()));
+  auto table = Table::Make(
+      ::arrow::schema({::arrow::field("root", array->type(), false)}), 
{array});
+
+  auto props_store_schema = 
ArrowWriterProperties::Builder().store_schema()->build();
+  CheckSimpleRoundtrip(table, 2, props_store_schema);
+
+  ASSERT_OK_AND_ASSIGN(auto expected_array,
+                       ::arrow::ListArray::FromListView(*array, 
default_memory_pool()));
+  auto expected = Table::Make(
+      ::arrow::schema({::arrow::field("root", ::arrow::list(::arrow::int32()), 
false)}),
+      {expected_array});
+  CheckConfiguredRoundtrip(table, expected);
+}
+
+TEST(ArrowReadWrite, EmptyListView) {
+  auto type = ::arrow::list_view(::arrow::int32());
+  auto array = ArrayFromJSON(type, "[]");
+  auto table = Table::Make(::arrow::schema({::arrow::field("root", type)}), 
{array});
+
+  auto props_store_schema = 
ArrowWriterProperties::Builder().store_schema()->build();
+  std::shared_ptr<Table> result;
+  ASSERT_NO_FATAL_FAILURE(
+      DoRoundtrip(table, 1, &result, default_writer_properties(), 
props_store_schema));
+  ASSERT_OK(result->ValidateFull());
+
+  ASSERT_EQ(1, result->column(0)->num_chunks());
+  const auto& list_view =
+      checked_cast<const 
::arrow::ListViewArray&>(*result->column(0)->chunk(0));
+  ASSERT_EQ(0, list_view.length());
+  ASSERT_NE(nullptr, list_view.value_offsets());
+  ASSERT_EQ(0, list_view.value_offsets()->size());
+  ASSERT_NE(nullptr, list_view.value_sizes());
+  ASSERT_EQ(0, list_view.value_sizes()->size());
+}
+
+TEST(ArrowReadWrite, LargeListView) {
+  auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]");
+  auto offsets = ArrayFromJSON(::arrow::int64(), "[3, 0, 5, 1]");
+  auto sizes = ArrayFromJSON(::arrow::int64(), "[2, 1, 0, 2]");
+  auto element = ::arrow::field("element", ::arrow::int32());
+  ASSERT_OK_AND_ASSIGN(auto array, ::arrow::LargeListViewArray::FromArrays(
+                                       ::arrow::large_list_view(element), 
*offsets,
+                                       *sizes, *values, 
default_memory_pool()));
+  auto table = Table::Make(
+      ::arrow::schema({::arrow::field("root", array->type(), false)}), 
{array});
+
+  auto props_store_schema = 
ArrowWriterProperties::Builder().store_schema()->build();

Review Comment:
   I think we need to add a comment somewhere to note that reading back into 
`ListView/LargeListView` is only supported from a file with `store_schema` 
enabled.



##########
cpp/src/parquet/arrow/writer.cc:
##########
@@ -169,13 +170,20 @@ class ArrowColumnWriterV2 {
             leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) {
               size_t visited_component_size = 
result.post_list_visited_elements.size();
               DCHECK_GT(visited_component_size, 0);
-              if (visited_component_size != 1) {
-                return Status::NotImplemented(
-                    "Lists with non-zero length null components are not 
supported");
+              std::shared_ptr<Array> values_array;
+              if (visited_component_size == 1) {
+                const ElementRange& range = 
result.post_list_visited_elements[0];
+                values_array = result.leaf_array->Slice(range.start, 
range.Size());
+              } else {

Review Comment:
   Please add a comment to explain when `visited_component_size` will be 
greater than 1?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3324,6 +3325,75 @@ TEST(ArrowReadWrite, LargeList) {
   }
 }
 
+TEST(ArrowReadWrite, ListView) {
+  auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]");
+  auto offsets = ArrayFromJSON(::arrow::int32(), "[3, 0, 5, 1]");
+  auto sizes = ArrayFromJSON(::arrow::int32(), "[2, 1, 0, 2]");
+  ASSERT_OK_AND_ASSIGN(auto array, ::arrow::ListViewArray::FromArrays(
+                                       ::arrow::list_view(::arrow::int32()), 
*offsets,
+                                       *sizes, *values, 
default_memory_pool()));
+  auto table = Table::Make(
+      ::arrow::schema({::arrow::field("root", array->type(), false)}), 
{array});
+
+  auto props_store_schema = 
ArrowWriterProperties::Builder().store_schema()->build();
+  CheckSimpleRoundtrip(table, 2, props_store_schema);
+
+  ASSERT_OK_AND_ASSIGN(auto expected_array,
+                       ::arrow::ListArray::FromListView(*array, 
default_memory_pool()));
+  auto expected = Table::Make(
+      ::arrow::schema({::arrow::field("root", ::arrow::list(::arrow::int32()), 
false)}),
+      {expected_array});
+  CheckConfiguredRoundtrip(table, expected);
+}
+
+TEST(ArrowReadWrite, EmptyListView) {
+  auto type = ::arrow::list_view(::arrow::int32());
+  auto array = ArrayFromJSON(type, "[]");
+  auto table = Table::Make(::arrow::schema({::arrow::field("root", type)}), 
{array});
+
+  auto props_store_schema = 
ArrowWriterProperties::Builder().store_schema()->build();
+  std::shared_ptr<Table> result;
+  ASSERT_NO_FATAL_FAILURE(
+      DoRoundtrip(table, 1, &result, default_writer_properties(), 
props_store_schema));
+  ASSERT_OK(result->ValidateFull());
+
+  ASSERT_EQ(1, result->column(0)->num_chunks());
+  const auto& list_view =
+      checked_cast<const 
::arrow::ListViewArray&>(*result->column(0)->chunk(0));
+  ASSERT_EQ(0, list_view.length());
+  ASSERT_NE(nullptr, list_view.value_offsets());
+  ASSERT_EQ(0, list_view.value_offsets()->size());
+  ASSERT_NE(nullptr, list_view.value_sizes());
+  ASSERT_EQ(0, list_view.value_sizes()->size());
+}
+
+TEST(ArrowReadWrite, LargeListView) {
+  auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]");

Review Comment:
   Can we use a single test by parameterizing `schema` and `table` to reduce 
duplicated lines? I don't remember if `RecordBatchFromJson` supports list view 
natively.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to