pitrou commented on code in PR #50160:
URL: https://github.com/apache/arrow/pull/50160#discussion_r3419189024
##########
cpp/src/parquet/arrow/writer.cc:
##########
@@ -169,13 +170,24 @@ class ArrowColumnWriterV2 {
leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) {
size_t visited_component_size =
result.post_list_visited_elements.size();
DCHECK_GT(visited_component_size, 0);
- if (visited_component_size != 1) {
- return Status::NotImplemented(
- "Lists with non-zero length null components are not
supported");
+ std::shared_ptr<Array> values_array;
+ if (visited_component_size == 1) {
+ const ElementRange& range =
result.post_list_visited_elements[0];
+ values_array = result.leaf_array->Slice(range.start,
range.Size());
+ } else {
+ // Multiple leaf ranges can be produced when child values are
+ // skipped, such as null fixed-size-list slots, or when
+ // list-view ranges are non-contiguous. Concatenate the slices
+ // in logical write order.
+ ::arrow::ArrayVector arrays;
+ arrays.reserve(visited_component_size);
+ for (const auto& range : result.post_list_visited_elements) {
+ DCHECK(!range.Empty());
+ arrays.push_back(result.leaf_array->Slice(range.start,
range.Size()));
+ }
+ ARROW_ASSIGN_OR_RAISE(values_array,
+ ::arrow::Concatenate(arrays,
ctx->memory_pool));
Review Comment:
Instead of concatenating, would it be possible (and potentially more
efficient) to call `WriteArrow` for each individual chunk?
##########
cpp/src/parquet/arrow/writer.cc:
##########
@@ -169,13 +170,24 @@ class ArrowColumnWriterV2 {
leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) {
size_t visited_component_size =
result.post_list_visited_elements.size();
DCHECK_GT(visited_component_size, 0);
- if (visited_component_size != 1) {
- return Status::NotImplemented(
- "Lists with non-zero length null components are not
supported");
+ std::shared_ptr<Array> values_array;
+ if (visited_component_size == 1) {
+ const ElementRange& range =
result.post_list_visited_elements[0];
+ values_array = result.leaf_array->Slice(range.start,
range.Size());
+ } else {
+ // Multiple leaf ranges can be produced when child values are
+ // skipped, such as null fixed-size-list slots, or when
+ // list-view ranges are non-contiguous. Concatenate the slices
+ // in logical write order.
Review Comment:
Is this somehow fixing https://github.com/apache/arrow/issues/35697?
##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3324,6 +3325,75 @@ TEST(ArrowReadWrite, LargeList) {
}
}
+TEST(ArrowReadWrite, ListView) {
+ auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]");
+ auto offsets = ArrayFromJSON(::arrow::int32(), "[3, 0, 5, 1]");
+ auto sizes = ArrayFromJSON(::arrow::int32(), "[2, 1, 0, 2]");
+ ASSERT_OK_AND_ASSIGN(auto array, ::arrow::ListViewArray::FromArrays(
+ ::arrow::list_view(::arrow::int32()),
*offsets,
+ *sizes, *values,
default_memory_pool()));
+ auto table = Table::Make(
+ ::arrow::schema({::arrow::field("root", array->type(), false)}),
{array});
+
+ auto props_store_schema =
ArrowWriterProperties::Builder().store_schema()->build();
Review Comment:
Can we also check the expect List result when `store_schema` is false? This
would further validate that the actual Parquet contents are ok.
##########
cpp/src/parquet/arrow/path_internal.cc:
##########
@@ -406,14 +405,20 @@ class ListPathNode {
// def_levels entered first.
break;
}
+ if constexpr (RangeSelector::kCheckContiguous) {
+ if (size_check.start != child_range->end) {
Review Comment:
Can you comment on why we're doing this? This code is complicated and this
will help the reader.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]