[ 
https://issues.apache.org/jira/browse/PARQUET-1268?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16441188#comment-16441188
 ] 

ASF GitHub Bot commented on PARQUET-1268:
-----------------------------------------

xhochy closed pull request #454: PARQUET-1268: Fix conversion of null list 
Arrow arrays
URL: https://github.com/apache/parquet-cpp/pull/454
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc 
b/src/parquet/arrow/arrow-reader-writer-test.cc
index 79a393f6..f2402dfb 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1000,23 +1000,56 @@ TEST_F(TestStringParquetIO, 
EmptyStringColumnRequiredWrite) {
 using TestNullParquetIO = TestParquetIO<::arrow::NullType>;
 
 TEST_F(TestNullParquetIO, NullColumn) {
-  std::shared_ptr<Array> values = 
std::make_shared<::arrow::NullArray>(SMALL_SIZE);
-  std::shared_ptr<Table> table = MakeSimpleTable(values, true);
-  this->sink_ = std::make_shared<InMemoryOutputStream>();
-  ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), 
this->sink_,
-                                values->length(), 
default_writer_properties()));
+  for (int32_t num_rows : {0, SMALL_SIZE}) {
+    std::shared_ptr<Array> values = 
std::make_shared<::arrow::NullArray>(num_rows);
+    std::shared_ptr<Table> table = MakeSimpleTable(values, true /* nullable 
*/);
+    this->sink_ = std::make_shared<InMemoryOutputStream>();
 
-  std::shared_ptr<Table> out;
-  std::unique_ptr<FileReader> reader;
-  this->ReaderFromSink(&reader);
-  this->ReadTableFromFile(std::move(reader), &out);
-  ASSERT_EQ(1, out->num_columns());
-  ASSERT_EQ(100, out->num_rows());
+    const int64_t chunk_size = std::max(static_cast<int64_t>(1), 
table->num_rows());
+    ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), 
this->sink_,
+                                  chunk_size, default_writer_properties()));
 
-  std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
-  ASSERT_EQ(1, chunked_array->num_chunks());
+    std::shared_ptr<Table> out;
+    std::unique_ptr<FileReader> reader;
+    this->ReaderFromSink(&reader);
+    this->ReadTableFromFile(std::move(reader), &out);
+    ASSERT_EQ(1, out->num_columns());
+    ASSERT_EQ(num_rows, out->num_rows());
 
-  internal::AssertArraysEqual(*values, *chunked_array->chunk(0));
+    std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+    ASSERT_EQ(1, chunked_array->num_chunks());
+    internal::AssertArraysEqual(*values, *chunked_array->chunk(0));
+  }
+}
+
+TEST_F(TestNullParquetIO, NullListColumn) {
+  std::vector<int32_t> offsets1 = {0};
+  std::vector<int32_t> offsets2 = {0, 2, 2, 3, 115};
+  for (std::vector<int32_t> offsets : {offsets1, offsets2}) {
+    std::shared_ptr<Array> offsets_array, values_array, list_array;
+    ::arrow::ArrayFromVector<::arrow::Int32Type, int32_t>(offsets, 
&offsets_array);
+    values_array = std::make_shared<::arrow::NullArray>(offsets.back());
+    ASSERT_OK(::arrow::ListArray::FromArrays(*offsets_array, *values_array,
+                                             default_memory_pool(), 
&list_array));
+
+    std::shared_ptr<Table> table = MakeSimpleTable(list_array, false /* 
nullable */);
+    this->sink_ = std::make_shared<InMemoryOutputStream>();
+
+    const int64_t chunk_size = std::max(static_cast<int64_t>(1), 
table->num_rows());
+    ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), 
this->sink_,
+                                  chunk_size, default_writer_properties()));
+
+    std::shared_ptr<Table> out;
+    std::unique_ptr<FileReader> reader;
+    this->ReaderFromSink(&reader);
+    this->ReadTableFromFile(std::move(reader), &out);
+    ASSERT_EQ(1, out->num_columns());
+    ASSERT_EQ(offsets.size() - 1, out->num_rows());
+
+    std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+    ASSERT_EQ(1, chunked_array->num_chunks());
+    internal::AssertArraysEqual(*list_array, *chunked_array->chunk(0));
+  }
 }
 
 TEST_F(TestNullParquetIO, NullDictionaryColumn) {
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index dd58d7a9..1f933e64 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -1235,17 +1235,6 @@ struct TransferFunctor<::arrow::Decimal128Type, 
Int64Type> {
   } break;
 
 Status PrimitiveImpl::NextBatch(int64_t records_to_read, 
std::shared_ptr<Array>* out) {
-  if (!record_reader_->HasMoreData()) {
-    // Exhausted all row groups.
-    *out = nullptr;
-    return Status::OK();
-  }
-
-  if (field_->type()->id() == ::arrow::Type::NA) {
-    *out = std::make_shared<::arrow::NullArray>(records_to_read);
-    return Status::OK();
-  }
-
   try {
     // Pre-allocation gives much better performance for flat columns
     record_reader_->Reserve(records_to_read);
@@ -1282,6 +1271,11 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, 
std::shared_ptr<Array>*
     TRANSFER_CASE(DATE32, ::arrow::Date32Type, Int32Type)
     TRANSFER_CASE(DATE64, ::arrow::Date64Type, Int32Type)
     TRANSFER_CASE(FIXED_SIZE_BINARY, ::arrow::FixedSizeBinaryType, FLBAType)
+    case ::arrow::Type::NA: {
+      *out = 
std::make_shared<::arrow::NullArray>(record_reader_->values_written());
+      RETURN_NOT_OK(WrapIntoListArray<Int32Type>(out));
+      break;
+    }
     case ::arrow::Type::DECIMAL: {
       switch (descr_->physical_type()) {
         case ::parquet::Type::INT32: {
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 5040e0cc..a5dae3cf 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -193,9 +193,9 @@ class LevelBuilder {
   }
 
   Status HandleNonNullList(int16_t def_level, int16_t rep_level, int64_t 
index) {
-    int32_t inner_offset = offsets_[rep_level][index];
-    int32_t inner_length = offsets_[rep_level][index + 1] - inner_offset;
-    int64_t recursion_level = rep_level + 1;
+    const int32_t inner_offset = offsets_[rep_level][index];
+    const int32_t inner_length = offsets_[rep_level][index + 1] - inner_offset;
+    const int64_t recursion_level = rep_level + 1;
     if (inner_length == 0) {
       return def_levels_.Append(def_level);
     }
@@ -205,14 +205,21 @@ class LevelBuilder {
                                inner_length);
     } else {
       // We have reached the leaf: primitive list, handle remaining nullables
+      const bool nullable_level = nullable_[recursion_level];
+      const int64_t level_null_count = null_counts_[recursion_level];
+      const uint8_t* level_valid_bitmap = valid_bitmaps_[recursion_level];
+
       for (int64_t i = 0; i < inner_length; i++) {
         if (i > 0) {
           RETURN_NOT_OK(rep_levels_.Append(static_cast<int16_t>(rep_level + 
1)));
         }
-        if (nullable_[recursion_level] &&
-            ((null_counts_[recursion_level] == 0) ||
-             BitUtil::GetBit(valid_bitmaps_[recursion_level],
-                             inner_offset + i + 
array_offsets_[recursion_level]))) {
+        if (level_null_count && level_valid_bitmap == nullptr) {
+          // Special case: this is a null array (all elements are null)
+          RETURN_NOT_OK(def_levels_.Append(static_cast<int16_t>(def_level + 
1)));
+        } else if (nullable_level && ((level_null_count == 0) ||
+            BitUtil::GetBit(level_valid_bitmap,
+                            inner_offset + i + 
array_offsets_[recursion_level]))) {
+          // Non-null element in a null level
           RETURN_NOT_OK(def_levels_.Append(static_cast<int16_t>(def_level + 
2)));
         } else {
           // This can be produced in two case:


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Conversion of Arrow null list columns fails
> -------------------------------------------------
>
>                 Key: PARQUET-1268
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1268
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-cpp
>            Reporter: Antoine Pitrou
>            Assignee: Antoine Pitrou
>            Priority: Major
>
> See ARROW-2450



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to