[
https://issues.apache.org/jira/browse/PARQUET-1268?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16441188#comment-16441188
]
ASF GitHub Bot commented on PARQUET-1268:
-
xhochy closed pull request #454: PARQUET-1268: Fix conversion of null list
Arrow arrays
URL: https://github.com/apache/parquet-cpp/pull/454
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc
b/src/parquet/arrow/arrow-reader-writer-test.cc
index 79a393f6..f2402dfb 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1000,23 +1000,56 @@ TEST_F(TestStringParquetIO,
EmptyStringColumnRequiredWrite) {
using TestNullParquetIO = TestParquetIO<::arrow::NullType>;
TEST_F(TestNullParquetIO, NullColumn) {
- std::shared_ptr<Array> values =
std::make_shared<::arrow::NullArray>(SMALL_SIZE);
- std::shared_ptr<Table> table = MakeSimpleTable(values, true);
- this->sink_ = std::make_shared<InMemoryOutputStream>();
- ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(),
this->sink_,
-values->length(),
default_writer_properties()));
+ for (int32_t num_rows : {0, SMALL_SIZE}) {
+std::shared_ptr<Array> values =
std::make_shared<::arrow::NullArray>(num_rows);
+std::shared_ptr<Table> table = MakeSimpleTable(values, true /* nullable
*/);
+this->sink_ = std::make_shared<InMemoryOutputStream>();
- std::shared_ptr<Table> out;
- std::unique_ptr<FileReader> reader;
- this->ReaderFromSink(&reader);
- this->ReadTableFromFile(std::move(reader), &out);
- ASSERT_EQ(1, out->num_columns());
- ASSERT_EQ(100, out->num_rows());
+const int64_t chunk_size = std::max(static_cast<int64_t>(1),
table->num_rows());
+ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(),
this->sink_,
+ chunk_size, default_writer_properties()));
- std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
- ASSERT_EQ(1, chunked_array->num_chunks());
+std::shared_ptr<Table> out;
+std::unique_ptr<FileReader> reader;
+this->ReaderFromSink(&reader);
+this->ReadTableFromFile(std::move(reader), &out);
+ASSERT_EQ(1, out->num_columns());
+ASSERT_EQ(num_rows, out->num_rows());
- internal::AssertArraysEqual(*values, *chunked_array->chunk(0));
+std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+ASSERT_EQ(1, chunked_array->num_chunks());
+internal::AssertArraysEqual(*values, *chunked_array->chunk(0));
+ }
+}
+
+TEST_F(TestNullParquetIO, NullListColumn) {
+ std::vector<int32_t> offsets1 = {0};
+ std::vector<int32_t> offsets2 = {0, 2, 2, 3, 115};
+ for (std::vector<int32_t> offsets : {offsets1, offsets2}) {
+std::shared_ptr<Array> offsets_array, values_array, list_array;
+::arrow::ArrayFromVector<::arrow::Int32Type, int32_t>(offsets,
&offsets_array);
+values_array = std::make_shared<::arrow::NullArray>(offsets.back());
+ASSERT_OK(::arrow::ListArray::FromArrays(*offsets_array, *values_array,
+ default_memory_pool(),
&list_array));
+
+std::shared_ptr<Table> table = MakeSimpleTable(list_array, false /*
nullable */);
+this->sink_ = std::make_shared<InMemoryOutputStream>();
+
+const int64_t chunk_size = std::max(static_cast<int64_t>(1),
table->num_rows());
+ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(),
this->sink_,
+ chunk_size, default_writer_properties()));
+
+std::shared_ptr<Table> out;
+std::unique_ptr<FileReader> reader;
+this->ReaderFromSink(&reader);
+this->ReadTableFromFile(std::move(reader), &out);
+ASSERT_EQ(1, out->num_columns());
+ASSERT_EQ(offsets.size() - 1, out->num_rows());
+
+std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+ASSERT_EQ(1, chunked_array->num_chunks());
+internal::AssertArraysEqual(*list_array, *chunked_array->chunk(0));
+ }
}
TEST_F(TestNullParquetIO, NullDictionaryColumn) {
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index dd58d7a9..1f933e64 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -1235,17 +1235,6 @@ struct TransferFunctor<::arrow::Decimal128Type,
Int64Type> {
} break;
Status PrimitiveImpl::NextBatch(int64_t records_to_read,
std::shared_ptr<Array>* out) {
- if (!record_reader_->HasMoreData()) {
-// Exhausted all row groups.
-*out = nullptr;
-return Status::OK();
- }
-
- if (field_->type()->id() == ::arrow::Type::NA) {
-*out = std::make_shared<::arrow::NullArray>(records_to_read);
-return Status::OK();
- }
-
try {
// Pre-allocation gives much better performance for flat columns
record_reader_->Reserve(records_to_read);
@@ -1282,6 +1271,11 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_rea