[jira] [Commented] (PARQUET-1268) [C++] Conversion of Arrow null list columns fails

2018-04-17 ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/PARQUET-1268?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16441188#comment-16441188
 ] 

ASF GitHub Bot commented on PARQUET-1268:
-----------------------------------------

xhochy closed pull request #454: PARQUET-1268: Fix conversion of null list Arrow arrays
URL: https://github.com/apache/parquet-cpp/pull/454

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 79a393f6..f2402dfb 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1000,23 +1000,56 @@ TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
 using TestNullParquetIO = TestParquetIO<::arrow::NullType>;
 
 TEST_F(TestNullParquetIO, NullColumn) {
-  std::shared_ptr<Array> values = std::make_shared<::arrow::NullArray>(SMALL_SIZE);
-  std::shared_ptr<Table> table = MakeSimpleTable(values, true);
-  this->sink_ = std::make_shared<InMemoryOutputStream>();
-  ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
-                                values->length(), default_writer_properties()));
+  for (int32_t num_rows : {0, SMALL_SIZE}) {
+    std::shared_ptr<Array> values = std::make_shared<::arrow::NullArray>(num_rows);
+    std::shared_ptr<Table> table = MakeSimpleTable(values, true /* nullable */);
+    this->sink_ = std::make_shared<InMemoryOutputStream>();
 
-  std::shared_ptr<Table> out;
-  std::unique_ptr<FileReader> reader;
-  this->ReaderFromSink(&reader);
-  this->ReadTableFromFile(std::move(reader), &out);
-  ASSERT_EQ(1, out->num_columns());
-  ASSERT_EQ(100, out->num_rows());
+    const int64_t chunk_size = std::max(static_cast<int64_t>(1), table->num_rows());
+    ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+                                  chunk_size, default_writer_properties()));
 
-  std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
-  ASSERT_EQ(1, chunked_array->num_chunks());
+    std::shared_ptr<Table> out;
+    std::unique_ptr<FileReader> reader;
+    this->ReaderFromSink(&reader);
+    this->ReadTableFromFile(std::move(reader), &out);
+    ASSERT_EQ(1, out->num_columns());
+    ASSERT_EQ(num_rows, out->num_rows());
 
-  internal::AssertArraysEqual(*values, *chunked_array->chunk(0));
+    std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+    ASSERT_EQ(1, chunked_array->num_chunks());
+    internal::AssertArraysEqual(*values, *chunked_array->chunk(0));
+  }
+}
+
+TEST_F(TestNullParquetIO, NullListColumn) {
+  std::vector<int32_t> offsets1 = {0};
+  std::vector<int32_t> offsets2 = {0, 2, 2, 3, 115};
+  for (std::vector<int32_t> offsets : {offsets1, offsets2}) {
+    std::shared_ptr<Array> offsets_array, values_array, list_array;
+    ::arrow::ArrayFromVector<::arrow::Int32Type, int32_t>(offsets, &offsets_array);
+    values_array = std::make_shared<::arrow::NullArray>(offsets.back());
+    ASSERT_OK(::arrow::ListArray::FromArrays(*offsets_array, *values_array,
+                                             default_memory_pool(), &list_array));
+
+    std::shared_ptr<Table> table = MakeSimpleTable(list_array, false /* nullable */);
+    this->sink_ = std::make_shared<InMemoryOutputStream>();
+
+    const int64_t chunk_size = std::max(static_cast<int64_t>(1), table->num_rows());
+    ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+                                  chunk_size, default_writer_properties()));
+
+    std::shared_ptr<Table> out;
+    std::unique_ptr<FileReader> reader;
+    this->ReaderFromSink(&reader);
+    this->ReadTableFromFile(std::move(reader), &out);
+    ASSERT_EQ(1, out->num_columns());
+    ASSERT_EQ(offsets.size() - 1, out->num_rows());
+
+    std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+    ASSERT_EQ(1, chunked_array->num_chunks());
+    internal::AssertArraysEqual(*list_array, *chunked_array->chunk(0));
+  }
 }
 
 TEST_F(TestNullParquetIO, NullDictionaryColumn) {
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index dd58d7a9..1f933e64 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -1235,17 +1235,6 @@ struct TransferFunctor<::arrow::Decimal128Type, Int64Type> {
   } break;
 
 Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr<Array>* out) {
-  if (!record_reader_->HasMoreData()) {
-    // Exhausted all row groups.
-    *out = nullptr;
-    return Status::OK();
-  }
-
-  if (field_->type()->id() == ::arrow::Type::NA) {
-    *out = std::make_shared<::arrow::NullArray>(records_to_read);
-    return Status::OK();
-  }
-
   try {
     // Pre-allocation gives much better performance for flat columns
     record_reader_->Reserve(records_to_read);
@@ -1282,6 +1271,11 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read,
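
For context, the first reader.cc hunk above removes the early returns that short-circuited NA-typed columns and materialized a NullArray of length records_to_read before any records were actually read; after the change, null columns flow through the regular record-reader batching path. Below is a condensed, self-contained sketch of the round trip the new NullListColumn test exercises. It is written against the current Arrow/Parquet C++ API (Result-returning factories such as ListArray::FromArrays and BufferOutputStream::Create), so the spelling differs from the parquet-cpp-era calls in the diff, and the function name RoundTripNullList is illustrative only:

  #include <memory>

  #include "arrow/api.h"
  #include "arrow/io/api.h"
  #include "parquet/arrow/writer.h"

  ::arrow::Status RoundTripNullList() {
    // Offsets [0, 2, 2, 3] describe three list slots over a 3-element
    // NullArray, i.e. a column of Arrow type list<null>.
    ::arrow::Int32Builder offsets_builder;
    ARROW_RETURN_NOT_OK(offsets_builder.AppendValues({0, 2, 2, 3}));
    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<::arrow::Array> offsets,
                          offsets_builder.Finish());
    auto values = std::make_shared<::arrow::NullArray>(3);
    ARROW_ASSIGN_OR_RAISE(auto list_array,
                          ::arrow::ListArray::FromArrays(*offsets, *values));

    // Single-column, non-nullable table, mirroring
    // MakeSimpleTable(list_array, false /* nullable */) in the test.
    auto schema = ::arrow::schema(
        {::arrow::field("col", list_array->type(), /*nullable=*/false)});
    auto table = ::arrow::Table::Make(schema, {list_array});

    // Write to an in-memory sink; before this fix, conversion of the
    // null list column failed during the write.
    ARROW_ASSIGN_OR_RAISE(auto sink, ::arrow::io::BufferOutputStream::Create());
    return ::parquet::arrow::WriteTable(*table, ::arrow::default_memory_pool(),
                                        sink, /*chunk_size=*/table->num_rows());
  }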

[jira] [Commented] (PARQUET-1268) [C++] Conversion of Arrow null list columns fails

2018-04-12 ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/PARQUET-1268?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16435741#comment-16435741
 ] 

ASF GitHub Bot commented on PARQUET-1268:
-----------------------------------------

pitrou opened a new pull request #454: PARQUET-1268: Fix conversion of null list Arrow arrays
URL: https://github.com/apache/parquet-cpp/pull/454


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Conversion of Arrow null list columns fails
> -------------------------------------------------
>
> Key: PARQUET-1268
> URL: https://issues.apache.org/jira/browse/PARQUET-1268
> Project: Parquet
>  Issue Type: Bug
>  Components: parquet-cpp
>Reporter: Antoine Pitrou
>Assignee: Antoine Pitrou
>Priority: Major
>
> See ARROW-2450
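
For reference, the failing shape named in the title is a list column whose values child is all-null, i.e. the Arrow type list<null>. A minimal sketch of that type (the field name "col" is illustrative):

  // The Arrow type exercised by the NullListColumn test in PR #454.
  std::shared_ptr<::arrow::DataType> ty = ::arrow::list(::arrow::null());
  auto field = ::arrow::field("col", ty, /*nullable=*/false);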



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)