[ 
https://issues.apache.org/jira/browse/PARQUET-1346?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16541219#comment-16541219
 ] 

ASF GitHub Bot commented on PARQUET-1346:
-----------------------------------------

xhochy closed pull request #474: PARQUET-1346: [C++] Protect against empty 
Arrow arrays with null values
URL: https://github.com/apache/parquet-cpp/pull/474
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc 
b/src/parquet/arrow/arrow-reader-writer-test.cc
index 6d7e1ebb..e0ff7aab 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -536,6 +536,13 @@ class TestParquetIO : public ::testing::Test {
     *out = MakeSimpleTable(lists->Slice(3, size - 6), nullable_lists);
   }
 
+  // Prepare table of empty lists, with null values array (ARROW-2744)
+  void PrepareEmptyListsTable(int64_t size, std::shared_ptr<Table>* out) {
+    std::shared_ptr<Array> lists;
+    ASSERT_OK(MakeEmptyListsArray(size, &lists));
+    *out = MakeSimpleTable(lists, true /* nullable_lists */);
+  }
+
   void PrepareListOfListTable(int64_t size, bool nullable_parent_lists,
                               bool nullable_lists, bool nullable_elements,
                               int64_t null_count, std::shared_ptr<Table>* out) 
{
@@ -713,6 +720,12 @@ TYPED_TEST(TestParquetIO, 
SingleColumnTableOptionalReadWrite) {
   ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
 }
 
+TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) {
+  std::shared_ptr<Table> table;
+  ASSERT_NO_FATAL_FAILURE(this->PrepareEmptyListsTable(SMALL_SIZE, &table));
+  ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
+}
+
 TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) {
   std::shared_ptr<Table> table;
   ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, 
&table));
@@ -1524,8 +1537,6 @@ void MakeDoubleTable(int num_columns, int num_rows, int 
nchunks,
 
 void MakeListArray(int num_rows, std::shared_ptr<::DataType>* out_type,
                    std::shared_ptr<Array>* out_array) {
-  ::arrow::Int32Builder offset_builder;
-
   std::vector<int32_t> length_draws;
   randint(num_rows, 0, 100, &length_draws);
 
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index 7264324d..c70e0eff 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -394,6 +394,33 @@ Status MakeListArray(const std::shared_ptr<Array>& values, 
int64_t size,
   return Status::OK();
 }
 
+// Make an array containing only empty lists, with a null values array
+Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
+  // Allocate an offsets buffer containing only zeroes
+  std::shared_ptr<Buffer> offsets_buffer;
+  const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
+  RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), 
offsets_nbytes,
+                                        &offsets_buffer));
+  memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
+
+  auto value_field = ::arrow::field("item", ::arrow::float64(),
+                                    false /* nullable_values */);
+  auto list_type = ::arrow::list(value_field);
+
+  std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap 
*/,
+                                                        nullptr /* values */ };
+  auto child_data = ::arrow::ArrayData::Make(value_field->type(), 0,
+                                             std::move(child_buffers));
+
+  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
+                                                  offsets_buffer };
+  auto array_data = ::arrow::ArrayData::Make(list_type, size, 
std::move(buffers));
+  array_data->child_data.push_back(child_data);
+
+  *out_array = ::arrow::MakeArray(array_data);
+  return Status::OK();
+}
+
 static std::shared_ptr<::arrow::Column> MakeColumn(const std::string& name,
                                                    const 
std::shared_ptr<Array>& array,
                                                    bool nullable) {
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 50b4649e..f7727381 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -411,8 +411,13 @@ Status ArrowColumnWriter::TypedWriteBatch(const Array& 
array, int64_t num_levels
   using ArrowCType = typename ArrowType::c_type;
 
   const auto& data = static_cast<const PrimitiveArray&>(array);
-  auto values =
-      reinterpret_cast<const ArrowCType*>(data.values()->data()) + 
data.offset();
+  const ArrowCType* values = nullptr;
+  // The values buffer may be null if the array is empty (ARROW-2744)
+  if (data.values() != nullptr) {
+    values = reinterpret_cast<const ArrowCType*>(data.values()->data()) + 
data.offset();
+  } else {
+    DCHECK_EQ(data.length(), 0);
+  }
 
   if (writer_->descr()->schema_node()->is_required() || (data.null_count() == 
0)) {
     // no nulls, just dump the data
@@ -706,7 +711,13 @@ Status ArrowColumnWriter::TypedWriteBatch<BooleanType, 
::arrow::BooleanType>(
   RETURN_NOT_OK(ctx_->GetScratchData<bool>(array.length(), &buffer));
 
   const auto& data = static_cast<const BooleanArray&>(array);
-  auto values = reinterpret_cast<const uint8_t*>(data.values()->data());
+  const uint8_t* values = nullptr;
+  // The values buffer may be null if the array is empty (ARROW-2744)
+  if (data.values() != nullptr) {
+    values = reinterpret_cast<const uint8_t*>(data.values()->data());
+  } else {
+    DCHECK_EQ(data.length(), 0);
+  }
 
   int buffer_idx = 0;
   int64_t offset = array.offset();


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [C++] Protect against null values data in empty Arrow array
> -----------------------------------------------------------
>
>                 Key: PARQUET-1346
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1346
>             Project: Parquet
>          Issue Type: Improvement
>          Components: parquet-cpp
>            Reporter: Antoine Pitrou
>            Assignee: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: cpp-1.5.0
>
>
> See ARROW-2744.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to