This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 69824e3  ARROW-14321: [R] segfault converting dictionary ChunkedArray 
with 0 chunks
69824e3 is described below

commit 69824e374510fa2931e64ed854a3c6be8b941545
Author: Weston Pace <[email protected]>
AuthorDate: Sat Oct 16 13:04:06 2021 -0400

    ARROW-14321: [R] segfault converting dictionary ChunkedArray with 0 chunks
    
    Both dictionary arrays and struct arrays were relying on chunked arrays 
always having at least 1 chunk.
    
    The fix also revealed a bug in arrow::Table::FromChunkedStructArray (the 
method was also expecting at least 1 chunk).
    
    Closes #11428 from 
westonpace/bugfix/ARROW-14321--segfault-converting-no-chunks-in-r
    
    Authored-by: Weston Pace <[email protected]>
    Signed-off-by: Benjamin Kietzman <[email protected]>
---
 cpp/src/arrow/table.cc                |  3 +-
 r/src/array_to_vector.cpp             | 62 +++++++++++++++++++----------------
 r/tests/testthat/test-chunked-array.R | 15 +++++++--
 3 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index d4c7802..a8a45e9 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -327,7 +327,8 @@ Result<std::shared_ptr<Table>> 
Table::FromChunkedStructArray(
                    [i](const std::shared_ptr<Array>& struct_chunk) {
                      return static_cast<const 
StructArray&>(*struct_chunk).field(i);
                    });
-    columns[i] = std::make_shared<ChunkedArray>(std::move(chunks));
+    columns[i] =
+        std::make_shared<ChunkedArray>(std::move(chunks), 
type->field(i)->type());
   }
 
   return Table::Make(::arrow::schema(type->fields()), std::move(columns),
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index 32b52ab..480eb86 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -158,6 +158,15 @@ Status IngestSome(const std::shared_ptr<arrow::Array>& 
array, R_xlen_t n,
   return IngestSome(array, n, std::forward<SetNonNull>(set_non_null), nothing);
 }
 
+std::shared_ptr<Array> CreateEmptyArray(const std::shared_ptr<DataType>& 
array_type) {
+  std::unique_ptr<arrow::ArrayBuilder> builder;
+  StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), array_type, &builder));
+
+  std::shared_ptr<arrow::Array> array;
+  StopIfNotOk(builder->Finish(&array));
+  return array;
+}
+
 template <typename Type>
 class Converter_Int : public Converter {
   using value_type = typename TypeTraits<Type>::ArrayType::value_type;
@@ -545,9 +554,7 @@ class Converter_Dictionary : public Converter {
   explicit Converter_Dictionary(const std::shared_ptr<ChunkedArray>& 
chunked_array)
       : Converter(chunked_array), need_unification_(NeedUnification()) {
     if (need_unification_) {
-      const auto& arr_first =
-          checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
-      const auto& arr_type = checked_cast<const 
DictionaryType&>(*arr_first.type());
+      const auto& arr_type = checked_cast<const 
DictionaryType&>(*chunked_array->type());
       unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
 
       size_t n_arrays = chunked_array->num_chunks();
@@ -561,11 +568,10 @@ class Converter_Dictionary : public Converter {
 
       StopIfNotOk(unifier_->GetResult(&out_type_, &dictionary_));
     } else {
-      const auto& dict_array =
-          checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+      const auto& dict_type = checked_cast<const 
DictionaryType&>(*chunked_array->type());
 
-      auto indices = dict_array.indices();
-      switch (indices->type_id()) {
+      const auto& indices_type = *dict_type.index_type();
+      switch (indices_type.id()) {
         case Type::UINT8:
         case Type::INT8:
         case Type::UINT16:
@@ -575,10 +581,18 @@ class Converter_Dictionary : public Converter {
           break;
         default:
           cpp11::stop("Cannot convert Dictionary Array of type `%s` to R",
-                      dict_array.type()->ToString().c_str());
+                      dict_type.ToString().c_str());
       }
 
-      dictionary_ = dict_array.dictionary();
+      if (chunked_array->num_chunks() > 0) {
+        // NeedUnification() returned false so we can safely assume the
+        // dictionary of the first chunk applies everywhere
+        const auto& dict_array =
+            checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+        dictionary_ = dict_array.dictionary();
+      } else {
+        dictionary_ = CreateEmptyArray(dict_type.value_type());
+      }
     }
   }
 
@@ -680,9 +694,7 @@ class Converter_Dictionary : public Converter {
   }
 
   bool GetOrdered() const {
-    return checked_cast<const DictionaryArray&>(*chunked_array_->chunk(0))
-        .dict_type()
-        ->ordered();
+    return checked_cast<const 
DictionaryType&>(*chunked_array_->type()).ordered();
   }
 
   SEXP GetLevels() const {
@@ -705,13 +717,15 @@ class Converter_Struct : public Converter {
  public:
   explicit Converter_Struct(const std::shared_ptr<ChunkedArray>& chunked_array)
       : Converter(chunked_array), converters() {
-    auto first_array =
-        checked_cast<const 
arrow::StructArray*>(this->chunked_array_->chunk(0).get());
-    int nf = chunked_array->type()->num_fields();
+    const auto& struct_type =
+        checked_cast<const arrow::StructType&>(*chunked_array->type());
+
+    int nf = struct_type.num_fields();
 
+    std::shared_ptr<arrow::Table> array_as_table =
+        ValueOrStop(arrow::Table::FromChunkedStructArray(chunked_array));
     for (int i = 0; i < nf; i++) {
-      converters.push_back(
-          
Converter::Make(std::make_shared<ChunkedArray>(first_array->field(i))));
+      converters.push_back(Converter::Make(array_as_table->column(i)));
     }
   }
 
@@ -943,12 +957,7 @@ class Converter_List : public Converter {
                                   ? arrow::r::data::classes_arrow_list
                                   : arrow::r::data::classes_arrow_large_list;
 
-    // Build an empty array to match value_type
-    std::unique_ptr<arrow::ArrayBuilder> builder;
-    StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), value_type_, &builder));
-
-    std::shared_ptr<arrow::Array> array;
-    StopIfNotOk(builder->Finish(&array));
+    std::shared_ptr<arrow::Array> array = CreateEmptyArray(value_type_);
 
     // convert to an R object to store as the list' ptype
     res.attr(arrow::r::symbols::ptype) = Converter::Convert(array);
@@ -994,12 +1003,7 @@ class Converter_FixedSizeList : public Converter {
     Rf_classgets(res, arrow::r::data::classes_arrow_fixed_size_list);
     res.attr(arrow::r::symbols::list_size) = Rf_ScalarInteger(list_size_);
 
-    // Build an empty array to match value_type
-    std::unique_ptr<arrow::ArrayBuilder> builder;
-    StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), value_type_, &builder));
-
-    std::shared_ptr<arrow::Array> array;
-    StopIfNotOk(builder->Finish(&array));
+    std::shared_ptr<arrow::Array> array = CreateEmptyArray(value_type_);
 
     // convert to an R object to store as the list' ptype
     res.attr(arrow::r::symbols::ptype) = Converter::Convert(array);
diff --git a/r/tests/testthat/test-chunked-array.R 
b/r/tests/testthat/test-chunked-array.R
index 2d1febe..c931dde 100644
--- a/r/tests/testthat/test-chunked-array.R
+++ b/r/tests/testthat/test-chunked-array.R
@@ -206,18 +206,27 @@ test_that("ChunkedArray supports empty arrays 
(ARROW-13761)", {
     int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
     uint64(), float32(), float64(), timestamp("ns"), binary(),
     large_binary(), fixed_size_binary(32), date32(), date64(),
-    decimal(4, 2)
+    decimal(4, 2), dictionary(), struct(x = int32())
   )
 
   empty_filter <- ChunkedArray$create(type = bool())
   for (type in types) {
     one_empty_chunk <- ChunkedArray$create(type = type)
     expect_type_equal(one_empty_chunk$type, type)
-    expect_identical(length(one_empty_chunk), 
length(as.vector(one_empty_chunk)))
+    if (type != struct(x = int32())) {
+      expect_identical(length(one_empty_chunk), 
length(as.vector(one_empty_chunk)))
+    } else {
+      # struct -> tbl and length(tbl) is num_columns instead of num_rows
+      expect_identical(length(as.vector(one_empty_chunk)), 1L)
+    }
     zero_empty_chunks <- one_empty_chunk$Filter(empty_filter)
     expect_equal(zero_empty_chunks$num_chunks, 0)
     expect_type_equal(zero_empty_chunks$type, type)
-    expect_identical(length(zero_empty_chunks), 
length(as.vector(zero_empty_chunks)))
+    if (type != struct(x = int32())) {
+      expect_identical(length(zero_empty_chunks), 
length(as.vector(zero_empty_chunks)))
+    } else {
+      expect_identical(length(as.vector(zero_empty_chunks)), 1L)
+    }
   }
 })
 

Reply via email to