This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 69824e3 ARROW-14321: [R] segfault converting dictionary ChunkedArray
with 0 chunks
69824e3 is described below
commit 69824e374510fa2931e64ed854a3c6be8b941545
Author: Weston Pace <[email protected]>
AuthorDate: Sat Oct 16 13:04:06 2021 -0400
ARROW-14321: [R] segfault converting dictionary ChunkedArray with 0 chunks
Both dictionary arrays and struct arrays were relying on chunked arrays
always having at least 1 chunk.
The fix also revealed a bug in arrow::Table::FromChunkedStructArray (the
method was also expecting at least 1 chunk).
Closes #11428 from
westonpace/bugfix/ARROW-14321--segfault-converting-no-chunks-in-r
Authored-by: Weston Pace <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/table.cc | 3 +-
r/src/array_to_vector.cpp | 62 +++++++++++++++++++----------------
r/tests/testthat/test-chunked-array.R | 15 +++++++--
3 files changed, 47 insertions(+), 33 deletions(-)
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index d4c7802..a8a45e9 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -327,7 +327,8 @@ Result<std::shared_ptr<Table>>
Table::FromChunkedStructArray(
[i](const std::shared_ptr<Array>& struct_chunk) {
return static_cast<const
StructArray&>(*struct_chunk).field(i);
});
- columns[i] = std::make_shared<ChunkedArray>(std::move(chunks));
+ columns[i] =
+ std::make_shared<ChunkedArray>(std::move(chunks),
type->field(i)->type());
}
return Table::Make(::arrow::schema(type->fields()), std::move(columns),
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index 32b52ab..480eb86 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -158,6 +158,15 @@ Status IngestSome(const std::shared_ptr<arrow::Array>&
array, R_xlen_t n,
return IngestSome(array, n, std::forward<SetNonNull>(set_non_null), nothing);
}
+std::shared_ptr<Array> CreateEmptyArray(const std::shared_ptr<DataType>&
array_type) {
+ std::unique_ptr<arrow::ArrayBuilder> builder;
+ StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), array_type, &builder));
+
+ std::shared_ptr<arrow::Array> array;
+ StopIfNotOk(builder->Finish(&array));
+ return array;
+}
+
template <typename Type>
class Converter_Int : public Converter {
using value_type = typename TypeTraits<Type>::ArrayType::value_type;
@@ -545,9 +554,7 @@ class Converter_Dictionary : public Converter {
explicit Converter_Dictionary(const std::shared_ptr<ChunkedArray>&
chunked_array)
: Converter(chunked_array), need_unification_(NeedUnification()) {
if (need_unification_) {
- const auto& arr_first =
- checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
- const auto& arr_type = checked_cast<const
DictionaryType&>(*arr_first.type());
+ const auto& arr_type = checked_cast<const
DictionaryType&>(*chunked_array->type());
unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
size_t n_arrays = chunked_array->num_chunks();
@@ -561,11 +568,10 @@ class Converter_Dictionary : public Converter {
StopIfNotOk(unifier_->GetResult(&out_type_, &dictionary_));
} else {
- const auto& dict_array =
- checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+ const auto& dict_type = checked_cast<const
DictionaryType&>(*chunked_array->type());
- auto indices = dict_array.indices();
- switch (indices->type_id()) {
+ const auto& indices_type = *dict_type.index_type();
+ switch (indices_type.id()) {
case Type::UINT8:
case Type::INT8:
case Type::UINT16:
@@ -575,10 +581,18 @@ class Converter_Dictionary : public Converter {
break;
default:
cpp11::stop("Cannot convert Dictionary Array of type `%s` to R",
- dict_array.type()->ToString().c_str());
+ dict_type.ToString().c_str());
}
- dictionary_ = dict_array.dictionary();
+ if (chunked_array->num_chunks() > 0) {
+ // NeedUnification() returned false so we can safely assume the
+ // dictionary of the first chunk applies everywhere
+ const auto& dict_array =
+ checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary_ = dict_array.dictionary();
+ } else {
+ dictionary_ = CreateEmptyArray(dict_type.value_type());
+ }
}
}
@@ -680,9 +694,7 @@ class Converter_Dictionary : public Converter {
}
bool GetOrdered() const {
- return checked_cast<const DictionaryArray&>(*chunked_array_->chunk(0))
- .dict_type()
- ->ordered();
+ return checked_cast<const
DictionaryType&>(*chunked_array_->type()).ordered();
}
SEXP GetLevels() const {
@@ -705,13 +717,15 @@ class Converter_Struct : public Converter {
public:
explicit Converter_Struct(const std::shared_ptr<ChunkedArray>& chunked_array)
: Converter(chunked_array), converters() {
- auto first_array =
- checked_cast<const
arrow::StructArray*>(this->chunked_array_->chunk(0).get());
- int nf = chunked_array->type()->num_fields();
+ const auto& struct_type =
+ checked_cast<const arrow::StructType&>(*chunked_array->type());
+
+ int nf = struct_type.num_fields();
+ std::shared_ptr<arrow::Table> array_as_table =
+ ValueOrStop(arrow::Table::FromChunkedStructArray(chunked_array));
for (int i = 0; i < nf; i++) {
- converters.push_back(
-
Converter::Make(std::make_shared<ChunkedArray>(first_array->field(i))));
+ converters.push_back(Converter::Make(array_as_table->column(i)));
}
}
@@ -943,12 +957,7 @@ class Converter_List : public Converter {
? arrow::r::data::classes_arrow_list
: arrow::r::data::classes_arrow_large_list;
- // Build an empty array to match value_type
- std::unique_ptr<arrow::ArrayBuilder> builder;
- StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), value_type_, &builder));
-
- std::shared_ptr<arrow::Array> array;
- StopIfNotOk(builder->Finish(&array));
+ std::shared_ptr<arrow::Array> array = CreateEmptyArray(value_type_);
// convert to an R object to store as the list' ptype
res.attr(arrow::r::symbols::ptype) = Converter::Convert(array);
@@ -994,12 +1003,7 @@ class Converter_FixedSizeList : public Converter {
Rf_classgets(res, arrow::r::data::classes_arrow_fixed_size_list);
res.attr(arrow::r::symbols::list_size) = Rf_ScalarInteger(list_size_);
- // Build an empty array to match value_type
- std::unique_ptr<arrow::ArrayBuilder> builder;
- StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), value_type_, &builder));
-
- std::shared_ptr<arrow::Array> array;
- StopIfNotOk(builder->Finish(&array));
+ std::shared_ptr<arrow::Array> array = CreateEmptyArray(value_type_);
// convert to an R object to store as the list' ptype
res.attr(arrow::r::symbols::ptype) = Converter::Convert(array);
diff --git a/r/tests/testthat/test-chunked-array.R
b/r/tests/testthat/test-chunked-array.R
index 2d1febe..c931dde 100644
--- a/r/tests/testthat/test-chunked-array.R
+++ b/r/tests/testthat/test-chunked-array.R
@@ -206,18 +206,27 @@ test_that("ChunkedArray supports empty arrays
(ARROW-13761)", {
int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
uint64(), float32(), float64(), timestamp("ns"), binary(),
large_binary(), fixed_size_binary(32), date32(), date64(),
- decimal(4, 2)
+ decimal(4, 2), dictionary(), struct(x = int32())
)
empty_filter <- ChunkedArray$create(type = bool())
for (type in types) {
one_empty_chunk <- ChunkedArray$create(type = type)
expect_type_equal(one_empty_chunk$type, type)
- expect_identical(length(one_empty_chunk),
length(as.vector(one_empty_chunk)))
+ if (type != struct(x = int32())) {
+ expect_identical(length(one_empty_chunk),
length(as.vector(one_empty_chunk)))
+ } else {
+ # struct -> tbl and length(tbl) is num_columns instead of num_rows
+ expect_identical(length(as.vector(one_empty_chunk)), 1L)
+ }
zero_empty_chunks <- one_empty_chunk$Filter(empty_filter)
expect_equal(zero_empty_chunks$num_chunks, 0)
expect_type_equal(zero_empty_chunks$type, type)
- expect_identical(length(zero_empty_chunks),
length(as.vector(zero_empty_chunks)))
+ if (type != struct(x = int32())) {
+ expect_identical(length(zero_empty_chunks),
length(as.vector(zero_empty_chunks)))
+ } else {
+ expect_identical(length(as.vector(zero_empty_chunks)), 1L)
+ }
}
})