This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 44b5d00400 GH-34211: [R] Make sure Arrow arrays are unmaterialized
before attempting to access the underlying ChunkedArray (#34489)
44b5d00400 is described below
commit 44b5d004000a33734b379cbb69349e32b7aa114c
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Mar 8 10:42:27 2023 -0400
GH-34211: [R] Make sure Arrow arrays are unmaterialized before attempting
to access the underlying ChunkedArray (#34489)
### Rationale for this change
When we attempt to re-use an object that Arrow itself created previously by
wrapping a chunked array, we will get a crash if this object has been
materialized (i.e., R values have been accessed and the ChunkedArray reference
deleted). This behaviour changed between 10.0.0 and 11.0.0 because I redid the
ALTREP implementation just after the 10.0.0 release.
The following test crashes R on main and 11.0.0 but passes after this PR:
``` r
library(arrow, warn.conflicts = FALSE)
#> Some features are not enabled in this build of Arrow. Run `arrow_info()`
for more information.
library(testthat, warn.conflicts = FALSE)
withr::local_namespace("arrow")
test_that("Materialized ALTREP arrays don't cause arrow to crash when
attempting to bypass", {
a_int <- Array$create(c(1L, 2L, 3L))
b_int <- a_int$as_vector()
expect_true(is_arrow_altrep(b_int))
expect_false(test_arrow_altrep_is_materialized(b_int))
# Some operations that use altrep bypass
expect_equal(infer_type(b_int), int32())
expect_equal(as_arrow_array(b_int), a_int)
# Still shouldn't have materialized yet
expect_false(test_arrow_altrep_is_materialized(b_int))
# Force it to materialize and check again
test_arrow_altrep_force_materialize(b_int)
expect_true(test_arrow_altrep_is_materialized(b_int))
expect_equal(infer_type(b_int), int32())
expect_equal(as_arrow_array(b_int), a_int)
})
#> Test passed 🎉
```
### What changes are included in this PR?
We used a function called `is_arrow_altrep()` to check if we could safely
access the ChunkedArray reference; however, *materialized* ALTREP arrays still
cause this return `true`. I added a new function
`is_unmaterialized_arrow_altrep()` and replaced usage that depended on the
ChunkedArray actually existing to use it.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* Closes: #34211
Authored-by: Dewey Dunnington <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/src/altrep.cpp | 8 +++++++-
r/src/array_to_vector.cpp | 4 ++--
r/src/arrow_types.h | 1 +
r/src/r_to_arrow.cpp | 2 +-
r/src/type_infer.cpp | 2 +-
r/tests/testthat/test-altrep.R | 21 ++++++++++++++++++++-
6 files changed, 32 insertions(+), 6 deletions(-)
diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp
index 37a45e42bc..a6b52fe9b6 100644
--- a/r/src/altrep.cpp
+++ b/r/src/altrep.cpp
@@ -1052,8 +1052,12 @@ bool is_arrow_altrep(SEXP x) {
return false;
}
+bool is_unmaterialized_arrow_altrep(SEXP x) {
+ return is_arrow_altrep(x) && R_altrep_data1(x) != R_NilValue;
+}
+
std::shared_ptr<ChunkedArray> vec_to_arrow_altrep_bypass(SEXP x) {
- if (is_arrow_altrep(x) && R_altrep_data1(x) != R_NilValue) {
+ if (is_unmaterialized_arrow_altrep(x)) {
return GetChunkedArray(x);
}
@@ -1079,6 +1083,8 @@ bool is_arrow_altrep(SEXP) { return false; }
std::shared_ptr<ChunkedArray> vec_to_arrow_altrep_bypass(SEXP x) { return
nullptr; }
+bool is_unmaterialized_arrow_altrep(SEXP) { return false; }
+
} // namespace altrep
} // namespace r
} // namespace arrow
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index 9cf539bb41..bf026d2723 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -761,7 +761,7 @@ class Converter_Struct : public Converter {
SEXP data_i = VECTOR_ELT(data, i);
// only ingest if the column is not altrep
- if (!altrep::is_arrow_altrep(data_i)) {
+ if (!altrep::is_unmaterialized_arrow_altrep(data_i)) {
StopIfNotOk(converters[i]->Ingest_all_nulls(data_i, start, n));
}
}
@@ -778,7 +778,7 @@ class Converter_Struct : public Converter {
SEXP data_i = VECTOR_ELT(data, i);
// only ingest if the column is not altrep
- if (!altrep::is_arrow_altrep(data_i)) {
+ if (!altrep::is_unmaterialized_arrow_altrep(data_i)) {
StopIfNotOk(converters[i]->Ingest_some_nulls(VECTOR_ELT(data, i),
arrays[i],
start, n, chunk_index));
}
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index 49283bd224..2f6597ad8f 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -239,6 +239,7 @@ void Init_Altrep_classes(DllInfo* dll);
SEXP MakeAltrepVector(const std::shared_ptr<ChunkedArray>& chunked_array);
bool is_arrow_altrep(SEXP x);
+bool is_unmaterialized_arrow_altrep(SEXP x);
std::shared_ptr<ChunkedArray> vec_to_arrow_altrep_bypass(SEXP);
} // namespace altrep
diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp
index 89b4ba2e05..19c23973a6 100644
--- a/r/src/r_to_arrow.cpp
+++ b/r/src/r_to_arrow.cpp
@@ -1498,7 +1498,7 @@ std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst,
SEXP schema_sxp,
} else if (Rf_inherits(x, "Array")) {
columns[j] = std::make_shared<arrow::ChunkedArray>(
cpp11::as_cpp<std::shared_ptr<arrow::Array>>(x));
- } else if (arrow::r::altrep::is_arrow_altrep(x)) {
+ } else if (arrow::r::altrep::is_unmaterialized_arrow_altrep(x)) {
columns[j] = arrow::r::altrep::vec_to_arrow_altrep_bypass(x);
} else {
arrow::r::RConversionOptions options;
diff --git a/r/src/type_infer.cpp b/r/src/type_infer.cpp
index ac52ed03e8..e61320e117 100644
--- a/r/src/type_infer.cpp
+++ b/r/src/type_infer.cpp
@@ -185,7 +185,7 @@ std::shared_ptr<arrow::DataType>
InferArrowTypeFromVector<VECSXP>(SEXP x) {
}
std::shared_ptr<arrow::DataType> InferArrowType(SEXP x) {
- if (arrow::r::altrep::is_arrow_altrep(x)) {
+ if (arrow::r::altrep::is_unmaterialized_arrow_altrep(x)) {
return arrow::r::altrep::vec_to_arrow_altrep_bypass(x)->type();
}
diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R
index 4eb316d446..7a66d0e778 100644
--- a/r/tests/testthat/test-altrep.R
+++ b/r/tests/testthat/test-altrep.R
@@ -607,7 +607,6 @@ test_that("element access methods for ALTREP factors", {
}
})
-
test_that("R checks for bounds", {
v_int <- Array$create(c(1, 2, 3))$as_vector()
v_dbl <- Array$create(c(1L, 2L, 3L))$as_vector()
@@ -631,3 +630,23 @@ test_that("Operations on altrep R vectors don't modify the
original", {
c_int <- -b_int
expect_false(isTRUE(all.equal(b_int, c_int)))
})
+
+test_that("Materialized ALTREP arrays don't cause arrow to crash when
attempting to bypass", {
+ a_int <- Array$create(c(1L, 2L, 3L))
+ b_int <- a_int$as_vector()
+ expect_true(is_arrow_altrep(b_int))
+ expect_false(test_arrow_altrep_is_materialized(b_int))
+
+ # Some operations that use altrep bypass
+ expect_equal(infer_type(b_int), int32())
+ expect_equal(as_arrow_array(b_int), a_int)
+
+ # Still shouldn't have materialized yet
+ expect_false(test_arrow_altrep_is_materialized(b_int))
+
+ # Force it to materialize and check again
+ test_arrow_altrep_force_materialize(b_int)
+ expect_true(test_arrow_altrep_is_materialized(b_int))
+ expect_equal(infer_type(b_int), int32())
+ expect_equal(as_arrow_array(b_int), a_int)
+})