This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 44b5d00400 GH-34211: [R] Make sure Arrow arrays are unmaterialized 
before attempting to access the underlying ChunkedArray (#34489)
44b5d00400 is described below

commit 44b5d004000a33734b379cbb69349e32b7aa114c
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Mar 8 10:42:27 2023 -0400

    GH-34211: [R] Make sure Arrow arrays are unmaterialized before attempting 
to access the underlying ChunkedArray (#34489)
    
    ### Rationale for this change
    
    When we attempt to re-use an object that Arrow itself created previously by 
wrapping a chunked array, we will get a crash if this object has been 
materialized (i.e., R values have been accessed and the ChunkedArray reference 
deleted). This behaviour changed between 10.0.0 and 11.0.0 because I redid the 
ALTREP implementation just after the 10.0.0 release.
    
    The following test crashes R on main and 11.0.0 but passes after this PR:
    
    ``` r
    library(arrow, warn.conflicts = FALSE)
    #> Some features are not enabled in this build of Arrow. Run `arrow_info()` 
for more information.
    library(testthat, warn.conflicts = FALSE)
    withr::local_namespace("arrow")
    
    test_that("Materialized ALTREP arrays don't cause arrow to crash when 
attempting to bypass", {
      a_int <- Array$create(c(1L, 2L, 3L))
      b_int <- a_int$as_vector()
      expect_true(is_arrow_altrep(b_int))
      expect_false(test_arrow_altrep_is_materialized(b_int))
    
      # Some operations that use altrep bypass
      expect_equal(infer_type(b_int), int32())
      expect_equal(as_arrow_array(b_int), a_int)
    
      # Still shouldn't have materialized yet
      expect_false(test_arrow_altrep_is_materialized(b_int))
    
      # Force it to materialize and check again
      test_arrow_altrep_force_materialize(b_int)
      expect_true(test_arrow_altrep_is_materialized(b_int))
      expect_equal(infer_type(b_int), int32())
      expect_equal(as_arrow_array(b_int), a_int)
    })
    #> Test passed 🎉
    ```
    
    ### What changes are included in this PR?
    
    We used a function called `is_arrow_altrep()` to check if we could safely 
access the ChunkedArray reference; however, *materialized* ALTREP arrays still 
cause this return `true`. I added a new function 
`is_unmaterialized_arrow_altrep()` and replaced usage that depended on the 
ChunkedArray actually existing to use it.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    No.
    * Closes: #34211
    
    Authored-by: Dewey Dunnington <[email protected]>
    Signed-off-by: Nic Crane <[email protected]>
---
 r/src/altrep.cpp               |  8 +++++++-
 r/src/array_to_vector.cpp      |  4 ++--
 r/src/arrow_types.h            |  1 +
 r/src/r_to_arrow.cpp           |  2 +-
 r/src/type_infer.cpp           |  2 +-
 r/tests/testthat/test-altrep.R | 21 ++++++++++++++++++++-
 6 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp
index 37a45e42bc..a6b52fe9b6 100644
--- a/r/src/altrep.cpp
+++ b/r/src/altrep.cpp
@@ -1052,8 +1052,12 @@ bool is_arrow_altrep(SEXP x) {
   return false;
 }
 
+bool is_unmaterialized_arrow_altrep(SEXP x) {
+  return is_arrow_altrep(x) && R_altrep_data1(x) != R_NilValue;
+}
+
 std::shared_ptr<ChunkedArray> vec_to_arrow_altrep_bypass(SEXP x) {
-  if (is_arrow_altrep(x) && R_altrep_data1(x) != R_NilValue) {
+  if (is_unmaterialized_arrow_altrep(x)) {
     return GetChunkedArray(x);
   }
 
@@ -1079,6 +1083,8 @@ bool is_arrow_altrep(SEXP) { return false; }
 
 std::shared_ptr<ChunkedArray> vec_to_arrow_altrep_bypass(SEXP x) { return 
nullptr; }
 
+bool is_unmaterialized_arrow_altrep(SEXP) { return false; }
+
 }  // namespace altrep
 }  // namespace r
 }  // namespace arrow
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index 9cf539bb41..bf026d2723 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -761,7 +761,7 @@ class Converter_Struct : public Converter {
       SEXP data_i = VECTOR_ELT(data, i);
 
       // only ingest if the column is not altrep
-      if (!altrep::is_arrow_altrep(data_i)) {
+      if (!altrep::is_unmaterialized_arrow_altrep(data_i)) {
         StopIfNotOk(converters[i]->Ingest_all_nulls(data_i, start, n));
       }
     }
@@ -778,7 +778,7 @@ class Converter_Struct : public Converter {
       SEXP data_i = VECTOR_ELT(data, i);
 
       // only ingest if the column is not altrep
-      if (!altrep::is_arrow_altrep(data_i)) {
+      if (!altrep::is_unmaterialized_arrow_altrep(data_i)) {
         StopIfNotOk(converters[i]->Ingest_some_nulls(VECTOR_ELT(data, i), 
arrays[i],
                                                      start, n, chunk_index));
       }
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index 49283bd224..2f6597ad8f 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -239,6 +239,7 @@ void Init_Altrep_classes(DllInfo* dll);
 
 SEXP MakeAltrepVector(const std::shared_ptr<ChunkedArray>& chunked_array);
 bool is_arrow_altrep(SEXP x);
+bool is_unmaterialized_arrow_altrep(SEXP x);
 std::shared_ptr<ChunkedArray> vec_to_arrow_altrep_bypass(SEXP);
 
 }  // namespace altrep
diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp
index 89b4ba2e05..19c23973a6 100644
--- a/r/src/r_to_arrow.cpp
+++ b/r/src/r_to_arrow.cpp
@@ -1498,7 +1498,7 @@ std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst, 
SEXP schema_sxp,
     } else if (Rf_inherits(x, "Array")) {
       columns[j] = std::make_shared<arrow::ChunkedArray>(
           cpp11::as_cpp<std::shared_ptr<arrow::Array>>(x));
-    } else if (arrow::r::altrep::is_arrow_altrep(x)) {
+    } else if (arrow::r::altrep::is_unmaterialized_arrow_altrep(x)) {
       columns[j] = arrow::r::altrep::vec_to_arrow_altrep_bypass(x);
     } else {
       arrow::r::RConversionOptions options;
diff --git a/r/src/type_infer.cpp b/r/src/type_infer.cpp
index ac52ed03e8..e61320e117 100644
--- a/r/src/type_infer.cpp
+++ b/r/src/type_infer.cpp
@@ -185,7 +185,7 @@ std::shared_ptr<arrow::DataType> 
InferArrowTypeFromVector<VECSXP>(SEXP x) {
 }
 
 std::shared_ptr<arrow::DataType> InferArrowType(SEXP x) {
-  if (arrow::r::altrep::is_arrow_altrep(x)) {
+  if (arrow::r::altrep::is_unmaterialized_arrow_altrep(x)) {
     return arrow::r::altrep::vec_to_arrow_altrep_bypass(x)->type();
   }
 
diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R
index 4eb316d446..7a66d0e778 100644
--- a/r/tests/testthat/test-altrep.R
+++ b/r/tests/testthat/test-altrep.R
@@ -607,7 +607,6 @@ test_that("element access methods for ALTREP factors", {
   }
 })
 
-
 test_that("R checks for bounds", {
   v_int <- Array$create(c(1, 2, 3))$as_vector()
   v_dbl <- Array$create(c(1L, 2L, 3L))$as_vector()
@@ -631,3 +630,23 @@ test_that("Operations on altrep R vectors don't modify the 
original", {
   c_int <- -b_int
   expect_false(isTRUE(all.equal(b_int, c_int)))
 })
+
+test_that("Materialized ALTREP arrays don't cause arrow to crash when 
attempting to bypass", {
+  a_int <- Array$create(c(1L, 2L, 3L))
+  b_int <- a_int$as_vector()
+  expect_true(is_arrow_altrep(b_int))
+  expect_false(test_arrow_altrep_is_materialized(b_int))
+
+  # Some operations that use altrep bypass
+  expect_equal(infer_type(b_int), int32())
+  expect_equal(as_arrow_array(b_int), a_int)
+
+  # Still shouldn't have materialized yet
+  expect_false(test_arrow_altrep_is_materialized(b_int))
+
+  # Force it to materialize and check again
+  test_arrow_altrep_force_materialize(b_int)
+  expect_true(test_arrow_altrep_is_materialized(b_int))
+  expect_equal(infer_type(b_int), int32())
+  expect_equal(as_arrow_array(b_int), a_int)
+})

Reply via email to