This is an automated email from the ASF dual-hosted git repository. brycemecum pushed a commit to branch maint-19.0.1.1-r in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 453c1f4fdcca3b3b75ec158efcce5bb2c782fd72 Author: Jonathan Keane <[email protected]> AuthorDate: Sat Apr 5 17:01:01 2025 -0500 GH-45949: [R] Fix CRAN warnings for 19.0.1 about compiled code (#45951) This gets rid of `OBJECT`, `DATAPTR` has been replaced with `INTEGER()`, `REAL()`, etc. though strings are more complicated. I will fully admit that this C++ is stretching my comfort zone, so might include obviously wrong things! CI is currently failing, but I'm not totally sure yet if that means the code changes here are wrong or if maybe these allow us to have slightly different assumptions about materialization (see https://github.com/apache/arrow/pull/45951#discussion_r2019675135) I've also requested reviews broadly for folks I know have been around this code before, I appreciate any effort that y'all can spare 🙏 * GitHub Issue: #45949 Lead-authored-by: Jonathan Keane <[email protected]> Co-authored-by: Dewey Dunnington <[email protected]> Signed-off-by: Jonathan Keane <[email protected]> --- r/src/altrep.cpp | 52 ++++++++++++++++++++++++++++-------------- r/src/arrow_types.h | 20 +++++++++++++++- r/src/r_to_arrow.cpp | 12 +++++----- r/tests/testthat/test-altrep.R | 14 +++++------- 4 files changed, 66 insertions(+), 32 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 90a459e19c..a4adf58997 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -115,7 +115,7 @@ const std::shared_ptr<ChunkedArray>& GetChunkedArray(SEXP alt) { // materialization is needed. // data2: starts as NULL, and becomes a standard R vector with the same // data if necessary: if materialization is needed, e.g. if we need -// to access its data pointer, with DATAPTR(). +// to access its data pointer, with INTEGER(), REAL(), etc. template <typename Impl> struct AltrepVectorBase { // store the Array as an external pointer in data1, mark as immutable @@ -220,7 +220,14 @@ struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sex SEXP copy = PROTECT(Rf_allocVector(sexp_type, size)); // copy the data from the array, through Get_region - Get_region(alt, 0, size, reinterpret_cast<c_type*>(DATAPTR(copy))); + if constexpr (std::is_same_v<c_type, double>) { + Get_region(alt, 0, size, REAL(copy)); + } else if constexpr (std::is_same_v<c_type, int>) { + Get_region(alt, 0, size, INTEGER(copy)); + } else { + static_assert(std::is_same_v<c_type, double> || std::is_same_v<c_type, int>, + "ALTREP not implemented for this c_type"); + } // store as data2, this is now considered materialized SetRepresentation(alt, copy); @@ -269,13 +276,27 @@ struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sex } // Otherwise we have to materialize and hand the pointer to data2 - return DATAPTR(Materialize(alt)); + if constexpr (std::is_same_v<c_type, double>) { + return REAL(Materialize(alt)); + } else if constexpr (std::is_same_v<c_type, int>) { + return INTEGER(Materialize(alt)); + } else { + static_assert(std::is_same_v<c_type, double> || std::is_same_v<c_type, int>, + "ALTREP not implemented for this c_type"); + } } // The value at position i static c_type Elt(SEXP alt, R_xlen_t i) { if (IsMaterialized(alt)) { - return reinterpret_cast<c_type*>(DATAPTR(Representation(alt)))[i]; + if constexpr (std::is_same_v<c_type, double>) { + return REAL(Representation(alt))[i]; + } else if constexpr (std::is_same_v<c_type, int>) { + return INTEGER(Representation(alt))[i]; + } else { + static_assert(std::is_same_v<c_type, double> || std::is_same_v<c_type, int>, + "ALTREP not implemented for this c_type"); + } } auto altrep_data = @@ -531,7 +552,7 @@ struct AltrepFactor : public AltrepVectorBase<AltrepFactor> { SEXP copy = PROTECT(Rf_allocVector(INTSXP, size)); // copy the data from the array, through Get_region - Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy))); + Get_region(alt, 0, size, INTEGER(copy)); // store as data2, this is now considered materialized SetRepresentation(alt, copy); @@ -552,7 +573,7 @@ struct AltrepFactor : public AltrepVectorBase<AltrepFactor> { return nullptr; } - static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); } + static void* Dataptr(SEXP alt, Rboolean writeable) { return INTEGER(Materialize(alt)); } static SEXP Duplicate(SEXP alt, Rboolean /* deep */) { // the representation integer vector @@ -892,7 +913,9 @@ struct AltrepVectorString : public AltrepVectorBase<AltrepVectorString<Type>> { return s; } - static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); } + static void* Dataptr(SEXP alt, Rboolean writeable) { + return const_cast<void*>(DATAPTR_RO(Materialize(alt))); + } static SEXP Materialize(SEXP alt) { if (Base::IsMaterialized(alt)) { @@ -931,7 +954,9 @@ struct AltrepVectorString : public AltrepVectorBase<AltrepVectorString<Type>> { } static const void* Dataptr_or_null(SEXP alt) { - if (Base::IsMaterialized(alt)) return DATAPTR(Representation(alt)); + if (Base::IsMaterialized(alt)) { + return DATAPTR_RO(alt); + } // otherwise give up return nullptr; @@ -1267,21 +1292,14 @@ sexp test_arrow_altrep_copy_by_dataptr(sexp x) { if (TYPEOF(x) == INTSXP) { cpp11::writable::integers out(Rf_xlength(x)); - int* ptr = reinterpret_cast<int*>(DATAPTR(x)); + int* ptr = INTEGER(x); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } return out; } else if (TYPEOF(x) == REALSXP) { cpp11::writable::doubles out(Rf_xlength(x)); - double* ptr = reinterpret_cast<double*>(DATAPTR(x)); - for (R_xlen_t i = 0; i < n; i++) { - out[i] = ptr[i]; - } - return out; - } else if (TYPEOF(x) == STRSXP) { - cpp11::writable::strings out(Rf_xlength(x)); - SEXP* ptr = reinterpret_cast<SEXP*>(DATAPTR(x)); + double* ptr = REAL(x); for (R_xlen_t i = 0; i < n; i++) { out[i] = ptr[i]; } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 05c8f6062d..9da02fd09f 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -173,7 +173,7 @@ template <typename RVector> class RBuffer : public MutableBuffer { public: explicit RBuffer(RVector vec) - : MutableBuffer(reinterpret_cast<uint8_t*>(DATAPTR(vec)), + : MutableBuffer(reinterpret_cast<uint8_t*>(getDataPointer(vec)), vec.size() * sizeof(typename RVector::value_type), arrow::CPUDevice::memory_manager(gc_memory_pool())), vec_(vec) {} @@ -181,6 +181,24 @@ class RBuffer : public MutableBuffer { private: // vec_ holds the memory RVector vec_; + + static void* getDataPointer(RVector& vec) { + if (TYPEOF(vec) == LGLSXP) { + return LOGICAL(vec); + } else if (TYPEOF(vec) == INTSXP) { + return INTEGER(vec); + } else if (TYPEOF(vec) == REALSXP) { + return REAL(vec); + } else if (TYPEOF(vec) == CPLXSXP) { + return COMPLEX(vec); + } else if (TYPEOF(vec) == STRSXP) { + // We don't want to expose the string data here, so we error + cpp11::stop("Operation not supported for string vectors."); + } else { + // raw + return RAW(vec); + } + } }; std::shared_ptr<arrow::DataType> InferArrowTypeFromFactor(SEXP); diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index d2db11e14a..32e186ceac 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1214,11 +1214,11 @@ bool can_reuse_memory(SEXP x, const std::shared_ptr<arrow::DataType>& type) { // because MakeSimpleArray below will force materialization switch (type->id()) { case Type::INT32: - return TYPEOF(x) == INTSXP && !OBJECT(x); + return TYPEOF(x) == INTSXP && !Rf_isObject(x); case Type::DOUBLE: - return TYPEOF(x) == REALSXP && !OBJECT(x); + return TYPEOF(x) == REALSXP && !Rf_isObject(x); case Type::INT8: - return TYPEOF(x) == RAWSXP && !OBJECT(x); + return TYPEOF(x) == RAWSXP && !Rf_isObject(x); case Type::INT64: return TYPEOF(x) == REALSXP && Rf_inherits(x, "integer64"); default: @@ -1412,17 +1412,17 @@ bool vector_from_r_memory(SEXP x, const std::shared_ptr<DataType>& type, switch (type->id()) { case Type::INT32: - return TYPEOF(x) == INTSXP && !OBJECT(x) && + return TYPEOF(x) == INTSXP && !Rf_isObject(x) && vector_from_r_memory_impl<cpp11::integers, Int32Type>(x, type, columns, j, tasks); case Type::DOUBLE: - return TYPEOF(x) == REALSXP && !OBJECT(x) && + return TYPEOF(x) == REALSXP && !Rf_isObject(x) && vector_from_r_memory_impl<cpp11::doubles, DoubleType>(x, type, columns, j, tasks); case Type::UINT8: - return TYPEOF(x) == RAWSXP && !OBJECT(x) && + return TYPEOF(x) == RAWSXP && !Rf_isObject(x) && vector_from_r_memory_impl<cpp11::raws, UInt8Type>(x, type, columns, j, tasks); diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R index 50bd40988e..d1e90b6b59 100644 --- a/r/tests/testthat/test-altrep.R +++ b/r/tests/testthat/test-altrep.R @@ -170,7 +170,7 @@ test_that("element access methods for int32 ALTREP with nulls", { expect_identical(test_arrow_altrep_copy_by_region(altrep, 123), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # because there are no nulls, DATAPTR() does not materialize + # because there are nulls, DATAPTR() does materialize expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) expect_true(test_arrow_altrep_is_materialized(altrep)) @@ -193,7 +193,7 @@ test_that("element access methods for double ALTREP with nulls", { expect_identical(test_arrow_altrep_copy_by_region(altrep, 123), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # because there are no nulls, DATAPTR() does not materialize + # because there are nulls, DATAPTR() does materialize expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) expect_true(test_arrow_altrep_is_materialized(altrep)) @@ -244,14 +244,13 @@ test_that("element access methods for character ALTREP", { expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # DATAPTR() should always materialize for strings - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) + # match() calls DATAPTR() internally which materializes the vector + match(altrep, c("1", "40", "999")) expect_true(test_arrow_altrep_is_materialized(altrep)) # test element access after materialization expect_true(test_arrow_altrep_is_materialized(altrep)) expect_identical(test_arrow_altrep_copy_by_element(altrep), original) - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) }) test_that("element access methods for character ALTREP from large_utf8()", { @@ -265,14 +264,13 @@ test_that("element access methods for character ALTREP from large_utf8()", { expect_identical(test_arrow_altrep_copy_by_element(altrep), original) expect_false(test_arrow_altrep_is_materialized(altrep)) - # DATAPTR() should always materialize for strings - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) + # match() calls DATAPTR() internally which materializes the vector + match(altrep, c("1", "40", "999")) expect_true(test_arrow_altrep_is_materialized(altrep)) # test element access after materialization expect_true(test_arrow_altrep_is_materialized(altrep)) expect_identical(test_arrow_altrep_copy_by_element(altrep), original) - expect_identical(test_arrow_altrep_copy_by_dataptr(altrep), original) }) test_that("empty vectors are not altrep", {
