romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r699105566



##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, 
double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& 
array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", 
array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());

Review comment:
       In a way yes, but the user should not want that because the object is 
marked as immutable (in the R sense). Unfortunately, this is more of a 
guideline than something that is enforced by R. 
   
   Accessing data read-only is somewhat new in R, and lots of internal R code 
would still call `DATAPTR()` even though a read)-only equivalent would be 
sufficient. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to