romainfrancois commented on a change in pull request #11225:
URL: https://github.com/apache/arrow/pull/11225#discussion_r718411734



##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Min(SEXP alt_, Rboolean narm) { return MinMax<true>(alt_, narm); 
}
+
+  static SEXP Max(SEXP alt_, Rboolean narm) { return MinMax<false>(alt_, 
narm); }
+
+  static SEXP Sum(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto null_count = array_->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array_}, 
options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 
scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const 
Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
   }
 };
 template <int sexp_type>
-R_altrep_class_t AltrepArrayPrimitive<sexp_type>::class_t;
+R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
 
-// The methods below are how R interacts with the altrep objects.
-//
-// They all use the same pattern: create a C++ object of the
-// class parameter, and then call the method.
-template <typename AltrepClass>
-R_xlen_t Length(SEXP alt) {
-  return AltrepClass(alt).Length();
-}
+// Implementation for string arrays
+template <typename Type>
+struct AltrepVectorString : public AltrepVectorBase {
+  static R_altrep_class_t class_t;
+  using StringArrayType = typename TypeTraits<Type>::ArrayType;
 
-template <typename AltrepClass>
-Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
-                 void (*inspect_subtree)(SEXP, int, int, int)) {
-  return AltrepClass(alt).Inspect(pre, deep, pvec, inspect_subtree);
-}
+  static SEXP Make(const std::shared_ptr<Array>& array) {
+    return AltrepVectorBase::Make(class_t, array);
+  }
 
-template <typename AltrepClass>
-const void* Dataptr_or_null(SEXP alt) {
-  return AltrepClass(alt).Dataptr_or_null();
-}
+  // Get a single string, as a CHARSXP SEXP
+  // data2 is initialized, the CHARSXP is generated from the Array data
+  // and stored in data2, so that this only needs to expand a given string once
+  static SEXP Elt(SEXP alt_, R_xlen_t i) {
+    if (IsMaterialized(alt_)) {
+      return STRING_ELT(R_altrep_data2(alt_), i);
+    }
 
-template <typename AltrepClass>
-void* Dataptr(SEXP alt, Rboolean writeable) {
-  return AltrepClass(alt).Dataptr(writeable);
-}
+    // nul -> to NA_STRING
+    if (array(alt_)->IsNull(i)) {
+      return NA_STRING;
+    }
 
-template <typename AltrepClass>
-SEXP Duplicate(SEXP alt, Rboolean deep) {
-  return AltrepClass(alt).Duplicate(deep);
-}
+    // not nul, but we need care about embedded nuls
+    // this needs to call an R api function: Rf_mkCharLenCE() that
+    // might jump, i.e. throw an R error, which is dealt with using
+    // BEGIN_CPP11/END_CPP11/cpp11::unwind_protect()
+
+    BEGIN_CPP11
+
+    // C++ objects that will properly be destroyed by END_CPP11
+    // before it resumes the unwinding - and perhaps let
+    // the R error pass through
+    auto array_ = array(alt_);
+    auto view = 
internal::checked_cast<StringArrayType*>(array_.get())->GetView(i);
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    std::string stripped_string;
+
+    // both cases might jump, although it's less likely when
+    // nuls are stripped, but still we need the unwind protection
+    // so that C++ objects here are correctly destructed, whilst errors
+    // properly pass through to the R side

Review comment:
       We need to handle both C++ destructors and R's jumping. `BEGIN_CPP11` / 
`END_CPP11` is essentially a C++ try/catch that knows how to deal with special 
exceptions `cpp11::unwind_exception`
   
   These exceptions are thrown by `unwind_protect()` when the *C only* code it 
wraps jumps. When there is a jump, we have an opportunity to do any C++ cleanup 
(mostly destructors) before the unwinding is resumed so that the error is 
propagated back to the R side. 
   
   In typical code that uses `cpp11` we don't see the `BEGIN_CPP11` / 
`END_CPP11` because they get generated by the `[[::export]]` decorations, but 
here we are not defining functions that we call from the R side, but rather 
defining functions that are called directly by R internals/altrep...  
   
   
https://github.com/r-lib/cpp11/blob/master/inst/include/cpp11/declarations.hpp
   
   ```cpp
   #ifdef HAS_UNWIND_PROTECT
   #define CPP11_UNWIND R_ContinueUnwind(err);
   #else
   #define CPP11_UNWIND \
     do {               \
     } while (false);
   #endif
   
   #define CPP11_ERROR_BUFSIZE 8192
   
   #define BEGIN_CPP11                   \
     SEXP err = R_NilValue;              \
     char buf[CPP11_ERROR_BUFSIZE] = ""; \
     try {
   #define END_CPP11                                               \
     }                                                             \
     catch (cpp11::unwind_exception & e) {                         \
       err = e.token;                                              \
     }                                                             \
     catch (std::exception & e) {                                  \
       strncpy(buf, e.what(), sizeof(buf) - 1);                    \
     }                                                             \
     catch (...) {                                                 \
       strncpy(buf, "C++ error (unknown cause)", sizeof(buf) - 1); \
     }                                                             \
     if (buf[0] != '\0') {                                         \
       Rf_errorcall(R_NilValue, "%s", buf);                        \
     } else if (err != R_NilValue) {                               \
       CPP11_UNWIND                                                \
     }                                                             \
     return R_NilValue;
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to