pitrou commented on a change in pull request #11225:
URL: https://github.com/apache/arrow/pull/11225#discussion_r718175530



##########
File path: r/src/arrow_cpp11.h
##########
@@ -374,3 +374,5 @@ SEXP as_sexp(const std::shared_ptr<T>& ptr) {
 }
 
 }  // namespace cpp11
+
+bool is_altrep(SEXP x);

Review comment:
       Is there a reason to put this function in the top-level namespace? 
Generally this should be avoided.

##########
File path: r/src/altrep.cpp
##########
@@ -70,135 +69,125 @@ R_xlen_t Standard_Get_region<int>(SEXP data2, R_xlen_t i, 
R_xlen_t n, int* buf)
   return INTEGER_GET_REGION(data2, i, n, buf);
 }
 
-// altrep R vector shadowing an Array.
+void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
+using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
+
+// base class for all altrep vectors
+//
+// The altrep vector stores the Array as an external pointer in data1
+// Implementation classes AltrepVectorPrimitive<> and AltrepVectorString
+// also use data2
+struct AltrepVectorBase {
+  // store the Array as an external pointer in data1, mark as immutable
+  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& 
array) {
+    SEXP alt_ =
+        R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)), 
R_NilValue);
+    MARK_NOT_MUTABLE(alt_);
+
+    return alt_;
+  }
+
+  // the Array that is being wrapped by the altrep object
+  static const std::shared_ptr<Array>& array(SEXP alt_) {

Review comment:
       I'm not sure why you sometimes append underscores to variable or 
parameter names, but according to the [C++ style 
guide](https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci)
 this should only be done on (class, struct) member variables.

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Min(SEXP alt_, Rboolean narm) { return MinMax<true>(alt_, narm); 
}
+
+  static SEXP Max(SEXP alt_, Rboolean narm) { return MinMax<false>(alt_, 
narm); }
+
+  static SEXP Sum(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;

Review comment:
       Call this `out_type`?

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, 
options.get()));

Review comment:
       Note that we now have compute functions named "min" and "max", so you 
don't need to unwrap the struct manually.

##########
File path: r/src/altrep.cpp
##########
@@ -251,7 +234,7 @@ struct AltrepArrayPrimitive {
     // array has nulls
     //
     // This only materialize the region, into buf. Not the entire vector.
-    auto slice = array()->Slice(i, n);
+    auto slice = array(alt_)->Slice(i, n);
     R_xlen_t ncopy = slice->length();
 
     // first copy the data buffer

Review comment:
       As with `Elt`, should we protect against NA singleton confusion (a 
non-null C value that happens to equal the R NA singleton)?

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {

Review comment:
       I'm not sure why you need to both special-case this explicitly _and_ 
instantiate a ScalarAggregateOptions. You probably need only one of these.

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(

Review comment:
       1) You don't really need to return a `shared_ptr` since the options 
object won't outlive the caller.
   2) Call this `MakeScalarAggregateOptions`?

##########
File path: r/src/altrep.cpp
##########
@@ -70,135 +69,125 @@ R_xlen_t Standard_Get_region<int>(SEXP data2, R_xlen_t i, 
R_xlen_t n, int* buf)
   return INTEGER_GET_REGION(data2, i, n, buf);
 }
 
-// altrep R vector shadowing an Array.
+void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }

Review comment:
       Generally all helper code that's not exposed in a `.h` should go into 
the anonymous namespace (the equivalent of marking functions `static`). Could 
you do that here?

##########
File path: r/src/altrep.cpp
##########
@@ -70,135 +69,125 @@ R_xlen_t Standard_Get_region<int>(SEXP data2, R_xlen_t i, 
R_xlen_t n, int* buf)
   return INTEGER_GET_REGION(data2, i, n, buf);
 }
 
-// altrep R vector shadowing an Array.
+void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
+using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
+
+// base class for all altrep vectors
+//
+// The altrep vector stores the Array as an external pointer in data1
+// Implementation classes AltrepVectorPrimitive<> and AltrepVectorString
+// also use data2
+struct AltrepVectorBase {
+  // store the Array as an external pointer in data1, mark as immutable
+  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& 
array) {
+    SEXP alt_ =
+        R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)), 
R_NilValue);
+    MARK_NOT_MUTABLE(alt_);
+
+    return alt_;
+  }
+
+  // the Array that is being wrapped by the altrep object
+  static const std::shared_ptr<Array>& array(SEXP alt_) {
+    return *Pointer(R_altrep_data1(alt_));
+  }
+
+  // Is the vector materialized, i.e. does the data2 slot contain a
+  // standard R vector with the same data as the array.
+  static bool IsMaterialized(SEXP alt_) { return 
!Rf_isNull(R_altrep_data2(alt_)); }
+
+  static R_xlen_t Length(SEXP alt_) { return array(alt_)->length(); }
+
+  static int No_NA(SEXP alt_) { return array(alt_)->null_count() == 0; }
+
+  static int Is_sorted(SEXP alt_) { return UNKNOWN_SORTEDNESS; }
+
+  // What gets printed on .Internal(inspect(<the altrep object>))
+  static Rboolean Inspect(SEXP alt_, int pre, int deep, int pvec,
+                          void (*inspect_subtree)(SEXP, int, int, int)) {
+    const auto& array_ = array(alt_);
+    Rprintf("arrow::Array<%s, %d nulls> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(), 
array_->length(),
+            array_.get());
+    inspect_subtree(R_altrep_data1(alt_), pre, deep + 1, pvec);

Review comment:
       I'm not sure: is it useful to recurse into data1 (which is an opaque C++ 
pointer as far as R is concerned)?

##########
File path: r/src/altrep.cpp
##########
@@ -218,29 +207,23 @@ struct AltrepArrayPrimitive {
     // Simply stop() when `writeable = TRUE` is too strong, e.g. this fails

Review comment:
       I'm not sure I understand this comment about stopping. Does it still 
apply?

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Min(SEXP alt_, Rboolean narm) { return MinMax<true>(alt_, narm); 
}
+
+  static SEXP Max(SEXP alt_, Rboolean narm) { return MinMax<false>(alt_, 
narm); }
+
+  static SEXP Sum(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto null_count = array_->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array_}, 
options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 
scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const 
Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
   }
 };
 template <int sexp_type>
-R_altrep_class_t AltrepArrayPrimitive<sexp_type>::class_t;
+R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
 
-// The methods below are how R interacts with the altrep objects.
-//
-// They all use the same pattern: create a C++ object of the
-// class parameter, and then call the method.
-template <typename AltrepClass>
-R_xlen_t Length(SEXP alt) {
-  return AltrepClass(alt).Length();
-}
+// Implementation for string arrays
+template <typename Type>
+struct AltrepVectorString : public AltrepVectorBase {
+  static R_altrep_class_t class_t;
+  using StringArrayType = typename TypeTraits<Type>::ArrayType;
 
-template <typename AltrepClass>
-Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
-                 void (*inspect_subtree)(SEXP, int, int, int)) {
-  return AltrepClass(alt).Inspect(pre, deep, pvec, inspect_subtree);
-}
+  static SEXP Make(const std::shared_ptr<Array>& array) {
+    return AltrepVectorBase::Make(class_t, array);
+  }
 
-template <typename AltrepClass>
-const void* Dataptr_or_null(SEXP alt) {
-  return AltrepClass(alt).Dataptr_or_null();
-}
+  // Get a single string, as a CHARSXP SEXP
+  // data2 is initialized, the CHARSXP is generated from the Array data
+  // and stored in data2, so that this only needs to expand a given string once
+  static SEXP Elt(SEXP alt_, R_xlen_t i) {
+    if (IsMaterialized(alt_)) {
+      return STRING_ELT(R_altrep_data2(alt_), i);
+    }
 
-template <typename AltrepClass>
-void* Dataptr(SEXP alt, Rboolean writeable) {
-  return AltrepClass(alt).Dataptr(writeable);
-}
+    // nul -> to NA_STRING
+    if (array(alt_)->IsNull(i)) {
+      return NA_STRING;
+    }
 
-template <typename AltrepClass>
-SEXP Duplicate(SEXP alt, Rboolean deep) {
-  return AltrepClass(alt).Duplicate(deep);
-}
+    // not nul, but we need care about embedded nuls
+    // this needs to call an R api function: Rf_mkCharLenCE() that
+    // might jump, i.e. throw an R error, which is dealt with using
+    // BEGIN_CPP11/END_CPP11/cpp11::unwind_protect()
+
+    BEGIN_CPP11
+
+    // C++ objects that will properly be destroyed by END_CPP11
+    // before it resumes the unwinding - and perhaps let
+    // the R error pass through
+    auto array_ = array(alt_);
+    auto view = 
internal::checked_cast<StringArrayType*>(array_.get())->GetView(i);
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    std::string stripped_string;
+
+    // both cases might jump, although it's less likely when
+    // nuls are stripped, but still we need the unwind protection
+    // so that C++ objects here are correctly destructed, whilst errors
+    // properly pass through to the R side
+    SEXP s;
+    cpp11::unwind_protect([&]() {
+      if (strip_out_nuls) {
+        s = r_string_from_view_strip_nul(view, stripped_string, 
&nul_was_stripped);
+      } else {
+        s = r_string_from_view_keep_nul(view, stripped_string);
+      }
 
-template <typename AltrepClass>
-auto Elt(SEXP alt, R_xlen_t i) -> decltype(AltrepClass(alt).Elt(i)) {
-  return AltrepClass(alt).Elt(i);
-}
+      if (nul_was_stripped) {
+        cpp11::warning("Stripping '\\0' (nul) from character vector");
+      }
+    });
+    return s;
 
-template <typename AltrepClass>
-int No_NA(SEXP alt) {
-  return AltrepClass(alt).No_NA();
-}
+    END_CPP11
+  }
 
-template <typename AltrepClass>
-int Is_sorted(SEXP alt) {
-  return AltrepClass(alt).Is_sorted();
-}
+  static void* Dataptr(SEXP alt_, Rboolean writeable) {
+    return DATAPTR(Materialize(alt_));
+  }
 
-template <typename AltrepClass>
-R_xlen_t Get_region(SEXP alt, R_xlen_t i, R_xlen_t n, typename 
AltrepClass::c_type* buf) {
-  return AltrepClass(alt).Get_region(i, n, buf);
-}
+  static SEXP Materialize(SEXP alt_) {
+    if (IsMaterialized(alt_)) {
+      return R_altrep_data2(alt_);
+    }
 
-template <typename AltrepClass>
-SEXP Serialized_state(SEXP alt) {
-  return AltrepClass(alt).Serialized_state();
-}
+    BEGIN_CPP11
 
-template <typename AltrepClass>
-SEXP Unserialize(SEXP class_, SEXP state) {
-  return AltrepClass::Unserialize(class_, state);
-}
+    auto array_ = array(alt_);
+    R_xlen_t n = array_->length();
+    SEXP data2_ = PROTECT(Rf_allocVector(STRSXP, n));
+    MARK_NOT_MUTABLE(data2_);
 
-template <typename AltrepClass>
-SEXP Coerce(SEXP alt, int type) {
-  return AltrepClass(alt).Coerce(type);
-}
+    std::string stripped_string;
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    auto* string_array = 
internal::checked_cast<StringArrayType*>(array_.get());
+    util::string_view view;
 
-static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
-    const std::shared_ptr<Array>& array, bool na_rm) {
-  auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
-      arrow::compute::ScalarAggregateOptions::Defaults());
-  options->min_count = 0;
-  options->skip_nulls = na_rm;
-  return options;
-}
+    cpp11::unwind_protect([&]() {
+      for (R_xlen_t i = 0; i < n; i++) {
+        SEXP s = STRING_ELT(data2_, i);
+
+        // nul, so materialize to NA_STRING
+        if (array_->IsNull(i)) {
+          SET_STRING_ELT(data2_, i, NA_STRING);
+          continue;
+        }
+
+        // materialize a real string, with care about potential jump
+        // from Rf_mkCharLenCE()
+        view = string_array->GetView(i);
+        if (strip_out_nuls) {
+          s = r_string_from_view_strip_nul(view, stripped_string, 
&nul_was_stripped);
+        } else {
+          s = r_string_from_view_keep_nul(view, stripped_string);
+        }
+        SET_STRING_ELT(data2_, i, s);
+      }
+
+      if (nul_was_stripped) {
+        cpp11::warning("Stripping '\\0' (nul) from character vector");
+      }
+    });
 
-template <int sexp_type, bool Min>
-SEXP MinMax(SEXP alt, Rboolean narm) {
-  using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
-  using scalar_type =
-      typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+    // only set to data2 if all the values have been converted
+    R_set_altrep_data2(alt_, data2_);
+    UNPROTECT(1);
 
-  AltrepArrayPrimitive<sexp_type> alt_(alt);
+    return data2_;
 
-  const auto& array = alt_.array();
-  bool na_rm = narm == TRUE;
-  auto n = array->length();
-  auto null_count = array->null_count();
-  if ((na_rm || n == 0) && null_count == n) {
-    return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    END_CPP11
   }
-  if (!na_rm && null_count > 0) {
-    return cpp11::as_sexp(cpp11::na<data_type>());
+
+  static const void* Dataptr_or_null(SEXP alt_) {
+    // only valid if all strings have been materialized
+    // i.e. it is not enough for data2 to be not NULL
+    if (IsMaterialized(alt_)) return DATAPTR(R_altrep_data2(alt_));
+
+    // otherwise give up
+    return NULL;
   }
 
-  auto options = NaRmOptions(array, na_rm);
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
 
-  const auto& minmax =
-      ValueOrStop(arrow::compute::CallFunction("min_max", {array}, 
options.get()));
-  const auto& minmax_scalar =
-      internal::checked_cast<const StructScalar&>(*minmax.scalar());
+  static SEXP Serialized_state(SEXP alt_) { return Materialize(alt_); }
 
-  const auto& result_scalar = internal::checked_cast<const scalar_type&>(
-      *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
-  return cpp11::as_sexp(result_scalar.value);
-}
+  static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-template <int sexp_type>
-SEXP Min(SEXP alt, Rboolean narm) {
-  return MinMax<sexp_type, true>(alt, narm);
-}
+  static SEXP Duplicate(SEXP alt_, Rboolean /* deep */) {
+    return Rf_lazy_duplicate(Materialize(alt_));
+  }
 
-template <int sexp_type>
-SEXP Max(SEXP alt, Rboolean narm) {
-  return MinMax<sexp_type, false>(alt, narm);
-}
+  // static method so that this can error without concerns of
+  // destruction for the
+  static void Set_elt(SEXP alt_, R_xlen_t i, SEXP v) {
+    Rf_error("ALTSTRING objects of type <arrow::array_string_vector> are 
immutable");
+  }
 
-template <int sexp_type>
-static SEXP Sum(SEXP alt, Rboolean narm) {
-  using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+  // this is called from an unwind_protect() block because
+  // r_string_from_view might jump
+  static SEXP r_string_from_view_strip_nul(arrow::util::string_view view,
+                                           std::string& stripped_string,
+                                           bool* nul_was_stripped) {
+    const char* old_string = view.data();
+
+    size_t stripped_len = 0, nul_count = 0;
+
+    for (size_t i = 0; i < view.size(); i++) {
+      if (old_string[i] == '\0') {
+        ++nul_count;
 
-  AltrepArrayPrimitive<sexp_type> alt_(alt);
+        if (nul_count == 1) {
+          // first nul spotted: allocate stripped string storage
+          stripped_string = view.to_string();

Review comment:
       This just reallocates new string storage. Instead, you can call 
`stripped_string.assign(view.begin(), view.end())`.

##########
File path: r/src/altrep.cpp
##########
@@ -70,135 +69,125 @@ R_xlen_t Standard_Get_region<int>(SEXP data2, R_xlen_t i, 
R_xlen_t n, int* buf)
   return INTEGER_GET_REGION(data2, i, n, buf);
 }
 
-// altrep R vector shadowing an Array.
+void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
+using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
+
+// base class for all altrep vectors
+//
+// The altrep vector stores the Array as an external pointer in data1
+// Implementation classes AltrepVectorPrimitive<> and AltrepVectorString
+// also use data2
+struct AltrepVectorBase {
+  // store the Array as an external pointer in data1, mark as immutable
+  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& 
array) {
+    SEXP alt_ =
+        R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)), 
R_NilValue);
+    MARK_NOT_MUTABLE(alt_);
+
+    return alt_;
+  }
+
+  // the Array that is being wrapped by the altrep object
+  static const std::shared_ptr<Array>& array(SEXP alt_) {
+    return *Pointer(R_altrep_data1(alt_));
+  }
+
+  // Is the vector materialized, i.e. does the data2 slot contain a
+  // standard R vector with the same data as the array.
+  static bool IsMaterialized(SEXP alt_) { return 
!Rf_isNull(R_altrep_data2(alt_)); }
+
+  static R_xlen_t Length(SEXP alt_) { return array(alt_)->length(); }
+
+  static int No_NA(SEXP alt_) { return array(alt_)->null_count() == 0; }
+
+  static int Is_sorted(SEXP alt_) { return UNKNOWN_SORTEDNESS; }
+
+  // What gets printed on .Internal(inspect(<the altrep object>))
+  static Rboolean Inspect(SEXP alt_, int pre, int deep, int pvec,
+                          void (*inspect_subtree)(SEXP, int, int, int)) {
+    const auto& array_ = array(alt_);

Review comment:
       Same here: `array` (no trailing underscore).

##########
File path: r/src/altrep.cpp
##########
@@ -218,29 +207,23 @@ struct AltrepArrayPrimitive {
     // Simply stop() when `writeable = TRUE` is too strong, e.g. this fails
     // identical() which calls DATAPTR() even though DATAPTR_RO() would
     // be enough
-    Materialize();
-    return DATAPTR(R_altrep_data2(alt_));
+    return DATAPTR(Materialize(alt_));
   }
 
-  // Does the Array have no nulls ?
-  int No_NA() const { return array()->null_count() != 0; }
-
-  int Is_sorted() const { return UNKNOWN_SORTEDNESS; }
-
   // The value at position i
-  c_type Elt(R_xlen_t i) {
-    const auto& array_ = array();
+  static c_type Elt(SEXP alt_, R_xlen_t i) {
+    const auto& array_ = array(alt_);
     return array_->IsNull(i) ? cpp11::na<c_type>()
                              : array_->data()->template 
GetValues<c_type>(1)[i];

Review comment:
       Do we want to protect against the case where the non-null C value is 
equal to the R NA singleton?

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Min(SEXP alt_, Rboolean narm) { return MinMax<true>(alt_, narm); 
}
+
+  static SEXP Max(SEXP alt_, Rboolean narm) { return MinMax<false>(alt_, 
narm); }
+
+  static SEXP Sum(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto null_count = array_->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = NaRmOptions(array_, na_rm);

Review comment:
       Same remark: you don't need both the options struct and the above 
special-case for `null_count > 0`.

##########
File path: r/src/altrep.cpp
##########
@@ -496,24 +611,76 @@ void InitAltIntegerClass(DllInfo* dll, const char* name) {
   InitAltIntegerMethods<AltrepClass>(AltrepClass::class_t, dll);
 }
 
+template <typename AltrepClass>
+void InitAltStringClass(DllInfo* dll, const char* name) {
+  AltrepClass::class_t = R_make_altstring_class(name, "arrow", dll);
+  R_set_altrep_Length_method(AltrepClass::class_t, AltrepClass::Length);
+  R_set_altrep_Inspect_method(AltrepClass::class_t, AltrepClass::Inspect);
+  R_set_altrep_Duplicate_method(AltrepClass::class_t, AltrepClass::Duplicate);
+  R_set_altrep_Serialized_state_method(AltrepClass::class_t,
+                                       AltrepClass::Serialized_state);
+  R_set_altrep_Unserialize_method(AltrepClass::class_t, 
AltrepClass::Unserialize);
+  R_set_altrep_Coerce_method(AltrepClass::class_t, AltrepClass::Coerce);
+
+  R_set_altvec_Dataptr_method(AltrepClass::class_t, AltrepClass::Dataptr);
+  R_set_altvec_Dataptr_or_null_method(AltrepClass::class_t, 
AltrepClass::Dataptr_or_null);
+
+  R_set_altstring_Elt_method(AltrepClass::class_t, AltrepClass::Elt);
+  R_set_altstring_Set_elt_method(AltrepClass::class_t, AltrepClass::Set_elt);
+  R_set_altstring_No_NA_method(AltrepClass::class_t, AltrepClass::No_NA);
+  R_set_altstring_Is_sorted_method(AltrepClass::class_t, 
AltrepClass::Is_sorted);
+}
+
 // initialize the altrep classes
 void Init_Altrep_classes(DllInfo* dll) {
-  InitAltRealClass<AltrepArrayPrimitive<REALSXP>>(dll, "array_dbl_vector");
-  InitAltIntegerClass<AltrepArrayPrimitive<INTSXP>>(dll, "array_int_vector");
+  InitAltRealClass<AltrepVectorPrimitive<REALSXP>>(dll, 
"arrow::array_dbl_vector");
+  InitAltIntegerClass<AltrepVectorPrimitive<INTSXP>>(dll, 
"arrow::array_int_vector");
+
+  InitAltStringClass<AltrepVectorString<StringType>>(dll, 
"arrow::array_string_vector");
+  InitAltStringClass<AltrepVectorString<LargeStringType>>(
+      dll, "arrow::array_large_string_vector");
 }
 
+}  // namespace altrep
+}  // namespace r
+}  // namespace arrow
+
+#endif  // HAS_ALTREP

Review comment:
       Is there a situation where HAS_ALTREP is false?

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Min(SEXP alt_, Rboolean narm) { return MinMax<true>(alt_, narm); 
}
+
+  static SEXP Max(SEXP alt_, Rboolean narm) { return MinMax<false>(alt_, 
narm); }
+
+  static SEXP Sum(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto null_count = array_->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array_}, 
options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 
scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const 
Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
   }
 };
 template <int sexp_type>
-R_altrep_class_t AltrepArrayPrimitive<sexp_type>::class_t;
+R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
 
-// The methods below are how R interacts with the altrep objects.
-//
-// They all use the same pattern: create a C++ object of the
-// class parameter, and then call the method.
-template <typename AltrepClass>
-R_xlen_t Length(SEXP alt) {
-  return AltrepClass(alt).Length();
-}
+// Implementation for string arrays
+template <typename Type>
+struct AltrepVectorString : public AltrepVectorBase {
+  static R_altrep_class_t class_t;
+  using StringArrayType = typename TypeTraits<Type>::ArrayType;
 
-template <typename AltrepClass>
-Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
-                 void (*inspect_subtree)(SEXP, int, int, int)) {
-  return AltrepClass(alt).Inspect(pre, deep, pvec, inspect_subtree);
-}
+  static SEXP Make(const std::shared_ptr<Array>& array) {
+    return AltrepVectorBase::Make(class_t, array);
+  }
 
-template <typename AltrepClass>
-const void* Dataptr_or_null(SEXP alt) {
-  return AltrepClass(alt).Dataptr_or_null();
-}
+  // Get a single string, as a CHARSXP SEXP
+  // data2 is initialized, the CHARSXP is generated from the Array data
+  // and stored in data2, so that this only needs to expand a given string once
+  static SEXP Elt(SEXP alt_, R_xlen_t i) {
+    if (IsMaterialized(alt_)) {
+      return STRING_ELT(R_altrep_data2(alt_), i);
+    }
 
-template <typename AltrepClass>
-void* Dataptr(SEXP alt, Rboolean writeable) {
-  return AltrepClass(alt).Dataptr(writeable);
-}
+    // nul -> to NA_STRING
+    if (array(alt_)->IsNull(i)) {
+      return NA_STRING;
+    }
 
-template <typename AltrepClass>
-SEXP Duplicate(SEXP alt, Rboolean deep) {
-  return AltrepClass(alt).Duplicate(deep);
-}
+    // not nul, but we need care about embedded nuls
+    // this needs to call an R api function: Rf_mkCharLenCE() that
+    // might jump, i.e. throw an R error, which is dealt with using
+    // BEGIN_CPP11/END_CPP11/cpp11::unwind_protect()
+
+    BEGIN_CPP11
+
+    // C++ objects that will properly be destroyed by END_CPP11
+    // before it resumes the unwinding - and perhaps let
+    // the R error pass through
+    auto array_ = array(alt_);
+    auto view = 
internal::checked_cast<StringArrayType*>(array_.get())->GetView(i);
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    std::string stripped_string;
+
+    // both cases might jump, although it's less likely when
+    // nuls are stripped, but still we need the unwind protection
+    // so that C++ objects here are correctly destructed, whilst errors
+    // properly pass through to the R side
+    SEXP s;
+    cpp11::unwind_protect([&]() {
+      if (strip_out_nuls) {
+        s = r_string_from_view_strip_nul(view, stripped_string, 
&nul_was_stripped);
+      } else {
+        s = r_string_from_view_keep_nul(view, stripped_string);
+      }
 
-template <typename AltrepClass>
-auto Elt(SEXP alt, R_xlen_t i) -> decltype(AltrepClass(alt).Elt(i)) {
-  return AltrepClass(alt).Elt(i);
-}
+      if (nul_was_stripped) {
+        cpp11::warning("Stripping '\\0' (nul) from character vector");
+      }
+    });
+    return s;
 
-template <typename AltrepClass>
-int No_NA(SEXP alt) {
-  return AltrepClass(alt).No_NA();
-}
+    END_CPP11
+  }
 
-template <typename AltrepClass>
-int Is_sorted(SEXP alt) {
-  return AltrepClass(alt).Is_sorted();
-}
+  static void* Dataptr(SEXP alt_, Rboolean writeable) {
+    return DATAPTR(Materialize(alt_));
+  }
 
-template <typename AltrepClass>
-R_xlen_t Get_region(SEXP alt, R_xlen_t i, R_xlen_t n, typename 
AltrepClass::c_type* buf) {
-  return AltrepClass(alt).Get_region(i, n, buf);
-}
+  static SEXP Materialize(SEXP alt_) {
+    if (IsMaterialized(alt_)) {
+      return R_altrep_data2(alt_);
+    }
 
-template <typename AltrepClass>
-SEXP Serialized_state(SEXP alt) {
-  return AltrepClass(alt).Serialized_state();
-}
+    BEGIN_CPP11
 
-template <typename AltrepClass>
-SEXP Unserialize(SEXP class_, SEXP state) {
-  return AltrepClass::Unserialize(class_, state);
-}
+    auto array_ = array(alt_);
+    R_xlen_t n = array_->length();
+    SEXP data2_ = PROTECT(Rf_allocVector(STRSXP, n));
+    MARK_NOT_MUTABLE(data2_);
 
-template <typename AltrepClass>
-SEXP Coerce(SEXP alt, int type) {
-  return AltrepClass(alt).Coerce(type);
-}
+    std::string stripped_string;
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    auto* string_array = 
internal::checked_cast<StringArrayType*>(array_.get());
+    util::string_view view;
 
-static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
-    const std::shared_ptr<Array>& array, bool na_rm) {
-  auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
-      arrow::compute::ScalarAggregateOptions::Defaults());
-  options->min_count = 0;
-  options->skip_nulls = na_rm;
-  return options;
-}
+    cpp11::unwind_protect([&]() {
+      for (R_xlen_t i = 0; i < n; i++) {
+        SEXP s = STRING_ELT(data2_, i);
+
+        // nul, so materialize to NA_STRING
+        if (array_->IsNull(i)) {
+          SET_STRING_ELT(data2_, i, NA_STRING);
+          continue;
+        }
+
+        // materialize a real string, with care about potential jump
+        // from Rf_mkCharLenCE()
+        view = string_array->GetView(i);
+        if (strip_out_nuls) {
+          s = r_string_from_view_strip_nul(view, stripped_string, 
&nul_was_stripped);
+        } else {
+          s = r_string_from_view_keep_nul(view, stripped_string);
+        }
+        SET_STRING_ELT(data2_, i, s);
+      }
+
+      if (nul_was_stripped) {
+        cpp11::warning("Stripping '\\0' (nul) from character vector");
+      }
+    });
 
-template <int sexp_type, bool Min>
-SEXP MinMax(SEXP alt, Rboolean narm) {
-  using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
-  using scalar_type =
-      typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+    // only set to data2 if all the values have been converted
+    R_set_altrep_data2(alt_, data2_);
+    UNPROTECT(1);
 
-  AltrepArrayPrimitive<sexp_type> alt_(alt);
+    return data2_;
 
-  const auto& array = alt_.array();
-  bool na_rm = narm == TRUE;
-  auto n = array->length();
-  auto null_count = array->null_count();
-  if ((na_rm || n == 0) && null_count == n) {
-    return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    END_CPP11
   }
-  if (!na_rm && null_count > 0) {
-    return cpp11::as_sexp(cpp11::na<data_type>());
+
+  static const void* Dataptr_or_null(SEXP alt_) {
+    // only valid if all strings have been materialized
+    // i.e. it is not enough for data2 to be not NULL
+    if (IsMaterialized(alt_)) return DATAPTR(R_altrep_data2(alt_));
+
+    // otherwise give up
+    return NULL;
   }
 
-  auto options = NaRmOptions(array, na_rm);
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
 
-  const auto& minmax =
-      ValueOrStop(arrow::compute::CallFunction("min_max", {array}, 
options.get()));
-  const auto& minmax_scalar =
-      internal::checked_cast<const StructScalar&>(*minmax.scalar());
+  static SEXP Serialized_state(SEXP alt_) { return Materialize(alt_); }
 
-  const auto& result_scalar = internal::checked_cast<const scalar_type&>(
-      *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
-  return cpp11::as_sexp(result_scalar.value);
-}
+  static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-template <int sexp_type>
-SEXP Min(SEXP alt, Rboolean narm) {
-  return MinMax<sexp_type, true>(alt, narm);
-}
+  static SEXP Duplicate(SEXP alt_, Rboolean /* deep */) {
+    return Rf_lazy_duplicate(Materialize(alt_));
+  }
 
-template <int sexp_type>
-SEXP Max(SEXP alt, Rboolean narm) {
-  return MinMax<sexp_type, false>(alt, narm);
-}
+  // static method so that this can error without concerns of
+  // destruction for the
+  static void Set_elt(SEXP alt_, R_xlen_t i, SEXP v) {
+    Rf_error("ALTSTRING objects of type <arrow::array_string_vector> are 
immutable");
+  }
 
-template <int sexp_type>
-static SEXP Sum(SEXP alt, Rboolean narm) {
-  using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+  // this is called from an unwind_protect() block because
+  // r_string_from_view might jump
+  static SEXP r_string_from_view_strip_nul(arrow::util::string_view view,
+                                           std::string& stripped_string,
+                                           bool* nul_was_stripped) {
+    const char* old_string = view.data();
+
+    size_t stripped_len = 0, nul_count = 0;
+
+    for (size_t i = 0; i < view.size(); i++) {

Review comment:
       I think you can reuse the `std::find` approach of 
`r_string_from_view_keep_nul`, so that the common case of no nuls can be made 
faster.

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Min(SEXP alt_, Rboolean narm) { return MinMax<true>(alt_, narm); 
}
+
+  static SEXP Max(SEXP alt_, Rboolean narm) { return MinMax<false>(alt_, 
narm); }
+
+  static SEXP Sum(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto null_count = array_->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array_}, 
options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 
scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const 
Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
   }
 };
 template <int sexp_type>
-R_altrep_class_t AltrepArrayPrimitive<sexp_type>::class_t;
+R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
 
-// The methods below are how R interacts with the altrep objects.
-//
-// They all use the same pattern: create a C++ object of the
-// class parameter, and then call the method.
-template <typename AltrepClass>
-R_xlen_t Length(SEXP alt) {
-  return AltrepClass(alt).Length();
-}
+// Implementation for string arrays
+template <typename Type>
+struct AltrepVectorString : public AltrepVectorBase {
+  static R_altrep_class_t class_t;
+  using StringArrayType = typename TypeTraits<Type>::ArrayType;
 
-template <typename AltrepClass>
-Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
-                 void (*inspect_subtree)(SEXP, int, int, int)) {
-  return AltrepClass(alt).Inspect(pre, deep, pvec, inspect_subtree);
-}
+  static SEXP Make(const std::shared_ptr<Array>& array) {
+    return AltrepVectorBase::Make(class_t, array);
+  }
 
-template <typename AltrepClass>
-const void* Dataptr_or_null(SEXP alt) {
-  return AltrepClass(alt).Dataptr_or_null();
-}
+  // Get a single string, as a CHARSXP SEXP
+  // data2 is initialized, the CHARSXP is generated from the Array data
+  // and stored in data2, so that this only needs to expand a given string once
+  static SEXP Elt(SEXP alt_, R_xlen_t i) {
+    if (IsMaterialized(alt_)) {
+      return STRING_ELT(R_altrep_data2(alt_), i);
+    }
 
-template <typename AltrepClass>
-void* Dataptr(SEXP alt, Rboolean writeable) {
-  return AltrepClass(alt).Dataptr(writeable);
-}
+    // nul -> to NA_STRING
+    if (array(alt_)->IsNull(i)) {
+      return NA_STRING;
+    }
 
-template <typename AltrepClass>
-SEXP Duplicate(SEXP alt, Rboolean deep) {
-  return AltrepClass(alt).Duplicate(deep);
-}
+    // not nul, but we need care about embedded nuls
+    // this needs to call an R api function: Rf_mkCharLenCE() that
+    // might jump, i.e. throw an R error, which is dealt with using
+    // BEGIN_CPP11/END_CPP11/cpp11::unwind_protect()
+
+    BEGIN_CPP11
+
+    // C++ objects that will properly be destroyed by END_CPP11
+    // before it resumes the unwinding - and perhaps let
+    // the R error pass through
+    auto array_ = array(alt_);
+    auto view = 
internal::checked_cast<StringArrayType*>(array_.get())->GetView(i);
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    std::string stripped_string;
+
+    // both cases might jump, although it's less likely when
+    // nuls are stripped, but still we need the unwind protection
+    // so that C++ objects here are correctly destructed, whilst errors
+    // properly pass through to the R side
+    SEXP s;
+    cpp11::unwind_protect([&]() {
+      if (strip_out_nuls) {
+        s = r_string_from_view_strip_nul(view, stripped_string, 
&nul_was_stripped);
+      } else {
+        s = r_string_from_view_keep_nul(view, stripped_string);
+      }
 
-template <typename AltrepClass>
-auto Elt(SEXP alt, R_xlen_t i) -> decltype(AltrepClass(alt).Elt(i)) {
-  return AltrepClass(alt).Elt(i);
-}
+      if (nul_was_stripped) {
+        cpp11::warning("Stripping '\\0' (nul) from character vector");
+      }
+    });
+    return s;
 
-template <typename AltrepClass>
-int No_NA(SEXP alt) {
-  return AltrepClass(alt).No_NA();
-}
+    END_CPP11
+  }
 
-template <typename AltrepClass>
-int Is_sorted(SEXP alt) {
-  return AltrepClass(alt).Is_sorted();
-}
+  static void* Dataptr(SEXP alt_, Rboolean writeable) {
+    return DATAPTR(Materialize(alt_));
+  }
 
-template <typename AltrepClass>
-R_xlen_t Get_region(SEXP alt, R_xlen_t i, R_xlen_t n, typename 
AltrepClass::c_type* buf) {
-  return AltrepClass(alt).Get_region(i, n, buf);
-}
+  static SEXP Materialize(SEXP alt_) {
+    if (IsMaterialized(alt_)) {
+      return R_altrep_data2(alt_);
+    }
 
-template <typename AltrepClass>
-SEXP Serialized_state(SEXP alt) {
-  return AltrepClass(alt).Serialized_state();
-}
+    BEGIN_CPP11
 
-template <typename AltrepClass>
-SEXP Unserialize(SEXP class_, SEXP state) {
-  return AltrepClass::Unserialize(class_, state);
-}
+    auto array_ = array(alt_);
+    R_xlen_t n = array_->length();
+    SEXP data2_ = PROTECT(Rf_allocVector(STRSXP, n));
+    MARK_NOT_MUTABLE(data2_);
 
-template <typename AltrepClass>
-SEXP Coerce(SEXP alt, int type) {
-  return AltrepClass(alt).Coerce(type);
-}
+    std::string stripped_string;
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    auto* string_array = 
internal::checked_cast<StringArrayType*>(array_.get());
+    util::string_view view;
 
-static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
-    const std::shared_ptr<Array>& array, bool na_rm) {
-  auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
-      arrow::compute::ScalarAggregateOptions::Defaults());
-  options->min_count = 0;
-  options->skip_nulls = na_rm;
-  return options;
-}
+    cpp11::unwind_protect([&]() {
+      for (R_xlen_t i = 0; i < n; i++) {
+        SEXP s = STRING_ELT(data2_, i);
+
+        // nul, so materialize to NA_STRING
+        if (array_->IsNull(i)) {
+          SET_STRING_ELT(data2_, i, NA_STRING);
+          continue;
+        }
+
+        // materialize a real string, with care about potential jump
+        // from Rf_mkCharLenCE()
+        view = string_array->GetView(i);
+        if (strip_out_nuls) {
+          s = r_string_from_view_strip_nul(view, stripped_string, 
&nul_was_stripped);
+        } else {
+          s = r_string_from_view_keep_nul(view, stripped_string);
+        }
+        SET_STRING_ELT(data2_, i, s);
+      }
+
+      if (nul_was_stripped) {
+        cpp11::warning("Stripping '\\0' (nul) from character vector");
+      }
+    });
 
-template <int sexp_type, bool Min>
-SEXP MinMax(SEXP alt, Rboolean narm) {
-  using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
-  using scalar_type =
-      typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+    // only set to data2 if all the values have been converted
+    R_set_altrep_data2(alt_, data2_);
+    UNPROTECT(1);
 
-  AltrepArrayPrimitive<sexp_type> alt_(alt);
+    return data2_;
 
-  const auto& array = alt_.array();
-  bool na_rm = narm == TRUE;
-  auto n = array->length();
-  auto null_count = array->null_count();
-  if ((na_rm || n == 0) && null_count == n) {
-    return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    END_CPP11
   }
-  if (!na_rm && null_count > 0) {
-    return cpp11::as_sexp(cpp11::na<data_type>());
+
+  static const void* Dataptr_or_null(SEXP alt_) {
+    // only valid if all strings have been materialized
+    // i.e. it is not enough for data2 to be not NULL
+    if (IsMaterialized(alt_)) return DATAPTR(R_altrep_data2(alt_));
+
+    // otherwise give up
+    return NULL;
   }
 
-  auto options = NaRmOptions(array, na_rm);
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
 
-  const auto& minmax =
-      ValueOrStop(arrow::compute::CallFunction("min_max", {array}, 
options.get()));
-  const auto& minmax_scalar =
-      internal::checked_cast<const StructScalar&>(*minmax.scalar());
+  static SEXP Serialized_state(SEXP alt_) { return Materialize(alt_); }
 
-  const auto& result_scalar = internal::checked_cast<const scalar_type&>(
-      *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
-  return cpp11::as_sexp(result_scalar.value);
-}
+  static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-template <int sexp_type>
-SEXP Min(SEXP alt, Rboolean narm) {
-  return MinMax<sexp_type, true>(alt, narm);
-}
+  static SEXP Duplicate(SEXP alt_, Rboolean /* deep */) {
+    return Rf_lazy_duplicate(Materialize(alt_));
+  }
 
-template <int sexp_type>
-SEXP Max(SEXP alt, Rboolean narm) {
-  return MinMax<sexp_type, false>(alt, narm);
-}
+  // static method so that this can error without concerns of
+  // destruction for the
+  static void Set_elt(SEXP alt_, R_xlen_t i, SEXP v) {
+    Rf_error("ALTSTRING objects of type <arrow::array_string_vector> are 
immutable");
+  }
 
-template <int sexp_type>
-static SEXP Sum(SEXP alt, Rboolean narm) {
-  using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+  // this is called from an unwind_protect() block because
+  // r_string_from_view might jump
+  static SEXP r_string_from_view_strip_nul(arrow::util::string_view view,
+                                           std::string& stripped_string,
+                                           bool* nul_was_stripped) {

Review comment:
       Instead of passing mutable state, can you create a struct helper? For 
example:
   ```c++
   struct RStringViewer {
     std::string stripped_string;
   
     SEXP ViewStripNul(string_view view, bool* nul_was_stripped) { ... }
   
     SEXP ViewKeepNul(string_view view) { ... }
   };
   ```
   

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return 
R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Min(SEXP alt_, Rboolean narm) { return MinMax<true>(alt_, narm); 
}
+
+  static SEXP Max(SEXP alt_, Rboolean narm) { return MinMax<false>(alt_, 
narm); }
+
+  static SEXP Sum(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, 
int>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto null_count = array_->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array_}, 
options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 
scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const 
Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
   }
 };
 template <int sexp_type>
-R_altrep_class_t AltrepArrayPrimitive<sexp_type>::class_t;
+R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
 
-// The methods below are how R interacts with the altrep objects.
-//
-// They all use the same pattern: create a C++ object of the
-// class parameter, and then call the method.
-template <typename AltrepClass>
-R_xlen_t Length(SEXP alt) {
-  return AltrepClass(alt).Length();
-}
+// Implementation for string arrays
+template <typename Type>
+struct AltrepVectorString : public AltrepVectorBase {
+  static R_altrep_class_t class_t;
+  using StringArrayType = typename TypeTraits<Type>::ArrayType;
 
-template <typename AltrepClass>
-Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
-                 void (*inspect_subtree)(SEXP, int, int, int)) {
-  return AltrepClass(alt).Inspect(pre, deep, pvec, inspect_subtree);
-}
+  static SEXP Make(const std::shared_ptr<Array>& array) {
+    return AltrepVectorBase::Make(class_t, array);
+  }
 
-template <typename AltrepClass>
-const void* Dataptr_or_null(SEXP alt) {
-  return AltrepClass(alt).Dataptr_or_null();
-}
+  // Get a single string, as a CHARSXP SEXP
+  // data2 is initialized, the CHARSXP is generated from the Array data
+  // and stored in data2, so that this only needs to expand a given string once
+  static SEXP Elt(SEXP alt_, R_xlen_t i) {
+    if (IsMaterialized(alt_)) {
+      return STRING_ELT(R_altrep_data2(alt_), i);
+    }
 
-template <typename AltrepClass>
-void* Dataptr(SEXP alt, Rboolean writeable) {
-  return AltrepClass(alt).Dataptr(writeable);
-}
+    // nul -> to NA_STRING
+    if (array(alt_)->IsNull(i)) {
+      return NA_STRING;
+    }
 
-template <typename AltrepClass>
-SEXP Duplicate(SEXP alt, Rboolean deep) {
-  return AltrepClass(alt).Duplicate(deep);
-}
+    // not nul, but we need care about embedded nuls
+    // this needs to call an R api function: Rf_mkCharLenCE() that
+    // might jump, i.e. throw an R error, which is dealt with using
+    // BEGIN_CPP11/END_CPP11/cpp11::unwind_protect()
+
+    BEGIN_CPP11
+
+    // C++ objects that will properly be destroyed by END_CPP11
+    // before it resumes the unwinding - and perhaps let
+    // the R error pass through
+    auto array_ = array(alt_);
+    auto view = 
internal::checked_cast<StringArrayType*>(array_.get())->GetView(i);
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    std::string stripped_string;
+
+    // both cases might jump, although it's less likely when
+    // nuls are stripped, but still we need the unwind protection
+    // so that C++ objects here are correctly destructed, whilst errors
+    // properly pass through to the R side

Review comment:
       Hmmm... isn't the explicit `cpp11::unwind_protect()` call redundant with 
`BEGIN_CPP11` and `END_CPP11`? Or am I misunderstanding what these macros do?

##########
File path: r/src/altrep.cpp
##########
@@ -496,24 +611,76 @@ void InitAltIntegerClass(DllInfo* dll, const char* name) {
   InitAltIntegerMethods<AltrepClass>(AltrepClass::class_t, dll);
 }
 
+template <typename AltrepClass>
+void InitAltStringClass(DllInfo* dll, const char* name) {
+  AltrepClass::class_t = R_make_altstring_class(name, "arrow", dll);
+  R_set_altrep_Length_method(AltrepClass::class_t, AltrepClass::Length);
+  R_set_altrep_Inspect_method(AltrepClass::class_t, AltrepClass::Inspect);
+  R_set_altrep_Duplicate_method(AltrepClass::class_t, AltrepClass::Duplicate);
+  R_set_altrep_Serialized_state_method(AltrepClass::class_t,
+                                       AltrepClass::Serialized_state);
+  R_set_altrep_Unserialize_method(AltrepClass::class_t, AltrepClass::Unserialize);
+  R_set_altrep_Coerce_method(AltrepClass::class_t, AltrepClass::Coerce);
+
+  R_set_altvec_Dataptr_method(AltrepClass::class_t, AltrepClass::Dataptr);
+  R_set_altvec_Dataptr_or_null_method(AltrepClass::class_t, AltrepClass::Dataptr_or_null);
+
+  R_set_altstring_Elt_method(AltrepClass::class_t, AltrepClass::Elt);
+  R_set_altstring_Set_elt_method(AltrepClass::class_t, AltrepClass::Set_elt);
+  R_set_altstring_No_NA_method(AltrepClass::class_t, AltrepClass::No_NA);
+  R_set_altstring_Is_sorted_method(AltrepClass::class_t, AltrepClass::Is_sorted);
+}
+
 // initialize the altrep classes
 void Init_Altrep_classes(DllInfo* dll) {
-  InitAltRealClass<AltrepArrayPrimitive<REALSXP>>(dll, "array_dbl_vector");
-  InitAltIntegerClass<AltrepArrayPrimitive<INTSXP>>(dll, "array_int_vector");
+  InitAltRealClass<AltrepVectorPrimitive<REALSXP>>(dll, "arrow::array_dbl_vector");
+  InitAltIntegerClass<AltrepVectorPrimitive<INTSXP>>(dll, "arrow::array_int_vector");
+
+  InitAltStringClass<AltrepVectorString<StringType>>(dll, "arrow::array_string_vector");
+  InitAltStringClass<AltrepVectorString<LargeStringType>>(
+      dll, "arrow::array_large_string_vector");
 }
 
+}  // namespace altrep
+}  // namespace r
+}  // namespace arrow
+
+#endif  // HAS_ALTREP

Review comment:
       Ouch, I see.

##########
File path: r/src/altrep.cpp
##########
@@ -274,210 +257,342 @@ struct AltrepArrayPrimitive {
 
   // This cannot keep the external pointer to an Arrow object through
   // R serialization, so return the materialized
-  SEXP Serialized_state() {
-    Materialize();
-    return R_altrep_data2(alt_);
-  }
+  static SEXP Serialized_state(SEXP alt_) { return R_altrep_data2(Materialize(alt_)); }
 
   static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-  SEXP Coerce(int type) {
-    // Just let R handle it for now
-    return NULL;
+  static SEXP Coerce(SEXP alt_, int type) {
+    return Rf_coerceVector(Materialize(alt_), type);
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
+
+  template <bool Min>
+  static SEXP MinMax(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+    using scalar_type =
+        typename std::conditional<sexp_type == INTSXP, Int32Scalar, DoubleScalar>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto n = array_->length();
+    auto null_count = array_->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array_}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Min(SEXP alt_, Rboolean narm) { return MinMax<true>(alt_, narm); }
+
+  static SEXP Max(SEXP alt_, Rboolean narm) { return MinMax<false>(alt_, narm); }
+
+  static SEXP Sum(SEXP alt_, Rboolean narm) {
+    using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+
+    const auto& array_ = array(alt_);
+    bool na_rm = narm == TRUE;
+    auto null_count = array_->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = NaRmOptions(array_, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array_}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
   }
 };
 template <int sexp_type>
-R_altrep_class_t AltrepArrayPrimitive<sexp_type>::class_t;
+R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
 
-// The methods below are how R interacts with the altrep objects.
-//
-// They all use the same pattern: create a C++ object of the
-// class parameter, and then call the method.
-template <typename AltrepClass>
-R_xlen_t Length(SEXP alt) {
-  return AltrepClass(alt).Length();
-}
+// Implementation for string arrays
+template <typename Type>
+struct AltrepVectorString : public AltrepVectorBase {
+  static R_altrep_class_t class_t;
+  using StringArrayType = typename TypeTraits<Type>::ArrayType;
 
-template <typename AltrepClass>
-Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
-                 void (*inspect_subtree)(SEXP, int, int, int)) {
-  return AltrepClass(alt).Inspect(pre, deep, pvec, inspect_subtree);
-}
+  static SEXP Make(const std::shared_ptr<Array>& array) {
+    return AltrepVectorBase::Make(class_t, array);
+  }
 
-template <typename AltrepClass>
-const void* Dataptr_or_null(SEXP alt) {
-  return AltrepClass(alt).Dataptr_or_null();
-}
+  // Get a single string, as a CHARSXP SEXP
+  // data2 is initialized, the CHARSXP is generated from the Array data
+  // and stored in data2, so that this only needs to expand a given string once
+  static SEXP Elt(SEXP alt_, R_xlen_t i) {
+    if (IsMaterialized(alt_)) {
+      return STRING_ELT(R_altrep_data2(alt_), i);
+    }
 
-template <typename AltrepClass>
-void* Dataptr(SEXP alt, Rboolean writeable) {
-  return AltrepClass(alt).Dataptr(writeable);
-}
+    // nul -> to NA_STRING
+    if (array(alt_)->IsNull(i)) {
+      return NA_STRING;
+    }
 
-template <typename AltrepClass>
-SEXP Duplicate(SEXP alt, Rboolean deep) {
-  return AltrepClass(alt).Duplicate(deep);
-}
+    // not nul, but we need care about embedded nuls
+    // this needs to call an R api function: Rf_mkCharLenCE() that
+    // might jump, i.e. throw an R error, which is dealt with using
+    // BEGIN_CPP11/END_CPP11/cpp11::unwind_protect()
+
+    BEGIN_CPP11
+
+    // C++ objects that will properly be destroyed by END_CPP11
+    // before it resumes the unwinding - and perhaps let
+    // the R error pass through
+    auto array_ = array(alt_);
+    auto view = internal::checked_cast<StringArrayType*>(array_.get())->GetView(i);
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+    bool nul_was_stripped = false;
+    std::string stripped_string;
+
+    // both cases might jump, although it's less likely when
+    // nuls are stripped, but still we need the unwind protection
+    // so that C++ objects here are correctly destructed, whilst errors
+    // properly pass through to the R side

Review comment:
       Hmm, thank you.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to