romainfrancois commented on a change in pull request #11369:
URL: https://github.com/apache/arrow/pull/11369#discussion_r727083194



##########
File path: r/src/altrep.cpp
##########
@@ -237,29 +242,35 @@ struct AltrepVectorPrimitive : public 
AltrepVectorBase<AltrepVectorPrimitive<sex
     // array has nulls
     //
     // This only materialize the region, into buf. Not the entire vector.
-    auto slice = GetArray(alt)->Slice(i, n);
+    auto slice = GetChunkedArray(alt)->Slice(i, n);
     R_xlen_t ncopy = slice->length();
 
-    // first copy the data buffer
-    memcpy(buf, slice->data()->template GetValues<c_type>(1), ncopy * 
sizeof(c_type));
+    c_type* out = buf;

Review comment:
       Same as before but we have to loop across the chunks of the slice. 
   
   This loop could be done in parallel, if we believe this is worth it. 

##########
File path: r/src/altrep.cpp
##########
@@ -69,11 +69,12 @@ R_xlen_t Standard_Get_region<int>(SEXP data2, R_xlen_t i, 
R_xlen_t n, int* buf)
   return INTEGER_GET_REGION(data2, i, n, buf);
 }
 
-void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
-using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
+void DeleteChunkedArray(std::shared_ptr<ChunkedArray>* ptr) { delete ptr; }
+using Pointer =
+    cpp11::external_pointer<std::shared_ptr<ChunkedArray>, DeleteChunkedArray>;
 
-// the Array that is being wrapped by the altrep object
-static const std::shared_ptr<Array>& GetArray(SEXP alt) {
+// the ChunkedArray that is being wrapped by the altrep object
+const std::shared_ptr<ChunkedArray>& GetChunkedArray(SEXP alt) {

Review comment:
       Move it out of the class so that it can be used in 
`vec_to_arrow_altrep_bypass()`, and now returns a `ChunkedArray`, which is what 
the altrep classes host now.   

##########
File path: r/src/altrep.cpp
##########
@@ -273,21 +284,21 @@ struct AltrepVectorPrimitive : public 
AltrepVectorBase<AltrepVectorPrimitive<sex
     using scalar_type =
         typename std::conditional<sexp_type == INTSXP, Int32Scalar, 
DoubleScalar>::type;
 
-    const auto& array = GetArray(alt);
+    const auto& chunked_array = GetChunkedArray(alt);
     bool na_rm = narm == TRUE;
-    auto n = array->length();
-    auto null_count = array->null_count();
+    auto n = chunked_array->length();
+    auto null_count = chunked_array->null_count();
     if ((na_rm || n == 0) && null_count == n) {
       return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
     }
     if (!na_rm && null_count > 0) {
       return cpp11::as_sexp(cpp11::na<data_type>());
     }
 
-    auto options = NaRmOptions(array, na_rm);
+    auto options = NaRmOptions(na_rm);
 
-    const auto& minmax =
-        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, 
options.get()));
+    const auto& minmax = ValueOrStop(
+        arrow::compute::CallFunction("min_max", {chunked_array}, 
options.get()));

Review comment:
       Needs more testing of calling these on chunk arrays. 

##########
File path: r/src/r_to_arrow.cpp
##########
@@ -1425,7 +1450,8 @@ SEXP vec_to_arrow(SEXP x, SEXP s_type) {
   } else {
     type = cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(s_type);
   }
-  return cpp11::to_r6(arrow::r::vec_to_arrow(x, type, type_inferred));
+
+  return cpp11::to_r6(arrow::r::vec_to_arrow_Array(x, type, type_inferred));

Review comment:
       concatenation might have to happen here, this is called from 
`Array$create()`

##########
File path: r/R/array.R
##########
@@ -186,7 +186,7 @@ Array$create <- function(x, type = NULL) {
     }
     return(out)
   }
-  vec_to_arrow(x, type)
+  vec_to_Array(x, type)

Review comment:
       changed internal name, so that it is clear that it returns an `Array`. 
Internally it might have to concatenate if `x` is an altrep vector (backed by 
ChunkedArray now) that hosts multiple chunks. 

##########
File path: r/src/altrep.cpp
##########
@@ -640,9 +657,9 @@ bool is_arrow_altrep(SEXP x) {
   return false;
 }
 
-std::shared_ptr<Array> vec_to_arrow_altrep_bypass(SEXP x) {
+std::shared_ptr<ChunkedArray> vec_to_arrow_altrep_bypass(SEXP x) {

Review comment:
       Now returning a ChunkedArray. 

##########
File path: r/src/altrep.cpp
##########
@@ -436,15 +450,16 @@ struct AltrepVectorString : public 
AltrepVectorBase<AltrepVectorString<Type>> {
 
     BEGIN_CPP11
 
-    const auto& array = GetArray(alt);
-    RStringViewer r_string_viewer(array);
+    auto array = GetChunkedArray(alt)->Slice(i, 1)->chunk(0);

Review comment:
       The array is now transient in `RStringViewer` so that it can process all 
the chunks. 

##########
File path: r/src/chunkedarray.cpp
##########
@@ -126,10 +126,17 @@ std::shared_ptr<arrow::ChunkedArray> 
ChunkedArray__from_list(cpp11::list chunks,
     // because we might have inferred the type from the first element of the 
list
     //
     // this only really matters for dictionary arrays
-    vec.push_back(arrow::r::vec_to_arrow(chunks[0], type, type_inferred));
+    auto chunked_array =

Review comment:
       each element of the R list might be an altrep vector with potentially 
more than 1 chunk. So using `vec_to_arrow_ChunkedArray()` so that the chunks 
are not concatenated, and adding each to the list. 

##########
File path: r/src/altrep.cpp
##########
@@ -237,29 +242,35 @@ struct AltrepVectorPrimitive : public 
AltrepVectorBase<AltrepVectorPrimitive<sex
     // array has nulls
     //
     // This only materialize the region, into buf. Not the entire vector.
-    auto slice = GetArray(alt)->Slice(i, n);
+    auto slice = GetChunkedArray(alt)->Slice(i, n);
     R_xlen_t ncopy = slice->length();
 
-    // first copy the data buffer
-    memcpy(buf, slice->data()->template GetValues<c_type>(1), ncopy * 
sizeof(c_type));
+    c_type* out = buf;
+    for (const auto& array : slice->chunks()) {
+      auto n = array->length();
+
+      // first copy the data buffer
+      memcpy(out, array->data()->template GetValues<c_type>(1), n * 
sizeof(c_type));
 
-    // then set the R NA sentinels if needed
-    if (slice->null_count() > 0) {
-      internal::BitmapReader bitmap_reader(slice->null_bitmap()->data(), 
slice->offset(),
-                                           ncopy);
+      // then set the R NA sentinels if needed
+      if (array->null_count() > 0) {
+        internal::BitmapReader bitmap_reader(array->null_bitmap()->data(),
+                                             array->offset(), ncopy);
 
-      for (R_xlen_t j = 0; j < ncopy; j++, bitmap_reader.Next()) {
-        if (bitmap_reader.IsNotSet()) {
-          buf[j] = cpp11::na<c_type>();
+        for (R_xlen_t j = 0; j < n; j++, bitmap_reader.Next()) {
+          if (bitmap_reader.IsNotSet()) {
+            out[j] = cpp11::na<c_type>();
+          }
         }
       }
+
+      out += n;
     }
 
     return ncopy;
   }
 
-  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
-      const std::shared_ptr<Array>& array, bool na_rm) {
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> 
NaRmOptions(bool na_rm) {

Review comment:
       This never used the Array, so removing it. 

##########
File path: r/src/r_to_arrow.cpp
##########
@@ -1191,7 +1206,17 @@ std::shared_ptr<arrow::Array> vec_to_arrow(SEXP x,
 
   StopIfNotOk(converter->Extend(x, options.size));
 
-  return ValueOrStop(converter->ToArray());
+  return ValueOrStop(converter->ToChunkedArray());
+}
+

Review comment:
       We need `vec_to_arrow_Array()` in addition to 
`vec_to_arrow_ChunkedArray()` because some code do need to make `Array`, i.e. 
`Array$create()` or `RecordBatch$create()`




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to