romainfrancois commented on a change in pull request #11738:
URL: https://github.com/apache/arrow/pull/11738#discussion_r753069590



##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sex
 template <int sexp_type>
 R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
 
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+  // singleton altrep class description
+  static R_altrep_class_t class_t;
+
+  using Base = AltrepVectorBase<AltrepFactor>;
+  using Base::IsMaterialized;
+
+  // redefining because data2 is a paired list with the representation as the
+  // first node: the CAR
+  static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+  static void SetRepresentation(SEXP alt, SEXP x) { SETCAR(R_altrep_data2(alt), x); }
+
+  // The CADR(data2) is used to store the transposed arrays when unification is needed
+  // In that case we store a vector of Buffers
+  using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+  static bool WasUnified(SEXP alt) { return !Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+  static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+    const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+    return arrays->operator[](i);
+  }
+
+  static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+    bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+    std::shared_ptr<Array> dictionary;
+    SEXP pointer_arrays_transpose;
+
+    if (need_unification) {
+      const auto& arr_type =
+          internal::checked_cast<const DictionaryType&>(*chunked_array->type());
+      std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+          ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+      size_t n_arrays = chunked_array->num_chunks();
+      Pointer<BufferVector> arrays_transpose(
+          new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+      for (size_t i = 0; i < n_arrays; i++) {
+        std::shared_ptr<Buffer>& transpose_i = arrays_transpose->get()->operator[](i);
+        const auto& dict_i =
+            *internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(i))
+                 .dictionary();
+        StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
+      }
+
+      std::shared_ptr<DataType> out_type;
+      StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
+
+      pointer_arrays_transpose = PROTECT(arrays_transpose);
+    } else {
+      // just use the first one
+      const auto& dict_array =
+          internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+      dictionary = dict_array.dictionary();
+
+      pointer_arrays_transpose = PROTECT(R_NilValue);
+    }
+
+    // only dealing with dictionaries of strings
+    if (dictionary->type_id() != arrow::Type::STRING) {
+      UNPROTECT(1);
+      return R_NilValue;
+    }
+
+    // the chunked array as data1
+    SEXP data1 =
+        PROTECT(Pointer<ChunkedArray>(new std::shared_ptr<ChunkedArray>(chunked_array)));
+
+    // a pairlist with the representation in the first node
+    SEXP data2 = PROTECT(Rf_list2(R_NilValue,  // representation, empty at first
+                                  pointer_arrays_transpose));
+
+    SEXP alt = PROTECT(R_new_altrep(class_t, data1, data2));
+    MARK_NOT_MUTABLE(alt);
+
+    // set factor attributes
+    Rf_setAttrib(alt, R_LevelsSymbol, Array__as_vector(dictionary));
+
+    if (internal::checked_cast<const DictionaryType&>(*chunked_array->type()).ordered()) {
+      Rf_classgets(alt, arrow::r::data::classes_ordered);
+    } else {
+      Rf_classgets(alt, arrow::r::data::classes_factor);
+    }
+
+    UNPROTECT(4);
+    return alt;
+  }
+
+  // TODO: this is similar to the primitive Materialize
+  static SEXP Materialize(SEXP alt) {
+    if (!IsMaterialized(alt)) {
+      auto size = Base::Length(alt);
+
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));
+
+      // copy the data from the array, through Get_region
+      Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));
+
+      // store as data2, this is now considered materialized
+      SetRepresentation(alt, copy);
+      MARK_NOT_MUTABLE(copy);
+
+      UNPROTECT(1);
+    }
+    return Representation(alt);
+  }
+
+  static const void* Dataptr_or_null(SEXP alt) {
+    if (IsMaterialized(alt)) {
+      return DATAPTR_RO(Representation(alt));
+    }
+
+    return nullptr;
+  }
+
+  static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); }
+
+  static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
+    // the representation integer vector
+    SEXP dup = PROTECT(Rf_lazy_duplicate(Materialize(alt)));
+
+    // additional attributes from the altrep
+    SEXP atts = PROTECT(Rf_duplicate(ATTRIB(alt)));
+    SET_ATTRIB(dup, atts);
+
+    UNPROTECT(2);
+    return dup;
+  }
+
+  // The value at position i
+  static int Elt(SEXP alt, R_xlen_t i) {
+    if (Base::IsMaterialized(alt)) {
+      return INTEGER_ELT(Representation(alt), i);
+    }
+
+    int out;
+    Get_region(alt, i, 1, &out);
+    return out;
+  }
+
+  static R_xlen_t Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) {
+    // If we have data2, we can just copy the region into buf
+    // using the standard Get_region for this R type
+    if (Base::IsMaterialized(alt)) {
+      return Standard_Get_region<int>(Representation(alt), start, n, buf);
+    }
+
+    auto chunked_array = GetChunkedArray(alt);
+
+    // get out if there is nothing to do
+    auto chunked_array_size = chunked_array->length();
+    if (start >= chunked_array_size) return 0;
+
+    auto slice = GetChunkedArray(alt)->Slice(start, n);
+
+    if (WasUnified(alt)) {
+      int j = 0;
+
+      // find out which is the first chunk of the chunk array
+      // that is present in the slice, because the main loop
+      // needs to refer to the correct transpose buffers
+      int64_t k = 0;
+      for (; j < chunked_array->num_chunks(); j++) {
+        auto nj = chunked_array->chunk(j)->length();
+        if (k + nj > start) {
+          break;
+        }
+
+        k += nj;
+      }
+
+      int* out = buf;
+      for (const auto& array : slice->chunks()) {
+        const auto& indices =
+            internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+        // using the transpose data for this chunk
+        const auto* transpose_data =
+            reinterpret_cast<const int32_t*>(GetArrayTransposed(alt, j)->data());
+        auto transpose = [transpose_data](int x) { return transpose_data[x]; };
+
+        GetRegionDispatch(array, indices, transpose, out);
+
+        out += array->length();
+        j++;
+      }
+
+    } else {
+      // simpler case, identity transpose
+      auto transpose = [](int x) { return x; };
+
+      int* out = buf;
+      for (const auto& array : slice->chunks()) {
+        const auto& indices =
+            internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+        GetRegionDispatch(array, indices, transpose, out);
+
+        out += array->length();
+      }
+    }
+
+    return slice->length();
+  }
+
+  template <typename Transpose>
+  static void GetRegionDispatch(const std::shared_ptr<Array>& array,
+                                const std::shared_ptr<Array>& indices,
+                                Transpose transpose, int* out) {
+    switch (indices->type_id()) {
+      case Type::UINT8:
+        GetRegionTranspose<UInt8Type>(array, indices, transpose, out);
+        break;
+      case Type::INT8:
+        GetRegionTranspose<Int8Type>(array, indices, transpose, out);
+        break;
+      case Type::UINT16:
+        GetRegionTranspose<UInt16Type>(array, indices, transpose, out);
+        break;
+      case Type::INT16:
+        GetRegionTranspose<Int16Type>(array, indices, transpose, out);
+        break;
+      case Type::INT32:
+        GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+        break;
+      case Type::UINT32:
+        GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+        break;
+      default:
+        break;
+    }
+  }
+
+  template <typename Type, typename Transpose>
+  static void GetRegionTranspose(const std::shared_ptr<Array>& array,
+                                 const std::shared_ptr<Array>& indices,
+                                 Transpose transpose, int* out) {
+    using index_type = typename arrow::TypeTraits<Type>::ArrayType::value_type;
+    auto raw_indices = indices->data()->GetValues<index_type>(1);
+
+    auto n = array->length();
+
+    // then set the R NA sentinels if needed
+    if (indices->null_count() > 0) {

Review comment:
       Nice. We can probably use this in other places. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to