pitrou commented on a change in pull request #11738:
URL: https://github.com/apache/arrow/pull/11738#discussion_r752323855
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public
AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) {
SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification
is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return
!Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
+ }
+
+ static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+ std::shared_ptr<Array> dictionary;
+ SEXP pointer_arrays_transpose;
+
+ if (need_unification) {
+ const auto& arr_type =
+ internal::checked_cast<const
DictionaryType&>(*chunked_array->type());
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+ ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ Pointer<BufferVector> arrays_transpose(
+ new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ std::shared_ptr<Buffer>& transpose_i =
arrays_transpose->get()->operator[](i);
+ const auto& dict_i =
+ *internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(i))
+ .dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
+ }
+
+ std::shared_ptr<DataType> out_type;
+ StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
+
+ pointer_arrays_transpose = PROTECT(arrays_transpose);
+ } else {
+ // just use the first one
+ const auto& dict_array =
+ internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary = dict_array.dictionary();
+
+ pointer_arrays_transpose = PROTECT(R_NilValue);
+ }
+
+ // only dealing with dictionaries of strings
+ if (dictionary->type_id() != arrow::Type::STRING) {
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ // the chunked array as data1
+ SEXP data1 =
+ PROTECT(Pointer<ChunkedArray>(new
std::shared_ptr<ChunkedArray>(chunked_array)));
+
+ // a pairlist with the representation in the first node
+ SEXP data2 = PROTECT(Rf_list2(R_NilValue, // representation, empty at
first
+ pointer_arrays_transpose));
+
+ SEXP alt = PROTECT(R_new_altrep(class_t, data1, data2));
+ MARK_NOT_MUTABLE(alt);
+
+ // set factor attributes
+ Rf_setAttrib(alt, R_LevelsSymbol, Array__as_vector(dictionary));
+
+ if (internal::checked_cast<const
DictionaryType&>(*chunked_array->type()).ordered()) {
+ Rf_classgets(alt, arrow::r::data::classes_ordered);
+ } else {
+ Rf_classgets(alt, arrow::r::data::classes_factor);
+ }
+
+ UNPROTECT(4);
+ return alt;
+ }
+
+ // TODO: this is similar to the primitive Materialize
+ static SEXP Materialize(SEXP alt) {
+ if (!IsMaterialized(alt)) {
+ auto size = Base::Length(alt);
+
+ // create a standard R vector
+ SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));
+
+ // copy the data from the array, through Get_region
+ Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));
+
+ // store as data2, this is now considered materialized
+ SetRepresentation(alt, copy);
+ MARK_NOT_MUTABLE(copy);
+
+ UNPROTECT(1);
+ }
+ return Representation(alt);
+ }
+
+ static const void* Dataptr_or_null(SEXP alt) {
+ if (IsMaterialized(alt)) {
+ return DATAPTR_RO(Representation(alt));
+ }
+
+ return nullptr;
+ }
+
+ static void* Dataptr(SEXP alt, Rboolean writeable) { return
DATAPTR(Materialize(alt)); }
+
+ static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
+ // the representation integer vector
+ SEXP dup = PROTECT(Rf_lazy_duplicate(Materialize(alt)));
+
+ // additional attributes from the altrep
+ SEXP atts = PROTECT(Rf_duplicate(ATTRIB(alt)));
+ SET_ATTRIB(dup, atts);
+
+ UNPROTECT(2);
+ return dup;
+ }
+
+ // The value at position i
+ static int Elt(SEXP alt, R_xlen_t i) {
+ if (Base::IsMaterialized(alt)) {
+ return INTEGER_ELT(Representation(alt), i);
+ }
+
+ int out;
+ Get_region(alt, i, 1, &out);
Review comment:
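This is extremely inefficient, no? Every non-materialized `Elt` call goes through `Get_region`, which builds a `ChunkedArray` slice (and, when the dictionaries were unified, scans the chunks linearly to find the right transpose buffer) just to read a single value.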
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public
AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) {
SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification
is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return
!Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
+ }
+
+ static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+ std::shared_ptr<Array> dictionary;
+ SEXP pointer_arrays_transpose;
+
+ if (need_unification) {
+ const auto& arr_type =
+ internal::checked_cast<const
DictionaryType&>(*chunked_array->type());
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+ ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ Pointer<BufferVector> arrays_transpose(
+ new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ std::shared_ptr<Buffer>& transpose_i =
arrays_transpose->get()->operator[](i);
+ const auto& dict_i =
+ *internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(i))
+ .dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
Review comment:
Perhaps more readably:
```c++
BufferVector arrays_transpose(n_arrays);
for (size_t i = 0; i < n_arrays; i++) {
const auto& dict_i =
*internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(i))
.dictionary();
StopIfNotOk(unifier_->Unify(dict_i, &arrays_transpose[i]));
}
std::shared_ptr<DataType> out_type;
StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
Pointer<BufferVector> ptr(new std::shared_ptr<BufferVector>(
std::move(arrays_transpose)));
pointer_arrays_transpose = PROTECT(ptr);
```
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public
AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) {
SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification
is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return
!Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
+ }
+
+ static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+ std::shared_ptr<Array> dictionary;
+ SEXP pointer_arrays_transpose;
+
+ if (need_unification) {
+ const auto& arr_type =
+ internal::checked_cast<const
DictionaryType&>(*chunked_array->type());
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+ ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ Pointer<BufferVector> arrays_transpose(
+ new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ std::shared_ptr<Buffer>& transpose_i =
arrays_transpose->get()->operator[](i);
+ const auto& dict_i =
+ *internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(i))
+ .dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
+ }
+
+ std::shared_ptr<DataType> out_type;
+ StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
+
+ pointer_arrays_transpose = PROTECT(arrays_transpose);
+ } else {
+ // just use the first one
+ const auto& dict_array =
+ internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary = dict_array.dictionary();
+
+ pointer_arrays_transpose = PROTECT(R_NilValue);
+ }
+
+ // only dealing with dictionaries of strings
+ if (dictionary->type_id() != arrow::Type::STRING) {
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ // the chunked array as data1
+ SEXP data1 =
+ PROTECT(Pointer<ChunkedArray>(new
std::shared_ptr<ChunkedArray>(chunked_array)));
+
+ // a pairlist with the representation in the first node
+ SEXP data2 = PROTECT(Rf_list2(R_NilValue, // representation, empty at
first
+ pointer_arrays_transpose));
+
+ SEXP alt = PROTECT(R_new_altrep(class_t, data1, data2));
+ MARK_NOT_MUTABLE(alt);
+
+ // set factor attributes
+ Rf_setAttrib(alt, R_LevelsSymbol, Array__as_vector(dictionary));
+
+ if (internal::checked_cast<const
DictionaryType&>(*chunked_array->type()).ordered()) {
+ Rf_classgets(alt, arrow::r::data::classes_ordered);
+ } else {
+ Rf_classgets(alt, arrow::r::data::classes_factor);
+ }
+
+ UNPROTECT(4);
+ return alt;
+ }
+
+ // TODO: this is similar to the primitive Materialize
+ static SEXP Materialize(SEXP alt) {
+ if (!IsMaterialized(alt)) {
+ auto size = Base::Length(alt);
+
+ // create a standard R vector
+ SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));
+
+ // copy the data from the array, through Get_region
+ Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));
+
+ // store as data2, this is now considered materialized
+ SetRepresentation(alt, copy);
+ MARK_NOT_MUTABLE(copy);
+
+ UNPROTECT(1);
+ }
+ return Representation(alt);
+ }
+
+ static const void* Dataptr_or_null(SEXP alt) {
+ if (IsMaterialized(alt)) {
+ return DATAPTR_RO(Representation(alt));
+ }
+
+ return nullptr;
+ }
+
+ static void* Dataptr(SEXP alt, Rboolean writeable) { return
DATAPTR(Materialize(alt)); }
+
+ static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
+ // the representation integer vector
+ SEXP dup = PROTECT(Rf_lazy_duplicate(Materialize(alt)));
+
+ // additional attributes from the altrep
+ SEXP atts = PROTECT(Rf_duplicate(ATTRIB(alt)));
+ SET_ATTRIB(dup, atts);
+
+ UNPROTECT(2);
+ return dup;
+ }
+
+ // The value at position i
+ static int Elt(SEXP alt, R_xlen_t i) {
+ if (Base::IsMaterialized(alt)) {
+ return INTEGER_ELT(Representation(alt), i);
+ }
+
+ int out;
+ Get_region(alt, i, 1, &out);
+ return out;
+ }
+
+ static R_xlen_t Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) {
+ // If we have data2, we can just copy the region into buf
+ // using the standard Get_region for this R type
+ if (Base::IsMaterialized(alt)) {
+ return Standard_Get_region<int>(Representation(alt), start, n, buf);
+ }
+
+ auto chunked_array = GetChunkedArray(alt);
+
+ // get out if there is nothing to do
+ auto chunked_array_size = chunked_array->length();
+ if (start >= chunked_array_size) return 0;
+
+ auto slice = GetChunkedArray(alt)->Slice(start, n);
+
+ if (WasUnified(alt)) {
+ int j = 0;
+
+ // find out which is the first chunk of the chunk array
+ // that is present in the slice, because the main loop
+ // needs to refer to the correct transpose buffers
+ int64_t k = 0;
+ for (; j < chunked_array->num_chunks(); j++) {
+ auto nj = chunked_array->chunk(j)->length();
+ if (k + nj > start) {
+ break;
+ }
+
+ k += nj;
+ }
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ // using the transpose data for this chunk
+ const auto* transpose_data =
+ reinterpret_cast<const int32_t*>(GetArrayTransposed(alt,
j)->data());
+ auto transpose = [transpose_data](int x) { return transpose_data[x]; };
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ j++;
+ }
+
+ } else {
+ // simpler case, identity transpose
+ auto transpose = [](int x) { return x; };
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ }
+ }
+
+ return slice->length();
+ }
+
+ template <typename Transpose>
+ static void GetRegionDispatch(const std::shared_ptr<Array>& array,
+ const std::shared_ptr<Array>& indices,
+ Transpose transpose, int* out) {
+ switch (indices->type_id()) {
+ case Type::UINT8:
+ GetRegionTranspose<UInt8Type>(array, indices, transpose, out);
+ break;
+ case Type::INT8:
+ GetRegionTranspose<Int8Type>(array, indices, transpose, out);
+ break;
+ case Type::UINT16:
+ GetRegionTranspose<UInt16Type>(array, indices, transpose, out);
+ break;
+ case Type::INT16:
+ GetRegionTranspose<Int16Type>(array, indices, transpose, out);
+ break;
+ case Type::INT32:
+ GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+ break;
+ case Type::UINT32:
+ GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+ break;
+ default:
+ break;
+ }
+ }
+
+ template <typename Type, typename Transpose>
+ static void GetRegionTranspose(const std::shared_ptr<Array>& array,
+ const std::shared_ptr<Array>& indices,
+ Transpose transpose, int* out) {
+ using index_type = typename arrow::TypeTraits<Type>::ArrayType::value_type;
+ auto raw_indices = indices->data()->GetValues<index_type>(1);
+
+ auto n = array->length();
+
+ // then set the R NA sentinels if needed
+ if (indices->null_count() > 0) {
Review comment:
You can use `VisitArrayDataInline` here; it visits valid and null slots in a single pass:
```c++
VisitArrayDataInline<Type>(*array->data(),
/*valid_func=*/ [&](index_type index) {
*out++ = transpose(index) + 1;
},
/*null_func=*/ [&]() {
*out++ = cpp11::na<int>();
});
```
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public
AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) {
SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification
is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return
!Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
Review comment:
Nit: this can be written as `(*arrays)[i]`.
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public
AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) {
SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification
is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return
!Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
+ }
+
+ static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+ std::shared_ptr<Array> dictionary;
+ SEXP pointer_arrays_transpose;
+
+ if (need_unification) {
+ const auto& arr_type =
+ internal::checked_cast<const
DictionaryType&>(*chunked_array->type());
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+ ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ Pointer<BufferVector> arrays_transpose(
+ new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ std::shared_ptr<Buffer>& transpose_i =
arrays_transpose->get()->operator[](i);
+ const auto& dict_i =
+ *internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(i))
+ .dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
+ }
+
+ std::shared_ptr<DataType> out_type;
+ StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
+
+ pointer_arrays_transpose = PROTECT(arrays_transpose);
+ } else {
+ // just use the first one
+ const auto& dict_array =
+ internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary = dict_array.dictionary();
+
+ pointer_arrays_transpose = PROTECT(R_NilValue);
+ }
+
+ // only dealing with dictionaries of strings
+ if (dictionary->type_id() != arrow::Type::STRING) {
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ // the chunked array as data1
+ SEXP data1 =
+ PROTECT(Pointer<ChunkedArray>(new
std::shared_ptr<ChunkedArray>(chunked_array)));
+
+ // a pairlist with the representation in the first node
+ SEXP data2 = PROTECT(Rf_list2(R_NilValue, // representation, empty at
first
+ pointer_arrays_transpose));
+
+ SEXP alt = PROTECT(R_new_altrep(class_t, data1, data2));
+ MARK_NOT_MUTABLE(alt);
+
+ // set factor attributes
+ Rf_setAttrib(alt, R_LevelsSymbol, Array__as_vector(dictionary));
+
+ if (internal::checked_cast<const
DictionaryType&>(*chunked_array->type()).ordered()) {
+ Rf_classgets(alt, arrow::r::data::classes_ordered);
+ } else {
+ Rf_classgets(alt, arrow::r::data::classes_factor);
+ }
+
+ UNPROTECT(4);
+ return alt;
+ }
+
+ // TODO: this is similar to the primitive Materialize
+ static SEXP Materialize(SEXP alt) {
+ if (!IsMaterialized(alt)) {
+ auto size = Base::Length(alt);
+
+ // create a standard R vector
+ SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));
+
+ // copy the data from the array, through Get_region
+ Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));
+
+ // store as data2, this is now considered materialized
+ SetRepresentation(alt, copy);
+ MARK_NOT_MUTABLE(copy);
+
+ UNPROTECT(1);
+ }
+ return Representation(alt);
+ }
+
+ static const void* Dataptr_or_null(SEXP alt) {
+ if (IsMaterialized(alt)) {
+ return DATAPTR_RO(Representation(alt));
+ }
+
+ return nullptr;
+ }
+
+ static void* Dataptr(SEXP alt, Rboolean writeable) { return
DATAPTR(Materialize(alt)); }
+
+ static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
+ // the representation integer vector
+ SEXP dup = PROTECT(Rf_lazy_duplicate(Materialize(alt)));
+
+ // additional attributes from the altrep
+ SEXP atts = PROTECT(Rf_duplicate(ATTRIB(alt)));
+ SET_ATTRIB(dup, atts);
+
+ UNPROTECT(2);
+ return dup;
+ }
+
+ // The value at position i
+ static int Elt(SEXP alt, R_xlen_t i) {
+ if (Base::IsMaterialized(alt)) {
+ return INTEGER_ELT(Representation(alt), i);
+ }
+
+ int out;
+ Get_region(alt, i, 1, &out);
+ return out;
+ }
+
+ static R_xlen_t Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) {
+ // If we have data2, we can just copy the region into buf
+ // using the standard Get_region for this R type
+ if (Base::IsMaterialized(alt)) {
+ return Standard_Get_region<int>(Representation(alt), start, n, buf);
+ }
+
+ auto chunked_array = GetChunkedArray(alt);
+
+ // get out if there is nothing to do
+ auto chunked_array_size = chunked_array->length();
+ if (start >= chunked_array_size) return 0;
+
+ auto slice = GetChunkedArray(alt)->Slice(start, n);
+
+ if (WasUnified(alt)) {
+ int j = 0;
+
+ // find out which is the first chunk of the chunk array
+ // that is present in the slice, because the main loop
+ // needs to refer to the correct transpose buffers
+ int64_t k = 0;
+ for (; j < chunked_array->num_chunks(); j++) {
+ auto nj = chunked_array->chunk(j)->length();
+ if (k + nj > start) {
+ break;
+ }
+
+ k += nj;
+ }
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ // using the transpose data for this chunk
+ const auto* transpose_data =
+ reinterpret_cast<const int32_t*>(GetArrayTransposed(alt,
j)->data());
+ auto transpose = [transpose_data](int x) { return transpose_data[x]; };
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ j++;
+ }
+
+ } else {
+ // simpler case, identity transpose
+ auto transpose = [](int x) { return x; };
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ }
+ }
+
+ return slice->length();
+ }
+
+ template <typename Transpose>
+ static void GetRegionDispatch(const std::shared_ptr<Array>& array,
+ const std::shared_ptr<Array>& indices,
+ Transpose transpose, int* out) {
+ switch (indices->type_id()) {
+ case Type::UINT8:
+ GetRegionTranspose<UInt8Type>(array, indices, transpose, out);
+ break;
+ case Type::INT8:
+ GetRegionTranspose<Int8Type>(array, indices, transpose, out);
+ break;
+ case Type::UINT16:
+ GetRegionTranspose<UInt16Type>(array, indices, transpose, out);
+ break;
+ case Type::INT16:
+ GetRegionTranspose<Int16Type>(array, indices, transpose, out);
+ break;
+ case Type::INT32:
+ GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+ break;
+ case Type::UINT32:
+ GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+ break;
+ default:
+ break;
+ }
+ }
+
+ template <typename Type, typename Transpose>
+ static void GetRegionTranspose(const std::shared_ptr<Array>& array,
+ const std::shared_ptr<Array>& indices,
+ Transpose transpose, int* out) {
+ using index_type = typename arrow::TypeTraits<Type>::ArrayType::value_type;
Review comment:
`using index_type = typename Type::c_type;` should work, and is simpler than going through `TypeTraits<Type>::ArrayType::value_type`.
##########
File path: r/src/altrep.cpp
##########
@@ -51,8 +51,16 @@ extern "C" {
#include "./r_task_group.h"
+// defined in array_to_vector.cpp
+SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array);
+
namespace arrow {
namespace r {
+
+// defined in array_to_vector.cpp
+bool DictionaryChunkArrayNeedUnification(
+ const std::shared_ptr<ChunkedArray>& chunked_array);
Review comment:
Is there a reason these declarations cannot live in a shared `.h` instead of being re-declared here?
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public
AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) {
SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification
is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return
!Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
+ }
+
+ static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+ std::shared_ptr<Array> dictionary;
+ SEXP pointer_arrays_transpose;
+
+ if (need_unification) {
+ const auto& arr_type =
+ internal::checked_cast<const
DictionaryType&>(*chunked_array->type());
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+ ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ Pointer<BufferVector> arrays_transpose(
+ new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ std::shared_ptr<Buffer>& transpose_i =
arrays_transpose->get()->operator[](i);
+ const auto& dict_i =
+ *internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(i))
+ .dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
+ }
+
+ std::shared_ptr<DataType> out_type;
+ StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
+
+ pointer_arrays_transpose = PROTECT(arrays_transpose);
+ } else {
+ // just use the first one
+ const auto& dict_array =
+ internal::checked_cast<const
DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary = dict_array.dictionary();
+
+ pointer_arrays_transpose = PROTECT(R_NilValue);
+ }
+
+ // only dealing with dictionaries of strings
+ if (dictionary->type_id() != arrow::Type::STRING) {
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ // the chunked array as data1
+ SEXP data1 =
+ PROTECT(Pointer<ChunkedArray>(new
std::shared_ptr<ChunkedArray>(chunked_array)));
+
+ // a pairlist with the representation in the first node
+ SEXP data2 = PROTECT(Rf_list2(R_NilValue, // representation, empty at
first
+ pointer_arrays_transpose));
+
+ SEXP alt = PROTECT(R_new_altrep(class_t, data1, data2));
+ MARK_NOT_MUTABLE(alt);
+
+ // set factor attributes
+ Rf_setAttrib(alt, R_LevelsSymbol, Array__as_vector(dictionary));
+
+ if (internal::checked_cast<const
DictionaryType&>(*chunked_array->type()).ordered()) {
+ Rf_classgets(alt, arrow::r::data::classes_ordered);
+ } else {
+ Rf_classgets(alt, arrow::r::data::classes_factor);
+ }
+
+ UNPROTECT(4);
+ return alt;
+ }
+
+ // TODO: this is similar to the primitive Materialize
+ static SEXP Materialize(SEXP alt) {
+ if (!IsMaterialized(alt)) {
+ auto size = Base::Length(alt);
+
+ // create a standard R vector
+ SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));
+
+ // copy the data from the array, through Get_region
+ Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));
+
+ // store as data2, this is now considered materialized
+ SetRepresentation(alt, copy);
+ MARK_NOT_MUTABLE(copy);
+
+ UNPROTECT(1);
+ }
+ return Representation(alt);
+ }
+
+ static const void* Dataptr_or_null(SEXP alt) {
+ if (IsMaterialized(alt)) {
+ return DATAPTR_RO(Representation(alt));
+ }
+
+ return nullptr;
+ }
+
+ static void* Dataptr(SEXP alt, Rboolean writeable) { return
DATAPTR(Materialize(alt)); }
+
+ static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
+ // the representation integer vector
+ SEXP dup = PROTECT(Rf_lazy_duplicate(Materialize(alt)));
+
+ // additional attributes from the altrep
+ SEXP atts = PROTECT(Rf_duplicate(ATTRIB(alt)));
+ SET_ATTRIB(dup, atts);
+
+ UNPROTECT(2);
+ return dup;
+ }
+
+ // The value at position i
+ static int Elt(SEXP alt, R_xlen_t i) {
+ if (Base::IsMaterialized(alt)) {
+ return INTEGER_ELT(Representation(alt), i);
+ }
+
+ int out;
+ Get_region(alt, i, 1, &out);
+ return out;
+ }
+
+ static R_xlen_t Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) {
+ // If we have data2, we can just copy the region into buf
+ // using the standard Get_region for this R type
+ if (Base::IsMaterialized(alt)) {
+ return Standard_Get_region<int>(Representation(alt), start, n, buf);
+ }
+
+ auto chunked_array = GetChunkedArray(alt);
+
+ // get out if there is nothing to do
+ auto chunked_array_size = chunked_array->length();
+ if (start >= chunked_array_size) return 0;
+
+ auto slice = GetChunkedArray(alt)->Slice(start, n);
+
+ if (WasUnified(alt)) {
+ int j = 0;
+
+ // find out which is the first chunk of the chunk array
+ // that is present in the slice, because the main loop
+ // needs to refer to the correct transpose buffers
+ int64_t k = 0;
+ for (; j < chunked_array->num_chunks(); j++) {
+ auto nj = chunked_array->chunk(j)->length();
+ if (k + nj > start) {
+ break;
+ }
+
+ k += nj;
+ }
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ // using the transpose data for this chunk
+ const auto* transpose_data =
+ reinterpret_cast<const int32_t*>(GetArrayTransposed(alt,
j)->data());
+ auto transpose = [transpose_data](int x) { return transpose_data[x]; };
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ j++;
+ }
+
+ } else {
+ // simpler case, identity transpose
+ auto transpose = [](int x) { return x; };
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ }
+ }
+
+ return slice->length();
+ }
+
+ template <typename Transpose>
+ static void GetRegionDispatch(const std::shared_ptr<Array>& array,
+ const std::shared_ptr<Array>& indices,
+ Transpose transpose, int* out) {
Review comment:
Take the callable as a forwarding reference (`Transpose&& transpose`) and pass
`std::forward<Transpose>(transpose)` to the `GetRegionTranspose` calls below.
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public
AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) { SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return !Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
+ }
+
+ static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+ std::shared_ptr<Array> dictionary;
+ SEXP pointer_arrays_transpose;
+
+ if (need_unification) {
+ const auto& arr_type =
+ internal::checked_cast<const DictionaryType&>(*chunked_array->type());
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+ ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ Pointer<BufferVector> arrays_transpose(
+ new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ std::shared_ptr<Buffer>& transpose_i = arrays_transpose->get()->operator[](i);
+ const auto& dict_i =
+ *internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(i))
+ .dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
+ }
+
+ std::shared_ptr<DataType> out_type;
+ StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
+
+ pointer_arrays_transpose = PROTECT(arrays_transpose);
+ } else {
+ // just use the first one
+ const auto& dict_array =
+ internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary = dict_array.dictionary();
+
+ pointer_arrays_transpose = PROTECT(R_NilValue);
+ }
+
+ // only dealing with dictionaries of strings
+ if (dictionary->type_id() != arrow::Type::STRING) {
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ // the chunked array as data1
+ SEXP data1 =
+ PROTECT(Pointer<ChunkedArray>(new std::shared_ptr<ChunkedArray>(chunked_array)));
+
+ // a pairlist with the representation in the first node
+ SEXP data2 = PROTECT(Rf_list2(R_NilValue, // representation, empty at first
+ pointer_arrays_transpose));
+
+ SEXP alt = PROTECT(R_new_altrep(class_t, data1, data2));
+ MARK_NOT_MUTABLE(alt);
+
+ // set factor attributes
+ Rf_setAttrib(alt, R_LevelsSymbol, Array__as_vector(dictionary));
+
+ if (internal::checked_cast<const DictionaryType&>(*chunked_array->type()).ordered()) {
+ Rf_classgets(alt, arrow::r::data::classes_ordered);
+ } else {
+ Rf_classgets(alt, arrow::r::data::classes_factor);
+ }
+
+ UNPROTECT(4);
+ return alt;
+ }
+
+ // TODO: this is similar to the primitive Materialize
+ static SEXP Materialize(SEXP alt) {
+ if (!IsMaterialized(alt)) {
+ auto size = Base::Length(alt);
+
+ // create a standard R vector
+ SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));
+
+ // copy the data from the array, through Get_region
+ Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));
+
+ // store as data2, this is now considered materialized
+ SetRepresentation(alt, copy);
+ MARK_NOT_MUTABLE(copy);
+
+ UNPROTECT(1);
+ }
+ return Representation(alt);
+ }
+
+ static const void* Dataptr_or_null(SEXP alt) {
+ if (IsMaterialized(alt)) {
+ return DATAPTR_RO(Representation(alt));
+ }
+
+ return nullptr;
+ }
+
+ static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); }
+
+ static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
+ // the representation integer vector
+ SEXP dup = PROTECT(Rf_lazy_duplicate(Materialize(alt)));
+
+ // additional attributes from the altrep
+ SEXP atts = PROTECT(Rf_duplicate(ATTRIB(alt)));
+ SET_ATTRIB(dup, atts);
+
+ UNPROTECT(2);
+ return dup;
+ }
+
+ // The value at position i
+ static int Elt(SEXP alt, R_xlen_t i) {
+ if (Base::IsMaterialized(alt)) {
+ return INTEGER_ELT(Representation(alt), i);
+ }
+
+ int out;
+ Get_region(alt, i, 1, &out);
+ return out;
+ }
+
+ static R_xlen_t Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) {
+ // If we have data2, we can just copy the region into buf
+ // using the standard Get_region for this R type
+ if (Base::IsMaterialized(alt)) {
+ return Standard_Get_region<int>(Representation(alt), start, n, buf);
+ }
+
+ auto chunked_array = GetChunkedArray(alt);
+
+ // get out if there is nothing to do
+ auto chunked_array_size = chunked_array->length();
+ if (start >= chunked_array_size) return 0;
+
+ auto slice = GetChunkedArray(alt)->Slice(start, n);
+
+ if (WasUnified(alt)) {
+ int j = 0;
+
+ // find out which is the first chunk of the chunk array
+ // that is present in the slice, because the main loop
+ // needs to refer to the correct transpose buffers
+ int64_t k = 0;
+ for (; j < chunked_array->num_chunks(); j++) {
+ auto nj = chunked_array->chunk(j)->length();
+ if (k + nj > start) {
+ break;
+ }
+
+ k += nj;
+ }
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ // using the transpose data for this chunk
+ const auto* transpose_data =
+ reinterpret_cast<const int32_t*>(GetArrayTransposed(alt, j)->data());
+ auto transpose = [transpose_data](int x) { return transpose_data[x]; };
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ j++;
+ }
+
+ } else {
+ // simpler case, identity transpose
+ auto transpose = [](int x) { return x; };
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ }
+ }
+
+ return slice->length();
+ }
+
+ template <typename Transpose>
+ static void GetRegionDispatch(const std::shared_ptr<Array>& array,
+ const std::shared_ptr<Array>& indices,
+ Transpose transpose, int* out) {
+ switch (indices->type_id()) {
+ case Type::UINT8:
+ GetRegionTranspose<UInt8Type>(array, indices, transpose, out);
+ break;
+ case Type::INT8:
+ GetRegionTranspose<Int8Type>(array, indices, transpose, out);
+ break;
+ case Type::UINT16:
+ GetRegionTranspose<UInt16Type>(array, indices, transpose, out);
+ break;
+ case Type::INT16:
+ GetRegionTranspose<Int16Type>(array, indices, transpose, out);
+ break;
+ case Type::INT32:
+ GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+ break;
+ case Type::UINT32:
+ GetRegionTranspose<Int32Type>(array, indices, transpose, out);
Review comment:
This should be `UInt32Type`, not `Int32Type`.
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) { SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return !Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
+ }
+
+ static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+ std::shared_ptr<Array> dictionary;
+ SEXP pointer_arrays_transpose;
+
+ if (need_unification) {
+ const auto& arr_type =
+ internal::checked_cast<const DictionaryType&>(*chunked_array->type());
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+ ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ Pointer<BufferVector> arrays_transpose(
+ new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ std::shared_ptr<Buffer>& transpose_i = arrays_transpose->get()->operator[](i);
+ const auto& dict_i =
+ *internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(i))
+ .dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
+ }
+
+ std::shared_ptr<DataType> out_type;
+ StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
+
+ pointer_arrays_transpose = PROTECT(arrays_transpose);
+ } else {
+ // just use the first one
+ const auto& dict_array =
+ internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary = dict_array.dictionary();
+
+ pointer_arrays_transpose = PROTECT(R_NilValue);
+ }
+
+ // only dealing with dictionaries of strings
Review comment:
You could check this before trying to unify dictionaries, no?
##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +388,277 @@ struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sex
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using Base = AltrepVectorBase<AltrepFactor>;
+ using Base::IsMaterialized;
+
+ // redefining because data2 is a paired list with the representation as the
+ // first node: the CAR
+ static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+ static void SetRepresentation(SEXP alt, SEXP x) { SETCAR(R_altrep_data2(alt), x); }
+
+ // The CADR(data2) is used to store the transposed arrays when unification is needed
+ // In that case we store a vector of Buffers
+ using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+ static bool WasUnified(SEXP alt) { return !Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+ static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+ const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+ return arrays->operator[](i);
+ }
+
+ static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);
+
+ std::shared_ptr<Array> dictionary;
+ SEXP pointer_arrays_transpose;
+
+ if (need_unification) {
+ const auto& arr_type =
+ internal::checked_cast<const DictionaryType&>(*chunked_array->type());
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
+ ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ Pointer<BufferVector> arrays_transpose(
+ new std::shared_ptr<BufferVector>(new BufferVector(n_arrays)));
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ std::shared_ptr<Buffer>& transpose_i = arrays_transpose->get()->operator[](i);
+ const auto& dict_i =
+ *internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(i))
+ .dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &transpose_i));
+ }
+
+ std::shared_ptr<DataType> out_type;
+ StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));
+
+ pointer_arrays_transpose = PROTECT(arrays_transpose);
+ } else {
+ // just use the first one
+ const auto& dict_array =
+ internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary = dict_array.dictionary();
+
+ pointer_arrays_transpose = PROTECT(R_NilValue);
+ }
+
+ // only dealing with dictionaries of strings
+ if (dictionary->type_id() != arrow::Type::STRING) {
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ // the chunked array as data1
+ SEXP data1 =
+ PROTECT(Pointer<ChunkedArray>(new std::shared_ptr<ChunkedArray>(chunked_array)));
+
+ // a pairlist with the representation in the first node
+ SEXP data2 = PROTECT(Rf_list2(R_NilValue, // representation, empty at first
+ pointer_arrays_transpose));
+
+ SEXP alt = PROTECT(R_new_altrep(class_t, data1, data2));
+ MARK_NOT_MUTABLE(alt);
+
+ // set factor attributes
+ Rf_setAttrib(alt, R_LevelsSymbol, Array__as_vector(dictionary));
+
+ if (internal::checked_cast<const DictionaryType&>(*chunked_array->type()).ordered()) {
+ Rf_classgets(alt, arrow::r::data::classes_ordered);
+ } else {
+ Rf_classgets(alt, arrow::r::data::classes_factor);
+ }
+
+ UNPROTECT(4);
+ return alt;
+ }
+
+ // TODO: this is similar to the primitive Materialize
+ static SEXP Materialize(SEXP alt) {
+ if (!IsMaterialized(alt)) {
+ auto size = Base::Length(alt);
+
+ // create a standard R vector
+ SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));
+
+ // copy the data from the array, through Get_region
+ Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));
+
+ // store as data2, this is now considered materialized
+ SetRepresentation(alt, copy);
+ MARK_NOT_MUTABLE(copy);
+
+ UNPROTECT(1);
+ }
+ return Representation(alt);
+ }
+
+ static const void* Dataptr_or_null(SEXP alt) {
+ if (IsMaterialized(alt)) {
+ return DATAPTR_RO(Representation(alt));
+ }
+
+ return nullptr;
+ }
+
+ static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); }
+
+ static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
+ // the representation integer vector
+ SEXP dup = PROTECT(Rf_lazy_duplicate(Materialize(alt)));
+
+ // additional attributes from the altrep
+ SEXP atts = PROTECT(Rf_duplicate(ATTRIB(alt)));
+ SET_ATTRIB(dup, atts);
+
+ UNPROTECT(2);
+ return dup;
+ }
+
+ // The value at position i
+ static int Elt(SEXP alt, R_xlen_t i) {
+ if (Base::IsMaterialized(alt)) {
+ return INTEGER_ELT(Representation(alt), i);
+ }
+
+ int out;
+ Get_region(alt, i, 1, &out);
+ return out;
+ }
+
+ static R_xlen_t Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) {
+ // If we have data2, we can just copy the region into buf
+ // using the standard Get_region for this R type
+ if (Base::IsMaterialized(alt)) {
+ return Standard_Get_region<int>(Representation(alt), start, n, buf);
+ }
+
+ auto chunked_array = GetChunkedArray(alt);
+
+ // get out if there is nothing to do
+ auto chunked_array_size = chunked_array->length();
+ if (start >= chunked_array_size) return 0;
+
+ auto slice = GetChunkedArray(alt)->Slice(start, n);
+
+ if (WasUnified(alt)) {
+ int j = 0;
+
+ // find out which is the first chunk of the chunk array
+ // that is present in the slice, because the main loop
+ // needs to refer to the correct transpose buffers
+ int64_t k = 0;
+ for (; j < chunked_array->num_chunks(); j++) {
+ auto nj = chunked_array->chunk(j)->length();
+ if (k + nj > start) {
+ break;
+ }
+
+ k += nj;
+ }
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ // using the transpose data for this chunk
+ const auto* transpose_data =
+ reinterpret_cast<const int32_t*>(GetArrayTransposed(alt, j)->data());
+ auto transpose = [transpose_data](int x) { return transpose_data[x]; };
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ j++;
+ }
+
+ } else {
+ // simpler case, identity transpose
+ auto transpose = [](int x) { return x; };
+
+ int* out = buf;
+ for (const auto& array : slice->chunks()) {
+ const auto& indices =
+ internal::checked_cast<const DictionaryArray&>(*array).indices();
+
+ GetRegionDispatch(array, indices, transpose, out);
+
+ out += array->length();
+ }
+ }
+
+ return slice->length();
+ }
+
+ template <typename Transpose>
+ static void GetRegionDispatch(const std::shared_ptr<Array>& array,
+ const std::shared_ptr<Array>& indices,
+ Transpose transpose, int* out) {
+ switch (indices->type_id()) {
+ case Type::UINT8:
+ GetRegionTranspose<UInt8Type>(array, indices, transpose, out);
+ break;
+ case Type::INT8:
+ GetRegionTranspose<Int8Type>(array, indices, transpose, out);
+ break;
+ case Type::UINT16:
+ GetRegionTranspose<UInt16Type>(array, indices, transpose, out);
+ break;
+ case Type::INT16:
+ GetRegionTranspose<Int16Type>(array, indices, transpose, out);
+ break;
+ case Type::INT32:
+ GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+ break;
+ case Type::UINT32:
+ GetRegionTranspose<Int32Type>(array, indices, transpose, out);
+ break;
+ default:
Review comment:
Why are 64-bit indices not handled?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]