This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b3ba226ef7 GH-39603: [R] Error: Cannot convert Dictionary Array of
type dictionary<values=large_string, indices=uint32, ordered=0> to R (#49710)
b3ba226ef7 is described below
commit b3ba226ef7cf43ba697aa21b671b93ede424d778
Author: Nic Crane <[email protected]>
AuthorDate: Wed Jun 10 15:57:49 2026 +0100
GH-39603: [R] Error: Cannot convert Dictionary Array of type
dictionary<values=large_string, indices=uint32, ordered=0> to R (#49710)
### Rationale for this change
Dictionary Arrays with specific type indices caused error
### What changes are included in this PR?
Allows those type indices and add check to ensure no overflow
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
### AI Usage
I used AI to create the changes but manually reviewed myself afterwards
* GitHub Issue: #39603
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
cpp/src/arrow/util/converter.h | 2 ++
r/src/array_to_vector.cpp | 26 ++++++++++++++++++++++++--
r/src/r_to_arrow.cpp | 4 ++--
r/tests/testthat/test-Table.R | 16 ++++++++--------
4 files changed, 36 insertions(+), 12 deletions(-)
diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h
index c23d6ccd98..d987bf3061 100644
--- a/cpp/src/arrow/util/converter.h
+++ b/cpp/src/arrow/util/converter.h
@@ -238,7 +238,9 @@ struct MakeConverterImpl {
DICTIONARY_CASE(FloatType);
DICTIONARY_CASE(DoubleType);
DICTIONARY_CASE(BinaryType);
+ DICTIONARY_CASE(LargeBinaryType);
DICTIONARY_CASE(StringType);
+ DICTIONARY_CASE(LargeStringType);
DICTIONARY_CASE(FixedSizeBinaryType);
#undef DICTIONARY_CASE
default:
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index 432b49503e..0992181acf 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -595,7 +595,9 @@ class Converter_Dictionary : public Converter {
case Type::UINT16:
case Type::INT16:
case Type::INT32:
- // TODO: also add int64, uint32, uint64 downcasts, if possible
+ case Type::UINT32:
+ case Type::INT64:
+ case Type::UINT64:
break;
default:
cpp11::stop("Cannot convert Dictionary Array of type `%s` to R",
@@ -612,6 +614,16 @@ class Converter_Dictionary : public Converter {
dictionary_ = CreateEmptyArray(dict_type.value_type());
}
}
+
+ // R factors store their codes in 32-bit integers, so dictionary arrays
with
+ // more levels than that cannot be represented safely.
+ if (dictionary_->length() > std::numeric_limits<int>::max()) {
+ const auto& dict_type = checked_cast<const
DictionaryType&>(*chunked_array->type());
+ cpp11::stop(
+ "Cannot convert Dictionary Array of type `%s` to R: dictionary has "
+ "more levels than an R factor can represent",
+ dict_type.ToString().c_str());
+ }
}
SEXP Allocate(R_xlen_t n) const {
@@ -653,6 +665,15 @@ class Converter_Dictionary : public Converter {
case Type::INT32:
return Ingest_some_nulls_Impl<arrow::Int32Type>(data, array, start, n,
chunk_index);
+ case Type::UINT32:
+ return Ingest_some_nulls_Impl<arrow::UInt32Type>(data, array, start, n,
+ chunk_index);
+ case Type::INT64:
+ return Ingest_some_nulls_Impl<arrow::Int64Type>(data, array, start, n,
+ chunk_index);
+ case Type::UINT64:
+ return Ingest_some_nulls_Impl<arrow::UInt64Type>(data, array, start, n,
+ chunk_index);
default:
break;
}
@@ -704,7 +725,8 @@ class Converter_Dictionary : public Converter {
// TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;)
// Alternative: preserve the logical type of the dictionary values
// (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
- if (dictionary_->type_id() != Type::STRING) {
+ if (dictionary_->type_id() != Type::STRING &&
+ dictionary_->type_id() != Type::LARGE_STRING) {
cpp11::safe[Rf_warning]("Coercing dictionary values to R character
factor levels");
}
diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp
index 45d68043af..4312572a5e 100644
--- a/r/src/r_to_arrow.cpp
+++ b/r/src/r_to_arrow.cpp
@@ -1029,8 +1029,8 @@ class RDictionaryConverter<ValueType,
enable_if_has_string_view<ValueType>>
// first we need to handle the levels
SEXP levels = Rf_getAttrib(x, R_LevelsSymbol);
- auto memo_chunked_chunked_array =
- arrow::r::vec_to_arrow_ChunkedArray(levels, utf8(), false);
+ auto memo_chunked_chunked_array = arrow::r::vec_to_arrow_ChunkedArray(
+ levels, this->dict_type_->value_type(), false);
for (const auto& chunk : memo_chunked_chunked_array->chunks()) {
RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*chunk));
}
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index 1ca8832beb..12ef25975c 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -357,18 +357,18 @@ test_that("Table handles null type (ARROW-7064)", {
test_that("Can create table with specific dictionary types", {
fact <- example_data[, "fct"]
- int_types <- c(int8(), int16(), int32(), int64())
- # TODO: test uint types when format allows
- # uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint
- for (i in int_types) {
+ index_types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(),
uint32(), uint64())
+ for (i in index_types) {
sch <- schema(fct = dictionary(i, utf8()))
tab <- Table$create(fact, schema = sch)
expect_equal(sch, tab$schema)
- if (i != int64()) {
- # TODO: same downcast to int32 as we do for int64() type elsewhere
- expect_equal_data_frame(tab, fact)
- }
+ expect_equal_data_frame(tab, fact)
}
+
+ # large_utf8 values exercise the non-ALTREP conversion path
+ tab_large <- Table$create(fact, schema = schema(fct = dictionary(uint32(),
large_utf8())))
+ expect_equal(tab_large$schema, schema(fct = dictionary(uint32(),
large_utf8())))
+ expect_equal_data_frame(tab_large, fact)
})
test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {