This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b3ba226ef7 GH-39603: [R] Error: Cannot convert Dictionary Array of 
type dictionary<values=large_string, indices=uint32, ordered=0> to R (#49710)
b3ba226ef7 is described below

commit b3ba226ef7cf43ba697aa21b671b93ede424d778
Author: Nic Crane <[email protected]>
AuthorDate: Wed Jun 10 15:57:49 2026 +0100

    GH-39603: [R] Error: Cannot convert Dictionary Array of type 
dictionary<values=large_string, indices=uint32, ordered=0> to R (#49710)
    
    ### Rationale for this change
    
    Dictionary Arrays with specific type indices caused error
    
    ### What changes are included in this PR?
    
    Allows those type indices and add check to ensure no overflow
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No
    
    ### AI Usage
    
    I used AI to create the changes but manually reviewed myself afterwards
    
    * GitHub Issue: #39603
    
    Authored-by: Nic Crane <[email protected]>
    Signed-off-by: Nic Crane <[email protected]>
---
 cpp/src/arrow/util/converter.h |  2 ++
 r/src/array_to_vector.cpp      | 26 ++++++++++++++++++++++++--
 r/src/r_to_arrow.cpp           |  4 ++--
 r/tests/testthat/test-Table.R  | 16 ++++++++--------
 4 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h
index c23d6ccd98..d987bf3061 100644
--- a/cpp/src/arrow/util/converter.h
+++ b/cpp/src/arrow/util/converter.h
@@ -238,7 +238,9 @@ struct MakeConverterImpl {
       DICTIONARY_CASE(FloatType);
       DICTIONARY_CASE(DoubleType);
       DICTIONARY_CASE(BinaryType);
+      DICTIONARY_CASE(LargeBinaryType);
       DICTIONARY_CASE(StringType);
+      DICTIONARY_CASE(LargeStringType);
       DICTIONARY_CASE(FixedSizeBinaryType);
 #undef DICTIONARY_CASE
       default:
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index 432b49503e..0992181acf 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -595,7 +595,9 @@ class Converter_Dictionary : public Converter {
         case Type::UINT16:
         case Type::INT16:
         case Type::INT32:
-          // TODO: also add int64, uint32, uint64 downcasts, if possible
+        case Type::UINT32:
+        case Type::INT64:
+        case Type::UINT64:
           break;
         default:
           cpp11::stop("Cannot convert Dictionary Array of type `%s` to R",
@@ -612,6 +614,16 @@ class Converter_Dictionary : public Converter {
         dictionary_ = CreateEmptyArray(dict_type.value_type());
       }
     }
+
+    // R factors store their codes in 32-bit integers, so dictionary arrays 
with
+    // more levels than that cannot be represented safely.
+    if (dictionary_->length() > std::numeric_limits<int>::max()) {
+      const auto& dict_type = checked_cast<const 
DictionaryType&>(*chunked_array->type());
+      cpp11::stop(
+          "Cannot convert Dictionary Array of type `%s` to R: dictionary has "
+          "more levels than an R factor can represent",
+          dict_type.ToString().c_str());
+    }
   }
 
   SEXP Allocate(R_xlen_t n) const {
@@ -653,6 +665,15 @@ class Converter_Dictionary : public Converter {
       case Type::INT32:
         return Ingest_some_nulls_Impl<arrow::Int32Type>(data, array, start, n,
                                                         chunk_index);
+      case Type::UINT32:
+        return Ingest_some_nulls_Impl<arrow::UInt32Type>(data, array, start, n,
+                                                         chunk_index);
+      case Type::INT64:
+        return Ingest_some_nulls_Impl<arrow::Int64Type>(data, array, start, n,
+                                                        chunk_index);
+      case Type::UINT64:
+        return Ingest_some_nulls_Impl<arrow::UInt64Type>(data, array, start, n,
+                                                         chunk_index);
       default:
         break;
     }
@@ -704,7 +725,8 @@ class Converter_Dictionary : public Converter {
     // TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;)
     // Alternative: preserve the logical type of the dictionary values
     // (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
-    if (dictionary_->type_id() != Type::STRING) {
+    if (dictionary_->type_id() != Type::STRING &&
+        dictionary_->type_id() != Type::LARGE_STRING) {
       cpp11::safe[Rf_warning]("Coercing dictionary values to R character 
factor levels");
     }
 
diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp
index 45d68043af..4312572a5e 100644
--- a/r/src/r_to_arrow.cpp
+++ b/r/src/r_to_arrow.cpp
@@ -1029,8 +1029,8 @@ class RDictionaryConverter<ValueType, 
enable_if_has_string_view<ValueType>>
 
     // first we need to handle the levels
     SEXP levels = Rf_getAttrib(x, R_LevelsSymbol);
-    auto memo_chunked_chunked_array =
-        arrow::r::vec_to_arrow_ChunkedArray(levels, utf8(), false);
+    auto memo_chunked_chunked_array = arrow::r::vec_to_arrow_ChunkedArray(
+        levels, this->dict_type_->value_type(), false);
     for (const auto& chunk : memo_chunked_chunked_array->chunks()) {
       RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*chunk));
     }
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index 1ca8832beb..12ef25975c 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -357,18 +357,18 @@ test_that("Table handles null type (ARROW-7064)", {
 
 test_that("Can create table with specific dictionary types", {
   fact <- example_data[, "fct"]
-  int_types <- c(int8(), int16(), int32(), int64())
-  # TODO: test uint types when format allows
-  # uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint
-  for (i in int_types) {
+  index_types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(), 
uint32(), uint64())
+  for (i in index_types) {
     sch <- schema(fct = dictionary(i, utf8()))
     tab <- Table$create(fact, schema = sch)
     expect_equal(sch, tab$schema)
-    if (i != int64()) {
-      # TODO: same downcast to int32 as we do for int64() type elsewhere
-      expect_equal_data_frame(tab, fact)
-    }
+    expect_equal_data_frame(tab, fact)
   }
+
+  # large_utf8 values exercise the non-ALTREP conversion path
+  tab_large <- Table$create(fact, schema = schema(fct = dictionary(uint32(), 
large_utf8())))
+  expect_equal(tab_large$schema, schema(fct = dictionary(uint32(), 
large_utf8())))
+  expect_equal_data_frame(tab_large, fact)
 })
 
 test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {

Reply via email to