This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new a1f569c433 GH-49689: [R][C++] Parquets do not support list-columns of 
ordered factors (ordered dictionaries) (#49937)
a1f569c433 is described below

commit a1f569c433ecdeca4a32aff20ae0ce868d73fb1a
Author: Nic Crane <[email protected]>
AuthorDate: Thu Jun 11 09:26:56 2026 +0100

    GH-49689: [R][C++] Parquets do not support list-columns of ordered factors 
(ordered dictionaries) (#49937)
    
    ### Rationale for this change
    
    Ordered dictionaries inside nested types lose their `ordered` flag during 
construction, because `DictionaryBuilder` doesn't track it.
    
    ### What changes are included in this PR?
    
    Store the `ordered` flag in `DictionaryBuilderBase` and pass it through 
when reconstructing the `DictionaryType` in `type()` and `FinishInternal()`.
    
    Remove the R-side workaround that was patching this for top-level columns 
only.
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No
    
    ### AI usage
    
    Written by Claude, reviewed by me and Codex (via 
[roborev](https://github.com/roborev-dev/roborev)).  I had Claude create the 
tests first, to make sure they failed as expected, then had Claude make the 
fixes and checked the tests passed.  I questioned the approach as I went.
    
    * GitHub Issue: #49689
    
    Lead-authored-by: Nic Crane <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Nic Crane <[email protected]>
---
 cpp/src/arrow/array/array_dict_test.cc | 76 ++++++++++++++++++++++++++++
 cpp/src/arrow/array/builder_dict.h     | 90 +++++++++++++++++++++++-----------
 cpp/src/arrow/builder.cc               | 26 +++++++---
 r/src/r_to_arrow.cpp                   |  8 ---
 r/tests/testthat/test-Array.R          | 21 ++++++++
 5 files changed, 176 insertions(+), 45 deletions(-)

diff --git a/cpp/src/arrow/array/array_dict_test.cc 
b/cpp/src/arrow/array/array_dict_test.cc
index 22d6d1fc5a..5f4335bcbc 100644
--- a/cpp/src/arrow/array/array_dict_test.cc
+++ b/cpp/src/arrow/array/array_dict_test.cc
@@ -1761,4 +1761,80 @@ TEST(TestDictionaryUnifier, TableZeroColumns) {
   AssertTablesEqual(*table, *unified);
 }
 
+// GH-49689: Ordered dictionary tests
+
+TEST(TestDictionaryBuilderOrdered, TypePreservesOrderedFlag) {
+  for (bool ordered : {true, false}) {
+    ARROW_SCOPED_TRACE("ordered = ", ordered);
+    auto dict_type = dictionary(int8(), utf8(), ordered);
+    ASSERT_OK_AND_ASSIGN(auto boxed_builder, MakeBuilder(dict_type));
+
+    auto builder_type = boxed_builder->type();
+    ASSERT_EQ(checked_cast<const DictionaryType&>(*builder_type).ordered(), 
ordered);
+  }
+}
+
+TEST(TestDictionaryBuilderOrdered, FinishPreservesOrderedFlag) {
+  for (bool ordered : {true, false}) {
+    ARROW_SCOPED_TRACE("ordered = ", ordered);
+    auto dict_type = dictionary(int8(), utf8(), ordered);
+    ASSERT_OK_AND_ASSIGN(auto boxed_builder, MakeBuilder(dict_type));
+    auto& builder = 
checked_cast<DictionaryBuilder<StringType>&>(*boxed_builder);
+
+    ASSERT_OK(builder.Append("a"));
+    ASSERT_OK(builder.Append("b"));
+    ASSERT_OK(builder.Append("a"));
+
+    std::shared_ptr<Array> result;
+    ASSERT_OK(builder.Finish(&result));
+
+    const auto& result_type = checked_cast<const 
DictionaryType&>(*result->type());
+    ASSERT_EQ(result_type.ordered(), ordered);
+
+    auto ex_dict = ArrayFromJSON(utf8(), R"(["a", "b"])");
+    auto ex_indices = ArrayFromJSON(int8(), "[0, 1, 0]");
+    DictionaryArray expected(dict_type, ex_indices, ex_dict);
+    AssertArraysEqual(expected, *result);
+  }
+}
+
+TEST(TestDictionaryBuilderOrdered, ListOfOrderedDictionary) {
+  for (bool ordered : {true, false}) {
+    ARROW_SCOPED_TRACE("ordered = ", ordered);
+    auto dict_type = dictionary(int8(), utf8(), ordered);
+    auto list_type = list(field("item", dict_type));
+
+    ASSERT_OK_AND_ASSIGN(auto boxed_builder, MakeBuilder(list_type));
+    auto& list_builder = checked_cast<ListBuilder&>(*boxed_builder);
+    auto& dict_builder =
+        
checked_cast<DictionaryBuilder<StringType>&>(*list_builder.value_builder());
+
+    ASSERT_OK(list_builder.Append());
+    ASSERT_OK(dict_builder.Append("a"));
+    ASSERT_OK(dict_builder.Append("b"));
+    ASSERT_OK(list_builder.Append());
+    ASSERT_OK(dict_builder.Append("a"));
+
+    std::shared_ptr<Array> result;
+    ASSERT_OK(list_builder.Finish(&result));
+
+    const auto& result_list_type = checked_cast<const 
ListType&>(*result->type());
+    const auto& result_dict_type =
+        checked_cast<const DictionaryType&>(*result_list_type.value_type());
+    ASSERT_EQ(result_dict_type.ordered(), ordered);
+  }
+}
+
+TEST(TestDictionaryBuilderOrdered, MakeDictionaryBuilderPreservesOrdered) {
+  for (bool ordered : {true, false}) {
+    ARROW_SCOPED_TRACE("ordered = ", ordered);
+    auto dict_type = dictionary(int8(), utf8(), ordered);
+    ASSERT_OK_AND_ASSIGN(auto builder,
+                         MakeDictionaryBuilder(dict_type, 
/*dictionary=*/nullptr));
+
+    auto builder_type = builder->type();
+    ASSERT_EQ(checked_cast<const DictionaryType&>(*builder_type).ordered(), 
ordered);
+  }
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_dict.h 
b/cpp/src/arrow/array/builder_dict.h
index 116c82049e..269cdeee64 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -154,26 +154,28 @@ class DictionaryBuilderBase : public ArrayBuilder {
                                     const std::shared_ptr<DataType>&>
                             value_type,
                         MemoryPool* pool = default_memory_pool(),
-                        int64_t alignment = kDefaultBufferAlignment)
+                        int64_t alignment = kDefaultBufferAlignment, bool 
ordered = false)
       : ArrayBuilder(pool, alignment),
         memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
         delta_offset_(0),
         byte_width_(-1),
         indices_builder_(start_int_size, pool, alignment),
-        value_type_(value_type) {}
+        value_type_(value_type),
+        ordered_(ordered) {}
 
   template <typename T1 = T>
   explicit DictionaryBuilderBase(
       enable_if_t<!is_fixed_size_binary_type<T1>::value, const 
std::shared_ptr<DataType>&>
           value_type,
       MemoryPool* pool = default_memory_pool(),
-      int64_t alignment = kDefaultBufferAlignment)
+      int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
       : ArrayBuilder(pool, alignment),
         memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
         delta_offset_(0),
         byte_width_(-1),
         indices_builder_(pool, alignment),
-        value_type_(value_type) {}
+        value_type_(value_type),
+        ordered_(ordered) {}
 
   template <typename T1 = T>
   explicit DictionaryBuilderBase(
@@ -181,13 +183,14 @@ class DictionaryBuilderBase : public ArrayBuilder {
       enable_if_t<!is_fixed_size_binary_type<T1>::value, const 
std::shared_ptr<DataType>&>
           value_type,
       MemoryPool* pool = default_memory_pool(),
-      int64_t alignment = kDefaultBufferAlignment)
+      int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
       : ArrayBuilder(pool, alignment),
         memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
         delta_offset_(0),
         byte_width_(-1),
         indices_builder_(index_type, pool, alignment),
-        value_type_(value_type) {}
+        value_type_(value_type),
+        ordered_(ordered) {}
 
   template <typename B = BuilderType, typename T1 = T>
   DictionaryBuilderBase(uint8_t start_int_size,
@@ -196,38 +199,41 @@ class DictionaryBuilderBase : public ArrayBuilder {
                                     const std::shared_ptr<DataType>&>
                             value_type,
                         MemoryPool* pool = default_memory_pool(),
-                        int64_t alignment = kDefaultBufferAlignment)
+                        int64_t alignment = kDefaultBufferAlignment, bool 
ordered = false)
       : ArrayBuilder(pool, alignment),
         memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
         delta_offset_(0),
         byte_width_(static_cast<const T1&>(*value_type).byte_width()),
         indices_builder_(start_int_size, pool, alignment),
-        value_type_(value_type) {}
+        value_type_(value_type),
+        ordered_(ordered) {}
 
   template <typename T1 = T>
   explicit DictionaryBuilderBase(
       enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> 
value_type,
       MemoryPool* pool = default_memory_pool(),
-      int64_t alignment = kDefaultBufferAlignment)
+      int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
       : ArrayBuilder(pool, alignment),
         memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
         delta_offset_(0),
         byte_width_(static_cast<const T1&>(*value_type).byte_width()),
         indices_builder_(pool, alignment),
-        value_type_(value_type) {}
+        value_type_(value_type),
+        ordered_(ordered) {}
 
   template <typename T1 = T>
   explicit DictionaryBuilderBase(
       const std::shared_ptr<DataType>& index_type,
       enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> 
value_type,
       MemoryPool* pool = default_memory_pool(),
-      int64_t alignment = kDefaultBufferAlignment)
+      int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
       : ArrayBuilder(pool, alignment),
         memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
         delta_offset_(0),
         byte_width_(static_cast<const T1&>(*value_type).byte_width()),
         indices_builder_(index_type, pool, alignment),
-        value_type_(value_type) {}
+        value_type_(value_type),
+        ordered_(ordered) {}
 
   template <typename T1 = T>
   explicit DictionaryBuilderBase(
@@ -237,13 +243,15 @@ class DictionaryBuilderBase : public ArrayBuilder {
   // This constructor doesn't check for errors. Use InsertMemoValues instead.
   explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
                                  MemoryPool* pool = default_memory_pool(),
-                                 int64_t alignment = kDefaultBufferAlignment)
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 bool ordered = false)
       : ArrayBuilder(pool, alignment),
         memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
         delta_offset_(0),
         byte_width_(-1),
         indices_builder_(pool, alignment),
-        value_type_(dictionary->type()) {}
+        value_type_(dictionary->type()),
+        ordered_(ordered) {}
 
   ~DictionaryBuilderBase() override = default;
 
@@ -490,7 +498,7 @@ class DictionaryBuilderBase : public ArrayBuilder {
   Status Finish(std::shared_ptr<DictionaryArray>* out) { return 
FinishTyped(out); }
 
   std::shared_ptr<DataType> type() const override {
-    return ::arrow::dictionary(indices_builder_.type(), value_type_);
+    return ::arrow::dictionary(indices_builder_.type(), value_type_, ordered_);
   }
 
  protected:
@@ -561,6 +569,7 @@ class DictionaryBuilderBase : public ArrayBuilder {
 
   BuilderType indices_builder_;
   std::shared_ptr<DataType> value_type_;
+  bool ordered_ = false;
 };
 
 template <typename BuilderType>
@@ -571,31 +580,53 @@ class DictionaryBuilderBase<BuilderType, NullType> : 
public ArrayBuilder {
       enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
           start_int_size,
       const std::shared_ptr<DataType>& value_type,
-      MemoryPool* pool = default_memory_pool())
-      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
+      MemoryPool* pool = default_memory_pool(),
+      int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
+      : ArrayBuilder(pool, alignment),
+        indices_builder_(start_int_size, pool, alignment),
+        ordered_(ordered) {}
 
   explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
-                                 MemoryPool* pool = default_memory_pool())
-      : ArrayBuilder(pool), indices_builder_(pool) {}
+                                 MemoryPool* pool = default_memory_pool(),
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 bool ordered = false)
+      : ArrayBuilder(pool, alignment),
+        indices_builder_(pool, alignment),
+        ordered_(ordered) {}
 
   explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
                                  const std::shared_ptr<DataType>& value_type,
-                                 MemoryPool* pool = default_memory_pool())
-      : ArrayBuilder(pool), indices_builder_(index_type, pool) {}
+                                 MemoryPool* pool = default_memory_pool(),
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 bool ordered = false)
+      : ArrayBuilder(pool, alignment),
+        indices_builder_(index_type, pool, alignment),
+        ordered_(ordered) {}
 
   template <typename B = BuilderType>
   explicit DictionaryBuilderBase(
       enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
           start_int_size,
-      MemoryPool* pool = default_memory_pool())
-      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
+      MemoryPool* pool = default_memory_pool(),
+      int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
+      : ArrayBuilder(pool, alignment),
+        indices_builder_(start_int_size, pool, alignment),
+        ordered_(ordered) {}
 
-  explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
-      : ArrayBuilder(pool), indices_builder_(pool) {}
+  explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool(),
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 bool ordered = false)
+      : ArrayBuilder(pool, alignment),
+        indices_builder_(pool, alignment),
+        ordered_(ordered) {}
 
   explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
-                                 MemoryPool* pool = default_memory_pool())
-      : ArrayBuilder(pool), indices_builder_(pool) {}
+                                 MemoryPool* pool = default_memory_pool(),
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 bool ordered = false)
+      : ArrayBuilder(pool, alignment),
+        indices_builder_(pool, alignment),
+        ordered_(ordered) {}
 
   /// \brief Append a scalar null value
   Status AppendNull() final {
@@ -647,7 +678,7 @@ class DictionaryBuilderBase<BuilderType, NullType> : public 
ArrayBuilder {
 
   Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
     ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
-    (*out)->type = dictionary((*out)->type, null());
+    (*out)->type = dictionary((*out)->type, null(), ordered_);
     (*out)->dictionary = NullArray(0).data();
     return Status::OK();
   }
@@ -659,11 +690,12 @@ class DictionaryBuilderBase<BuilderType, NullType> : 
public ArrayBuilder {
   Status Finish(std::shared_ptr<DictionaryArray>* out) { return 
FinishTyped(out); }
 
   std::shared_ptr<DataType> type() const override {
-    return ::arrow::dictionary(indices_builder_.type(), null());
+    return ::arrow::dictionary(indices_builder_.type(), null(), ordered_);
   }
 
  protected:
   BuilderType indices_builder_;
+  bool ordered_ = false;
 };
 
 }  // namespace internal
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 1a6a718c15..2190f1ce04 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -168,17 +168,21 @@ struct DictionaryBuilderCase {
   template <typename ValueType>
   Status CreateFor() {
     using AdaptiveBuilderType = DictionaryBuilder<ValueType>;
+    using ExactBuilderType =
+        internal::DictionaryBuilderBase<TypeErasedIntBuilder, ValueType>;
     if (dictionary != nullptr) {
-      out->reset(new AdaptiveBuilderType(dictionary, pool));
+      out->reset(
+          new AdaptiveBuilderType(dictionary, pool, kDefaultBufferAlignment, 
ordered));
     } else if (exact_index_type) {
       if (!is_integer(index_type->id())) {
         return Status::TypeError("MakeBuilder: invalid index type ", 
*index_type);
       }
-      out->reset(new internal::DictionaryBuilderBase<TypeErasedIntBuilder, 
ValueType>(
-          index_type, value_type, pool));
+      out->reset(new ExactBuilderType(index_type, value_type, pool,
+                                      kDefaultBufferAlignment, ordered));
     } else {
       auto start_int_size = index_type->byte_width();
-      out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool));
+      out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool,
+                                         kDefaultBufferAlignment, ordered));
     }
     return Status::OK();
   }
@@ -188,8 +192,9 @@ struct DictionaryBuilderCase {
   MemoryPool* pool;
   const std::shared_ptr<DataType>& index_type;
   const std::shared_ptr<DataType>& value_type;
-  const std::shared_ptr<Array>& dictionary;
+  std::shared_ptr<Array> dictionary;
   bool exact_index_type;
+  bool ordered;
   std::unique_ptr<ArrayBuilder>* out;
 };
 
@@ -206,6 +211,7 @@ struct MakeBuilderImpl {
                                      dict_type.value_type(),
                                      /*dictionary=*/nullptr,
                                      exact_index_type,
+                                     dict_type.ordered(),
                                      &out};
     return visitor.Make();
   }
@@ -332,9 +338,13 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const 
std::shared_ptr<DataType>&
                              const std::shared_ptr<Array>& dictionary,
                              std::unique_ptr<ArrayBuilder>* out) {
   const auto& dict_type = static_cast<const DictionaryType&>(*type);
-  DictionaryBuilderCase visitor = {
-      pool,       dict_type.index_type(),     dict_type.value_type(),
-      dictionary, /*exact_index_type=*/false, out};
+  DictionaryBuilderCase visitor = {pool,
+                                   dict_type.index_type(),
+                                   dict_type.value_type(),
+                                   dictionary,
+                                   /*exact_index_type=*/false,
+                                   dict_type.ordered(),
+                                   out};
   return visitor.Make();
 }
 
diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp
index 4312572a5e..cbe404aba9 100644
--- a/r/src/r_to_arrow.cpp
+++ b/r/src/r_to_arrow.cpp
@@ -996,14 +996,6 @@ class RDictionaryConverter<ValueType, 
enable_if_has_string_view<ValueType>>
   Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() override {
     ARROW_ASSIGN_OR_RAISE(auto result, this->builder_->Finish());
 
-    auto result_type = checked_cast<DictionaryType*>(result->type().get());
-    if (this->dict_type_->ordered() && !result_type->ordered()) {
-      // TODO: we should not have to do that, there is probably something wrong
-      //       in the DictionaryBuilder code
-      result->data()->type =
-          arrow::dictionary(result_type->index_type(), 
result_type->value_type(), true);
-    }
-
     return std::make_shared<ChunkedArray>(
         std::make_shared<DictionaryArray>(result->data()));
   }
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
index 8520160d12..9fcc3e6b86 100644
--- a/r/tests/testthat/test-Array.R
+++ b/r/tests/testthat/test-Array.R
@@ -643,6 +643,13 @@ test_that("arrow_array() handles vector -> list arrays 
(ARROW-7662)", {
   expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), 
list_of(dictionary(int8(), utf8())))
   expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), 
list_of(dictionary(int8(), utf8())))
 
+  # ordered factor (GH-49689)
+  expect_array_roundtrip(
+    list(ordered(c("b", "a"), levels = c("a", "b"))),
+    list_of(dictionary(int8(), utf8(), ordered = TRUE))
+  )
+  expect_array_roundtrip(list(ordered(NA, levels = c("a", "b"))), 
list_of(dictionary(int8(), utf8(), ordered = TRUE)))
+
   # struct
   expect_array_roundtrip(
     list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = 
logical(0))),
@@ -742,6 +749,13 @@ test_that("arrow_array() handles vector -> large list 
arrays", {
     as = large_list_of(dictionary(int8(), utf8()))
   )
 
+  # ordered factor (GH-49689)
+  expect_array_roundtrip(
+    list(ordered(c("b", "a"), levels = c("a", "b"))),
+    large_list_of(dictionary(int8(), utf8(), ordered = TRUE)),
+    as = large_list_of(dictionary(int8(), utf8(), ordered = TRUE))
+  )
+
   # struct
   expect_array_roundtrip(
     list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = 
logical(0))),
@@ -809,6 +823,13 @@ test_that("arrow_array() handles vector -> fixed size list 
arrays", {
     as = fixed_size_list_of(dictionary(int8(), utf8()), 2L)
   )
 
+  # ordered factor (GH-49689)
+  expect_array_roundtrip(
+    list(ordered(c("b", "a"), levels = c("a", "b"))),
+    fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L),
+    as = fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L)
+  )
+
   # struct
   expect_array_roundtrip(
     list(tibble::tibble(a = 1L, b = 1L, c = "", d = TRUE)),

Reply via email to