This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 55587efbf4 GH-47859: [C++] Fix creating union types without type_codes 
for fields.size() == 128 (#47815)
55587efbf4 is described below

commit 55587efbf4f272afda97bff2f33d6aaf4b4c0c8a
Author: Daniil Timižev <[email protected]>
AuthorDate: Sun Nov 23 16:15:17 2025 +0300

    GH-47859: [C++] Fix creating union types without type_codes for 
fields.size() == 128 (#47815)
    
    ### Rationale for this change
    
    There is a bug for creating union types with empty `type_codes`. If 
`fields.size()` == 128 (`kMaxTypeCode` + 1) and `type_codes` is empty, 
static_cast<int8_t> returns -128 and `internal::Iota` generates an empty vector 
of type codes, but the expected vector is [0, 1, 2, ..., 127], where 127 is 
`kMaxTypeCode`.
    
    ### What changes are included in this PR?
    
    - Added a new `internal::Iota` function to generate vectors of size = 
`length` with values from `start`.
    - Changed `internal::Iota` call from old parameters to new for creating 
`dense_union` and `sparse_union` types.
    - Implemented a new test to detect this error.
    
    ### Are these changes tested?
    
    Yes, there is a new test.
    
    ### Are there any user-facing changes?
    
    No.
    
    **This PR contains a "Critical Fix".**
    (b) a bug that caused incorrect or invalid data to be produced
    
    If `fields.size()` == 128 (`kMaxTypeCode` + 1), `internal::Iota` returns an 
empty vector that cannot validate here:
    https://github.com/apache/arrow/blob/main/cpp/src/arrow/type.cc#L1232
    * GitHub Issue: #47859
    
    Lead-authored-by: Daniil Timizhev <[email protected]>
    Co-authored-by: Daniil Timižev <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/type.cc           |  8 ++++----
 cpp/src/arrow/type_test.cc      | 31 +++++++++++++++++++++++++++++++
 cpp/src/arrow/util/range.h      | 12 +++++++++---
 docs/source/format/Columnar.rst |  2 +-
 4 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 2e9d860a8d..cba4a0ecd3 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -3270,7 +3270,7 @@ std::shared_ptr<DataType> 
run_end_encoded(std::shared_ptr<DataType> run_end_type
 std::shared_ptr<DataType> sparse_union(FieldVector child_fields,
                                        std::vector<int8_t> type_codes) {
   if (type_codes.empty()) {
-    type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
+    type_codes = internal::Iota<int8_t>(0, child_fields.size());
   }
   return std::make_shared<SparseUnionType>(std::move(child_fields),
                                            std::move(type_codes));
@@ -3278,7 +3278,7 @@ std::shared_ptr<DataType> sparse_union(FieldVector 
child_fields,
 std::shared_ptr<DataType> dense_union(FieldVector child_fields,
                                       std::vector<int8_t> type_codes) {
   if (type_codes.empty()) {
-    type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
+    type_codes = internal::Iota<int8_t>(0, child_fields.size());
   }
   return std::make_shared<DenseUnionType>(std::move(child_fields), 
std::move(type_codes));
 }
@@ -3310,7 +3310,7 @@ std::shared_ptr<DataType> sparse_union(const ArrayVector& 
children,
                                        std::vector<std::string> field_names,
                                        std::vector<int8_t> type_codes) {
   if (type_codes.empty()) {
-    type_codes = internal::Iota(static_cast<int8_t>(children.size()));
+    type_codes = internal::Iota<int8_t>(0, children.size());
   }
   auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
   return sparse_union(std::move(fields), std::move(type_codes));
@@ -3320,7 +3320,7 @@ std::shared_ptr<DataType> dense_union(const ArrayVector& 
children,
                                       std::vector<std::string> field_names,
                                       std::vector<int8_t> type_codes) {
   if (type_codes.empty()) {
-    type_codes = internal::Iota(static_cast<int8_t>(children.size()));
+    type_codes = internal::Iota<int8_t>(0, children.size());
   }
   auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
   return dense_union(std::move(fields), std::move(type_codes));
diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc
index cb17f1ac3f..e9b1d30e6e 100644
--- a/cpp/src/arrow/type_test.cc
+++ b/cpp/src/arrow/type_test.cc
@@ -22,6 +22,7 @@
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <numeric>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -2191,6 +2192,36 @@ TEST(TestUnionType, Basics) {
   ASSERT_EQ(ty6->child_ids(), child_ids2);
 }
 
+TEST(TestUnionType, MaxTypeCode) {
+  std::vector<std::shared_ptr<Field>> fields;
+  for (int32_t i = 0; i <= UnionType::kMaxTypeCode; i++) {
+    fields.push_back(field(std::to_string(i), int32()));
+  }
+
+  std::vector<int8_t> type_codes(fields.size());
+  std::iota(type_codes.begin(), type_codes.end(), 0);
+
+  auto t1 = checked_pointer_cast<UnionType>(dense_union(fields, type_codes));
+  ASSERT_EQ(t1->type_codes().size(), UnionType::kMaxTypeCode + 1);
+  ASSERT_EQ(t1->child_ids().size(), UnionType::kMaxTypeCode + 1);
+
+  auto t2 = checked_pointer_cast<UnionType>(dense_union(fields));
+  ASSERT_EQ(t2->type_codes().size(), UnionType::kMaxTypeCode + 1);
+  ASSERT_EQ(t2->child_ids().size(), UnionType::kMaxTypeCode + 1);
+
+  AssertTypeEqual(*t1, *t2);
+
+  auto t3 = checked_pointer_cast<UnionType>(sparse_union(fields, type_codes));
+  ASSERT_EQ(t3->type_codes().size(), UnionType::kMaxTypeCode + 1);
+  ASSERT_EQ(t3->child_ids().size(), UnionType::kMaxTypeCode + 1);
+
+  auto t4 = checked_pointer_cast<UnionType>(sparse_union(fields));
+  ASSERT_EQ(t4->type_codes().size(), UnionType::kMaxTypeCode + 1);
+  ASSERT_EQ(t4->child_ids().size(), UnionType::kMaxTypeCode + 1);
+
+  AssertTypeEqual(*t3, *t4);
+}
+
 TEST(TestDictionaryType, Basics) {
   auto value_type = int32();
 
diff --git a/cpp/src/arrow/util/range.h b/cpp/src/arrow/util/range.h
index 2055328798..55155b7e34 100644
--- a/cpp/src/arrow/util/range.h
+++ b/cpp/src/arrow/util/range.h
@@ -27,15 +27,21 @@
 
 namespace arrow::internal {
 
+/// Create a vector containing the values from start with length elements
+template <typename T>
+std::vector<T> Iota(T start, size_t length) {
+  std::vector<T> result(length);
+  std::iota(result.begin(), result.end(), start);
+  return result;
+}
+
 /// Create a vector containing the values from start up to stop
 template <typename T>
 std::vector<T> Iota(T start, T stop) {
   if (start > stop) {
     return {};
   }
-  std::vector<T> result(static_cast<size_t>(stop - start));
-  std::iota(result.begin(), result.end(), start);
-  return result;
+  return Iota<T>(start, static_cast<size_t>(stop - start));
 }
 
 /// Create a vector containing the values from 0 up to length
diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst
index 5002e80524..cf7ff0c3de 100644
--- a/docs/source/format/Columnar.rst
+++ b/docs/source/format/Columnar.rst
@@ -880,7 +880,7 @@ each value. Its physical layout is as follows:
 * One child array for each type
 * Types buffer: A buffer of 8-bit signed integers. Each type in the
   union has a corresponding type id whose values are found in this
-  buffer. A union with more than 127 possible types can be modeled as
+  buffer. A union with more than 128 possible types can be modeled as
   a union of unions.
 * Offsets buffer: A buffer of signed Int32 values indicating the
   relative offset into the respective child array for the type in a

Reply via email to