This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 55587efbf4 GH-47859: [C++] Fix creating union types without type_codes
for fields.size() == 128 (#47815)
55587efbf4 is described below
commit 55587efbf4f272afda97bff2f33d6aaf4b4c0c8a
Author: Daniil Timižev <[email protected]>
AuthorDate: Sun Nov 23 16:15:17 2025 +0300
GH-47859: [C++] Fix creating union types without type_codes for
fields.size() == 128 (#47815)
### Rationale for this change
There is a bug for creating union types with empty `type_codes`. If
`fields.size()` == 128 (`kMaxTypeCode` + 1) and `type_codes` is empty,
static_cast<int8_t> returns -128 and `internal::Iota` generates an empty vector
of type codes, but the expected vector is [0, 1, 2, ..., 127], where 127 is
`kMaxTypeCode`.
### What changes are included in this PR?
- Added a new `internal::Iota` function to generate vectors of size =
`length` with values from `start`.
- Changed `internal::Iota` call from old parameters to new for creating
`dense_union` and `sparse_union` types.
- Implemented a new test to detect this error.
### Are these changes tested?
Yes, there is a new test.
### Are there any user-facing changes?
No.
**This PR contains a "Critical Fix".**
(b) a bug that caused incorrect or invalid data to be produced
If `fields.size()` == 128 (`kMaxTypeCode` + 1), `internal::Iota` returns an
empty vector that cannot validate here:
https://github.com/apache/arrow/blob/main/cpp/src/arrow/type.cc#L1232
* GitHub Issue: #47859
Lead-authored-by: Daniil Timizhev <[email protected]>
Co-authored-by: Daniil Timižev <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/type.cc | 8 ++++----
cpp/src/arrow/type_test.cc | 31 +++++++++++++++++++++++++++++++
cpp/src/arrow/util/range.h | 12 +++++++++---
docs/source/format/Columnar.rst | 2 +-
4 files changed, 45 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 2e9d860a8d..cba4a0ecd3 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -3270,7 +3270,7 @@ std::shared_ptr<DataType>
run_end_encoded(std::shared_ptr<DataType> run_end_type
std::shared_ptr<DataType> sparse_union(FieldVector child_fields,
std::vector<int8_t> type_codes) {
if (type_codes.empty()) {
- type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
+ type_codes = internal::Iota<int8_t>(0, child_fields.size());
}
return std::make_shared<SparseUnionType>(std::move(child_fields),
std::move(type_codes));
@@ -3278,7 +3278,7 @@ std::shared_ptr<DataType> sparse_union(FieldVector
child_fields,
std::shared_ptr<DataType> dense_union(FieldVector child_fields,
std::vector<int8_t> type_codes) {
if (type_codes.empty()) {
- type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
+ type_codes = internal::Iota<int8_t>(0, child_fields.size());
}
return std::make_shared<DenseUnionType>(std::move(child_fields),
std::move(type_codes));
}
@@ -3310,7 +3310,7 @@ std::shared_ptr<DataType> sparse_union(const ArrayVector&
children,
std::vector<std::string> field_names,
std::vector<int8_t> type_codes) {
if (type_codes.empty()) {
- type_codes = internal::Iota(static_cast<int8_t>(children.size()));
+ type_codes = internal::Iota<int8_t>(0, children.size());
}
auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
return sparse_union(std::move(fields), std::move(type_codes));
@@ -3320,7 +3320,7 @@ std::shared_ptr<DataType> dense_union(const ArrayVector&
children,
std::vector<std::string> field_names,
std::vector<int8_t> type_codes) {
if (type_codes.empty()) {
- type_codes = internal::Iota(static_cast<int8_t>(children.size()));
+ type_codes = internal::Iota<int8_t>(0, children.size());
}
auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
return dense_union(std::move(fields), std::move(type_codes));
diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc
index cb17f1ac3f..e9b1d30e6e 100644
--- a/cpp/src/arrow/type_test.cc
+++ b/cpp/src/arrow/type_test.cc
@@ -22,6 +22,7 @@
#include <cstdint>
#include <functional>
#include <memory>
+#include <numeric>
#include <string>
#include <unordered_set>
#include <vector>
@@ -2191,6 +2192,36 @@ TEST(TestUnionType, Basics) {
ASSERT_EQ(ty6->child_ids(), child_ids2);
}
+TEST(TestUnionType, MaxTypeCode) {
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int32_t i = 0; i <= UnionType::kMaxTypeCode; i++) {
+ fields.push_back(field(std::to_string(i), int32()));
+ }
+
+ std::vector<int8_t> type_codes(fields.size());
+ std::iota(type_codes.begin(), type_codes.end(), 0);
+
+ auto t1 = checked_pointer_cast<UnionType>(dense_union(fields, type_codes));
+ ASSERT_EQ(t1->type_codes().size(), UnionType::kMaxTypeCode + 1);
+ ASSERT_EQ(t1->child_ids().size(), UnionType::kMaxTypeCode + 1);
+
+ auto t2 = checked_pointer_cast<UnionType>(dense_union(fields));
+ ASSERT_EQ(t2->type_codes().size(), UnionType::kMaxTypeCode + 1);
+ ASSERT_EQ(t2->child_ids().size(), UnionType::kMaxTypeCode + 1);
+
+ AssertTypeEqual(*t1, *t2);
+
+ auto t3 = checked_pointer_cast<UnionType>(sparse_union(fields, type_codes));
+ ASSERT_EQ(t3->type_codes().size(), UnionType::kMaxTypeCode + 1);
+ ASSERT_EQ(t3->child_ids().size(), UnionType::kMaxTypeCode + 1);
+
+ auto t4 = checked_pointer_cast<UnionType>(sparse_union(fields));
+ ASSERT_EQ(t4->type_codes().size(), UnionType::kMaxTypeCode + 1);
+ ASSERT_EQ(t4->child_ids().size(), UnionType::kMaxTypeCode + 1);
+
+ AssertTypeEqual(*t3, *t4);
+}
+
TEST(TestDictionaryType, Basics) {
auto value_type = int32();
diff --git a/cpp/src/arrow/util/range.h b/cpp/src/arrow/util/range.h
index 2055328798..55155b7e34 100644
--- a/cpp/src/arrow/util/range.h
+++ b/cpp/src/arrow/util/range.h
@@ -27,15 +27,21 @@
namespace arrow::internal {
+/// Create a vector containing the values from start with length elements
+template <typename T>
+std::vector<T> Iota(T start, size_t length) {
+ std::vector<T> result(length);
+ std::iota(result.begin(), result.end(), start);
+ return result;
+}
+
/// Create a vector containing the values from start up to stop
template <typename T>
std::vector<T> Iota(T start, T stop) {
if (start > stop) {
return {};
}
- std::vector<T> result(static_cast<size_t>(stop - start));
- std::iota(result.begin(), result.end(), start);
- return result;
+ return Iota<T>(start, static_cast<size_t>(stop - start));
}
/// Create a vector containing the values from 0 up to length
diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst
index 5002e80524..cf7ff0c3de 100644
--- a/docs/source/format/Columnar.rst
+++ b/docs/source/format/Columnar.rst
@@ -880,7 +880,7 @@ each value. Its physical layout is as follows:
* One child array for each type
* Types buffer: A buffer of 8-bit signed integers. Each type in the
union has a corresponding type id whose values are found in this
- buffer. A union with more than 127 possible types can be modeled as
+ buffer. A union with more than 128 possible types can be modeled as
a union of unions.
* Offsets buffer: A buffer of signed Int32 values indicating the
relative offset into the respective child array for the type in a