This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new cff2c522e5 GH-40053: [Python] Preserve dict key order when inferring struct type (#48813)
cff2c522e5 is described below
commit cff2c522e580f052c6d3d1ce6cda7e46fd4db380
Author: Gabriel Simões <[email protected]>
AuthorDate: Tue Jan 13 10:10:56 2026 +0000
GH-40053: [Python] Preserve dict key order when inferring struct type (#48813)
### Rationale for this change
Fixes https://github.com/apache/arrow/issues/40053
When converting Python dictionaries to PyArrow arrays, struct fields are
sorted alphabetically instead of preserving the original dictionary key
insertion order. Since Python 3.7, dictionaries maintain insertion order, and
users expect this order to be preserved.
```python
>>> import pyarrow as pa
>>> pa.array([{"b": 2, "a": 1}]).type
struct<a: int64, b: int64>
```
Expected: `struct<b: int64, a: int64>`
### What changes are included in this PR?
Replace `std::map<std::string, TypeInferrer>` with
`std::vector<std::pair<std::string, TypeInferrer>>` plus a
`std::unordered_map<std::string, size_t>` index in the type inference code:
the vector preserves the order in which keys are first seen, while the hash
map gives constant-time lookup of each key's position. This follows the same
pattern already used in the JSON parser (`cpp/src/arrow/json/parser.cc`) for
the same problem.
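For reference, here is a minimal standalone sketch of this pattern. The class
and member names are illustrative only (this is not the actual `TypeInferrer`
code, which appears in the diff below); it just shows how a vector of pairs
plus an `unordered_map` index preserves first-seen key order with O(1) lookup.
```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Hypothetical helper illustrating the insertion-order-preserving pattern.
class OrderedFieldSet {
 public:
  // Returns the value slot for `key`, creating it on first use.
  // New keys are appended to the vector, so iteration follows insertion order.
  int& GetOrCreate(const std::string& key) {
    auto it = index_.find(key);
    if (it == index_.end()) {
      size_t new_index = fields_.size();
      fields_.emplace_back(key, 0);
      index_.emplace(key, new_index);
      return fields_.back().second;
    }
    return fields_[it->second].second;
  }

  // Iteration order matches the order in which keys were first seen.
  const std::vector<std::pair<std::string, int>>& fields() const {
    return fields_;
  }

 private:
  std::vector<std::pair<std::string, int>> fields_;  // preserves order
  std::unordered_map<std::string, size_t> index_;    // O(1) key -> position
};

int main() {
  OrderedFieldSet set;
  set.GetOrCreate("b") = 2;
  set.GetOrCreate("a") = 1;
  set.GetOrCreate("b") = 3;  // existing key keeps its original position
  for (const auto& [name, value] : set.fields()) {
    std::cout << name << " = " << value << "\n";  // prints b = 3, then a = 1
  }
  return 0;
}
```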
### Are these changes tested?
Updated existing tests to verify field ordering.
### Are there any user-facing changes?
Struct field order now matches dictionary key insertion order instead of
being sorted alphabetically. This is a behavioral change but aligns with user
expectations and Python semantics.
* GitHub Issue: #40053
Authored-by: Gabriel Silva Simoes <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/src/arrow/python/inference.cc | 28 +++++++++++++++++-----------
python/pyarrow/table.pxi | 28 ++++++++++++++--------------
python/pyarrow/tests/test_convert_builtin.py | 23 ++++++++++-------------
3 files changed, 41 insertions(+), 38 deletions(-)
diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc
index e5714862e4..06cb469483 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -22,8 +22,8 @@
#include <algorithm>
#include <limits>
-#include <map>
#include <string>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -704,15 +704,19 @@ class TypeInferrer {
Py_TYPE(key_obj)->tp_name, "'");
}
// Get or create visitor for this key
- auto it = struct_inferrers_.find(key);
- if (it == struct_inferrers_.end()) {
- it = struct_inferrers_
- .insert(
- std::make_pair(key, TypeInferrer(pandas_null_sentinels_,
- validate_interval_, make_unions_)))
- .first;
+ TypeInferrer* visitor;
+ auto it = struct_field_index_.find(key);
+ if (it == struct_field_index_.end()) {
+ // New field - add to vector and index
+ size_t new_index = struct_inferrers_.size();
+ struct_inferrers_.emplace_back(
+ key, TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
+ struct_field_index_.emplace(std::move(key), new_index);
+ visitor = &struct_inferrers_.back().second;
+ } else {
+ // Existing field - retrieve from vector
+ visitor = &struct_inferrers_[it->second].second;
}
- TypeInferrer* visitor = &it->second;
// We ignore termination signals from child visitors for now
//
@@ -730,7 +734,8 @@ class TypeInferrer {
Status GetStructType(std::shared_ptr<DataType>* out) {
std::vector<std::shared_ptr<Field>> fields;
- for (auto&& it : struct_inferrers_) {
+ fields.reserve(struct_inferrers_.size());
+ for (auto& it : struct_inferrers_) {
std::shared_ptr<DataType> field_type;
RETURN_NOT_OK(it.second.GetType(&field_type));
fields.emplace_back(field(it.first, field_type));
@@ -762,7 +767,8 @@ class TypeInferrer {
int64_t numpy_dtype_count_;
int64_t interval_count_;
std::unique_ptr<TypeInferrer> list_inferrer_;
- std::map<std::string, TypeInferrer> struct_inferrers_;
+ std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_;
+ std::unordered_map<std::string, size_t> struct_field_index_;
std::shared_ptr<DataType> scalar_type_;
// If we observe a strongly-typed value in e.g. a NumPy array, we can store
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 9136f25298..8e258e38af 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -3581,9 +3581,9 @@ cdef class RecordBatch(_Tabular):
>>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
... {'year': 2022, 'n_legs': 4}])
>>> pa.RecordBatch.from_struct_array(struct).to_pandas()
- animals n_legs year
- 0 Parrot 2 NaN
- 1 None 4 2022.0
+ n_legs animals year
+ 0 2 Parrot NaN
+ 1 4 None 2022.0
"""
cdef:
shared_ptr[CRecordBatch] c_record_batch
@@ -4468,18 +4468,18 @@ cdef class Table(_Tabular):
... names = ["a", "month"])
>>> table
pyarrow.Table
- a: struct<animals: string, n_legs: int64, year: int64>
- child 0, animals: string
- child 1, n_legs: int64
+ a: struct<n_legs: int64, animals: string, year: int64>
+ child 0, n_legs: int64
+ child 1, animals: string
child 2, year: int64
month: int64
----
a: [
-- is_valid: all not null
- -- child 0 type: string
- ["Parrot",null]
- -- child 1 type: int64
+ -- child 0 type: int64
[2,4]
+ -- child 1 type: string
+ ["Parrot",null]
-- child 2 type: int64
[null,2022]]
month: [[4,6]]
@@ -4488,13 +4488,13 @@ cdef class Table(_Tabular):
>>> table.flatten()
pyarrow.Table
- a.animals: string
a.n_legs: int64
+ a.animals: string
a.year: int64
month: int64
----
- a.animals: [["Parrot",null]]
a.n_legs: [[2,4]]
+ a.animals: [["Parrot",null]]
a.year: [[null,2022]]
month: [[4,6]]
"""
@@ -4936,9 +4936,9 @@ cdef class Table(_Tabular):
>>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
... {'year': 2022, 'n_legs': 4}])
>>> pa.Table.from_struct_array(struct).to_pandas()
- animals n_legs year
- 0 Parrot 2 NaN
- 1 None 4 2022.0
+ n_legs animals year
+ 0 2 Parrot NaN
+ 1 4 None 2022.0
"""
if isinstance(struct_array, Array):
return Table.from_batches([RecordBatch.from_struct_array(struct_array)])
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index f1461a302d..c10ae0f62b 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -69,14 +69,6 @@ class MyBrokenInt:
1/0 # MARKER
-def check_struct_type(ty, expected):
- """
- Check a struct type is as expected, but not taking order into account.
- """
- assert pa.types.is_struct(ty)
- assert set(ty) == set(expected)
-
-
def test_iterable_types():
arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
arr2 = pa.array((0, 1, 2, 3))
@@ -2010,25 +2002,29 @@ def test_struct_from_dicts_inference():
{'a': 6, 'b': 'bar', 'c': False}]
arr = pa.array(data)
- check_struct_type(arr.type, expected_type)
+ assert arr.type == expected_type
assert arr.to_pylist() == data
# With omitted values
+ # GH-40053: Field order follows first occurrence (a, c, then b)
data = [{'a': 5, 'c': True},
None,
{},
{'a': None, 'b': 'bar'}]
- expected = [{'a': 5, 'b': None, 'c': True},
+ expected_type_omitted = pa.struct([pa.field('a', pa.int64()),
+ pa.field('c', pa.bool_()),
+ pa.field('b', pa.string())])
+ expected = [{'a': 5, 'c': True, 'b': None},
None,
- {'a': None, 'b': None, 'c': None},
- {'a': None, 'b': 'bar', 'c': None}]
+ {'a': None, 'c': None, 'b': None},
+ {'a': None, 'c': None, 'b': 'bar'}]
arr = pa.array(data)
data_as_ndarray = np.empty(len(data), dtype=object)
data_as_ndarray[:] = data
arr2 = pa.array(data)
- check_struct_type(arr.type, expected_type)
+ assert arr.type == expected_type_omitted
assert arr.to_pylist() == expected
assert arr.equals(arr2)
@@ -2042,6 +2038,7 @@ def test_struct_from_dicts_inference():
{'a': None, 'b': 'bar'}]
arr = pa.array(data)
+ assert arr.type == expected_type
assert arr.to_pylist() == data
# Edge cases