This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new cff2c522e5 GH-40053: [Python] Preserve dict key order when inferring 
struct type (#48813)
cff2c522e5 is described below

commit cff2c522e580f052c6d3d1ce6cda7e46fd4db380
Author: Gabriel Simões <[email protected]>
AuthorDate: Tue Jan 13 10:10:56 2026 +0000

    GH-40053: [Python] Preserve dict key order when inferring struct type 
(#48813)
    
    ### Rationale for this change
    
    Fixes https://github.com/apache/arrow/issues/40053
    
    When converting Python dictionaries to PyArrow arrays, struct fields are 
sorted alphabetically instead of preserving the original dictionary key 
insertion order. Since Python 3.7, dictionaries maintain insertion order, and 
users expect this order to be preserved.
    
    ```python
    >>> import pyarrow as pa
    >>> pa.array([{"b": 2, "a": 1}]).type
    struct<a: int64, b: int64>
    ```
    
    Expected: `struct<b: int64, a: int64>`
    
    ### What changes are included in this PR?
    
    Replace `std::map<std::string, TypeInferrer>` with 
`std::vector<std::pair<std::string, TypeInferrer>>` + 
`std::unordered_map<std::string, size_t>` in the type inference code. This 
follows the same pattern used in the JSON parser 
(`cpp/src/arrow/json/parser.cc`) for the same problem.
    
    ### Are these changes tested?
    
    Updated existing tests to verify field ordering.
    
    ### Are there any user-facing changes?
    
    Struct field order now matches dictionary key insertion order (each field 
appears at the position where its key is first encountered) instead of being 
sorted alphabetically. This is a behavioral change but aligns with user 
expectations and Python semantics.
    
    * GitHub Issue: #40053
    
    Authored-by: Gabriel Silva Simoes <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 python/pyarrow/src/arrow/python/inference.cc | 28 +++++++++++++++++-----------
 python/pyarrow/table.pxi                     | 28 ++++++++++++++--------------
 python/pyarrow/tests/test_convert_builtin.py | 23 ++++++++++-------------
 3 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/inference.cc 
b/python/pyarrow/src/arrow/python/inference.cc
index e5714862e4..06cb469483 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -22,8 +22,8 @@
 
 #include <algorithm>
 #include <limits>
-#include <map>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -704,15 +704,19 @@ class TypeInferrer {
                                  Py_TYPE(key_obj)->tp_name, "'");
       }
       // Get or create visitor for this key
-      auto it = struct_inferrers_.find(key);
-      if (it == struct_inferrers_.end()) {
-        it = struct_inferrers_
-                 .insert(
-                     std::make_pair(key, TypeInferrer(pandas_null_sentinels_,
-                                                      validate_interval_, 
make_unions_)))
-                 .first;
+      TypeInferrer* visitor;
+      auto it = struct_field_index_.find(key);
+      if (it == struct_field_index_.end()) {
+        // New field - add to vector and index
+        size_t new_index = struct_inferrers_.size();
+        struct_inferrers_.emplace_back(
+            key, TypeInferrer(pandas_null_sentinels_, validate_interval_, 
make_unions_));
+        struct_field_index_.emplace(std::move(key), new_index);
+        visitor = &struct_inferrers_.back().second;
+      } else {
+        // Existing field - retrieve from vector
+        visitor = &struct_inferrers_[it->second].second;
       }
-      TypeInferrer* visitor = &it->second;
 
       // We ignore termination signals from child visitors for now
       //
@@ -730,7 +734,8 @@ class TypeInferrer {
 
   Status GetStructType(std::shared_ptr<DataType>* out) {
     std::vector<std::shared_ptr<Field>> fields;
-    for (auto&& it : struct_inferrers_) {
+    fields.reserve(struct_inferrers_.size());
+    for (auto& it : struct_inferrers_) {
       std::shared_ptr<DataType> field_type;
       RETURN_NOT_OK(it.second.GetType(&field_type));
       fields.emplace_back(field(it.first, field_type));
@@ -762,7 +767,8 @@ class TypeInferrer {
   int64_t numpy_dtype_count_;
   int64_t interval_count_;
   std::unique_ptr<TypeInferrer> list_inferrer_;
-  std::map<std::string, TypeInferrer> struct_inferrers_;
+  std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_;
+  std::unordered_map<std::string, size_t> struct_field_index_;
   std::shared_ptr<DataType> scalar_type_;
 
   // If we observe a strongly-typed value in e.g. a NumPy array, we can store
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 9136f25298..8e258e38af 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -3581,9 +3581,9 @@ cdef class RecordBatch(_Tabular):
         >>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
         ...                    {'year': 2022, 'n_legs': 4}])
         >>> pa.RecordBatch.from_struct_array(struct).to_pandas()
-          animals  n_legs    year
-        0  Parrot       2     NaN
-        1    None       4  2022.0
+           n_legs animals    year
+        0       2  Parrot     NaN
+        1       4    None  2022.0
         """
         cdef:
             shared_ptr[CRecordBatch] c_record_batch
@@ -4468,18 +4468,18 @@ cdef class Table(_Tabular):
         ...                              names = ["a", "month"])
         >>> table
         pyarrow.Table
-        a: struct<animals: string, n_legs: int64, year: int64>
-          child 0, animals: string
-          child 1, n_legs: int64
+        a: struct<n_legs: int64, animals: string, year: int64>
+          child 0, n_legs: int64
+          child 1, animals: string
           child 2, year: int64
         month: int64
         ----
         a: [
           -- is_valid: all not null
-          -- child 0 type: string
-        ["Parrot",null]
-          -- child 1 type: int64
+          -- child 0 type: int64
         [2,4]
+          -- child 1 type: string
+        ["Parrot",null]
           -- child 2 type: int64
         [null,2022]]
         month: [[4,6]]
@@ -4488,13 +4488,13 @@ cdef class Table(_Tabular):
 
         >>> table.flatten()
         pyarrow.Table
-        a.animals: string
         a.n_legs: int64
+        a.animals: string
         a.year: int64
         month: int64
         ----
-        a.animals: [["Parrot",null]]
         a.n_legs: [[2,4]]
+        a.animals: [["Parrot",null]]
         a.year: [[null,2022]]
         month: [[4,6]]
         """
@@ -4936,9 +4936,9 @@ cdef class Table(_Tabular):
         >>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
         ...                    {'year': 2022, 'n_legs': 4}])
         >>> pa.Table.from_struct_array(struct).to_pandas()
-          animals  n_legs    year
-        0  Parrot       2     NaN
-        1    None       4  2022.0
+           n_legs animals    year
+        0       2  Parrot     NaN
+        1       4    None  2022.0
         """
         if isinstance(struct_array, Array):
             return 
Table.from_batches([RecordBatch.from_struct_array(struct_array)])
diff --git a/python/pyarrow/tests/test_convert_builtin.py 
b/python/pyarrow/tests/test_convert_builtin.py
index f1461a302d..c10ae0f62b 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -69,14 +69,6 @@ class MyBrokenInt:
         1/0  # MARKER
 
 
-def check_struct_type(ty, expected):
-    """
-    Check a struct type is as expected, but not taking order into account.
-    """
-    assert pa.types.is_struct(ty)
-    assert set(ty) == set(expected)
-
-
 def test_iterable_types():
     arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
     arr2 = pa.array((0, 1, 2, 3))
@@ -2010,25 +2002,29 @@ def test_struct_from_dicts_inference():
             {'a': 6, 'b': 'bar', 'c': False}]
 
     arr = pa.array(data)
-    check_struct_type(arr.type, expected_type)
+    assert arr.type == expected_type
     assert arr.to_pylist() == data
 
     # With omitted values
+    # GH-40053: Field order follows first occurrence (a, c, then b)
     data = [{'a': 5, 'c': True},
             None,
             {},
             {'a': None, 'b': 'bar'}]
-    expected = [{'a': 5, 'b': None, 'c': True},
+    expected_type_omitted = pa.struct([pa.field('a', pa.int64()),
+                                       pa.field('c', pa.bool_()),
+                                       pa.field('b', pa.string())])
+    expected = [{'a': 5, 'c': True, 'b': None},
                 None,
-                {'a': None, 'b': None, 'c': None},
-                {'a': None, 'b': 'bar', 'c': None}]
+                {'a': None, 'c': None, 'b': None},
+                {'a': None, 'c': None, 'b': 'bar'}]
 
     arr = pa.array(data)
     data_as_ndarray = np.empty(len(data), dtype=object)
     data_as_ndarray[:] = data
     arr2 = pa.array(data)
 
-    check_struct_type(arr.type, expected_type)
+    assert arr.type == expected_type_omitted
     assert arr.to_pylist() == expected
     assert arr.equals(arr2)
 
@@ -2042,6 +2038,7 @@ def test_struct_from_dicts_inference():
             {'a': None, 'b': 'bar'}]
     arr = pa.array(data)
 
+    assert arr.type == expected_type
     assert arr.to_pylist() == data
 
     # Edge cases

Reply via email to