This is an automated email from the ASF dual-hosted git repository.
willayd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 0cd9c8e824 GH-33473 [Python] Fix KeyError on Pandas roundtrip with
RangeIndex in MultiIndex (#39983)
0cd9c8e824 is described below
commit 0cd9c8e824fb365ce66137648f47f7ed4fb9e764
Author: Eirik <[email protected]>
AuthorDate: Mon Nov 24 16:12:18 2025 +0100
GH-33473 [Python] Fix KeyError on Pandas roundtrip with RangeIndex in
MultiIndex (#39983)
### Rationale for this change
Fixes a KeyError when round-tripping to Pandas a DataFrame whose MultiIndex
contains a RangeIndex level
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
I don't know if this counts as a "Critical Fix". Without it,
`Table.from_pandas()` can return a table which cannot be converted back with
`table.to_pandas()` due to a column missing from the `"pandas"` field in the
table metadata.
* Closes: #33473
Authored-by: Eirik B. Stavestrand <[email protected]>
Signed-off-by: Will Ayd <[email protected]>
---
python/pyarrow/pandas_compat.py | 20 +++++++++++---------
python/pyarrow/tests/test_pandas.py | 8 ++++++++
2 files changed, 19 insertions(+), 9 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index b0009507e4..dfed76d371 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -221,8 +221,14 @@ def construct_metadata(columns_to_convert, df,
column_names, index_levels,
# see https://github.com/apache/arrow/pull/44963#discussion_r1875771953
column_field_names = [str(name) for name in column_names]
- num_serialized_index_levels = len([descr for descr in index_descriptors
- if not isinstance(descr, dict)])
+ serialized_index_levels = [
+ (level, descriptor)
+ for level, descriptor in zip(index_levels, index_descriptors)
+ if not isinstance(descriptor, dict)
+ ]
+
+ num_serialized_index_levels = len(serialized_index_levels)
+
# Use ntypes instead of Python shorthand notation [:-len(x)] as [:-0]
# behaves differently to what we want.
ntypes = len(types)
@@ -240,13 +246,9 @@ def construct_metadata(columns_to_convert, df,
column_names, index_levels,
index_column_metadata = []
if preserve_index is not False:
non_str_index_names = []
- for level, arrow_type, descriptor in zip(index_levels, index_types,
- index_descriptors):
- if isinstance(descriptor, dict):
- # The index is represented in a non-serialized fashion,
- # e.g. RangeIndex
- continue
-
+ for (level, descriptor), arrow_type in zip(
+ serialized_index_levels, index_types
+ ):
if level.name is not None and not isinstance(level.name, str):
non_str_index_names.append(level.name)
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index 5a55e90fbb..7f9b04eaab 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -345,6 +345,14 @@ class TestConvertMetadata:
)
_check_pandas_roundtrip(df, preserve_index=True)
+ def test_multiindex_rangeindex(self):
+ # https://github.com/apache/arrow/issues/33473
+ multiindex = pd.MultiIndex.from_arrays(
+ [pd.RangeIndex(0, 2), pd.Index([1, 2])]
+ )
+ df = pd.DataFrame(pd.Series([1, 2], name="a"), index=multiindex)
+ _check_pandas_roundtrip(df, preserve_index=None)
+
def test_integer_index_column(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
_check_pandas_roundtrip(df, preserve_index=True)