This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5042863 ARROW-1754: [Python] alternative fix for duplicate
index/column name that preserves index name if available
5042863 is described below
commit 5042863d88d02afa9ff791ef56aba11d90528aa5
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Fri Feb 2 12:25:27 2018 -0500
ARROW-1754: [Python] alternative fix for duplicate index/column name that
preserves index name if available
Related to the discussion about the pandas metadata specification in
https://github.com/pandas-dev/pandas/pull/18201, and an alternative to
https://github.com/apache/arrow/pull/1271.
I am not opening this PR because it necessarily has to be merged; I just want
to show that it is not that difficult to both fix
[ARROW-1754](https://issues.apache.org/jira/browse/ARROW-1754) and preserve
index names as field names when possible (this was given in
https://github.com/pandas-dev/pandas/pull/18201 as the reason for the
change to stop preserving index names).
The diff is partly a revert of https://github.com/apache/arrow/pull/1271,
but then adapted to the current codebase.
The main reasons I prefer to preserve index names are: 1) usability in pyarrow
itself (when you want to work with pyarrow Tables created from pandas) and
2) interchange: when exchanging parquet files with other people or other
non-pandas systems, it is much nicer not to have `__index_level_n__` column
names if possible.
Author: Joris Van den Bossche <[email protected]>
Closes #1408 from jorisvandenbossche/index-names and squashes the following
commits:
eef1d334 [Joris Van den Bossche] alternative fix for duplicate index/column
name that preserves index name if available
---
python/pyarrow/pandas_compat.py | 45 ++++++++++++++++++++---------
python/pyarrow/tests/test_convert_pandas.py | 5 ++--
2 files changed, 34 insertions(+), 16 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 4a30fb3..240cccd 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -179,10 +179,8 @@ def get_column_metadata(column, name, arrow_type,
field_name):
}
-index_level_name = '__index_level_{:d}__'.format
-
-
-def construct_metadata(df, column_names, index_levels, preserve_index, types):
+def construct_metadata(df, column_names, index_levels, index_column_names,
+ preserve_index, types):
"""Returns a dictionary containing enough metadata to reconstruct a pandas
DataFrame as an Arrow Table, including index columns.
@@ -197,9 +195,8 @@ def construct_metadata(df, column_names, index_levels,
preserve_index, types):
-------
dict
"""
- ncolumns = len(column_names)
- df_types = types[:ncolumns - len(index_levels)]
- index_types = types[ncolumns - len(index_levels):]
+ df_types = types[:-len(index_levels)]
+ index_types = types[-len(index_levels):]
column_metadata = [
get_column_metadata(
@@ -213,9 +210,6 @@ def construct_metadata(df, column_names, index_levels,
preserve_index, types):
]
if preserve_index:
- index_column_names = list(map(
- index_level_name, range(len(index_levels))
- ))
index_column_metadata = [
get_column_metadata(
level,
@@ -294,9 +288,29 @@ def _column_name_to_strings(name):
return str(name)
+def _index_level_name(index, i, column_names):
+ """Return the name of an index level or a default name if `index.name` is
+ None or is already a column name.
+
+ Parameters
+ ----------
+ index : pandas.Index
+ i : int
+
+ Returns
+ -------
+ name : str
+ """
+ if index.name is not None and index.name not in column_names:
+ return index.name
+ else:
+ return '__index_level_{:d}__'.format(i)
+
+
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
- names = []
+ column_names = []
index_columns = []
+ index_column_names = []
type = None
if preserve_index:
@@ -324,12 +338,13 @@ def dataframe_to_arrays(df, schema, preserve_index,
nthreads=1):
columns_to_convert.append(col)
convert_types.append(type)
- names.append(name)
+ column_names.append(name)
for i, column in enumerate(index_columns):
columns_to_convert.append(column)
convert_types.append(None)
- names.append(index_level_name(i))
+ name = _index_level_name(column, i, column_names)
+ index_column_names.append(name)
# NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
# using a thread pool is worth it. Currently the heuristic is whether the
@@ -358,8 +373,10 @@ def dataframe_to_arrays(df, schema, preserve_index,
nthreads=1):
types = [x.type for x in arrays]
metadata = construct_metadata(
- df, names, index_columns, preserve_index, types
+ df, column_names, index_columns, index_column_names, preserve_index,
+ types
)
+ names = column_names + index_column_names
return names, arrays, metadata
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index ca2f1e3..f1f40a6 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -191,8 +191,9 @@ class TestPandasConversion(object):
assert idx0['field_name'] == idx0_name
assert idx0['name'] is None
- assert foo_name == '__index_level_1__'
- assert foo['name'] == 'foo'
+ assert foo_name == 'foo'
+ assert foo['field_name'] == foo_name
+ assert foo['name'] == foo_name
def test_categorical_column_index(self):
df = pd.DataFrame(
--
To stop receiving notification emails like this one, please contact
[email protected].