This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 5042863 ARROW-1754: [Python] alternative fix for duplicate index/column name that preserves index name if available 5042863 is described below commit 5042863d88d02afa9ff791ef56aba11d90528aa5 Author: Joris Van den Bossche <jorisvandenboss...@gmail.com> AuthorDate: Fri Feb 2 12:25:27 2018 -0500 ARROW-1754: [Python] alternative fix for duplicate index/column name that preserves index name if available Related to the discussion about the pandas metadata specification in https://github.com/pandas-dev/pandas/pull/18201, and an alternative to https://github.com/apache/arrow/pull/1271. I don't open this PR because it should necessarily be merged, I just want to show that it is not that difficult to both fix [ARROW-1754](https://issues.apache.org/jira/browse/ARROW-1754) and preserve index names as field names when possible (as this was mentioned in https://github.com/pandas-dev/pandas/pull/18201 as the reason to make this change to not preserve index names). The diff is partly a revert of https://github.com/apache/arrow/pull/1271, but then adapted to the current codebase. Main reasons I prefer to preserve index names: 1) usability in pyarrow itself (if you would want to work with pyarrow Tables created from pandas) and 2) when interchanging parquet files with other people / other non-pandas systems, then it would be much nicer to not have `__index_level_n__` column names if possible. 
Author: Joris Van den Bossche <jorisvandenboss...@gmail.com> Closes #1408 from jorisvandenbossche/index-names and squashes the following commits: eef1d334 [Joris Van den Bossche] alternative fix for duplicate index/column name that preserves index name if available --- python/pyarrow/pandas_compat.py | 45 ++++++++++++++++++++--------- python/pyarrow/tests/test_convert_pandas.py | 5 ++-- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 4a30fb3..240cccd 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -179,10 +179,8 @@ def get_column_metadata(column, name, arrow_type, field_name): } -index_level_name = '__index_level_{:d}__'.format - - -def construct_metadata(df, column_names, index_levels, preserve_index, types): +def construct_metadata(df, column_names, index_levels, index_column_names, + preserve_index, types): """Returns a dictionary containing enough metadata to reconstruct a pandas DataFrame as an Arrow Table, including index columns. @@ -197,9 +195,8 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types): ------- dict """ - ncolumns = len(column_names) - df_types = types[:ncolumns - len(index_levels)] - index_types = types[ncolumns - len(index_levels):] + df_types = types[:-len(index_levels)] + index_types = types[-len(index_levels):] column_metadata = [ get_column_metadata( @@ -213,9 +210,6 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types): ] if preserve_index: - index_column_names = list(map( - index_level_name, range(len(index_levels)) - )) index_column_metadata = [ get_column_metadata( level, @@ -294,9 +288,29 @@ def _column_name_to_strings(name): return str(name) +def _index_level_name(index, i, column_names): + """Return the name of an index level or a default name if `index.name` is + None or is already a column name. 
+ + Parameters + ---------- + index : pandas.Index + i : int + + Returns + ------- + name : str + """ + if index.name is not None and index.name not in column_names: + return index.name + else: + return '__index_level_{:d}__'.format(i) + + def dataframe_to_arrays(df, schema, preserve_index, nthreads=1): - names = [] + column_names = [] index_columns = [] + index_column_names = [] type = None if preserve_index: @@ -324,12 +338,13 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1): columns_to_convert.append(col) convert_types.append(type) - names.append(name) + column_names.append(name) for i, column in enumerate(index_columns): columns_to_convert.append(column) convert_types.append(None) - names.append(index_level_name(i)) + name = _index_level_name(column, i, column_names) + index_column_names.append(name) # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether # using a thread pool is worth it. Currently the heuristic is whether the @@ -358,8 +373,10 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1): types = [x.type for x in arrays] metadata = construct_metadata( - df, names, index_columns, preserve_index, types + df, column_names, index_columns, index_column_names, preserve_index, + types ) + names = column_names + index_column_names return names, arrays, metadata diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index ca2f1e3..f1f40a6 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -191,8 +191,9 @@ class TestPandasConversion(object): assert idx0['field_name'] == idx0_name assert idx0['name'] is None - assert foo_name == '__index_level_1__' - assert foo['name'] == 'foo' + assert foo_name == 'foo' + assert foo['field_name'] == foo_name + assert foo['name'] == foo_name def test_categorical_column_index(self): df = pd.DataFrame( -- To stop receiving notification emails like this one, please contact 
wesm@apache.org.