[
https://issues.apache.org/jira/browse/ARROW-1895?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16285341#comment-16285341
]
ASF GitHub Bot commented on ARROW-1895:
---------------------------------------
xhochy closed pull request #1397: ARROW-1895/ARROW-1897: [Python] Add
field_name to pandas index metadata
URL: https://github.com/apache/arrow/pull/1397
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 8459ec31b..afb204db8 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -110,7 +110,11 @@ def get_logical_type_from_numpy(pandas_collection):
except KeyError:
if hasattr(pandas_collection.dtype, 'tz'):
return 'datetimetz'
- return infer_dtype(pandas_collection)
+ result = infer_dtype(pandas_collection)
+
+ if result == 'string':
+ return 'bytes' if PY2 else 'unicode'
+ return result
def get_extension_dtype_info(column):
@@ -122,7 +126,7 @@ def get_extension_dtype_info(column):
'num_categories': len(cats.categories),
'ordered': cats.ordered,
}
- physical_dtype = 'object'
+ physical_dtype = str(cats.codes.dtype)
elif hasattr(dtype, 'tz'):
metadata = {'timezone': str(dtype.tz)}
physical_dtype = 'datetime64[ns]'
@@ -132,7 +136,7 @@ def get_extension_dtype_info(column):
return physical_dtype, metadata
-def get_column_metadata(column, name, arrow_type):
+def get_column_metadata(column, name, arrow_type, field_name):
"""Construct the metadata for a given column
Parameters
@@ -140,6 +144,10 @@ def get_column_metadata(column, name, arrow_type):
column : pandas.Series or pandas.Index
name : str
arrow_type : pyarrow.DataType
+ field_name : str
+ Equivalent to `name` when `column` is a `Series`, otherwise if `column`
+ is a pandas Index then `field_name` will not be the same as `name`.
+ This is the name of the field in the arrow Table's schema.
Returns
-------
@@ -164,6 +172,7 @@ def get_column_metadata(column, name, arrow_type):
return {
'name': name,
+ 'field_name': str(field_name),
'pandas_type': logical_type,
'numpy_type': string_dtype,
'metadata': extra_metadata,
@@ -193,10 +202,14 @@ def construct_metadata(df, column_names, index_levels,
preserve_index, types):
index_types = types[ncolumns - len(index_levels):]
column_metadata = [
- get_column_metadata(df[col_name], name=sanitized_name,
- arrow_type=arrow_type)
- for col_name, sanitized_name, arrow_type in
- zip(df.columns, column_names, df_types)
+ get_column_metadata(
+ df[col_name],
+ name=sanitized_name,
+ arrow_type=arrow_type,
+ field_name=sanitized_name
+ ) for col_name, sanitized_name, arrow_type in zip(
+ df.columns, column_names, df_types
+ )
]
if preserve_index:
@@ -204,9 +217,13 @@ def construct_metadata(df, column_names, index_levels,
preserve_index, types):
index_level_name, range(len(index_levels))
))
index_column_metadata = [
- get_column_metadata(level, name=level.name, arrow_type=arrow_type)
- for i, (level, arrow_type) in enumerate(
- zip(index_levels, index_types)
+ get_column_metadata(
+ level,
+ name=level.name,
+ arrow_type=arrow_type,
+ field_name=field_name,
+ ) for i, (level, arrow_type, field_name) in enumerate(
+ zip(index_levels, index_types, index_column_names)
)
]
@@ -214,9 +231,16 @@ def construct_metadata(df, column_names, index_levels,
preserve_index, types):
for level in getattr(df.columns, 'levels', [df.columns]):
string_dtype, extra_metadata = get_extension_dtype_info(level)
+
+ pandas_type = get_logical_type_from_numpy(level)
+ if pandas_type == 'unicode':
+ assert not extra_metadata
+ extra_metadata = {'encoding': 'UTF-8'}
+
column_index = {
'name': level.name,
- 'pandas_type': get_logical_type_from_numpy(level),
+ 'field_name': level.name,
+ 'pandas_type': pandas_type,
'numpy_type': string_dtype,
'metadata': extra_metadata,
}
@@ -436,7 +460,6 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
schema = table.schema
row_count = table.num_rows
metadata = schema.metadata
- columns_metadata = None
has_pandas_metadata = metadata is not None and b'pandas' in metadata
@@ -446,13 +469,36 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
columns = pandas_metadata['columns']
column_indexes = pandas_metadata.get('column_indexes', [])
table = _add_any_metadata(table, pandas_metadata)
- columns_metadata = pandas_metadata.get('columns', None)
block_table = table
+ index_columns_set = frozenset(index_columns)
+
+ # 0. 'field_name' is the name of the column in the arrow Table
+ # 1. 'name' is the user-facing name of the column, that is, it came from
+ # pandas
+ # 2. 'field_name' and 'name' differ for index columns
+ # 3. We fall back on c['name'] for backwards compatibility
+ logical_index_names = [
+ c['name'] for c in columns
+ if c.get('field_name', c['name']) in index_columns_set
+ ]
+
+ # There must be the same number of field names and physical names
+ # (fields in the arrow Table)
+ assert len(logical_index_names) == len(index_columns_set)
+
+ # It can never be the case in a released version of pyarrow that
+ # c['name'] is None *and* 'field_name' is not a key in the column metadata,
+ # because the change to allow c['name'] to be None and the change to add
+ # 'field_name' are in the same release (0.8.0)
+ assert all(
+ (c['name'] is None and 'field_name' in c) or c['name'] is not None
+ for c in columns
+ )
+
# Build up a list of index columns and names while removing those columns
# from the original table
- logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
for raw_name, logical_name in zip(index_columns, logical_index_names):
i = schema.get_field_index(raw_name)
if i != -1:
@@ -483,15 +529,12 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
index = pd.RangeIndex(row_count)
column_strings = [x.name for x in block_table.itercolumns()]
- if columns_metadata is not None:
- columns_name_dict = dict(
- (str(x['name']), x['name'])
- for x in columns_metadata
- )
+ if columns:
+ columns_name_dict = {
+ c.get('field_name', str(c['name'])): c['name'] for c in columns
+ }
columns_values = [
- columns_name_dict[y]
- if y in columns_name_dict.keys() else y
- for y in column_strings
+ columns_name_dict.get(name, name) for name in column_strings
]
else:
columns_values = column_strings
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index e94ee4608..d7f059649 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -30,7 +30,7 @@
import pandas as pd
import pandas.util.testing as tm
-from pyarrow.compat import u
+from pyarrow.compat import u, PY2
import pyarrow as pa
import pyarrow.types as patypes
@@ -160,9 +160,41 @@ def test_integer_index_column(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
_check_pandas_roundtrip(df, preserve_index=True)
+ def test_index_metadata_field_name(self):
+ # test None case, and strangely named non-index columns
+ df = pd.DataFrame(
+ [(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
+ index=pd.MultiIndex.from_arrays(
+ [['c', 'b', 'a'], [3, 2, 1]],
+ names=[None, 'foo']
+ ),
+ columns=['a', None, '__index_level_0__'],
+ )
+ t = pa.Table.from_pandas(df, preserve_index=True)
+ raw_metadata = t.schema.metadata
+
+ js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
+
+ col1, col2, col3, idx0, foo = js['columns']
+
+ assert col1['name'] == 'a'
+ assert col1['name'] == col1['field_name']
+
+ assert col2['name'] is None
+ assert col2['field_name'] == 'None'
+
+ assert col3['name'] == '__index_level_0__'
+ assert col3['name'] == col3['field_name']
+
+ idx0_name, foo_name = js['index_columns']
+ assert idx0_name == '__index_level_0__'
+ assert idx0['field_name'] == idx0_name
+ assert idx0['name'] is None
+
+ assert foo_name == '__index_level_1__'
+ assert foo['name'] == 'foo'
+
def test_categorical_column_index(self):
- # I *really* hope no one uses category dtypes for single level column
- # indexes
df = pd.DataFrame(
[(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
columns=pd.Index(list('def'), dtype='category')
@@ -174,15 +206,36 @@ def test_categorical_column_index(self):
column_indexes, = js['column_indexes']
assert column_indexes['name'] is None
assert column_indexes['pandas_type'] == 'categorical'
- assert column_indexes['numpy_type'] == 'object'
+ assert column_indexes['numpy_type'] == 'int8'
md = column_indexes['metadata']
assert md['num_categories'] == 3
assert md['ordered'] is False
+ def test_string_column_index(self):
+ df = pd.DataFrame(
+ [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
+ columns=pd.Index(list('def'), name='stringz')
+ )
+ t = pa.Table.from_pandas(df, preserve_index=True)
+ raw_metadata = t.schema.metadata
+ js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
+
+ column_indexes, = js['column_indexes']
+ assert column_indexes['name'] == 'stringz'
+ assert column_indexes['name'] == column_indexes['field_name']
+ assert column_indexes['pandas_type'] == ('bytes' if PY2 else 'unicode')
+ assert column_indexes['numpy_type'] == 'object'
+
+ md = column_indexes['metadata']
+
+ if not PY2:
+ assert len(md) == 1
+ assert md['encoding'] == 'UTF-8'
+ else:
+ assert md is None or 'encoding' not in md
+
def test_datetimetz_column_index(self):
- # I *really* hope no one uses category dtypes for single level column
- # indexes
df = pd.DataFrame(
[(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
columns=pd.date_range(
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] Add field_name to pandas index metadata
> ------------------------------------------------
>
> Key: ARROW-1895
> URL: https://issues.apache.org/jira/browse/ARROW-1895
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Affects Versions: 0.7.1
> Reporter: Phillip Cloud
> Assignee: Phillip Cloud
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> See the discussion here for details:
> https://github.com/pandas-dev/pandas/pull/18201
> In short we need a way to map index column names to field names in an arrow
> Table.
> Additionally, we're depending on the index columns being written at the end
> of the table, and fixing this would allow us to read metadata written by
> other systems (e.g., fastparquet) that don't make this assumption.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)