[
https://issues.apache.org/jira/browse/ARROW-1895?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16285341#comment-16285341
]
ASF GitHub Bot commented on ARROW-1895:
---------------------------------------
xhochy closed pull request #1397: ARROW-1895/ARROW-1897: [Python] Add
field_name to pandas index metadata
URL: https://github.com/apache/arrow/pull/1397
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 8459ec31b..afb204db8 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -110,7 +110,11 @@ def get_logical_type_from_numpy(pandas_collection):
except KeyError:
if hasattr(pandas_collection.dtype, 'tz'):
return 'datetimetz'
- return infer_dtype(pandas_collection)
+ result = infer_dtype(pandas_collection)
+
+ if result == 'string':
+ return 'bytes' if PY2 else 'unicode'
+ return result
def get_extension_dtype_info(column):
@@ -122,7 +126,7 @@ def get_extension_dtype_info(column):
'num_categories': len(cats.categories),
'ordered': cats.ordered,
}
- physical_dtype = 'object'
+ physical_dtype = str(cats.codes.dtype)
elif hasattr(dtype, 'tz'):
metadata = {'timezone': str(dtype.tz)}
physical_dtype = 'datetime64[ns]'
@@ -132,7 +136,7 @@ def get_extension_dtype_info(column):
return physical_dtype, metadata
-def get_column_metadata(column, name, arrow_type):
+def get_column_metadata(column, name, arrow_type, field_name):
"""Construct the metadata for a given column
Parameters
@@ -140,6 +144,10 @@ def get_column_metadata(column, name, arrow_type):
column : pandas.Series or pandas.Index
name : str
arrow_type : pyarrow.DataType
+ field_name : str
+ Equivalent to `name` when `column` is a `Series`, otherwise if `column`
+ is a pandas Index then `field_name` will not be the same as `name`.
+ This is the name of the field in the arrow Table's schema.
Returns
-------
@@ -164,6 +172,7 @@ def get_column_metadata(column, name, arrow_type):
return {
'name': name,
+ 'field_name': str(field_name),
'pandas_type': logical_type,
'numpy_type': string_dtype,
'metadata': extra_metadata,
@@ -193,10 +202,14 @@ def construct_metadata(df, column_names, index_levels,
preserve_index, types):
index_types = types[ncolumns - len(index_levels):]
column_metadata = [
- get_column_metadata(df[col_name], name=sanitized_name,
- arrow_type=arrow_type)
- for col_name, sanitized_name, arrow_type in
- zip(df.columns, column_names, df_types)
+ get_column_metadata(
+ df[col_name],
+ name=sanitized_name,
+ arrow_type=arrow_type,
+ field_name=sanitized_name
+ ) for col_name, sanitized_name, arrow_type in zip(
+ df.columns, column_names, df_types
+ )
]
if preserve_index:
@@ -204,9 +217,13 @@ def construct_metadata(df, column_names, index_levels,
preserve_index, types):
index_level_name, range(len(index_levels))
))
index_column_metadata = [
- get_column_metadata(level, name=level.name, arrow_type=arrow_type)
- for i, (level, arrow_type) in enumerate(
- zip(index_levels, index_types)
+ get_column_metadata(
+ level,
+ name=level.name,
+ arrow_type=arrow_type,
+ field_name=field_name,
+ ) for i, (level, arrow_type, field_name) in enumerate(
+ zip(index_levels, index_types, index_column_names)
)
]
@@ -214,9 +231,16 @@ def construct_metadata(df, column_names, index_levels,
preserve_index, types):
for level in getattr(df.columns, 'levels', [df.columns]):
string_dtype, extra_metadata = get_extension_dtype_info(level)
+
+ pandas_type = get_logical_type_from_numpy(level)
+ if pandas_type == 'unicode':
+ assert not extra_metadata
+ extra_metadata = {'encoding': 'UTF-8'}
+
column_index = {
'name': level.name,
- 'pandas_type': get_logical_type_from_numpy(level),
+ 'field_name': level.name,
+ 'pandas_type': pandas_type,
'numpy_type': string_dtype,
'metadata': extra_metadata,
}
@@ -436,7 +460,6 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
schema = table.schema
row_count = table.num_rows
metadata = schema.metadata
- columns_metadata = None
has_pandas_metadata = metadata is not None and b'pandas' in metadata
@@ -446,13 +469,36 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
columns = pandas_metadata['columns']
column_indexes = pandas_metadata.get('column_indexes', [])
table = _add_any_metadata(table, pandas_metadata)
- columns_metadata = pandas_metadata.get('columns', None)
block_table = table
+ index_columns_set = frozenset(index_columns)
+
+ # 0. 'field_name' is the name of the column in the arrow Table
+ # 1. 'name' is the user-facing name of the column, that is, it came from
+ # pandas
+ # 2. 'field_name' and 'name' differ for index columns
+ # 3. We fall back on c['name'] for backwards compatibility
+ logical_index_names = [
+ c['name'] for c in columns
+ if c.get('field_name', c['name']) in index_columns_set
+ ]
+
+ # There must be the same number of field names and physical names
+ # (fields in the arrow Table)
+ assert len(logical_index_names) == len(index_columns_set)
+
+ # It can never be the case in a released version of pyarrow that
+ # c['name'] is None *and* 'field_name' is not a key in the column metadata,
+ # because the change to allow c['name'] to be None and the change to add
+ # 'field_name' are in the same release (0.8.0)
+ assert all(
+ (c['name'] is None and 'field_name' in c) or c['name'] is not None
+ for c in columns
+ )
+
# Build up a list of index columns and names while removing those columns
# from the original table
- logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
for raw_name, logical_name in zip(index_columns, logical_index_names):
i = schema.get_field_index(raw_name)
if i != -1:
@@ -483,15 +529,12 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
index = pd.RangeIndex(row_count)
column_strings = [x.name for x in block_table.itercolumns()]
- if columns_metadata is not None:
- columns_name_dict = dict(
- (str(x['name']), x['name'])
- for x in columns_metadata
- )
+ if columns:
+ columns_name_dict = {
+ c.get('field_name', str(c['name'])): c['name'] for c in columns
+ }
columns_values = [
- columns_name_dict[y]
- if y in columns_name_dict.keys() else y
- for y in column_strings
+ columns_name_dict.get(name, name) for name in column_strings
]
else:
columns_values = column_strings
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index e94ee4608..d7f059649 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -30,7 +30,7 @@
import pandas as pd
import pandas.util.testing as tm
-from pyarrow.compat import u
+from pyarrow.compat import u, PY2
import pyarrow as pa
import pyarrow.types as patypes
@@ -160,9 +160,41 @@ def test_integer_index_column(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
_check_pandas_roundtrip(df, preserve_index=True)
+ def test_index_metadata_field_name(self):
+ # test None case, and strangely named non-index columns
+ df = pd.DataFrame(
+ [(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
+ index=pd.MultiIndex.from_arrays(
+ [['c', 'b', 'a'], [3, 2, 1]],
+ names=[None, 'foo']
+ ),
+ columns=['a', None, '__index_level_0__'],
+ )
+ t = pa.Table.from_pandas(df, preserve_index=True)
+ raw_metadata = t.schema.metadata
+
+ js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
+
+ col1, col2, col3, idx0, foo = js['columns']
+
+ assert col1['name'] == 'a'
+ assert col1['name'] == col1['field_name']
+
+ assert col2['name'] is None
+ assert col2['field_name'] == 'None'
+
+ assert col3['name'] == '__index_level_0__'
+ assert col3['name'] == col3['field_name']
+
+ idx0_name, foo_name = js['index_columns']
+ assert idx0_name == '__index_level_0__'
+ assert idx0['field_name'] == idx0_name
+ assert idx0['name'] is None
+
+ assert foo_name == '__index_level_1__'
+ assert foo['name'] == 'foo'
+
def test_categorical_column_index(self):
- # I *really* hope no one uses category dtypes for single level column
- # indexes
df = pd.DataFrame(
[(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
columns=pd.Index(list('def'), dtype='category')
@@ -174,15 +206,36 @@ def test_categorical_column_index(self):
column_indexes, = js['column_indexes']
assert column_indexes['name'] is None
assert column_indexes['pandas_type'] == 'categorical'
- assert column_indexes['numpy_type'] == 'object'
+ assert column_indexes['numpy_type'] == 'int8'
md = column_indexes['metadata']
assert md['num_categories'] == 3
assert md['ordered'] is False
+ def test_string_column_index(self):
+ df = pd.DataFrame(
+ [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
+ columns=pd.Index(list('def'), name='stringz')
+ )
+ t = pa.Table.from_pandas(df, preserve_index=True)
+ raw_metadata = t.schema.metadata
+ js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
+
+ column_indexes, = js['column_indexes']
+ assert column_indexes['name'] == 'stringz'
+ assert column_indexes['name'] == column_indexes['field_name']
+ assert column_indexes['pandas_type'] == ('bytes' if PY2 else 'unicode')
+ assert column_indexes['numpy_type'] == 'object'
+
+ md = column_indexes['metadata']
+
+ if not PY2:
+ assert len(md) == 1
+ assert md['encoding'] == 'UTF-8'
+ else:
+ assert md is None or 'encoding' not in md
+
def test_datetimetz_column_index(self):
- # I *really* hope no one uses category dtypes for single level column
- # indexes
df = pd.DataFrame(
[(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
columns=pd.date_range(
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] Add field_name to pandas index metadata
> ------------------------------------------------
>
> Key: ARROW-1895
> URL: https://issues.apache.org/jira/browse/ARROW-1895
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Affects Versions: 0.7.1
> Reporter: Phillip Cloud
> Assignee: Phillip Cloud
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> See the discussion here for details:
> https://github.com/pandas-dev/pandas/pull/18201
> In short we need a way to map index column names to field names in an arrow
> Table.
> Additionally, we're depending on the index columns being written at the end
> of the table, and fixing this would allow us to read metadata written by
> other systems (e.g., fastparquet) that don't make this assumption.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)