[jira] [Commented] (ARROW-1883) [Python] BUG: Table.to_pandas metadata checking fails if columns are not present

ASF GitHub Bot (JIRA) Sun, 10 Dec 2017 15:41:25 -0800

    [ 
https://issues.apache.org/jira/browse/ARROW-1883?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16285414#comment-16285414
 ]


ASF GitHub Bot commented on ARROW-1883:
---------------------------------------

wesm closed pull request #1386: ARROW-1883: [Python] Fix handling of metadata 
in to_pandas when not all columns are present
URL: https://github.com/apache/arrow/pull/1386
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 668048fd6..b5d395fe5 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -641,19 +641,36 @@ def _add_any_metadata(table, pandas_metadata):
 
     schema = table.schema
 
+    index_columns = pandas_metadata['index_columns']
+    n_index_levels = len(index_columns)
+    n_columns = len(pandas_metadata['columns']) - n_index_levels
+
     # Add time zones
     for i, col_meta in enumerate(pandas_metadata['columns']):
-        if col_meta['pandas_type'] == 'datetimetz':
-            col = table[i]
-            converted = col.to_pandas()
-            tz = col_meta['metadata']['timezone']
-            tz_aware_type = pa.timestamp('ns', tz=tz)
-            with_metadata = pa.Array.from_pandas(converted.values,
-                                                 type=tz_aware_type)
-
-            field = pa.field(schema[i].name, tz_aware_type)
-            modified_columns[i] = pa.Column.from_array(field,
-                                                       with_metadata)
+
+        raw_name = col_meta.get('field_name')
+        if not raw_name:
+            # deal with metadata written with arrow < 0.8
+            raw_name = col_meta['name']
+            if i >= n_columns:
+                # index columns
+                raw_name = index_columns[i - n_columns]
+            if raw_name is None:
+                raw_name = 'None'
+
+        idx = schema.get_field_index(raw_name)
+        if idx != -1:
+            if col_meta['pandas_type'] == 'datetimetz':
+                col = table[idx]
+                converted = col.to_pandas()
+                tz = col_meta['metadata']['timezone']
+                tz_aware_type = pa.timestamp('ns', tz=tz)
+                with_metadata = pa.Array.from_pandas(converted.values,
+                                                     type=tz_aware_type)
+
+                field = pa.field(schema[idx].name, tz_aware_type)
+                modified_columns[idx] = pa.Column.from_array(field,
+                                                             with_metadata)
 
     if len(modified_columns) > 0:
         columns = []
diff --git a/python/pyarrow/tests/data/v0.7.1.column-metadata-handling.parquet b/python/pyarrow/tests/data/v0.7.1.column-metadata-handling.parquet
new file mode 100644
index 000000000..d48041f51
Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.column-metadata-handling.parquet differ
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 97bbb6a17..7609d3488 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1286,6 +1286,37 @@ def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
         assert pa.Array.from_pandas(expected,
                                     type=pa.list_(t())).equals(result)
 
+    def test_table_column_subset_metadata(self):
+        # ARROW-1883
+        df = pd.DataFrame({
+            'a': [1, 2, 3],
+            'b': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
+        table = pa.Table.from_pandas(df)
+
+        table_subset = table.remove_column(1)
+        result = table_subset.to_pandas()
+        tm.assert_frame_equal(result, df[['a']])
+
+        table_subset2 = table_subset.remove_column(1)
+        result = table_subset2.to_pandas()
+        tm.assert_frame_equal(result, df[['a']])
+
+        # non-default index
+        for index in [
+                pd.Index(['a', 'b', 'c'], name='index'),
+                pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]:
+            df = pd.DataFrame({'a': [1, 2, 3],
+                               'b': [.1, .2, .3]}, index=index)
+            table = pa.Table.from_pandas(df)
+
+            table_subset = table.remove_column(1)
+            result = table_subset.to_pandas()
+            tm.assert_frame_equal(result, df[['a']])
+
+            table_subset2 = table_subset.remove_column(1)
+            result = table_subset2.to_pandas()
+            tm.assert_frame_equal(result, df[['a']].reset_index(drop=True))
+
 
 def _fully_loaded_dataframe_example():
     from distutils.version import LooseVersion
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 2543e7d17..79e24d8d4 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1570,6 +1570,29 @@ def test_backwards_compatible_index_multi_level_some_named():
     tm.assert_frame_equal(result, expected)
 
 
+@parquet
+def test_backwards_compatible_column_metadata_handling():
+    expected = pd.DataFrame(
+        {'a': [1, 2, 3], 'b': [.1, .2, .3],
+         'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
+    expected.index = pd.MultiIndex.from_arrays(
+        [['a', 'b', 'c'],
+         pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
+        names=['index', None])
+
+    path = os.path.join(
+        os.path.dirname(__file__), 'data',
+        'v0.7.1.column-metadata-handling.parquet'
+    )
+    t = _read_table(path)
+    result = t.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+    t = _read_table(path, columns=['a'])
+    result = t.to_pandas()
+    tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))
+
+
 def test_decimal_roundtrip(tmpdir):
     num_values = 10
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] BUG: Table.to_pandas metadata checking fails if columns are not 
> present
> --------------------------------------------------------------------------------
>
>                 Key: ARROW-1883
>                 URL: https://issues.apache.org/jira/browse/ARROW-1883
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.7.1
>            Reporter: Joris Van den Bossche
>            Assignee: Joris Van den Bossche
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> Found this bug in the example in the pandas documentation 
> (http://pandas-docs.github.io/pandas-docs-travis/io.html#parquet), which does:
> {code}
> df = pd.DataFrame({'a': list('abc'),
>                    'b': list(range(1, 4)),
>                    'c': np.arange(3, 6).astype('u1'),
>                    'd': np.arange(4.0, 7.0, dtype='float64'),
>                    'e': [True, False, True],
>                    'f': pd.date_range('20130101', periods=3),
>                    'g': pd.date_range('20130101', periods=3, 
> tz='US/Eastern')})
> df.to_parquet('example_pa.parquet', engine='pyarrow')
> pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b'])
> {code}
> and this raises in the last line reading a subset of columns:
> {code}
> ...
> /home/joris/miniconda3/envs/dev/lib/python3.5/site-packages/pyarrow/pandas_compat.py
>  in _add_any_metadata(table, pandas_metadata)
>     357     for i, col_meta in enumerate(pandas_metadata['columns']):
>     358         if col_meta['pandas_type'] == 'datetimetz':
> --> 359             col = table[i]
>     360             converted = col.to_pandas()
>     361             tz = col_meta['metadata']['timezone']
> table.pxi in pyarrow.lib.Table.__getitem__()
> table.pxi in pyarrow.lib.Table.column()
> IndexError: Table column index 6 is out of range
> {code}
> This is due to checking the `pandas_metadata` for all columns (and, in this 
> case, trying to deal with a datetime tz column), while in practice not all 
> columns are present in the table (a 'mismatch' between the pandas metadata 
> and the actual schema). 
> A smaller example without parquet:
> {code}
> In [38]: df = pd.DataFrame({'a': [1, 2, 3], 'b': pd.date_range("2017-01-01", 
> periods=3, tz='Europe/Brussels')})
> In [39]: table = pyarrow.Table.from_pandas(df)
> In [40]: table
> Out[40]: 
> pyarrow.Table
> a: int64
> b: timestamp[ns, tz=Europe/Brussels]
> __index_level_0__: int64
> metadata
> --------
> {b'pandas': b'{"columns": [{"pandas_type": "int64", "metadata": null, 
> "numpy_t'
>             b'ype": "int64", "name": "a"}, {"pandas_type": "datetimetz", 
> "meta'
>             b'data": {"timezone": "Europe/Brussels"}, "numpy_type": 
> "datetime6'
>             b'4[ns, Europe/Brussels]", "name": "b"}, {"pandas_type": "int64", 
> '
>             b'"metadata": null, "numpy_type": "int64", "name": 
> "__index_level_'
>             b'0__"}], "index_columns": ["__index_level_0__"], 
> "pandas_version"'
>             b': "0.22.0.dev0+277.gd61f411"}'}
> In [41]: table.to_pandas()
> Out[41]: 
>    a                         b
> 0  1 2017-01-01 00:00:00+01:00
> 1  2 2017-01-02 00:00:00+01:00
> 2  3 2017-01-03 00:00:00+01:00
> In [44]: table_without_tz = table.remove_column(1)
> In [45]: table_without_tz
> Out[45]: 
> pyarrow.Table
> a: int64
> __index_level_0__: int64
> metadata
> --------
> {b'pandas': b'{"columns": [{"pandas_type": "int64", "metadata": null, 
> "numpy_t'
>             b'ype": "int64", "name": "a"}, {"pandas_type": "datetimetz", 
> "meta'
>             b'data": {"timezone": "Europe/Brussels"}, "numpy_type": 
> "datetime6'
>             b'4[ns, Europe/Brussels]", "name": "b"}, {"pandas_type": "int64", 
> '
>             b'"metadata": null, "numpy_type": "int64", "name": 
> "__index_level_'
>             b'0__"}], "index_columns": ["__index_level_0__"], 
> "pandas_version"'
>             b': "0.22.0.dev0+277.gd61f411"}'}
> In [46]: table_without_tz.to_pandas()          # <------ wrong output !
> Out[46]: 
>                                      a
> 1970-01-01 01:00:00+01:00            1
> 1970-01-01 01:00:00.000000001+01:00  2
> 1970-01-01 01:00:00.000000002+01:00  3
> In [47]: table_without_tz2 = table_without_tz.remove_column(1)
> In [48]: table_without_tz2
> Out[48]: 
> pyarrow.Table
> a: int64
> metadata
> --------
> {b'pandas': b'{"columns": [{"pandas_type": "int64", "metadata": null, 
> "numpy_t'
>             b'ype": "int64", "name": "a"}, {"pandas_type": "datetimetz", 
> "meta'
>             b'data": {"timezone": "Europe/Brussels"}, "numpy_type": 
> "datetime6'
>             b'4[ns, Europe/Brussels]", "name": "b"}, {"pandas_type": "int64", 
> '
>             b'"metadata": null, "numpy_type": "int64", "name": 
> "__index_level_'
>             b'0__"}], "index_columns": ["__index_level_0__"], 
> "pandas_version"'
>             b': "0.22.0.dev0+277.gd61f411"}'}
> In [49]: table_without_tz2.to_pandas()     # <------ error !
> ---------------------------------------------------------------------------
> IndexError                                Traceback (most recent call last)
> <ipython-input-49-c82f33476c6b> in <module>()
> ----> 1 table_without_tz2.to_pandas()
> table.pxi in pyarrow.lib.Table.to_pandas()
> /home/joris/miniconda3/envs/dev/lib/python3.5/site-packages/pyarrow/pandas_compat.py
>  in table_to_blockmanager(options, table, memory_pool, nthreads)
>     289         pandas_metadata = 
> json.loads(metadata[b'pandas'].decode('utf8'))
>     290         index_columns = pandas_metadata['index_columns']
> --> 291         table = _add_any_metadata(table, pandas_metadata)
>     292 
>     293     block_table = table
> /home/joris/miniconda3/envs/dev/lib/python3.5/site-packages/pyarrow/pandas_compat.py
>  in _add_any_metadata(table, pandas_metadata)
>     357     for i, col_meta in enumerate(pandas_metadata['columns']):
>     358         if col_meta['pandas_type'] == 'datetimetz':
> --> 359             col = table[i]
>     360             converted = col.to_pandas()
>     361             tz = col_meta['metadata']['timezone']
> table.pxi in pyarrow.lib.Table.__getitem__()
> table.pxi in pyarrow.lib.Table.column()
> IndexError: Table column index 1 is out of range
> {code}
> The reason is that `_add_any_metadata` does not check if the column it is 
> processing (currently only datetime tz columns need such processing) is 
> actually present in the schema.
> Working on a fix, will submit a PR.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (ARROW-1883) [Python] BUG: Table.to_pandas metadata checking fails if columns are not present

Reply via email to