[ https://issues.apache.org/jira/browse/ARROW-1976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16353146#comment-16353146 ]

ASF GitHub Bot commented on ARROW-1976:
---------------------------------------

wesm closed pull request #1553: ARROW-1976: [Python] Handling unicode pandas columns on parquet.read_table
URL: https://github.com/apache/arrow/pull/1553

This is a PR merged from a forked repository. As GitHub hides the original
diff on merge, it is displayed below for the sake of provenance:

diff --git a/.gitignore b/.gitignore
index e6dfe19bb..c38694e1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,4 @@ cpp/.idea/
 python/.eggs/
 .vscode
 .idea/
+.pytest_cache/
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 240cccdaf..987bb7555 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -170,9 +170,11 @@ def get_column_metadata(column, name, arrow_type, field_name):
             )
         )
 
+    assert field_name is None or isinstance(field_name, six.string_types), \
+        str(type(field_name))
     return {
         'name': name,
-        'field_name': str(field_name),
+        'field_name': 'None' if field_name is None else field_name,
         'pandas_type': logical_type,
         'numpy_type': string_dtype,
         'metadata': extra_metadata,
@@ -279,8 +281,11 @@ def _column_name_to_strings(name):
     """
     if isinstance(name, six.string_types):
         return name
+    elif isinstance(name, six.binary_type):
+        # XXX: should we assume that bytes in Python 3 are UTF-8?
+        return name.decode('utf8')
     elif isinstance(name, tuple):
-        return tuple(map(_column_name_to_strings, name))
+        return str(tuple(map(_column_name_to_strings, name)))
     elif isinstance(name, collections.Sequence):
         raise TypeError("Unsupported type for MultiIndex level")
     elif name is None:
@@ -327,10 +332,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
 
     for name in df.columns:
         col = df[name]
-        if not isinstance(name, six.string_types):
-            name = _column_name_to_strings(name)
-            if name is not None:
-                name = str(name)
+        name = _column_name_to_strings(name)
 
         if schema is not None:
             field = schema.field_by_name(name)
@@ -561,7 +563,8 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1,
     column_strings = [x.name for x in block_table.itercolumns()]
     if columns:
         columns_name_dict = {
-            c.get('field_name', str(c['name'])): c['name'] for c in columns
+            c.get('field_name', _column_name_to_strings(c['name'])): c['name']
+            for c in columns
         }
         columns_values = [
             columns_name_dict.get(name, name) for name in column_strings
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 8820b6b4a..494e65ebb 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -939,7 +939,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
                 coerce_timestamps=None,
                 flavor=None, **kwargs):
     row_group_size = kwargs.pop('chunk_size', row_group_size)
-
+    use_int96 = use_deprecated_int96_timestamps
     try:
         with ParquetWriter(
                 where, table.schema,
@@ -948,7 +948,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
                 use_dictionary=use_dictionary,
                 coerce_timestamps=coerce_timestamps,
                 compression=compression,
-                use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,  # noqa
+                use_deprecated_int96_timestamps=use_int96,
                 **kwargs) as writer:
             writer.write_table(table, row_group_size=row_group_size)
     except Exception:
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 31099072f..4f0a68729 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -156,6 +156,11 @@ def test_multiindex_columns_with_dtypes(self):
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
         _check_pandas_roundtrip(df, preserve_index=True)
 
+    def test_multiindex_columns_unicode(self):
+        columns = pd.MultiIndex.from_arrays([[u'あ', u'い'], ['X', 'Y']])
+        df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
+        _check_pandas_roundtrip(df, preserve_index=True)
+
     def test_integer_index_column(self):
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
         _check_pandas_roundtrip(df, preserve_index=True)
@@ -519,6 +524,31 @@ def test_unicode(self):
 
         _check_pandas_roundtrip(df, expected_schema=schema)
 
+    def test_unicode_with_unicode_column_and_index(self):
+        df = pd.DataFrame({u'あ': [u'い']}, index=[u'う'])
+
+        _check_pandas_roundtrip(df, preserve_index=True)
+
+    def test_mixed_unicode_column_names(self):
+        df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
+
+        # TODO(phillipc): Should this raise?
+        with pytest.raises(AssertionError):
+            _check_pandas_roundtrip(df, preserve_index=True)
+
+    def test_binary_column_name(self):
+        column_data = [u'い']
+        data = {u'あ'.encode('utf8'): column_data}
+        df = pd.DataFrame(data)
+
+        # we can't use _check_pandas_roundtrip here because our metadata
+        # is always decoded as utf8: even if binary goes in, utf8 comes out
+        t = pa.Table.from_pandas(df, preserve_index=True)
+        df2 = t.to_pandas()
+        assert df.values[0] == df2.values[0]
+        assert df.index.values[0] == df2.index.values[0]
+        assert df.columns[0] == df2.columns[0].encode('utf8')
+
     def test_bytes_to_binary(self):
         values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
         df = pd.DataFrame({'strings': values})
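
For readers skimming the patch: the heart of the fix is the column-name
normalization in _column_name_to_strings above. A minimal standalone sketch of
the same idea (normalize_column_name is a hypothetical name, not the actual
pyarrow helper):

{code}
import six

def normalize_column_name(name):
    # Text names pass through unchanged; bytes are assumed to be UTF-8
    # (the same assumption the patch makes); tuples (MultiIndex levels)
    # are normalized recursively and then stringified, so the metadata
    # key is always text.
    if isinstance(name, six.string_types):
        return name
    elif isinstance(name, six.binary_type):
        return name.decode('utf8')
    elif isinstance(name, tuple):
        return str(tuple(normalize_column_name(part) for part in name))
    return name

normalize_column_name(u'あ'.encode('utf8'))  # -> u'あ'
normalize_column_name((u'あ', 'X'))          # -> "(u'\u3042', 'X')" on Python 2
{code}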

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [Python] Handling unicode pandas columns on parquet.read_table
> --------------------------------------------------------------
>
>                 Key: ARROW-1976
>                 URL: https://issues.apache.org/jira/browse/ARROW-1976
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.8.0
>            Reporter: Simbarashe Nyatsanga
>            Assignee: Licht Takeuchi
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> Unicode columns in pandas DataFrames aren't handled correctly for some
> datasets when reading a Parquet file into a pandas DataFrame, leading to
> the common Python 2 ASCII encoding error.
>  
> The dataset used to get the error is here: 
> https://catalog.data.gov/dataset/college-scorecard
> {code}
> import numpy as np
> import pandas as pd
> import pyarrow as pa
> import pyarrow.parquet as pq
> df = pd.read_csv('college_data.csv')
> {code}
> For verification, the DataFrame's columns are indeed unicode:
> {code}
> df.columns
> > Index([u'UNITID', u'OPEID', u'OPEID6', u'INSTNM', u'CITY', u'STABBR',
>        u'INSTURL', u'NPCURL', u'HCM2', u'PREDDEG',
>        ...
>        u'RET_PTL4', u'PCTFLOAN', u'UG25ABV', u'MD_EARN_WNE_P10', u'GT_25K_P6',
>        u'GRAD_DEBT_MDN_SUPP', u'GRAD_DEBT_MDN10YR_SUPP', u'RPY_3YR_RT_SUPP',
>        u'C150_L4_POOLED_SUPP', u'C150_4_POOLED_SUPP'],
>       dtype='object', length=123)
> {code}
> The DataFrame can be saved into a Parquet file:
> {code}
> arrow_table = pa.Table.from_pandas(df)
> pq.write_table(arrow_table, 'college_data.parquet')
> {code}
> But trying to read the Parquet file back immediately afterwards fails with
> the following error:
> {code}
> df = pq.read_table('college_data.parquet').to_pandas()
> > ---------------------------------------------------------------------------
> UnicodeEncodeError                        Traceback (most recent call last)
> <ipython-input-29-23906ea1efe3> in <module>()
> ----> 2 df = pq.read_table('college_data.parquet').to_pandas()
> /Users/anaconda/envs/env/lib/python2.7/site-packages/pyarrow/table.pxi in 
> pyarrow.lib.Table.to_pandas 
> (/Users/travis/build/BryanCutler/arrow-dist/arrow/python/build/temp.macosx-10.6-intel-2.7/lib.cxx:46331)()
>    1041         if nthreads is None:
>    1042             nthreads = cpu_count()
> -> 1043         mgr = pdcompat.table_to_blockmanager(options, self, 
> memory_pool,
>    1044                                              nthreads)
>    1045         return pd.DataFrame(mgr)
> /Users/anaconda/envs/env/lib/python2.7/site-packages/pyarrow/pandas_compat.pyc
>  in table_to_blockmanager(options, table, memory_pool, nthreads, categoricals)
>     539     if columns:
>     540         columns_name_dict = {
> --> 541             c.get('field_name', str(c['name'])): c['name'] for c in 
> columns
>     542         }
>     543         columns_values = [
> /Users/anaconda/envs/env/lib/python2.7/site-packages/pyarrow/pandas_compat.pyc
>  in <dictcomp>((c,))
>     539     if columns:
>     540         columns_name_dict = {
> --> 541             c.get('field_name', str(c['name'])): c['name'] for c in 
> columns
>     542         }
>     543         columns_values = [
> UnicodeEncodeError: 'ascii' codec can't encode character u'\ufeff' in 
> position 0: ordinal not in range(128)
> {code}
> Looking at the stack trace, the culprit appears to be the following line,
> which calls str() and therefore attempts ASCII encoding by default:
> https://github.com/apache/arrow/blob/master/python/pyarrow/pandas_compat.py#L541
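
For illustration, the failure can be reproduced in isolation on Python 2, where
str() implicitly encodes unicode objects with the ASCII codec. A minimal
sketch (not pyarrow code):

{code}
# Python 2 only
name = u'\ufeffUNITID'  # BOM-prefixed column name, as in the CSV above
str(name)
# UnicodeEncodeError: 'ascii' codec can't encode character u'\ufeff'
# in position 0: ordinal not in range(128)

# The patch takes the opposite approach: names that are already text pass
# through untouched, and bytes are decoded explicitly as UTF-8:
text = name if isinstance(name, unicode) else name.decode('utf8')
{code}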



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
