[ https://issues.apache.org/jira/browse/ARROW-1976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16351177#comment-16351177 ]
ASF GitHub Bot commented on ARROW-1976: --------------------------------------- cpcloud closed pull request #1476: ARROW-1976: [Python] Handling unicode pandas columns on parquet.read_table URL: https://github.com/apache/arrow/pull/1476 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 4a30fb3b4..610eecdfd 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -27,7 +27,7 @@ import six import pyarrow as pa -from pyarrow.compat import PY2, zip_longest # noqa +from pyarrow.compat import PY2, zip_longest, frombytes # noqa def infer_dtype(column): @@ -170,9 +170,12 @@ def get_column_metadata(column, name, arrow_type, field_name): ) ) + if not isinstance(field_name, six.string_types): + field_name = frombytes(field_name) + return { 'name': name, - 'field_name': str(field_name), + 'field_name': field_name, 'pandas_type': logical_type, 'numpy_type': string_dtype, 'metadata': extra_metadata, @@ -316,7 +319,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1): if not isinstance(name, six.string_types): name = _column_name_to_strings(name) if name is not None: - name = str(name) + name = frombytes(name) if schema is not None: field = schema.field_by_name(name) @@ -543,9 +546,14 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1, column_strings = [x.name for x in block_table.itercolumns()] if columns: - columns_name_dict = { - c.get('field_name', str(c['name'])): c['name'] for c in columns - } + columns_name_dict = {} + for c in columns: + column_name = c['name'] + if not isinstance(column_name, six.text_type): + column_name = frombytes(column_name) + + columns_name_dict[c.get('field_name', column_name)] = c['name'] 
+ columns_values = [ columns_name_dict.get(name, name) for name in column_strings ] diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index ca2f1e361..5b16bd7ff 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -518,6 +518,11 @@ def test_unicode(self): _check_pandas_roundtrip(df, expected_schema=schema) + def test_unicode_with_unicode_column_and_index(self): + df = pd.DataFrame({u'あ': [u'い']}, index=[u'う']) + + _check_pandas_roundtrip(df, preserve_index=True) + def test_bytes_to_binary(self): values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan] df = pd.DataFrame({'strings': values}) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Handling unicode pandas columns on parquet.read_table > -------------------------------------------------------------- > > Key: ARROW-1976 > URL: https://issues.apache.org/jira/browse/ARROW-1976 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Affects Versions: 0.8.0 > Reporter: Simbarashe Nyatsanga > Assignee: Licht Takeuchi > Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > Unicode columns in pandas DataFrames aren't being handled correctly for some > datasets when reading a parquet file into a pandas DataFrame, leading to the > common Python ASCII encoding error. 
> > The dataset used to get the error is here: > https://catalog.data.gov/dataset/college-scorecard > {code} > import numpy as np > import pandas as pd > import pyarrow as pa > import pyarrow.parquet as pq > df = pd.read_csv('college_data.csv') > {code} > For verification, the DataFrame's columns are indeed unicode > {code} > df.columns > > Index([u'UNITID', u'OPEID', u'OPEID6', u'INSTNM', u'CITY', u'STABBR', > u'INSTURL', u'NPCURL', u'HCM2', u'PREDDEG', > ... > u'RET_PTL4', u'PCTFLOAN', u'UG25ABV', u'MD_EARN_WNE_P10', u'GT_25K_P6', > u'GRAD_DEBT_MDN_SUPP', u'GRAD_DEBT_MDN10YR_SUPP', u'RPY_3YR_RT_SUPP', > u'C150_L4_POOLED_SUPP', u'C150_4_POOLED_SUPP'], > dtype='object', length=123) > {code} > The DataFrame can be saved into a parquet file > {code} > arrow_table = pa.Table.from_pandas(df) > pq.write_table(arrow_table, 'college_data.parquet') > {code} > But trying to read the parquet file immediately afterwards results in the > following > {code} > df = pq.read_table('college_data.parquet').to_pandas() > > --------------------------------------------------------------------------- > UnicodeEncodeError Traceback (most recent call last) > <ipython-input-29-23906ea1efe3> in <module>() > ----> 2 df = pq.read_table('college_data.parquet').to_pandas() > /Users/anaconda/envs/env/lib/python2.7/site-packages/pyarrow/table.pxi in > pyarrow.lib.Table.to_pandas > (/Users/travis/build/BryanCutler/arrow-dist/arrow/python/build/temp.macosx-10.6-intel-2.7/lib.cxx:46331)() > 1041 if nthreads is None: > 1042 nthreads = cpu_count() > -> 1043 mgr = pdcompat.table_to_blockmanager(options, self, > memory_pool, > 1044 nthreads) > 1045 return pd.DataFrame(mgr) > /Users/anaconda/envs/env/lib/python2.7/site-packages/pyarrow/pandas_compat.pyc > in table_to_blockmanager(options, table, memory_pool, nthreads, categoricals) > 539 if columns: > 540 columns_name_dict = { > --> 541 c.get('field_name', str(c['name'])): c['name'] for c in > columns > 542 } > 543 columns_values = [ > 
/Users/anaconda/envs/env/lib/python2.7/site-packages/pyarrow/pandas_compat.pyc > in <dictcomp>((c,)) > 539 if columns: > 540 columns_name_dict = { > --> 541 c.get('field_name', str(c['name'])): c['name'] for c in > columns > 542 } > 543 columns_values = [ > UnicodeEncodeError: 'ascii' codec can't encode character u'\ufeff' in > position 0: ordinal not in range(128) > {code} > Looking at the stack trace, it appears to be this line, which uses str and > by default will attempt ASCII encoding: > https://github.com/apache/arrow/blob/master/python/pyarrow/pandas_compat.py#L541 -- This message was sent by Atlassian JIRA (v7.6.3#76005)