[
https://issues.apache.org/jira/browse/ARROW-1714?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16239508#comment-16239508
]
ASF GitHub Bot commented on ARROW-1714:
---------------------------------------
xhochy closed pull request #1263: ARROW-1714: [Python] Fix invalid
serialization/deserialization None name Series
URL: https://github.com/apache/arrow/pull/1263
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 1984598ff..87b47b8a6 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -263,6 +263,8 @@ def _column_name_to_strings(name):
return tuple(map(_column_name_to_strings, name))
elif isinstance(name, collections.Sequence):
raise TypeError("Unsupported type for MultiIndex level")
+ elif name is None:
+ return None
return str(name)
@@ -280,7 +282,9 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
for name in df.columns:
col = df[name]
if not isinstance(name, six.string_types):
- name = str(_column_name_to_strings(name))
+ name = _column_name_to_strings(name)
+ if name is not None:
+ name = str(name)
if schema is not None:
field = schema.field_by_name(name)
@@ -361,6 +365,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
schema = table.schema
row_count = table.num_rows
metadata = schema.metadata
+ columns_metadata = None
has_pandas_metadata = metadata is not None and b'pandas' in metadata
@@ -370,6 +375,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
columns = pandas_metadata['columns']
column_indexes = pandas_metadata.get('column_indexes', [])
table = _add_any_metadata(table, pandas_metadata)
+ columns_metadata = pandas_metadata.get('columns', None)
block_table = table
@@ -428,6 +434,18 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
index = pd.RangeIndex(row_count)
column_strings = [x.name for x in block_table.itercolumns()]
+ if columns_metadata is not None:
+ columns_name_dict = dict(
+ (str(x['name']), x['name'])
+ for x in columns_metadata
+ )
+ columns_values = [
+ columns_name_dict[y]
+ if y in columns_name_dict.keys() else y
+ for y in column_strings
+ ]
+ else:
+ columns_values = column_strings
# If we're passed multiple column indexes then evaluate with
# ast.literal_eval, since the column index values show up as a list of
@@ -437,11 +455,11 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
# Create the column index
# Construct the base index
- if not column_strings:
- columns = pd.Index(column_strings)
+ if not columns_values:
+ columns = pd.Index(columns_values)
else:
columns = pd.MultiIndex.from_tuples(
- list(map(to_pair, column_strings)),
+ list(map(to_pair, columns_values)),
names=[col_index['name'] for col_index in column_indexes] or None,
)
@@ -466,25 +484,35 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
_level if _level.dtype == _dtype else _level.astype(_dtype)
for _level, _dtype in levels_dtypes
]
+
columns = pd.MultiIndex(
levels=new_levels,
labels=labels,
names=columns.names
)
- # flatten a single level column MultiIndex for pandas 0.21.0 :(
- if isinstance(columns, pd.MultiIndex) and columns.nlevels == 1:
- levels, = columns.levels
- labels, = columns.labels
-
- # Cheaply check that we do not somehow have duplicate column names
- assert len(levels) == len(labels), 'Found non-unique column index'
- columns = levels[labels]
+ # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
+ columns = _flatten_single_level_multiindex(columns)
axes = [columns, index]
return _int.BlockManager(blocks, axes)
+def _flatten_single_level_multiindex(index):
+ if isinstance(index, pd.MultiIndex) and index.nlevels == 1:
+ levels, = index.levels
+ labels, = index.labels
+
+ # Cheaply check that we do not somehow have duplicate column names
+ if not index.is_unique:
+ raise ValueError('Found non-unique column index')
+
+ return pd.Index([levels[_label] if _label != -1 else None
+ for _label in labels],
+ name=index.names[0])
+ return index
+
+
def _add_any_metadata(table, pandas_metadata):
modified_columns = {}
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 6165a6622..5ba5f83d2 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -345,7 +345,10 @@ cdef _schema_from_arrays(arrays, names, dict metadata,
else:
raise TypeError(type(val))
- c_name = tobytes(names[i])
+ if names[i] is None:
+ c_name = tobytes(u'None')
+ else:
+ c_name = tobytes(names[i])
fields[i].reset(new CField(c_name, type_, True))
schema.reset(new CSchema(fields, unbox_metadata(metadata)))
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 68c0c80aa..5033ea957 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -432,16 +432,23 @@ def test_serialize_pandas_no_preserve_index():
def test_serialize_with_pandas_objects():
df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
+ s = pd.Series([1, 2, 3, 4])
data = {
'a_series': df['a'],
- 'a_frame': df
+ 'a_frame': df,
+ 's_series': s
}
serialized = pa.serialize(data).to_buffer()
deserialized = pa.deserialize(serialized)
assert_frame_equal(deserialized['a_frame'], df)
+
assert_series_equal(deserialized['a_series'], df['a'])
+ assert deserialized['a_series'].name == 'a'
+
+ assert_series_equal(deserialized['s_series'], s)
+ assert deserialized['s_series'].name is None
def test_schema_batch_serialize_methods():
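In effect, a None column (or Series) name is written into the Arrow schema under the placeholder string 'None', and the stored pandas metadata is used on the way back to restore the original None. A minimal sketch of the intended round trip (illustrative only, assuming a pyarrow build with this patch applied):

{code:python}
import pandas as pd
import pyarrow as pa

# A DataFrame whose single column is named None (mirrors an unnamed Series).
df = pd.DataFrame({None: [1, 2, 3]})

table = pa.Table.from_pandas(df)
# The Arrow field itself carries the placeholder string name 'None' ...
assert table.column(0).name == 'None'

# ... but the pandas metadata maps that string back to the original name,
# so the round trip yields a real None rather than the string u'None'.
restored = table.to_pandas()
assert restored.columns[0] is None
{code}

Note that the placeholder only exists at the Arrow schema level; a consumer reading the schema without the pandas metadata will see the literal field name 'None'.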
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] No named pd.Series name serialized as u'None'
> ------------------------------------------------------
>
> Key: ARROW-1714
> URL: https://issues.apache.org/jira/browse/ARROW-1714
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Affects Versions: 0.7.1
> Reporter: Licht Takeuchi
> Assignee: Licht Takeuchi
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> Repro code.
> {code:python}
> import pandas as pd
> import pyarrow as pa
> s = pd.Series([1, 2, 3, 4])
> serialized = pa.serialize(s).to_buffer()
> pa.deserialize(serialized).name
> {code}
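> With the fix in PR #1263 the name round-trips as a real None instead. A minimal check of the expected behaviour (illustrative only):
> {code:python}
> import pandas as pd
> import pyarrow as pa
>
> s = pd.Series([1, 2, 3, 4])
> restored = pa.deserialize(pa.serialize(s).to_buffer())
> assert restored.name is None  # previously this came back as u'None'
> {code}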
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)