This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 34b18f7 ARROW-1940: [Python] Extra metadata gets added after multiple
conversions between pd.DataFrame and pa.Table
34b18f7 is described below
commit 34b18f711e2f566722316a62b49e7050adbd75ac
Author: Phillip Cloud <[email protected]>
AuthorDate: Thu Mar 8 23:11:46 2018 -0500
ARROW-1940: [Python] Extra metadata gets added after multiple conversions
between pd.DataFrame and pa.Table
Author: Phillip Cloud <[email protected]>
Closes #1728 from cpcloud/ARROW-1940 and squashes the following commits:
2e5b7afb <Phillip Cloud> ARROW-1940: Extra metadata gets added after
multiple conversions between pd.DataFrame and pa.Table
---
cpp/src/arrow/python/helpers.cc | 6 +-
python/pyarrow/pandas_compat.py | 99 +++++++++++++++++++++++++----
python/pyarrow/tests/test_convert_pandas.py | 15 ++++-
3 files changed, 103 insertions(+), 17 deletions(-)
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 429068d..13dcc46 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -116,7 +116,8 @@ static Status InferDecimalPrecisionAndScale(PyObject*
python_decimal, int32_t* p
DCHECK_NE(scale, NULLPTR);
// TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a
DCHECK
- OwnedRef as_tuple(PyObject_CallMethod(python_decimal, "as_tuple", ""));
+ OwnedRef as_tuple(PyObject_CallMethod(python_decimal,
const_cast<char*>("as_tuple"),
+ const_cast<char*>("")));
RETURN_IF_PYERROR();
DCHECK(PyTuple_Check(as_tuple.obj()));
@@ -241,7 +242,8 @@ bool PyDecimal_Check(PyObject* obj) {
bool PyDecimal_ISNAN(PyObject* obj) {
DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal";
- OwnedRef is_nan(PyObject_CallMethod(obj, "is_nan", ""));
+ OwnedRef is_nan(
+ PyObject_CallMethod(obj, const_cast<char*>("is_nan"),
const_cast<char*>("")));
return PyObject_IsTrue(is_nan.obj()) == 1;
}
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 0bc47fc..97ea51d 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -18,6 +18,7 @@
import ast
import collections
import json
+import operator
import re
import pandas.core.internals as _int
@@ -99,8 +100,8 @@ _numpy_logical_type_map = {
np.float32: 'float32',
np.float64: 'float64',
'datetime64[D]': 'date',
- np.str_: 'unicode',
- np.bytes_: 'bytes',
+ np.unicode_: 'string' if not PY2 else 'unicode',
+ np.bytes_: 'bytes' if not PY2 else 'string',
}
@@ -615,6 +616,22 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
def _backwards_compatible_index_name(raw_name, logical_name):
+ """Compute the name of an index column that is compatible with older
+ versions of :mod:`pyarrow`.
+
+ Parameters
+ ----------
+ raw_name : str
+ logical_name : str
+
+ Returns
+ -------
+ result : str
+
+ Notes
+ -----
+ * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager`
+ """
# Part of table_to_blockmanager
pattern = r'^__index_level_\d+__$'
if raw_name == logical_name and re.match(pattern, raw_name) is not None:
@@ -623,8 +640,57 @@ def _backwards_compatible_index_name(raw_name,
logical_name):
return logical_name
+_pandas_logical_type_map = {
+ 'date': 'datetime64[D]',
+ 'unicode': np.unicode_,
+ 'bytes': np.bytes_,
+ 'string': np.str_,
+ 'empty': np.object_,
+ 'mixed': np.object_,
+}
+
+
+def _pandas_type_to_numpy_type(pandas_type):
+ """Get the numpy dtype that corresponds to a pandas type.
+
+ Parameters
+ ----------
+ pandas_type : str
+ The result of a call to pandas.lib.infer_dtype.
+
+ Returns
+ -------
+ dtype : np.dtype
+ The dtype that corresponds to `pandas_type`.
+ """
+ try:
+ return _pandas_logical_type_map[pandas_type]
+ except KeyError:
+ return np.dtype(pandas_type)
+
+
def _reconstruct_columns_from_metadata(columns, column_indexes):
- # Part of table_to_blockmanager
+ """Construct a pandas MultiIndex from `columns` and column index metadata
+ in `column_indexes`.
+
+ Parameters
+ ----------
+ columns : List[pd.Index]
+ The columns coming from a pyarrow.Table
+ column_indexes : List[Dict[str, str]]
+ The column index metadata deserialized from the JSON schema metadata
+ in a :class:`~pyarrow.Table`.
+
+ Returns
+ -------
+ result : MultiIndex
+ The index reconstructed using `column_indexes` metadata with levels of
+ the correct type.
+
+ Notes
+ -----
+ * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager`
+ """
# Get levels and labels, and provide sane defaults if the index has a
# single level to avoid if/else spaghetti.
@@ -635,21 +701,28 @@ def _reconstruct_columns_from_metadata(columns,
column_indexes):
# Convert each level to the dtype provided in the metadata
levels_dtypes = [
- (level, col_index.get('numpy_type', level.dtype))
+ (level, col_index.get('pandas_type', str(level.dtype)))
for level, col_index in zip_longest(
levels, column_indexes, fillvalue={}
)
]
- new_levels = [
- _level if _level.dtype == _dtype else _level.astype(_dtype)
- for _level, _dtype in levels_dtypes
- ]
- return pd.MultiIndex(
- levels=new_levels,
- labels=labels,
- names=columns.names
- )
+ new_levels = []
+ encoder = operator.methodcaller('encode', 'UTF-8')
+ for level, pandas_dtype in levels_dtypes:
+ dtype = _pandas_type_to_numpy_type(pandas_dtype)
+
+ # Since our metadata is UTF-8 encoded, Python turns things that were
+ # bytes into unicode strings when json.loads-ing them. We need to
+ # convert them back to bytes to preserve metadata.
+ if dtype == np.bytes_:
+ level = level.map(encoder)
+ elif level.dtype != dtype:
+ level = level.astype(dtype)
+
+ new_levels.append(level)
+
+ return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
def _table_to_blocks(options, block_table, nthreads, memory_pool, categories):
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 5abc026..333199a 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -305,7 +305,8 @@ class TestConvertMetadata(object):
def test_binary_column_name(self):
column_data = [u'い']
- data = {u'あ'.encode('utf8'): column_data}
+ key = u'あ'.encode('utf8')
+ data = {key: column_data}
df = pd.DataFrame(data)
# we can't use _check_pandas_roundtrip here because our metadata
@@ -314,7 +315,7 @@ class TestConvertMetadata(object):
df2 = t.to_pandas()
assert df.values[0] == df2.values[0]
assert df.index.values[0] == df2.index.values[0]
- assert df.columns[0] == df2.columns[0].encode('utf8')
+ assert df.columns[0] == key
def test_multiindex_duplicate_values(self):
num_rows = 3
@@ -1728,6 +1729,16 @@ def _fully_loaded_dataframe_example():
return pd.DataFrame(data, index=index)
[email protected]('columns', ([b'foo'], ['foo']))
+def test_roundtrip_with_bytes_unicode(columns):
+ df = pd.DataFrame(columns=columns)
+ table1 = pa.Table.from_pandas(df)
+ table2 = pa.Table.from_pandas(table1.to_pandas())
+ assert table1.equals(table2)
+ assert table1.schema.equals(table2.schema)
+ assert table1.schema.metadata == table2.schema.metadata
+
+
def _check_serialize_components_roundtrip(df):
ctx = pa.default_serialization_context()
--
To stop receiving notification emails like this one, please contact
[email protected].