This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 34b18f7 ARROW-1940: [Python] Extra metadata gets added after multiple
conversions between pd.DataFrame and pa.Table
34b18f7 is described below
commit 34b18f711e2f566722316a62b49e7050adbd75ac
Author: Phillip Cloud <[email protected]>
AuthorDate: Thu Mar 8 23:11:46 2018 -0500
ARROW-1940: [Python] Extra metadata gets added after multiple conversions
between pd.DataFrame and pa.Table
Author: Phillip Cloud <[email protected]>
Closes #1728 from cpcloud/ARROW-1940 and squashes the following commits:
2e5b7afb <Phillip Cloud> ARROW-1940: Extra metadata gets added after
multiple conversions between pd.DataFrame and pa.Table
---
cpp/src/arrow/python/helpers.cc | 6 +-
python/pyarrow/pandas_compat.py | 99 +++++++++++++++++++++++++----
python/pyarrow/tests/test_convert_pandas.py | 15 ++++-
3 files changed, 103 insertions(+), 17 deletions(-)
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 429068d..13dcc46 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -116,7 +116,8 @@ static Status InferDecimalPrecisionAndScale(PyObject*
python_decimal, int32_t* p
DCHECK_NE(scale, NULLPTR);
// TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a
DCHECK
- OwnedRef as_tuple(PyObject_CallMethod(python_decimal, "as_tuple", ""));
+ OwnedRef as_tuple(PyObject_CallMethod(python_decimal,
const_cast<char*>("as_tuple"),
+ const_cast<char*>("")));
RETURN_IF_PYERROR();
DCHECK(PyTuple_Check(as_tuple.obj()));
@@ -241,7 +242,8 @@ bool PyDecimal_Check(PyObject* obj) {
bool PyDecimal_ISNAN(PyObject* obj) {
DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal";
- OwnedRef is_nan(PyObject_CallMethod(obj, "is_nan", ""));
+ OwnedRef is_nan(
+ PyObject_CallMethod(obj, const_cast<char*>("is_nan"),
const_cast<char*>("")));
return PyObject_IsTrue(is_nan.obj()) == 1;
}
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 0bc47fc..97ea51d 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -18,6 +18,7 @@
import ast
import collections
import json
+import operator
import re
import pandas.core.internals as _int
@@ -99,8 +100,8 @@ _numpy_logical_type_map = {
np.float32: 'float32',
np.float64: 'float64',
'datetime64[D]': 'date',
- np.str_: 'unicode',
- np.bytes_: 'bytes',
+ np.unicode_: 'string' if not PY2 else 'unicode',
+ np.bytes_: 'bytes' if not PY2 else 'string',
}
@@ -615,6 +616,22 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
def _backwards_compatible_index_name(raw_name, logical_name):
+ """Compute the name of an index column that is compatible with older
+ versions of :mod:`pyarrow`.
+
+ Parameters
+ ----------
+ raw_name : str
+ logical_name : str
+
+ Returns
+ -------
+ result : str
+
+ Notes
+ -----
+ * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager`
+ """
# Part of table_to_blockmanager
pattern = r'^__index_level_\d+__$'
if raw_name == logical_name and re.match(pattern, raw_name) is not None:
@@ -623,8 +640,57 @@ def _backwards_compatible_index_name(raw_name,
logical_name):
return logical_name
+_pandas_logical_type_map = {
+ 'date': 'datetime64[D]',
+ 'unicode': np.unicode_,
+ 'bytes': np.bytes_,
+ 'string': np.str_,
+ 'empty': np.object_,
+ 'mixed': np.object_,
+}
+
+
+def _pandas_type_to_numpy_type(pandas_type):
+ """Get the numpy dtype that corresponds to a pandas type.
+
+ Parameters
+ ----------
+ pandas_type : str
+ The result of a call to pandas.lib.infer_dtype.
+
+ Returns
+ -------
+ dtype : np.dtype
+ The dtype that corresponds to `pandas_type`.
+ """
+ try:
+ return _pandas_logical_type_map[pandas_type]
+ except KeyError:
+ return np.dtype(pandas_type)
+
+
def _reconstruct_columns_from_metadata(columns, column_indexes):
- # Part of table_to_blockmanager
+ """Construct a pandas MultiIndex from `columns` and column index metadata
+ in `column_indexes`.
+
+ Parameters
+ ----------
+ columns : List[pd.Index]
+ The columns coming from a pyarrow.Table
+ column_indexes : List[Dict[str, str]]
+ The column index metadata deserialized from the JSON schema metadata
+ in a :class:`~pyarrow.Table`.
+
+ Returns
+ -------
+ result : MultiIndex
+ The index reconstructed using `column_indexes` metadata with levels of
+ the correct type.
+
+ Notes
+ -----
+ * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager`
+ """
# Get levels and labels, and provide sane defaults if the index has a
# single level to avoid if/else spaghetti.
@@ -635,21 +701,28 @@ def _reconstruct_columns_from_metadata(columns,
column_indexes):
# Convert each level to the dtype provided in the metadata
levels_dtypes = [
- (level, col_index.get('numpy_type', level.dtype))
+ (level, col_index.get('pandas_type', str(level.dtype)))
for level, col_index in zip_longest(
levels, column_indexes, fillvalue={}
)
]
- new_levels = [
- _level if _level.dtype == _dtype else _level.astype(_dtype)
- for _level, _dtype in levels_dtypes
- ]
- return pd.MultiIndex(
- levels=new_levels,
- labels=labels,
- names=columns.names
- )
+ new_levels = []
+ encoder = operator.methodcaller('encode', 'UTF-8')
+ for level, pandas_dtype in levels_dtypes:
+ dtype = _pandas_type_to_numpy_type(pandas_dtype)
+
+ # Since our metadata is UTF-8 encoded, Python turns things that were
+ # bytes into unicode strings when json.loads-ing them. We need to
+ # convert them back to bytes to preserve metadata.
+ if dtype == np.bytes_:
+ level = level.map(encoder)
+ elif level.dtype != dtype:
+ level = level.astype(dtype)
+
+ new_levels.append(level)
+
+ return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
def _table_to_blocks(options, block_table, nthreads, memory_pool, categories):
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 5abc026..333199a 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -305,7 +305,8 @@ class TestConvertMetadata(object):
def test_binary_column_name(self):
column_data = [u'い']
- data = {u'あ'.encode('utf8'): column_data}
+ key = u'あ'.encode('utf8')
+ data = {key: column_data}
df = pd.DataFrame(data)
# we can't use _check_pandas_roundtrip here because our metadata
@@ -314,7 +315,7 @@ class TestConvertMetadata(object):
df2 = t.to_pandas()
assert df.values[0] == df2.values[0]
assert df.index.values[0] == df2.index.values[0]
- assert df.columns[0] == df2.columns[0].encode('utf8')
+ assert df.columns[0] == key
def test_multiindex_duplicate_values(self):
num_rows = 3
@@ -1728,6 +1729,16 @@ def _fully_loaded_dataframe_example():
return pd.DataFrame(data, index=index)
[email protected]('columns', ([b'foo'], ['foo']))
+def test_roundtrip_with_bytes_unicode(columns):
+ df = pd.DataFrame(columns=columns)
+ table1 = pa.Table.from_pandas(df)
+ table2 = pa.Table.from_pandas(table1.to_pandas())
+ assert table1.equals(table2)
+ assert table1.schema.equals(table2.schema)
+ assert table1.schema.metadata == table2.schema.metadata
+
+
def _check_serialize_components_roundtrip(df):
ctx = pa.default_serialization_context()
--
To stop receiving notification emails like this one, please contact
[email protected].