This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b83db61 ARROW-3374: [Python] Implicitly set from_pandas=True when
passing pandas.Categorical to pyarrow.array. Preserve ordered categories
b83db61 is described below
commit b83db6116dc323eea3f6b516e176d36d66960475
Author: Wes McKinney <[email protected]>
AuthorDate: Mon Oct 1 06:10:38 2018 -0400
ARROW-3374: [Python] Implicitly set from_pandas=True when passing
pandas.Categorical to pyarrow.array. Preserve ordered categories
Author: Wes McKinney <[email protected]>
Closes #2670 from wesm/ARROW-3374 and squashes the following commits:
914c7dddc <Wes McKinney> Implicitly set from_pandas=True when passing
pandas.Categorical to pyarrow.array. Ensure that ordered categories are
preserved
---
cpp/src/arrow/python/arrow_to_pandas.cc | 5 +++
python/pyarrow/array.pxi | 4 +--
python/pyarrow/tests/test_convert_pandas.py | 47 +++++++++++++++++++----------
3 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index a7d98da..be6e9b1 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -1898,6 +1898,11 @@ class ArrowDeserializer {
PyDict_SetItemString(result_, "dictionary", block->dictionary());
RETURN_IF_PYERROR();
+ PyObject* py_ordered = type.ordered() ? Py_True : Py_False;
+ Py_INCREF(py_ordered);
+ PyDict_SetItemString(result_, "ordered", py_ordered);
+ RETURN_IF_PYERROR();
+
return Status::OK();
}
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 7de2340..320852a 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -160,7 +160,7 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
return DictionaryArray.from_arrays(
values.codes, values.categories.values,
mask=mask, ordered=values.ordered,
- from_pandas=from_pandas, safe=safe,
+ from_pandas=True, safe=safe,
memory_pool=memory_pool)
else:
import pyarrow.pandas_compat as pdcompat
@@ -811,7 +811,7 @@ cdef wrap_array_output(PyObject* output):
if isinstance(obj, dict):
return Categorical(obj['indices'],
categories=obj['dictionary'],
- fastpath=True)
+ ordered=obj['ordered'], fastpath=True)
else:
return obj
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index db87d9a..9bae5ea 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1983,26 +1983,41 @@ class TestConvertMisc(object):
v1 = ['foo', None, 'bar', 'qux', np.nan]
v2 = [4, 5, 6, 7, 8]
v3 = [b'foo', None, b'bar', b'qux', np.nan]
- df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats),
- 'cat_ints': pd.Categorical(v2 * repeats),
- 'cat_binary': pd.Categorical(v3 * repeats),
- 'cat_strings_ordered': pd.Categorical(
- v1 * repeats, categories=['bar', 'qux', 'foo'],
- ordered=True),
- 'ints': v2 * repeats,
- 'ints2': v2 * repeats,
- 'strings': v1 * repeats,
- 'strings2': v1 * repeats,
- 'strings3': v3 * repeats})
+
+ arrays = {
+ 'cat_strings': pd.Categorical(v1 * repeats),
+ 'cat_strings_with_na': pd.Categorical(v1 * repeats,
+ categories=['foo', 'bar']),
+ 'cat_ints': pd.Categorical(v2 * repeats),
+ 'cat_binary': pd.Categorical(v3 * repeats),
+ 'cat_strings_ordered': pd.Categorical(
+ v1 * repeats, categories=['bar', 'qux', 'foo'],
+ ordered=True),
+ 'ints': v2 * repeats,
+ 'ints2': v2 * repeats,
+ 'strings': v1 * repeats,
+ 'strings2': v1 * repeats,
+ 'strings3': v3 * repeats}
+ df = pd.DataFrame(arrays)
_check_pandas_roundtrip(df)
+ for k in arrays:
+ _check_array_roundtrip(arrays[k])
+
+ def test_category_implicit_from_pandas(self):
+ # ARROW-3374
+ def _check(v):
+ arr = pa.array(v)
+ result = arr.to_pandas()
+ tm.assert_series_equal(pd.Series(result), pd.Series(v))
+
arrays = [
- pd.Categorical(v1 * repeats),
- pd.Categorical(v2 * repeats),
- pd.Categorical(v3 * repeats)
+ pd.Categorical(['a', 'b', 'c'], categories=['a', 'b']),
+ pd.Categorical(['a', 'b', 'c'], categories=['a', 'b'],
+ ordered=True)
]
- for values in arrays:
- _check_array_roundtrip(values)
+ for arr in arrays:
+ _check(arr)
def test_empty_category(self):
# ARROW-2443