This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 48a6ff8 ARROW-1721: [Python] Implement null-mask check in places where it isn't supported in numpy_to_arrow.cc 48a6ff8 is described below commit 48a6ff856cf4de939f5ced42a09b1b39866efc1e Author: Licht-T <lich...@outlook.jp> AuthorDate: Wed Oct 25 22:19:51 2017 -0400 ARROW-1721: [Python] Implement null-mask check in places where it isn't supported in numpy_to_arrow.cc This closes [ARROW-1721](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-1721). Author: Licht-T <lich...@outlook.jp> Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #1246 from Licht-T/feature-object-from_pandas-mask and squashes the following commits: 41a1229d [Wes McKinney] Fix flake8 issues d7545334 [Licht-T] Fix lint issues by clang-format-4.0 7ef7f784 [Licht-T] Revert "Fix lint issues" 5c6c1822 [Licht-T] Fix lint issues 78d3c3fc [Licht-T] TST: Add tests of null-mask check for object types 72030bfe [Licht-T] ENH: Implement null-mask check for object types --- cpp/src/arrow/python/numpy_to_arrow.cc | 86 ++++++++++++++++++++++------- python/pyarrow/tests/test_convert_pandas.py | 48 ++++++++++++++-- 2 files changed, 110 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 2c89a9f..ead3a04 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -622,8 +622,12 @@ Status NumPyConverter::ConvertDates() { Ndarray1DIndexer<PyObject*> objects(arr_); + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; if (mask_ != nullptr) { - return Status::NotImplemented("mask not supported in object conversions yet"); + mask_values.Init(mask_); + have_mask = true; } BuilderType builder(pool_); @@ -636,10 +640,10 @@ Status NumPyConverter::ConvertDates() { PyObject* obj; for (int64_t i = 0; i < length_; ++i) { obj = objects[i]; - if (PyDate_CheckExact(obj)) { - RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj))); - } else if (PandasObjectIsNull(obj)) { + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { RETURN_NOT_OK(builder.AppendNull()); + } else if (PyDate_CheckExact(obj)) { + RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj))); } else { std::stringstream ss; ss << "Error converting from Python objects to Date: "; @@ -1029,6 +1033,41 @@ Status LoopPySequence(PyObject* sequence, T func) { return Status::OK(); } +template <typename T> +Status LoopPySequenceWithMasks(PyObject* sequence, + const Ndarray1DIndexer<uint8_t>& mask_values, + bool have_mask, T func) { + if (PySequence_Check(sequence)) { + OwnedRef ref; + Py_ssize_t size = PySequence_Size(sequence); + if (PyArray_Check(sequence)) { + auto array = reinterpret_cast<PyArrayObject*>(sequence); + Ndarray1DIndexer<PyObject*> objects(array); + for (int64_t i = 0; i < size; ++i) { + RETURN_NOT_OK(func(objects[i], have_mask && mask_values[i])); + } + } else { + for (int64_t i = 0; i < size; ++i) { + ref.reset(PySequence_GetItem(sequence, i)); + RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i])); + } + } + } else if (PyObject_HasAttrString(sequence, "__iter__")) { + OwnedRef iter = OwnedRef(PyObject_GetIter(sequence)); + PyObject* item; + int64_t i = 0; + while ((item = PyIter_Next(iter.obj()))) { + OwnedRef ref = OwnedRef(item); + RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i])); + i++; + } + } else { + return Status::TypeError("Object is not a sequence or iterable"); + } + + return Status::OK(); +} + template <int ITEM_TYPE, typename ArrowType> inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) { @@ -1037,15 +1076,18 @@ inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>& PyAcquireGIL lock; - // TODO: mask not supported here + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; if (mask_ != nullptr) { - return Status::NotImplemented("mask not supported in object conversions yet"); + mask_values.Init(mask_); + have_mask = true; } BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder()); - auto foreach_item = [&](PyObject* object) { - if (PandasObjectIsNull(object)) { + auto foreach_item = [&](PyObject* object, bool mask) { + if (mask || PandasObjectIsNull(object)) { return builder->AppendNull(); } else if (PyArray_Check(object)) { auto numpy_array = reinterpret_cast<PyArrayObject*>(object); @@ -1071,7 +1113,7 @@ inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>& } }; - return LoopPySequence(list, foreach_item); + return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item); } template <> @@ -1079,15 +1121,18 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>( const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) { PyAcquireGIL lock; - // TODO: mask not supported here + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; if (mask_ != nullptr) { - return Status::NotImplemented("mask not supported in object conversions yet"); + mask_values.Init(mask_); + have_mask = true; } auto value_builder = static_cast<NullBuilder*>(builder->value_builder()); - auto foreach_item = [&](PyObject* object) { - if (PandasObjectIsNull(object)) { + auto foreach_item = [&](PyObject* object, bool mask) { + if (mask || PandasObjectIsNull(object)) { return builder->AppendNull(); } else if (PyArray_Check(object)) { auto numpy_array = reinterpret_cast<PyArrayObject*>(object); @@ -1112,7 +1157,7 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>( } }; - return LoopPySequence(list, foreach_item); + return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item); } template <> @@ -1122,15 +1167,18 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>( // TODO: If there are bytes involed, convert to Binary representation bool have_bytes = false; - // TODO: mask not supported here + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; if (mask_ != nullptr) { - return Status::NotImplemented("mask not supported in object conversions yet"); + mask_values.Init(mask_); + have_mask = true; } auto value_builder = static_cast<StringBuilder*>(builder->value_builder()); - auto foreach_item = [&](PyObject* object) { - if (PandasObjectIsNull(object)) { + auto foreach_item = [&](PyObject* object, bool mask) { + if (mask || PandasObjectIsNull(object)) { return builder->AppendNull(); } else if (PyArray_Check(object)) { auto numpy_array = reinterpret_cast<PyArrayObject*>(object); @@ -1162,7 +1210,7 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>( } }; - return LoopPySequence(list, foreach_item); + return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item); } #define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 41ad201..527466e 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -19,7 +19,6 @@ from collections import OrderedDict from datetime import date, time -import unittest import decimal import json @@ -61,7 +60,7 @@ def _alltypes_example(size=100): }) -class TestPandasConversion(unittest.TestCase): +class TestPandasConversion(object): def setUp(self): pass @@ -420,7 +419,7 @@ class TestPandasConversion(unittest.TestCase): values = [b'foo', None, b'ba', None, None, b'hey'] df = pd.DataFrame({'strings': values}) schema = pa.schema([pa.field('strings', pa.binary(3))]) - with self.assertRaises(pa.ArrowInvalid): + with pytest.raises(pa.ArrowInvalid): pa.Table.from_pandas(df, schema=schema) def test_timestamps_notimezone_no_nulls(self): @@ -697,11 +696,11 @@ class TestPandasConversion(unittest.TestCase): def test_mixed_types_fails(self): data = pd.DataFrame({'a': ['a', 1, 2.0]}) - with self.assertRaises(pa.ArrowException): + with pytest.raises(pa.ArrowException): pa.Table.from_pandas(data) data = pd.DataFrame({'a': [1, True]}) - with self.assertRaises(pa.ArrowException): + with pytest.raises(pa.ArrowException): pa.Table.from_pandas(data) def test_strided_data_import(self): @@ -1096,6 +1095,45 @@ class TestPandasConversion(unittest.TestCase): expected = pd.DataFrame({'strings': pd.Categorical(values)}) tm.assert_frame_equal(result, expected, check_dtype=True) + def test_array_from_pandas_date_with_mask(self): + m = np.array([True, False, True]) + data = pd.Series([ + date(1990, 1, 1), + date(1991, 1, 1), + date(1992, 1, 1) + ]) + + result = pa.Array.from_pandas(data, mask=m) + + expected = pd.Series([None, date(1991, 1, 1), None]) + assert pa.Array.from_pandas(expected).equals(result) + + @pytest.mark.parametrize('t,data,expected', [ + ( + pa.int64, + [[1, 2], [3], None], + [None, [3], None] + ), + ( + pa.string, + [[u'aaa', u'bb'], [u'c'], None], + [None, [u'c'], None] + ), + ( + pa.null, + [[None, None], [None], None], + [None, [None], None] + ) + ]) + def test_array_from_pandas_typed_array_with_mask(self, t, data, expected): + m = np.array([True, False, True]) + + s = pd.Series(data) + result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t())) + + assert pa.Array.from_pandas(expected, + type=pa.list_(t())).equals(result) + def _pytime_from_micros(val): microseconds = val % 1000000 -- To stop receiving notification emails like this one, please contact ['"commits@arrow.apache.org" <commits@arrow.apache.org>'].