This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 48a6ff8  ARROW-1721: [Python] Implement null-mask check in places where it isn't supported in numpy_to_arrow.cc
48a6ff8 is described below

commit 48a6ff856cf4de939f5ced42a09b1b39866efc1e
Author: Licht-T <lich...@outlook.jp>
AuthorDate: Wed Oct 25 22:19:51 2017 -0400

    ARROW-1721: [Python] Implement null-mask check in places where it isn't supported in numpy_to_arrow.cc
    
    This closes [ARROW-1721](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-1721).
    
    Author: Licht-T <lich...@outlook.jp>
    Author: Wes McKinney <wes.mckin...@twosigma.com>
    
    Closes #1246 from Licht-T/feature-object-from_pandas-mask and squashes the following commits:
    
    41a1229d [Wes McKinney] Fix flake8 issues
    d7545334 [Licht-T] Fix lint issues by clang-format-4.0
    7ef7f784 [Licht-T] Revert "Fix lint issues"
    5c6c1822 [Licht-T] Fix lint issues
    78d3c3fc [Licht-T] TST: Add tests of null-mask check for object types
    72030bfe [Licht-T] ENH: Implement null-mask check for object types
---
 cpp/src/arrow/python/numpy_to_arrow.cc      | 86 ++++++++++++++++++++++-------
 python/pyarrow/tests/test_convert_pandas.py | 48 ++++++++++++++--
 2 files changed, 110 insertions(+), 24 deletions(-)

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 2c89a9f..ead3a04 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -622,8 +622,12 @@ Status NumPyConverter::ConvertDates() {
 
   Ndarray1DIndexer<PyObject*> objects(arr_);
 
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
   if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
+    mask_values.Init(mask_);
+    have_mask = true;
   }
 
   BuilderType builder(pool_);
@@ -636,10 +640,10 @@ Status NumPyConverter::ConvertDates() {
   PyObject* obj;
   for (int64_t i = 0; i < length_; ++i) {
     obj = objects[i];
-    if (PyDate_CheckExact(obj)) {
-      RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj)));
-    } else if (PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder.AppendNull());
+    } else if (PyDate_CheckExact(obj)) {
+      RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj)));
     } else {
       std::stringstream ss;
       ss << "Error converting from Python objects to Date: ";
@@ -1029,6 +1033,41 @@ Status LoopPySequence(PyObject* sequence, T func) {
   return Status::OK();
 }
 
+template <typename T>
+Status LoopPySequenceWithMasks(PyObject* sequence,
+                               const Ndarray1DIndexer<uint8_t>& mask_values,
+                               bool have_mask, T func) {
+  if (PySequence_Check(sequence)) {
+    OwnedRef ref;
+    Py_ssize_t size = PySequence_Size(sequence);
+    if (PyArray_Check(sequence)) {
+      auto array = reinterpret_cast<PyArrayObject*>(sequence);
+      Ndarray1DIndexer<PyObject*> objects(array);
+      for (int64_t i = 0; i < size; ++i) {
+        RETURN_NOT_OK(func(objects[i], have_mask && mask_values[i]));
+      }
+    } else {
+      for (int64_t i = 0; i < size; ++i) {
+        ref.reset(PySequence_GetItem(sequence, i));
+        RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i]));
+      }
+    }
+  } else if (PyObject_HasAttrString(sequence, "__iter__")) {
+    OwnedRef iter = OwnedRef(PyObject_GetIter(sequence));
+    PyObject* item;
+    int64_t i = 0;
+    while ((item = PyIter_Next(iter.obj()))) {
+      OwnedRef ref = OwnedRef(item);
+      RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i]));
+      i++;
+    }
+  } else {
+    return Status::TypeError("Object is not a sequence or iterable");
+  }
+
+  return Status::OK();
+}
+
 template <int ITEM_TYPE, typename ArrowType>
 inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type,
                                                 ListBuilder* builder, PyObject* list) {
@@ -1037,15 +1076,18 @@ inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>&
 
   PyAcquireGIL lock;
 
-  // TODO: mask not supported here
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
   if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
+    mask_values.Init(mask_);
+    have_mask = true;
   }
 
   BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder());
 
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
+  auto foreach_item = [&](PyObject* object, bool mask) {
+    if (mask || PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1071,7 +1113,7 @@ inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>&
     }
   };
 
-  return LoopPySequence(list, foreach_item);
+  return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item);
 }
 
 template <>
@@ -1079,15 +1121,18 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
     const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) {
   PyAcquireGIL lock;
 
-  // TODO: mask not supported here
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
   if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
+    mask_values.Init(mask_);
+    have_mask = true;
   }
 
   auto value_builder = static_cast<NullBuilder*>(builder->value_builder());
 
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
+  auto foreach_item = [&](PyObject* object, bool mask) {
+    if (mask || PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1112,7 +1157,7 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
     }
   };
 
-  return LoopPySequence(list, foreach_item);
+  return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item);
 }
 
 template <>
@@ -1122,15 +1167,18 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
   // TODO: If there are bytes involed, convert to Binary representation
   bool have_bytes = false;
 
-  // TODO: mask not supported here
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
   if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
+    mask_values.Init(mask_);
+    have_mask = true;
   }
 
   auto value_builder = static_cast<StringBuilder*>(builder->value_builder());
 
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
+  auto foreach_item = [&](PyObject* object, bool mask) {
+    if (mask || PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1162,7 +1210,7 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
     }
   };
 
-  return LoopPySequence(list, foreach_item);
+  return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item);
 }
 
 #define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType)                            \
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 41ad201..527466e 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -19,7 +19,6 @@
 from collections import OrderedDict
 
 from datetime import date, time
-import unittest
 import decimal
 import json
 
@@ -61,7 +60,7 @@ def _alltypes_example(size=100):
     })
 
 
-class TestPandasConversion(unittest.TestCase):
+class TestPandasConversion(object):
 
     def setUp(self):
         pass
@@ -420,7 +419,7 @@ class TestPandasConversion(unittest.TestCase):
         values = [b'foo', None, b'ba', None, None, b'hey']
         df = pd.DataFrame({'strings': values})
         schema = pa.schema([pa.field('strings', pa.binary(3))])
-        with self.assertRaises(pa.ArrowInvalid):
+        with pytest.raises(pa.ArrowInvalid):
             pa.Table.from_pandas(df, schema=schema)
 
     def test_timestamps_notimezone_no_nulls(self):
@@ -697,11 +696,11 @@ class TestPandasConversion(unittest.TestCase):
 
     def test_mixed_types_fails(self):
         data = pd.DataFrame({'a': ['a', 1, 2.0]})
-        with self.assertRaises(pa.ArrowException):
+        with pytest.raises(pa.ArrowException):
             pa.Table.from_pandas(data)
 
         data = pd.DataFrame({'a': [1, True]})
-        with self.assertRaises(pa.ArrowException):
+        with pytest.raises(pa.ArrowException):
             pa.Table.from_pandas(data)
 
     def test_strided_data_import(self):
@@ -1096,6 +1095,45 @@ class TestPandasConversion(unittest.TestCase):
         expected = pd.DataFrame({'strings': pd.Categorical(values)})
         tm.assert_frame_equal(result, expected, check_dtype=True)
 
+    def test_array_from_pandas_date_with_mask(self):
+        m = np.array([True, False, True])
+        data = pd.Series([
+            date(1990, 1, 1),
+            date(1991, 1, 1),
+            date(1992, 1, 1)
+        ])
+
+        result = pa.Array.from_pandas(data, mask=m)
+
+        expected = pd.Series([None, date(1991, 1, 1), None])
+        assert pa.Array.from_pandas(expected).equals(result)
+
+    @pytest.mark.parametrize('t,data,expected', [
+        (
+            pa.int64,
+            [[1, 2], [3], None],
+            [None, [3], None]
+        ),
+        (
+            pa.string,
+            [[u'aaa', u'bb'], [u'c'], None],
+            [None, [u'c'], None]
+        ),
+        (
+            pa.null,
+            [[None, None], [None], None],
+            [None, [None], None]
+        )
+    ])
+    def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
+        m = np.array([True, False, True])
+
+        s = pd.Series(data)
+        result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t()))
+
+        assert pa.Array.from_pandas(expected,
+                                    type=pa.list_(t())).equals(result)
+
 
 def _pytime_from_micros(val):
     microseconds = val % 1000000

-- 
To stop receiving notification emails like this one, please contact
commits@arrow.apache.org.

Reply via email to