[ 
https://issues.apache.org/jira/browse/ARROW-1721?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16219857#comment-16219857
 ] 

ASF GitHub Bot commented on ARROW-1721:
---------------------------------------

wesm closed pull request #1246: ARROW-1721: [Python] Implement null-mask check 
in places where it isn't supported in numpy_to_arrow.cc
URL: https://github.com/apache/arrow/pull/1246
 
 
   

This is a PR merged from a forked repository. Because GitHub hides the
original diff of a foreign (forked) pull request once it is merged, the
diff is reproduced below for the sake of provenance:

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index 2c89a9f61..ead3a0481 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -622,8 +622,12 @@ Status NumPyConverter::ConvertDates() {
 
   Ndarray1DIndexer<PyObject*> objects(arr_);
 
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
   if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions 
yet");
+    mask_values.Init(mask_);
+    have_mask = true;
   }
 
   BuilderType builder(pool_);
@@ -636,10 +640,10 @@ Status NumPyConverter::ConvertDates() {
   PyObject* obj;
   for (int64_t i = 0; i < length_; ++i) {
     obj = objects[i];
-    if (PyDate_CheckExact(obj)) {
-      RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj)));
-    } else if (PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder.AppendNull());
+    } else if (PyDate_CheckExact(obj)) {
+      RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj)));
     } else {
       std::stringstream ss;
       ss << "Error converting from Python objects to Date: ";
@@ -1029,6 +1033,41 @@ Status LoopPySequence(PyObject* sequence, T func) {
   return Status::OK();
 }
 
+template <typename T>
+Status LoopPySequenceWithMasks(PyObject* sequence,
+                               const Ndarray1DIndexer<uint8_t>& mask_values,
+                               bool have_mask, T func) {
+  if (PySequence_Check(sequence)) {
+    OwnedRef ref;
+    Py_ssize_t size = PySequence_Size(sequence);
+    if (PyArray_Check(sequence)) {
+      auto array = reinterpret_cast<PyArrayObject*>(sequence);
+      Ndarray1DIndexer<PyObject*> objects(array);
+      for (int64_t i = 0; i < size; ++i) {
+        RETURN_NOT_OK(func(objects[i], have_mask && mask_values[i]));
+      }
+    } else {
+      for (int64_t i = 0; i < size; ++i) {
+        ref.reset(PySequence_GetItem(sequence, i));
+        RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i]));
+      }
+    }
+  } else if (PyObject_HasAttrString(sequence, "__iter__")) {
+    OwnedRef iter = OwnedRef(PyObject_GetIter(sequence));
+    PyObject* item;
+    int64_t i = 0;
+    while ((item = PyIter_Next(iter.obj()))) {
+      OwnedRef ref = OwnedRef(item);
+      RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i]));
+      i++;
+    }
+  } else {
+    return Status::TypeError("Object is not a sequence or iterable");
+  }
+
+  return Status::OK();
+}
+
 template <int ITEM_TYPE, typename ArrowType>
 inline Status NumPyConverter::ConvertTypedLists(const 
std::shared_ptr<DataType>& type,
                                                 ListBuilder* builder, 
PyObject* list) {
@@ -1037,15 +1076,18 @@ inline Status NumPyConverter::ConvertTypedLists(const 
std::shared_ptr<DataType>&
 
   PyAcquireGIL lock;
 
-  // TODO: mask not supported here
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
   if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions 
yet");
+    mask_values.Init(mask_);
+    have_mask = true;
   }
 
   BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder());
 
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
+  auto foreach_item = [&](PyObject* object, bool mask) {
+    if (mask || PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1071,7 +1113,7 @@ inline Status NumPyConverter::ConvertTypedLists(const 
std::shared_ptr<DataType>&
     }
   };
 
-  return LoopPySequence(list, foreach_item);
+  return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item);
 }
 
 template <>
@@ -1079,15 +1121,18 @@ inline Status 
NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
     const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* 
list) {
   PyAcquireGIL lock;
 
-  // TODO: mask not supported here
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
   if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions 
yet");
+    mask_values.Init(mask_);
+    have_mask = true;
   }
 
   auto value_builder = static_cast<NullBuilder*>(builder->value_builder());
 
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
+  auto foreach_item = [&](PyObject* object, bool mask) {
+    if (mask || PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1112,7 +1157,7 @@ inline Status 
NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
     }
   };
 
-  return LoopPySequence(list, foreach_item);
+  return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item);
 }
 
 template <>
@@ -1122,15 +1167,18 @@ inline Status 
NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
   // TODO: If there are bytes involed, convert to Binary representation
   bool have_bytes = false;
 
-  // TODO: mask not supported here
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
   if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions 
yet");
+    mask_values.Init(mask_);
+    have_mask = true;
   }
 
   auto value_builder = static_cast<StringBuilder*>(builder->value_builder());
 
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
+  auto foreach_item = [&](PyObject* object, bool mask) {
+    if (mask || PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1162,7 +1210,7 @@ inline Status 
NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
     }
   };
 
-  return LoopPySequence(list, foreach_item);
+  return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item);
 }
 
 #define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType)                            \
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 41ad20102..527466e6e 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -19,7 +19,6 @@
 from collections import OrderedDict
 
 from datetime import date, time
-import unittest
 import decimal
 import json
 
@@ -61,7 +60,7 @@ def _alltypes_example(size=100):
     })
 
 
-class TestPandasConversion(unittest.TestCase):
+class TestPandasConversion(object):
 
     def setUp(self):
         pass
@@ -420,7 +419,7 @@ def 
test_fixed_size_bytes_does_not_accept_varying_lengths(self):
         values = [b'foo', None, b'ba', None, None, b'hey']
         df = pd.DataFrame({'strings': values})
         schema = pa.schema([pa.field('strings', pa.binary(3))])
-        with self.assertRaises(pa.ArrowInvalid):
+        with pytest.raises(pa.ArrowInvalid):
             pa.Table.from_pandas(df, schema=schema)
 
     def test_timestamps_notimezone_no_nulls(self):
@@ -697,11 +696,11 @@ def test_category(self):
 
     def test_mixed_types_fails(self):
         data = pd.DataFrame({'a': ['a', 1, 2.0]})
-        with self.assertRaises(pa.ArrowException):
+        with pytest.raises(pa.ArrowException):
             pa.Table.from_pandas(data)
 
         data = pd.DataFrame({'a': [1, True]})
-        with self.assertRaises(pa.ArrowException):
+        with pytest.raises(pa.ArrowException):
             pa.Table.from_pandas(data)
 
     def test_strided_data_import(self):
@@ -1096,6 +1095,45 @@ def test_table_str_to_categorical(self):
         expected = pd.DataFrame({'strings': pd.Categorical(values)})
         tm.assert_frame_equal(result, expected, check_dtype=True)
 
+    def test_array_from_pandas_date_with_mask(self):
+        m = np.array([True, False, True])
+        data = pd.Series([
+            date(1990, 1, 1),
+            date(1991, 1, 1),
+            date(1992, 1, 1)
+        ])
+
+        result = pa.Array.from_pandas(data, mask=m)
+
+        expected = pd.Series([None, date(1991, 1, 1), None])
+        assert pa.Array.from_pandas(expected).equals(result)
+
+    @pytest.mark.parametrize('t,data,expected', [
+        (
+            pa.int64,
+            [[1, 2], [3], None],
+            [None, [3], None]
+        ),
+        (
+            pa.string,
+            [[u'aaa', u'bb'], [u'c'], None],
+            [None, [u'c'], None]
+        ),
+        (
+            pa.null,
+            [[None, None], [None], None],
+            [None, [None], None]
+        )
+    ])
+    def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
+        m = np.array([True, False, True])
+
+        s = pd.Series(data)
+        result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t()))
+
+        assert pa.Array.from_pandas(expected,
+                                    type=pa.list_(t())).equals(result)
+
 
 def _pytime_from_micros(val):
     microseconds = val % 1000000


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Support null mask in places where it isn't supported in 
> numpy_to_arrow.cc
> ----------------------------------------------------------------------------------
>
>                 Key: ARROW-1721
>                 URL: https://issues.apache.org/jira/browse/ARROW-1721
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>            Reporter: Wes McKinney
>            Assignee: Licht Takeuchi
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> see https://github.com/apache/spark/pull/18664#discussion_r146472109 for 
> SPARK-21375



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to