pitrou commented on a change in pull request #11185:
URL: https://github.com/apache/arrow/pull/11185#discussion_r726027770
##########
File path: python/pyarrow/array.pxi
##########
@@ -222,9 +222,9 @@ def array(object obj, type=None, mask=None, size=None,
from_pandas=None,
if hasattr(obj, '__arrow_array__'):
return _handle_arrow_array_protocol(obj, type, mask, size)
elif _is_array_like(obj):
- if mask is not None:
- # out argument unused
- mask = get_values(mask, &is_pandas_object)
+ if mask is not None and not _is_array_like(mask):
+ raise ValueError("Mask must be a numpy array "
Review comment:
Can you raise `TypeError` instead?
##########
File path: cpp/src/arrow/python/iterators.h
##########
@@ -97,31 +99,64 @@ inline Status VisitSequence(PyObject* obj, int64_t offset,
VisitorFunc&& func) {
template <class VisitorFunc>
inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
VisitorFunc&& func) {
- if (mo == nullptr || !PyArray_Check(mo)) {
- return Status::Invalid("Null mask must be NumPy array");
- }
+ if (PyArray_Check(mo)) {
+ PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
+ if (PyArray_NDIM(mask) != 1) {
+ return Status::Invalid("Mask must be 1D array");
+ }
+ if (PyArray_SIZE(mask) != static_cast<int64_t>(PySequence_Size(obj))) {
+ return Status::Invalid("Mask was a different length from sequence being
converted");
+ }
- PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
- if (PyArray_NDIM(mask) != 1) {
- return Status::Invalid("Mask must be 1D array");
- }
+ const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
+ if (dtype == NPY_BOOL) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask);
- const Py_ssize_t obj_size = PySequence_Size(obj);
- if (PyArray_SIZE(mask) != static_cast<int64_t>(obj_size)) {
- return Status::Invalid("Mask was a different length from sequence being
converted");
- }
+ return VisitSequenceGeneric(
+ obj, offset,
+ [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) {
+ return func(value, mask_values[i], keep_going);
+ });
+ } else {
+ return Status::Invalid("Mask must be boolean dtype");
+ }
+ } else if (py::is_array(mo)) {
+ auto unwrap_mask_result = unwrap_array(mo);
+ if (!unwrap_mask_result.ok()) {
+ return Status::Invalid("Mask must be an array of booleans");
+ }
+ std::shared_ptr<Array> mask_ = unwrap_mask_result.ValueOrDie();
+ BooleanArray* boolmask = dynamic_cast<BooleanArray*>(mask_.get());
Review comment:
Instead relying on `dynamic_cast`, you should first compare the type id
with `Type::BOOLEAN` (which is a very fast check), then use `checked_cast`
(which has a different behaviour in debug and release mode).
##########
File path: python/pyarrow/tests/test_array.py
##########
@@ -2745,6 +2745,33 @@ def test_array_masked():
mask=np.array([False, True, False, True]))
assert arr.type == pa.int64()
+ # ARROW-13883
+ arr = pa.array([4, None, 4, 3],
+ mask=pa.array([False, True, False, True]))
Review comment:
What if `pa.array` has a null in it? Is it allowed?
##########
File path: cpp/src/arrow/python/iterators.h
##########
@@ -97,31 +99,64 @@ inline Status VisitSequence(PyObject* obj, int64_t offset,
VisitorFunc&& func) {
template <class VisitorFunc>
inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
VisitorFunc&& func) {
- if (mo == nullptr || !PyArray_Check(mo)) {
- return Status::Invalid("Null mask must be NumPy array");
- }
+ if (PyArray_Check(mo)) {
+ PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
+ if (PyArray_NDIM(mask) != 1) {
+ return Status::Invalid("Mask must be 1D array");
+ }
+ if (PyArray_SIZE(mask) != static_cast<int64_t>(PySequence_Size(obj))) {
+ return Status::Invalid("Mask was a different length from sequence being
converted");
+ }
- PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
- if (PyArray_NDIM(mask) != 1) {
- return Status::Invalid("Mask must be 1D array");
- }
+ const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
+ if (dtype == NPY_BOOL) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask);
- const Py_ssize_t obj_size = PySequence_Size(obj);
- if (PyArray_SIZE(mask) != static_cast<int64_t>(obj_size)) {
- return Status::Invalid("Mask was a different length from sequence being
converted");
- }
+ return VisitSequenceGeneric(
+ obj, offset,
+ [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) {
+ return func(value, mask_values[i], keep_going);
+ });
+ } else {
+ return Status::Invalid("Mask must be boolean dtype");
+ }
+ } else if (py::is_array(mo)) {
+ auto unwrap_mask_result = unwrap_array(mo);
+ if (!unwrap_mask_result.ok()) {
+ return Status::Invalid("Mask must be an array of booleans");
Review comment:
Why not simply propagate the original error?
##########
File path: cpp/src/arrow/python/iterators.h
##########
@@ -97,31 +99,64 @@ inline Status VisitSequence(PyObject* obj, int64_t offset,
VisitorFunc&& func) {
template <class VisitorFunc>
inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
VisitorFunc&& func) {
- if (mo == nullptr || !PyArray_Check(mo)) {
- return Status::Invalid("Null mask must be NumPy array");
- }
+ if (PyArray_Check(mo)) {
+ PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
+ if (PyArray_NDIM(mask) != 1) {
+ return Status::Invalid("Mask must be 1D array");
+ }
+ if (PyArray_SIZE(mask) != static_cast<int64_t>(PySequence_Size(obj))) {
+ return Status::Invalid("Mask was a different length from sequence being
converted");
+ }
- PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
- if (PyArray_NDIM(mask) != 1) {
- return Status::Invalid("Mask must be 1D array");
- }
+ const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
+ if (dtype == NPY_BOOL) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask);
- const Py_ssize_t obj_size = PySequence_Size(obj);
- if (PyArray_SIZE(mask) != static_cast<int64_t>(obj_size)) {
- return Status::Invalid("Mask was a different length from sequence being
converted");
- }
+ return VisitSequenceGeneric(
+ obj, offset,
+ [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) {
+ return func(value, mask_values[i], keep_going);
+ });
+ } else {
+ return Status::Invalid("Mask must be boolean dtype");
+ }
+ } else if (py::is_array(mo)) {
+ auto unwrap_mask_result = unwrap_array(mo);
+ if (!unwrap_mask_result.ok()) {
+ return Status::Invalid("Mask must be an array of booleans");
+ }
+ std::shared_ptr<Array> mask_ = unwrap_mask_result.ValueOrDie();
+ BooleanArray* boolmask = dynamic_cast<BooleanArray*>(mask_.get());
+ if (boolmask == nullptr) {
+ return Status::Invalid("Mask must be an array of booleans");
+ }
+
+ if (mask_->length() != PySequence_Size(obj)) {
+ return Status::Invalid("Mask was a different length from sequence being
converted");
+ }
- const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
- if (dtype == NPY_BOOL) {
- Ndarray1DIndexer<uint8_t> mask_values(mask);
+ return VisitSequenceGeneric(
+ obj, offset, [&func, &boolmask](PyObject* value, int64_t i, bool*
keep_going) {
+ return func(value, boolmask->Value(i), keep_going);
Review comment:
What about nulls?
##########
File path: cpp/src/arrow/python/iterators.h
##########
@@ -97,31 +99,64 @@ inline Status VisitSequence(PyObject* obj, int64_t offset,
VisitorFunc&& func) {
template <class VisitorFunc>
inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
VisitorFunc&& func) {
- if (mo == nullptr || !PyArray_Check(mo)) {
- return Status::Invalid("Null mask must be NumPy array");
- }
+ if (PyArray_Check(mo)) {
+ PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
+ if (PyArray_NDIM(mask) != 1) {
+ return Status::Invalid("Mask must be 1D array");
+ }
+ if (PyArray_SIZE(mask) != static_cast<int64_t>(PySequence_Size(obj))) {
+ return Status::Invalid("Mask was a different length from sequence being
converted");
+ }
- PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
- if (PyArray_NDIM(mask) != 1) {
- return Status::Invalid("Mask must be 1D array");
- }
+ const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
+ if (dtype == NPY_BOOL) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask);
- const Py_ssize_t obj_size = PySequence_Size(obj);
- if (PyArray_SIZE(mask) != static_cast<int64_t>(obj_size)) {
- return Status::Invalid("Mask was a different length from sequence being
converted");
- }
+ return VisitSequenceGeneric(
+ obj, offset,
+ [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) {
+ return func(value, mask_values[i], keep_going);
+ });
+ } else {
+ return Status::Invalid("Mask must be boolean dtype");
Review comment:
Ideally, this should have been TypeError...
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]