This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a5d8ccc  ARROW-4181: [Python] Fixes for Numpy struct array conversion
a5d8ccc is described below

commit a5d8ccc3fc7671986c38cfe3558b77e0fe157053
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Feb 12 15:32:35 2019 +0100

    ARROW-4181: [Python] Fixes for Numpy struct array conversion
    
    This fixes two issues:
    - object fields inside numpy structs should be allowed
    - buggy ndarray indexing if stride % itemsize != 0
    
    Author: Antoine Pitrou <[email protected]>
    
    Closes #3614 from pitrou/ARROW-4181-py-struct-conversion-fixes and squashes the following commits:
    
    9886e29e <Antoine Pitrou> ARROW-4181: Fixes for Numpy struct array conversion
---
 cpp/src/arrow/python/numpy-internal.h       | 20 +++++++++++---------
 cpp/src/arrow/python/numpy_to_arrow.cc      | 23 ++++++++++++-----------
 python/pyarrow/tests/test_convert_pandas.py | 18 +++++++++++++-----
 3 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
index 6954e35..e27ae5c 100644
--- a/cpp/src/arrow/python/numpy-internal.h
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -40,14 +40,12 @@ class Ndarray1DIndexer {
 
   Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {}
 
-  explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { Init(arr); }
-
-  void Init(PyArrayObject* arr) {
+  explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() {
     arr_ = arr;
     DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays";
     Py_INCREF(arr);
-    data_ = reinterpret_cast<T*>(PyArray_DATA(arr));
-    stride_ = PyArray_STRIDES(arr)[0] / sizeof(T);
+    data_ = reinterpret_cast<uint8_t*>(PyArray_DATA(arr));
+    stride_ = PyArray_STRIDES(arr)[0];
   }
 
   ~Ndarray1DIndexer() { Py_XDECREF(arr_); }
@@ -56,14 +54,18 @@ class Ndarray1DIndexer {
 
   T* data() const { return data_; }
 
-  bool is_strided() const { return stride_ != 1; }
+  bool is_strided() const { return stride_ != sizeof(T); }
 
-  T& operator[](size_type index) { return data_[index * stride_]; }
-  T& operator[](size_type index) const { return data_[index * stride_]; }
+  T& operator[](size_type index) {
+    return *reinterpret_cast<T*>(data_ + index * stride_);
+  }
+  const T& operator[](size_type index) const {
+    return *reinterpret_cast<const T*>(data_ + index * stride_);
+  }
 
  private:
   PyArrayObject* arr_;
-  T* data_;
+  uint8_t* data_;
   int64_t stride_;
 };
 
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index a944b80..ef63ccf 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -318,8 +318,18 @@ Status NumPyConverter::Convert() {
     return Status::Invalid("only handle 1-dimensional arrays");
   }
 
-  DCHECK_NE(dtype_->type_num, NPY_OBJECT)
-      << "This class does not handle NPY_OBJECT arrays";
+  if (dtype_->type_num == NPY_OBJECT) {
+    // If an object array, convert it like a normal Python sequence
+    PyConversionOptions py_options;
+    py_options.type = type_;
+    py_options.from_pandas = from_pandas_;
+    std::shared_ptr<ChunkedArray> res;
+    RETURN_NOT_OK(ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
+                                    reinterpret_cast<PyObject*>(mask_), py_options,
+                                    &res));
+    out_arrays_ = res->chunks();
+    return Status::OK();
+  }
 
   if (type_ == nullptr) {
     return Status::Invalid("Must pass data type for non-object arrays");
@@ -790,15 +800,6 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
     return Status::Invalid("Input object was not a NumPy array");
   }
 
-  PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(ao);
-
-  if (PyArray_DESCR(arr)->type_num == NPY_OBJECT) {
-    PyConversionOptions py_options;
-    py_options.type = type;
-    py_options.from_pandas = from_pandas;
-    return ConvertPySequence(ao, mo, py_options, out);
-  }
-
   NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
   RETURN_NOT_OK(converter.Convert());
   const auto& output_arrays = converter.result();
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 8f2c2eb..e904486 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1851,21 +1851,29 @@ class TestConvertStructTypes(object):
         assert arr.to_pylist() == [{}, {}]
 
     def test_from_numpy_nested(self):
+        # Note: an object field inside a struct
         dt = np.dtype([('x', np.dtype([('xx', np.int8),
                                        ('yy', np.bool_)])),
-                       ('y', np.int16)])
+                       ('y', np.int16),
+                       ('z', np.object_)])
+        # Note: itemsize is not a multiple of sizeof(object)
+        assert dt.itemsize == 12
         ty = pa.struct([pa.field('x', pa.struct([pa.field('xx', pa.int8()),
                                                  pa.field('yy', pa.bool_())])),
-                        pa.field('y', pa.int16())])
+                        pa.field('y', pa.int16()),
+                        pa.field('z', pa.string())])
 
         data = np.array([], dtype=dt)
         arr = pa.array(data, type=ty)
         assert arr.to_pylist() == []
 
-        data = np.array([((1, True), 2), ((3, False), 4)], dtype=dt)
+        data = np.array([
+            ((1, True), 2, 'foo'),
+            ((3, False), 4, 'bar')], dtype=dt)
         arr = pa.array(data, type=ty)
-        assert arr.to_pylist() == [{'x': {'xx': 1, 'yy': True}, 'y': 2},
-                                   {'x': {'xx': 3, 'yy': False}, 'y': 4}]
+        assert arr.to_pylist() == [
+            {'x': {'xx': 1, 'yy': True}, 'y': 2, 'z': 'foo'},
+            {'x': {'xx': 3, 'yy': False}, 'y': 4, 'z': 'bar'}]
 
     @pytest.mark.large_memory
     def test_from_numpy_large(self):

Reply via email to