This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a5d8ccc ARROW-4181: [Python] Fixes for Numpy struct array conversion
a5d8ccc is described below
commit a5d8ccc3fc7671986c38cfe3558b77e0fe157053
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Feb 12 15:32:35 2019 +0100
ARROW-4181: [Python] Fixes for Numpy struct array conversion
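This fixes two issues:
- object fields inside NumPy structs were rejected; they are now allowed
- ndarray indexing was buggy if stride % itemsize != 0, because the byte
  stride was truncated by an integer division by the item size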
Author: Antoine Pitrou <[email protected]>
Closes #3614 from pitrou/ARROW-4181-py-struct-conversion-fixes and squashes
the following commits:
9886e29e <Antoine Pitrou> ARROW-4181: Fixes for Numpy struct array conversion
---
cpp/src/arrow/python/numpy-internal.h | 20 +++++++++++---------
cpp/src/arrow/python/numpy_to_arrow.cc | 23 ++++++++++++-----------
python/pyarrow/tests/test_convert_pandas.py | 18 +++++++++++++-----
3 files changed, 36 insertions(+), 25 deletions(-)
diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
index 6954e35..e27ae5c 100644
--- a/cpp/src/arrow/python/numpy-internal.h
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -40,14 +40,12 @@ class Ndarray1DIndexer {
Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {}
- explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { Init(arr); }
-
- void Init(PyArrayObject* arr) {
+ explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() {
arr_ = arr;
DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays";
Py_INCREF(arr);
- data_ = reinterpret_cast<T*>(PyArray_DATA(arr));
- stride_ = PyArray_STRIDES(arr)[0] / sizeof(T);
+ data_ = reinterpret_cast<uint8_t*>(PyArray_DATA(arr));
+ stride_ = PyArray_STRIDES(arr)[0];
}
~Ndarray1DIndexer() { Py_XDECREF(arr_); }
@@ -56,14 +54,18 @@ class Ndarray1DIndexer {
T* data() const { return data_; }
- bool is_strided() const { return stride_ != 1; }
+ bool is_strided() const { return stride_ != sizeof(T); }
- T& operator[](size_type index) { return data_[index * stride_]; }
- T& operator[](size_type index) const { return data_[index * stride_]; }
+ T& operator[](size_type index) {
+ return *reinterpret_cast<T*>(data_ + index * stride_);
+ }
+ const T& operator[](size_type index) const {
+ return *reinterpret_cast<const T*>(data_ + index * stride_);
+ }
private:
PyArrayObject* arr_;
- T* data_;
+ uint8_t* data_;
int64_t stride_;
};
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index a944b80..ef63ccf 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -318,8 +318,18 @@ Status NumPyConverter::Convert() {
return Status::Invalid("only handle 1-dimensional arrays");
}
- DCHECK_NE(dtype_->type_num, NPY_OBJECT)
- << "This class does not handle NPY_OBJECT arrays";
+ if (dtype_->type_num == NPY_OBJECT) {
+ // If an object array, convert it like a normal Python sequence
+ PyConversionOptions py_options;
+ py_options.type = type_;
+ py_options.from_pandas = from_pandas_;
+ std::shared_ptr<ChunkedArray> res;
+ RETURN_NOT_OK(ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
+ reinterpret_cast<PyObject*>(mask_), py_options,
+ &res));
+ out_arrays_ = res->chunks();
+ return Status::OK();
+ }
if (type_ == nullptr) {
return Status::Invalid("Must pass data type for non-object arrays");
@@ -790,15 +800,6 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
return Status::Invalid("Input object was not a NumPy array");
}
- PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(ao);
-
- if (PyArray_DESCR(arr)->type_num == NPY_OBJECT) {
- PyConversionOptions py_options;
- py_options.type = type;
- py_options.from_pandas = from_pandas;
- return ConvertPySequence(ao, mo, py_options, out);
- }
-
NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
RETURN_NOT_OK(converter.Convert());
const auto& output_arrays = converter.result();
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 8f2c2eb..e904486 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1851,21 +1851,29 @@ class TestConvertStructTypes(object):
assert arr.to_pylist() == [{}, {}]
def test_from_numpy_nested(self):
+ # Note: an object field inside a struct
dt = np.dtype([('x', np.dtype([('xx', np.int8),
('yy', np.bool_)])),
- ('y', np.int16)])
+ ('y', np.int16),
+ ('z', np.object_)])
+ # Note: itemsize is not a multiple of sizeof(object)
+ assert dt.itemsize == 12
ty = pa.struct([pa.field('x', pa.struct([pa.field('xx', pa.int8()),
pa.field('yy', pa.bool_())])),
- pa.field('y', pa.int16())])
+ pa.field('y', pa.int16()),
+ pa.field('z', pa.string())])
data = np.array([], dtype=dt)
arr = pa.array(data, type=ty)
assert arr.to_pylist() == []
- data = np.array([((1, True), 2), ((3, False), 4)], dtype=dt)
+ data = np.array([
+ ((1, True), 2, 'foo'),
+ ((3, False), 4, 'bar')], dtype=dt)
arr = pa.array(data, type=ty)
- assert arr.to_pylist() == [{'x': {'xx': 1, 'yy': True}, 'y': 2},
- {'x': {'xx': 3, 'yy': False}, 'y': 4}]
+ assert arr.to_pylist() == [
+ {'x': {'xx': 1, 'yy': True}, 'y': 2, 'z': 'foo'},
+ {'x': {'xx': 3, 'yy': False}, 'y': 4, 'z': 'bar'}]
@pytest.mark.large_memory
def test_from_numpy_large(self):