Repository: arrow Updated Branches: refs/heads/master e44ede87c -> 2972c9d3a
ARROW-1342: [Python] Support strided ndarrays in pandas conversion from nested lists. This does drop the vector append to the builder that was there before. I'm going to do some local benchmarking to make sure this doesn't degrade performance unacceptably, and will report back here. Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #956 from wesm/ARROW-1342 and squashes the following commits: f2ebeba8 [Wes McKinney] Fix cpplint issue f403f9dd [Wes McKinney] Fix test case to be platform independent, note ARROW-1345. Improve quality of error message f4f44c18 [Wes McKinney] Fix test case where inferred list type is null ae5c8312 [Wes McKinney] Drop striding check b4aecd3a [Wes McKinney] Support strided ndarrays in pandas conversion from nested lists Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2972c9d3 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2972c9d3 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2972c9d3 Branch: refs/heads/master Commit: 2972c9d3a0d371dbdcf69c68a0109b83aa6fd944 Parents: e44ede8 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Wed Aug 9 17:47:30 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Wed Aug 9 17:47:30 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/python/numpy-internal.h | 41 +++++++++++++++ cpp/src/arrow/python/pandas_to_arrow.cc | 67 ++++++++++-------------- cpp/src/arrow/table.cc | 4 +- python/pyarrow/error.pxi | 3 +- python/pyarrow/includes/common.pxd | 1 + python/pyarrow/tests/pandas_examples.py | 10 ++-- python/pyarrow/tests/test_convert_pandas.py | 9 ++++ 7 files changed, 91 insertions(+), 44 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/cpp/src/arrow/python/numpy-internal.h ---------------------------------------------------------------------- diff --git 
a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index f1ef7da..db34d24 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -25,6 +25,7 @@ #include "arrow/python/platform.h" #include <cstdint> +#include <string> namespace arrow { namespace py { @@ -51,7 +52,12 @@ class Ndarray1DIndexer { int64_t size() const { return PyArray_SIZE(arr_); } + T* data() const { return data_; } + + bool is_strided() const { return stride_ == 1; } + T& operator[](size_type index) { return *(data_ + index * stride_); } + T& operator[](size_type index) const { return *(data_ + index * stride_); } private: PyArrayObject* arr_; @@ -59,6 +65,41 @@ class Ndarray1DIndexer { int64_t stride_; }; +static inline std::string GetNumPyTypeName(int npy_type) { +#define TYPE_CASE(TYPE, NAME) \ + case NPY_##TYPE: \ + return NAME; + + switch (npy_type) { + TYPE_CASE(BOOL, "bool") + TYPE_CASE(INT8, "int8") + TYPE_CASE(INT16, "int16") + TYPE_CASE(INT32, "int32") + TYPE_CASE(INT64, "int64") +#if (NPY_INT64 != NPY_LONGLONG) + TYPE_CASE(LONGLONG, "longlong") +#endif + TYPE_CASE(UINT8, "uint8") + TYPE_CASE(UINT16, "uint16") + TYPE_CASE(UINT32, "uint32") + TYPE_CASE(UINT64, "uint64") +#if (NPY_UINT64 != NPY_ULONGLONG) + TYPE_CASE(ULONGLONG, "ulonglong") +#endif + TYPE_CASE(FLOAT16, "float16") + TYPE_CASE(FLOAT32, "float32") + TYPE_CASE(FLOAT64, "float64") + TYPE_CASE(DATETIME, "datetime64") + TYPE_CASE(OBJECT, "object") + TYPE_CASE(VOID, "void") + default: + break; + } + +#undef TYPE_CASE + return "unrecognized type in GetNumPyTypeName"; +} + } // namespace py } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/cpp/src/arrow/python/pandas_to_arrow.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc index 060fcb2..b6cc16b 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ 
b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -97,8 +97,6 @@ static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { int64_t null_count = 0; Ndarray1DIndexer<T> values(arr); - - // TODO(wesm): striding for (int i = 0; i < values.size(); ++i) { if (traits::isnull(values[i])) { ++null_count; @@ -125,22 +123,27 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap return null_count; } -template <int TYPE> -static int64_t ValuesToValidBytes(const void* data, int64_t length, - uint8_t* valid_bytes) { +template <int TYPE, typename BuilderType> +static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) { typedef internal::npy_traits<TYPE> traits; typedef typename traits::value_type T; - int64_t null_count = 0; - const T* values = reinterpret_cast<const T*>(data); - - // TODO(wesm): striding - for (int i = 0; i < length; ++i) { - valid_bytes[i] = !traits::isnull(values[i]); - if (traits::isnull(values[i])) null_count++; + // TODO(wesm): Vector append when not strided + Ndarray1DIndexer<T> values(array); + if (traits::supports_nulls) { + for (int64_t i = 0; i < values.size(); ++i) { + if (traits::isnull(values[i])) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + RETURN_NOT_OK(builder->Append(values[i])); + } + } + } else { + for (int64_t i = 0; i < values.size(); ++i) { + RETURN_NOT_OK(builder->Append(values[i])); + } } - - return null_count; + return Status::OK(); } Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { @@ -148,14 +151,14 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { return Status::Invalid("only handle 1-dimensional arrays"); } - if (PyArray_DESCR(numpy_array)->type_num != np_type) { - return Status::Invalid("can only handle exact conversions"); + const int received_type = PyArray_DESCR(numpy_array)->type_num; + if (received_type != np_type) { + std::stringstream ss; + ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but 
got " + << GetNumPyTypeName(received_type); + return Status::Invalid(ss.str()); } - npy_intp* astrides = PyArray_STRIDES(numpy_array); - if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) { - return Status::Invalid("No support for strided arrays in lists yet"); - } return Status::OK(); } @@ -577,7 +580,7 @@ Status PandasConverter::ConvertDecimals() { RETURN_NOT_OK(ImportModule("decimal", &decimal)); RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); - PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); + Ndarray1DIndexer<PyObject*> objects(arr_); PyObject* object = objects[0]; int precision; @@ -618,7 +621,7 @@ Status PandasConverter::ConvertTimes() { PyAcquireGIL lock; PyDateTime_IMPORT; - PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); + Ndarray1DIndexer<PyObject*> objects(arr_); // datetime.time stores microsecond resolution Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_); @@ -906,7 +909,7 @@ Status LoopPySequence(PyObject* sequence, T func) { Py_ssize_t size = PySequence_Size(sequence); if (PyArray_Check(sequence)) { auto array = reinterpret_cast<PyArrayObject*>(sequence); - PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(array)); + Ndarray1DIndexer<PyObject*> objects(array); for (int64_t i = 0; i < size; ++i) { RETURN_NOT_OK(func(objects[i])); } @@ -934,7 +937,6 @@ template <int ITEM_TYPE, typename ArrowType> inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) { typedef internal::npy_traits<ITEM_TYPE> traits; - typedef typename traits::value_type T; typedef typename traits::BuilderClass BuilderT; PyAcquireGIL lock; @@ -956,24 +958,13 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType> // TODO(uwe): Support more complex numpy array structures RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE)); - int64_t size = PyArray_DIM(numpy_array, 0); - auto data = 
reinterpret_cast<const T*>(PyArray_DATA(numpy_array)); - if (traits::supports_nulls) { - RETURN_NOT_OK(null_bitmap_->Resize(size, false)); - // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't - // currently support this. - // ValuesToBitmap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data()); - ValuesToValidBytes<ITEM_TYPE>(data, size, null_bitmap_->mutable_data()); - return value_builder->Append(data, size, null_bitmap_->data()); - } else { - return value_builder->Append(data, size); - } + return AppendNdarrayToBuilder<ITEM_TYPE, BuilderT>(numpy_array, value_builder); } else if (PyList_Check(object)) { int64_t size; std::shared_ptr<DataType> inferred_type; RETURN_NOT_OK(builder->Append(true)); RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type)); - if (inferred_type->id() != type->id()) { + if (inferred_type->id() != Type::NA && inferred_type->id() != type->id()) { std::stringstream ss; ss << inferred_type->ToString() << " cannot be converted to " << type->ToString(); return Status::TypeError(ss.str()); @@ -1064,7 +1055,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>( std::shared_ptr<DataType> inferred_type; RETURN_NOT_OK(builder->Append(true)); RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type)); - if (inferred_type->id() != Type::STRING) { + if (inferred_type->id() != Type::NA && inferred_type->id() != Type::STRING) { std::stringstream ss; ss << inferred_type->ToString() << " cannot be converted to STRING."; return Status::TypeError(ss.str()); http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/cpp/src/arrow/table.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 1f0c6d7..ae48698 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -301,8 +301,8 @@ Table::Table(const std::shared_ptr<Schema>& schema, columns_.resize(columns.size()); for (size_t i = 0; i < 
columns.size(); ++i) { - columns_[i] = std::make_shared<Column>(schema->field(static_cast<int>(i)), - columns[i]); + columns_[i] = + std::make_shared<Column>(schema->field(static_cast<int>(i)), columns[i]); } } http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/python/pyarrow/error.pxi ---------------------------------------------------------------------- diff --git a/python/pyarrow/error.pxi b/python/pyarrow/error.pxi index 8a3f57d..8793c4e 100644 --- a/python/pyarrow/error.pxi +++ b/python/pyarrow/error.pxi @@ -65,7 +65,7 @@ cdef int check_status(const CStatus& status) nogil except -1: return 0 with gil: - message = frombytes(status.ToString()) + message = frombytes(status.message()) if status.IsInvalid(): raise ArrowInvalid(message) elif status.IsIOError(): @@ -85,4 +85,5 @@ cdef int check_status(const CStatus& status) nogil except -1: elif status.IsPlasmaStoreFull(): raise PlasmaStoreFull(message) else: + message = frombytes(status.ToString()) raise ArrowException(message) http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/python/pyarrow/includes/common.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 637a133..6be08b0 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -42,6 +42,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CStatus() c_string ToString() + c_string message() c_bool ok() c_bool IsIOError() http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/python/pyarrow/tests/pandas_examples.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py index 17ad4b2..c145e96 100644 --- a/python/pyarrow/tests/pandas_examples.py +++ b/python/pyarrow/tests/pandas_examples.py @@ -98,21 +98,25 @@ def dataframe_with_lists(include_index=False): [0, 1, 2, 3, 4, 5, 6, 7, 8, 
9], [0, 1, 2, 3, 4], None, - [0] + [0], + np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2, + dtype=np.int64)[::2] ] fields.append(pa.field('double', pa.list_(pa.float64()))) arrays['double'] = [ [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], [0., 1., 2., 3., 4.], None, - [0.] + [0.], + np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2], ] fields.append(pa.field('str_list', pa.list_(pa.string()))) arrays['str_list'] = [ [u"1", u"ä"], None, [u"1"], - [u"1", u"2", u"3"] + [u"1", u"2", u"3"], + [], ] if include_index: http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 8969777..61bd072 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -534,6 +534,15 @@ class TestPandasConversion(unittest.TestCase): field = schema.field_by_name(column) self._check_array_roundtrip(df[column], type=field.type) + def test_column_of_lists_strided(self): + df, schema = dataframe_with_lists() + df = pd.concat([df] * 6, ignore_index=True) + + arr = df['int64'].values[::3] + assert arr.strides[0] != 8 + + self._check_array_roundtrip(arr) + def test_nested_lists_all_none(self): data = np.array([[None, None], None], dtype=object)