This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 171340f ARROW-2135: [Python] Fix NaN conversion when casting from Numpy array 171340f is described below commit 171340fdb05fc1d281f50c2c49b272ab1b908f6a Author: Antoine Pitrou <anto...@python.org> AuthorDate: Mon Mar 12 15:03:14 2018 -0400 ARROW-2135: [Python] Fix NaN conversion when casting from Numpy array Author: Antoine Pitrou <anto...@python.org> Closes #1681 from pitrou/ARROW-2135-nan-conversion-when-casting and squashes the following commits: 939428dd <Antoine Pitrou> ARROW-2135: Fix NaN conversion when casting from Numpy array --- cpp/src/arrow/python/numpy-internal.h | 54 ++++++++++- cpp/src/arrow/python/numpy_interop.h | 25 +++++ cpp/src/arrow/python/numpy_to_arrow.cc | 142 ++++++++++++++++++---------- cpp/src/arrow/python/type_traits.h | 40 +++++--- python/pyarrow/tests/test_convert_pandas.py | 19 ++++ 5 files changed, 214 insertions(+), 66 deletions(-) diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index 8d43080..7672861 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -68,6 +68,9 @@ class Ndarray1DIndexer { int64_t stride_; }; +// Handling of Numpy Types by their static numbers +// (the NPY_TYPES enum and related defines) + static inline std::string GetNumPyTypeName(int npy_type) { #define TYPE_CASE(TYPE, NAME) \ case NPY_##TYPE: \ @@ -79,14 +82,20 @@ static inline std::string GetNumPyTypeName(int npy_type) { TYPE_CASE(INT16, "int16") TYPE_CASE(INT32, "int32") TYPE_CASE(INT64, "int64") -#if (NPY_INT64 != NPY_LONGLONG) +#if !NPY_INT32_IS_INT + TYPE_CASE(INT, "intc") +#endif +#if !NPY_INT64_IS_LONG_LONG TYPE_CASE(LONGLONG, "longlong") #endif TYPE_CASE(UINT8, "uint8") TYPE_CASE(UINT16, "uint16") TYPE_CASE(UINT32, "uint32") TYPE_CASE(UINT64, "uint64") -#if (NPY_UINT64 != NPY_ULONGLONG) +#if !NPY_INT32_IS_INT + TYPE_CASE(UINT, "uintc") +#endif +#if !NPY_INT64_IS_LONG_LONG TYPE_CASE(ULONGLONG, "ulonglong") #endif TYPE_CASE(FLOAT16, "float16") @@ -100,9 +109,48 @@ static inline std::string GetNumPyTypeName(int npy_type) { } #undef TYPE_CASE - return "unrecognized type in GetNumPyTypeName"; + std::stringstream ss; + ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName"; + return ss.str(); } +#define TYPE_VISIT_INLINE(TYPE) \ + case NPY_##TYPE: \ + return visitor->template Visit<NPY_##TYPE>(arr); + +template <typename VISITOR> +inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) { + switch (PyArray_TYPE(arr)) { + TYPE_VISIT_INLINE(BOOL); + TYPE_VISIT_INLINE(INT8); + TYPE_VISIT_INLINE(UINT8); + TYPE_VISIT_INLINE(INT16); + TYPE_VISIT_INLINE(UINT16); + TYPE_VISIT_INLINE(INT32); + TYPE_VISIT_INLINE(UINT32); + TYPE_VISIT_INLINE(INT64); + TYPE_VISIT_INLINE(UINT64); +#if !NPY_INT32_IS_INT + TYPE_VISIT_INLINE(INT); + TYPE_VISIT_INLINE(UINT); +#endif +#if !NPY_INT64_IS_LONG_LONG + TYPE_VISIT_INLINE(LONGLONG); + TYPE_VISIT_INLINE(ULONGLONG); +#endif + TYPE_VISIT_INLINE(FLOAT16); + TYPE_VISIT_INLINE(FLOAT32); + TYPE_VISIT_INLINE(FLOAT64); + TYPE_VISIT_INLINE(DATETIME); + TYPE_VISIT_INLINE(OBJECT); + } + std::stringstream ss; + ss << "NumPy type not implemented: " << GetNumPyTypeName(PyArray_TYPE(arr)); + return Status::NotImplemented(ss.str()); +} + +#undef TYPE_VISIT_INLINE + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/numpy_interop.h b/cpp/src/arrow/python/numpy_interop.h index 8c569e2..0715c66 100644 --- a/cpp/src/arrow/python/numpy_interop.h +++ b/cpp/src/arrow/python/numpy_interop.h @@ -43,6 +43,31 @@ #include <numpy/arrayscalars.h> #include <numpy/ufuncobject.h> +// A bit subtle. Numpy has 5 canonical integer types: +// (or, rather, type pairs: signed and unsigned) +// NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG +// It also has 4 fixed-width integer aliases. +// When mapping Arrow integer types to these 4 fixed-width aliases, +// we always miss one of the canonical types (even though it may +// have the same width as one of the aliases). +// Which one depends on the platform... +// On a LP64 system, NPY_INT64 maps to NPY_LONG and +// NPY_LONGLONG needs to be handled separately. +// On a LLP64 system, NPY_INT32 maps to NPY_LONG and +// NPY_INT needs to be handled separately. + +#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64 +#define NPY_INT64_IS_LONG_LONG 1 +#else +#define NPY_INT64_IS_LONG_LONG 0 +#endif + +#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64 +#define NPY_INT32_IS_INT 1 +#else +#define NPY_INT32_IS_INT 0 +#endif + namespace arrow { namespace py { diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 9e3534d..c22afb7 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -79,6 +79,38 @@ inline bool PyObject_is_integer(PyObject* obj) { return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj); } +Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { + if (PyArray_NDIM(numpy_array) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); + } + + const int received_type = PyArray_DESCR(numpy_array)->type_num; + if (received_type != np_type) { + std::stringstream ss; + ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got " + << GetNumPyTypeName(received_type); + return Status::Invalid(ss.str()); + } + + return Status::OK(); +} + +Status AllocateNullBitmap(MemoryPool* pool, int64_t length, + std::shared_ptr<ResizableBuffer>* out) { + int64_t null_bytes = BitUtil::BytesForBits(length); + std::shared_ptr<ResizableBuffer> null_bitmap; + + null_bitmap = std::make_shared<PoolBuffer>(pool); + RETURN_NOT_OK(null_bitmap->Resize(null_bytes)); + + memset(null_bitmap->mutable_data(), 0, static_cast<size_t>(null_bytes)); + *out = null_bitmap; + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// Conversion from NumPy-in-Pandas to Arrow null bitmap + template <int TYPE> inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { typedef internal::npy_traits<TYPE> traits; @@ -98,6 +130,55 @@ inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { return null_count; } +class NumPyNullsConverter { + public: + /// Convert the given array's null values to a null bitmap. + /// The null bitmap is only allocated if null values are ever possible. + static Status Convert(MemoryPool* pool, PyArrayObject* arr, + bool use_pandas_null_sentinels, + std::shared_ptr<ResizableBuffer>* out_null_bitmap_, + int64_t* out_null_count) { + NumPyNullsConverter converter(pool, arr, use_pandas_null_sentinels); + RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter)); + *out_null_bitmap_ = converter.null_bitmap_; + *out_null_count = converter.null_count_; + return Status::OK(); + } + + template <int TYPE> + Status Visit(PyArrayObject* arr) { + typedef internal::npy_traits<TYPE> traits; + + const bool null_sentinels_possible = + // Always treat Numpy's NaT as null + TYPE == NPY_DATETIME || + // Observing pandas's null sentinels + (use_pandas_null_sentinels_ && traits::supports_nulls); + + if (null_sentinels_possible) { + RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_)); + null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data()); + } + return Status::OK(); + } + + protected: + NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, + bool use_pandas_null_sentinels) + : pool_(pool), + arr_(arr), + use_pandas_null_sentinels_(use_pandas_null_sentinels), + null_bitmap_data_(nullptr), + null_count_(0) {} + + MemoryPool* pool_; + PyArrayObject* arr_; + bool use_pandas_null_sentinels_; + std::shared_ptr<ResizableBuffer> null_bitmap_; + uint8_t* null_bitmap_data_; + int64_t null_count_; +}; + // Returns null count int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { int64_t null_count = 0; @@ -114,22 +195,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { return null_count; } -Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { - if (PyArray_NDIM(numpy_array) != 1) { - return Status::Invalid("only handle 1-dimensional arrays"); - } - - const int received_type = PyArray_DESCR(numpy_array)->type_num; - if (received_type != np_type) { - std::stringstream ss; - ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got " - << GetNumPyTypeName(received_type); - return Status::Invalid(ss.str()); - } - - return Status::OK(); -} - } // namespace /// Append as many string objects from NumPy arrays to a `StringBuilder` as we @@ -296,7 +361,9 @@ class NumPyConverter { dtype_(PyArray_DESCR(arr_)), mask_(nullptr), use_pandas_null_sentinels_(use_pandas_null_sentinels), - decimal_type_() { + decimal_type_(), + null_bitmap_data_(nullptr), + null_count_(0) { if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast<PyArrayObject*>(mo); } @@ -351,14 +418,8 @@ class NumPyConverter { protected: Status InitNullBitmap() { - int64_t null_bytes = BitUtil::BytesForBits(length_); - - null_bitmap_ = std::make_shared<PoolBuffer>(pool_); - RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); - + RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_)); null_bitmap_data_ = null_bitmap_->mutable_data(); - memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes)); - return Status::OK(); } @@ -409,32 +470,18 @@ class NumPyConverter { template <typename ArrowType> Status VisitNative() { - using traits = internal::arrow_traits<ArrowType::type_id>; - - const bool null_sentinels_possible = - // NumPy has a NaT type - (ArrowType::type_id == Type::TIMESTAMP || ArrowType::type_id == Type::DATE32) || - - // Observing pandas's null sentinels - ((use_pandas_null_sentinels_ && traits::supports_nulls)); - - if (mask_ != nullptr || null_sentinels_possible) { + if (mask_ != nullptr) { RETURN_NOT_OK(InitNullBitmap()); + null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_); + } else { + RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, use_pandas_null_sentinels_, + &null_bitmap_, &null_count_)); } std::shared_ptr<Buffer> data; RETURN_NOT_OK(ConvertData<ArrowType>(&data)); - int64_t null_count = 0; - if (mask_ != nullptr) { - null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); - } else if (null_sentinels_possible) { - // TODO(wesm): this presumes the NumPy C type and arrow C type are the - // same - null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_); - } - - auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count, 0); + auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0); return PushArray(arr_data); } @@ -488,6 +535,7 @@ class NumPyConverter { std::shared_ptr<ResizableBuffer> null_bitmap_; uint8_t* null_bitmap_data_; + int64_t null_count_; }; Status NumPyConverter::Convert() { @@ -654,12 +702,10 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data); RETURN_NOT_OK(s); } else { - // TODO(wesm): This is redundant, and recomputed in VisitNative() - const int64_t null_count = ValuesToBitmap<NPY_DATETIME>(arr_, null_bitmap_data_); - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count, + // The null bitmap was already computed in VisitNative() + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, pool_, data)); } } diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h index 587b27c..ff39aad 100644 --- a/cpp/src/arrow/python/type_traits.h +++ b/cpp/src/arrow/python/type_traits.h @@ -34,6 +34,9 @@ namespace arrow { namespace py { namespace internal { +// +// Type traits for Numpy -> Arrow equivalence +// template <int TYPE> struct npy_traits {}; @@ -68,7 +71,11 @@ NPY_INT_DECL(UINT16, UInt16, uint16_t); NPY_INT_DECL(UINT32, UInt32, uint32_t); NPY_INT_DECL(UINT64, UInt64, uint64_t); -#if NPY_INT64 != NPY_LONGLONG +#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32 +NPY_INT_DECL(INT, Int32, int32_t); +NPY_INT_DECL(UINT, UInt32, uint32_t); +#endif +#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64 NPY_INT_DECL(LONGLONG, Int64, int64_t); NPY_INT_DECL(ULONGLONG, UInt64, uint64_t); #endif @@ -127,8 +134,14 @@ template <> struct npy_traits<NPY_OBJECT> { typedef PyObject* value_type; static constexpr bool supports_nulls = true; + + static inline bool isnull(PyObject* v) { return v == Py_None; } }; +// +// Type traits for Arrow -> Numpy equivalence +// Note *supports_nulls* means the equivalent Numpy type support nulls +// template <int TYPE> struct arrow_traits {}; @@ -252,30 +265,27 @@ struct arrow_traits<Type::BINARY> { static inline int NumPyTypeSize(int npy_type) { switch (npy_type) { case NPY_BOOL: - return 1; case NPY_INT8: - return 1; - case NPY_INT16: - return 2; - case NPY_INT32: - return 4; - case NPY_INT64: - return 8; -#if (NPY_INT64 != NPY_LONGLONG) - case NPY_LONGLONG: - return 8; -#endif case NPY_UINT8: return 1; + case NPY_INT16: case NPY_UINT16: return 2; + case NPY_INT32: case NPY_UINT32: return 4; + case NPY_INT64: case NPY_UINT64: return 8; -#if (NPY_UINT64 != NPY_ULONGLONG) +#if !NPY_INT32_IS_INT + case NPY_INT: + case NPY_UINT: + return NPY_BITSOF_INT / 8; +#endif +#if !NPY_INT64_IS_LONG_LONG + case NPY_LONGLONG: case NPY_ULONGLONG: - return 8; + return NPY_BITSOF_LONGLONG / 8; #endif case NPY_FLOAT16: return 2; diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 7929135..48db204 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -502,6 +502,14 @@ class TestConvertPrimitiveTypes(object): result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) + def test_float_nulls_to_ints(self): + # ARROW-2135 + df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]}) + schema = pa.schema([pa.field("a", pa.int16(), nullable=True)]) + table = pa.Table.from_pandas(df, schema=schema) + assert table[0].to_pylist() == [1, 2, None] + tm.assert_frame_equal(df, table.to_pandas()) + def test_integer_no_nulls(self): data = OrderedDict() fields = [] @@ -527,6 +535,17 @@ class TestConvertPrimitiveTypes(object): schema = pa.schema(fields) _check_pandas_roundtrip(df, expected_schema=schema) + def test_all_integer_types(self): + # Test all Numpy integer aliases + data = OrderedDict() + numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', + 'byte', 'ubyte', 'short', 'ushort', 'intc', 'uintc', + 'int_', 'uint', 'longlong', 'ulonglong'] + for dtype in numpy_dtypes: + data[dtype] = np.arange(12, dtype=dtype) + df = pd.DataFrame(data) + _check_pandas_roundtrip(df) + def test_integer_with_nulls(self): # pandas requires upcast to float dtype -- To stop receiving notification emails like this one, please contact w...@apache.org.