[arrow] branch master updated: ARROW-2135: [Python] Fix NaN conversion when casting from Numpy array

wesm Mon, 12 Mar 2018 12:03:59 -0700

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 171340f  ARROW-2135: [Python] Fix NaN conversion when casting from 
Numpy array
171340f is described below

commit 171340fdb05fc1d281f50c2c49b272ab1b908f6a
Author: Antoine Pitrou <anto...@python.org>
AuthorDate: Mon Mar 12 15:03:14 2018 -0400

    ARROW-2135: [Python] Fix NaN conversion when casting from Numpy array
    
    Author: Antoine Pitrou <anto...@python.org>
    
    Closes #1681 from pitrou/ARROW-2135-nan-conversion-when-casting and 
squashes the following commits:
    
    939428dd <Antoine Pitrou> ARROW-2135:  Fix NaN conversion when casting from 
Numpy array
---
 cpp/src/arrow/python/numpy-internal.h       |  54 ++++++++++-
 cpp/src/arrow/python/numpy_interop.h        |  25 +++++
 cpp/src/arrow/python/numpy_to_arrow.cc      | 142 ++++++++++++++++++----------
 cpp/src/arrow/python/type_traits.h          |  40 +++++---
 python/pyarrow/tests/test_convert_pandas.py |  19 ++++
 5 files changed, 214 insertions(+), 66 deletions(-)

diff --git a/cpp/src/arrow/python/numpy-internal.h 
b/cpp/src/arrow/python/numpy-internal.h
index 8d43080..7672861 100644
--- a/cpp/src/arrow/python/numpy-internal.h
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -68,6 +68,9 @@ class Ndarray1DIndexer {
   int64_t stride_;
 };
 
+// Handling of Numpy Types by their static numbers
+// (the NPY_TYPES enum and related defines)
+
 static inline std::string GetNumPyTypeName(int npy_type) {
 #define TYPE_CASE(TYPE, NAME) \
   case NPY_##TYPE:            \
@@ -79,14 +82,20 @@ static inline std::string GetNumPyTypeName(int npy_type) {
     TYPE_CASE(INT16, "int16")
     TYPE_CASE(INT32, "int32")
     TYPE_CASE(INT64, "int64")
-#if (NPY_INT64 != NPY_LONGLONG)
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(INT, "intc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
     TYPE_CASE(LONGLONG, "longlong")
 #endif
     TYPE_CASE(UINT8, "uint8")
     TYPE_CASE(UINT16, "uint16")
     TYPE_CASE(UINT32, "uint32")
     TYPE_CASE(UINT64, "uint64")
-#if (NPY_UINT64 != NPY_ULONGLONG)
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(UINT, "uintc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
     TYPE_CASE(ULONGLONG, "ulonglong")
 #endif
     TYPE_CASE(FLOAT16, "float16")
@@ -100,9 +109,48 @@ static inline std::string GetNumPyTypeName(int npy_type) {
   }
 
 #undef TYPE_CASE
-  return "unrecognized type in GetNumPyTypeName";
+  std::stringstream ss;
+  ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName";
+  return ss.str();
 }
 
+#define TYPE_VISIT_INLINE(TYPE) \
+  case NPY_##TYPE:              \
+    return visitor->template Visit<NPY_##TYPE>(arr);
+
+template <typename VISITOR>
+inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
+  switch (PyArray_TYPE(arr)) {
+    TYPE_VISIT_INLINE(BOOL);
+    TYPE_VISIT_INLINE(INT8);
+    TYPE_VISIT_INLINE(UINT8);
+    TYPE_VISIT_INLINE(INT16);
+    TYPE_VISIT_INLINE(UINT16);
+    TYPE_VISIT_INLINE(INT32);
+    TYPE_VISIT_INLINE(UINT32);
+    TYPE_VISIT_INLINE(INT64);
+    TYPE_VISIT_INLINE(UINT64);
+#if !NPY_INT32_IS_INT
+    TYPE_VISIT_INLINE(INT);
+    TYPE_VISIT_INLINE(UINT);
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_VISIT_INLINE(LONGLONG);
+    TYPE_VISIT_INLINE(ULONGLONG);
+#endif
+    TYPE_VISIT_INLINE(FLOAT16);
+    TYPE_VISIT_INLINE(FLOAT32);
+    TYPE_VISIT_INLINE(FLOAT64);
+    TYPE_VISIT_INLINE(DATETIME);
+    TYPE_VISIT_INLINE(OBJECT);
+  }
+  std::stringstream ss;
+  ss << "NumPy type not implemented: " << GetNumPyTypeName(PyArray_TYPE(arr));
+  return Status::NotImplemented(ss.str());
+}
+
+#undef TYPE_VISIT_INLINE
+
 }  // namespace py
 }  // namespace arrow
 
diff --git a/cpp/src/arrow/python/numpy_interop.h 
b/cpp/src/arrow/python/numpy_interop.h
index 8c569e2..0715c66 100644
--- a/cpp/src/arrow/python/numpy_interop.h
+++ b/cpp/src/arrow/python/numpy_interop.h
@@ -43,6 +43,31 @@
 #include <numpy/arrayscalars.h>
 #include <numpy/ufuncobject.h>
 
+// A bit subtle. Numpy has 5 canonical integer types:
+// (or, rather, type pairs: signed and unsigned)
+//   NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG
+// It also has 4 fixed-width integer aliases.
+// When mapping Arrow integer types to these 4 fixed-width aliases,
+// we always miss one of the canonical types (even though it may
+// have the same width as one of the aliases).
+// Which one depends on the platform...
+// On a LP64 system, NPY_INT64 maps to NPY_LONG and
+// NPY_LONGLONG needs to be handled separately.
+// On a LLP64 system, NPY_INT32 maps to NPY_LONG and
+// NPY_INT needs to be handled separately.
+
+#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
+#define NPY_INT64_IS_LONG_LONG 1
+#else
+#define NPY_INT64_IS_LONG_LONG 0
+#endif
+
+#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
+#define NPY_INT32_IS_INT 1
+#else
+#define NPY_INT32_IS_INT 0
+#endif
+
 namespace arrow {
 namespace py {
 
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index 9e3534d..c22afb7 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -79,6 +79,38 @@ inline bool PyObject_is_integer(PyObject* obj) {
   return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj);
 }
 
+Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
+  if (PyArray_NDIM(numpy_array) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+
+  const int received_type = PyArray_DESCR(numpy_array)->type_num;
+  if (received_type != np_type) {
+    std::stringstream ss;
+    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " 
but got "
+       << GetNumPyTypeName(received_type);
+    return Status::Invalid(ss.str());
+  }
+
+  return Status::OK();
+}
+
+Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
+                          std::shared_ptr<ResizableBuffer>* out) {
+  int64_t null_bytes = BitUtil::BytesForBits(length);
+  std::shared_ptr<ResizableBuffer> null_bitmap;
+
+  null_bitmap = std::make_shared<PoolBuffer>(pool);
+  RETURN_NOT_OK(null_bitmap->Resize(null_bytes));
+
+  memset(null_bitmap->mutable_data(), 0, static_cast<size_t>(null_bytes));
+  *out = null_bitmap;
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Conversion from NumPy-in-Pandas to Arrow null bitmap
+
 template <int TYPE>
 inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   typedef internal::npy_traits<TYPE> traits;
@@ -98,6 +130,55 @@ inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* 
bitmap) {
   return null_count;
 }
 
+class NumPyNullsConverter {
+ public:
+  /// Convert the given array's null values to a null bitmap.
+  /// The null bitmap is only allocated if null values are ever possible.
+  static Status Convert(MemoryPool* pool, PyArrayObject* arr,
+                        bool use_pandas_null_sentinels,
+                        std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
+                        int64_t* out_null_count) {
+    NumPyNullsConverter converter(pool, arr, use_pandas_null_sentinels);
+    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+    *out_null_bitmap_ = converter.null_bitmap_;
+    *out_null_count = converter.null_count_;
+    return Status::OK();
+  }
+
+  template <int TYPE>
+  Status Visit(PyArrayObject* arr) {
+    typedef internal::npy_traits<TYPE> traits;
+
+    const bool null_sentinels_possible =
+        // Always treat Numpy's NaT as null
+        TYPE == NPY_DATETIME ||
+        // Observing pandas's null sentinels
+        (use_pandas_null_sentinels_ && traits::supports_nulls);
+
+    if (null_sentinels_possible) {
+      RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), 
&null_bitmap_));
+      null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
+    }
+    return Status::OK();
+  }
+
+ protected:
+  NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr,
+                      bool use_pandas_null_sentinels)
+      : pool_(pool),
+        arr_(arr),
+        use_pandas_null_sentinels_(use_pandas_null_sentinels),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {}
+
+  MemoryPool* pool_;
+  PyArrayObject* arr_;
+  bool use_pandas_null_sentinels_;
+  std::shared_ptr<ResizableBuffer> null_bitmap_;
+  uint8_t* null_bitmap_data_;
+  int64_t null_count_;
+};
+
 // Returns null count
 int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
   int64_t null_count = 0;
@@ -114,22 +195,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, 
uint8_t* bitmap) {
   return null_count;
 }
 
-Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
-  if (PyArray_NDIM(numpy_array) != 1) {
-    return Status::Invalid("only handle 1-dimensional arrays");
-  }
-
-  const int received_type = PyArray_DESCR(numpy_array)->type_num;
-  if (received_type != np_type) {
-    std::stringstream ss;
-    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " 
but got "
-       << GetNumPyTypeName(received_type);
-    return Status::Invalid(ss.str());
-  }
-
-  return Status::OK();
-}
-
 }  // namespace
 
 /// Append as many string objects from NumPy arrays to a `StringBuilder` as we
@@ -296,7 +361,9 @@ class NumPyConverter {
         dtype_(PyArray_DESCR(arr_)),
         mask_(nullptr),
         use_pandas_null_sentinels_(use_pandas_null_sentinels),
-        decimal_type_() {
+        decimal_type_(),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {
     if (mo != nullptr && mo != Py_None) {
       mask_ = reinterpret_cast<PyArrayObject*>(mo);
     }
@@ -351,14 +418,8 @@ class NumPyConverter {
 
  protected:
   Status InitNullBitmap() {
-    int64_t null_bytes = BitUtil::BytesForBits(length_);
-
-    null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
-    RETURN_NOT_OK(null_bitmap_->Resize(null_bytes));
-
+    RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
     null_bitmap_data_ = null_bitmap_->mutable_data();
-    memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes));
-
     return Status::OK();
   }
 
@@ -409,32 +470,18 @@ class NumPyConverter {
 
   template <typename ArrowType>
   Status VisitNative() {
-    using traits = internal::arrow_traits<ArrowType::type_id>;
-
-    const bool null_sentinels_possible =
-        // NumPy has a NaT type
-        (ArrowType::type_id == Type::TIMESTAMP || ArrowType::type_id == 
Type::DATE32) ||
-
-        // Observing pandas's null sentinels
-        ((use_pandas_null_sentinels_ && traits::supports_nulls));
-
-    if (mask_ != nullptr || null_sentinels_possible) {
+    if (mask_ != nullptr) {
       RETURN_NOT_OK(InitNullBitmap());
+      null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
+    } else {
+      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, 
use_pandas_null_sentinels_,
+                                                 &null_bitmap_, &null_count_));
     }
 
     std::shared_ptr<Buffer> data;
     RETURN_NOT_OK(ConvertData<ArrowType>(&data));
 
-    int64_t null_count = 0;
-    if (mask_ != nullptr) {
-      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
-    } else if (null_sentinels_possible) {
-      // TODO(wesm): this presumes the NumPy C type and arrow C type are the
-      // same
-      null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
-    }
-
-    auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, 
null_count, 0);
+    auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, 
null_count_, 0);
     return PushArray(arr_data);
   }
 
@@ -488,6 +535,7 @@ class NumPyConverter {
 
   std::shared_ptr<ResizableBuffer> null_bitmap_;
   uint8_t* null_bitmap_data_;
+  int64_t null_count_;
 };
 
 Status NumPyConverter::Convert() {
@@ -654,12 +702,10 @@ inline Status 
NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
       Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, 
data);
       RETURN_NOT_OK(s);
     } else {
-      // TODO(wesm): This is redundant, and recomputed in VisitNative()
-      const int64_t null_count = ValuesToBitmap<NPY_DATETIME>(arr_, 
null_bitmap_data_);
-
       RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), 
&input_type));
       if (!input_type->Equals(*type_)) {
-        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count,
+        // The null bitmap was already computed in VisitNative()
+        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count_,
                                  type_, pool_, data));
       }
     }
diff --git a/cpp/src/arrow/python/type_traits.h 
b/cpp/src/arrow/python/type_traits.h
index 587b27c..ff39aad 100644
--- a/cpp/src/arrow/python/type_traits.h
+++ b/cpp/src/arrow/python/type_traits.h
@@ -34,6 +34,9 @@ namespace arrow {
 namespace py {
 namespace internal {
 
+//
+// Type traits for Numpy -> Arrow equivalence
+//
 template <int TYPE>
 struct npy_traits {};
 
@@ -68,7 +71,11 @@ NPY_INT_DECL(UINT16, UInt16, uint16_t);
 NPY_INT_DECL(UINT32, UInt32, uint32_t);
 NPY_INT_DECL(UINT64, UInt64, uint64_t);
 
-#if NPY_INT64 != NPY_LONGLONG
+#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
+NPY_INT_DECL(INT, Int32, int32_t);
+NPY_INT_DECL(UINT, UInt32, uint32_t);
+#endif
+#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
 NPY_INT_DECL(LONGLONG, Int64, int64_t);
 NPY_INT_DECL(ULONGLONG, UInt64, uint64_t);
 #endif
@@ -127,8 +134,14 @@ template <>
 struct npy_traits<NPY_OBJECT> {
   typedef PyObject* value_type;
   static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(PyObject* v) { return v == Py_None; }
 };
 
+//
+// Type traits for Arrow -> Numpy equivalence
+// Note *supports_nulls* means the equivalent Numpy type support nulls
+//
 template <int TYPE>
 struct arrow_traits {};
 
@@ -252,30 +265,27 @@ struct arrow_traits<Type::BINARY> {
 static inline int NumPyTypeSize(int npy_type) {
   switch (npy_type) {
     case NPY_BOOL:
-      return 1;
     case NPY_INT8:
-      return 1;
-    case NPY_INT16:
-      return 2;
-    case NPY_INT32:
-      return 4;
-    case NPY_INT64:
-      return 8;
-#if (NPY_INT64 != NPY_LONGLONG)
-    case NPY_LONGLONG:
-      return 8;
-#endif
     case NPY_UINT8:
       return 1;
+    case NPY_INT16:
     case NPY_UINT16:
       return 2;
+    case NPY_INT32:
     case NPY_UINT32:
       return 4;
+    case NPY_INT64:
     case NPY_UINT64:
       return 8;
-#if (NPY_UINT64 != NPY_ULONGLONG)
+#if !NPY_INT32_IS_INT
+    case NPY_INT:
+    case NPY_UINT:
+      return NPY_BITSOF_INT / 8;
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    case NPY_LONGLONG:
     case NPY_ULONGLONG:
-      return 8;
+      return NPY_BITSOF_LONGLONG / 8;
 #endif
     case NPY_FLOAT16:
       return 2;
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 7929135..48db204 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -502,6 +502,14 @@ class TestConvertPrimitiveTypes(object):
         result = table.to_pandas()
         tm.assert_frame_equal(result, ex_frame)
 
+    def test_float_nulls_to_ints(self):
+        # ARROW-2135
+        df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]})
+        schema = pa.schema([pa.field("a", pa.int16(), nullable=True)])
+        table = pa.Table.from_pandas(df, schema=schema)
+        assert table[0].to_pylist() == [1, 2, None]
+        tm.assert_frame_equal(df, table.to_pandas())
+
     def test_integer_no_nulls(self):
         data = OrderedDict()
         fields = []
@@ -527,6 +535,17 @@ class TestConvertPrimitiveTypes(object):
         schema = pa.schema(fields)
         _check_pandas_roundtrip(df, expected_schema=schema)
 
+    def test_all_integer_types(self):
+        # Test all Numpy integer aliases
+        data = OrderedDict()
+        numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
+                        'byte', 'ubyte', 'short', 'ushort', 'intc', 'uintc',
+                        'int_', 'uint', 'longlong', 'ulonglong']
+        for dtype in numpy_dtypes:
+            data[dtype] = np.arange(12, dtype=dtype)
+        df = pd.DataFrame(data)
+        _check_pandas_roundtrip(df)
+
     def test_integer_with_nulls(self):
         # pandas requires upcast to float dtype
 

-- 
To stop receiving notification emails like this one, please contact
w...@apache.org.

[arrow] branch master updated: ARROW-2135: [Python] Fix NaN conversion when casting from Numpy array

Reply via email to