This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8f2ff30  ARROW-2240: [Python] Array initialization with leading numpy 
nan fails with exception
8f2ff30 is described below

commit 8f2ff30664f3a92163801a84bf0be1f4499fcfc3
Author: Phillip Cloud <[email protected]>
AuthorDate: Mon Mar 12 14:00:14 2018 -0400

    ARROW-2240: [Python] Array initialization with leading numpy nan fails with 
exception
    
    Author: Phillip Cloud <[email protected]>
    
    Closes #1686 from cpcloud/ARROW-2240 and squashes the following commits:
    
    2da8ed31 <Phillip Cloud> Remove duplicated isnan logic
    d2f833fd <Phillip Cloud> Remove redundant test
    70837fbd <Phillip Cloud> Formatting
    bdc5f962 <Phillip Cloud> Resolve conflicts
    0a576267 <Phillip Cloud> Python 2, gross
    62018f9b <Phillip Cloud> ARROW-2240:  Array initialization with leading 
numpy nan fails with exception
---
 cpp/src/arrow/python/builtin_convert.cc | 17 +++++++--------
 cpp/src/arrow/python/helpers.cc         | 13 ++++++++----
 cpp/src/arrow/python/helpers.h          |  5 ++++-
 cpp/src/arrow/python/numpy_to_arrow.cc  | 37 ++++++++++++++-------------------
 cpp/src/arrow/python/python-test.cc     | 15 +++++++++++++
 5 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/cpp/src/arrow/python/builtin_convert.cc 
b/cpp/src/arrow/python/builtin_convert.cc
index d2f900f..595499d 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -88,7 +88,7 @@ class ScalarVisitor {
 
   Status Visit(PyObject* obj) {
     ++total_count_;
-    if (obj == Py_None) {
+    if (obj == Py_None || internal::PyFloat_IsNaN(obj)) {
       ++none_count_;
     } else if (PyBool_Check(obj)) {
       ++bool_count_;
@@ -412,9 +412,10 @@ class TypedConverterVisitor : public 
TypedConverter<BuilderType> {
     RETURN_NOT_OK(this->typed_builder_->Reserve(size));
     // Iterate over the items adding each one
     if (PySequence_Check(obj)) {
+      auto self = static_cast<Derived*>(this);
       for (int64_t i = 0; i < size; ++i) {
         OwnedRef ref(PySequence_GetItem(obj, i));
-        RETURN_NOT_OK(static_cast<Derived*>(this)->AppendSingle(ref.obj()));
+        RETURN_NOT_OK(self->AppendSingle(ref.obj()));
       }
     } else {
       return Status::TypeError("Object is not a sequence");
@@ -424,7 +425,8 @@ class TypedConverterVisitor : public 
TypedConverter<BuilderType> {
 
   // Append a missing item (default implementation)
   Status AppendNull() { return this->typed_builder_->AppendNull(); }
-  bool IsNull(PyObject* obj) const { return obj == Py_None; }
+
+  bool IsNull(PyObject* obj) const { return internal::PandasObjectIsNull(obj); 
}
 };
 
 class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter> 
{
@@ -438,7 +440,9 @@ class NullConverter : public 
TypedConverterVisitor<NullBuilder, NullConverter> {
 class BoolConverter : public TypedConverterVisitor<BooleanBuilder, 
BoolConverter> {
  public:
   // Append a non-missing item
-  Status AppendItem(PyObject* obj) { return typed_builder_->Append(obj == 
Py_True); }
+  Status AppendItem(PyObject* obj) {
+    return typed_builder_->Append(PyObject_IsTrue(obj) == 1);
+  }
 };
 
 class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter> 
{
@@ -851,11 +855,6 @@ class DecimalConverter
     RETURN_NOT_OK(internal::DecimalFromPythonDecimal(obj, type, &value));
     return typed_builder_->Append(value);
   }
-
-  bool IsNull(PyObject* obj) const {
-    return obj == Py_None || obj == numpy_nan || internal::PyFloat_isnan(obj) 
||
-           (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
-  }
 };
 
 // Dynamic constructor for sequence converters
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 13dcc46..5719af6 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -226,10 +226,6 @@ Status UInt64FromPythonInt(PyObject* obj, uint64_t* out) {
   return Status::OK();
 }
 
-bool PyFloat_isnan(PyObject* obj) {
-  return PyFloat_Check(obj) && std::isnan(PyFloat_AS_DOUBLE(obj));
-}
-
 bool PyDecimal_Check(PyObject* obj) {
   // TODO(phillipc): Is this expensive?
   OwnedRef Decimal;
@@ -283,6 +279,15 @@ Status DecimalMetadata::Update(PyObject* object) {
   return Update(precision, scale);
 }
 
+bool PyFloat_IsNaN(PyObject* obj) {
+  return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
+}
+
+bool PandasObjectIsNull(PyObject* obj) {
+  return obj == Py_None || obj == numpy_nan || PyFloat_IsNaN(obj) ||
+         (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
+}
+
 }  // namespace internal
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 6be0e49..b9f505a 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -82,8 +82,11 @@ Status DecimalFromPythonDecimal(PyObject* python_decimal, 
const DecimalType& arr
 // \brief Check whether obj is an integer, independent of Python versions.
 bool IsPyInteger(PyObject* obj);
 
+// \brief Use pandas missing value semantics to check if a value is null
+bool PandasObjectIsNull(PyObject* obj);
+
 // \brief Check whether obj is nan
-bool PyFloat_isnan(PyObject* obj);
+bool PyFloat_IsNaN(PyObject* obj);
 
 // \brief Check whether obj is an instance of Decimal
 bool PyDecimal_Check(PyObject* obj);
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index 04a71c1..9e3534d 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -67,11 +67,6 @@ constexpr int64_t kBinaryMemoryLimit = 
std::numeric_limits<int32_t>::max();
 
 namespace {
 
-inline bool PandasObjectIsNull(PyObject* obj) {
-  return obj == Py_None || obj == numpy_nan || internal::PyFloat_isnan(obj) ||
-         (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
-}
-
 inline bool PyObject_is_string(PyObject* obj) {
 #if PY_MAJOR_VERSION >= 3
   return PyUnicode_Check(obj) || PyBytes_Check(obj);
@@ -161,7 +156,7 @@ static Status AppendObjectBinaries(PyArrayObject* arr, 
PyArrayObject* mask,
   for (; offset < objects.size(); ++offset) {
     OwnedRef tmp_obj;
     obj = objects[offset];
-    if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[offset]) || 
internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder->AppendNull());
       continue;
     } else if (!PyBytes_Check(obj)) {
@@ -207,7 +202,7 @@ static Status AppendObjectStrings(PyArrayObject* arr, 
PyArrayObject* mask, int64
   for (; offset < objects.size(); ++offset) {
     OwnedRef tmp_obj;
     obj = objects[offset];
-    if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[offset]) || 
internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder->AppendNull());
       continue;
     } else if (PyUnicode_Check(obj)) {
@@ -256,7 +251,7 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* 
arr, PyArrayObject* mas
   for (; offset < objects.size(); ++offset) {
     OwnedRef tmp_obj;
     obj = objects[offset];
-    if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[offset]) || 
internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder->AppendNull());
       continue;
     } else if (PyUnicode_Check(obj)) {
@@ -722,7 +717,7 @@ Status NumPyConverter::ConvertDates() {
   PyObject* obj;
   for (int64_t i = 0; i < length_; ++i) {
     obj = objects[i];
-    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder.AppendNull());
     } else if (PyDate_CheckExact(obj)) {
       RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj)));
@@ -770,7 +765,7 @@ Status NumPyConverter::ConvertDecimals() {
       RETURN_IF_PYERROR();
     }
 
-    if (PandasObjectIsNull(object)) {
+    if (internal::PandasObjectIsNull(object)) {
       RETURN_NOT_OK(builder.AppendNull());
     } else {
       Decimal128 value;
@@ -798,7 +793,7 @@ Status NumPyConverter::ConvertDateTimes() {
     if (PyDateTime_Check(obj)) {
       RETURN_NOT_OK(
           
builder.Append(PyDateTime_to_us(reinterpret_cast<PyDateTime_DateTime*>(obj))));
-    } else if (PandasObjectIsNull(obj)) {
+    } else if (internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder.AppendNull());
     } else {
       std::stringstream ss;
@@ -826,7 +821,7 @@ Status NumPyConverter::ConvertTimes() {
     obj = objects[i];
     if (PyTime_Check(obj)) {
       RETURN_NOT_OK(builder.Append(PyTime_to_us(obj)));
-    } else if (PandasObjectIsNull(obj)) {
+    } else if (internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder.AppendNull());
     } else {
       std::stringstream ss;
@@ -895,7 +890,7 @@ Status NumPyConverter::ConvertObjectFloats() {
   PyObject* obj;
   for (int64_t i = 0; i < objects.size(); ++i) {
     obj = objects[i];
-    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder.AppendNull());
     } else if (PyFloat_Check(obj)) {
       double val = PyFloat_AsDouble(obj);
@@ -930,7 +925,7 @@ Status NumPyConverter::ConvertObjectIntegers() {
   PyObject* obj;
   for (int64_t i = 0; i < objects.size(); ++i) {
     obj = objects[i];
-    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder.AppendNull());
     } else if (PyObject_is_integer(obj)) {
       const int64_t val = static_cast<int64_t>(PyLong_AsLong(obj));
@@ -999,7 +994,7 @@ Status NumPyConverter::ConvertBooleans() {
   PyObject* obj;
   for (int64_t i = 0; i < length_; ++i) {
     obj = objects[i];
-    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+    if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
       ++null_count;
     } else if (obj == Py_True) {
       BitUtil::SetBit(bitmap, i);
@@ -1028,7 +1023,7 @@ Status NumPyConverter::ConvertObjectsInfer() {
 
   for (int64_t i = 0; i < length_; ++i) {
     PyObject* obj = objects[i];
-    if (PandasObjectIsNull(obj)) {
+    if (internal::PandasObjectIsNull(obj)) {
       continue;
     } else if (PyObject_is_string(obj)) {
       return ConvertObjectStrings();
@@ -1221,7 +1216,7 @@ inline Status NumPyConverter::ConvertTypedLists(const 
std::shared_ptr<DataType>&
   BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder());
 
   auto foreach_item = [&](PyObject* object, bool mask) {
-    if (mask || PandasObjectIsNull(object)) {
+    if (mask || internal::PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1266,7 +1261,7 @@ inline Status 
NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
   auto value_builder = static_cast<NullBuilder*>(builder->value_builder());
 
   auto foreach_item = [&](PyObject* object, bool mask) {
-    if (mask || PandasObjectIsNull(object)) {
+    if (mask || internal::PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1312,7 +1307,7 @@ inline Status 
NumPyConverter::ConvertTypedLists<NPY_OBJECT, BinaryType>(
   auto value_builder = static_cast<BinaryBuilder*>(builder->value_builder());
 
   auto foreach_item = [&](PyObject* object, bool mask) {
-    if (mask || PandasObjectIsNull(object)) {
+    if (mask || internal::PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1365,7 +1360,7 @@ inline Status 
NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
   auto value_builder = static_cast<StringBuilder*>(builder->value_builder());
 
   auto foreach_item = [&](PyObject* object, bool mask) {
-    if (mask || PandasObjectIsNull(object)) {
+    if (mask || internal::PandasObjectIsNull(object)) {
       return builder->AppendNull();
     } else if (PyArray_Check(object)) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1428,7 +1423,7 @@ Status NumPyConverter::ConvertLists(const 
std::shared_ptr<DataType>& type,
       auto value_builder = static_cast<ListBuilder*>(builder->value_builder());
 
       auto foreach_item = [this, &builder, &value_builder, 
&list_type](PyObject* object) {
-        if (PandasObjectIsNull(object)) {
+        if (internal::PandasObjectIsNull(object)) {
           return builder->AppendNull();
         } else {
           RETURN_NOT_OK(builder->Append(true));
diff --git a/cpp/src/arrow/python/python-test.cc 
b/cpp/src/arrow/python/python-test.cc
index 16ac1e3..c18b159 100644
--- a/cpp/src/arrow/python/python-test.cc
+++ b/cpp/src/arrow/python/python-test.cc
@@ -367,5 +367,20 @@ TEST_F(DecimalTest, UpdateWithNaN) {
   ASSERT_EQ(std::numeric_limits<int32_t>::min(), metadata.scale());
 }
 
+TEST(PythonTest, ConstructStringArrayWithLeadingZeros) {
+  PyAcquireGIL lock;
+
+  OwnedRef list_ref(PyList_New(2));
+  PyObject* list = list_ref.obj();
+  std::string str("str");
+
+  ASSERT_EQ(0, PyList_SetItem(list, 0, PyFloat_FromDouble(NAN)));
+  ASSERT_EQ(0, PyList_SetItem(list, 1, PyUnicode_FromString(str.c_str())));
+
+  std::shared_ptr<Array> out;
+  auto pool = default_memory_pool();
+  ASSERT_OK(ConvertPySequence(list, pool, &out));
+}
+
 }  // namespace py
 }  // namespace arrow

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to