This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8f2ff30 ARROW-2240: [Python] Array initialization with leading numpy
nan fails with exception
8f2ff30 is described below
commit 8f2ff30664f3a92163801a84bf0be1f4499fcfc3
Author: Phillip Cloud <[email protected]>
AuthorDate: Mon Mar 12 14:00:14 2018 -0400
ARROW-2240: [Python] Array initialization with leading numpy nan fails with
exception
Author: Phillip Cloud <[email protected]>
Closes #1686 from cpcloud/ARROW-2240 and squashes the following commits:
2da8ed31 <Phillip Cloud> Remove duplicated isnan logic
d2f833fd <Phillip Cloud> Remove redundant test
70837fbd <Phillip Cloud> Formatting
bdc5f962 <Phillip Cloud> Resolve conflicts
0a576267 <Phillip Cloud> Python 2, gross
62018f9b <Phillip Cloud> ARROW-2240: Array initialization with leading
numpy nan fails with exception
---
cpp/src/arrow/python/builtin_convert.cc | 17 +++++++--------
cpp/src/arrow/python/helpers.cc | 13 ++++++++----
cpp/src/arrow/python/helpers.h | 5 ++++-
cpp/src/arrow/python/numpy_to_arrow.cc | 37 ++++++++++++++-------------------
cpp/src/arrow/python/python-test.cc | 15 +++++++++++++
5 files changed, 52 insertions(+), 35 deletions(-)
diff --git a/cpp/src/arrow/python/builtin_convert.cc
b/cpp/src/arrow/python/builtin_convert.cc
index d2f900f..595499d 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -88,7 +88,7 @@ class ScalarVisitor {
Status Visit(PyObject* obj) {
++total_count_;
- if (obj == Py_None) {
+ if (obj == Py_None || internal::PyFloat_IsNaN(obj)) {
++none_count_;
} else if (PyBool_Check(obj)) {
++bool_count_;
@@ -412,9 +412,10 @@ class TypedConverterVisitor : public
TypedConverter<BuilderType> {
RETURN_NOT_OK(this->typed_builder_->Reserve(size));
// Iterate over the items adding each one
if (PySequence_Check(obj)) {
+ auto self = static_cast<Derived*>(this);
for (int64_t i = 0; i < size; ++i) {
OwnedRef ref(PySequence_GetItem(obj, i));
- RETURN_NOT_OK(static_cast<Derived*>(this)->AppendSingle(ref.obj()));
+ RETURN_NOT_OK(self->AppendSingle(ref.obj()));
}
} else {
return Status::TypeError("Object is not a sequence");
@@ -424,7 +425,8 @@ class TypedConverterVisitor : public
TypedConverter<BuilderType> {
// Append a missing item (default implementation)
Status AppendNull() { return this->typed_builder_->AppendNull(); }
- bool IsNull(PyObject* obj) const { return obj == Py_None; }
+
+ bool IsNull(PyObject* obj) const { return internal::PandasObjectIsNull(obj);
}
};
class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter>
{
@@ -438,7 +440,9 @@ class NullConverter : public
TypedConverterVisitor<NullBuilder, NullConverter> {
class BoolConverter : public TypedConverterVisitor<BooleanBuilder,
BoolConverter> {
public:
// Append a non-missing item
- Status AppendItem(PyObject* obj) { return typed_builder_->Append(obj ==
Py_True); }
+ Status AppendItem(PyObject* obj) {
+ return typed_builder_->Append(PyObject_IsTrue(obj) == 1);
+ }
};
class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter>
{
@@ -851,11 +855,6 @@ class DecimalConverter
RETURN_NOT_OK(internal::DecimalFromPythonDecimal(obj, type, &value));
return typed_builder_->Append(value);
}
-
- bool IsNull(PyObject* obj) const {
- return obj == Py_None || obj == numpy_nan || internal::PyFloat_isnan(obj)
||
- (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
- }
};
// Dynamic constructor for sequence converters
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 13dcc46..5719af6 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -226,10 +226,6 @@ Status UInt64FromPythonInt(PyObject* obj, uint64_t* out) {
return Status::OK();
}
-bool PyFloat_isnan(PyObject* obj) {
- return PyFloat_Check(obj) && std::isnan(PyFloat_AS_DOUBLE(obj));
-}
-
bool PyDecimal_Check(PyObject* obj) {
// TODO(phillipc): Is this expensive?
OwnedRef Decimal;
@@ -283,6 +279,15 @@ Status DecimalMetadata::Update(PyObject* object) {
return Update(precision, scale);
}
+bool PyFloat_IsNaN(PyObject* obj) {
+ return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
+}
+
+bool PandasObjectIsNull(PyObject* obj) {
+ return obj == Py_None || obj == numpy_nan || PyFloat_IsNaN(obj) ||
+ (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
+}
+
} // namespace internal
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 6be0e49..b9f505a 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -82,8 +82,11 @@ Status DecimalFromPythonDecimal(PyObject* python_decimal,
const DecimalType& arr
// \brief Check whether obj is an integer, independent of Python versions.
bool IsPyInteger(PyObject* obj);
+// \brief Use pandas missing value semantics to check if a value is null
+bool PandasObjectIsNull(PyObject* obj);
+
// \brief Check whether obj is nan
-bool PyFloat_isnan(PyObject* obj);
+bool PyFloat_IsNaN(PyObject* obj);
// \brief Check whether obj is an instance of Decimal
bool PyDecimal_Check(PyObject* obj);
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc
b/cpp/src/arrow/python/numpy_to_arrow.cc
index 04a71c1..9e3534d 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -67,11 +67,6 @@ constexpr int64_t kBinaryMemoryLimit =
std::numeric_limits<int32_t>::max();
namespace {
-inline bool PandasObjectIsNull(PyObject* obj) {
- return obj == Py_None || obj == numpy_nan || internal::PyFloat_isnan(obj) ||
- (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
-}
-
inline bool PyObject_is_string(PyObject* obj) {
#if PY_MAJOR_VERSION >= 3
return PyUnicode_Check(obj) || PyBytes_Check(obj);
@@ -161,7 +156,7 @@ static Status AppendObjectBinaries(PyArrayObject* arr,
PyArrayObject* mask,
for (; offset < objects.size(); ++offset) {
OwnedRef tmp_obj;
obj = objects[offset];
- if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
+ if ((have_mask && mask_values[offset]) ||
internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
continue;
} else if (!PyBytes_Check(obj)) {
@@ -207,7 +202,7 @@ static Status AppendObjectStrings(PyArrayObject* arr,
PyArrayObject* mask, int64
for (; offset < objects.size(); ++offset) {
OwnedRef tmp_obj;
obj = objects[offset];
- if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
+ if ((have_mask && mask_values[offset]) ||
internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
continue;
} else if (PyUnicode_Check(obj)) {
@@ -256,7 +251,7 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject*
arr, PyArrayObject* mas
for (; offset < objects.size(); ++offset) {
OwnedRef tmp_obj;
obj = objects[offset];
- if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
+ if ((have_mask && mask_values[offset]) ||
internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
continue;
} else if (PyUnicode_Check(obj)) {
@@ -722,7 +717,7 @@ Status NumPyConverter::ConvertDates() {
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
obj = objects[i];
- if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+ if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder.AppendNull());
} else if (PyDate_CheckExact(obj)) {
RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj)));
@@ -770,7 +765,7 @@ Status NumPyConverter::ConvertDecimals() {
RETURN_IF_PYERROR();
}
- if (PandasObjectIsNull(object)) {
+ if (internal::PandasObjectIsNull(object)) {
RETURN_NOT_OK(builder.AppendNull());
} else {
Decimal128 value;
@@ -798,7 +793,7 @@ Status NumPyConverter::ConvertDateTimes() {
if (PyDateTime_Check(obj)) {
RETURN_NOT_OK(
builder.Append(PyDateTime_to_us(reinterpret_cast<PyDateTime_DateTime*>(obj))));
- } else if (PandasObjectIsNull(obj)) {
+ } else if (internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder.AppendNull());
} else {
std::stringstream ss;
@@ -826,7 +821,7 @@ Status NumPyConverter::ConvertTimes() {
obj = objects[i];
if (PyTime_Check(obj)) {
RETURN_NOT_OK(builder.Append(PyTime_to_us(obj)));
- } else if (PandasObjectIsNull(obj)) {
+ } else if (internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder.AppendNull());
} else {
std::stringstream ss;
@@ -895,7 +890,7 @@ Status NumPyConverter::ConvertObjectFloats() {
PyObject* obj;
for (int64_t i = 0; i < objects.size(); ++i) {
obj = objects[i];
- if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+ if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder.AppendNull());
} else if (PyFloat_Check(obj)) {
double val = PyFloat_AsDouble(obj);
@@ -930,7 +925,7 @@ Status NumPyConverter::ConvertObjectIntegers() {
PyObject* obj;
for (int64_t i = 0; i < objects.size(); ++i) {
obj = objects[i];
- if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+ if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder.AppendNull());
} else if (PyObject_is_integer(obj)) {
const int64_t val = static_cast<int64_t>(PyLong_AsLong(obj));
@@ -999,7 +994,7 @@ Status NumPyConverter::ConvertBooleans() {
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
obj = objects[i];
- if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+ if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
++null_count;
} else if (obj == Py_True) {
BitUtil::SetBit(bitmap, i);
@@ -1028,7 +1023,7 @@ Status NumPyConverter::ConvertObjectsInfer() {
for (int64_t i = 0; i < length_; ++i) {
PyObject* obj = objects[i];
- if (PandasObjectIsNull(obj)) {
+ if (internal::PandasObjectIsNull(obj)) {
continue;
} else if (PyObject_is_string(obj)) {
return ConvertObjectStrings();
@@ -1221,7 +1216,7 @@ inline Status NumPyConverter::ConvertTypedLists(const
std::shared_ptr<DataType>&
BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder());
auto foreach_item = [&](PyObject* object, bool mask) {
- if (mask || PandasObjectIsNull(object)) {
+ if (mask || internal::PandasObjectIsNull(object)) {
return builder->AppendNull();
} else if (PyArray_Check(object)) {
auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1266,7 +1261,7 @@ inline Status
NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
auto value_builder = static_cast<NullBuilder*>(builder->value_builder());
auto foreach_item = [&](PyObject* object, bool mask) {
- if (mask || PandasObjectIsNull(object)) {
+ if (mask || internal::PandasObjectIsNull(object)) {
return builder->AppendNull();
} else if (PyArray_Check(object)) {
auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1312,7 +1307,7 @@ inline Status
NumPyConverter::ConvertTypedLists<NPY_OBJECT, BinaryType>(
auto value_builder = static_cast<BinaryBuilder*>(builder->value_builder());
auto foreach_item = [&](PyObject* object, bool mask) {
- if (mask || PandasObjectIsNull(object)) {
+ if (mask || internal::PandasObjectIsNull(object)) {
return builder->AppendNull();
} else if (PyArray_Check(object)) {
auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1365,7 +1360,7 @@ inline Status
NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
auto value_builder = static_cast<StringBuilder*>(builder->value_builder());
auto foreach_item = [&](PyObject* object, bool mask) {
- if (mask || PandasObjectIsNull(object)) {
+ if (mask || internal::PandasObjectIsNull(object)) {
return builder->AppendNull();
} else if (PyArray_Check(object)) {
auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
@@ -1428,7 +1423,7 @@ Status NumPyConverter::ConvertLists(const
std::shared_ptr<DataType>& type,
auto value_builder = static_cast<ListBuilder*>(builder->value_builder());
auto foreach_item = [this, &builder, &value_builder,
&list_type](PyObject* object) {
- if (PandasObjectIsNull(object)) {
+ if (internal::PandasObjectIsNull(object)) {
return builder->AppendNull();
} else {
RETURN_NOT_OK(builder->Append(true));
diff --git a/cpp/src/arrow/python/python-test.cc
b/cpp/src/arrow/python/python-test.cc
index 16ac1e3..c18b159 100644
--- a/cpp/src/arrow/python/python-test.cc
+++ b/cpp/src/arrow/python/python-test.cc
@@ -367,5 +367,20 @@ TEST_F(DecimalTest, UpdateWithNaN) {
ASSERT_EQ(std::numeric_limits<int32_t>::min(), metadata.scale());
}
+TEST(PythonTest, ConstructStringArrayWithLeadingZeros) {
+ PyAcquireGIL lock;
+
+ OwnedRef list_ref(PyList_New(2));
+ PyObject* list = list_ref.obj();
+ std::string str("str");
+
+ ASSERT_EQ(0, PyList_SetItem(list, 0, PyFloat_FromDouble(NAN)));
+ ASSERT_EQ(0, PyList_SetItem(list, 1, PyUnicode_FromString(str.c_str())));
+
+ std::shared_ptr<Array> out;
+ auto pool = default_memory_pool();
+ ASSERT_OK(ConvertPySequence(list, pool, &out));
+}
+
} // namespace py
} // namespace arrow
--
To stop receiving notification emails like this one, please contact
[email protected].