Repository: arrow Updated Branches: refs/heads/master 067cd4ebf -> d75d7a96c
ARROW-736: [Python] Mixed-type object DataFrame columns should not silently co⦠â¦erce to an Arrow type by default Author: Phillip Cloud <[email protected]> Closes #465 from cpcloud/ARROW-736 and squashes the following commits: fd09def [Phillip Cloud] Update cmake bcf6236 [Phillip Cloud] Rename and move 4a18014 [Phillip Cloud] Move test e80efe1 [Phillip Cloud] Use OwnedRef instead of horror b2df3e9 [Phillip Cloud] Fix python error handling and make compatible with python27 84d33b4 [Phillip Cloud] ARROW-736: Mixed-type object DataFrame columns should not silently coerce to an Arrow type by default Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/d75d7a96 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/d75d7a96 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/d75d7a96 Branch: refs/heads/master Commit: d75d7a96ca21bb2c1cfcf3bce8d09c2f24a5b8a6 Parents: 067cd4e Author: Phillip Cloud <[email protected]> Authored: Sat Apr 1 00:00:24 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Sat Apr 1 00:00:24 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/python/CMakeLists.txt | 2 +- cpp/src/arrow/python/pandas-test.cc | 69 ---------------- cpp/src/arrow/python/pandas_convert.cc | 52 ++++++++++-- cpp/src/arrow/python/pandas_convert.h | 4 + cpp/src/arrow/python/python-test.cc | 96 +++++++++++++++++++++++ python/pyarrow/tests/test_convert_builtin.py | 5 ++ python/pyarrow/tests/test_convert_pandas.py | 5 ++ 7 files changed, 156 insertions(+), 77 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/d75d7a96/cpp/src/arrow/python/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 03f5afc..faaad89 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -88,6 +88,6 @@ install(FILES # INSTALL_RPATH "\$ORIGIN") if (ARROW_BUILD_TESTS) - ADD_ARROW_TEST(pandas-test + ADD_ARROW_TEST(python-test STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}") endif() http://git-wip-us.apache.org/repos/asf/arrow/blob/d75d7a96/cpp/src/arrow/python/pandas-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas-test.cc b/cpp/src/arrow/python/pandas-test.cc deleted file mode 100644 index a4e640b..0000000 --- a/cpp/src/arrow/python/pandas-test.cc +++ /dev/null @@ -1,69 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "gtest/gtest.h" - -#include <cstdint> -#include <memory> -#include <string> -#include <vector> - -#include "arrow/array.h" -#include "arrow/builder.h" -#include "arrow/table.h" -#include "arrow/test-util.h" -#include "arrow/type.h" - -#include "arrow/python/common.h" -#include "arrow/python/pandas_convert.h" - -namespace arrow { -namespace py { - -TEST(PyBuffer, InvalidInputObject) { - PyBuffer buffer(Py_None); -} - -TEST(PandasConversionTest, TestObjectBlockWriteFails) { - StringBuilder builder(default_memory_pool()); - const char value[] = {'\xf1', '\0'}; - - for (int i = 0; i < 1000; ++i) { - builder.Append(value, static_cast<int32_t>(strlen(value))); - } - - std::shared_ptr<Array> arr; - ASSERT_OK(builder.Finish(&arr)); - - auto f1 = field("f1", utf8()); - auto f2 = field("f2", utf8()); - auto f3 = field("f3", utf8()); - std::vector<std::shared_ptr<Field>> fields = {f1, f2, f3}; - std::vector<std::shared_ptr<Column>> cols = {std::make_shared<Column>(f1, arr), - std::make_shared<Column>(f2, arr), std::make_shared<Column>(f3, arr)}; - - auto schema = std::make_shared<Schema>(fields); - auto table = std::make_shared<Table>(schema, cols); - - PyObject* out; - Py_BEGIN_ALLOW_THREADS; - ASSERT_RAISES(UnknownError, ConvertTableToPandas(table, 2, &out)); - Py_END_ALLOW_THREADS; -} - -} // namespace py -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/d75d7a96/cpp/src/arrow/python/pandas_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 68a8d7d..ae9b17c 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -46,6 +46,7 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" #include "arrow/util/macros.h" namespace arrow { @@ -167,8 +168,10 @@ Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder, *have_bytes = true; const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj)); RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length)); + } else if (PyObject_is_null(obj)) { + RETURN_NOT_OK(builder->AppendNull()); } else { - builder->AppendNull(); + return InvalidConversion(obj, "string or bytes"); } } @@ -197,8 +200,10 @@ static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width)); RETURN_NOT_OK( builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)))); + } else if (PyObject_is_null(obj)) { + RETURN_NOT_OK(builder->AppendNull()); } else { - builder->AppendNull(); + return InvalidConversion(obj, "string or bytes"); } } @@ -413,6 +418,32 @@ inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* return Status::OK(); } +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { + OwnedRef type(PyObject_Type(obj)); + RETURN_IF_PYERROR(); + DCHECK_NE(type.obj(), nullptr); + + OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__")); + RETURN_IF_PYERROR(); + DCHECK_NE(type_name.obj(), nullptr); + + OwnedRef bytes_obj(PyUnicode_AsUTF8String(type_name.obj())); + RETURN_IF_PYERROR(); + DCHECK_NE(bytes_obj.obj(), nullptr); + + Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj.obj()); + const char* bytes = PyBytes_AS_STRING(bytes_obj.obj()); + + DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null"; + + std::string cpp_type_name(bytes, size); + + std::stringstream ss; + ss << "Python object of type " << cpp_type_name << " is not None and is not a " + << expected_type_name << " object"; + return Status::TypeError(ss.str()); +} + Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) { PyAcquireGIL lock; @@ -427,8 +458,10 @@ Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) { if (PyDate_CheckExact(obj)) { PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(obj); date_builder.Append(PyDate_to_ms(pydate)); - } else { + } else if (PyObject_is_null(obj)) { date_builder.AppendNull(); + } else { + return InvalidConversion(obj, "date"); } } return date_builder.Finish(out); @@ -483,14 +516,18 @@ Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) { memset(bitmap, 0, nbytes); int64_t null_count = 0; + PyObject* obj; for (int64_t i = 0; i < length_; ++i) { - if (objects[i] == Py_True) { + obj = objects[i]; + if (obj == Py_True) { BitUtil::SetBit(bitmap, i); BitUtil::SetBit(null_bitmap_data_, i); - } else if (objects[i] != Py_False) { + } else if (obj == Py_False) { + BitUtil::SetBit(null_bitmap_data_, i); + } else if (PyObject_is_null(obj)) { ++null_count; } else { - BitUtil::SetBit(null_bitmap_data_, i); + return InvalidConversion(obj, "bool"); } } @@ -551,7 +588,8 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) { } else if (PyDate_CheckExact(objects[i])) { return ConvertDates(out); } else { - return Status::TypeError("unhandled python type"); + return InvalidConversion( + const_cast<PyObject*>(objects[i]), "string, bool, or date"); } } } http://git-wip-us.apache.org/repos/asf/arrow/blob/d75d7a96/cpp/src/arrow/python/pandas_convert.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.h b/cpp/src/arrow/python/pandas_convert.h index 12644d9..105c159 100644 --- a/cpp/src/arrow/python/pandas_convert.h +++ b/cpp/src/arrow/python/pandas_convert.h @@ -24,6 +24,7 @@ #include <Python.h> #include <memory> +#include <string> #include "arrow/util/visibility.h" @@ -73,6 +74,9 @@ ARROW_EXPORT Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out); +ARROW_EXPORT +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name); + } // namespace py } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/d75d7a96/cpp/src/arrow/python/python-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc new file mode 100644 index 0000000..01e30f5 --- /dev/null +++ b/cpp/src/arrow/python/python-test.cc @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" + +#include <memory> + +#include <Python.h> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/table.h" +#include "arrow/test-util.h" + +#include "arrow/python/common.h" +#include "arrow/python/pandas_convert.h" +#include "arrow/python/builtin_convert.h" + +namespace arrow { +namespace py { + +TEST(PyBuffer, InvalidInputObject) { + PyBuffer buffer(Py_None); +} + +TEST(PandasConversionTest, TestObjectBlockWriteFails) { + StringBuilder builder(default_memory_pool()); + const char value[] = {'\xf1', '\0'}; + + for (int i = 0; i < 1000; ++i) { + builder.Append(value, static_cast<int32_t>(strlen(value))); + } + + std::shared_ptr<Array> arr; + ASSERT_OK(builder.Finish(&arr)); + + auto f1 = field("f1", utf8()); + auto f2 = field("f2", utf8()); + auto f3 = field("f3", utf8()); + std::vector<std::shared_ptr<Field>> fields = {f1, f2, f3}; + std::vector<std::shared_ptr<Column>> cols = {std::make_shared<Column>(f1, arr), + std::make_shared<Column>(f2, arr), std::make_shared<Column>(f3, arr)}; + + auto schema = std::make_shared<Schema>(fields); + auto table = std::make_shared<Table>(schema, cols); + + PyObject* out; + Py_BEGIN_ALLOW_THREADS; + ASSERT_RAISES(UnknownError, ConvertTableToPandas(table, 2, &out)); + Py_END_ALLOW_THREADS; +} + +TEST(BuiltinConversionTest, TestMixedTypeFails) { + PyAcquireGIL lock; + MemoryPool* pool = default_memory_pool(); + std::shared_ptr<Array> arr; + + OwnedRef list_ref(PyList_New(3)); + PyObject* list = list_ref.obj(); + + ASSERT_NE(list, nullptr); + + PyObject* str = PyUnicode_FromString("abc"); + ASSERT_NE(str, nullptr); + + PyObject* integer = PyLong_FromLong(1234L); + ASSERT_NE(integer, nullptr); + + PyObject* doub = PyFloat_FromDouble(123.0234); + ASSERT_NE(doub, nullptr); + + // This steals a reference to each object, so we don't need to decref them later + // just the list + ASSERT_EQ(PyList_SetItem(list, 0, str), 0); + ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); + ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); + + ASSERT_RAISES(UnknownError, ConvertPySequence(list, pool, &arr)); +} + +} // namespace py +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/d75d7a96/python/pyarrow/tests/test_convert_builtin.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 9925125..3309ba0 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -157,3 +157,8 @@ class TestConvertList(unittest.TestCase): assert arr.null_count == 1 assert arr.type == pyarrow.list_(pyarrow.int64()) assert arr.to_pylist() == data + + def test_mixed_types_fails(self): + data = ['a', 1, 2.0] + with self.assertRaises(pyarrow.error.ArrowException): + pyarrow.from_pylist(data) http://git-wip-us.apache.org/repos/asf/arrow/blob/d75d7a96/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index f7cb47f..3f19b68 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -398,3 +398,8 @@ class TestPandasConversion(unittest.TestCase): ] for values in arrays: self._check_array_roundtrip(values) + + def test_mixed_types_fails(self): + data = pd.DataFrame({'a': ['a', 1, 2.0]}) + with self.assertRaises(A.error.ArrowException): + A.Table.from_pandas(data)
