Repository: arrow Updated Branches: refs/heads/master 393f46abd -> 37dbddf0d
ARROW-1004: [Python] Add conversions for numpy object arrays with integers and floats Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #681 from wesm/ARROW-1004 and squashes the following commits: 9e0b2eae [Wes McKinney] Code review comments 45f1ecb9 [Wes McKinney] Fixes for manylinux1 4e4c7529 [Wes McKinney] Add conversions for numpy object arrays with integers and floats Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/37dbddf0 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/37dbddf0 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/37dbddf0 Branch: refs/heads/master Commit: 37dbddf0dc6582586a2bea98a436cb20726799a4 Parents: 393f46a Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Sun May 14 16:30:19 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Sun May 14 16:30:19 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/python/pandas_convert.cc | 82 +++++++++++++++++++++++- python/pyarrow/tests/test_convert_pandas.py | 18 ++++++ 2 files changed, 98 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/37dbddf0/cpp/src/arrow/python/pandas_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index b6fb05e..96dd09a 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -80,6 +80,14 @@ static inline bool PyObject_is_string(const PyObject* obj) { #endif } +static inline bool PyObject_is_float(const PyObject* obj) { + return PyFloat_Check(obj); +} + +static inline bool PyObject_is_integer(const PyObject* obj) { + return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); +} + template <int TYPE> static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { typedef npy_traits<TYPE> traits; @@ -394,9 +402,11 @@ class PandasConverter { template <typename ArrowType> Status ConvertDates(); + Status ConvertBooleans(); Status ConvertObjectStrings(); + Status ConvertObjectFloats(); Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type); - Status ConvertBooleans(); + Status ConvertObjectIntegers(); Status ConvertLists(const std::shared_ptr<DataType>& type); Status ConvertObjects(); Status ConvertDecimals(); @@ -610,6 +620,70 @@ Status PandasConverter::ConvertObjectStrings() { return Status::OK(); } +Status PandasConverter::ConvertObjectFloats() { + PyAcquireGIL lock; + + DoubleBuilder builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + Ndarray1DIndexer<PyObject*> objects(arr_); + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; + if (mask_ != nullptr) { + mask_values.Init(mask_); + have_mask = true; + } + + PyObject* obj; + for (int64_t i = 0; i < objects.size(); ++i) { + obj = objects[i]; + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder.AppendNull()); + } else if (PyFloat_Check(obj)) { + double val = PyFloat_AsDouble(obj); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(builder.Append(val)); + } else { + return InvalidConversion(obj, "float"); + } + } + + return builder.Finish(&out_); +} + +Status PandasConverter::ConvertObjectIntegers() { + PyAcquireGIL lock; + + Int64Builder builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + Ndarray1DIndexer<PyObject*> objects(arr_); + Ndarray1DIndexer<uint8_t> mask_values; + + bool have_mask = false; + if (mask_ != nullptr) { + mask_values.Init(mask_); + have_mask = true; + } + + PyObject* obj; + for (int64_t i = 0; i < objects.size(); ++i) { + obj = objects[i]; + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder.AppendNull()); + } else if (PyObject_is_integer(obj)) { + const int64_t val = static_cast<int64_t>(PyLong_AsLong(obj)); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(builder.Append(val)); + } else { + return InvalidConversion(obj, "integer"); + } + } + + return builder.Finish(&out_); +} + Status PandasConverter::ConvertObjectFixedWidthBytes( const std::shared_ptr<DataType>& type) { PyAcquireGIL lock; @@ -804,8 +878,12 @@ Status PandasConverter::ConvertObjects() { continue; } else if (PyObject_is_string(objects[i])) { return ConvertObjectStrings(); + } else if (PyObject_is_float(objects[i])) { + return ConvertObjectFloats(); } else if (PyBool_Check(objects[i])) { return ConvertBooleans(); + } else if (PyObject_is_integer(objects[i])) { + return ConvertObjectIntegers(); } else if (PyDate_CheckExact(objects[i])) { // We could choose Date32 or Date64 return ConvertDates<Date32Type>(); @@ -813,7 +891,7 @@ Status PandasConverter::ConvertObjects() { return ConvertDecimals(); } else { return InvalidConversion( - const_cast<PyObject*>(objects[i]), "string, bool, or date"); + const_cast<PyObject*>(objects[i]), "string, bool, float, int, date, decimal"); } } } http://git-wip-us.apache.org/repos/asf/arrow/blob/37dbddf0/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 9b9b751..be35905 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -140,6 +140,24 @@ class TestPandasConversion(unittest.TestCase): result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) + def test_float_object_nulls(self): + arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object) + df = pd.DataFrame({'floats': arr}) + expected = pd.DataFrame({'floats': pd.to_numeric(arr)}) + field = pa.field('floats', pa.float64()) + schema = pa.schema([field]) + self._check_pandas_roundtrip(df, expected=expected, + expected_schema=schema) + + def test_int_object_nulls(self): + arr = np.array([None, 1, np.int64(3)] * 5, dtype=object) + df = pd.DataFrame({'ints': arr}) + expected = pd.DataFrame({'ints': pd.to_numeric(arr)}) + field = pa.field('ints', pa.int64()) + schema = pa.schema([field]) + self._check_pandas_roundtrip(df, expected=expected, + expected_schema=schema) + def test_integer_no_nulls(self): data = OrderedDict() fields = []