Repository: arrow Updated Branches: refs/heads/master f51259068 -> 312a66535
ARROW-707: [Python] Return NullArray for array of all None in Array.from_pandas. Revert from_numpy -> from_pandas. Per ARROW-838, I reverted the `Array.from_numpy` name to `Array.from_pandas` to reflect that the import is specific to pandas 0.x's memory representation. Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #554 from wesm/ARROW-707 and squashes the following commits: a875257 [Wes McKinney] Rename PyObject_is_null to reflect domain-specific nature 093b057 [Wes McKinney] Check more cases of all nulls. Fix segfault for NaN that resulted from computations 7d97f28 [Wes McKinney] Return NullArray for array of all None in Array.from_pandas. Revert from_numpy -> from_pandas Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/312a6653 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/312a6653 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/312a6653 Branch: refs/heads/master Commit: 312a665353c420452e98b6b266a5a7cb214c936f Parents: f512590 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Mon Apr 17 09:56:53 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Mon Apr 17 09:56:53 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/python/pandas_convert.cc | 31 +++++++++++++-------- python/doc/source/api.rst | 1 + python/pyarrow/__init__.py | 1 + python/pyarrow/_array.pxd | 4 +++ python/pyarrow/_array.pyx | 18 ++++++------- python/pyarrow/_io.pyx | 2 +- python/pyarrow/_table.pyx | 2 +- python/pyarrow/tests/test_array.py | 4 +-- python/pyarrow/tests/test_convert_pandas.py | 34 ++++++++++++++++-------- python/pyarrow/tests/test_scalars.py | 6 ++--- 10 files changed, 65 insertions(+), 38 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/cpp/src/arrow/python/pandas_convert.cc 
---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index b33aea4..5cdcb6f 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -61,8 +61,16 @@ namespace py { // ---------------------------------------------------------------------- // Utility code -static inline bool PyObject_is_null(const PyObject* obj) { - return obj == Py_None || obj == numpy_nan; +static inline bool PyFloat_isnan(const PyObject* obj) { + if (PyFloat_Check(obj)) { + double val = PyFloat_AS_DOUBLE(obj); + return val != val; + } else { + return false; + } +} +static inline bool PandasObjectIsNull(const PyObject* obj) { + return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj); } static inline bool PyObject_is_string(const PyObject* obj) { @@ -158,7 +166,7 @@ static Status AppendObjectStrings( for (int64_t i = 0; i < objects.size(); ++i) { obj = objects[i]; - if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) { + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { RETURN_NOT_OK(builder->AppendNull()); } else if (PyUnicode_Check(obj)) { obj = PyUnicode_AsUTF8String(obj); @@ -197,7 +205,7 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas for (int64_t i = 0; i < objects.size(); ++i) { obj = objects[i]; - if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) { + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { RETURN_NOT_OK(builder->AppendNull()); } else if (PyUnicode_Check(obj)) { obj = PyUnicode_AsUTF8String(obj); @@ -519,7 +527,7 @@ Status PandasConverter::ConvertDates() { obj = objects[i]; if (PyDate_CheckExact(obj)) { date_builder.Append(UnboxDate<ArrowType>::Unbox(obj)); - } else if (PyObject_is_null(obj)) { + } else if (PandasObjectIsNull(obj)) { date_builder.AppendNull(); } else { return InvalidConversion(obj, "date"); @@ -570,7 +578,7 @@ Status 
PandasConverter::ConvertDecimals() { default: break; } - } else if (PyObject_is_null(object)) { + } else if (PandasObjectIsNull(object)) { decimal_builder.AppendNull(); } else { return InvalidConversion(object, "decimal.Decimal"); @@ -724,7 +732,7 @@ Status PandasConverter::ConvertBooleans() { PyObject* obj; for (int64_t i = 0; i < length_; ++i) { obj = objects[i]; - if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) { + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { ++null_count; } else if (obj == Py_True) { BitUtil::SetBit(bitmap, i); @@ -791,7 +799,7 @@ Status PandasConverter::ConvertObjects() { RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); for (int64_t i = 0; i < length_; ++i) { - if (PyObject_is_null(objects[i])) { + if (PandasObjectIsNull(objects[i])) { continue; } else if (PyObject_is_string(objects[i])) { return ConvertObjectStrings(); @@ -809,7 +817,8 @@ Status PandasConverter::ConvertObjects() { } } - return Status::TypeError("Unable to infer type of object array, were all null"); + out_ = std::make_shared<NullArray>(length_); + return Status::OK(); } template <int ITEM_TYPE, typename ArrowType> @@ -833,7 +842,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType> ListBuilder list_builder(pool_, value_builder); PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); for (int64_t i = 0; i < length_; ++i) { - if (PyObject_is_null(objects[i])) { + if (PandasObjectIsNull(objects[i])) { RETURN_NOT_OK(list_builder.AppendNull()); } else if (PyArray_Check(objects[i])) { auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]); @@ -893,7 +902,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>( ListBuilder list_builder(pool_, value_builder); PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); for (int64_t i = 0; i < length_; ++i) { - if (PyObject_is_null(objects[i])) { + if (PandasObjectIsNull(objects[i])) { 
RETURN_NOT_OK(list_builder.AppendNull()); } else if (PyArray_Check(objects[i])) { auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]); http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/doc/source/api.rst ---------------------------------------------------------------------- diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index 801ab34..1b7b9bd 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -90,6 +90,7 @@ Array Types :toctree: generated/ Array + NullArray NumericArray IntegerArray FloatingPointArray http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 506d567..3db2a4f 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -40,6 +40,7 @@ from pyarrow._array import (null, bool_, Array, Tensor, from_pylist, from_numpy_dtype, + NullArray, NumericArray, IntegerArray, FloatingPointArray, BooleanArray, Int8Array, UInt8Array, http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_array.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd index 4041374..afb0c27 100644 --- a/python/pyarrow/_array.pxd +++ b/python/pyarrow/_array.pxd @@ -141,6 +141,10 @@ cdef class Tensor: cdef init(self, const shared_ptr[CTensor]& sp_tensor) +cdef class NullArray(Array): + pass + + cdef class BooleanArray(Array): pass http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_array.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index c5a595c..99ff6f2 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -843,9 +843,9 @@ cdef class Array: self.type = 
box_data_type(self.sp_array.get().type()) @staticmethod - def from_numpy(obj, mask=None, DataType type=None, - timestamps_to_ms=False, - MemoryPool memory_pool=None): + def from_pandas(obj, mask=None, DataType type=None, + timestamps_to_ms=False, + MemoryPool memory_pool=None): """ Convert pandas.Series to an Arrow Array. @@ -878,7 +878,7 @@ cdef class Array: >>> import pandas as pd >>> import pyarrow as pa - >>> pa.Array.from_numpy(pd.Series([1, 2])) + >>> pa.Array.from_pandas(pd.Series([1, 2])) <pyarrow.array.Int64Array object at 0x7f674e4c0e10> [ 1, @@ -886,7 +886,7 @@ cdef class Array: ] >>> import numpy as np - >>> pa.Array.from_numpy(pd.Series([1, 2]), np.array([0, 1], + >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1], ... dtype=bool)) <pyarrow.array.Int64Array object at 0x7f9019e11208> [ @@ -1329,14 +1329,14 @@ cdef class DictionaryArray(Array): mask = indices == -1 else: mask = mask | (indices == -1) - arrow_indices = Array.from_numpy(indices, mask=mask, - memory_pool=memory_pool) + arrow_indices = Array.from_pandas(indices, mask=mask, + memory_pool=memory_pool) if isinstance(dictionary, Array): arrow_dictionary = dictionary else: - arrow_dictionary = Array.from_numpy(dictionary, - memory_pool=memory_pool) + arrow_dictionary = Array.from_pandas(dictionary, + memory_pool=memory_pool) if not isinstance(arrow_indices, IntegerArray): raise ValueError('Indices must be integer type') http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_io.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_io.pyx b/python/pyarrow/_io.pyx index 9f067fb..ec37de0 100644 --- a/python/pyarrow/_io.pyx +++ b/python/pyarrow/_io.pyx @@ -1148,7 +1148,7 @@ cdef class FeatherWriter: if isinstance(col, Array): arr = col else: - arr = Array.from_numpy(col, mask=mask) + arr = Array.from_pandas(col, mask=mask) cdef c_string c_name = tobytes(name) 
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_table.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_table.pyx b/python/pyarrow/_table.pyx index 6558b2e..78fec75 100644 --- a/python/pyarrow/_table.pyx +++ b/python/pyarrow/_table.pyx @@ -321,7 +321,7 @@ cdef _dataframe_to_arrays(df, timestamps_to_ms, Schema schema): if schema is not None: type = schema.field_by_name(name).type - arr = Array.from_numpy(col, type=type, + arr = Array.from_pandas(col, type=type, timestamps_to_ms=timestamps_to_ms) names.append(name) arrays.append(arr) http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_array.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 57b17f6..a1fe842 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -162,8 +162,8 @@ def test_dictionary_from_boxed_arrays(): indices = np.repeat([0, 1, 2], 2) dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) - iarr = pa.Array.from_numpy(indices) - darr = pa.Array.from_numpy(dictionary) + iarr = pa.Array.from_pandas(indices) + darr = pa.Array.from_pandas(dictionary) d1 = pa.DictionaryArray.from_arrays(iarr, darr) http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 2394d63..f360234 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -79,8 +79,8 @@ class TestPandasConversion(unittest.TestCase): def _check_array_roundtrip(self, values, expected=None, mask=None, timestamps_to_ms=False, type=None): - arr = pa.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms, - 
mask=mask, type=type) + arr = pa.Array.from_pandas(values, timestamps_to_ms=timestamps_to_ms, + mask=mask, type=type) result = arr.to_pandas() values_nulls = pd.isnull(values) @@ -125,7 +125,7 @@ class TestPandasConversion(unittest.TestCase): for name, arrow_dtype in dtypes: values = np.random.randn(num_values).astype(name) - arr = pa.Array.from_numpy(values, null_mask) + arr = pa.Array.from_pandas(values, null_mask) arrays.append(arr) fields.append(pa.Field.from_py(name, arrow_dtype)) values[null_mask] = np.nan @@ -178,7 +178,7 @@ class TestPandasConversion(unittest.TestCase): for name in int_dtypes: values = np.random.randint(0, 100, size=num_values) - arr = pa.Array.from_numpy(values, null_mask) + arr = pa.Array.from_pandas(values, null_mask) arrays.append(arr) expected = values.astype('f8') @@ -212,7 +212,7 @@ class TestPandasConversion(unittest.TestCase): mask = np.random.randint(0, 10, size=num_values) < 3 values = np.random.randint(0, 10, size=num_values) < 5 - arr = pa.Array.from_numpy(values, mask) + arr = pa.Array.from_pandas(values, mask) expected = values.astype(object) expected[mask] = None @@ -375,11 +375,11 @@ class TestPandasConversion(unittest.TestCase): t32 = pa.date32() t64 = pa.date64() - a32 = pa.Array.from_numpy(arr, type=t32) - a64 = pa.Array.from_numpy(arr, type=t64) + a32 = pa.Array.from_pandas(arr, type=t32) + a64 = pa.Array.from_pandas(arr, type=t64) - a32_expected = pa.Array.from_numpy(arr_i4, mask=mask, type=t32) - a64_expected = pa.Array.from_numpy(arr_i8, mask=mask, type=t64) + a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32) + a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64) assert a32.equals(a32_expected) assert a64.equals(a64_expected) @@ -406,8 +406,8 @@ class TestPandasConversion(unittest.TestCase): arr = np.array([17259, 17260, 17261], dtype='int32') arr2 = arr.astype('int64') * 86400000 - a1 = pa.Array.from_numpy(arr, type=t1) - a2 = pa.Array.from_numpy(arr2, type=t2) + a1 = 
pa.Array.from_pandas(arr, type=t1) + a2 = pa.Array.from_pandas(arr2, type=t2) expected = datetime.date(2017, 4, 3) assert a1[0].as_py() == expected @@ -586,3 +586,15 @@ class TestPandasConversion(unittest.TestCase): converted = pa.Table.from_pandas(expected) df = converted.to_pandas() tm.assert_frame_equal(df, expected) + + def test_all_nones(self): + def _check_series(s): + converted = pa.Array.from_pandas(s) + assert isinstance(converted, pa.NullArray) + assert len(converted) == 3 + assert converted.null_count == 3 + assert converted[0] is pa.NA + + _check_series(pd.Series([None] * 3, dtype=object)) + _check_series(pd.Series([np.nan] * 3, dtype=object)) + _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object)) http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_scalars.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index f4f275b..df2a898 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -124,7 +124,7 @@ class TestScalars(unittest.TestCase): for unit in units: dtype = 'datetime64[{0}]'.format(unit) - arrow_arr = pa.Array.from_numpy(arr.astype(dtype)) + arrow_arr = pa.Array.from_pandas(arr.astype(dtype)) expected = pd.Timestamp('2000-01-01 12:34:56') assert arrow_arr[0].as_py() == expected @@ -133,8 +133,8 @@ class TestScalars(unittest.TestCase): arrow_type = pa.timestamp(unit, tz=tz) dtype = 'datetime64[{0}]'.format(unit) - arrow_arr = pa.Array.from_numpy(arr.astype(dtype), - type=arrow_type) + arrow_arr = pa.Array.from_pandas(arr.astype(dtype), + type=arrow_type) expected = (pd.Timestamp('2000-01-01 12:34:56') .tz_localize('utc') .tz_convert(tz))