This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f403804 ARROW-2205: [Python] Option for integer object nulls
f403804 is described below
commit f403804d97d325ee31b753dc8aba6a9a4d650e8d
Author: Albert Shieh <[email protected]>
AuthorDate: Fri Mar 2 15:31:42 2018 +0100
ARROW-2205: [Python] Option for integer object nulls
Fixes [ARROW-2205](https://issues.apache.org/jira/browse/ARROW-2205).
Author: Albert Shieh <[email protected]>
Closes #1650 from adshieh/integer_object_nulls and squashes the following commits:
134eb25 <Albert Shieh> Clang format
7982d1e <Albert Shieh> Pass dtype through to array creation
3cfc876 <Albert Shieh> Add test for array case and parametrize tests
43451f2 <Albert Shieh> Array case for integer object nulls
305abf1 <Albert Shieh> Option for integer object nulls
---
cpp/src/arrow/python/arrow_to_pandas.cc | 84 ++++++++++++++++++++++-------
cpp/src/arrow/python/arrow_to_pandas.h | 6 ++-
python/pyarrow/array.pxi | 8 ++-
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/table.pxi | 14 +++--
python/pyarrow/tests/test_convert_pandas.py | 45 ++++++++++++++++
6 files changed, 131 insertions(+), 27 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 17b87bf..5749e4f 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -362,6 +362,29 @@ static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& dat
}
}
+template <typename T>
+static Status ConvertIntegerObjects(PandasOptions options, const ChunkedArray&
data,
+ PyObject** out_values) {
+ PyAcquireGIL lock;
+ for (int c = 0; c < data.num_chunks(); c++) {
+ const auto& arr = *data.chunk(c);
+ const T* in_values = GetPrimitiveValues<T>(arr);
+
+ for (int i = 0; i < arr.length(); ++i) {
+ if (arr.IsNull(i)) {
+ Py_INCREF(Py_None);
+ *out_values++ = Py_None;
+ } else {
+ *out_values++ = std::is_signed<T>::value
+ ? PyLong_FromLongLong(in_values[i])
+ : PyLong_FromUnsignedLongLong(in_values[i]);
+ RETURN_IF_PYERROR();
+ }
+ }
+ }
+ return Status::OK();
+}
+
template <typename Type>
inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray&
data,
PyObject** out_values) {
@@ -685,6 +708,22 @@ class ObjectBlock : public PandasBlock {
if (type == Type::BOOL) {
RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer));
+ } else if (type == Type::UINT8) {
+ RETURN_NOT_OK(ConvertIntegerObjects<uint8_t>(options_, data,
out_buffer));
+ } else if (type == Type::INT8) {
+ RETURN_NOT_OK(ConvertIntegerObjects<int8_t>(options_, data, out_buffer));
+ } else if (type == Type::UINT16) {
+ RETURN_NOT_OK(ConvertIntegerObjects<uint16_t>(options_, data,
out_buffer));
+ } else if (type == Type::INT16) {
+ RETURN_NOT_OK(ConvertIntegerObjects<int16_t>(options_, data,
out_buffer));
+ } else if (type == Type::UINT32) {
+ RETURN_NOT_OK(ConvertIntegerObjects<uint32_t>(options_, data,
out_buffer));
+ } else if (type == Type::INT32) {
+ RETURN_NOT_OK(ConvertIntegerObjects<int32_t>(options_, data,
out_buffer));
+ } else if (type == Type::UINT64) {
+ RETURN_NOT_OK(ConvertIntegerObjects<uint64_t>(options_, data,
out_buffer));
+ } else if (type == Type::INT64) {
+ RETURN_NOT_OK(ConvertIntegerObjects<int64_t>(options_, data,
out_buffer));
} else if (type == Type::BINARY) {
RETURN_NOT_OK(ConvertBinaryLike<BinaryType>(options_, data, out_buffer));
} else if (type == Type::STRING) {
@@ -1203,34 +1242,33 @@ using BlockMap = std::unordered_map<int, std::shared_ptr<PandasBlock>>;
static Status GetPandasBlockType(const Column& col, const PandasOptions&
options,
PandasBlock::type* output_type) {
+#define INTEGER_CASE(NAME)
\
+ *output_type =
\
+ col.null_count() > 0
\
+ ? options.integer_object_nulls ? PandasBlock::OBJECT :
PandasBlock::DOUBLE \
+ : PandasBlock::NAME;
\
+ break;
+
switch (col.type()->id()) {
case Type::BOOL:
*output_type = col.null_count() > 0 ? PandasBlock::OBJECT :
PandasBlock::BOOL;
break;
case Type::UINT8:
- *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE :
PandasBlock::UINT8;
- break;
+ INTEGER_CASE(UINT8);
case Type::INT8:
- *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE :
PandasBlock::INT8;
- break;
+ INTEGER_CASE(INT8);
case Type::UINT16:
- *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE :
PandasBlock::UINT16;
- break;
+ INTEGER_CASE(UINT16);
case Type::INT16:
- *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE :
PandasBlock::INT16;
- break;
+ INTEGER_CASE(INT16);
case Type::UINT32:
- *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE :
PandasBlock::UINT32;
- break;
+ INTEGER_CASE(UINT32);
case Type::INT32:
- *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE :
PandasBlock::INT32;
- break;
- case Type::INT64:
- *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE :
PandasBlock::INT64;
- break;
+ INTEGER_CASE(INT32);
case Type::UINT64:
- *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE :
PandasBlock::UINT64;
- break;
+ INTEGER_CASE(UINT64);
+ case Type::INT64:
+ INTEGER_CASE(INT64);
case Type::FLOAT:
*output_type = PandasBlock::FLOAT;
break;
@@ -1648,9 +1686,15 @@ class ArrowDeserializer {
}
if (data_.null_count() > 0) {
- RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
- auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_));
- ConvertIntegerWithNulls<T>(options_, data_, out_values);
+ if (options_.integer_object_nulls) {
+ using c_type = typename Type::c_type;
+
+ return VisitObjects(ConvertIntegerObjects<c_type>);
+ } else {
+ RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+ auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_));
+ ConvertIntegerWithNulls<T>(options_, data_, out_values);
+ }
} else {
RETURN_NOT_OK(AllocateOutput(traits::npy_type));
auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 0541b0f..4819eb4 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -44,8 +44,12 @@ struct PandasOptions {
/// If true, we will convert all string columns to categoricals
bool strings_to_categorical;
bool zero_copy_only;
+ bool integer_object_nulls;
- PandasOptions() : strings_to_categorical(false), zero_copy_only(false) {}
+ PandasOptions()
+ : strings_to_categorical(false),
+ zero_copy_only(false),
+ integer_object_nulls(false) {}
};
ARROW_EXPORT
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 80e15f2..d4e53ec 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -430,7 +430,8 @@ cdef class Array:
return pyarrow_wrap_array(result)
def to_pandas(self, c_bool strings_to_categorical=False,
- c_bool zero_copy_only=False):
+ c_bool zero_copy_only=False,
+ c_bool integer_object_nulls=False):
"""
Convert to an array object suitable for use in pandas
@@ -441,6 +442,8 @@ cdef class Array:
zero_copy_only : boolean, default False
Raise an ArrowException if this function call would require copying
the underlying data
+ integer_object_nulls : boolean, default False
+ Cast integers with nulls to objects
See also
--------
@@ -454,7 +457,8 @@ cdef class Array:
options = PandasOptions(
strings_to_categorical=strings_to_categorical,
- zero_copy_only=zero_copy_only)
+ zero_copy_only=zero_copy_only,
+ integer_object_nulls=integer_object_nulls)
with nogil:
check_status(ConvertArrayToPandas(options, self.sp_array,
self, &out))
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index b9abf2b..233f2cb 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -908,6 +908,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
cdef struct PandasOptions:
c_bool strings_to_categorical
c_bool zero_copy_only
+ c_bool integer_object_nulls
cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil:
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 178df57..c27c0ed 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -273,7 +273,8 @@ cdef class Column:
def to_pandas(self,
c_bool strings_to_categorical=False,
- c_bool zero_copy_only=False):
+ c_bool zero_copy_only=False,
+ c_bool integer_object_nulls=False):
"""
Convert the arrow::Column to a pandas.Series
@@ -287,7 +288,8 @@ cdef class Column:
options = PandasOptions(
strings_to_categorical=strings_to_categorical,
- zero_copy_only=zero_copy_only)
+ zero_copy_only=zero_copy_only,
+ integer_object_nulls=integer_object_nulls)
with nogil:
check_status(libarrow.ConvertColumnToPandas(options,
@@ -1017,7 +1019,8 @@ cdef class Table:
return result
def to_pandas(self, nthreads=None, strings_to_categorical=False,
- memory_pool=None, zero_copy_only=False, categories=None):
+ memory_pool=None, zero_copy_only=False, categories=None,
+ integer_object_nulls=False):
"""
Convert the arrow::Table to a pandas DataFrame
@@ -1036,6 +1039,8 @@ cdef class Table:
the underlying data
categories: list, default empty
List of columns that should be returned as pandas.Categorical
+ integer_object_nulls : boolean, default False
+ Cast integers with nulls to objects
Returns
-------
@@ -1046,7 +1051,8 @@ cdef class Table:
options = PandasOptions(
strings_to_categorical=strings_to_categorical,
- zero_copy_only=zero_copy_only)
+ zero_copy_only=zero_copy_only,
+ integer_object_nulls=integer_object_nulls)
self._check_nullptr()
if nthreads is None:
nthreads = cpu_count()
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 813fbdf..5abc026 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -634,6 +634,51 @@ class TestConvertPrimitiveTypes(object):
_check_type(pa.float64())
[email protected]('dtype',
+ ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
+def test_array_integer_object_nulls_option(dtype):
+ num_values = 100
+
+ null_mask = np.random.randint(0, 10, size=num_values) < 3
+ values = np.random.randint(0, 100, size=num_values, dtype=dtype)
+
+ array = pa.array(values, mask=null_mask)
+
+ if null_mask.any():
+ expected = values.astype('O')
+ expected[null_mask] = None
+ else:
+ expected = values
+
+ result = array.to_pandas(integer_object_nulls=True)
+
+ np.testing.assert_equal(result, expected)
+
+
[email protected]('dtype',
+ ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
+def test_table_integer_object_nulls_option(dtype):
+ num_values = 100
+
+ null_mask = np.random.randint(0, 10, size=num_values) < 3
+ values = np.random.randint(0, 100, size=num_values, dtype=dtype)
+
+ array = pa.array(values, mask=null_mask)
+
+ if null_mask.any():
+ expected = values.astype('O')
+ expected[null_mask] = None
+ else:
+ expected = values
+
+ expected = pd.DataFrame({dtype: expected})
+
+ table = pa.Table.from_arrays([array], [dtype])
+ result = table.to_pandas(integer_object_nulls=True)
+
+ tm.assert_frame_equal(result, expected)
+
+
class TestConvertDateTimeLikeTypes(object):
"""
Conversion tests for datetime- and timestamp-like types (date64, etc.).
--
To stop receiving notification emails like this one, please contact
[email protected].