This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f403804  ARROW-2205: [Python] Option for integer object nulls
f403804 is described below

commit f403804d97d325ee31b753dc8aba6a9a4d650e8d
Author: Albert Shieh <ash...@ansatzcapital.com>
AuthorDate: Fri Mar 2 15:31:42 2018 +0100

    ARROW-2205: [Python] Option for integer object nulls
    
    Fixes [ARROW-2205](https://issues.apache.org/jira/browse/ARROW-2205).
    
    Author: Albert Shieh <ash...@ansatzcapital.com>
    
    Closes #1650 from adshieh/integer_object_nulls and squashes the following commits:
    
    134eb25 <Albert Shieh> Clang format
    7982d1e <Albert Shieh> Pass dtype through to array creation
    3cfc876 <Albert Shieh> Add test for array case and parametrize tests
    43451f2 <Albert Shieh> Array case for integer object nulls
    305abf1 <Albert Shieh> Option for integer object nulls
---
 cpp/src/arrow/python/arrow_to_pandas.cc     | 84 ++++++++++++++++++++++-------
 cpp/src/arrow/python/arrow_to_pandas.h      |  6 ++-
 python/pyarrow/array.pxi                    |  8 ++-
 python/pyarrow/includes/libarrow.pxd        |  1 +
 python/pyarrow/table.pxi                    | 14 +++--
 python/pyarrow/tests/test_convert_pandas.py | 45 ++++++++++++++++
 6 files changed, 131 insertions(+), 27 deletions(-)

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 17b87bf..5749e4f 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -362,6 +362,29 @@ static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& dat
   }
 }
 
+template <typename T>
+static Status ConvertIntegerObjects(PandasOptions options, const ChunkedArray& 
data,
+                                    PyObject** out_values) {
+  PyAcquireGIL lock;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const T* in_values = GetPrimitiveValues<T>(arr);
+
+    for (int i = 0; i < arr.length(); ++i) {
+      if (arr.IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values++ = Py_None;
+      } else {
+        *out_values++ = std::is_signed<T>::value
+                            ? PyLong_FromLongLong(in_values[i])
+                            : PyLong_FromUnsignedLongLong(in_values[i]);
+        RETURN_IF_PYERROR();
+      }
+    }
+  }
+  return Status::OK();
+}
+
 template <typename Type>
 inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& 
data,
                                 PyObject** out_values) {
@@ -685,6 +708,22 @@ class ObjectBlock : public PandasBlock {
 
     if (type == Type::BOOL) {
       RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer));
+    } else if (type == Type::UINT8) {
+      RETURN_NOT_OK(ConvertIntegerObjects<uint8_t>(options_, data, 
out_buffer));
+    } else if (type == Type::INT8) {
+      RETURN_NOT_OK(ConvertIntegerObjects<int8_t>(options_, data, out_buffer));
+    } else if (type == Type::UINT16) {
+      RETURN_NOT_OK(ConvertIntegerObjects<uint16_t>(options_, data, 
out_buffer));
+    } else if (type == Type::INT16) {
+      RETURN_NOT_OK(ConvertIntegerObjects<int16_t>(options_, data, 
out_buffer));
+    } else if (type == Type::UINT32) {
+      RETURN_NOT_OK(ConvertIntegerObjects<uint32_t>(options_, data, 
out_buffer));
+    } else if (type == Type::INT32) {
+      RETURN_NOT_OK(ConvertIntegerObjects<int32_t>(options_, data, 
out_buffer));
+    } else if (type == Type::UINT64) {
+      RETURN_NOT_OK(ConvertIntegerObjects<uint64_t>(options_, data, 
out_buffer));
+    } else if (type == Type::INT64) {
+      RETURN_NOT_OK(ConvertIntegerObjects<int64_t>(options_, data, 
out_buffer));
     } else if (type == Type::BINARY) {
       RETURN_NOT_OK(ConvertBinaryLike<BinaryType>(options_, data, out_buffer));
     } else if (type == Type::STRING) {
@@ -1203,34 +1242,33 @@ using BlockMap = std::unordered_map<int, std::shared_ptr<PandasBlock>>;
 
 static Status GetPandasBlockType(const Column& col, const PandasOptions& 
options,
                                  PandasBlock::type* output_type) {
+#define INTEGER_CASE(NAME)                                                           \
+  *output_type =                                                                     \
+      col.null_count() > 0                                                           \
+          ? options.integer_object_nulls ? PandasBlock::OBJECT : PandasBlock::DOUBLE \
+          : PandasBlock::NAME;                                                       \
+  break;
+
   switch (col.type()->id()) {
     case Type::BOOL:
       *output_type = col.null_count() > 0 ? PandasBlock::OBJECT : 
PandasBlock::BOOL;
       break;
     case Type::UINT8:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : 
PandasBlock::UINT8;
-      break;
+      INTEGER_CASE(UINT8);
     case Type::INT8:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : 
PandasBlock::INT8;
-      break;
+      INTEGER_CASE(INT8);
     case Type::UINT16:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : 
PandasBlock::UINT16;
-      break;
+      INTEGER_CASE(UINT16);
     case Type::INT16:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : 
PandasBlock::INT16;
-      break;
+      INTEGER_CASE(INT16);
     case Type::UINT32:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : 
PandasBlock::UINT32;
-      break;
+      INTEGER_CASE(UINT32);
     case Type::INT32:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : 
PandasBlock::INT32;
-      break;
-    case Type::INT64:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : 
PandasBlock::INT64;
-      break;
+      INTEGER_CASE(INT32);
     case Type::UINT64:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : 
PandasBlock::UINT64;
-      break;
+      INTEGER_CASE(UINT64);
+    case Type::INT64:
+      INTEGER_CASE(INT64);
     case Type::FLOAT:
       *output_type = PandasBlock::FLOAT;
       break;
@@ -1648,9 +1686,15 @@ class ArrowDeserializer {
     }
 
     if (data_.null_count() > 0) {
-      RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
-      auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_));
-      ConvertIntegerWithNulls<T>(options_, data_, out_values);
+      if (options_.integer_object_nulls) {
+        using c_type = typename Type::c_type;
+
+        return VisitObjects(ConvertIntegerObjects<c_type>);
+      } else {
+        RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+        auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_));
+        ConvertIntegerWithNulls<T>(options_, data_, out_values);
+      }
     } else {
       RETURN_NOT_OK(AllocateOutput(traits::npy_type));
       auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 0541b0f..4819eb4 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -44,8 +44,12 @@ struct PandasOptions {
   /// If true, we will convert all string columns to categoricals
   bool strings_to_categorical;
   bool zero_copy_only;
+  bool integer_object_nulls;
 
-  PandasOptions() : strings_to_categorical(false), zero_copy_only(false) {}
+  PandasOptions()
+      : strings_to_categorical(false),
+        zero_copy_only(false),
+        integer_object_nulls(false) {}
 };
 
 ARROW_EXPORT
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 80e15f2..d4e53ec 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -430,7 +430,8 @@ cdef class Array:
         return pyarrow_wrap_array(result)
 
     def to_pandas(self, c_bool strings_to_categorical=False,
-                  c_bool zero_copy_only=False):
+                  c_bool zero_copy_only=False,
+                  c_bool integer_object_nulls=False):
         """
         Convert to an array object suitable for use in pandas
 
@@ -441,6 +442,8 @@ cdef class Array:
         zero_copy_only : boolean, default False
             Raise an ArrowException if this function call would require copying
             the underlying data
+        integer_object_nulls : boolean, default False
+            Cast integers with nulls to objects
 
         See also
         --------
@@ -454,7 +457,8 @@ cdef class Array:
 
         options = PandasOptions(
             strings_to_categorical=strings_to_categorical,
-            zero_copy_only=zero_copy_only)
+            zero_copy_only=zero_copy_only,
+            integer_object_nulls=integer_object_nulls)
         with nogil:
             check_status(ConvertArrayToPandas(options, self.sp_array,
                                               self, &out))
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index b9abf2b..233f2cb 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -908,6 +908,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     cdef struct PandasOptions:
         c_bool strings_to_categorical
         c_bool zero_copy_only
+        c_bool integer_object_nulls
 
 cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil:
 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 178df57..c27c0ed 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -273,7 +273,8 @@ cdef class Column:
 
     def to_pandas(self,
                   c_bool strings_to_categorical=False,
-                  c_bool zero_copy_only=False):
+                  c_bool zero_copy_only=False,
+                  c_bool integer_object_nulls=False):
         """
         Convert the arrow::Column to a pandas.Series
 
@@ -287,7 +288,8 @@ cdef class Column:
 
         options = PandasOptions(
             strings_to_categorical=strings_to_categorical,
-            zero_copy_only=zero_copy_only)
+            zero_copy_only=zero_copy_only,
+            integer_object_nulls=integer_object_nulls)
 
         with nogil:
             check_status(libarrow.ConvertColumnToPandas(options,
@@ -1017,7 +1019,8 @@ cdef class Table:
         return result
 
     def to_pandas(self, nthreads=None, strings_to_categorical=False,
-                  memory_pool=None, zero_copy_only=False, categories=None):
+                  memory_pool=None, zero_copy_only=False, categories=None,
+                  integer_object_nulls=False):
         """
         Convert the arrow::Table to a pandas DataFrame
 
@@ -1036,6 +1039,8 @@ cdef class Table:
             the underlying data
         categories: list, default empty
             List of columns that should be returned as pandas.Categorical
+        integer_object_nulls : boolean, default False
+            Cast integers with nulls to objects
 
         Returns
         -------
@@ -1046,7 +1051,8 @@ cdef class Table:
 
         options = PandasOptions(
             strings_to_categorical=strings_to_categorical,
-            zero_copy_only=zero_copy_only)
+            zero_copy_only=zero_copy_only,
+            integer_object_nulls=integer_object_nulls)
         self._check_nullptr()
         if nthreads is None:
             nthreads = cpu_count()
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 813fbdf..5abc026 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -634,6 +634,51 @@ class TestConvertPrimitiveTypes(object):
         _check_type(pa.float64())
 
 
+@pytest.mark.parametrize('dtype',
+                         ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
+def test_array_integer_object_nulls_option(dtype):
+    num_values = 100
+
+    null_mask = np.random.randint(0, 10, size=num_values) < 3
+    values = np.random.randint(0, 100, size=num_values, dtype=dtype)
+
+    array = pa.array(values, mask=null_mask)
+
+    if null_mask.any():
+        expected = values.astype('O')
+        expected[null_mask] = None
+    else:
+        expected = values
+
+    result = array.to_pandas(integer_object_nulls=True)
+
+    np.testing.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize('dtype',
+                         ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
+def test_table_integer_object_nulls_option(dtype):
+    num_values = 100
+
+    null_mask = np.random.randint(0, 10, size=num_values) < 3
+    values = np.random.randint(0, 100, size=num_values, dtype=dtype)
+
+    array = pa.array(values, mask=null_mask)
+
+    if null_mask.any():
+        expected = values.astype('O')
+        expected[null_mask] = None
+    else:
+        expected = values
+
+    expected = pd.DataFrame({dtype: expected})
+
+    table = pa.Table.from_arrays([array], [dtype])
+    result = table.to_pandas(integer_object_nulls=True)
+
+    tm.assert_frame_equal(result, expected)
+
+
 class TestConvertDateTimeLikeTypes(object):
     """
     Conversion tests for datetime- and timestamp-like types (date64, etc.).

-- 
To stop receiving notification emails like this one, please contact
u...@apache.org.

Reply via email to