[ https://issues.apache.org/jira/browse/ARROW-2205?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16383637#comment-16383637 ]

ASF GitHub Bot commented on ARROW-2205:
---------------------------------------

xhochy closed pull request #1650: ARROW-2205: [Python] Option for integer object nulls
URL: https://github.com/apache/arrow/pull/1650

This is a PR merged from a forked repository. As GitHub hides the original
diff on merge, it is displayed below for the sake of provenance:

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index aefd4d76d..21e848281 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -362,6 +362,29 @@ static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& dat
   }
 }
 
+template <typename T>
+static Status ConvertIntegerObjects(PandasOptions options, const ChunkedArray& data,
+                                    PyObject** out_values) {
+  PyAcquireGIL lock;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const T* in_values = GetPrimitiveValues<T>(arr);
+
+    for (int i = 0; i < arr.length(); ++i) {
+      if (arr.IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values++ = Py_None;
+      } else {
+        *out_values++ = std::is_signed<T>::value
+                            ? PyLong_FromLongLong(in_values[i])
+                            : PyLong_FromUnsignedLongLong(in_values[i]);
+        RETURN_IF_PYERROR();
+      }
+    }
+  }
+  return Status::OK();
+}
+
 template <typename Type>
 inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data,
                                 PyObject** out_values) {
@@ -684,6 +707,22 @@ class ObjectBlock : public PandasBlock {
 
     if (type == Type::BOOL) {
       RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer));
+    } else if (type == Type::UINT8) {
+      RETURN_NOT_OK(ConvertIntegerObjects<uint8_t>(options_, data, out_buffer));
+    } else if (type == Type::INT8) {
+      RETURN_NOT_OK(ConvertIntegerObjects<int8_t>(options_, data, out_buffer));
+    } else if (type == Type::UINT16) {
+      RETURN_NOT_OK(ConvertIntegerObjects<uint16_t>(options_, data, out_buffer));
+    } else if (type == Type::INT16) {
+      RETURN_NOT_OK(ConvertIntegerObjects<int16_t>(options_, data, out_buffer));
+    } else if (type == Type::UINT32) {
+      RETURN_NOT_OK(ConvertIntegerObjects<uint32_t>(options_, data, out_buffer));
+    } else if (type == Type::INT32) {
+      RETURN_NOT_OK(ConvertIntegerObjects<int32_t>(options_, data, out_buffer));
+    } else if (type == Type::UINT64) {
+      RETURN_NOT_OK(ConvertIntegerObjects<uint64_t>(options_, data, out_buffer));
+    } else if (type == Type::INT64) {
+      RETURN_NOT_OK(ConvertIntegerObjects<int64_t>(options_, data, out_buffer));
     } else if (type == Type::BINARY) {
       RETURN_NOT_OK(ConvertBinaryLike<BinaryType>(options_, data, out_buffer));
     } else if (type == Type::STRING) {
@@ -1202,34 +1241,33 @@ using BlockMap = std::unordered_map<int, std::shared_ptr<PandasBlock>>;
 
 static Status GetPandasBlockType(const Column& col, const PandasOptions& options,
                                  PandasBlock::type* output_type) {
+#define INTEGER_CASE(NAME)                                                            \
+  *output_type =                                                                      \
+      col.null_count() > 0                                                            \
+          ? options.integer_object_nulls ? PandasBlock::OBJECT : PandasBlock::DOUBLE  \
+          : PandasBlock::NAME;                                                        \
+  break;
+
   switch (col.type()->id()) {
     case Type::BOOL:
       *output_type = col.null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL;
       break;
     case Type::UINT8:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8;
-      break;
+      INTEGER_CASE(UINT8);
     case Type::INT8:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8;
-      break;
+      INTEGER_CASE(INT8);
     case Type::UINT16:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16;
-      break;
+      INTEGER_CASE(UINT16);
     case Type::INT16:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16;
-      break;
+      INTEGER_CASE(INT16);
     case Type::UINT32:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT32;
-      break;
+      INTEGER_CASE(UINT32);
     case Type::INT32:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32;
-      break;
-    case Type::INT64:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64;
-      break;
+      INTEGER_CASE(INT32);
     case Type::UINT64:
-      *output_type = col.null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT64;
-      break;
+      INTEGER_CASE(UINT64);
+    case Type::INT64:
+      INTEGER_CASE(INT64);
     case Type::FLOAT:
       *output_type = PandasBlock::FLOAT;
       break;
@@ -1647,9 +1685,15 @@ class ArrowDeserializer {
     }
 
     if (data_.null_count() > 0) {
-      RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
-      auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_));
-      ConvertIntegerWithNulls<T>(options_, data_, out_values);
+      if (options_.integer_object_nulls) {
+        using c_type = typename Type::c_type;
+
+        return VisitObjects(ConvertIntegerObjects<c_type>);
+      } else {
+        RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+        auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_));
+        ConvertIntegerWithNulls<T>(options_, data_, out_values);
+      }
     } else {
       RETURN_NOT_OK(AllocateOutput(traits::npy_type));
       auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 0541b0f9a..4819eb42c 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -44,8 +44,12 @@ struct PandasOptions {
   /// If true, we will convert all string columns to categoricals
   bool strings_to_categorical;
   bool zero_copy_only;
+  bool integer_object_nulls;
 
-  PandasOptions() : strings_to_categorical(false), zero_copy_only(false) {}
+  PandasOptions()
+      : strings_to_categorical(false),
+        zero_copy_only(false),
+        integer_object_nulls(false) {}
 };
 
 ARROW_EXPORT
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 5b8621f13..73c140ad6 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -430,7 +430,8 @@ cdef class Array:
         return pyarrow_wrap_array(result)
 
     def to_pandas(self, c_bool strings_to_categorical=False,
-                  c_bool zero_copy_only=False):
+                  c_bool zero_copy_only=False,
+                  c_bool integer_object_nulls=False):
         """
         Convert to an array object suitable for use in pandas
 
@@ -441,6 +442,8 @@ cdef class Array:
         zero_copy_only : boolean, default False
             Raise an ArrowException if this function call would require copying
             the underlying data
+        integer_object_nulls : boolean, default False
+            Cast integers with nulls to objects
 
         See also
         --------
@@ -454,7 +457,8 @@ cdef class Array:
 
         options = PandasOptions(
             strings_to_categorical=strings_to_categorical,
-            zero_copy_only=zero_copy_only)
+            zero_copy_only=zero_copy_only,
+            integer_object_nulls=integer_object_nulls)
         with nogil:
             check_status(ConvertArrayToPandas(options, self.sp_array,
                                               self, &out))
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index b9abf2b16..233f2cb47 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -908,6 +908,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     cdef struct PandasOptions:
         c_bool strings_to_categorical
         c_bool zero_copy_only
+        c_bool integer_object_nulls
 
 cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil:
 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 178df5767..c27c0edd9 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -273,7 +273,8 @@ cdef class Column:
 
     def to_pandas(self,
                   c_bool strings_to_categorical=False,
-                  c_bool zero_copy_only=False):
+                  c_bool zero_copy_only=False,
+                  c_bool integer_object_nulls=False):
         """
         Convert the arrow::Column to a pandas.Series
 
@@ -287,7 +288,8 @@ cdef class Column:
 
         options = PandasOptions(
             strings_to_categorical=strings_to_categorical,
-            zero_copy_only=zero_copy_only)
+            zero_copy_only=zero_copy_only,
+            integer_object_nulls=integer_object_nulls)
 
         with nogil:
             check_status(libarrow.ConvertColumnToPandas(options,
@@ -1017,7 +1019,8 @@ cdef class Table:
         return result
 
     def to_pandas(self, nthreads=None, strings_to_categorical=False,
-                  memory_pool=None, zero_copy_only=False, categories=None):
+                  memory_pool=None, zero_copy_only=False, categories=None,
+                  integer_object_nulls=False):
         """
         Convert the arrow::Table to a pandas DataFrame
 
@@ -1036,6 +1039,8 @@ cdef class Table:
             the underlying data
         categories: list, default empty
             List of columns that should be returned as pandas.Categorical
+        integer_object_nulls : boolean, default False
+            Cast integers with nulls to objects
 
         Returns
         -------
@@ -1046,7 +1051,8 @@ cdef class Table:
 
         options = PandasOptions(
             strings_to_categorical=strings_to_categorical,
-            zero_copy_only=zero_copy_only)
+            zero_copy_only=zero_copy_only,
+            integer_object_nulls=integer_object_nulls)
         self._check_nullptr()
         if nthreads is None:
             nthreads = cpu_count()
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 986aeffca..a25337639 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -634,6 +634,51 @@ def _check_type(t):
         _check_type(pa.float64())
 
 
+@pytest.mark.parametrize('dtype',
+                         ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
+def test_array_integer_object_nulls_option(dtype):
+    num_values = 100
+
+    null_mask = np.random.randint(0, 10, size=num_values) < 3
+    values = np.random.randint(0, 100, size=num_values, dtype=dtype)
+
+    array = pa.array(values, mask=null_mask)
+
+    if null_mask.any():
+        expected = values.astype('O')
+        expected[null_mask] = None
+    else:
+        expected = values
+
+    result = array.to_pandas(integer_object_nulls=True)
+
+    np.testing.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize('dtype',
+                         ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
+def test_table_integer_object_nulls_option(dtype):
+    num_values = 100
+
+    null_mask = np.random.randint(0, 10, size=num_values) < 3
+    values = np.random.randint(0, 100, size=num_values, dtype=dtype)
+
+    array = pa.array(values, mask=null_mask)
+
+    if null_mask.any():
+        expected = values.astype('O')
+        expected[null_mask] = None
+    else:
+        expected = values
+
+    expected = pd.DataFrame({dtype: expected})
+
+    table = pa.Table.from_arrays([array], [dtype])
+    result = table.to_pandas(integer_object_nulls=True)
+
+    tm.assert_frame_equal(result, expected)
+
+
 class TestConvertDateTimeLikeTypes(object):
     """
     Conversion tests for datetime- and timestamp-like types (date64, etc.).


 



> [Python] Option for integer object nulls
> ----------------------------------------
>
>                 Key: ARROW-2205
>                 URL: https://issues.apache.org/jira/browse/ARROW-2205
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: C++, Python
>    Affects Versions: 0.8.0
>            Reporter: Albert Shieh
>            Assignee: Albert Shieh
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> I have a use case where the loss of precision in casting integers to floats 
> matters, and pandas supports storing integers with nulls without loss of 
> precision in object columns. However, a roundtrip through arrow will cast the 
> object columns to float columns, even though the object columns are stored in 
> arrow as integers with nulls.
> This is a minimal example demonstrating the behavior of a roundtrip:
> {code}
> import numpy as np
> import pandas as pd
> import pyarrow as pa
> df = pd.DataFrame({"a": np.array([None, 1], dtype=object)})
> df_pa = pa.Table.from_pandas(df).to_pandas()
> print(df)
> print(df_pa)
> {code}
> The output is:
> {code}
>       a
> 0  None
> 1     1
>      a
> 0  NaN
> 1  1.0
> {code}
> This seems to be the desired behavior, given test_int_object_nulls in 
> test_convert_pandas.
> I think it would be useful to add an option in the to_pandas methods to allow 
> integers with nulls to be returned as object columns. The option can default 
> to false in order to preserve the current behavior.
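
A minimal sketch of how the option added by this PR would be used (the
{{integer_object_nulls}} parameter introduced in the diff above; the expected
output follows the behavior exercised by the new tests in
test_convert_pandas.py):

{code}
import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": np.array([None, 1], dtype=object)})

# Default behavior: integers with nulls round-trip as float64, with NaN for nulls.
print(pa.Table.from_pandas(df).to_pandas())

# With integer_object_nulls=True, the column comes back as dtype=object,
# preserving exact integer values and using None for nulls.
print(pa.Table.from_pandas(df).to_pandas(integer_object_nulls=True))
{code}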


