[jira] [Commented] (ARROW-1689) [Python] Categorical Indices Should Be Zero-Copy

ASF GitHub Bot (JIRA) Thu, 26 Oct 2017 06:59:24 -0700

    [ 
https://issues.apache.org/jira/browse/ARROW-1689?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16220457#comment-16220457
 ]


ASF GitHub Bot commented on ARROW-1689:
---------------------------------------

wesm closed pull request #1233: ARROW-1689: [Python] Allow user to request no 
data copies
URL: https://github.com/apache/arrow/pull/1233
 
 
   

This PR was merged from a forked repository. Because GitHub does not show
the original diff of a fork-based pull request once it has been merged, the
diff is reproduced below for the sake of provenance:

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc 
b/cpp/src/arrow/python/arrow_to_pandas.cc
index 88b594cac..0c2e0ad85 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -1040,6 +1040,8 @@ class CategoricalBlock : public PandasBlock {
     return Status::OK();
   }
 
+  PyObject* dictionary() const { return dictionary_.obj(); }
+
  protected:
   MemoryPool* pool_;
   OwnedRef dictionary_;
@@ -1399,6 +1401,11 @@ class ArrowDeserializer {
 
     if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != 
nullptr) {
       return ConvertValuesZeroCopy<TYPE>(options_, npy_type, data_.chunk(0));
+    } else if (options_.zero_copy_only) {
+      std::stringstream ss;
+      ss << "Needed to copy " << data_.num_chunks() << " chunks with "
+         << data_.null_count() << " nulls, but zero_copy_only was True";
+      return Status::Invalid(ss.str());
     }
 
     RETURN_NOT_OK(AllocateOutput(npy_type));
@@ -1413,6 +1420,10 @@ class ArrowDeserializer {
                               std::is_base_of<TimestampType, Type>::value,
                           Status>::type
   Visit(const Type& type) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("Copy Needed, but zero_copy_only was True");
+    }
+
     constexpr int TYPE = Type::type_id;
     using traits = internal::arrow_traits<TYPE>;
     using c_type = typename Type::c_type;
@@ -1453,6 +1464,11 @@ class ArrowDeserializer {
 
     if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != 
nullptr) {
       return ConvertValuesZeroCopy<TYPE>(options_, traits::npy_type, 
data_.chunk(0));
+    } else if (options_.zero_copy_only) {
+      std::stringstream ss;
+      ss << "Needed to copy " << data_.num_chunks() << " chunks with "
+         << data_.null_count() << " nulls, but zero_copy_only was True";
+      return Status::Invalid(ss.str());
     }
 
     if (data_.null_count() > 0) {
@@ -1470,6 +1486,9 @@ class ArrowDeserializer {
 
   template <typename FUNCTOR>
   inline Status VisitObjects(FUNCTOR func) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("Object types need copies, but zero_copy_only was 
True");
+    }
     RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
     auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
     return func(options_, data_, out_values);
@@ -1499,7 +1518,9 @@ class ArrowDeserializer {
 
   // Boolean specialization
   Status Visit(const BooleanType& type) {
-    if (data_.null_count() > 0) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("BooleanType needs copies, but zero_copy_only was 
True");
+    } else if (data_.null_count() > 0) {
       return VisitObjects(ConvertBooleanWithNulls);
     } else {
       
RETURN_NOT_OK(AllocateOutput(internal::arrow_traits<Type::BOOL>::npy_type));
@@ -1510,6 +1531,9 @@ class ArrowDeserializer {
   }
 
   Status Visit(const ListType& type) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("ListType needs copies, but zero_copy_only was 
True");
+    }
 #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \
   case Type::ArrowEnum:                                    \
     return ConvertListsLike<ArrowType>(options_, col_, out_values);
@@ -1542,26 +1566,19 @@ class ArrowDeserializer {
   }
 
   Status Visit(const DictionaryType& type) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("DictionaryType needs copies, but zero_copy_only 
was True");
+    }
+
     auto block = std::make_shared<CategoricalBlock>(options_, nullptr, 
col_->length());
     RETURN_NOT_OK(block->Write(col_, 0, 0));
 
-    auto dict_type = static_cast<const DictionaryType*>(col_->type().get());
-
     PyAcquireGIL lock;
     result_ = PyDict_New();
     RETURN_IF_PYERROR();
 
-    PyObject* dictionary;
-
-    // Release GIL before calling ConvertArrayToPandas, will be reacquired
-    // there if needed
-    lock.release();
-    RETURN_NOT_OK(
-        ConvertArrayToPandas(options_, dict_type->dictionary(), nullptr, 
&dictionary));
-    lock.acquire();
-
     PyDict_SetItemString(result_, "indices", block->block_arr());
-    PyDict_SetItemString(result_, "dictionary", dictionary);
+    PyDict_SetItemString(result_, "dictionary", block->dictionary());
 
     return Status::OK();
   }
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h 
b/cpp/src/arrow/python/arrow_to_pandas.h
index 1d716a5c9..1e4864637 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -41,6 +41,9 @@ namespace py {
 
 struct PandasOptions {
   bool strings_to_categorical;
+  bool zero_copy_only;
+
+  PandasOptions() : strings_to_categorical(false), zero_copy_only(false) {}
 };
 
 ARROW_EXPORT
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index c5f28a951..ddf7a7810 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -373,7 +373,7 @@ cdef class Array:
 
         return pyarrow_wrap_array(result)
 
-    def to_pandas(self, c_bool strings_to_categorical=False):
+    def to_pandas(self, c_bool strings_to_categorical=False, 
zero_copy_only=False):
         """
         Convert to an array object suitable for use in pandas
 
@@ -381,6 +381,9 @@ cdef class Array:
         ----------
         strings_to_categorical : boolean, default False
             Encode string (UTF8) and binary types to pandas.Categorical
+        zero_copy_only : boolean, default False
+            Raise an ArrowException if this function call would require copying
+            the underlying data
 
         See also
         --------
@@ -392,7 +395,9 @@ cdef class Array:
             PyObject* out
             PandasOptions options
 
-        options = PandasOptions(strings_to_categorical=strings_to_categorical)
+        options = PandasOptions(
+            strings_to_categorical=strings_to_categorical,
+            zero_copy_only=zero_copy_only)
         with nogil:
             check_status(ConvertArrayToPandas(options, self.sp_array,
                                               self, &out))
diff --git a/python/pyarrow/includes/libarrow.pxd 
b/python/pyarrow/includes/libarrow.pxd
index 60aa4d694..58dec7367 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -812,6 +812,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" 
nogil:
 
     cdef struct PandasOptions:
         c_bool strings_to_categorical
+        c_bool zero_copy_only
 
 cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil:
 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index dd42cf231..d4bd4dee0 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -159,7 +159,7 @@ cdef class Column:
         sp_column.reset(new CColumn(boxed_field.sp_field, arr.sp_array))
         return pyarrow_wrap_column(sp_column)
 
-    def to_pandas(self, strings_to_categorical=False):
+    def to_pandas(self, strings_to_categorical=False, zero_copy_only=False):
         """
         Convert the arrow::Column to a pandas.Series
 
@@ -171,7 +171,9 @@ cdef class Column:
             PyObject* out
             PandasOptions options
 
-        options = PandasOptions(strings_to_categorical=strings_to_categorical)
+        options = PandasOptions(
+            strings_to_categorical=strings_to_categorical,
+            zero_copy_only=zero_copy_only)
 
         with nogil:
             check_status(libarrow.ConvertColumnToPandas(options,
@@ -853,7 +855,7 @@ cdef class Table:
         return pyarrow_wrap_table(c_table)
 
     def to_pandas(self, nthreads=None, strings_to_categorical=False,
-                  memory_pool=None):
+                  memory_pool=None, zero_copy_only=False):
         """
         Convert the arrow::Table to a pandas DataFrame
 
@@ -867,6 +869,9 @@ cdef class Table:
             Encode string (UTF8) and binary types to pandas.Categorical
         memory_pool: MemoryPool, optional
             Specific memory pool to use to allocate casted columns
+        zero_copy_only : boolean, default False
+            Raise an ArrowException if this function call would require copying
+            the underlying data
 
         Returns
         -------
@@ -874,7 +879,9 @@ cdef class Table:
         """
         cdef:
             PandasOptions options
-        options = PandasOptions(strings_to_categorical=strings_to_categorical)
+        options = PandasOptions(
+            strings_to_categorical=strings_to_categorical,
+            zero_copy_only=zero_copy_only)
         self._check_nullptr()
         if nthreads is None:
             nthreads = cpu_count()
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 41ad20102..9abba646a 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -212,6 +212,46 @@ def test_float_no_nulls(self):
         schema = pa.schema(fields)
         self._check_pandas_roundtrip(df, expected_schema=schema)
 
+    def test_zero_copy_success(self):
+        result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
+        npt.assert_array_equal(result, [0, 1, 2])
+
+    def test_zero_copy_failure_on_object_types(self):
+        with self.assertRaises(pa.ArrowException):
+            pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_with_int_when_nulls(self):
+        with self.assertRaises(pa.ArrowException):
+            pa.array([0, 1, None]).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_with_float_when_nulls(self):
+        with self.assertRaises(pa.ArrowException):
+            pa.array([0.0, 1.0, None]).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_on_bool_types(self):
+        with self.assertRaises(pa.ArrowException):
+            pa.array([True, False]).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_on_list_types(self):
+        arr = np.array([[1, 2], [8, 9]], dtype=object)
+
+        with self.assertRaises(pa.ArrowException):
+            pa.array(arr).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_on_timestamp_types(self):
+        arr = np.array(['2007-07-13'], dtype='datetime64[ns]')
+
+        with self.assertRaises(pa.ArrowException):
+            pa.array(arr).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_dictionaries(self):
+        arr = pa.DictionaryArray.from_arrays(
+            np.array([0, 0]),
+            np.array(['A']))
+
+        with self.assertRaises(pa.ArrowException):
+            arr.to_pandas(zero_copy_only=True)
+
     def test_float_nulls(self):
         num_values = 100
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [Python] Categorical Indices Should Be Zero-Copy
> ------------------------------------------------
>
>                 Key: ARROW-1689
>                 URL: https://issues.apache.org/jira/browse/ARROW-1689
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>    Affects Versions: 0.7.1
>            Reporter: Nick White
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> It seems like 
> [WriteIndices|https://github.com/apache/arrow/blob/0c8b861f93884f2868eb631d8fceee3a8b8905ec/cpp/src/arrow/python/arrow_to_pandas.cc#L955-L981]
>  could reuse some of the logic in 
> [ConvertValuesZeroCopy|https://github.com/apache/arrow/blob/0c8b861f93884f2868eb631d8fceee3a8b8905ec/cpp/src/arrow/python/arrow_to_pandas.cc#L1348-L1385]
>  to avoid copying the integer indices array?



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (ARROW-1689) [Python] Categorical Indices Should Be Zero-Copy

Reply via email to