[jira] [Commented] (ARROW-1689) [Python] Categorical Indices Should Be Zero-Copy

ASF GitHub Bot (JIRA) Sat, 28 Oct 2017 04:55:16 -0700

    [ 
https://issues.apache.org/jira/browse/ARROW-1689?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16223448#comment-16223448
 ]


ASF GitHub Bot commented on ARROW-1689:
---------------------------------------

xhochy closed pull request #1237: ARROW-1689: [Python] Implement zero-copy 
conversions for DictionaryArray
URL: https://github.com/apache/arrow/pull/1237
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 0c2e0ad85..7f1591213 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -109,6 +109,20 @@ static inline bool ListTypeSupported(const DataType& type) {
   }
   return false;
 }
+// ----------------------------------------------------------------------
+// PyCapsule code for setting ndarray base to reference C++ object
+
+struct ArrowCapsule {
+  std::shared_ptr<Array> array;
+};
+
+namespace {
+
+void ArrowCapsule_Destructor(PyObject* capsule) {
+  delete reinterpret_cast<ArrowCapsule*>(PyCapsule_GetPointer(capsule, 
"arrow"));
+}
+
+}  // namespace
 
 // ----------------------------------------------------------------------
 // pandas 0.x DataFrame conversion internals
@@ -957,23 +971,40 @@ class CategoricalBlock : public PandasBlock {
     using TRAITS = internal::arrow_traits<ARROW_INDEX_TYPE>;
     using T = typename TRAITS::T;
     constexpr int npy_type = TRAITS::npy_type;
-    RETURN_NOT_OK(AllocateNDArray(npy_type, 1));
-
-    // No relative placement offset because a single column
-    T* out_values = reinterpret_cast<T*>(block_data_);
 
     const ChunkedArray& data = *col->data().get();
 
-    for (int c = 0; c < data.num_chunks(); c++) {
-      const std::shared_ptr<Array> arr = data.chunk(c);
-      const auto& dict_arr = static_cast<const DictionaryArray&>(*arr);
+    // Sniff the first chunk
+    const std::shared_ptr<Array> arr_first = data.chunk(0);
+    const auto& dict_arr_first = static_cast<const 
DictionaryArray&>(*arr_first);
+    const auto& indices_first =
+        static_cast<const PrimitiveArray&>(*dict_arr_first.indices());
+
+    if (data.num_chunks() == 1 && indices_first.null_count() == 0) {
+      RETURN_NOT_OK(AllocateNDArrayFromIndices<T>(npy_type, indices_first));
+    } else {
+      if (options_.zero_copy_only) {
+        std::stringstream ss;
+        ss << "Needed to copy " << data.num_chunks() << " chunks with "
+           << indices_first.null_count() << " indices nulls, but 
zero_copy_only was True";
+        return Status::Invalid(ss.str());
+      }
+      RETURN_NOT_OK(AllocateNDArray(npy_type, 1));
+
+      // No relative placement offset because a single column
+      T* out_values = reinterpret_cast<T*>(block_data_);
 
-      const auto& indices = static_cast<const 
PrimitiveArray&>(*dict_arr.indices());
-      auto in_values = reinterpret_cast<const T*>(indices.raw_values());
+      for (int c = 0; c < data.num_chunks(); c++) {
+        const std::shared_ptr<Array> arr = data.chunk(c);
+        const auto& dict_arr = static_cast<const DictionaryArray&>(*arr);
 
-      // Null is -1 in CategoricalBlock
-      for (int i = 0; i < arr->length(); ++i) {
-        *out_values++ = indices.IsNull(i) ? -1 : in_values[i];
+        const auto& indices = static_cast<const 
PrimitiveArray&>(*dict_arr.indices());
+        auto in_values = reinterpret_cast<const T*>(indices.raw_values());
+
+        // Null is -1 in CategoricalBlock
+        for (int i = 0; i < arr->length(); ++i) {
+          *out_values++ = indices.IsNull(i) ? -1 : in_values[i];
+        }
       }
     }
 
@@ -1043,6 +1074,43 @@ class CategoricalBlock : public PandasBlock {
   PyObject* dictionary() const { return dictionary_.obj(); }
 
  protected:
+  template <typename T>
+  Status AllocateNDArrayFromIndices(int npy_type, const PrimitiveArray& 
indices) {
+    npy_intp block_dims[1] = {num_rows_};
+
+    auto in_values = reinterpret_cast<const T*>(indices.raw_values());
+    void* data = const_cast<T*>(in_values);
+
+    PyAcquireGIL lock;
+
+    PyArray_Descr* descr = GetSafeNumPyDtype(npy_type);
+    if (descr == nullptr) {
+      // Error occurred, trust error state is set
+      return Status::OK();
+    }
+
+    PyObject* block_arr = PyArray_NewFromDescr(&PyArray_Type, descr, 1, 
block_dims,
+                                               nullptr, data, 
NPY_ARRAY_CARRAY, nullptr);
+
+    npy_intp placement_dims[1] = {num_columns_};
+    PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
+    if (placement_arr == NULL) {
+      // TODO(wesm): propagating Python exception
+      return Status::OK();
+    }
+
+    block_arr_.reset(block_arr);
+    placement_arr_.reset(placement_arr);
+
+    block_data_ = reinterpret_cast<uint8_t*>(
+        PyArray_DATA(reinterpret_cast<PyArrayObject*>(block_arr)));
+
+    placement_data_ = reinterpret_cast<int64_t*>(
+        PyArray_DATA(reinterpret_cast<PyArrayObject*>(placement_arr)));
+
+    return Status::OK();
+  }
+
   MemoryPool* pool_;
   OwnedRef dictionary_;
   bool ordered_;
@@ -1369,12 +1437,26 @@ class ArrowDeserializer {
       return Status::OK();
     }
 
-    if (PyArray_SetBaseObject(arr_, py_ref_) == -1) {
+    PyObject* base;
+    if (py_ref_ == nullptr) {
+      ArrowCapsule* capsule = new ArrowCapsule;
+      capsule->array = arr;
+      base = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow",
+                           &ArrowCapsule_Destructor);
+      if (base == nullptr) {
+        delete capsule;
+        RETURN_IF_PYERROR();
+      }
+    } else {
+      base = py_ref_;
+    }
+
+    if (PyArray_SetBaseObject(arr_, base) == -1) {
       // Error occurred, trust that SetBaseObject set the error state
       return Status::OK();
     } else {
-      // PyArray_SetBaseObject steals our reference to py_ref_
-      Py_INCREF(py_ref_);
+      // PyArray_SetBaseObject steals our reference to base
+      Py_INCREF(base);
     }
 
     // Arrow data is immutable.
@@ -1399,7 +1481,7 @@ class ArrowDeserializer {
     typedef typename traits::T T;
     int npy_type = traits::npy_type;
 
-    if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != 
nullptr) {
+    if (data_.num_chunks() == 1 && data_.null_count() == 0) {
       return ConvertValuesZeroCopy<TYPE>(options_, npy_type, data_.chunk(0));
     } else if (options_.zero_copy_only) {
       std::stringstream ss;
@@ -1462,7 +1544,7 @@ class ArrowDeserializer {
 
     typedef typename traits::T T;
 
-    if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != 
nullptr) {
+    if (data_.num_chunks() == 1 && data_.null_count() == 0) {
       return ConvertValuesZeroCopy<TYPE>(options_, traits::npy_type, 
data_.chunk(0));
     } else if (options_.zero_copy_only) {
       std::stringstream ss;
@@ -1566,10 +1648,6 @@ class ArrowDeserializer {
   }
 
   Status Visit(const DictionaryType& type) {
-    if (options_.zero_copy_only) {
-      return Status::Invalid("DictionaryType needs copies, but zero_copy_only 
was True");
-    }
-
     auto block = std::make_shared<CategoricalBlock>(options_, nullptr, 
col_->length());
     RETURN_NOT_OK(block->Write(col_, 0, 0));
 
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 8360dae54..d00bf1b28 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -217,6 +217,17 @@ def test_zero_copy_success(self):
         result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
         npt.assert_array_equal(result, [0, 1, 2])
 
+    def test_zero_copy_dictionaries(self):
+        arr = pa.DictionaryArray.from_arrays(
+            np.array([0, 0]),
+            np.array([5]))
+
+        result = arr.to_pandas(zero_copy_only=True)
+        values = pd.Categorical([5, 5])
+
+        tm.assert_series_equal(pd.Series(result), pd.Series(values),
+                               check_names=False)
+
     def test_zero_copy_failure_on_object_types(self):
         with pytest.raises(pa.ArrowException):
             pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True)
@@ -245,14 +256,6 @@ def test_zero_copy_failure_on_timestamp_types(self):
         with pytest.raises(pa.ArrowException):
             pa.array(arr).to_pandas(zero_copy_only=True)
 
-    def test_zero_copy_dictionaries(self):
-        arr = pa.DictionaryArray.from_arrays(
-            np.array([0, 0]),
-            np.array(['A']))
-
-        with pytest.raises(pa.ArrowException):
-            arr.to_pandas(zero_copy_only=True)
-
     def test_float_nulls(self):
         num_values = 100
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Categorical Indices Should Be Zero-Copy
> ------------------------------------------------
>
>                 Key: ARROW-1689
>                 URL: https://issues.apache.org/jira/browse/ARROW-1689
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>    Affects Versions: 0.7.1
>            Reporter: Nick White
>            Assignee: Nick White
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> It seems like 
> [WriteIndices|https://github.com/apache/arrow/blob/0c8b861f93884f2868eb631d8fceee3a8b8905ec/cpp/src/arrow/python/arrow_to_pandas.cc#L955-L981]
>  could reuse some of the logic in 
> [ConvertValuesZeroCopy|https://github.com/apache/arrow/blob/0c8b861f93884f2868eb631d8fceee3a8b8905ec/cpp/src/arrow/python/arrow_to_pandas.cc#L1348-L1385]
>  to avoid copying the integer indices array?



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (ARROW-1689) [Python] Categorical Indices Should Be Zero-Copy

Reply via email to