This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3098c14 ARROW-2052: [C++ / Python] Rework OwnedRef, remove ScopedRef
3098c14 is described below
commit 3098c1411930259070efb571fb350304b18ddc70
Author: Antoine Pitrou <[email protected]>
AuthorDate: Fri Feb 2 00:07:03 2018 -0500
ARROW-2052: [C++ / Python] Rework OwnedRef, remove ScopedRef
OwnedRef API cleaned up:
- doesn't try to take the GIL in its destructor anymore
- reset() decrefs the underlying pointer
- detach() releases ownership of the pointer without decrementing its reference count
Add OwnedRefNoGIL, which forcibly acquires the GIL in its destructor, at the
expense of runtime performance.
Author: Antoine Pitrou <[email protected]>
Closes #1534 from pitrou/ARROW-2052-unify-ownedref-scopedref and squashes
the following commits:
772c5c19 [Antoine Pitrou] ARROW-2052: [C++ / Python] Rework OwnedRef,
remove ScopedRef
---
cpp/src/arrow/python/arrow_to_pandas.cc | 6 ++--
cpp/src/arrow/python/arrow_to_python.cc | 38 ++++++++++++------------
cpp/src/arrow/python/builtin_convert.cc | 2 +-
cpp/src/arrow/python/common.h | 52 +++++++++++----------------------
cpp/src/arrow/python/numpy_to_arrow.cc | 20 ++++++-------
cpp/src/arrow/python/python_to_arrow.cc | 42 +++++++++++++-------------
6 files changed, 71 insertions(+), 89 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc
b/cpp/src/arrow/python/arrow_to_pandas.cc
index 5c8c970..fcf05f8 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -265,13 +265,13 @@ class PandasBlock {
int64_t num_rows_;
int num_columns_;
- OwnedRef block_arr_;
+ OwnedRefNoGIL block_arr_;
uint8_t* block_data_;
PandasOptions options_;
// ndarray<int32>
- OwnedRef placement_arr_;
+ OwnedRefNoGIL placement_arr_;
int64_t* placement_data_;
private:
@@ -1140,7 +1140,7 @@ class CategoricalBlock : public PandasBlock {
}
MemoryPool* pool_;
- OwnedRef dictionary_;
+ OwnedRefNoGIL dictionary_;
bool ordered_;
bool needs_copy_;
};
diff --git a/cpp/src/arrow/python/arrow_to_python.cc
b/cpp/src/arrow/python/arrow_to_python.cc
index c67e541..54a71d5 100644
--- a/cpp/src/arrow/python/arrow_to_python.cc
+++ b/cpp/src/arrow/python/arrow_to_python.cc
@@ -64,8 +64,8 @@ Status DeserializeDict(PyObject* context, const Array& array,
int64_t start_idx,
int64_t stop_idx, PyObject* base, const
SerializedPyObject& blobs,
PyObject** out) {
const auto& data = static_cast<const StructArray&>(array);
- ScopedRef keys, vals;
- ScopedRef result(PyDict_New());
+ OwnedRef keys, vals;
+ OwnedRef result(PyDict_New());
RETURN_IF_PYERROR();
DCHECK_EQ(2, data.num_fields());
@@ -77,16 +77,16 @@ Status DeserializeDict(PyObject* context, const Array&
array, int64_t start_idx,
for (int64_t i = start_idx; i < stop_idx; ++i) {
// PyDict_SetItem behaves differently from PyList_SetItem and
PyTuple_SetItem.
// The latter two steal references whereas PyDict_SetItem does not. So we
need
- // to make sure the reference count is decremented by letting the ScopedRef
+ // to make sure the reference count is decremented by letting the OwnedRef
// go out of scope at the end.
- PyDict_SetItem(result.get(), PyList_GET_ITEM(keys.get(), i - start_idx),
- PyList_GET_ITEM(vals.get(), i - start_idx));
+ PyDict_SetItem(result.obj(), PyList_GET_ITEM(keys.obj(), i - start_idx),
+ PyList_GET_ITEM(vals.obj(), i - start_idx));
}
static PyObject* py_type = PyUnicode_FromString("_pytype_");
- if (PyDict_Contains(result.get(), py_type)) {
- RETURN_NOT_OK(CallDeserializeCallback(context, result.get(), out));
+ if (PyDict_Contains(result.obj(), py_type)) {
+ RETURN_NOT_OK(CallDeserializeCallback(context, result.obj(), out));
} else {
- *out = result.release();
+ *out = result.detach();
}
return Status::OK();
}
@@ -96,10 +96,10 @@ Status DeserializeArray(const Array& array, int64_t offset,
PyObject* base,
int32_t index = static_cast<const Int32Array&>(array).Value(offset);
RETURN_NOT_OK(py::TensorToNdarray(*blobs.tensors[index], base, out));
// Mark the array as immutable
- ScopedRef flags(PyObject_GetAttrString(*out, "flags"));
- DCHECK(flags.get() != NULL) << "Could not mark Numpy array immutable";
+ OwnedRef flags(PyObject_GetAttrString(*out, "flags"));
+ DCHECK(flags.obj() != NULL) << "Could not mark Numpy array immutable";
Py_INCREF(Py_False);
- int flag_set = PyObject_SetAttrString(flags.get(), "writeable", Py_False);
+ int flag_set = PyObject_SetAttrString(flags.obj(), "writeable", Py_False);
DCHECK(flag_set == 0) << "Could not mark Numpy array immutable";
return Status::OK();
}
@@ -184,23 +184,23 @@ Status GetValue(PyObject* context, const UnionArray&
parent, const Array& arr,
#define DESERIALIZE_SEQUENCE(CREATE_FN, SET_ITEM_FN)
\
const auto& data = static_cast<const UnionArray&>(array);
\
- ScopedRef result(CREATE_FN(stop_idx - start_idx));
\
+ OwnedRef result(CREATE_FN(stop_idx - start_idx));
\
const uint8_t* type_ids = data.raw_type_ids();
\
const int32_t* value_offsets = data.raw_value_offsets();
\
for (int64_t i = start_idx; i < stop_idx; ++i) {
\
if (data.IsNull(i)) {
\
Py_INCREF(Py_None);
\
- SET_ITEM_FN(result.get(), i - start_idx, Py_None);
\
+ SET_ITEM_FN(result.obj(), i - start_idx, Py_None);
\
} else {
\
int64_t offset = value_offsets[i];
\
uint8_t type = type_ids[i];
\
PyObject* value;
\
RETURN_NOT_OK(GetValue(context, data, *data.UnsafeChild(type), offset,
type, base, \
blobs, &value));
\
- SET_ITEM_FN(result.get(), i - start_idx, value);
\
+ SET_ITEM_FN(result.obj(), i - start_idx, value);
\
}
\
}
\
- *out = result.release();
\
+ *out = result.detach();
\
return Status::OK()
Status DeserializeList(PyObject* context, const Array& array, int64_t
start_idx,
@@ -219,13 +219,13 @@ Status DeserializeSet(PyObject* context, const Array&
array, int64_t start_idx,
int64_t stop_idx, PyObject* base, const
SerializedPyObject& blobs,
PyObject** out) {
const auto& data = static_cast<const UnionArray&>(array);
- ScopedRef result(PySet_New(nullptr));
+ OwnedRef result(PySet_New(nullptr));
const uint8_t* type_ids = data.raw_type_ids();
const int32_t* value_offsets = data.raw_value_offsets();
for (int64_t i = start_idx; i < stop_idx; ++i) {
if (data.IsNull(i)) {
Py_INCREF(Py_None);
- if (PySet_Add(result.get(), Py_None) < 0) {
+ if (PySet_Add(result.obj(), Py_None) < 0) {
RETURN_IF_PYERROR();
}
} else {
@@ -234,12 +234,12 @@ Status DeserializeSet(PyObject* context, const Array&
array, int64_t start_idx,
PyObject* value;
RETURN_NOT_OK(GetValue(context, data, *data.UnsafeChild(type), offset,
type, base,
blobs, &value));
- if (PySet_Add(result.get(), value) < 0) {
+ if (PySet_Add(result.obj(), value) < 0) {
RETURN_IF_PYERROR();
}
}
}
- *out = result.release();
+ *out = result.detach();
return Status::OK();
}
diff --git a/cpp/src/arrow/python/builtin_convert.cc
b/cpp/src/arrow/python/builtin_convert.cc
index 1b3c101..1e431c2 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -931,7 +931,7 @@ static Status ConvertPySequenceReal(PyObject* obj, int64_t
size,
PyAcquireGIL lock;
PyObject* seq;
- ScopedRef tmp_seq_nanny;
+ OwnedRef tmp_seq_nanny;
std::shared_ptr<DataType> real_type;
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index 146864f..b1e0888 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -61,59 +61,30 @@ class ARROW_EXPORT PyAcquireGIL {
#define PYARROW_IS_PY2 PY_MAJOR_VERSION <= 2
+// A RAII primitive that DECREFs the underlying PyObject* when it
+// goes out of scope.
class ARROW_EXPORT OwnedRef {
public:
OwnedRef() : obj_(NULLPTR) {}
explicit OwnedRef(PyObject* obj) : obj_(obj) {}
- ~OwnedRef() {
- PyAcquireGIL lock;
- release();
- }
+ ~OwnedRef() { reset(); }
void reset(PyObject* obj) {
- /// TODO(phillipc): Should we acquire the GIL here? It definitely needs to
be
- /// acquired,
- /// but callers have probably already acquired it
Py_XDECREF(obj_);
obj_ = obj;
}
- void release() {
- Py_XDECREF(obj_);
- obj_ = NULLPTR;
- }
-
- PyObject* obj() const { return obj_; }
-
- private:
- PyObject* obj_;
-};
-
-// This is different from OwnedRef in that it assumes that
-// the GIL is held by the caller and doesn't decrement the
-// reference count when release is called.
-class ARROW_EXPORT ScopedRef {
- public:
- ScopedRef() : obj_(NULLPTR) {}
+ void reset() { reset(NULLPTR); }
- explicit ScopedRef(PyObject* obj) : obj_(obj) {}
-
- ~ScopedRef() { Py_XDECREF(obj_); }
-
- void reset(PyObject* obj) {
- Py_XDECREF(obj_);
- obj_ = obj;
- }
-
- PyObject* release() {
+ PyObject* detach() {
PyObject* result = obj_;
obj_ = NULLPTR;
return result;
}
- PyObject* get() const { return obj_; }
+ PyObject* obj() const { return obj_; }
PyObject** ref() { return &obj_; }
@@ -121,6 +92,17 @@ class ARROW_EXPORT ScopedRef {
PyObject* obj_;
};
+// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
+// This is for situations where the GIL is not always known to be held
+// (e.g. if it is released in the middle of a function for performance reasons)
+class ARROW_EXPORT OwnedRefNoGIL : public OwnedRef {
+ public:
+ ~OwnedRefNoGIL() {
+ PyAcquireGIL lock;
+ reset();
+ }
+};
+
struct ARROW_EXPORT PyObjectStringify {
OwnedRef tmp_obj;
const char* bytes;
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc
b/cpp/src/arrow/python/numpy_to_arrow.cc
index a1161fe..1b1673b 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -1116,10 +1116,10 @@ Status LoopPySequence(PyObject* sequence, T func) {
}
}
} else if (PyObject_HasAttrString(sequence, "__iter__")) {
- OwnedRef iter = OwnedRef(PyObject_GetIter(sequence));
+ OwnedRef iter(PyObject_GetIter(sequence));
PyObject* item;
while ((item = PyIter_Next(iter.obj()))) {
- OwnedRef ref = OwnedRef(item);
+ OwnedRef ref(item);
RETURN_NOT_OK(func(ref.obj()));
}
} else {
@@ -1149,11 +1149,11 @@ Status LoopPySequenceWithMasks(PyObject* sequence,
}
}
} else if (PyObject_HasAttrString(sequence, "__iter__")) {
- OwnedRef iter = OwnedRef(PyObject_GetIter(sequence));
+ OwnedRef iter(PyObject_GetIter(sequence));
PyObject* item;
int64_t i = 0;
while ((item = PyIter_Next(iter.obj()))) {
- OwnedRef ref = OwnedRef(item);
+ OwnedRef ref(item);
RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i]));
i++;
}
@@ -1476,20 +1476,20 @@ Status AppendUTF32(const char* data, int itemsize, int
byteorder,
}
}
- ScopedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length *
kNumPyUnicodeSize,
- nullptr, &byteorder));
+ OwnedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length *
kNumPyUnicodeSize,
+ nullptr, &byteorder));
RETURN_IF_PYERROR();
- ScopedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.get()));
- if (utf8_obj.get() == NULL) {
+ OwnedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.obj()));
+ if (utf8_obj.obj() == NULL) {
PyErr_Clear();
return Status::Invalid("failed converting UTF32 to UTF8");
}
- const int32_t length =
static_cast<int32_t>(PyBytes_GET_SIZE(utf8_obj.get()));
+ const int32_t length =
static_cast<int32_t>(PyBytes_GET_SIZE(utf8_obj.obj()));
if (builder->value_data_length() + length > kBinaryMemoryLimit) {
return Status::Invalid("Encoded string length exceeds maximum size (2GB)");
}
- return builder->Append(PyBytes_AS_STRING(utf8_obj.get()), length);
+ return builder->Append(PyBytes_AS_STRING(utf8_obj.obj()), length);
}
} // namespace
diff --git a/cpp/src/arrow/python/python_to_arrow.cc
b/cpp/src/arrow/python/python_to_arrow.cc
index 253e9d9..6d4f646 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -365,15 +365,15 @@ Status CallCustomCallback(PyObject* context, PyObject*
method_name, PyObject* el
*result = NULL;
if (context == Py_None) {
std::stringstream ss;
- ScopedRef repr(PyObject_Repr(elem));
+ OwnedRef repr(PyObject_Repr(elem));
RETURN_IF_PYERROR();
#if PY_MAJOR_VERSION >= 3
- ScopedRef ascii(PyUnicode_AsASCIIString(repr.get()));
+ OwnedRef ascii(PyUnicode_AsASCIIString(repr.obj()));
RETURN_IF_PYERROR();
- ss << "error while calling callback on " << PyBytes_AsString(ascii.get())
+ ss << "error while calling callback on " << PyBytes_AsString(ascii.obj())
<< ": handler not registered";
#else
- ss << "error while calling callback on " << PyString_AsString(repr.get())
+ ss << "error while calling callback on " << PyString_AsString(repr.obj())
<< ": handler not registered";
#endif
return Status::SerializationError(ss.str());
@@ -386,8 +386,8 @@ Status CallCustomCallback(PyObject* context, PyObject*
method_name, PyObject* el
Status CallSerializeCallback(PyObject* context, PyObject* value,
PyObject** serialized_object) {
- ScopedRef method_name(PyUnicode_FromString("_serialize_callback"));
- RETURN_NOT_OK(CallCustomCallback(context, method_name.get(), value,
serialized_object));
+ OwnedRef method_name(PyUnicode_FromString("_serialize_callback"));
+ RETURN_NOT_OK(CallCustomCallback(context, method_name.obj(), value,
serialized_object));
if (!PyDict_Check(*serialized_object)) {
return Status::TypeError("serialization callback must return a valid
dictionary");
}
@@ -396,8 +396,8 @@ Status CallSerializeCallback(PyObject* context, PyObject*
value,
Status CallDeserializeCallback(PyObject* context, PyObject* value,
PyObject** deserialized_object) {
- ScopedRef method_name(PyUnicode_FromString("_deserialize_callback"));
- return CallCustomCallback(context, method_name.get(), value,
deserialized_object);
+ OwnedRef method_name(PyUnicode_FromString("_deserialize_callback"));
+ return CallCustomCallback(context, method_name.obj(), value,
deserialized_object);
}
Status SerializeDict(PyObject* context, std::vector<PyObject*> dicts,
@@ -493,9 +493,9 @@ Status Append(PyObject* context, PyObject* elem,
SequenceBuilder* builder,
#if PY_MAJOR_VERSION >= 3
char* data = PyUnicode_AsUTF8AndSize(elem, &size);
#else
- ScopedRef str(PyUnicode_AsUTF8String(elem));
- char* data = PyString_AS_STRING(str.get());
- size = PyString_GET_SIZE(str.get());
+ OwnedRef str(PyUnicode_AsUTF8String(elem));
+ char* data = PyString_AS_STRING(str.obj());
+ size = PyString_GET_SIZE(str.obj());
#endif
if (size > std::numeric_limits<int32_t>::max()) {
return Status::Invalid("Cannot writes bytes over 2GB");
@@ -585,15 +585,15 @@ Status SerializeSequences(PyObject* context,
std::vector<PyObject*> sequences,
SequenceBuilder builder(nullptr);
std::vector<PyObject*> sublists, subtuples, subdicts, subsets;
for (const auto& sequence : sequences) {
- ScopedRef iterator(PyObject_GetIter(sequence));
+ OwnedRef iterator(PyObject_GetIter(sequence));
RETURN_IF_PYERROR();
- ScopedRef item;
+ OwnedRef item;
while (true) {
- item.reset(PyIter_Next(iterator.get()));
- if (!item.get()) {
+ item.reset(PyIter_Next(iterator.obj()));
+ if (!item.obj()) {
break;
}
- RETURN_NOT_OK(Append(context, item.get(), &builder, &sublists,
&subtuples,
+ RETURN_NOT_OK(Append(context, item.obj(), &builder, &sublists,
&subtuples,
&subdicts, &subsets, blobs_out));
}
}
@@ -739,18 +739,18 @@ Status SerializedPyObject::WriteTo(io::OutputStream* dst)
{
Status SerializedPyObject::GetComponents(MemoryPool* memory_pool, PyObject**
out) {
PyAcquireGIL py_gil;
- ScopedRef result(PyDict_New());
+ OwnedRef result(PyDict_New());
PyObject* buffers = PyList_New(0);
// TODO(wesm): Not sure how pedantic we need to be about checking the return
// values of these functions. There are other places where we do not check
// PyDict_SetItem/SetItemString return value, but these failures would be
// quite esoteric
- PyDict_SetItemString(result.get(), "num_tensors",
+ PyDict_SetItemString(result.obj(), "num_tensors",
PyLong_FromSize_t(this->tensors.size()));
- PyDict_SetItemString(result.get(), "num_buffers",
+ PyDict_SetItemString(result.obj(), "num_buffers",
PyLong_FromSize_t(this->buffers.size()));
- PyDict_SetItemString(result.get(), "data", buffers);
+ PyDict_SetItemString(result.obj(), "data", buffers);
RETURN_IF_PYERROR();
Py_DECREF(buffers);
@@ -792,7 +792,7 @@ Status SerializedPyObject::GetComponents(MemoryPool*
memory_pool, PyObject** out
RETURN_NOT_OK(PushBuffer(buf));
}
- *out = result.release();
+ *out = result.detach();
return Status::OK();
}
--
To stop receiving notification emails like this one, please contact
[email protected].