[ 
https://issues.apache.org/jira/browse/ARROW-1783?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16266519#comment-16266519
 ] 

ASF GitHub Bot commented on ARROW-1783:
---------------------------------------

pitrou commented on a change in pull request #1362: ARROW-1783: [Python] 
Provide a "component" dict representation of a serialized Python object with 
minimal allocation
URL: https://github.com/apache/arrow/pull/1362#discussion_r153133685
 
 

 ##########
 File path: cpp/src/arrow/python/python_to_arrow.cc
 ##########
 @@ -710,27 +712,88 @@ Status SerializeObject(PyObject* context, PyObject* 
sequence, SerializedPyObject
   return Status::OK();
 }
 
-Status WriteSerializedObject(const SerializedPyObject& obj, io::OutputStream* 
dst) {
-  int32_t num_tensors = static_cast<int32_t>(obj.tensors.size());
-  int32_t num_buffers = static_cast<int32_t>(obj.buffers.size());
-  RETURN_NOT_OK(dst->Write(reinterpret_cast<uint8_t*>(&num_tensors), 
sizeof(int32_t)));
-  RETURN_NOT_OK(dst->Write(reinterpret_cast<uint8_t*>(&num_buffers), 
sizeof(int32_t)));
-  RETURN_NOT_OK(ipc::WriteRecordBatchStream({obj.batch}, dst));
+Status SerializedPyObject::WriteTo(io::OutputStream* dst) {
+  int32_t num_tensors = static_cast<int32_t>(this->tensors.size());
+  int32_t num_buffers = static_cast<int32_t>(this->buffers.size());
+  RETURN_NOT_OK(
+      dst->Write(reinterpret_cast<const uint8_t*>(&num_tensors), 
sizeof(int32_t)));
+  RETURN_NOT_OK(
+      dst->Write(reinterpret_cast<const uint8_t*>(&num_buffers), 
sizeof(int32_t)));
+  RETURN_NOT_OK(ipc::WriteRecordBatchStream({this->batch}, dst));
 
   int32_t metadata_length;
   int64_t body_length;
-  for (const auto& tensor : obj.tensors) {
+  for (const auto& tensor : this->tensors) {
     RETURN_NOT_OK(ipc::WriteTensor(*tensor, dst, &metadata_length, 
&body_length));
   }
 
-  for (const auto& buffer : obj.buffers) {
+  for (const auto& buffer : this->buffers) {
     int64_t size = buffer->size();
-    RETURN_NOT_OK(dst->Write(reinterpret_cast<uint8_t*>(&size), 
sizeof(int64_t)));
+    RETURN_NOT_OK(dst->Write(reinterpret_cast<const uint8_t*>(&size), 
sizeof(int64_t)));
     RETURN_NOT_OK(dst->Write(buffer->data(), size));
   }
 
   return Status::OK();
 }
 
+Status SerializedPyObject::GetComponents(MemoryPool* memory_pool, PyObject** 
out) {
+  PyAcquireGIL py_gil;
+
+  ScopedRef result(PyDict_New());
+  PyObject* buffers = PyList_New(0);
+
+  // TODO(wesm): Not sure how pedantic we need to be about checking the return
+  // values of these functions. There are other places where we do not check
+  // PyDict_SetItem/SetItemString return value, but these failures would be
+  // quite esoteric
+  PyDict_SetItemString(result.get(), "num_tensors",
+                       PyLong_FromSize_t(this->tensors.size()));
+  PyDict_SetItemString(result.get(), "num_buffers",
+                       PyLong_FromSize_t(this->buffers.size()));
+  PyDict_SetItemString(result.get(), "data", buffers);
+  RETURN_IF_PYERROR();
+
+  Py_DECREF(buffers);
+
+  auto PushBuffer = [&buffers](const std::shared_ptr<Buffer>& buffer) {
+    PyObject* wrapped_buffer = wrap_buffer(buffer);
+    RETURN_IF_PYERROR();
+    if (PyList_Append(buffers, wrapped_buffer) < 0) {
+      RETURN_IF_PYERROR();
 
 Review comment:
   You probably need `Py_DECREF(wrapped_buffer)` here as well.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [Python] Convert SerializedPyObject to/from sequence of component buffers 
> with minimal memory allocation / copying
> ------------------------------------------------------------------------------------------------------------------
>
>                 Key: ARROW-1783
>                 URL: https://issues.apache.org/jira/browse/ARROW-1783
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: Python
>            Reporter: Wes McKinney
>            Assignee: Wes McKinney
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> See discussion on Dask org:
> https://github.com/dask/distributed/pull/931
> It would be valuable for downstream users to compute the serialized payload 
> as a sequence of memoryview-compatible objects without having to allocate new 
> memory on write. This means that the component tensor messages must have 
> their metadata and bodies in separate buffers. This will require a bit of 
> work internally to reassemble the object from a collection of 
> {{pyarrow.Buffer}} objects.
> see also ARROW-1509



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to