Xianjin YE created ARROW-1863:
---------------------------------

             Summary: Should use PyObject_Str or PyObject_Repr in 
PyObjectStringify
                 Key: ARROW-1863
                 URL: https://issues.apache.org/jira/browse/ARROW-1863
             Project: Apache Arrow
          Issue Type: Bug
          Components: C++, Python
            Reporter: Xianjin YE


PyObjectStringify does not correctly handle objects that are neither bytes 
nor unicode. It should use PyObject_Repr (or PyObject_Str) to get a string 
representation of the PyObject.


{code:java}
// Current implementation, quoted as reported: exposes a Python object's
// contents as a (bytes, size) pair. Only unicode and bytes objects are
// converted; every other type ends up with bytes == NULLPTR and size == -1.
struct ARROW_EXPORT PyObjectStringify {
  OwnedRef tmp_obj;
  const char* bytes;
  Py_ssize_t size;

  explicit PyObjectStringify(PyObject* obj) {
    PyObject* bytes_obj;
    if (PyUnicode_Check(obj)) {
      // Encode the unicode object to UTF-8 and hand the resulting bytes
      // object to tmp_obj (presumably so it stays alive as long as this
      // struct does -- confirm against OwnedRef).
      bytes_obj = PyUnicode_AsUTF8String(obj);
      tmp_obj.reset(bytes_obj);
      bytes = PyBytes_AsString(bytes_obj);
      size = PyBytes_GET_SIZE(bytes_obj);
    } else if (PyBytes_Check(obj)) {
      // Already bytes: point at obj's own buffer; tmp_obj is not used.
      bytes = PyBytes_AsString(obj);
      size = PyBytes_GET_SIZE(obj);
    } else {
      // Neither unicode nor bytes: no conversion is attempted. This is the
      // path the report identifies as mishandled (e.g. a dict element).
      bytes = NULLPTR;
      size = -1;
    }
  }
};
{code}
It should be changed to:

{code:java}
struct ARROW_EXPORT PyObjectStringify {
  OwnedRef tmp_obj;
  const char* bytes;
  Py_ssize_t size;

  explicit PyObjectStringify(PyObject* obj) {
    PyObject* bytes_obj;
    if (PyUnicode_Check(obj)) {
      bytes_obj = PyUnicode_AsUTF8String(obj);
      tmp_obj.reset(bytes_obj);
      bytes = PyBytes_AsString(bytes_obj);
      size = PyBytes_GET_SIZE(bytes_obj);
    } else if (PyBytes_Check(obj)) {
      bytes = PyBytes_AsString(obj);
      size = PyBytes_GET_SIZE(obj);
    } else {
      bytes_obj = PyObject_Repr(obj);
      tmp_obj.reset(bytes_obj);
      bytes = PyBytes_AsString(bytes_obj);
      size = PyBytes_GET_SIZE(bytes_obj);
    }
  }
};
{code}

How does this affect pyarrow? Minimal reproduction case:

{code:java}
import pyarrow

# A list of strings with one non-string element (a dict) in the middle.
data = ['-10', '-5', {'a': 1}, '0', '5', '10']

# Building a string-typed array from the mixed list crashes the interpreter
# (see the segmentation fault output below).
arr = pyarrow.array(data, type=pyarrow.string())

[1]    64491 segmentation fault  ipython
{code}

This case was found by my colleague. I will ask him to send a PR here.

cc [~wesmckinn]



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to