Xianjin YE created ARROW-1863:
---------------------------------
Summary: Should use PyObject_Str or PyObject_Repr in
PyObjectStringify
Key: ARROW-1863
URL: https://issues.apache.org/jira/browse/ARROW-1863
Project: Apache Arrow
Issue Type: Bug
Components: C++, Python
Reporter: Xianjin YE
PyObjectStringify doesn't correctly handle objects that are neither bytes nor
unicode: for those it leaves `bytes` as NULLPTR and `size` as -1. It should
use PyObject_Repr (or PyObject_Str) to get a string representation of the
PyObject instead.
{code:java}
// Current implementation, quoted verbatim: exposes a Python object's contents
// as a (bytes, size) pair. Only unicode and bytes objects are handled; every
// other type ends up with bytes == NULLPTR and size == -1.
struct ARROW_EXPORT PyObjectStringify {
// Receives the temporary UTF-8 bytes object created in the unicode branch
// (presumably OwnedRef releases it on destruction -- confirm against OwnedRef).
OwnedRef tmp_obj;
// Pointer to the character data, or NULLPTR when obj is neither unicode nor bytes.
const char* bytes;
// Length of the data in bytes, or -1 when obj is neither unicode nor bytes.
Py_ssize_t size;
explicit PyObjectStringify(PyObject* obj) {
PyObject* bytes_obj;
if (PyUnicode_Check(obj)) {
// Encode to UTF-8; the result is not checked for NULL before being used.
bytes_obj = PyUnicode_AsUTF8String(obj);
tmp_obj.reset(bytes_obj);
bytes = PyBytes_AsString(bytes_obj);
size = PyBytes_GET_SIZE(bytes_obj);
} else if (PyBytes_Check(obj)) {
// Point directly at obj's own buffer; tmp_obj is not involved here.
bytes = PyBytes_AsString(obj);
size = PyBytes_GET_SIZE(obj);
} else {
// Unsupported type: signalled only through these sentinel values.
bytes = NULLPTR;
size = -1;
}
}
};
{code}
It should be changed to:
{code:java}
// Exposes a Python object's contents as a (bytes, size) pair.
//
//  - unicode objects are encoded to UTF-8; the encoded bytes object is handed
//    to tmp_obj.
//  - bytes objects are referenced directly; bytes points into obj's own buffer.
//  - any other object is stringified with PyObject_Repr.
//
// If encoding or repr() fails, bytes is NULLPTR and size is -1, and the Python
// error indicator set by the failing call is left in place for the caller.
struct ARROW_EXPORT PyObjectStringify {
  OwnedRef tmp_obj;
  const char* bytes;
  Py_ssize_t size;

  explicit PyObjectStringify(PyObject* obj) : bytes(NULLPTR), size(-1) {
    PyObject* bytes_obj = NULLPTR;
    if (PyUnicode_Check(obj)) {
      bytes_obj = PyUnicode_AsUTF8String(obj);
    } else if (PyBytes_Check(obj)) {
      // No temporary needed: point straight at obj's buffer.
      bytes = PyBytes_AsString(obj);
      size = PyBytes_GET_SIZE(obj);
      return;
    } else {
      PyObject* repr = PyObject_Repr(obj);
      if (repr != NULLPTR && PyUnicode_Check(repr)) {
        // Python 3: repr() yields a unicode object, which the PyBytes_* calls
        // below cannot read, so encode it to UTF-8 first.
        bytes_obj = PyUnicode_AsUTF8String(repr);
        Py_DECREF(repr);
      } else {
        // Python 2: repr() already yields a bytes (str) object; or repr is
        // NULL because repr() raised.
        bytes_obj = repr;
      }
    }

    if (bytes_obj == NULLPTR) {
      // Encoding or repr() failed; keep the NULLPTR / -1 sentinels.
      return;
    }
    tmp_obj.reset(bytes_obj);
    if (!PyBytes_Check(bytes_obj)) {
      // Defensive: never pass a non-bytes object to PyBytes_GET_SIZE.
      return;
    }
    bytes = PyBytes_AsString(bytes_obj);
    size = PyBytes_GET_SIZE(bytes_obj);
  }
};
{code}
How does this affect pyarrow? Minimal reproduction case:
{code:java}
# Minimal reproduction of the crash described in this report.
import pyarrow
# The third element is a dict, i.e. neither a str nor a bytes object.
data = ['-10', '-5', {'a': 1}, '0', '5', '10']
# Building a string-typed array from the mixed list is what triggers the
# segmentation fault shown below (presumably via PyObjectStringify -- confirm).
arr = pyarrow.array(data, type=pyarrow.string())
[1] 64491 segmentation fault ipython
{code}
This case was found by my colleague. I will ask him to send a PR here.
cc [~wesmckinn]
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)