Hi there, I'm looking into a crash involving Arrow ListType objects when using pybind11. Any help is appreciated.
I found a similar issue here: https://www.mail-archive.com/[email protected]/msg00624.html, but it has no answer for the crash. Description: - Accessing list-type data from a pybind11-wrapped Arrow table causes a segmentation fault. However, when I build pyarrow from source, the list-type data can be accessed correctly. My environment: - macOS 10.15 - Arrow version 2.0.0 Sample code: *1. Two methods to create a table with int64 data and list<int64> data* ```cpp void check_arrow_status(arrow::Status status) { if (!status.ok()) { throw std::runtime_error(status.message()); } } // a simple example to create a table // table has only one column with int64 type pybind11::object generate_int_array(const int32_t count) { // create int64 array builder arrow::Int64Builder builder; for (auto i = 0; i < count; i++) { auto status = builder.Append(i); check_arrow_status(status); } // finish array builder std::shared_ptr<arrow::Array> array; auto status = builder.Finish(&array); check_arrow_status(status); // create table auto record_batch = arrow::RecordBatch::Make(arrow::schema({arrow::field("int_value", arrow::int64())}), count, {array}); auto table = arrow::Table::FromRecordBatches({record_batch}).ValueOrDie(); auto table_status = table->ValidateFull(); check_arrow_status(table_status); // wrap table auto result = arrow::py::import_pyarrow(); auto wrapped_table = pybind11::reinterpret_borrow<pybind11::object>(pybind11::handle(arrow::py::wrap_table(table))); return wrapped_table; } // another example to create a table // table has only one column with ListType pybind11::object generate_list_array(const int32_t count) { // create List array builder with int64 value arrow::MemoryPool *pool = arrow::default_memory_pool(); auto value_builder = make_shared<arrow::Int64Builder>(pool); auto builder = make_shared<arrow::ListBuilder>(pool, value_builder); for (auto i = 1; i <= count; i++) { auto status = builder->Append(); check_arrow_status(status); for (auto j = 0; j < i; j++) 
{ status = value_builder->Append(j); check_arrow_status(status); } } // finish array builder std::shared_ptr<arrow::Array> array; auto status = builder->Finish(&array); check_arrow_status(status); // create table auto record_batch = arrow::RecordBatch::Make(arrow::schema({arrow::field("list_int_value", arrow::list(arrow::int64()))}), count, {array}); auto table = arrow::Table::FromRecordBatches({record_batch}).ValueOrDie(); auto table_status = table->ValidateFull(); check_arrow_status(table_status); // wrap table auto result = arrow::py::import_pyarrow(); auto wrapped_table = pybind11::reinterpret_borrow<pybind11::object>(pybind11::handle(arrow::py::wrap_table(table))); return wrapped_table; } PYBIND11_MODULE(test, m) { m.def("generate_int_array", &generate_int_array); m.def("generate_list_array", &generate_list_array); } ``` *2. Access method* ```python >>> import test >>> int_table = test.generate_int_array(3) >>> int_table pyarrow.Table int_value: int64 >>> int_table.columns # int64 column can be accessed correctly [<pyarrow.lib.ChunkedArray object at 0x10c447c70> [ [ 0, 1, 2 ] ]] >>> list_table = test.generate_list_array(3) >>> list_table pyarrow.Table list_int_value: list<item: int64> child 0, item: int64 >>> list_table.columns # list column cause segmentation fault [1] 11858 segmentation fault python ``` As you can see, a table with a single int64 column can be accessed correctly, but a table with a single list<int64> column causes a segmentation fault. *3. 
Crashed Logs* ``` Thread 0 Crashed:: Dispatch queue: com.apple.main-thread 0 libarrow.200.dylib 0x0000000107e53d83 arrow::ArrayPrinter::PrintChildren(std::__1::vector<std::__1::shared_ptr<arrow::Array>, std::__1::allocator<std::__1::shared_ptr<arrow::Array> > > const&, long long, long long) + 515 1 libarrow.200.dylib 0x0000000107e4e164 arrow::ArrayPrinter::Visit(arrow::StructArray const&) + 308 2 libarrow.200.dylib 0x0000000107e4b86b arrow::Status arrow::VisitArrayInline<arrow::ArrayPrinter>(arrow::Array const&, arrow::ArrayPrinter*) + 587 3 libarrow.200.dylib 0x0000000107e49e91 arrow::PrettyPrint(arrow::ChunkedArray const&, arrow::PrettyPrintOptions const&, std::__1::basic_ostream<char, std::__1::char_traits<char> >*) + 833 4 libarrow.200.dylib 0x0000000107e4a0a0 arrow::PrettyPrint(arrow::ChunkedArray const&, arrow::PrettyPrintOptions const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*) + 208 5 lib.cpython-38-darwin.so 0x0000000107b3fe1d __pyx_pw_7pyarrow_3lib_12ChunkedArray_13to_string(_object*, _object*, _object*) + 765 6 python 0x0000000106e0c72a cfunction_call_varargs + 122 7 lib.cpython-38-darwin.so 0x0000000107a81ca2 __Pyx_PyObject_Call(_object*, _object*, _object*) + 98 8 lib.cpython-38-darwin.so 0x0000000107b3db89 __pyx_pw_7pyarrow_3lib_12ChunkedArray_17__str__(_object*) + 345 9 python 0x0000000106e4ddd2 PyObject_Str + 146 10 python 0x0000000106e9662d unicode_new + 365 11 python 0x0000000106e612dc type_call + 44 12 lib.cpython-38-darwin.so 0x0000000107a81ca2 __Pyx_PyObject_Call(_object*, _object*, _object*) + 98 13 lib.cpython-38-darwin.so 0x0000000107a82157 __Pyx_PyObject_CallOneArg(_object*, _object*) + 151 14 lib.cpython-38-darwin.so 0x0000000107b3d59b __pyx_pw_7pyarrow_3lib_12ChunkedArray_11__repr__(_object*) + 251 15 python 0x0000000106e4df3f PyObject_Repr + 127 16 python 0x0000000106e2a41b list_repr + 187 17 python 0x0000000106e4df3f PyObject_Repr + 127 18 python 0x0000000106e1fef6 PyFile_WriteObject + 70 19 
python 0x0000000106f3dc94 sys_displayhook + 180 20 python 0x0000000106e4b046 cfunction_vectorcall_O + 214 21 python 0x0000000106e0e55b object_vacall + 459 22 python 0x0000000106e0e8a8 PyObject_CallFunctionObjArgs + 152 23 python 0x0000000106ee3f20 _PyEval_EvalFrameDefault + 10464 24 python 0x0000000106eed447 _PyEval_EvalCodeWithName + 3287 25 python 0x0000000106ee15a0 PyEval_EvalCode + 48 26 python 0x0000000106f31c03 PyRun_InteractiveOneObjectEx + 707 27 python 0x0000000106f31329 PyRun_InteractiveLoopFlags + 169 28 python 0x0000000106f3124c PyRun_AnyFileExFlags + 60 29 python 0x0000000106f50b34 Py_RunMain + 2596 30 python 0x0000000106f50e73 pymain_main + 403 31 python 0x0000000106f50ecb Py_BytesMain + 43 32 libdyld.dylib 0x00007fff6a9b9cc9 start + 1 ``` *4. However, when I build pyarrow from source following the steps at https://arrow.apache.org/docs/developers/python.html <https://arrow.apache.org/docs/developers/python.html> and then use that build of pyarrow, I get correct results.* ``` >>> import test >>> list_table = test.generate_list_array(3) >>> list_table pyarrow.Table list_int_value: list<item: int64> child 0, item: int64 >>> list_table.columns >>> [<pyarrow.lib.ChunkedArray object at 0x10f15e590> [ [ [ 0 ], [ 0, 1 ], [ 0, 1, 2 ] ] ]] ``` Best Regards, Jiangtao
