jorisvandenbossche commented on issue #43604: URL: https://github.com/apache/arrow/issues/43604#issuecomment-2286103529
@jpfeuffer I was able to reproduce the segfault with the big file you provided. Running with gdb gives the following backtrace: <details> ``` Finished chunk 1 Finished chunk 2 Finished chunk 3 Finished chunk 4 Done. Closing file. [Thread 0x7fff49c8f640 (LWP 911090) exited] Thread 22 "python" received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7fff4a7ff640 (LWP 911089)] 0x00007ffff604f87d in arrow::internal::Executor::Spawn<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>, arrow::Future<std::shared_ptr<arrow::Buffer> >, arrow::Result<std::shared_ptr<arrow::Buffer> > >(arrow::Future<std::shared_ptr<arrow::Buffer> >, bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)::{lambda()#1}>(arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>, arrow::Future<std::shared_ptr<arrow::Buffer> >, arrow::Result<std::shared_ptr<arrow::Buffer> > >(arrow::Future<std::shared_ptr<arrow::Buffer> >, bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)::{lambda()#1}&&) (this=this@entry=0x5555560f7a70, func=...) 
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:85 85 return SpawnReal(TaskHints{}, std::forward<Function>(func), StopToken::Unstoppable(), (gdb) bt #0 0x00007ffff604f87d in arrow::internal::Executor::Spawn<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>, arrow::Future<std::shared_ptr<arrow::Buffer> >, arrow::Result<std::shared_ptr<arrow::Buffer> > >(arrow::Future<std::shared_ptr<arrow::Buffer> >, bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)::{lambda()#1}>(arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>, arrow::Future<std::shared_ptr<arrow::Buffer> >, arrow::Result<std::shared_ptr<arrow::Buffer> > >(arrow::Future<std::shared_ptr<arrow::Buffer> >, bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)::{lambda()#1}&&) (this=this@entry=0x5555560f7a70, func=...) 
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:85 #1 0x00007ffff604fb83 in arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>, arrow::Future<std::shared_ptr<arrow::Buffer> >, arrow::Result<std::shared_ptr<arrow::Buffer> > >(arrow::Future<std::shared_ptr<arrow::Buffer> >, bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > const&) (result=..., __closure=0x7fff3c008778) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:247 #2 arrow::Future<std::shared_ptr<arrow::Buffer> >::WrapResultOnComplete::Callback<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>, arrow::Future<std::shared_ptr<arrow::Buffer> >, arrow::Result<std::shared_ptr<arrow::Buffer> > >(arrow::Future<std::shared_ptr<arrow::Buffer> >, bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}>::operator()(arrow::FutureImpl const&) && ( impl=..., this=0x7fff3c008778) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:442 #3 arrow::internal::FnOnce<void (arrow::FutureImpl const&)>::FnImpl<arrow::Future<std::shared_ptr<arrow::Buffer> >::WrapResultOnComplete::Callback<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>, arrow::Future<std::shared_ptr<arrow::Buffer> >, arrow::Result<std::shared_ptr<arrow::Buffer> > >(arrow::Future<std::shared_ptr<arrow::Buffer> >, bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}> >::invoke(arrow::FutureImpl const&) (this=0x7fff3c008770, a#0=...) 
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/functional.h:152 #4 0x00007ffff6283c01 in arrow::internal::FnOnce<void (arrow::FutureImpl const&)>::operator()(arrow::FutureImpl const&) && (a#0=..., this=0x7fff3c009980) at /home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/unique_ptr.h:191 #5 arrow::ConcreteFutureImpl::RunOrScheduleCallback (in_add_callback=<optimized out>, callback_record=..., self=...) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:110 #6 arrow::ConcreteFutureImpl::RunOrScheduleCallback (self=..., callback_record=..., in_add_callback=<optimized out>) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:100 #7 0x00007ffff6283fd5 in arrow::ConcreteFutureImpl::DoMarkFinishedOrFailed (this=<optimized out>, state=<optimized out>) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:148 #8 0x00007ffff628428a in arrow::ConcreteFutureImpl::DoMarkFinished (this=<optimized out>) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:39 #9 0x00007ffff604ed8c in arrow::Future<std::shared_ptr<arrow::Buffer> >::DoMarkFinished (res=arrow::Result<std::shared_ptr<arrow::Buffer>>(...), this=0x7fff4a7fecb0) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:658 #10 arrow::Future<std::shared_ptr<arrow::Buffer> >::MarkFinished (this=this@entry=0x7fff4a7fecb0, res=arrow::Result<std::shared_ptr<arrow::Buffer>>(...)) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:403 #11 0x00007ffff60563d6 in arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> >::WorkerTask (state=...) 
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/async_generator.h:1797 #12 0x00007ffff605686c in arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> >::State::DoRestartTask(std::shared_ptr<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> >::State>, arrow::util::Mutex::Guard)::{lambda()#1}::operator()() const (__closure=<optimized out>) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/async_generator.h:1666 #13 arrow::internal::FnOnce<void ()>::FnImpl<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> >::State::DoRestartTask(std::shared_ptr<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> >::State>, arrow::util::Mutex::Guard)::{lambda()#1}>::invoke() (this=<optimized out>) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/functional.h:152 #14 0x00007ffff62c568c in arrow::internal::FnOnce<void ()>::operator()() && (this=0x7fff4a7fedc0) at /home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/unique_ptr.h:191 #15 arrow::internal::WorkerLoop (it=..., state=...) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.cc:457 #16 operator() (__closure=<optimized out>) at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.cc:618 #17 std::__invoke_impl<void, arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > (__f=...) at /home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/invoke.h:61 #18 std::__invoke<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > (__fn=...) 
at /home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/invoke.h:96 #19 std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > >::_M_invoke<0> (this=<optimized out>) at /home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:279 #20 std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > >::operator() (this=<optimized out>) at /home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:286 #21 std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > > >::_M_run(void) (this=<optimized out>) at /home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:231 #22 0x00007ffff4cf0e95 in std::execute_native_thread_routine (__p=<optimized out>) at ../../../../../libstdc++-v3/src/c++11/thread.cc:104 #23 0x00007ffff7c94ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442 #24 0x00007ffff7d26850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 ``` </details> If I unpack the archive, and read the csv file directly (so without the automatic decompression), it doesn't segfault. 
Based on that, I tried to create a simpler reproducer with some generated data:

```python
import pyarrow as pa
from pyarrow import csv
from pyarrow.tests.util import rands

batch_size = 10_000
random_strings = [rands(10) for _ in range(batch_size)]
table = pa.table({"col1":range(batch_size), "col2": random_strings})

with pa.CompressedOutputStream("data.csv.bz2", "bz2") as out:
    with csv.CSVWriter(out, table.schema) as writer:
        i = 0
        for _ in range(50):
            table = pa.table({"col1":range(i, i+batch_size), "col2": random_strings})
            writer.write_table(table)
            i += batch_size
```

Reading this compressed file with a script similar to the one you used for your original file:

```python
import pyarrow as pa
from pyarrow import csv

input_file = "data.csv.bz2"
output_file = "test_out.csv"
num_chunks = 2

schema = pa.schema([
    ('col1', pa.string()),
    ('col2', pa.string())
])
convertopt = csv.ConvertOptions(column_types=schema)
reader = csv.open_csv(input_file, convert_options=convertopt)
with csv.CSVWriter(output_file, schema=schema) as writer:
    cnt = 0
    while cnt < num_chunks or num_chunks == 0:
        try:
            writer.write_batch(reader.read_next_batch())
            cnt += 1
            print("Finished chunk ", cnt)
        except StopIteration:
            break
    print("Done. Closing file.")
```

also segfaults (similarly, if I increase `num_chunks` so that the file is read until the end, it doesn't segfault).

-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org