jorisvandenbossche commented on issue #43604:
URL: https://github.com/apache/arrow/issues/43604#issuecomment-2286266576

   Further simplification (no need to also write the data out, just reading a 
subset is enough to trigger the segfault):
   
   Generate some data:
   ```python
   import pyarrow as pa
   from pyarrow import csv
   from pyarrow.tests.util import rands
   
   
   size = 500_000
   random_strings = [rands(10) for _ in range(size//100)]*100
   table = pa.table({"col1": range(size), "col2": random_strings})
   
   with pa.CompressedOutputStream("data.csv.bz2", "bz2") as out:
       csv.write_csv(table, out)
   ```
   
   Read part of the file:
   ```python
   import pyarrow as pa
   from pyarrow import csv
   
   
   input_file = "data.csv.bz2"
   output_file = "test_out.csv"
   num_chunks = 2
   
   schema = pa.schema([('col1', pa.string()), ('col2', pa.string())])
   convertopt = csv.ConvertOptions(column_types=schema)
   
   with csv.open_csv(input_file, convert_options=convertopt) as reader:
       cnt = 0
       while cnt < num_chunks or num_chunks == 0:
           try:
               batch = reader.read_next_batch()
               cnt += 1
               print("Finished reading chunk ", cnt)
           except StopIteration:
               break
   
       print("Done. Closing file.")
   ```
   
   Backtrace from running the above with gdb:
   
   
   <details>
   
   ```
   Finished reading chunk  1
   Finished reading chunk  2
   Done. Closing file.
   [Thread 0x7fff49c8f640 (LWP 1033591) exited]
   
   Thread 22 "python" received signal SIGSEGV, Segmentation fault.
   [Switching to Thread 0x7fff4a7ff640 (LWP 1033590)]
   0x00007ffff604f87d in 
arrow::internal::Executor::Spawn<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
 arrow::Future<std::shared_ptr<arrow::Buffer> >, 
arrow::Result<std::shared_ptr<arrow::Buffer> > 
>(arrow::Future<std::shared_ptr<arrow::Buffer> >, 
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)::{lambda()#1}>(arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
 arrow::Future<std::shared_ptr<arrow::Buffer> >, 
arrow::Result<std::shared_ptr<arrow::Buffer> > 
>(arrow::Future<std::shared_ptr<arrow::Buffer> >, 
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)::{lambda()#1}&&) (this=this@entry=0x5555560eec40, func=...)
       at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:85
   85       return SpawnReal(TaskHints{}, std::forward<Function>(func), 
StopToken::Unstoppable(),
   (gdb) bt
   #0  0x00007ffff604f87d in 
arrow::internal::Executor::Spawn<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
 arrow::Future<std::shared_ptr<arrow::Buffer> >, 
arrow::Result<std::shared_ptr<arrow::Buffer> > 
>(arrow::Future<std::shared_ptr<arrow::Buffer> >, 
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)::{lambda()#1}>(arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
 arrow::Future<std::shared_ptr<arrow::Buffer> >, 
arrow::Result<std::shared_ptr<arrow::Buffer> > 
>(arrow::Future<std::shared_ptr<arrow::Buffer> >, 
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)::{lambda()#1}&&) (this=this@entry=0x5555560eec40, func=...)
       at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:85
   #1  0x00007ffff604fb83 in 
arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>, 
arrow::Future<std::shared_ptr<arrow::Buffer> >, 
arrow::Result<std::shared_ptr<arrow::Buffer> > 
>(arrow::Future<std::shared_ptr<arrow::Buffer> >, 
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > const&) 
(result=..., __closure=0x7fff3c006e88)
       at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:247
   #2  arrow::Future<std::shared_ptr<arrow::Buffer> 
>::WrapResultOnComplete::Callback<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
 arrow::Future<std::shared_ptr<arrow::Buffer> >, 
arrow::Result<std::shared_ptr<arrow::Buffer> > 
>(arrow::Future<std::shared_ptr<arrow::Buffer> >, 
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > 
const&)#2}>::operator()(arrow::FutureImpl const&) && (
       impl=..., this=0x7fff3c006e88) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:442
   #3  arrow::internal::FnOnce<void (arrow::FutureImpl 
const&)>::FnImpl<arrow::Future<std::shared_ptr<arrow::Buffer> 
>::WrapResultOnComplete::Callback<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
 arrow::Future<std::shared_ptr<arrow::Buffer> >, 
arrow::Result<std::shared_ptr<arrow::Buffer> > 
>(arrow::Future<std::shared_ptr<arrow::Buffer> >, 
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}> 
>::invoke(arrow::FutureImpl const&) (this=0x7fff3c006e80, a#0=...) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/functional.h:152
   #4  0x00007ffff6283c01 in arrow::internal::FnOnce<void (arrow::FutureImpl 
const&)>::operator()(arrow::FutureImpl const&) && (a#0=..., this=0x7fff3c006170)
       at 
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/unique_ptr.h:191
   #5  arrow::ConcreteFutureImpl::RunOrScheduleCallback 
(in_add_callback=<optimized out>, callback_record=..., self=...) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:110
   #6  arrow::ConcreteFutureImpl::RunOrScheduleCallback (self=..., 
callback_record=..., in_add_callback=<optimized out>) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:100
   #7  0x00007ffff6283fd5 in arrow::ConcreteFutureImpl::DoMarkFinishedOrFailed 
(this=<optimized out>, state=<optimized out>) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:148
   #8  0x00007ffff628428a in arrow::ConcreteFutureImpl::DoMarkFinished 
(this=<optimized out>) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:39
   #9  0x00007ffff604ed8c in arrow::Future<std::shared_ptr<arrow::Buffer> 
>::DoMarkFinished (res=arrow::Result<std::shared_ptr<arrow::Buffer>>(...), 
this=0x7fff4a7fecb0)
       at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:658
   #10 arrow::Future<std::shared_ptr<arrow::Buffer> >::MarkFinished 
(this=this@entry=0x7fff4a7fecb0, 
res=arrow::Result<std::shared_ptr<arrow::Buffer>>(...))
       at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:403
   #11 0x00007ffff60563d6 in 
arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> >::WorkerTask 
(state=...) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/async_generator.h:1797
   #12 0x00007ffff605686c in 
arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> 
>::State::DoRestartTask(std::shared_ptr<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer>
 >::State>, arrow::util::Mutex::Guard)::{lambda()#1}::operator()() const 
(__closure=<optimized out>) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/async_generator.h:1666
   #13 arrow::internal::FnOnce<void 
()>::FnImpl<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> 
>::State::DoRestartTask(std::shared_ptr<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer>
 >::State>, arrow::util::Mutex::Guard)::{lambda()#1}>::invoke() 
(this=<optimized out>) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/functional.h:152
   #14 0x00007ffff62c568c in arrow::internal::FnOnce<void ()>::operator()() && 
(this=0x7fff4a7fedc0) at 
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/unique_ptr.h:191
   #15 arrow::internal::WorkerLoop (it=..., state=...) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.cc:457
   #16 operator() (__closure=<optimized out>) at 
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.cc:618
   #17 std::__invoke_impl<void, 
arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > (__f=...) 
at 
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/invoke.h:61
   #18 
std::__invoke<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
 > (__fn=...) at 
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/invoke.h:96
   #19 
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
 > >::_M_invoke<0> (this=<optimized out>)
       at 
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:279
   #20 
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
 > >::operator() (this=<optimized out>)
       at 
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:286
   #21 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
 > > >::_M_run(void) (this=<optimized out>)
       at 
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:231
   #22 0x00007ffff4cf0e95 in std::execute_native_thread_routine (__p=<optimized 
out>) at ../../../../../libstdc++-v3/src/c++11/thread.cc:104
   #23 0x00007ffff7c94ac3 in start_thread (arg=<optimized out>) at 
./nptl/pthread_create.c:442
   #24 0x00007ffff7d26850 in clone3 () at 
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
   ```
   
   </details>
   
   The backtrace points to the thread pool / task-spawning code (I am not very 
familiar with this part).
   
   cc @pitrou  


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to