jorisvandenbossche commented on issue #43604:
URL: https://github.com/apache/arrow/issues/43604#issuecomment-2286266576
Further simplification (no need to also write the data out; just reading a
subset is enough to trigger the segfault):
Generate some data:
```python
import pyarrow as pa
from pyarrow import csv
from pyarrow.tests.util import rands
size = 500_000
random_strings = [rands(10) for _ in range(size//100)]*100
table = pa.table({"col1": range(size), "col2": random_strings})
with pa.CompressedOutputStream("data.csv.bz2", "bz2") as out:
csv.write_csv(table, out)
```
Read part of the file:
```python
import pyarrow as pa
from pyarrow import csv
input_file = "data.csv.bz2"
output_file = "test_out.csv"
num_chunks = 2
schema = pa.schema([('col1', pa.string()), ('col2', pa.string())])
convertopt = csv.ConvertOptions(column_types=schema)
with csv.open_csv(input_file, convert_options=convertopt) as reader:
cnt = 0
while cnt < num_chunks or num_chunks == 0:
try:
batch = reader.read_next_batch()
cnt += 1
print("Finished reading chunk ", cnt)
except StopIteration:
break
print("Done. Closing file.")
```
Backtrace from running the above with gdb:
<details>
```
Finished reading chunk 1
Finished reading chunk 2
Done. Closing file.
[Thread 0x7fff49c8f640 (LWP 1033591) exited]
Thread 22 "python" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fff4a7ff640 (LWP 1033590)]
0x00007ffff604f87d in
arrow::internal::Executor::Spawn<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
arrow::Future<std::shared_ptr<arrow::Buffer> >,
arrow::Result<std::shared_ptr<arrow::Buffer> >
>(arrow::Future<std::shared_ptr<arrow::Buffer> >,
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)::{lambda()#1}>(arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
arrow::Future<std::shared_ptr<arrow::Buffer> >,
arrow::Result<std::shared_ptr<arrow::Buffer> >
>(arrow::Future<std::shared_ptr<arrow::Buffer> >,
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)::{lambda()#1}&&) (this=this@entry=0x5555560eec40, func=...)
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:85
85 return SpawnReal(TaskHints{}, std::forward<Function>(func),
StopToken::Unstoppable(),
(gdb) bt
#0 0x00007ffff604f87d in
arrow::internal::Executor::Spawn<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
arrow::Future<std::shared_ptr<arrow::Buffer> >,
arrow::Result<std::shared_ptr<arrow::Buffer> >
>(arrow::Future<std::shared_ptr<arrow::Buffer> >,
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)::{lambda()#1}>(arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
arrow::Future<std::shared_ptr<arrow::Buffer> >,
arrow::Result<std::shared_ptr<arrow::Buffer> >
>(arrow::Future<std::shared_ptr<arrow::Buffer> >,
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)::{lambda()#1}&&) (this=this@entry=0x5555560eec40, func=...)
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:85
#1 0x00007ffff604fb83 in
arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
arrow::Future<std::shared_ptr<arrow::Buffer> >,
arrow::Result<std::shared_ptr<arrow::Buffer> >
>(arrow::Future<std::shared_ptr<arrow::Buffer> >,
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)#2}::operator()(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)
(result=..., __closure=0x7fff3c006e88)
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.h:247
#2 arrow::Future<std::shared_ptr<arrow::Buffer>
>::WrapResultOnComplete::Callback<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
arrow::Future<std::shared_ptr<arrow::Buffer> >,
arrow::Result<std::shared_ptr<arrow::Buffer> >
>(arrow::Future<std::shared_ptr<arrow::Buffer> >,
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> >
const&)#2}>::operator()(arrow::FutureImpl const&) && (
impl=..., this=0x7fff3c006e88) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:442
#3 arrow::internal::FnOnce<void (arrow::FutureImpl
const&)>::FnImpl<arrow::Future<std::shared_ptr<arrow::Buffer>
>::WrapResultOnComplete::Callback<arrow::internal::Executor::DoTransfer<std::shared_ptr<arrow::Buffer>,
arrow::Future<std::shared_ptr<arrow::Buffer> >,
arrow::Result<std::shared_ptr<arrow::Buffer> >
>(arrow::Future<std::shared_ptr<arrow::Buffer> >,
bool)::{lambda(arrow::Result<std::shared_ptr<arrow::Buffer> > const&)#2}>
>::invoke(arrow::FutureImpl const&) (this=0x7fff3c006e80, a#0=...) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/functional.h:152
#4 0x00007ffff6283c01 in arrow::internal::FnOnce<void (arrow::FutureImpl
const&)>::operator()(arrow::FutureImpl const&) && (a#0=..., this=0x7fff3c006170)
at
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/unique_ptr.h:191
#5 arrow::ConcreteFutureImpl::RunOrScheduleCallback
(in_add_callback=<optimized out>, callback_record=..., self=...) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:110
#6 arrow::ConcreteFutureImpl::RunOrScheduleCallback (self=...,
callback_record=..., in_add_callback=<optimized out>) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:100
#7 0x00007ffff6283fd5 in arrow::ConcreteFutureImpl::DoMarkFinishedOrFailed
(this=<optimized out>, state=<optimized out>) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:148
#8 0x00007ffff628428a in arrow::ConcreteFutureImpl::DoMarkFinished
(this=<optimized out>) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.cc:39
#9 0x00007ffff604ed8c in arrow::Future<std::shared_ptr<arrow::Buffer>
>::DoMarkFinished (res=arrow::Result<std::shared_ptr<arrow::Buffer>>(...),
this=0x7fff4a7fecb0)
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:658
#10 arrow::Future<std::shared_ptr<arrow::Buffer> >::MarkFinished
(this=this@entry=0x7fff4a7fecb0,
res=arrow::Result<std::shared_ptr<arrow::Buffer>>(...))
at /home/joris/scipy/repos/arrow/cpp/src/arrow/util/future.h:403
#11 0x00007ffff60563d6 in
arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer> >::WorkerTask
(state=...) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/async_generator.h:1797
#12 0x00007ffff605686c in
arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer>
>::State::DoRestartTask(std::shared_ptr<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer>
>::State>, arrow::util::Mutex::Guard)::{lambda()#1}::operator()() const
(__closure=<optimized out>) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/async_generator.h:1666
#13 arrow::internal::FnOnce<void
()>::FnImpl<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer>
>::State::DoRestartTask(std::shared_ptr<arrow::BackgroundGenerator<std::shared_ptr<arrow::Buffer>
>::State>, arrow::util::Mutex::Guard)::{lambda()#1}>::invoke()
(this=<optimized out>) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/functional.h:152
#14 0x00007ffff62c568c in arrow::internal::FnOnce<void ()>::operator()() &&
(this=0x7fff4a7fedc0) at
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/unique_ptr.h:191
#15 arrow::internal::WorkerLoop (it=..., state=...) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.cc:457
#16 operator() (__closure=<optimized out>) at
/home/joris/scipy/repos/arrow/cpp/src/arrow/util/thread_pool.cc:618
#17 std::__invoke_impl<void,
arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > (__f=...)
at
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/invoke.h:61
#18
std::__invoke<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
> (__fn=...) at
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/invoke.h:96
#19
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
> >::_M_invoke<0> (this=<optimized out>)
at
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:279
#20
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
> >::operator() (this=<optimized out>)
at
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:286
#21
std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
> > >::_M_run(void) (this=<optimized out>)
at
/home/joris/conda/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/12.3.0/bits/std_thread.h:231
#22 0x00007ffff4cf0e95 in std::execute_native_thread_routine (__p=<optimized
out>) at ../../../../../libstdc++-v3/src/c++11/thread.cc:104
#23 0x00007ffff7c94ac3 in start_thread (arg=<optimized out>) at
./nptl/pthread_create.c:442
#24 0x00007ffff7d26850 in clone3 () at
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
```
</details>
The backtrace points to the thread pool / task spawning code (I am not very
familiar with this part).
cc @pitrou
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]