mrd0ll4r commented on issue #46814:
URL: https://github.com/apache/arrow/issues/46814#issuecomment-2979430949
Update: I hit this again today, on a different dataset and a different query.
Query:
```r
open_dataset("data/bluesky/labeler_logs_clean_parquet") %>%
filter(date(ts) <= date("2025-06-14")) %>%
distinct(uri, src) %>%
group_by(uri) %>%
tally() %>%
filter(n > 1) %>%
collect() %>%
pull(uri)
```
Dataset:
```r
> open_dataset("data/bluesky/labeler_logs_clean_parquet")
FileSystemDataset with 54824 Parquet files
7 columns
dom: int64
uri: string
val: string
ts: timestamp[us, tz=UTC]
src: string
year: int32
month: int32
```
This dataset is a bit smaller:
```bash
$ du -sh data/bluesky/labeler_logs_clean_parquet/
2.7G data/bluesky/labeler_logs_clean_parquet/
```
Backtrace:
```
Thread 65 "R" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7ffee5ffb6c0 (LWP 73846)]
__memcpy_evex_unaligned_erms () at
../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:273
273 ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: No such
file or directory.
(gdb) bt
#0 __memcpy_evex_unaligned_erms () at
../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:273
#1 0x00007fffeb75ef14 in memcpy (__len=32, __src=<optimized out>,
__dest=<optimized out>) at
/usr/include/x86_64-linux-gnu/bits/string_fortified.h:29
#2 arrow::BufferBuilder::UnsafeAppend (length=32, data=<optimized out>,
this=<optimized out>) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/buffer_builder.h:143
#3 arrow::TypedBufferBuilder<unsigned char, void>::UnsafeAppend
(num_elements=32, values=<optimized out>, this=<optimized out>)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/buffer_builder.h:268
#4 operator() (__closure=__closure@entry=0x7ffee5ff9950, position=2,
length=1) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:565
#5 0x00007fffeb76bb84 in
arrow::internal::VisitSetBitRuns<arrow::compute::internal::(anonymous
namespace)::BinaryFilterNonNullImpl<arrow::BinaryType>(arrow::compute::KernelContext*,
const arrow::ArraySpan&, const arrow::ArraySpan&, int64_t,
arrow::compute::FilterOptions::NullSelectionBehavior,
arrow::ArrayData*)::<lambda(int64_t, int64_t)> > (visit=..., length=<optimized
out>, offset=<optimized out>,
bitmap=<optimized out>) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/bit_run_reader.h:477
#6 arrow::compute::internal::(anonymous
namespace)::BinaryFilterNonNullImpl<arrow::BinaryType> (ctx=<optimized out>,
out=0x7ffec8146350, null_selection=arrow::compute::FilterOptions::DROP,
output_length=4020,
filter=..., values=...) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:586
#7 arrow::compute::internal::(anonymous namespace)::BinaryFilterExec
(ctx=<optimized out>, batch=..., out=<optimized out>)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:832
#8 0x00007fffeb611232 in arrow::compute::detail::(anonymous
namespace)::VectorExecutor::Exec (this=this@entry=0x7ffec80069f0, span=...,
listener=listener@entry=0x7ffee5ff9dc0)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/exec.cc:1109
#9 0x00007fffeb611a64 in arrow::compute::detail::(anonymous
namespace)::VectorExecutor::Execute (this=0x7ffec80069f0, batch=...,
listener=0x7ffee5ff9dc0)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/exec.cc:1049
#10 0x00007fffeb63233d in
arrow::compute::detail::FunctionExecutorImpl::Execute (this=0x7ffec8135d10,
args=std::vector of length 2, capacity 2 = {...}, passed_length=-1)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/function.cc:278
#11 0x00007fffeb62ed56 in arrow::compute::(anonymous
namespace)::ExecuteInternal (func=..., args=std::vector of length 2, capacity 2
= {...}, passed_length=passed_length@entry=-1,
options=options@entry=0x7ffee5ffa7c0, ctx=ctx@entry=0x7fffed4bb420
<arrow::compute::default_exec_context()::default_ctx>)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/function.cc:343
#12 0x00007fffeb62f297 in arrow::compute::Function::Execute
(this=0x55555bd3b870, args=..., options=0x7ffee5ffa7c0, ctx=0x7fffed4bb420
<arrow::compute::default_exec_context()::default_ctx>)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/function.cc:350
#13 0x00007fffeb60dc41 in arrow::compute::CallFunction
(func_name="array_filter", args=std::vector of length 2, capacity 2 = {...},
options=options@entry=0x7ffee5ffa7c0,
ctx=ctx@entry=0x7fffed4bb420
<arrow::compute::default_exec_context()::default_ctx>) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/exec.cc:1369
#14 0x00007fffeb76e071 in arrow::compute::internal::(anonymous
namespace)::FilterMetaFunction::ExecuteImpl (this=<optimized out>,
args=std::vector of length 2, capacity 2 = {...}, options=0x7ffee5ffa7c0,
ctx=0x7fffed4bb420
<arrow::compute::default_exec_context()::default_ctx>) at
/usr/include/c++/12/bits/basic_string.tcc:238
#15 0x00007fffeb62dfa7 in arrow::compute::MetaFunction::Execute
(this=0x555556c08290, args=std::vector of length 2, capacity 2 = {...},
options=0x7ffee5ffa7c0,
ctx=0x7fffed4bb420
<arrow::compute::default_exec_context()::default_ctx>) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/function.cc:483
#16 0x00007fffeb60dc41 in arrow::compute::CallFunction (func_name="filter",
args=std::vector of length 2, capacity 2 = {...},
options=options@entry=0x7ffee5ffa7c0,
ctx=0x7fffed4bb420
<arrow::compute::default_exec_context()::default_ctx>, ctx@entry=0x0) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/exec.cc:1369
#17 0x00007fffeb5e8e77 in arrow::compute::Filter (values=..., filter=...,
options=..., ctx=ctx@entry=0x0) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/api_vector.cc:412
#18 0x00007fffeb22c37b in arrow::acero::(anonymous
namespace)::FilterNode::ProcessBatch (this=<optimized out>, batch=...)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/filter_node.cc:102
#19 0x00007fffeb23e565 in arrow::acero::MapNode::InputReceived
(this=this@entry=0x55555ba9cdd0, input=input@entry=0x55555d630c30, batch=...)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/map_node.cc:76
#20 0x00007fffeb2d9744 in
arrow::acero::aggregate::GroupByNode::OutputNthBatch (this=0x55555d630c30,
n=1041) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/groupby_aggregate_node.cc:341
#21 0x00007fffeb2d9883 in operator() (task_id=<optimized out>,
__closure=<optimized out>) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/groupby_aggregate_node.cc:64
#22 std::__invoke_impl<arrow::Status,
arrow::acero::aggregate::GroupByNode::Init()::<lambda(size_t, int64_t)>&, long
unsigned int, long int> (__f=...) at /usr/include/c++/12/bits/invoke.h:61
#23 std::__invoke_r<arrow::Status,
arrow::acero::aggregate::GroupByNode::Init()::<lambda(size_t, int64_t)>&, long
unsigned int, long int> (__fn=...) at /usr/include/c++/12/bits/invoke.h:116
#24 std::_Function_handler<arrow::Status(long unsigned int, long int),
arrow::acero::aggregate::GroupByNode::Init()::<lambda(size_t, int64_t)>
>::_M_invoke(const std::_Any_data &, unsigned long &&, long &&) (
__functor=..., __args#0=<optimized out>, __args#1=<optimized out>) at
/usr/include/c++/12/bits/std_function.h:291
#25 0x00007fffeb289a7e in std::function<arrow::Status (unsigned long,
long)>::operator()(unsigned long, long) const (__args#1=<optimized out>,
__args#0=<optimized out>, this=<optimized out>)
at /usr/include/c++/12/bits/std_function.h:591
#26 arrow::acero::TaskSchedulerImpl::ExecuteTask (this=0x55556bf00b20,
thread_id=<optimized out>, group_id=<optimized out>, task_id=<optimized out>,
task_group_finished=0x7ffee5ffac16)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/task_util.cc:212
#27 0x00007fffeb28a2c1 in operator() (__closure=<optimized out>) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/task_util.cc:366
#28 operator() (thread_id=52, __closure=0x7fffc81583f0) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/task_util.cc:366
#29 std::__invoke_impl<arrow::Status,
arrow::acero::TaskSchedulerImpl::ScheduleMore(size_t, int)::<lambda(size_t)>&,
long unsigned int> (__f=...) at /usr/include/c++/12/bits/invoke.h:61
#30 std::__invoke_r<arrow::Status,
arrow::acero::TaskSchedulerImpl::ScheduleMore(size_t, int)::<lambda(size_t)>&,
long unsigned int> (__fn=...) at /usr/include/c++/12/bits/invoke.h:116
#31 std::_Function_handler<arrow::Status(long unsigned int),
arrow::acero::TaskSchedulerImpl::ScheduleMore(size_t, int)::<lambda(size_t)>
>::_M_invoke(const std::_Any_data &, unsigned long &&) (__functor=...,
__args#0=<optimized out>) at /usr/include/c++/12/bits/std_function.h:291
#32 0x00007fffeb24f034 in std::function<arrow::Status (unsigned
long)>::operator()(unsigned long) const (__args#0=<optimized out>,
this=0x7fffdc007138) at /usr/include/c++/12/bits/std_function.h:591
#33 operator() (__closure=0x7fffdc007130) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/query_context.cc:72
#34 std::__invoke_impl<arrow::Status,
arrow::acero::QueryContext::ScheduleTask(std::function<arrow::Status(long
unsigned int)>, std::string_view)::<lambda()>&> (__f=...) at
/usr/include/c++/12/bits/invoke.h:61
#35 std::__invoke_r<arrow::Status,
arrow::acero::QueryContext::ScheduleTask(std::function<arrow::Status(long
unsigned int)>, std::string_view)::<lambda()>&> (__fn=...) at
/usr/include/c++/12/bits/invoke.h:116
#36 std::_Function_handler<arrow::Status(),
arrow::acero::QueryContext::ScheduleTask(std::function<arrow::Status(long
unsigned int)>, std::string_view)::<lambda()> >::_M_invoke(const std::_Any_data
&) (
__functor=...) at /usr/include/c++/12/bits/std_function.h:291
#37 0x00007fffeb250e5f in std::function<arrow::Status ()>::operator()()
const (this=<optimized out>) at /usr/include/c++/12/bits/std_function.h:591
#38 arrow::detail::ContinueFuture::operator()<std::function<arrow::Status
()>&, , arrow::Status, arrow::Future<arrow::internal::Empty>
>(arrow::Future<arrow::internal::Empty>, std::function<arrow::Status ()>&)
const (this=<optimized out>, f=..., next=...) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/future.h:150
#39 std::__invoke_impl<void, arrow::detail::ContinueFuture&,
arrow::Future<arrow::internal::Empty>&, std::function<arrow::Status
()>&>(std::__invoke_other, arrow::detail::ContinueFuture&,
arrow::Future<arrow::internal::Empty>&, std::function<arrow::Status ()>&)
(__f=...) at /usr/include/c++/12/bits/invoke.h:61
#40 std::__invoke<arrow::detail::ContinueFuture&,
arrow::Future<arrow::internal::Empty>&, std::function<arrow::Status
()>&>(arrow::detail::ContinueFuture&, arrow::Future<arrow::internal::Empty>&,
std::function<arrow::Status ()>&) (__fn=...) at
/usr/include/c++/12/bits/invoke.h:96
#41 std::_Bind<arrow::detail::ContinueFuture
(arrow::Future<arrow::internal::Empty>, std::function<arrow::Status
()>)>::__call<void, , 0ul, 1ul>(std::tuple<>&&, std::_Index_tuple<0ul, 1ul>)
(__args=...,
this=<optimized out>) at /usr/include/c++/12/functional:484
#42 std::_Bind<arrow::detail::ContinueFuture
(arrow::Future<arrow::internal::Empty>, std::function<arrow::Status
()>)>::operator()<, void>() (this=<optimized out>) at
/usr/include/c++/12/functional:567
#43 arrow::internal::FnOnce<void
()>::FnImpl<std::_Bind<arrow::detail::ContinueFuture
(arrow::Future<arrow::internal::Empty>, std::function<arrow::Status ()>)>
>::invoke() (this=<optimized out>)
at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/functional.h:152
#44 0x00007fffec3e969f in arrow::internal::FnOnce<void ()>::operator()() &&
(this=0x7ffee5ffad30) at /usr/include/c++/12/bits/unique_ptr.h:191
#45 arrow::internal::WorkerLoop (it=..., state=...) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/thread_pool.cc:478
#46 operator() (__closure=<optimized out>) at
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/thread_pool.cc:643
#47 std::__invoke_impl<void,
arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > (__f=...)
at /usr/include/c++/12/bits/invoke.h:61
#48
std::__invoke<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
> (__fn=...) at /usr/include/c++/12/bits/invoke.h:96
#49
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
> >::_M_invoke<0> (this=<optimized out>) at
/usr/include/c++/12/bits/std_thread.h:252
#50
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
> >::operator() (this=<optimized out>) at
/usr/include/c++/12/bits/std_thread.h:259
#51
std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
> > >::_M_run(void) (this=<optimized out>)
at /usr/include/c++/12/bits/std_thread.h:210
#52 0x00007ffff4cd44a3 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#53 0x00007ffff78a81f5 in start_thread (arg=<optimized out>) at
./nptl/pthread_create.c:442
#54 0x00007ffff792889c in clone3 () at
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]