mrd0ll4r commented on issue #46814:
URL: https://github.com/apache/arrow/issues/46814#issuecomment-2979430949

   Update: I hit this again today, on a different dataset and a different query.
   
   Query:
   ```r
   open_dataset("data/bluesky/labeler_logs_clean_parquet") %>%
     filter(date(ts) <= date("2025-06-14")) %>%
     distinct(uri, src) %>%
     group_by(uri) %>%
     tally() %>%
     filter(n > 1) %>%
     collect() %>%
     pull(uri)
   ```
   
   Dataset:
   ```r
   > open_dataset("data/bluesky/labeler_logs_clean_parquet")
   FileSystemDataset with 54824 Parquet files
   7 columns
   dom: int64
   uri: string
   val: string
   ts: timestamp[us, tz=UTC]
   src: string
   year: int32
   month: int32
   ```
   
   It's a bit smaller:
   ```bash
   $ du -sh data/bluesky/labeler_logs_clean_parquet/
   2.7G    data/bluesky/labeler_logs_clean_parquet/
   ```
   
   Backtrace:
   ```
   Thread 65 "R" received signal SIGSEGV, Segmentation fault.
   [Switching to Thread 0x7ffee5ffb6c0 (LWP 73846)]
   __memcpy_evex_unaligned_erms () at 
../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:273
   273     ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: No such 
file or directory.
   (gdb) bt
   #0  __memcpy_evex_unaligned_erms () at 
../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:273
   #1  0x00007fffeb75ef14 in memcpy (__len=32, __src=<optimized out>, 
__dest=<optimized out>) at 
/usr/include/x86_64-linux-gnu/bits/string_fortified.h:29
   #2  arrow::BufferBuilder::UnsafeAppend (length=32, data=<optimized out>, 
this=<optimized out>) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/buffer_builder.h:143
   #3  arrow::TypedBufferBuilder<unsigned char, void>::UnsafeAppend 
(num_elements=32, values=<optimized out>, this=<optimized out>)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/buffer_builder.h:268
   #4  operator() (__closure=__closure@entry=0x7ffee5ff9950, position=2, 
length=1) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:565
   #5  0x00007fffeb76bb84 in 
arrow::internal::VisitSetBitRuns<arrow::compute::internal::(anonymous 
namespace)::BinaryFilterNonNullImpl<arrow::BinaryType>(arrow::compute::KernelContext*,
 const arrow::ArraySpan&, const arrow::ArraySpan&, int64_t, 
arrow::compute::FilterOptions::NullSelectionBehavior, 
arrow::ArrayData*)::<lambda(int64_t, int64_t)> > (visit=..., length=<optimized 
out>, offset=<optimized out>,
       bitmap=<optimized out>) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/bit_run_reader.h:477
   #6  arrow::compute::internal::(anonymous 
namespace)::BinaryFilterNonNullImpl<arrow::BinaryType> (ctx=<optimized out>, 
out=0x7ffec8146350, null_selection=arrow::compute::FilterOptions::DROP, 
output_length=4020,
       filter=..., values=...) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:586
   #7  arrow::compute::internal::(anonymous namespace)::BinaryFilterExec 
(ctx=<optimized out>, batch=..., out=<optimized out>)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:832
   #8  0x00007fffeb611232 in arrow::compute::detail::(anonymous 
namespace)::VectorExecutor::Exec (this=this@entry=0x7ffec80069f0, span=..., 
listener=listener@entry=0x7ffee5ff9dc0)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/exec.cc:1109
   #9  0x00007fffeb611a64 in arrow::compute::detail::(anonymous 
namespace)::VectorExecutor::Execute (this=0x7ffec80069f0, batch=..., 
listener=0x7ffee5ff9dc0)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/exec.cc:1049
   #10 0x00007fffeb63233d in 
arrow::compute::detail::FunctionExecutorImpl::Execute (this=0x7ffec8135d10, 
args=std::vector of length 2, capacity 2 = {...}, passed_length=-1)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/function.cc:278
   #11 0x00007fffeb62ed56 in arrow::compute::(anonymous 
namespace)::ExecuteInternal (func=..., args=std::vector of length 2, capacity 2 
= {...}, passed_length=passed_length@entry=-1,
       options=options@entry=0x7ffee5ffa7c0, ctx=ctx@entry=0x7fffed4bb420 
<arrow::compute::default_exec_context()::default_ctx>)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/function.cc:343
   #12 0x00007fffeb62f297 in arrow::compute::Function::Execute 
(this=0x55555bd3b870, args=..., options=0x7ffee5ffa7c0, ctx=0x7fffed4bb420 
<arrow::compute::default_exec_context()::default_ctx>)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/function.cc:350
   #13 0x00007fffeb60dc41 in arrow::compute::CallFunction 
(func_name="array_filter", args=std::vector of length 2, capacity 2 = {...}, 
options=options@entry=0x7ffee5ffa7c0,
       ctx=ctx@entry=0x7fffed4bb420 
<arrow::compute::default_exec_context()::default_ctx>) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/exec.cc:1369
   #14 0x00007fffeb76e071 in arrow::compute::internal::(anonymous 
namespace)::FilterMetaFunction::ExecuteImpl (this=<optimized out>, 
args=std::vector of length 2, capacity 2 = {...}, options=0x7ffee5ffa7c0,
       ctx=0x7fffed4bb420 
<arrow::compute::default_exec_context()::default_ctx>) at 
/usr/include/c++/12/bits/basic_string.tcc:238
   #15 0x00007fffeb62dfa7 in arrow::compute::MetaFunction::Execute 
(this=0x555556c08290, args=std::vector of length 2, capacity 2 = {...}, 
options=0x7ffee5ffa7c0,
       ctx=0x7fffed4bb420 
<arrow::compute::default_exec_context()::default_ctx>) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/function.cc:483
   #16 0x00007fffeb60dc41 in arrow::compute::CallFunction (func_name="filter", 
args=std::vector of length 2, capacity 2 = {...}, 
options=options@entry=0x7ffee5ffa7c0,
       ctx=0x7fffed4bb420 
<arrow::compute::default_exec_context()::default_ctx>, ctx@entry=0x0) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/exec.cc:1369
   #17 0x00007fffeb5e8e77 in arrow::compute::Filter (values=..., filter=..., 
options=..., ctx=ctx@entry=0x0) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/compute/api_vector.cc:412
   #18 0x00007fffeb22c37b in arrow::acero::(anonymous 
namespace)::FilterNode::ProcessBatch (this=<optimized out>, batch=...)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/filter_node.cc:102
   #19 0x00007fffeb23e565 in arrow::acero::MapNode::InputReceived 
(this=this@entry=0x55555ba9cdd0, input=input@entry=0x55555d630c30, batch=...)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/map_node.cc:76
   #20 0x00007fffeb2d9744 in 
arrow::acero::aggregate::GroupByNode::OutputNthBatch (this=0x55555d630c30, 
n=1041) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/groupby_aggregate_node.cc:341
   #21 0x00007fffeb2d9883 in operator() (task_id=<optimized out>, 
__closure=<optimized out>) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/groupby_aggregate_node.cc:64
   #22 std::__invoke_impl<arrow::Status, 
arrow::acero::aggregate::GroupByNode::Init()::<lambda(size_t, int64_t)>&, long 
unsigned int, long int> (__f=...) at /usr/include/c++/12/bits/invoke.h:61
   #23 std::__invoke_r<arrow::Status, 
arrow::acero::aggregate::GroupByNode::Init()::<lambda(size_t, int64_t)>&, long 
unsigned int, long int> (__fn=...) at /usr/include/c++/12/bits/invoke.h:116
   #24 std::_Function_handler<arrow::Status(long unsigned int, long int), 
arrow::acero::aggregate::GroupByNode::Init()::<lambda(size_t, int64_t)> 
>::_M_invoke(const std::_Any_data &, unsigned long &&, long &&) (
       __functor=..., __args#0=<optimized out>, __args#1=<optimized out>) at 
/usr/include/c++/12/bits/std_function.h:291
   #25 0x00007fffeb289a7e in std::function<arrow::Status (unsigned long, 
long)>::operator()(unsigned long, long) const (__args#1=<optimized out>, 
__args#0=<optimized out>, this=<optimized out>)
       at /usr/include/c++/12/bits/std_function.h:591
   #26 arrow::acero::TaskSchedulerImpl::ExecuteTask (this=0x55556bf00b20, 
thread_id=<optimized out>, group_id=<optimized out>, task_id=<optimized out>, 
task_group_finished=0x7ffee5ffac16)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/task_util.cc:212
   #27 0x00007fffeb28a2c1 in operator() (__closure=<optimized out>) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/task_util.cc:366
   #28 operator() (thread_id=52, __closure=0x7fffc81583f0) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/task_util.cc:366
   #29 std::__invoke_impl<arrow::Status, 
arrow::acero::TaskSchedulerImpl::ScheduleMore(size_t, int)::<lambda(size_t)>&, 
long unsigned int> (__f=...) at /usr/include/c++/12/bits/invoke.h:61
   #30 std::__invoke_r<arrow::Status, 
arrow::acero::TaskSchedulerImpl::ScheduleMore(size_t, int)::<lambda(size_t)>&, 
long unsigned int> (__fn=...) at /usr/include/c++/12/bits/invoke.h:116
   #31 std::_Function_handler<arrow::Status(long unsigned int), 
arrow::acero::TaskSchedulerImpl::ScheduleMore(size_t, int)::<lambda(size_t)> 
>::_M_invoke(const std::_Any_data &, unsigned long &&) (__functor=...,
       __args#0=<optimized out>) at /usr/include/c++/12/bits/std_function.h:291
   #32 0x00007fffeb24f034 in std::function<arrow::Status (unsigned 
long)>::operator()(unsigned long) const (__args#0=<optimized out>, 
this=0x7fffdc007138) at /usr/include/c++/12/bits/std_function.h:591
   #33 operator() (__closure=0x7fffdc007130) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/acero/query_context.cc:72
   #34 std::__invoke_impl<arrow::Status, 
arrow::acero::QueryContext::ScheduleTask(std::function<arrow::Status(long 
unsigned int)>, std::string_view)::<lambda()>&> (__f=...) at 
/usr/include/c++/12/bits/invoke.h:61
   #35 std::__invoke_r<arrow::Status, 
arrow::acero::QueryContext::ScheduleTask(std::function<arrow::Status(long 
unsigned int)>, std::string_view)::<lambda()>&> (__fn=...) at 
/usr/include/c++/12/bits/invoke.h:116
   #36 std::_Function_handler<arrow::Status(), 
arrow::acero::QueryContext::ScheduleTask(std::function<arrow::Status(long 
unsigned int)>, std::string_view)::<lambda()> >::_M_invoke(const std::_Any_data 
&) (
       __functor=...) at /usr/include/c++/12/bits/std_function.h:291
   #37 0x00007fffeb250e5f in std::function<arrow::Status ()>::operator()() 
const (this=<optimized out>) at /usr/include/c++/12/bits/std_function.h:591
   #38 arrow::detail::ContinueFuture::operator()<std::function<arrow::Status 
()>&, , arrow::Status, arrow::Future<arrow::internal::Empty> 
>(arrow::Future<arrow::internal::Empty>, std::function<arrow::Status ()>&) 
const (this=<optimized out>, f=..., next=...) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/future.h:150
   #39 std::__invoke_impl<void, arrow::detail::ContinueFuture&, 
arrow::Future<arrow::internal::Empty>&, std::function<arrow::Status 
()>&>(std::__invoke_other, arrow::detail::ContinueFuture&, 
arrow::Future<arrow::internal::Empty>&, std::function<arrow::Status ()>&) 
(__f=...) at /usr/include/c++/12/bits/invoke.h:61
   #40 std::__invoke<arrow::detail::ContinueFuture&, 
arrow::Future<arrow::internal::Empty>&, std::function<arrow::Status 
()>&>(arrow::detail::ContinueFuture&, arrow::Future<arrow::internal::Empty>&, 
std::function<arrow::Status ()>&) (__fn=...) at 
/usr/include/c++/12/bits/invoke.h:96
   #41 std::_Bind<arrow::detail::ContinueFuture 
(arrow::Future<arrow::internal::Empty>, std::function<arrow::Status 
()>)>::__call<void, , 0ul, 1ul>(std::tuple<>&&, std::_Index_tuple<0ul, 1ul>) 
(__args=...,
       this=<optimized out>) at /usr/include/c++/12/functional:484
   #42 std::_Bind<arrow::detail::ContinueFuture 
(arrow::Future<arrow::internal::Empty>, std::function<arrow::Status 
()>)>::operator()<, void>() (this=<optimized out>) at 
/usr/include/c++/12/functional:567
   #43 arrow::internal::FnOnce<void 
()>::FnImpl<std::_Bind<arrow::detail::ContinueFuture 
(arrow::Future<arrow::internal::Empty>, std::function<arrow::Status ()>)> 
>::invoke() (this=<optimized out>)
       at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/functional.h:152
   #44 0x00007fffec3e969f in arrow::internal::FnOnce<void ()>::operator()() && 
(this=0x7ffee5ffad30) at /usr/include/c++/12/bits/unique_ptr.h:191
   #45 arrow::internal::WorkerLoop (it=..., state=...) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/thread_pool.cc:478
   #46 operator() (__closure=<optimized out>) at 
/tmp/RtmpvG0xfW/R.INSTALL2875d41805ac1/arrow/tools/cpp/src/arrow/util/thread_pool.cc:643
   #47 std::__invoke_impl<void, 
arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()> > (__f=...) 
at /usr/include/c++/12/bits/invoke.h:61
   #48 
std::__invoke<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
 > (__fn=...) at /usr/include/c++/12/bits/invoke.h:96
   #49 
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
 > >::_M_invoke<0> (this=<optimized out>) at 
/usr/include/c++/12/bits/std_thread.h:252
   #50 
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
 > >::operator() (this=<optimized out>) at 
/usr/include/c++/12/bits/std_thread.h:259
   #51 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::<lambda()>
 > > >::_M_run(void) (this=<optimized out>)
       at /usr/include/c++/12/bits/std_thread.h:210
   #52 0x00007ffff4cd44a3 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
   #53 0x00007ffff78a81f5 in start_thread (arg=<optimized out>) at 
./nptl/pthread_create.c:442
   #54 0x00007ffff792889c in clone3 () at 
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to