[ https://issues.apache.org/jira/browse/ARROW-17252?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17573068#comment-17573068 ]
Dewey Dunnington edited comment on ARROW-17252 at 7/29/22 5:17 PM: ------------------------------------------------------------------- I can get a similar leak locally, too using a dockerfile: {noformat} FROM ubuntu:20.04 ARG DEBIAN_FRONTEND=noninteractive ENV TZ=America/Halifax RUN apt-get update && apt-get install -y valgrind r-base cmake git libxml2-dev libcurl4-openssl-dev libssl-dev libgit2-dev libfontconfig1-dev libfreetype6-dev libharfbuzz-dev libfribidi-dev libpng-dev libtiff5-dev libjpeg-dev RUN git clone https://github.com/apache/arrow.git /arrow && mkdir /arrow-build && cd /arrow-build && cmake /arrow/cpp -DARROW_CSV=ON -DARROW_FILESYSTEM=ON -DARROW_COMPUTE=ON -DBoost_SOURCE=BUNDLED && cmake --build . && cmake --install . --prefix /arrow-dist RUN R -e 'install.packages(c("devtools", "cpp11", "R6", "assertthat", "bit64", "bit", "cli", "ellipsis", "glue", "magrittr", "purrr", "rlang", "tidyselect", "vctrs", "lubridate", "dplyr", "hms"), repos = "https://cloud.r-project.org")' ENV ARROW_HOME /arrow-dist ENV LD_LIBRARY_PATH /arrow-dist/lib RUN cd /arrow/r && R CMD INSTALL . 
{noformat} Launching R with valgrind: {noformat} R -d "valgrind --tool=memcheck --leak-check=full" {noformat} ...and I get this leak: {noformat} ==387== 2,608 (72 direct, 2,536 indirect) bytes in 1 blocks are definitely lost in loss record 625 of 4,108 ==387== at 0x484A3C4: operator new(unsigned long) (in /usr/lib/aarch64-linux-gnu/valgrind/vgpreload_memcheck-arm64-linux.so) ==387== by 0x1566648F: arrow::Table::FromRecordBatches(std::shared_ptr<arrow::Schema>, std::vector<std::shared_ptr<arrow::RecordBatch>, std::allocator<std::shared_ptr<arrow::RecordBatch> > > const&) (in /arrow-dist/lib/libarrow.so.900.0.0) ==387== by 0x15629FB7: arrow::RecordBatchReader::ToTable() (in /arrow-dist/lib/libarrow.so.900.0.0) ==387== by 0x1501C503: operator() (compute-exec.cpp:147) ==387== by 0x1501C503: std::_Function_handler<arrow::Result<std::shared_ptr<arrow::Table> > (), ExecPlan_read_table(std::shared_ptr<arrow::compute::ExecPlan> const&, std::shared_ptr<arrow::compute::ExecNode> const&, cpp11::r_vector<SEXPREC*>, cpp11::r_vector<cpp11::r_string>, long)::{lambda()#1}>::_M_invoke(std::_Any_data const&) (std_function.h:286) ==387== by 0x15023427: std::function<arrow::Result<std::shared_ptr<arrow::Table> > ()>::operator()() const (std_function.h:688) ==387== by 0x1502352F: operator()<std::function<arrow::Result<std::shared_ptr<arrow::Table> >()>&> (future.h:150) ==387== by 0x1502352F: __invoke_impl<void, arrow::detail::ContinueFuture&, arrow::Future<std::shared_ptr<arrow::Table> >&, std::function<arrow::Result<std::shared_ptr<arrow::Table> >()>&> (invoke.h:60) ==387== by 0x1502352F: __invoke<arrow::detail::ContinueFuture&, arrow::Future<std::shared_ptr<arrow::Table> >&, std::function<arrow::Result<std::shared_ptr<arrow::Table> >()>&> (invoke.h:95) ==387== by 0x1502352F: __call<void, 0, 1> (functional:400) ==387== by 0x1502352F: operator()<> (functional:484) ==387== by 0x1502352F: arrow::internal::FnOnce<void ()>::FnImpl<std::_Bind<arrow::detail::ContinueFuture 
(arrow::Future<std::shared_ptr<arrow::Table> >, std::function<arrow::Result<std::shared_ptr<arrow::Table> > ()>)> >::invoke() (functional.h:152) ==387== by 0x1579636B: std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::{lambda()#1}> > >::_M_run() (in /arrow-dist/lib/libarrow.so.900.0.0) ==387== by 0x71F4FAB: ??? (in /usr/lib/aarch64-linux-gnu/libstdc++.so.6.0.28) ==387== by 0x55F1623: start_thread (pthread_create.c:477) ==387== by 0x4DA949B: thread_start (clone.S:78) {noformat} (Although this dockerfile doesn't use r-devel...it uses R 3.6 which is a bit old). was (Author: paleolimbot): I can get a similar leak locally, too using a dockerfile: {noformat} FROM ubuntu:20.04 ARG DEBIAN_FRONTEND=noninteractive ENV TZ=America/Halifax RUN apt-get update && apt-get install -y valgrind r-base cmake git libxml2-dev libcurl4-openssl-dev libssl-dev libgit2-dev libfontconfig1-dev libfreetype6-dev libharfbuzz-dev libfribidi-dev libpng-dev libtiff5-dev libjpeg-dev RUN git clone https://github.com/apache/arrow.git /arrow && mkdir /arrow-build && cd /arrow-build && cmake /arrow/cpp -DARROW_CSV=ON -DARROW_DATASET=ON -DARROW_FILESYSTEM=ON -DARROW_COMPUTE=ON -DBoost_SOURCE=BUNDLED && cmake --build . && cmake --install . --prefix /arrow-dist RUN R -e 'install.packages(c("devtools", "cpp11", "R6", "assertthat", "bit64", "bit", "cli", "ellipsis", "glue", "magrittr", "purrr", "rlang", "tidyselect", "vctrs", "lubridate", "dplyr", "hms"), repos = "https://cloud.r-project.org")' ENV ARROW_HOME /arrow-dist ENV LD_LIBRARY_PATH /arrow-dist/lib RUN cd /arrow/r && R CMD INSTALL . 
{noformat} Launching R with valgrind: {noformat} R -d "valgrind --tool=memcheck --leak-check=full" {noformat} ...and I get this leak: {noformat} ==387== 2,608 (72 direct, 2,536 indirect) bytes in 1 blocks are definitely lost in loss record 625 of 4,108 ==387== at 0x484A3C4: operator new(unsigned long) (in /usr/lib/aarch64-linux-gnu/valgrind/vgpreload_memcheck-arm64-linux.so) ==387== by 0x1566648F: arrow::Table::FromRecordBatches(std::shared_ptr<arrow::Schema>, std::vector<std::shared_ptr<arrow::RecordBatch>, std::allocator<std::shared_ptr<arrow::RecordBatch> > > const&) (in /arrow-dist/lib/libarrow.so.900.0.0) ==387== by 0x15629FB7: arrow::RecordBatchReader::ToTable() (in /arrow-dist/lib/libarrow.so.900.0.0) ==387== by 0x1501C503: operator() (compute-exec.cpp:147) ==387== by 0x1501C503: std::_Function_handler<arrow::Result<std::shared_ptr<arrow::Table> > (), ExecPlan_read_table(std::shared_ptr<arrow::compute::ExecPlan> const&, std::shared_ptr<arrow::compute::ExecNode> const&, cpp11::r_vector<SEXPREC*>, cpp11::r_vector<cpp11::r_string>, long)::{lambda()#1}>::_M_invoke(std::_Any_data const&) (std_function.h:286) ==387== by 0x15023427: std::function<arrow::Result<std::shared_ptr<arrow::Table> > ()>::operator()() const (std_function.h:688) ==387== by 0x1502352F: operator()<std::function<arrow::Result<std::shared_ptr<arrow::Table> >()>&> (future.h:150) ==387== by 0x1502352F: __invoke_impl<void, arrow::detail::ContinueFuture&, arrow::Future<std::shared_ptr<arrow::Table> >&, std::function<arrow::Result<std::shared_ptr<arrow::Table> >()>&> (invoke.h:60) ==387== by 0x1502352F: __invoke<arrow::detail::ContinueFuture&, arrow::Future<std::shared_ptr<arrow::Table> >&, std::function<arrow::Result<std::shared_ptr<arrow::Table> >()>&> (invoke.h:95) ==387== by 0x1502352F: __call<void, 0, 1> (functional:400) ==387== by 0x1502352F: operator()<> (functional:484) ==387== by 0x1502352F: arrow::internal::FnOnce<void ()>::FnImpl<std::_Bind<arrow::detail::ContinueFuture 
(arrow::Future<std::shared_ptr<arrow::Table> >, std::function<arrow::Result<std::shared_ptr<arrow::Table> > ()>)> >::invoke() (functional.h:152) ==387== by 0x1579636B: std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::{lambda()#1}> > >::_M_run() (in /arrow-dist/lib/libarrow.so.900.0.0) ==387== by 0x71F4FAB: ??? (in /usr/lib/aarch64-linux-gnu/libstdc++.so.6.0.28) ==387== by 0x55F1623: start_thread (pthread_create.c:477) ==387== by 0x4DA949B: thread_start (clone.S:78) {noformat} (Although this dockerfile doesn't use r-devel...it uses R 3.6 which is a bit old). > [R] Intermittent valgrind failure > --------------------------------- > > Key: ARROW-17252 > URL: https://issues.apache.org/jira/browse/ARROW-17252 > Project: Apache Arrow > Issue Type: Improvement > Components: R > Reporter: Dewey Dunnington > Priority: Major > Labels: pull-request-available > Time Spent: 40m > Remaining Estimate: 0h > > A number of recent nightly builds have intermittent failures with valgrind, > which fails because of possibly leaked memory around an exec plan. This seems > related to a change in XXX that separated {{ExecPlan_prepare()}} from > {{ExecPlan_run()}} and added a {{ExecPlan_read_table()}} that uses > {{RunWithCapturedR()}}. The reported leaks vary but include ExecPlans and > ExecNodes and fields of those objects. 
> A failed run: > https://dev.azure.com/ursacomputing/crossbow/_build/results?buildId=30310&view=logs&j=0da5d1d9-276d-5173-c4c4-9d4d4ed14fdb&t=d9b15392-e4ce-5e4c-0c8c-b69645229181&l=24980 > Some example output: > {noformat} > ==5249== 14,112 (384 direct, 13,728 indirect) bytes in 1 blocks are > definitely lost in loss record 1,988 of 3,883 > ==5249== at 0x4849013: operator new(unsigned long) (in > /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) > ==5249== by 0x10B2902B: > std::_Function_handler<arrow::Result<arrow::compute::ExecNode*> > (arrow::compute::ExecPlan*, std::vector<arrow::compute::ExecNode*, > std::allocator<arrow::compute::ExecNode*> >, arrow::compute::ExecNodeOptions > const&), > arrow::compute::internal::RegisterAggregateNode(arrow::compute::ExecFactoryRegistry*)::{lambda(arrow::compute::ExecPlan*, > std::vector<arrow::compute::ExecNode*, > std::allocator<arrow::compute::ExecNode*> >, arrow::compute::ExecNodeOptions > const&)#1}>::_M_invoke(std::_Any_data const&, arrow::compute::ExecPlan*&&, > std::vector<arrow::compute::ExecNode*, > std::allocator<arrow::compute::ExecNode*> >&&, > arrow::compute::ExecNodeOptions const&) (exec_plan.h:60) > ==5249== by 0xFA83A0C: > std::function<arrow::Result<arrow::compute::ExecNode*> > (arrow::compute::ExecPlan*, std::vector<arrow::compute::ExecNode*, > std::allocator<arrow::compute::ExecNode*> >, arrow::compute::ExecNodeOptions > const&)>::operator()(arrow::compute::ExecPlan*, > std::vector<arrow::compute::ExecNode*, > std::allocator<arrow::compute::ExecNode*> >, arrow::compute::ExecNodeOptions > const&) const (std_function.h:622) > ==5249== 14,528 (160 direct, 14,368 indirect) bytes in 1 blocks are > definitely lost in loss record 1,989 of 3,883 > ==5249== at 0x4849013: operator new(unsigned long) (in > /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) > ==5249== by 0x10096CB7: arrow::FutureImpl::Make() (future.cc:187) > ==5249== by 0xFCB6F9A: arrow::Future<arrow::internal::Empty>::Make() > 
(future.h:420) > ==5249== by 0x101AE927: ExecPlanImpl (exec_plan.cc:50) > ==5249== by 0x101AE927: > arrow::compute::ExecPlan::Make(arrow::compute::ExecContext*, > std::shared_ptr<arrow::KeyValueMetadata const>) (exec_plan.cc:355) > ==5249== by 0xFA77BA2: ExecPlan_create(bool) (compute-exec.cpp:45) > ==5249== by 0xF9FAE9F: _arrow_ExecPlan_create (arrowExports.cpp:868) > ==5249== by 0x4953B60: R_doDotCall (dotcode.c:601) > ==5249== by 0x49C2C16: bcEval (eval.c:7682) > ==5249== by 0x499DB95: Rf_eval (eval.c:748) > ==5249== by 0x49A0904: R_execClosure (eval.c:1918) > ==5249== by 0x49A05B7: Rf_applyClosure (eval.c:1844) > ==5249== by 0x49B2122: bcEval (eval.c:7094) > ==5249== > ==5249== 36,322 (416 direct, 35,906 indirect) bytes in 1 blocks are > definitely lost in loss record 2,929 of 3,883 > ==5249== at 0x4849013: operator new(unsigned long) (in > /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) > ==5249== by 0x10214F92: arrow::compute::TaskScheduler::Make() > (task_util.cc:421) > ==5249== by 0x101AEA6C: ExecPlanImpl (exec_plan.cc:50) > ==5249== by 0x101AEA6C: > arrow::compute::ExecPlan::Make(arrow::compute::ExecContext*, > std::shared_ptr<arrow::KeyValueMetadata const>) (exec_plan.cc:355) > ==5249== by 0xFA77BA2: ExecPlan_create(bool) (compute-exec.cpp:45) > ==5249== by 0xF9FAE9F: _arrow_ExecPlan_create (arrowExports.cpp:868) > ==5249== by 0x4953B60: R_doDotCall (dotcode.c:601) > ==5249== by 0x49C2C16: bcEval (eval.c:7682) > ==5249== by 0x499DB95: Rf_eval (eval.c:748) > ==5249== by 0x49A0904: R_execClosure (eval.c:1918) > ==5249== by 0x49A05B7: Rf_applyClosure (eval.c:1844) > ==5249== by 0x49B2122: bcEval (eval.c:7094) > ==5249== by 0x499DB95: Rf_eval (eval.c:748) > {noformat} > We also occasionally get leaked Schemas, and in one case a leaked InputType > that seemed completely unrelated to the other leaks (ARROW-17225). > I'm wondering if these have to do with references in lambdas that get passed > by reference? Or perhaps a cache issue? 
In some previous failures, the backtrace leading to the {{new}} allocation also differed from one reported leak to the next. -- This message was sent by Atlassian Jira (v8.20.10#820010)