[ 
https://issues.apache.org/jira/browse/ARROW-14583?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17440119#comment-17440119
 ] 

David Li commented on ARROW-14583:
----------------------------------

It appears this fails in a different place so this might be a different bug. 
(At least, the fix here still crashes for this example.) I'm going to 
investigate more tomorrow.

{noformat}
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1  0x00007ffff7543921 in __GI_abort () at abort.c:79
#2  0x00007ffff1e61047 in google::logging_fail() ()
   from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#3  0x00007ffff1e68c0d in google::LogMessage::Fail() ()
   from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#4  0x00007ffff1e6b7a6 in google::LogMessage::SendToLog() ()
   from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#5  0x00007ffff1e68705 in google::LogMessage::Flush() ()
   from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#6  0x00007ffff1e688fd in google::LogMessage::~LogMessage() ()
   from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#7  0x00007fff6f9c60cc in arrow::util::ArrowLog::~ArrowLog (this=0x7fff94e24028)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/logging.cc:250
#8  0x00007fff6f999ab5 in arrow::ConcreteFutureImpl::DoMarkFinishedOrFailed (
    this=0x55555a7d2890, state=arrow::FutureState::SUCCESS)
    at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.cc:309
#9  0x00007fff6f997e5a in arrow::ConcreteFutureImpl::DoMarkFinished 
(this=0x55555a7d2890)
    at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.cc:231
#10 0x00007fff6f994f5d in arrow::FutureImpl::MarkFinished (this=0x55555a7d2890)
    at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.cc:383
#11 0x00007fffd0372570 in arrow::Future<arrow::internal::Empty>::DoMarkFinished 
(
    this=0x55555aa056e8, res=...)
    at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.h:712
#12 0x00007fffd0372482 in 
arrow::Future<arrow::internal::Empty>::MarkFinished<arrow::internal::Empty, 
void> (this=0x55555aa056e8, s=...)
    at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.h:463
#13 0x00007fff6fc8fa3d in arrow::compute::(anonymous 
namespace)::GroupByNode::StopProducing (this=0x55555aa05650, 
output=0x55555b088e40)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/aggregate_node.cc:533
#14 0x00007fff6fca29f9 in arrow::compute::MapNode::StopProducing 
(this=0x55555b088e40)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:348
#15 0x00007fff6fca28f1 in arrow::compute::MapNode::StopProducing 
(this=0x55555b088e40, 
    output=0x555555a5ff30)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:338
#16 0x00007fff6fca29f9 in arrow::compute::MapNode::StopProducing 
(this=0x555555a5ff30)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:348
#17 0x00007fff6fca28f1 in arrow::compute::MapNode::StopProducing 
(this=0x555555a5ff30, 
    output=0x55555877de40)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:338
#18 0x00007fff6fd2de33 in arrow::compute::(anonymous 
namespace)::SinkNode::ErrorReceived
    (this=0x55555877de40, input=0x555555a5ff30, error=...)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/sink_node.cc:117
#19 0x00007fff6fca2452 in arrow::compute::MapNode::ErrorReceived 
(this=0x555555a5ff30, 
    input=0x55555b088e40, error=...)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:319
#20 0x00007fff6fca2452 in arrow::compute::MapNode::ErrorReceived 
(this=0x55555b088e40, 
    input=0x55555aa05650, error=...)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:319
#21 0x00007fff6fca1e32 in arrow::compute::ExecNode::ErrorIfNotOk 
(this=0x55555aa05650, 
    status=...)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:300
#22 0x00007fff6fc8f351 in arrow::compute::(anonymous 
namespace)::GroupByNode::InputReceived (this=0x55555aa05650, 
input=0x55555bde3440, batch=...)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/aggregate_node.cc:492
#23 0x00007fff6fca32ce in 
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_1::operator()() 
const (this=0x7fff2c004028)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:368
#24 0x00007fff6fca655f in 
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()() 
const::{lambda()#1}::operator()() const (this=0x7fff2c004020)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:375
#25 0x00007fff6fca64f0 in 
arrow::detail::ContinueFuture::operator()<arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()() 
const::{lambda()#1}&, , arrow::Status, arrow::Future<arrow::internal::Empty> > 
(this=0x7fff2c004018, next=..., f=...)
    at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.h:148
#26 0x00007fff6fca6476 in std::__invoke_impl<void, 
arrow::detail::ContinueFuture&, arrow::Future<arrow::internal::Empty>&, 
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()() 
const::{lambda()#1}&>(std::__invoke_other, arrow::detail::ContinueFuture&, 
arrow::Future<arrow::internal::Empty>&, 
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()() 
const::{lambda()#1}&) (__f=..., __args=..., __args=...)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/bits/invoke.h:60
#27 0x00007fff6fca63a7 in std::__invoke<arrow::detail::ContinueFuture&, 
arrow::Future<arrow::internal::Empty>&, 
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()() 
const::{lambda()#1}&>(std::__invoke_result&&, 
(arrow::detail::ContinueFuture&)...) (__fn=..., __args=..., __args=...)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/bits/invoke.h:95
#28 0x00007fff6fca6358 in std::_Bind<arrow::detail::ContinueFuture 
(arrow::Future<arrow::internal::Empty>, 
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()() 
const::{lambda()#1})>::__call<void, , 0ul, 1ul>(std::tuple<>&&, 
std::_Index_tuple<0ul, 1ul>) (this=0x7fff2c004018, __args=...)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/functional:467
#29 0x00007fff6fca62e6 in std::_Bind<arrow::detail::ContinueFuture 
(arrow::Future<arrow::internal::Empty>, 
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()() 
const::{lambda()#1})>::operator()<, void>() (this=0x7fff2c004018)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/functional:549
#30 0x00007fff6fca62b1 in arrow::internal::FnOnce<void 
()>::FnImpl<std::_Bind<arrow::detail::ContinueFuture 
(arrow::Future<arrow::internal::Empty>, 
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
 (arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()() 
const::{lambda()#1})> >::invoke() (
    this=0x7fff2c004010)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/functional.h:152
#31 0x00007fff6f9e9b0a in arrow::internal::FnOnce<void ()>::operator()() && (
    this=0x7fff94e24c90)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/functional.h:140
#32 0x00007fff6f9e92d2 in arrow::internal::WorkerLoop (state=..., it=...)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/thread_pool.cc:177
#33 0x00007fff6f9e8f68 in 
arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::$_3::operator()() 
const (this=0x555557f9ae98)
    at 
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/thread_pool.cc:344
#34 0x00007fff6f9e8efd in 
_ZSt13__invoke_implIvZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiE3$_3JEET_St14__invoke_otherOT0_DpOT1_
 (__f=...)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/bits/invoke.h:60
#35 0x00007fff6f9e8e8d in 
_ZSt8__invokeIZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiE3$_3JEENSt15__invoke_resultIT_JDpT0_EE4typeEOS5_DpOS6_
 (__fn=...)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/bits/invoke.h:95
#36 0x00007fff6f9e8e65 in 
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::$_3>
 >::_M_invoke<0ul> (this=0x555557f9ae98)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/thread:234
#37 0x00007fff6f9e8e35 in 
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::$_3>
 >::operator()() (this=0x555557f9ae98)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/thread:243
#38 0x00007fff6f9e8cd9 in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::$_3>
 > >::_M_run() (
    this=0x555557f9ae90)
    at 
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/thread:186
#39 0x00007ffff276d9b0 in std::execute_native_thread_routine (__p=<optimized 
out>)
    at 
/home/conda/feedstock_root/build_artifacts/gcc_compilers_1628138005912/work/build/x86_64-conda-linux-gnu/libstdc++-v3/include/bits/new_allocator.h:82
#40 0x00007ffff4f1d6db in start_thread (arg=0x7fff94e34700) at 
pthread_create.c:463
#41 0x00007ffff762471f in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:95
{noformat}

It also doesn't fail consistently for me; it sometimes gives this error 
instead, so I think there are at least two bugs wrapped up here:

{noformat}
Error: Invalid: Arrays used to construct an ExecBatch must have equal length
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/aggregate_node.cc:387
  ExecBatch::Make({batch.values[agg_src_field_ids_[i]], id_batch})
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:417
  iterator_.Next()
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/record_batch.cc:318  
ReadNext(&batch)
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/record_batch.cc:329  
ReadAll(&batches)
{noformat}

> [R][C++] Crash when summarizing after filtering to no rows on partitioned data
> ------------------------------------------------------------------------------
>
>                 Key: ARROW-14583
>                 URL: https://issues.apache.org/jira/browse/ARROW-14583
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++, R
>    Affects Versions: 6.0.0
>         Environment: I am using a Windows 10 machine, R 4.1.0, up-to-date R 
> packages, and the latest RStudio IDE.
>            Reporter: Zsolt Kegyes-Brassai
>            Assignee: David Li
>            Priority: Major
>              Labels: pull-request-available, query-engine
>          Time Spent: 1h
>  Remaining Estimate: 0h
>
> Original issue report is below; here's an even more minimal example:
> {code:r}
> library(arrow)
> library(dplyr)
> td <- tempfile()
> dir.create(td)
> # if there is no partitioning in the data, this won't segfault
> # write_dataset(iris, td) - swap this in and it won't segfault
> write_dataset(group_by(iris, Species), td)
> open_dataset(td) %>%
>   filter(Species == "tulip") %>%
>   group_by(Sepal.Length) %>%
>   summarise(n = n()) %>%
>   collect()
> {code}
> ----
> I was trying the new features introduced in the latest {{arrow (6.0.2)}} 
> package based on examples from the “New Directions for Apache Arrow” talk.
> The RStudio IDE was crashing and the R session was aborted.
> Looking closely I found that I downloaded only 2 years of data (2018 & 2019) 
> and after the first filter ({{year == 2015}}) no data remains to be processed 
> further.
> After some debugging, by replacing the collect() function, it turns out that 
> {{summarize()}} is the function which is causing the crash.
>  
> {code:r}
> as_dataset <- open_dataset("c:/Rproj_learn/nyc-taxi/", 
>                                 partitioning = c("year", "month")) %>%
>   filter(total_amount > 100 & year == 2015) %>%
>   select(tip_amount, total_amount, passenger_count) %>%
>   mutate(tip_pct = tip_amount / total_amount * 100) %>%
>   group_by(passenger_count) %>%
>   summarize(avg_tip_pct = mean(tip_pct), n = n()) %>%
>   filter(n > 5000) %>%
>   arrange(desc(avg_tip_pct)) %>%
>   collect(){code}
>  
> I would expect to get an error message (without crashing the IDE), which can 
> be handled in code.
> Another alternative result would be an empty data.frame, as in the case when 
> the parquet file was read in as a data.frame. I simulated this situation by 
> setting a high {{total_amount}} value when filtering. Note: when using an 
> Arrow table an error message is generated.
>  
> {code:r}
>  library(tidyverse)
> #> Warning: package 'tibble' was built under R version 4.1.1
> #> Warning: package 'tidyr' was built under R version 4.1.1
> #> Warning: package 'readr' was built under R version 4.1.1
> library(arrow)
> #> Warning: package 'arrow' was built under R version 4.1.1
> #> 
> #> Attaching package: 'arrow'
> #> The following object is masked from 'package:utils':
> #> 
> #>     timestamp
> read_parquet("c:/Rproj_learn/nyc-taxi/2018/01/data.parquet", 
>              as_data_frame = FALSE) %>%
>   # filter(total_amount > 100) %>%
>   filter(total_amount > 1e10) %>%
>   select(tip_amount, total_amount, passenger_count) %>%
>   mutate(tip_pct = tip_amount / total_amount * 100) %>%
>   group_by(passenger_count) %>%
>   summarize(avg_tip_pct = mean(tip_pct), n = n()) %>%
>   filter(n > 500) %>%
>   arrange(desc(avg_tip_pct)) %>%
>   collect()
> #> Error: Invalid: Must pass at least one array
> read_parquet("c:/Rproj_learn/nyc-taxi/2018/01/data.parquet", 
>              as_data_frame = TRUE) %>%
>   # filter(total_amount > 100) %>%
>   filter(total_amount > 1e10) %>%
>   select(tip_amount, total_amount, passenger_count) %>%
>   mutate(tip_pct = tip_amount / total_amount * 100) %>%
>   group_by(passenger_count) %>%
>   summarize(avg_tip_pct = mean(tip_pct), n = n()) %>%
>   filter(n > 500) %>%
>   arrange(desc(avg_tip_pct)) %>%
>   collect()
> #> # A tibble: 0 x 3
> #> # ... with 3 variables: passenger_count <int>, avg_tip_pct <dbl>, n <int>
> {code}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

Reply via email to