[
https://issues.apache.org/jira/browse/ARROW-14583?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17440119#comment-17440119
]
David Li commented on ARROW-14583:
----------------------------------
It appears this fails in a different place so this might be a different bug.
(At least, the fix here still crashes for this example.) I'm going to
investigate more tomorrow.
{noformat}
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1 0x00007ffff7543921 in __GI_abort () at abort.c:79
#2 0x00007ffff1e61047 in google::logging_fail() ()
from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#3 0x00007ffff1e68c0d in google::LogMessage::Fail() ()
from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#4 0x00007ffff1e6b7a6 in google::LogMessage::SendToLog() ()
from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#5 0x00007ffff1e68705 in google::LogMessage::Flush() ()
from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#6 0x00007ffff1e688fd in google::LogMessage::~LogMessage() ()
from /home/lidavidm/miniconda3/envs/dev/lib/libglog.so.0
#7 0x00007fff6f9c60cc in arrow::util::ArrowLog::~ArrowLog (this=0x7fff94e24028)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/logging.cc:250
#8 0x00007fff6f999ab5 in arrow::ConcreteFutureImpl::DoMarkFinishedOrFailed (
this=0x55555a7d2890, state=arrow::FutureState::SUCCESS)
at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.cc:309
#9 0x00007fff6f997e5a in arrow::ConcreteFutureImpl::DoMarkFinished
(this=0x55555a7d2890)
at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.cc:231
#10 0x00007fff6f994f5d in arrow::FutureImpl::MarkFinished (this=0x55555a7d2890)
at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.cc:383
#11 0x00007fffd0372570 in arrow::Future<arrow::internal::Empty>::DoMarkFinished
(
this=0x55555aa056e8, res=...)
at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.h:712
#12 0x00007fffd0372482 in
arrow::Future<arrow::internal::Empty>::MarkFinished<arrow::internal::Empty,
void> (this=0x55555aa056e8, s=...)
at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.h:463
#13 0x00007fff6fc8fa3d in arrow::compute::(anonymous
namespace)::GroupByNode::StopProducing (this=0x55555aa05650,
output=0x55555b088e40)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/aggregate_node.cc:533
#14 0x00007fff6fca29f9 in arrow::compute::MapNode::StopProducing
(this=0x55555b088e40)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:348
#15 0x00007fff6fca28f1 in arrow::compute::MapNode::StopProducing
(this=0x55555b088e40,
output=0x555555a5ff30)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:338
#16 0x00007fff6fca29f9 in arrow::compute::MapNode::StopProducing
(this=0x555555a5ff30)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:348
#17 0x00007fff6fca28f1 in arrow::compute::MapNode::StopProducing
(this=0x555555a5ff30,
output=0x55555877de40)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:338
#18 0x00007fff6fd2de33 in arrow::compute::(anonymous
namespace)::SinkNode::ErrorReceived
(this=0x55555877de40, input=0x555555a5ff30, error=...)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/sink_node.cc:117
---Type <return> to continue, or q <return> to quit---
#19 0x00007fff6fca2452 in arrow::compute::MapNode::ErrorReceived
(this=0x555555a5ff30,
input=0x55555b088e40, error=...)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:319
#20 0x00007fff6fca2452 in arrow::compute::MapNode::ErrorReceived
(this=0x55555b088e40,
input=0x55555aa05650, error=...)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:319
#21 0x00007fff6fca1e32 in arrow::compute::ExecNode::ErrorIfNotOk
(this=0x55555aa05650,
status=...)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:300
#22 0x00007fff6fc8f351 in arrow::compute::(anonymous
namespace)::GroupByNode::InputReceived (this=0x55555aa05650,
input=0x55555bde3440, batch=...)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/aggregate_node.cc:492
#23 0x00007fff6fca32ce in
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_1::operator()()
const (this=0x7fff2c004028)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:368
#24 0x00007fff6fca655f in
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()()
const::{lambda()#1}::operator()() const (this=0x7fff2c004020)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:375
#25 0x00007fff6fca64f0 in
arrow::detail::ContinueFuture::operator()<arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()()
const::{lambda()#1}&, , arrow::Status, arrow::Future<arrow::internal::Empty> >
(this=0x7fff2c004018, next=..., f=...)
at /home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/future.h:148
#26 0x00007fff6fca6476 in std::__invoke_impl<void,
arrow::detail::ContinueFuture&, arrow::Future<arrow::internal::Empty>&,
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()()
const::{lambda()#1}&>(std::__invoke_other, arrow::detail::ContinueFuture&,
arrow::Future<arrow::internal::Empty>&,
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()()
const::{lambda()#1}&) (__f=..., __args=..., __args=...)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/bits/invoke.h:60
#27 0x00007fff6fca63a7 in std::__invoke<arrow::detail::ContinueFuture&,
arrow::Future<arrow::internal::Empty>&,
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()()
const::{lambda()#1}&>(std::__invoke_result&&,
(arrow::detail::ContinueFuture&)...) (__fn=..., __args=..., __args=...)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/bits/invoke.h:95
#28 0x00007fff6fca6358 in std::_Bind<arrow::detail::ContinueFuture
(arrow::Future<arrow::internal::Empty>,
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()()
const::{lambda()#1})>::__call<void, , 0ul, 1ul>(std::tuple<>&&,
std::_Index_tuple<0ul,
1ul>) (this=0x7fff2c004018, __args=...)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/functional:467
#29 0x00007fff6fca62e6 in std::_Bind<arrow::detail::ContinueFuture
(arrow::Future<arrow::internal::Empty>,
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()()
const::{lambda()#1})>::operator()<, void>() (this=0x7fff2c004018)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/functional:549
#30 0x00007fff6fca62b1 in arrow::internal::FnOnce<void
()>::FnImpl<std::_Bind<arrow::detail::ContinueFuture
(arrow::Future<arrow::internal::Empty>,
arrow::compute::MapNode::SubmitTask(std::function<arrow::Result<arrow::compute::ExecBatch>
(arrow::compute::ExecBatch)>, arrow::compute::ExecBatch)::$_2::operator()()
const::{lambda()#1})> >::invoke() (
this=0x7fff2c004010)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/functional.h:152
#31 0x00007fff6f9e9b0a in arrow::internal::FnOnce<void ()>::operator()() && (
this=0x7fff94e24c90)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/functional.h:140
#32 0x00007fff6f9e92d2 in arrow::internal::WorkerLoop (state=..., it=...)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/thread_pool.cc:177
#33 0x00007fff6f9e8f68 in
arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::$_3::operator()()
const (this=0x555557f9ae98)
at
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/util/thread_pool.cc:344
#34 0x00007fff6f9e8efd in
_ZSt13__invoke_implIvZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiE3$_3JEET_St14__invoke_otherOT0_DpOT1_
(__f=...)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/bits/invoke.h:60
#35 0x00007fff6f9e8e8d in
_ZSt8__invokeIZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiE3$_3JEENSt15__invoke_resultIT_JDpT0_EE4typeEOS5_DpOS6_
(__fn=...)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/bits/invoke.h:95
#36 0x00007fff6f9e8e65 in
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::$_3>
>::_M_invoke<0ul> (this=0x555557f9ae98)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/thread:234
#37 0x00007fff6f9e8e35 in
std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::$_3>
>::operator()() (this=0x555557f9ae98)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/thread:243
#38 0x00007fff6f9e8cd9 in
std::thread::_State_impl<std::thread::_Invoker<std::tuple<arrow::internal::ThreadPool::LaunchWorkersUnlocked(int)::$_3>
> >::_M_run() (
this=0x555557f9ae90)
at
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/thread:186
#39 0x00007ffff276d9b0 in std::execute_native_thread_routine (__p=<optimized
out>)
at
/home/conda/feedstock_root/build_artifacts/gcc_compilers_1628138005912/work/build/x86_64-conda-linux-gnu/libstdc++-v3/include/bits/new_allocator.h:82
#40 0x00007ffff4f1d6db in start_thread (arg=0x7fff94e34700) at
pthread_create.c:463
#41 0x00007ffff762471f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:95
{noformat}
It also doesn't fail consistently for me; it sometimes gives this error instead,
so I think there are at least two bugs wrapped up here:
{noformat}
Error: Invalid: Arrays used to construct an ExecBatch must have equal length
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/aggregate_node.cc:387
ExecBatch::Make({batch.values[agg_src_field_ids_[i]], id_batch})
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/compute/exec/exec_plan.cc:417
iterator_.Next()
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/record_batch.cc:318
ReadNext(&batch)
/home/lidavidm/Code/upstream/arrow-14583/cpp/src/arrow/record_batch.cc:329
ReadAll(&batches)
{noformat}
> [R][C++] Crash when summarizing after filtering to no rows on partitioned data
> ------------------------------------------------------------------------------
>
> Key: ARROW-14583
> URL: https://issues.apache.org/jira/browse/ARROW-14583
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++, R
> Affects Versions: 6.0.0
> Environment: I am using a windows 10 machine, R 4.1.0, up to date R
> packages, and latest RStudio IDE.
> Reporter: Zsolt Kegyes-Brassai
> Assignee: David Li
> Priority: Major
> Labels: pull-request-available, query-engine
> Time Spent: 1h
> Remaining Estimate: 0h
>
> Original issue report is below; here's an even more minimal example:
> {code:r}
> library(arrow)
> library(dplyr)
> td <- tempfile()
> dir.create(td)
> # if there is no partitioning in the data, this won't segfault
> # write_dataset(iris, td) - swap this in and won't segfault
> write_dataset(group_by(iris, Species), td)
> open_dataset(td) %>%
> filter(Species == "tulip") %>%
> group_by(Sepal.Length) %>%
> summarise(n = n()) %>%
> collect()
> {code}
> ----
> I was trying the new features introduced in latest {{arrow (6.0.2)}} package
> based on examples from the “New Directions for Apache Arrow” talk.
> The RStudio IDE was crashing and the R session was aborted.
> Looking closely I found that I downloaded only 2 years of data (2018 & 2019)
> and after the first filter ({{year == 2015}}) no data remains to be processed
> further.
> After some debugging, by replacing the collect() function, it turns out that
> {{summarize()}} is the function that is causing the crash.
>
> {code:r}
> as_dataset <- open_dataset("c:/Rproj_learn/nyc-taxi/",
> partitioning = c("year", "month")) %>%
> filter(total_amount > 100 & year == 2015) %>%
> select(tip_amount, total_amount, passenger_count) %>%
> mutate(tip_pct = tip_amount / total_amount * 100) %>%
> group_by(passenger_count) %>%
> summarize(avg_tip_pct = mean(tip_pct), n = n()) %>%
> filter(n > 5000) %>%
> arrange(desc(avg_tip_pct)) %>%
> collect(){code}
>
> I would expect to get an error message (without crashing the IDE), which can
> be handled in code.
> Another alternative result would be an empty data.frame, as in the case when
> the parquet file was read in as a data.frame. I simulated this situation by
> setting a high {{total_amount}} value when filtering. Note: when using an
> Arrow table an error message is generated.
>
> {code:r}
> library(tidyverse)
> #> Warning: package 'tibble' was built under R version 4.1.1
> #> Warning: package 'tidyr' was built under R version 4.1.1
> #> Warning: package 'readr' was built under R version 4.1.1
> library(arrow)
> #> Warning: package 'arrow' was built under R version 4.1.1
> #>
> #> Attaching package: 'arrow'
> #> The following object is masked from 'package:utils':
> #>
> #> timestamp
> read_parquet("c:/Rproj_learn/nyc-taxi/2018/01/data.parquet",
> as_data_frame = FALSE) %>%
> # filter(total_amount > 100) %>%
> filter(total_amount > 1e10) %>%
> select(tip_amount, total_amount, passenger_count) %>%
> mutate(tip_pct = tip_amount / total_amount * 100) %>%
> group_by(passenger_count) %>%
> summarize(avg_tip_pct = mean(tip_pct), n = n()) %>%
> filter(n > 500) %>%
> arrange(desc(avg_tip_pct)) %>%
> collect()
> #> Error: Invalid: Must pass at least one array
> read_parquet("c:/Rproj_learn/nyc-taxi/2018/01/data.parquet",
> as_data_frame = TRUE) %>%
> # filter(total_amount > 100) %>%
> filter(total_amount > 1e10) %>%
> select(tip_amount, total_amount, passenger_count) %>%
> mutate(tip_pct = tip_amount / total_amount * 100) %>%
> group_by(passenger_count) %>%
> summarize(avg_tip_pct = mean(tip_pct), n = n()) %>%
> filter(n > 500) %>%
> arrange(desc(avg_tip_pct)) %>%
> collect()
> #> # A tibble: 0 x 3
> #> # ... with 3 variables: passenger_count <int>, avg_tip_pct <dbl>, n <int>
> {code}
--
This message was sent by Atlassian Jira
(v8.20.1#820001)