[
https://issues.apache.org/jira/browse/ARROW-16417?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17530081#comment-17530081
]
David Li commented on ARROW-16417:
----------------------------------
Tried porting to C++ for testing but this doesn't fail:
{code:cpp}
// Attempted C++ port of the Python test_joins scenario from ARROW-16417.
// Builds and runs a RIGHT_SEMI hash-join plan over two tables many times in a
// row; it has no value assertions beyond the ASSERT_OK* status checks, so it
// only "fails" if a step returns an error or the process crashes.
TEST(HashJoin, Arrow16417) {
// Both inputs are created from an empty list of JSON chunks, so they carry a
// schema but (presumably) no rows.
auto t1 = TableFromJSON(schema({field("colA", int64()), field("col2",
utf8())}), {});
auto t2 = TableFromJSON(schema({field("colB", int64()), field("col3",
utf8())}), {});
// Join colA (left) against colB (right); only right-side columns are emitted,
// the filter is the constant `true`, and no column-name suffixes are applied.
HashJoinNodeOptions join_opts(
JoinType::RIGHT_SEMI,
/*left_keys=*/{"colA"},
/*right_keys=*/{"colB"},
/*left_output=*/{},
/*right_output=*/{FieldRef("colB"), FieldRef("col3")},
/*filter=*/literal(true),
/*left_suffix=*/"",
/*right_suffix=*/"");
// Schema handed to the generator reader below; mirrors the right_output list.
auto exp_schema = schema({field("colB", int64()), field("col3", utf8())});
// One ExecContext (default memory pool + CPU thread pool) is shared by every
// iteration.
auto exec_ctx = arrow::internal::make_unique<ExecContext>(
default_memory_pool(), arrow::internal::GetCpuThreadPool());
// The reported crash is sporadic, so the whole plan is rebuilt and re-run
// 10000 times.
for (int i = 0; i < 10000; i++) {
std::shared_ptr<Table> exp_table;
{
// Inner scope: the plan, source options, sink generator and reader declared
// here go out of scope at the closing brace, before the sort/take step;
// only exp_table outlives it.
ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get()));
ASSERT_OK_AND_ASSIGN(auto left_options, SourceNodeOptions::FromTable(*t1,
exec_ctx->executor()));
ASSERT_OK_AND_ASSIGN(auto right_options,
SourceNodeOptions::FromTable(*t2, exec_ctx->executor()));
// Plan shape: source(t1), source(t2) -> hashjoin -> sink.
Declaration join{"hashjoin", join_opts};
join.inputs.emplace_back(Declaration{"source", *left_options});
join.inputs.emplace_back(Declaration{"source", *right_options});
AsyncGenerator<util::optional<ExecBatch>> sink_gen;
ASSERT_OK(Declaration::Sequence({join, {"sink",
SinkNodeOptions{&sink_gen}}})
.AddToPlan(plan.get()));
// Wrap the sink generator in a reader so the output can be collected into a
// Table.
auto reader = MakeGeneratorReader(exp_schema, sink_gen,
default_memory_pool());
ASSERT_OK(plan->Validate());
ASSERT_OK(plan->StartProducing());
// Drain the reader into exp_table, then stop the plan.
ASSERT_OK_AND_ASSIGN(exp_table,
Table::FromRecordBatchReader(reader.get()));
plan->StopProducing();
}
// Post-process the join output: combine chunks, sort by colB, and take the
// rows in sorted order. `sorted` is never inspected; only the status checks
// matter here.
ASSERT_OK_AND_ASSIGN(auto combined, exp_table->CombineChunks());
ASSERT_OK_AND_ASSIGN(auto indices, SortIndices(Datum(combined),
SortOptions{{SortKey{"colB"}}}));
ASSERT_OK_AND_ASSIGN(auto sorted, Take(Datum(combined), indices));
}
}
{code}
> [C++][Python] Segfault in test_exec_plan.py / test_joins
> --------------------------------------------------------
>
> Key: ARROW-16417
> URL: https://issues.apache.org/jira/browse/ARROW-16417
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++, Python
> Affects Versions: 8.0.0
> Reporter: David Li
> Priority: Major
>
> Occurs during wheel verification. It also happens on master. The failure is
> sporadic but reproduces fairly reliably. test_joins is parameterized; the
> failing parameter combination varies between runs, but the failure always
> occurs in that test.
> The backtrace reaches into malloc_consolidate. MALLOC_CHECK_ doesn't help.
> However:
> {noformat}
> (gdb) b main
> Breakpoint 1 at 0x11ea20: file
> /home/conda/feedstock_root/build_artifacts/python-split_1625973859697/work/Programs/python.c,
> line 15.
> (gdb) command 1
> Type commands for breakpoint(s) 1, one per line.
> End with a line saying just "end".
> >call mcheck(0)
> >continue
> >end
> {noformat}
> This fairly consistently fails with "memory clobbered before allocated block"
> but the location varies.
> This may be a red herring though. I also tried LD_PRELOADING a secure build
> of mimalloc to see if it would catch any sort of heap corruption but instead
> the tests pass consistently with mimalloc.
>
--
This message was sent by Atlassian Jira
(v8.20.7#820007)