[ 
https://issues.apache.org/jira/browse/ARROW-16417?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17530081#comment-17530081
 ] 

David Li commented on ARROW-16417:
----------------------------------

I tried porting the test to C++, but this version doesn't fail:

{code:cpp}
// Attempted C++ port of the ARROW-16417 scenario: repeatedly build and run a
// RIGHT_SEMI hash-join plan over two tables created from empty JSON batch
// lists, then sort the collected output by "colB".
TEST(HashJoin, Arrow16417) {
  // Both inputs are created from an empty list of JSON batches.
  auto left_table =
      TableFromJSON(schema({field("colA", int64()), field("col2", utf8())}), {});
  auto right_table =
      TableFromJSON(schema({field("colB", int64()), field("col3", utf8())}), {});

  // Join colA == colB, emitting only the right-hand columns.
  HashJoinNodeOptions join_opts(JoinType::RIGHT_SEMI,
                                /*left_keys=*/{"colA"},
                                /*right_keys=*/{"colB"},
                                /*left_output=*/{},
                                /*right_output=*/{FieldRef("colB"), FieldRef("col3")},
                                /*filter=*/literal(true),
                                /*left_suffix=*/"",
                                /*right_suffix=*/"");
  auto exp_schema = schema({field("colB", int64()), field("col3", utf8())});

  // One execution context, backed by the CPU thread pool, shared by every iteration.
  auto exec_ctx = arrow::internal::make_unique<ExecContext>(
      default_memory_pool(), arrow::internal::GetCpuThreadPool());

  // The plan is rebuilt and rerun many times within a single test run.
  constexpr int kIterations = 10000;
  for (int iter = 0; iter < kIterations; ++iter) {
    std::shared_ptr<Table> result_table;
    {
      // The plan, its source options, the sink generator and the reader all live
      // in this inner scope; only `result_table` outlives it.
      ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get()));

      ASSERT_OK_AND_ASSIGN(
          auto left_source_opts,
          SourceNodeOptions::FromTable(*left_table, exec_ctx->executor()));
      ASSERT_OK_AND_ASSIGN(
          auto right_source_opts,
          SourceNodeOptions::FromTable(*right_table, exec_ctx->executor()));

      // hashjoin(source(left), source(right)) -> sink
      Declaration join_decl{"hashjoin", join_opts};
      join_decl.inputs.emplace_back(Declaration{"source", *left_source_opts});
      join_decl.inputs.emplace_back(Declaration{"source", *right_source_opts});

      AsyncGenerator<util::optional<ExecBatch>> sink_gen;
      auto pipeline =
          Declaration::Sequence({join_decl, {"sink", SinkNodeOptions{&sink_gen}}});
      ASSERT_OK(pipeline.AddToPlan(plan.get()));

      auto reader = MakeGeneratorReader(exp_schema, sink_gen, default_memory_pool());

      ASSERT_OK(plan->Validate());
      ASSERT_OK(plan->StartProducing());

      // Drain the sink into a table before asking the plan to stop.
      ASSERT_OK_AND_ASSIGN(result_table, Table::FromRecordBatchReader(reader.get()));

      plan->StopProducing();
    }

    // Post-process the output: combine chunks, then sort by colB via sort-indices + take.
    ASSERT_OK_AND_ASSIGN(auto combined, result_table->CombineChunks());
    ASSERT_OK_AND_ASSIGN(
        auto indices, SortIndices(Datum(combined), SortOptions{{SortKey{"colB"}}}));
    ASSERT_OK_AND_ASSIGN(auto sorted, Take(Datum(combined), indices));
  }
}
{code}

> [C++][Python] Segfault in test_exec_plan.py / test_joins
> --------------------------------------------------------
>
>                 Key: ARROW-16417
>                 URL: https://issues.apache.org/jira/browse/ARROW-16417
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++, Python
>    Affects Versions: 8.0.0
>            Reporter: David Li
>            Priority: Major
>
> Occurs during wheel verification. It also happens on master. The failure is 
> sporadic but fairly reliable. test_joins is parameterized; it's not 
> consistent in the parameters it occurs on, but it consistently occurs on that 
> test.
> The backtrace reaches into malloc_consolidate. MALLOC_CHECK doesn't help. 
> However:
> {noformat}
> (gdb) b main
> Breakpoint 1 at 0x11ea20: file 
> /home/conda/feedstock_root/build_artifacts/python-split_1625973859697/work/Programs/python.c,
>  line 15.
> (gdb) command 1
> Type commands for breakpoint(s) 1, one per line.
> End with a line saying just "end".
> >call mcheck(0)
> >continue
> >end {noformat}
> This fairly consistently fails with "memory clobbered before allocated block" 
> but the location varies. 
> This may be a red herring though. I also tried LD_PRELOADING a secure build 
> of mimalloc to see if it would catch any sort of heap corruption but instead 
> the tests pass consistently with mimalloc.
>  



--
This message was sent by Atlassian Jira
(v8.20.7#820007)

Reply via email to