[ 
https://issues.apache.org/jira/browse/ARROW-14898?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Chenxi Li updated ARROW-14898:
------------------------------
    Description: 
I encountered a crash when executing GroupBy on specific data with 
{{ARROW_JEMALLOC=ON}}. I couldn't reproduce the crash with a simple test, so I 
put the code and data here. I think the root cause is the tail processing in 
{{Hashing::hash_varlen}} of {{key_hash.cc}}.

The steps of related code are as follows:
 # {{Hashing::hash_varlen}} calls {{helper_tail}} to process the tail part of 
the key, based on {{key_length}}
 # {{helper_tail}} calls {{util::SafeLoadAs}} to load 8 bytes data of the key
 # {{util::SafeLoadAs}} calls {{std::memcpy}} to copy 8 bytes of data from the 
key

If the key is shorter than 8 bytes, {{std::memcpy}} still copies 8 bytes, which 
may read past the end of the key into invalid memory.

Stacktrace:
{noformat}
Thread 0 Crashed:: Dispatch queue: com.apple.main-thread
0   libarrow.700.0.0.dylib            0x000000010a77af2c 
std::__1::enable_if<std::is_trivial<unsigned long long const>::value, unsigned 
long long const>::type arrow::util::SafeLoadAs<unsigned long long 
const>(unsigned char const*) + 12 (ubsan.h:59)
1   libarrow.700.0.0.dylib            0x000000010a779c95 
arrow::compute::Hashing::helper_tail(unsigned int, unsigned long long, unsigned 
char const*, unsigned int) + 37 (key_hash.cc:130)
2   libarrow.700.0.0.dylib            0x000000010a77a62d 
arrow::compute::Hashing::hash_varlen(long long, unsigned int, unsigned int 
const*, unsigned char const*, unsigned int*, unsigned int*) + 381 
(key_hash.cc:234)
3   libarrow.700.0.0.dylib            0x000000010a77abb8 
arrow::compute::Hashing::HashMultiColumn(std::__1::vector<arrow::compute::KeyEncoder::KeyColumnArray,
 std::__1::allocator<arrow::compute::KeyEncoder::KeyColumnArray> > const&, 
arrow::compute::KeyEncoder::KeyEncoderContext*, unsigned int*) + 1176 
(key_hash.cc:292)
4   libarrow.700.0.0.dylib            0x000000010a9ee135 
arrow::compute::internal::(anonymous 
namespace)::GrouperFastImpl::ConsumeImpl(arrow::compute::ExecBatch const&) + 
2149 (hash_aggregate.cc:355)
5   libarrow.700.0.0.dylib            0x000000010a9ea0d9 
arrow::compute::internal::(anonymous 
namespace)::GrouperFastImpl::Consume(arrow::compute::ExecBatch const&) + 1001 
(hash_aggregate.cc:297)
6   libarrow.700.0.0.dylib            0x000000010aa856ef 
arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*)::$_2::operator()() const + 607 
(hash_aggregate.cc:2501)
7   libarrow.700.0.0.dylib            0x000000010aa85414 
arrow::internal::FnOnce<arrow::Status 
()>::FnImpl<arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*)::$_2>::invoke() + 52 (functional.h:152)
8   libarrow.700.0.0.dylib            0x000000010a46ab0d 
arrow::internal::FnOnce<arrow::Status ()>::operator()() && + 77 
(functional.h:140)
9   libarrow.700.0.0.dylib            0x000000010a46911f 
arrow::internal::(anonymous 
namespace)::SerialTaskGroup::AppendReal(arrow::internal::FnOnce<arrow::Status 
()>) + 335 (task_group.cc:49)
10  libarrow.700.0.0.dylib            0x000000010a9d8856 void 
arrow::internal::TaskGroup::Append<arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum,
 std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*)::$_2>(arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum,
 std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*)::$_2&&) + 102 (task_group.h:59)
11  libarrow.700.0.0.dylib            0x000000010a9d6e18 
arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*) + 4472 (hash_aggregate.cc:2489)
12  arrow-compute-aggregate-test      0x000000010242d549 
run_group_by(std::__1::basic_string<char, std::__1::char_traits<char>, 
std::__1::allocator<char> > const&) + 1001 (hash_aggregate_test.cc:2901)
13  arrow-compute-aggregate-test      0x000000010242d124 
GroupBy_wtf_Test::TestBody() + 68 (hash_aggregate_test.cc:2909)
14  libarrow_testing.700.0.0.dylib    0x0000000102fb3344 void 
testing::internal::HandleSehExceptionsInMethodIfSupported<testing::Test, 
void>(testing::Test*, void (testing::Test::*)(), char const*) + 132
15  libarrow_testing.700.0.0.dylib    0x0000000102f75d3b void 
testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, 
void>(testing::Test*, void (testing::Test::*)(), char const*) + 123
16  libarrow_testing.700.0.0.dylib    0x0000000102f75c73 testing::Test::Run() + 
195
17  libarrow_testing.700.0.0.dylib    0x0000000102f76e54 
testing::TestInfo::Run() + 244
18  libarrow_testing.700.0.0.dylib    0x0000000102f77f95 
testing::TestSuite::Run() + 261
19  libarrow_testing.700.0.0.dylib    0x0000000102f8725d 
testing::internal::UnitTestImpl::RunAllTests() + 1021
20  libarrow_testing.700.0.0.dylib    0x0000000102fb8894 bool 
testing::internal::HandleSehExceptionsInMethodIfSupported<testing::internal::UnitTestImpl,
 bool>(testing::internal::UnitTestImpl*, bool 
(testing::internal::UnitTestImpl::*)(), char const*) + 132
21  libarrow_testing.700.0.0.dylib    0x0000000102f86bfb bool 
testing::internal::HandleExceptionsInMethodIfSupported<testing::internal::UnitTestImpl,
 bool>(testing::internal::UnitTestImpl*, bool 
(testing::internal::UnitTestImpl::*)(), char const*) + 123
22  libarrow_testing.700.0.0.dylib    0x0000000102f86acd 
testing::UnitTest::Run() + 205
23  arrow-compute-aggregate-test      0x00000001024adab1 RUN_ALL_TESTS() + 17
24  arrow-compute-aggregate-test      0x00000001024ada90 main + 64
25  libdyld.dylib                     0x00007fff204def3d start + 1{noformat}
The code below, together with the attached data file [^arrow_14898.arr], 
reproduces the crash. You can paste the code at the end of 
{{arrow/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc}}:
{code:c++}
#include "arrow/ipc/api.h"
#include "arrow/io/file.h"

#include <string>
#include <vector>

TEST(GroupBy, ARROW_14898) {
  // Repeat enough times
  for (int i = 0; i < 100; i++) {
    // Read file
    auto file_patch = "arrow_14898.arr";
    auto mmap_file =
        arrow::io::MemoryMappedFile::Open(file_patch, arrow::io::FileMode::READ)
            .ValueOrDie();
    auto record_batch_reader =
        arrow::ipc::RecordBatchFileReader::Open(mmap_file.get(),
                                                
arrow::ipc::IpcReadOptions::Defaults())
            .ValueOrDie();
    auto record_batch = record_batch_reader->ReadRecordBatch(0).ValueOrDie();

    // Create data for GroupBy
    // If the size is close to 8192, the crash happens
    int size = 8192;
    std::vector<std::string> vec(size, "a");
    std::shared_ptr<arrow::Array> array;
    arrow::StringBuilder builder;
    auto _ = builder.AppendValues(vec);
    _ = builder.Finish(&array);

    // Call GroupBy will crash in Hashing::helper_tail
    arrow::compute::CountOptions all(arrow::compute::CountOptions::ALL);
    auto res =
        arrow::compute::internal::GroupBy({array}, {array}, {{"hash_count", 
&all}}, false);
  }
}
{code}

  was:
I encountered a crash when executing GroupBy on specific data with 
`ARROW_JEMALLOC=ON'. I can't use simple code to reproduce the crash. So I put 
the code and data here. I think the root cause is the tail process in 
{{Hashing::hash_varlen}} of {{key_hash.cc.}}

The steps of related code are as follows:
 # {{Hashing::hash_varlen}} calls {{helper_tail}} if {{key_length}} for the 
tail part of the key
 # {{helper_tail}} calls {{util::SafeLoadAs}} to load 8 bytes data of the key
 # {{util::SafeLoadAs}} calls {{std::memcpy}} to copy 8 bytes of data from the 
key

If the key is less than 8 bytes, the {{std::memcpy}} still copies 8 bytes which 
may access illegal memory.

Stacktrace:
{noformat}
Thread 0 Crashed:: Dispatch queue: com.apple.main-thread
0   libarrow.700.0.0.dylib            0x000000010a77af2c 
std::__1::enable_if<std::is_trivial<unsigned long long const>::value, unsigned 
long long const>::type arrow::util::SafeLoadAs<unsigned long long 
const>(unsigned char const*) + 12 (ubsan.h:59)
1   libarrow.700.0.0.dylib            0x000000010a779c95 
arrow::compute::Hashing::helper_tail(unsigned int, unsigned long long, unsigned 
char const*, unsigned int) + 37 (key_hash.cc:130)
2   libarrow.700.0.0.dylib            0x000000010a77a62d 
arrow::compute::Hashing::hash_varlen(long long, unsigned int, unsigned int 
const*, unsigned char const*, unsigned int*, unsigned int*) + 381 
(key_hash.cc:234)
3   libarrow.700.0.0.dylib            0x000000010a77abb8 
arrow::compute::Hashing::HashMultiColumn(std::__1::vector<arrow::compute::KeyEncoder::KeyColumnArray,
 std::__1::allocator<arrow::compute::KeyEncoder::KeyColumnArray> > const&, 
arrow::compute::KeyEncoder::KeyEncoderContext*, unsigned int*) + 1176 
(key_hash.cc:292)
4   libarrow.700.0.0.dylib            0x000000010a9ee135 
arrow::compute::internal::(anonymous 
namespace)::GrouperFastImpl::ConsumeImpl(arrow::compute::ExecBatch const&) + 
2149 (hash_aggregate.cc:355)
5   libarrow.700.0.0.dylib            0x000000010a9ea0d9 
arrow::compute::internal::(anonymous 
namespace)::GrouperFastImpl::Consume(arrow::compute::ExecBatch const&) + 1001 
(hash_aggregate.cc:297)
6   libarrow.700.0.0.dylib            0x000000010aa856ef 
arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*)::$_2::operator()() const + 607 
(hash_aggregate.cc:2501)
7   libarrow.700.0.0.dylib            0x000000010aa85414 
arrow::internal::FnOnce<arrow::Status 
()>::FnImpl<arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*)::$_2>::invoke() + 52 (functional.h:152)
8   libarrow.700.0.0.dylib            0x000000010a46ab0d 
arrow::internal::FnOnce<arrow::Status ()>::operator()() && + 77 
(functional.h:140)
9   libarrow.700.0.0.dylib            0x000000010a46911f 
arrow::internal::(anonymous 
namespace)::SerialTaskGroup::AppendReal(arrow::internal::FnOnce<arrow::Status 
()>) + 335 (task_group.cc:49)
10  libarrow.700.0.0.dylib            0x000000010a9d8856 void 
arrow::internal::TaskGroup::Append<arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum,
 std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*)::$_2>(arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum,
 std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*)::$_2&&) + 102 (task_group.h:59)
11  libarrow.700.0.0.dylib            0x000000010a9d6e18 
arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
std::__1::allocator<arrow::Datum> > const&, 
std::__1::vector<arrow::compute::internal::Aggregate, 
std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
arrow::compute::ExecContext*) + 4472 (hash_aggregate.cc:2489)
12  arrow-compute-aggregate-test      0x000000010242d549 
run_group_by(std::__1::basic_string<char, std::__1::char_traits<char>, 
std::__1::allocator<char> > const&) + 1001 (hash_aggregate_test.cc:2901)
13  arrow-compute-aggregate-test      0x000000010242d124 
GroupBy_wtf_Test::TestBody() + 68 (hash_aggregate_test.cc:2909)
14  libarrow_testing.700.0.0.dylib    0x0000000102fb3344 void 
testing::internal::HandleSehExceptionsInMethodIfSupported<testing::Test, 
void>(testing::Test*, void (testing::Test::*)(), char const*) + 132
15  libarrow_testing.700.0.0.dylib    0x0000000102f75d3b void 
testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, 
void>(testing::Test*, void (testing::Test::*)(), char const*) + 123
16  libarrow_testing.700.0.0.dylib    0x0000000102f75c73 testing::Test::Run() + 
195
17  libarrow_testing.700.0.0.dylib    0x0000000102f76e54 
testing::TestInfo::Run() + 244
18  libarrow_testing.700.0.0.dylib    0x0000000102f77f95 
testing::TestSuite::Run() + 261
19  libarrow_testing.700.0.0.dylib    0x0000000102f8725d 
testing::internal::UnitTestImpl::RunAllTests() + 1021
20  libarrow_testing.700.0.0.dylib    0x0000000102fb8894 bool 
testing::internal::HandleSehExceptionsInMethodIfSupported<testing::internal::UnitTestImpl,
 bool>(testing::internal::UnitTestImpl*, bool 
(testing::internal::UnitTestImpl::*)(), char const*) + 132
21  libarrow_testing.700.0.0.dylib    0x0000000102f86bfb bool 
testing::internal::HandleExceptionsInMethodIfSupported<testing::internal::UnitTestImpl,
 bool>(testing::internal::UnitTestImpl*, bool 
(testing::internal::UnitTestImpl::*)(), char const*) + 123
22  libarrow_testing.700.0.0.dylib    0x0000000102f86acd 
testing::UnitTest::Run() + 205
23  arrow-compute-aggregate-test      0x00000001024adab1 RUN_ALL_TESTS() + 17
24  arrow-compute-aggregate-test      0x00000001024ada90 main + 64
25  libdyld.dylib                     0x00007fff204def3d start + 1{noformat}
The code and data file [^arrow_14898.arr] to reproduce the crash. You can paste 
it into the end of 
{{arrow/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc}}
{code:c++}
#include "arrow/ipc/api.h"
#include "arrow/io/file.h"

#include <string>
#include <vector>

TEST(GroupBy, ARROW_14898) {
  // Repeat enough times
  for (int i = 0; i < 100; i++) {
    // Read file
    auto file_patch = "arrow_14898.arr";
    auto mmap_file =
        arrow::io::MemoryMappedFile::Open(file_patch, arrow::io::FileMode::READ)
            .ValueOrDie();
    auto record_batch_reader =
        arrow::ipc::RecordBatchFileReader::Open(mmap_file.get(),
                                                
arrow::ipc::IpcReadOptions::Defaults())
            .ValueOrDie();
    auto record_batch = record_batch_reader->ReadRecordBatch(0).ValueOrDie();

    // Create data for GroupBy
    // If the size is close to 8192, the crash happens
    int size = 8192;
    std::vector<std::string> vec(size, "a");
    std::shared_ptr<arrow::Array> array;
    arrow::StringBuilder builder;
    auto _ = builder.AppendValues(vec);
    _ = builder.Finish(&array);

    // Call GroupBy will crash in Hashing::helper_tail
    arrow::compute::CountOptions all(arrow::compute::CountOptions::ALL);
    auto res =
        arrow::compute::internal::GroupBy({array}, {array}, {{"hash_count", 
&all}}, false);
  }
}
{code}


> [C++] Crash of out-of-bounds memory accessing in key_hash if a key is smaller 
> than int64
> ----------------------------------------------------------------------------------------
>
>                 Key: ARROW-14898
>                 URL: https://issues.apache.org/jira/browse/ARROW-14898
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++
>            Reporter: Chenxi Li
>            Assignee: Chenxi Li
>            Priority: Major
>              Labels: pull-request-available
>         Attachments: arrow_14898.arr
>
>          Time Spent: 20m
>  Remaining Estimate: 0h
>
> I encountered a crash when executing GroupBy on specific data with 
> {{ARROW_JEMALLOC=ON}}. I couldn't reproduce the crash with a simple test, so 
> I put the code and data here. I think the root cause is the tail processing 
> in {{Hashing::hash_varlen}} of {{key_hash.cc}}.
> The steps of related code are as follows:
>  # {{Hashing::hash_varlen}} calls {{helper_tail}} to process the tail part 
> of the key, based on {{key_length}}
>  # {{helper_tail}} calls {{util::SafeLoadAs}} to load 8 bytes data of the key
>  # {{util::SafeLoadAs}} calls {{std::memcpy}} to copy 8 bytes of data from 
> the key
> If the key is shorter than 8 bytes, {{std::memcpy}} still copies 8 bytes, 
> which may read past the end of the key into invalid memory.
> Stacktrace:
> {noformat}
> Thread 0 Crashed:: Dispatch queue: com.apple.main-thread
> 0   libarrow.700.0.0.dylib            0x000000010a77af2c 
> std::__1::enable_if<std::is_trivial<unsigned long long const>::value, 
> unsigned long long const>::type arrow::util::SafeLoadAs<unsigned long long 
> const>(unsigned char const*) + 12 (ubsan.h:59)
> 1   libarrow.700.0.0.dylib            0x000000010a779c95 
> arrow::compute::Hashing::helper_tail(unsigned int, unsigned long long, 
> unsigned char const*, unsigned int) + 37 (key_hash.cc:130)
> 2   libarrow.700.0.0.dylib            0x000000010a77a62d 
> arrow::compute::Hashing::hash_varlen(long long, unsigned int, unsigned int 
> const*, unsigned char const*, unsigned int*, unsigned int*) + 381 
> (key_hash.cc:234)
> 3   libarrow.700.0.0.dylib            0x000000010a77abb8 
> arrow::compute::Hashing::HashMultiColumn(std::__1::vector<arrow::compute::KeyEncoder::KeyColumnArray,
>  std::__1::allocator<arrow::compute::KeyEncoder::KeyColumnArray> > const&, 
> arrow::compute::KeyEncoder::KeyEncoderContext*, unsigned int*) + 1176 
> (key_hash.cc:292)
> 4   libarrow.700.0.0.dylib            0x000000010a9ee135 
> arrow::compute::internal::(anonymous 
> namespace)::GrouperFastImpl::ConsumeImpl(arrow::compute::ExecBatch const&) + 
> 2149 (hash_aggregate.cc:355)
> 5   libarrow.700.0.0.dylib            0x000000010a9ea0d9 
> arrow::compute::internal::(anonymous 
> namespace)::GrouperFastImpl::Consume(arrow::compute::ExecBatch const&) + 1001 
> (hash_aggregate.cc:297)
> 6   libarrow.700.0.0.dylib            0x000000010aa856ef 
> arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
> std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
> std::__1::allocator<arrow::Datum> > const&, 
> std::__1::vector<arrow::compute::internal::Aggregate, 
> std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
> arrow::compute::ExecContext*)::$_2::operator()() const + 607 
> (hash_aggregate.cc:2501)
> 7   libarrow.700.0.0.dylib            0x000000010aa85414 
> arrow::internal::FnOnce<arrow::Status 
> ()>::FnImpl<arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
> std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
> std::__1::allocator<arrow::Datum> > const&, 
> std::__1::vector<arrow::compute::internal::Aggregate, 
> std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
> arrow::compute::ExecContext*)::$_2>::invoke() + 52 (functional.h:152)
> 8   libarrow.700.0.0.dylib            0x000000010a46ab0d 
> arrow::internal::FnOnce<arrow::Status ()>::operator()() && + 77 
> (functional.h:140)
> 9   libarrow.700.0.0.dylib            0x000000010a46911f 
> arrow::internal::(anonymous 
> namespace)::SerialTaskGroup::AppendReal(arrow::internal::FnOnce<arrow::Status 
> ()>) + 335 (task_group.cc:49)
> 10  libarrow.700.0.0.dylib            0x000000010a9d8856 void 
> arrow::internal::TaskGroup::Append<arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum,
>  std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
> std::__1::allocator<arrow::Datum> > const&, 
> std::__1::vector<arrow::compute::internal::Aggregate, 
> std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
> arrow::compute::ExecContext*)::$_2>(arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum,
>  std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
> std::__1::allocator<arrow::Datum> > const&, 
> std::__1::vector<arrow::compute::internal::Aggregate, 
> std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
> arrow::compute::ExecContext*)::$_2&&) + 102 (task_group.h:59)
> 11  libarrow.700.0.0.dylib            0x000000010a9d6e18 
> arrow::compute::internal::GroupBy(std::__1::vector<arrow::Datum, 
> std::__1::allocator<arrow::Datum> > const&, std::__1::vector<arrow::Datum, 
> std::__1::allocator<arrow::Datum> > const&, 
> std::__1::vector<arrow::compute::internal::Aggregate, 
> std::__1::allocator<arrow::compute::internal::Aggregate> > const&, bool, 
> arrow::compute::ExecContext*) + 4472 (hash_aggregate.cc:2489)
> 12  arrow-compute-aggregate-test      0x000000010242d549 
> run_group_by(std::__1::basic_string<char, std::__1::char_traits<char>, 
> std::__1::allocator<char> > const&) + 1001 (hash_aggregate_test.cc:2901)
> 13  arrow-compute-aggregate-test      0x000000010242d124 
> GroupBy_wtf_Test::TestBody() + 68 (hash_aggregate_test.cc:2909)
> 14  libarrow_testing.700.0.0.dylib    0x0000000102fb3344 void 
> testing::internal::HandleSehExceptionsInMethodIfSupported<testing::Test, 
> void>(testing::Test*, void (testing::Test::*)(), char const*) + 132
> 15  libarrow_testing.700.0.0.dylib    0x0000000102f75d3b void 
> testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, 
> void>(testing::Test*, void (testing::Test::*)(), char const*) + 123
> 16  libarrow_testing.700.0.0.dylib    0x0000000102f75c73 testing::Test::Run() 
> + 195
> 17  libarrow_testing.700.0.0.dylib    0x0000000102f76e54 
> testing::TestInfo::Run() + 244
> 18  libarrow_testing.700.0.0.dylib    0x0000000102f77f95 
> testing::TestSuite::Run() + 261
> 19  libarrow_testing.700.0.0.dylib    0x0000000102f8725d 
> testing::internal::UnitTestImpl::RunAllTests() + 1021
> 20  libarrow_testing.700.0.0.dylib    0x0000000102fb8894 bool 
> testing::internal::HandleSehExceptionsInMethodIfSupported<testing::internal::UnitTestImpl,
>  bool>(testing::internal::UnitTestImpl*, bool 
> (testing::internal::UnitTestImpl::*)(), char const*) + 132
> 21  libarrow_testing.700.0.0.dylib    0x0000000102f86bfb bool 
> testing::internal::HandleExceptionsInMethodIfSupported<testing::internal::UnitTestImpl,
>  bool>(testing::internal::UnitTestImpl*, bool 
> (testing::internal::UnitTestImpl::*)(), char const*) + 123
> 22  libarrow_testing.700.0.0.dylib    0x0000000102f86acd 
> testing::UnitTest::Run() + 205
> 23  arrow-compute-aggregate-test      0x00000001024adab1 RUN_ALL_TESTS() + 17
> 24  arrow-compute-aggregate-test      0x00000001024ada90 main + 64
> 25  libdyld.dylib                     0x00007fff204def3d start + 1{noformat}
> The code below, together with the attached data file [^arrow_14898.arr], 
> reproduces the crash. You can paste the code at the end of 
> {{arrow/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc}}:
> {code:c++}
> #include "arrow/ipc/api.h"
> #include "arrow/io/file.h"
> #include <string>
> #include <vector>
> TEST(GroupBy, ARROW_14898) {
>   // Repeat enough times
>   for (int i = 0; i < 100; i++) {
>     // Read file
>     auto file_patch = "arrow_14898.arr";
>     auto mmap_file =
>         arrow::io::MemoryMappedFile::Open(file_patch, 
> arrow::io::FileMode::READ)
>             .ValueOrDie();
>     auto record_batch_reader =
>         arrow::ipc::RecordBatchFileReader::Open(mmap_file.get(),
>                                                 
> arrow::ipc::IpcReadOptions::Defaults())
>             .ValueOrDie();
>     auto record_batch = record_batch_reader->ReadRecordBatch(0).ValueOrDie();
>     // Create data for GroupBy
>     // If the size is close to 8192, the crash happens
>     int size = 8192;
>     std::vector<std::string> vec(size, "a");
>     std::shared_ptr<arrow::Array> array;
>     arrow::StringBuilder builder;
>     auto _ = builder.AppendValues(vec);
>     _ = builder.Finish(&array);
>     // Call GroupBy will crash in Hashing::helper_tail
>     arrow::compute::CountOptions all(arrow::compute::CountOptions::ALL);
>     auto res =
>         arrow::compute::internal::GroupBy({array}, {array}, {{"hash_count", 
> &all}}, false);
>   }
> }
> {code}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

Reply via email to