[ 
https://issues.apache.org/jira/browse/ARROW-14898?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Chenxi Li updated ARROW-14898:
------------------------------
    Description: 
The code below and the attached data file [^arrow_14898.arr] reproduce the 
crash. You can paste the code at the end of 
arrow/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
{code:c++}
#include "arrow/ipc/api.h"
#include "arrow/io/file.h"

#include <string>
#include <vector>

// Reproducer for ARROW-14898 (issue title: out-of-bounds memory access in
// key_hash if a key is smaller than int64).
//
// Each iteration memory-maps the attached data file, reads its first record
// batch, builds a string array of 8192 one-character values ("a"), and runs
// GroupBy with a "hash_count" aggregate over that array.
//
// The test contains no assertions; it is meant to demonstrate the crash, not
// to check a result.
TEST(GroupBy, ARROW_14898) {
  // Run the whole scenario 100 times.
  // NOTE(review): presumably the crash does not trigger on every iteration,
  // hence the repetition -- confirm.
  for (int i = 0; i < 100; i++) {
    // Memory-map the attached data file (a relative path, so it is expected in
    // the working directory) and read record batch 0 from it.
    // `record_batch` is not referenced again below.
    // NOTE(review): presumably this read only sets up the memory state that
    // exposes the out-of-bounds access -- confirm before removing it.
    auto file_patch = "arrow_14898.arr";
    auto mmap_file =
        arrow::io::MemoryMappedFile::Open(file_patch, arrow::io::FileMode::READ)
            .ValueOrDie();
    auto record_batch_reader =
        arrow::ipc::RecordBatchFileReader::Open(mmap_file.get(),
                                                
arrow::ipc::IpcReadOptions::Defaults())
            .ValueOrDie();
    auto record_batch = record_batch_reader->ReadRecordBatch(0).ValueOrDie();

    // Create the data for GroupBy: `size` copies of the one-character string
    // "a", appended to a StringBuilder and finished into `array`.
    // According to the reporter, the crash happens when the size is close to
    // 8192.
    int size = 8192;
    std::vector<std::string> vec(size, "a");
    std::shared_ptr<arrow::Array> array;
    arrow::StringBuilder builder;
    // The values returned by AppendValues/Finish are stored in `_` and never
    // checked.
    auto _ = builder.AppendValues(vec);
    _ = builder.Finish(&array);

    // Calling GroupBy crashes in Hashing::helper_tail (per the reporter).
    // The same `array` is passed as both the first and the second argument.
    // NOTE(review): presumably the parameters are (arguments, keys,
    // aggregates, use_threads) -- verify against the GroupBy declaration.
    // `res` is not inspected.
    arrow::compute::CountOptions all(arrow::compute::CountOptions::ALL);
    auto res =
        arrow::compute::internal::GroupBy({array}, {array}, {{"hash_count", 
&all}}, false);
  }
}
{code}

> [C++] Out-of-bounds memory accessing in key_hash if a key is smaller than 
> int64
> -------------------------------------------------------------------------------
>
>                 Key: ARROW-14898
>                 URL: https://issues.apache.org/jira/browse/ARROW-14898
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++
>            Reporter: Chenxi Li
>            Assignee: Chenxi Li
>            Priority: Major
>         Attachments: arrow_14898.arr
>
>
> The code below and the attached data file [^arrow_14898.arr] reproduce the 
> crash. You can paste the code at the end of 
> arrow/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
> {code:c++}
> #include "arrow/ipc/api.h"
> #include "arrow/io/file.h"
> #include <string>
> #include <vector>
> TEST(GroupBy, ARROW_14898) {
>   // Repeat enough times
>   for (int i = 0; i < 100; i++) {
>     // Read file
>     auto file_patch = "arrow_14898.arr";
>     auto mmap_file =
>         arrow::io::MemoryMappedFile::Open(file_patch, 
> arrow::io::FileMode::READ)
>             .ValueOrDie();
>     auto record_batch_reader =
>         arrow::ipc::RecordBatchFileReader::Open(mmap_file.get(),
>                                                 
> arrow::ipc::IpcReadOptions::Defaults())
>             .ValueOrDie();
>     auto record_batch = record_batch_reader->ReadRecordBatch(0).ValueOrDie();
>     // Create data for GroupBy
>     // If the size is close to 8192, the crash happens
>     int size = 8192;
>     std::vector<std::string> vec(size, "a");
>     std::shared_ptr<arrow::Array> array;
>     arrow::StringBuilder builder;
>     auto _ = builder.AppendValues(vec);
>     _ = builder.Finish(&array);
>     // Call GroupBy will crash in Hashing::helper_tail
>     arrow::compute::CountOptions all(arrow::compute::CountOptions::ALL);
>     auto res =
>         arrow::compute::internal::GroupBy({array}, {array}, {{"hash_count", 
> &all}}, false);
>   }
> }
> {code}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

Reply via email to