wesm commented on a change in pull request #7454:
URL: https://github.com/apache/arrow/pull/7454#discussion_r442474926
##########
File path: cpp/src/arrow/table.cc
##########
@@ -721,12 +723,34 @@ Result<std::shared_ptr<Table>> Table::CombineChunks(MemoryPool* pool) const {
   const int ncolumns = num_columns();
   std::vector<std::shared_ptr<ChunkedArray>> compacted_columns(ncolumns);
   for (int i = 0; i < ncolumns; ++i) {
-    auto col = column(i);
+    const auto& col = column(i);
     if (col->num_chunks() <= 1) {
       compacted_columns[i] = col;
+      continue;
+    }
+
+    if (is_binary_like(col->type()->id())) {
+      // ARROW-5744 Allow binary columns to be combined into multiple chunks to avoid
+      // buffer overflow
+      ArrayVector chunks;
+      int chunk_i = 0;
+      while (chunk_i < col->num_chunks()) {
+        ArrayVector safe_chunks;
+        int64_t data_length = 0;
+        for (; chunk_i < col->num_chunks(); ++chunk_i) {
+          const auto& chunk = col->chunk(chunk_i);
+          data_length += checked_cast<const BinaryArray&>(*chunk).total_values_length();
+          if (data_length >= std::numeric_limits<int32_t>::max() - 1) {

Review comment:
       We should use `kBinaryMemoryLimit` here
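For illustration, here is a minimal, self-contained sketch of the chunk-grouping idea under review, written against the named `kBinaryMemoryLimit` constant the comment asks for rather than the inline `std::numeric_limits<int32_t>::max() - 1`. The helper `GroupChunkLengths` and the sample lengths are hypothetical stand-ins, not Arrow APIs:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Mirrors the constant declared in arrow/array/builder_binary.h: the maximum
// total value-bytes a single BinaryArray can hold with int32 offsets.
constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;

// Hypothetical helper: given each chunk's total value length, group
// consecutive chunks so every group's combined length stays below
// kBinaryMemoryLimit and can therefore be concatenated without overflowing
// the int32 offsets of the combined array.
std::vector<std::vector<int>> GroupChunkLengths(const std::vector<int64_t>& lengths) {
  std::vector<std::vector<int>> groups;
  std::vector<int> current;
  int64_t total = 0;
  for (int i = 0; i < static_cast<int>(lengths.size()); ++i) {
    if (!current.empty() && total + lengths[i] >= kBinaryMemoryLimit) {
      groups.push_back(current);  // close the group before it would overflow
      current.clear();
      total = 0;
    }
    current.push_back(i);
    total += lengths[i];
  }
  if (!current.empty()) groups.push_back(current);
  return groups;
}

int main() {
  // Three ~1 GB string chunks: the first two fit in one combined array, the
  // third must start a new output chunk (the ARROW-5744 behavior).
  std::vector<int64_t> lengths = {1'000'000'000, 1'000'000'000, 1'000'000'000};
  std::cout << "output chunks: " << GroupChunkLengths(lengths).size() << "\n";  // 2
  return 0;
}
```

Beyond avoiding a magic number, the named constant makes the intent at the call site self-documenting: the loop is guarding the 32-bit offset range of the combined binary array.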