wesm commented on a change in pull request #7454:
URL: https://github.com/apache/arrow/pull/7454#discussion_r442474926



##########
File path: cpp/src/arrow/table.cc
##########
@@ -721,12 +723,34 @@ Result<std::shared_ptr<Table>> 
Table::CombineChunks(MemoryPool* pool) const {
   const int ncolumns = num_columns();
   std::vector<std::shared_ptr<ChunkedArray>> compacted_columns(ncolumns);
   for (int i = 0; i < ncolumns; ++i) {
-    auto col = column(i);
+    const auto& col = column(i);
     if (col->num_chunks() <= 1) {
       compacted_columns[i] = col;
+      continue;
+    }
+
+    if (is_binary_like(col->type()->id())) {
+      // ARROW-5744 Allow binary columns to be combined into multiple chunks 
to avoid
+      // buffer overflow
+      ArrayVector chunks;
+      int chunk_i = 0;
+      while (chunk_i < col->num_chunks()) {
+        ArrayVector safe_chunks;
+        int64_t data_length = 0;
+        for (; chunk_i < col->num_chunks(); ++chunk_i) {
+          const auto& chunk = col->chunk(chunk_i);
+          data_length += checked_cast<const 
BinaryArray&>(*chunk).total_values_length();
+          if (data_length >= std::numeric_limits<int32_t>::max() - 1) {

Review comment:
       We should use `kBinaryMemoryLimit` here instead of hard-coding `std::numeric_limits<int32_t>::max() - 1`.

##########
File path: cpp/src/arrow/table_test.cc
##########
@@ -429,6 +429,32 @@ TEST_F(TestTable, CombineChunks) {
   }
 }
 
+TEST_F(TestTable, LARGE_MEMORY_TEST(CombineChunksStringColumn)) {
+  schema_ = schema({field("str", utf8())});
+  arrays_ = {nullptr};
+
+  constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 
1;

Review comment:
       This constant is already declared in `array/builder_binary.h`. We should either
use that declaration or move it to somewhere like `arrow/type_fwd.h`.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to