zanmato1984 commented on code in PR #43226:
URL: https://github.com/apache/arrow/pull/43226#discussion_r1679535970


##########
cpp/src/arrow/compute/row/row_test.cc:
##########
@@ -125,5 +125,134 @@ TEST(RowTableMemoryConsumption, Encode) {
   }
 }
 
+// GH-43202: Ensure that when offset overflow happens in encoding the row 
table, an
+// explicit error is raised instead of a silent wrong result.
+TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) {
+  if constexpr (sizeof(void*) == 4) {
+    GTEST_SKIP() << "Test only works on 64-bit platforms";
+  }
+
+  // Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the 
row table.
+  constexpr int64_t num_rows = 8;
+  constexpr int64_t length_per_binary = 512 * 1024 * 1024;
+  constexpr int64_t row_alignment = sizeof(uint32_t);
+  constexpr int64_t var_length_alignment = sizeof(uint32_t);
+
+  MemoryPool* pool = default_memory_pool();
+
+  // The column to encode.
+  std::vector<KeyColumnArray> columns;
+  std::vector<Datum> values;
+  ASSERT_OK_AND_ASSIGN(
+      auto value, ::arrow::gen::Constant(
+                      
std::make_shared<BinaryScalar>(std::string(length_per_binary, 'X')))
+                      ->Generate(1));
+  values.push_back(std::move(value));
+  ExecBatch batch = ExecBatch(std::move(values), 1);
+  ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns));
+
+  // The row table.
+  std::vector<KeyColumnMetadata> column_metadatas;
+  ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas));
+  RowTableMetadata table_metadata;
+  table_metadata.FromColumnMetadataVector(column_metadatas, row_alignment,
+                                          var_length_alignment);
+  RowTableImpl row_table;
+  ASSERT_OK(row_table.Init(pool, table_metadata));
+  RowTableEncoder row_encoder;
+  row_encoder.Init(column_metadatas, row_alignment, var_length_alignment);
+
+  // The rows to encode.
+  std::vector<uint16_t> row_ids(num_rows, 0);
+
+  // Encoding 7 rows should be fine.
+  {
+    row_encoder.PrepareEncodeSelected(0, num_rows - 1, columns);
+    ASSERT_OK(row_encoder.EncodeSelected(&row_table, 
static_cast<uint32_t>(num_rows - 1),
+                                         row_ids.data()));

Review Comment:
   Thanks for looking!
   
   Every time an encoder encodes into a row table, the row table gets 
effectively cleared. So the memory is only hold til the next `EncodeSelected` 
call. Clearing it here doesn't make much difference I guess(?) - the code in 
between is relatively trivial.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to