pitrou commented on code in PR #43226:
URL: https://github.com/apache/arrow/pull/43226#discussion_r1679506416


##########
cpp/src/arrow/compute/row/row_test.cc:
##########
@@ -125,5 +125,134 @@ TEST(RowTableMemoryConsumption, Encode) {
   }
 }
 
+// GH-43202: Ensure that when offset overflow happens in encoding the row 
table, an
+// explicit error is raised instead of a silent wrong result.
+TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) {
+  if constexpr (sizeof(void*) == 4) {
+    GTEST_SKIP() << "Test only works on 64-bit platforms";
+  }
+
+  // Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the 
row table.
+  constexpr int64_t num_rows = 8;
+  constexpr int64_t length_per_binary = 512 * 1024 * 1024;
+  constexpr int64_t row_alignment = sizeof(uint32_t);
+  constexpr int64_t var_length_alignment = sizeof(uint32_t);
+
+  MemoryPool* pool = default_memory_pool();
+
+  // The column to encode.
+  std::vector<KeyColumnArray> columns;
+  std::vector<Datum> values;
+  ASSERT_OK_AND_ASSIGN(
+      auto value, ::arrow::gen::Constant(
+                      
std::make_shared<BinaryScalar>(std::string(length_per_binary, 'X')))
+                      ->Generate(1));
+  values.push_back(std::move(value));
+  ExecBatch batch = ExecBatch(std::move(values), 1);
+  ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns));
+
+  // The row table.
+  std::vector<KeyColumnMetadata> column_metadatas;
+  ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas));
+  RowTableMetadata table_metadata;
+  table_metadata.FromColumnMetadataVector(column_metadatas, row_alignment,
+                                          var_length_alignment);
+  RowTableImpl row_table;
+  ASSERT_OK(row_table.Init(pool, table_metadata));
+  RowTableEncoder row_encoder;
+  row_encoder.Init(column_metadatas, row_alignment, var_length_alignment);
+
+  // The rows to encode.
+  std::vector<uint16_t> row_ids(num_rows, 0);
+
+  // Encoding 7 rows should be fine.
+  {
+    row_encoder.PrepareEncodeSelected(0, num_rows - 1, columns);
+    ASSERT_OK(row_encoder.EncodeSelected(&row_table, 
static_cast<uint32_t>(num_rows - 1),
+                                         row_ids.data()));

Review Comment:
   Should we drop/clear the `row_table` after this to release a bit of memory?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to