zanmato1984 commented on code in PR #43389:
URL: https://github.com/apache/arrow/pull/43389#discussion_r1698236259


##########
cpp/src/arrow/compute/row/compare_test.cc:
##########
@@ -166,138 +171,306 @@ TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) {
   }
 }
 
+namespace {
+
+Result<RowTableImpl> MakeRowTableFromExecBatch(const ExecBatch& batch) {
+  RowTableImpl row_table;
+
+  std::vector<KeyColumnMetadata> column_metadatas;
+  RETURN_NOT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas));
+  RowTableMetadata table_metadata;
+  table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t),
+                                          sizeof(uint64_t));
+  RETURN_NOT_OK(row_table.Init(default_memory_pool(), table_metadata));
+  std::vector<uint16_t> row_ids(batch.length);
+  std::iota(row_ids.begin(), row_ids.end(), 0);
+  RowTableEncoder row_encoder;
+  row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t));
+  std::vector<KeyColumnArray> column_arrays;
+  RETURN_NOT_OK(ColumnArraysFromExecBatch(batch, &column_arrays));
+  row_encoder.PrepareEncodeSelected(0, batch.length, column_arrays);
+  RETURN_NOT_OK(row_encoder.EncodeSelected(
+      &row_table, static_cast<uint32_t>(batch.length), row_ids.data()));
+
+  return row_table;
+}
+
+Result<RowTableImpl> RepeatRowTableUntil(const RowTableImpl& seed, int64_t 
num_rows) {
+  RowTableImpl row_table;
+
+  RETURN_NOT_OK(row_table.Init(default_memory_pool(), seed.metadata()));
+  // Append the seed row table repeatedly to grow the row table to big enough.
+  while (row_table.length() < num_rows) {
+    RETURN_NOT_OK(row_table.AppendSelectionFrom(seed,
+                                                
static_cast<uint32_t>(seed.length()),
+                                                /*source_row_ids=*/NULLPTR));
+  }
+
+  return row_table;
+}
+
+void AssertCompareColumnsToRowsAllMatch(const std::vector<KeyColumnArray>& 
columns,
+                                        const RowTableImpl& row_table,
+                                        const std::vector<uint32_t>& 
row_ids_to_compare) {
+  uint32_t num_rows_to_compare = 
static_cast<uint32_t>(row_ids_to_compare.size());
+
+  TempVectorStack stack;
+  ASSERT_OK(
+      stack.Init(default_memory_pool(),
+                 
KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows_to_compare)));
+  LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack};
+
+  {
+    // No selection, output no match row ids.
+    uint32_t num_rows_no_match;
+    std::vector<uint16_t> row_ids_out(num_rows_to_compare);
+    KeyCompare::CompareColumnsToRows(num_rows_to_compare, 
/*sel_left_maybe_null=*/NULLPTR,
+                                     row_ids_to_compare.data(), &ctx, 
&num_rows_no_match,
+                                     row_ids_out.data(), columns, row_table,
+                                     /*are_cols_in_encoding_order=*/true,
+                                     
/*out_match_bitvector_maybe_null=*/NULLPTR);
+    ASSERT_EQ(num_rows_no_match, 0);
+  }
+
+  {
+    // No selection, output match bit vector.
+    std::vector<uint8_t> match_bitvector(BytesForBits(num_rows_to_compare));
+    KeyCompare::CompareColumnsToRows(
+        num_rows_to_compare, /*sel_left_maybe_null=*/NULLPTR, 
row_ids_to_compare.data(),
+        &ctx,
+        /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, 
columns, row_table,
+        /*are_cols_in_encoding_order=*/true, match_bitvector.data());
+    ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_to_compare),
+              num_rows_to_compare);
+  }
+
+  std::vector<uint16_t> selection_left(num_rows_to_compare);
+  std::iota(selection_left.begin(), selection_left.end(), 0);
+
+  {
+    // With selection, output no match row ids.
+    uint32_t num_rows_no_match;
+    std::vector<uint16_t> row_ids_out(num_rows_to_compare);
+    KeyCompare::CompareColumnsToRows(num_rows_to_compare, 
selection_left.data(),
+                                     row_ids_to_compare.data(), &ctx, 
&num_rows_no_match,
+                                     row_ids_out.data(), columns, row_table,
+                                     /*are_cols_in_encoding_order=*/true,
+                                     
/*out_match_bitvector_maybe_null=*/NULLPTR);
+    ASSERT_EQ(num_rows_no_match, 0);
+  }
+
+  {
+    // With selection, output match bit vector.
+    std::vector<uint8_t> match_bitvector(BytesForBits(num_rows_to_compare));
+    KeyCompare::CompareColumnsToRows(
+        num_rows_to_compare, selection_left.data(), row_ids_to_compare.data(), 
&ctx,
+        /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, 
columns, row_table,
+        /*are_cols_in_encoding_order=*/true, match_bitvector.data());
+    ASSERT_EQ(CountSetBits(match_bitvector.data(), 0, num_rows_to_compare),
+              num_rows_to_compare);
+  }
+}
+
+}  // namespace
+
 // Compare columns to rows at offsets over 2GB within a row table.
 // Certain AVX2 instructions may behave unexpectedly causing troubles like 
GH-41813.
-TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsLarge)) {
+TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver2GB)) {
   if constexpr (sizeof(void*) == 4) {
     GTEST_SKIP() << "Test only works on 64-bit platforms";
   }
 
   // The idea of this case is to create a row table using several fixed length 
columns and
   // one var length column (so the row is hence var length and has offset 
buffer), with
   // the overall data size exceeding 2GB. Then compare each row with itself.
-  constexpr int64_t two_gb = 2ll * 1024ll * 1024ll * 1024ll;
+  constexpr int64_t k2GB = 2ll * 1024ll * 1024ll * 1024ll;
   // The compare function requires the row id of the left column to be 
uint16_t, hence the
   // number of rows.
   constexpr int64_t num_rows = std::numeric_limits<uint16_t>::max() + 1;
   const std::vector<std::shared_ptr<DataType>> fixed_length_types{uint64(), 
uint32()};
   // The var length column should be a little smaller than 2GB to workaround 
the capacity
   // limitation in the var length builder.
-  constexpr int32_t var_length = two_gb / num_rows - 1;
+  constexpr int32_t var_length = k2GB / num_rows - 1;
   auto row_size = std::accumulate(fixed_length_types.begin(), 
fixed_length_types.end(),
                                   static_cast<int64_t>(var_length),
                                   [](int64_t acc, const 
std::shared_ptr<DataType>& type) {
                                     return acc + type->byte_width();
                                   });
   // The overall size should be larger than 2GB.
-  ASSERT_GT(row_size * num_rows, two_gb);
-
-  MemoryPool* pool = default_memory_pool();
+  ASSERT_GT(row_size * num_rows, k2GB);
 
-  // The left side columns.
-  std::vector<KeyColumnArray> columns_left;
+  // The left side batch.
   ExecBatch batch_left;
   {
     std::vector<Datum> values;
 
     // Several fixed length arrays containing random content.
     for (const auto& type : fixed_length_types) {
-      ASSERT_OK_AND_ASSIGN(auto value, 
::arrow::gen::Random(type)->Generate(num_rows));
+      ASSERT_OK_AND_ASSIGN(auto value, Random(type)->Generate(num_rows));
       values.push_back(std::move(value));
     }
     // A var length array containing 'X' repeated var_length times.
-    ASSERT_OK_AND_ASSIGN(auto value_var_length,
-                         ::arrow::gen::Constant(
-                             
std::make_shared<BinaryScalar>(std::string(var_length, 'X')))
-                             ->Generate(num_rows));
+    ASSERT_OK_AND_ASSIGN(
+        auto value_var_length,
+        Constant(std::make_shared<BinaryScalar>(std::string(var_length, 'X')))
+            ->Generate(num_rows));
     values.push_back(std::move(value_var_length));
 
     batch_left = ExecBatch(std::move(values), num_rows);
-    ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left));
   }
 
+  // The left side columns.
+  std::vector<KeyColumnArray> columns_left;
+  ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left));
+
   // The right side row table.
-  RowTableImpl row_table_right;
-  {
-    // Encode the row table with the left columns.
-    std::vector<KeyColumnMetadata> column_metadatas;
-    ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas));
-    RowTableMetadata table_metadata;
-    table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t),
-                                            sizeof(uint64_t));
-    ASSERT_OK(row_table_right.Init(pool, table_metadata));
-    std::vector<uint16_t> row_ids(num_rows);
-    std::iota(row_ids.begin(), row_ids.end(), 0);
-    RowTableEncoder row_encoder;
-    row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t));
-    row_encoder.PrepareEncodeSelected(0, num_rows, columns_left);
-    ASSERT_OK(row_encoder.EncodeSelected(
-        &row_table_right, static_cast<uint32_t>(num_rows), row_ids.data()));
-
-    // The row table must contain an offset buffer.
-    ASSERT_NE(row_table_right.offsets(), NULLPTR);
-    // The whole point of this test.
-    ASSERT_GT(row_table_right.offsets()[num_rows - 1], two_gb);
-  }
+  ASSERT_OK_AND_ASSIGN(RowTableImpl row_table_right,
+                       MakeRowTableFromExecBatch(batch_left));
+  // The row table must contain an offset buffer.
+  ASSERT_NE(row_table_right.data(2), NULLPTR);
+  // The whole point of this test.
+  ASSERT_GT(row_table_right.offsets()[num_rows - 1], k2GB);
 
   // The rows to compare.
   std::vector<uint32_t> row_ids_to_compare(num_rows);
   std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), 0);
 
-  TempVectorStack stack;
-  ASSERT_OK(stack.Init(pool, 
KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows)));
-  LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack};
+  AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, 
row_ids_to_compare);
+}
 
-  {
-    // No selection, output no match row ids.
-    uint32_t num_rows_no_match;
-    std::vector<uint16_t> row_ids_out(num_rows);
-    KeyCompare::CompareColumnsToRows(num_rows, /*sel_left_maybe_null=*/NULLPTR,
-                                     row_ids_to_compare.data(), &ctx, 
&num_rows_no_match,
-                                     row_ids_out.data(), columns_left, 
row_table_right,
-                                     /*are_cols_in_encoding_order=*/true,
-                                     
/*out_match_bitvector_maybe_null=*/NULLPTR);
-    ASSERT_EQ(num_rows_no_match, 0);
+// GH-XXXXX: Compare fixed length columns to rows over 4GB within a row table.
+TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) {
+  if constexpr (sizeof(void*) == 4) {
+    GTEST_SKIP() << "Test only works on 64-bit platforms";
   }
 
+  // The idea of this case is to create a row table using one fixed length 
column (so the
+  // row is hence fixed length), with more than 4GB data. Then compare the 
rows located at
+  // over 4GB.
+
+  // A small batch to append to the row table repeatedly to grow the row table 
to big
+  // enough.
+  constexpr int64_t num_rows_batch = std::numeric_limits<uint16_t>::max();
+  constexpr int fixed_length = 256;
+
+  // The size of the row table is one batch larger than 4GB, and we'll compare 
the last
+  // num_rows_batch rows.
+  constexpr int64_t k4GB = 4ll * 1024 * 1024 * 1024;
+  constexpr int64_t num_rows_row_table =
+      (k4GB / (fixed_length * num_rows_batch) + 1) * num_rows_batch;
+  static_assert(num_rows_row_table < std::numeric_limits<uint32_t>::max(),
+                "row table length must be less than uint32 max");
+  static_assert(num_rows_row_table * fixed_length > k4GB,
+                "row table size must be greater than 4GB");
+
+  // The left side batch with num_rows_batch rows.
+  ExecBatch batch_left;
   {
-    // No selection, output match bit vector.
-    std::vector<uint8_t> match_bitvector(BytesForBits(num_rows));
-    KeyCompare::CompareColumnsToRows(
-        num_rows, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), 
&ctx,
-        /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, 
columns_left,
-        row_table_right,
-        /*are_cols_in_encoding_order=*/true, match_bitvector.data());
-    ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, 
num_rows),
-              num_rows);
+    std::vector<Datum> values;
+
+    // A fixed length array containing random values.
+    ASSERT_OK_AND_ASSIGN(
+        auto value_fixed_length,
+        Random(fixed_size_binary(fixed_length))->Generate(num_rows_batch));
+    values.push_back(std::move(value_fixed_length));
+
+    batch_left = ExecBatch(std::move(values), num_rows_batch);
   }
 
-  std::vector<uint16_t> selection_left(num_rows);
-  std::iota(selection_left.begin(), selection_left.end(), 0);
+  // The left side columns with num_rows_batch rows.
+  std::vector<KeyColumnArray> columns_left;
+  ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left));
+
+  // The right side row table with num_rows_row_table rows.
+  ASSERT_OK_AND_ASSIGN(
+      RowTableImpl row_table_right,
+      RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(),
+                          num_rows_row_table));
+  // The row table must not contain a third buffer.
+  ASSERT_EQ(row_table_right.data(2), NULLPTR);
+  // The row data must be greater than 4GB.
+  ASSERT_GT(row_table_right.buffer_size(1), k4GB);
+
+  // The rows to compare: the last num_rows_batch rows in the row table VS. 
the whole
+  // batch.
+  std::vector<uint32_t> row_ids_to_compare(num_rows_batch);
+  std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(),
+            static_cast<uint32_t>(num_rows_row_table - num_rows_batch));
+
+  AssertCompareColumnsToRowsAllMatch(columns_left, row_table_right, 
row_ids_to_compare);

Review Comment:
   Note this assertion will fire for 32-bit offset, because there are 
unreported offset overflow issues in fix-length code path as described in 
#43495, quoted below:
   
   > Even for the fixed-length code path, which doesn’t deal with the offset 
buffer at all and thus is supposed to be less problematic, there are obvious 
offset overflow issues like [1] and [2] (these issues are currently unreported 
but observed in my local experiments).
   > [1] 
https://github.com/apache/arrow/blob/187197c369058f7d1377c1b161c469a9e4542caf/cpp/src/arrow/compute/row/compare_internal.cc#L108
   > [2] 
https://github.com/apache/arrow/blob/187197c369058f7d1377c1b161c469a9e4542caf/cpp/src/arrow/compute/row/compare_internal_avx2.cc#L243-L244



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to