pitrou commented on code in PR #43389:
URL: https://github.com/apache/arrow/pull/43389#discussion_r1708911819


##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -473,17 +473,10 @@ Status RowArrayMerge::PrepareForMerge(RowArray* target,
     (*first_target_row_id)[sources.size()] = num_rows;
   }
 
-  if (num_bytes > std::numeric_limits<uint32_t>::max()) {
-    return Status::Invalid(
-        "There are more than 2^32 bytes of key data.  Acero cannot "
-        "process a join of this magnitude");
-  }
-
   // Allocate target memory
   //
   target->rows_.Clean();
-  RETURN_NOT_OK(target->rows_.AppendEmpty(static_cast<uint32_t>(num_rows),
-                                          static_cast<uint32_t>(num_bytes)));
+  RETURN_NOT_OK(target->rows_.AppendEmpty(static_cast<uint32_t>(num_rows), 
num_bytes));

Review Comment:
   Is the remaining cast still required here?



##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -593,7 +586,8 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, 
const RowTableImpl&
       int64_t source_row_id = source_rows_permutation[i];
       const uint64_t* source_row_ptr = reinterpret_cast<const uint64_t*>(
           source.data(2) + source_offsets[source_row_id]);
-      uint32_t length = source_offsets[source_row_id + 1] - 
source_offsets[source_row_id];
+      int64_t length = source_offsets[source_row_id + 1] - 
source_offsets[source_row_id];
+      DCHECK_LE(length, std::numeric_limits<uint32_t>::max());

Review Comment:
   Is this really an internal invariant or does it depend on the input data? 
It's not obvious from reading the code.



##########
cpp/src/arrow/compute/row/row_test.cc:
##########
@@ -244,23 +228,15 @@ TEST(RowTableOffsetOverflow, 
LARGE_MEMORY_TEST(AppendFrom)) {
   RowTableImpl row_table;
   ASSERT_OK(row_table.Init(pool, table_metadata));
 
-  // Appending the seed 7 times should be fine.
-  for (int i = 0; i < num_rows - 1; ++i) {
+  // Append seed num_rows times.
+  for (int i = 0; i < num_rows; ++i) {
     ASSERT_OK(row_table.AppendSelectionFrom(row_table_seed, num_rows_seed,
                                             /*source_row_ids=*/NULLPTR));
   }
 
-  // Appending the seed the 8-th time should overflow.
-  int64_t length_per_row = table_metadata.fixed_length + length_per_binary;
-  std::stringstream expected_error_message;
-  expected_error_message
-      << "Invalid: Offset overflow detected in 
RowTableImpl::AppendSelectionFrom for row "
-      << num_rows - 1 << " of length " << length_per_row
-      << " bytes, current length in total is " << length_per_row * (num_rows - 
1)
-      << " bytes";
-  ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error_message.str(),
-                             row_table.AppendSelectionFrom(row_table_seed, 
num_rows_seed,
-                                                           
/*source_row_ids=*/NULLPTR));
+  ASSERT_GT(row_table.offsets()[num_rows - 1], 
std::numeric_limits<uint32_t>::max());
+  ASSERT_GT(row_table.offsets()[num_rows],
+            row_table.offsets()[num_rows - 1] + length_per_binary);

Review Comment:
   Is it possible to instead `ASSERT_EQ` the exact offsets, or are they 
non-deterministic here?



##########
cpp/src/arrow/compute/row/compare_internal_avx2.cc:
##########
@@ -240,12 +206,20 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2(
             _mm256_loadu_si256(reinterpret_cast<const 
__m256i*>(left_to_right_map) + i);
       }
 
-      __m256i offset_right =
-          _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(fixed_length));
-      offset_right = _mm256_add_epi32(offset_right, 
_mm256_set1_epi32(offset_within_row));
-
-      reinterpret_cast<uint64_t*>(match_bytevector)[i] =
-          compare8_fn(rows_left, rows_right, i * unroll, irow_left, 
offset_right);
+      __m256i irow_right_lo = 
_mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right));

Review Comment:
   May you add a short common to explain those non-trivial SIMD code sequences?



##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -45,48 +45,62 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, 
int column_id, int nu
   if (!is_fixed_length_column) {
     int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id);
     const uint8_t* row_ptr_base = rows.data(2);
-    const uint32_t* row_offsets = rows.offsets();
+    const RowTableImpl::offset_type* row_offsets = rows.offsets();

Review Comment:
   Hmm, do you mean the AVX2 speedups are actually not wired anywhere?



##########
cpp/src/arrow/testing/random.h:
##########
@@ -433,12 +433,18 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
   ///
   /// \param[in] size the size of the array to generate
   /// \param[in] byte_width the byte width of fixed-size binary items
+  /// \param[in] min_byte the lower bound of each byte in the binary 
determined by the
+  ///            uniform distribution
+  /// \param[in] max_byte the upper bound of each byte in the binary 
determined by the
+  ///            uniform distribution
   /// \param[in] null_probability the probability of a value being null
   /// \param[in] alignment alignment for memory allocations (in bytes)
   /// \param[in] memory_pool memory pool to allocate memory from
   ///
   /// \return a generated Array
   std::shared_ptr<Array> FixedSizeBinary(int64_t size, int32_t byte_width,
+                                         uint8_t min_byte = 
static_cast<uint8_t>('A'),
+                                         uint8_t max_byte = 
static_cast<uint8_t>('z'),

Review Comment:
   Is it possible to still expose the old signature as well for convenience? In 
many cases there's no need to override `min_byte` and `max_byte`, while still 
wanting a non-zero null probability.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to