pitrou commented on code in PR #45336:
URL: https://github.com/apache/arrow/pull/45336#discussion_r1926666179
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -199,29 +199,44 @@ class ARROW_EXPORT RowTableImpl {
const RowTableMetadata& metadata() const { return metadata_; }
/// \brief The number of rows stored in the table
int64_t length() const { return num_rows_; }
- // Accessors into the table's buffers
- const uint8_t* data(int i) const {
- ARROW_DCHECK(i >= 0 && i < kMaxBuffers);
- if (ARROW_PREDICT_TRUE(buffers_[i])) {
- return buffers_[i]->data();
- }
- return NULLPTR;
+
+ inline const uint8_t* null_masks(uint32_t row_id) const {
+ return data(0) + static_cast<int64_t>(row_id) *
metadata_.null_masks_bytes_per_row;
}
- uint8_t* mutable_data(int i) {
- ARROW_DCHECK(i >= 0 && i < kMaxBuffers);
- if (ARROW_PREDICT_TRUE(buffers_[i])) {
- return buffers_[i]->mutable_data();
- }
- return NULLPTR;
+ inline uint8_t* null_masks(uint32_t row_id) {
Review Comment:
Should it be called `mutable_null_masks`, for consistency with other helper
methods below?
##########
cpp/src/arrow/compute/row/compare_internal_avx2.cc:
##########
@@ -64,14 +66,21 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2(
irow_right =
_mm256_loadu_si256(reinterpret_cast<const
__m256i*>(left_to_right_map) + i);
}
- __m256i bitid =
- _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(null_mask_num_bytes
* 8));
- bitid = _mm256_add_epi32(bitid, _mm256_set1_epi32(null_bit_id));
- __m256i right =
- _mm256_i32gather_epi32((const int*)null_masks,
_mm256_srli_epi32(bitid, 3), 1);
- right = _mm256_and_si256(
- _mm256_set1_epi32(1),
- _mm256_srlv_epi32(right, _mm256_and_si256(bitid,
_mm256_set1_epi32(7))));
+ __m256i irow_right_lo =
_mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right));
+ __m256i irow_right_hi =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1));
+ __m256i bit_id_lo =
+ _mm256_mul_epi32(irow_right_lo,
_mm256_set1_epi64x(null_mask_num_bytes * 8));
+ __m256i bit_id_hi =
+ _mm256_mul_epi32(irow_right_hi,
_mm256_set1_epi64x(null_mask_num_bytes * 8));
+ bit_id_lo = _mm256_add_epi64(bit_id_lo, pos_after_encoding);
+ bit_id_hi = _mm256_add_epi64(bit_id_hi, pos_after_encoding);
+ __m128i right_lo = _mm256_i64gather_epi32(reinterpret_cast<const
int*>(null_masks),
+ _mm256_srli_epi64(bit_id_lo,
3), 1);
+ __m128i right_hi = _mm256_i64gather_epi32(reinterpret_cast<const
int*>(null_masks),
+ _mm256_srli_epi64(bit_id_hi,
3), 1);
+ __m256i right = _mm256_set_m128i(right_hi, right_lo);
+ right = _mm256_and_si256(right, bit_in_right);
Review Comment:
Also perhaps a more general helper:
```c++
// Get null bits at `null_bit_id` as a vector of 32-bit ints
__m256i GetRowNullBitsInt32(const RowTableImpl& rows, uint32_t null_bit_id,
                            __m256i row_index32);
```
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -199,29 +199,44 @@ class ARROW_EXPORT RowTableImpl {
const RowTableMetadata& metadata() const { return metadata_; }
/// \brief The number of rows stored in the table
int64_t length() const { return num_rows_; }
- // Accessors into the table's buffers
- const uint8_t* data(int i) const {
- ARROW_DCHECK(i >= 0 && i < kMaxBuffers);
- if (ARROW_PREDICT_TRUE(buffers_[i])) {
- return buffers_[i]->data();
- }
- return NULLPTR;
+
+ inline const uint8_t* null_masks(uint32_t row_id) const {
Review Comment:
Note that `inline` is implied when the method definition is inside the class
declaration, there's no need to put it explicitly.
##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -477,14 +477,15 @@ void RowArrayMerge::CopyFixedLength(RowTableImpl* target,
const RowTableImpl& so
const int64_t* source_rows_permutation) {
int64_t num_source_rows = source.length();
- int64_t fixed_length = target->metadata().fixed_length;
+ uint32_t fixed_length = target->metadata().fixed_length;
// Permutation of source rows is optional. Without permutation all that is
// needed is memcpy.
//
if (!source_rows_permutation) {
- memcpy(target->mutable_data(1) + fixed_length * first_target_row_id,
source.data(1),
- fixed_length * num_source_rows);
+ DCHECK_LE(first_target_row_id, std::numeric_limits<uint32_t>::max());
Review Comment:
Were these size constraints already implied but not tested for?
##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -477,14 +477,15 @@ void RowArrayMerge::CopyFixedLength(RowTableImpl* target,
const RowTableImpl& so
const int64_t* source_rows_permutation) {
int64_t num_source_rows = source.length();
- int64_t fixed_length = target->metadata().fixed_length;
+ uint32_t fixed_length = target->metadata().fixed_length;
// Permutation of source rows is optional. Without permutation all that is
// needed is memcpy.
//
if (!source_rows_permutation) {
- memcpy(target->mutable_data(1) + fixed_length * first_target_row_id,
source.data(1),
- fixed_length * num_source_rows);
+ DCHECK_LE(first_target_row_id, std::numeric_limits<uint32_t>::max());
+
memcpy(target->mutable_fixed_length_rows(static_cast<uint32_t>(first_target_row_id)),
+ source.fixed_length_rows(/*row_id=*/0), fixed_length *
num_source_rows);
} else {
// Row length must be a multiple of 64-bits due to enforced alignment.
Review Comment:
Unrelated, but that's an interesting piece of info. Doesn't it blow up
memory use if there is a small number of very small columns?
##########
cpp/src/arrow/compute/row/encode_internal.cc:
##########
@@ -260,44 +260,40 @@ void EncoderInteger::Decode(uint32_t start_row, uint32_t
num_rows,
col_prep.metadata().fixed_length == rows.metadata().fixed_length) {
DCHECK_EQ(offset_within_row, 0);
uint32_t row_size = rows.metadata().fixed_length;
- memcpy(col_prep.mutable_data(1), rows.data(1) + start_row * row_size,
- num_rows * row_size);
+ memcpy(col_prep.mutable_data(1), rows.fixed_length_rows(start_row),
+ static_cast<int64_t>(num_rows) * row_size);
} else if (rows.metadata().is_fixed_length) {
- uint32_t row_size = rows.metadata().fixed_length;
- const uint8_t* row_base =
- rows.data(1) + static_cast<RowTableImpl::offset_type>(start_row) *
row_size;
- row_base += offset_within_row;
uint8_t* col_base = col_prep.mutable_data(1);
switch (col_prep.metadata().fixed_length) {
case 1:
for (uint32_t i = 0; i < num_rows; ++i) {
- col_base[i] = row_base[i * row_size];
+ col_base[i] = *rows.fixed_length_rows(start_row + i);
Review Comment:
Hmm... we're not adding `offset_within_row` anymore? Is it deliberate?
##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -494,10 +495,13 @@ void RowArrayMerge::CopyFixedLength(RowTableImpl* target,
const RowTableImpl& so
int64_t num_words_per_row = fixed_length / sizeof(uint64_t);
for (int64_t i = 0; i < num_source_rows; ++i) {
int64_t source_row_id = source_rows_permutation[i];
+ DCHECK_LE(source_row_id, std::numeric_limits<uint32_t>::max());
Review Comment:
If that's always the case then are we wasting memory and CPU cache by having
a 64-bit permutation array?
##########
cpp/src/arrow/compute/row/compare_internal_avx2.cc:
##########
@@ -64,14 +66,21 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2(
irow_right =
_mm256_loadu_si256(reinterpret_cast<const
__m256i*>(left_to_right_map) + i);
}
- __m256i bitid =
- _mm256_mullo_epi32(irow_right, _mm256_set1_epi32(null_mask_num_bytes
* 8));
- bitid = _mm256_add_epi32(bitid, _mm256_set1_epi32(null_bit_id));
- __m256i right =
- _mm256_i32gather_epi32((const int*)null_masks,
_mm256_srli_epi32(bitid, 3), 1);
- right = _mm256_and_si256(
- _mm256_set1_epi32(1),
- _mm256_srlv_epi32(right, _mm256_and_si256(bitid,
_mm256_set1_epi32(7))));
+ __m256i irow_right_lo =
_mm256_cvtepi32_epi64(_mm256_castsi256_si128(irow_right));
+ __m256i irow_right_hi =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(irow_right, 1));
+ __m256i bit_id_lo =
+ _mm256_mul_epi32(irow_right_lo,
_mm256_set1_epi64x(null_mask_num_bytes * 8));
+ __m256i bit_id_hi =
+ _mm256_mul_epi32(irow_right_hi,
_mm256_set1_epi64x(null_mask_num_bytes * 8));
+ bit_id_lo = _mm256_add_epi64(bit_id_lo, pos_after_encoding);
+ bit_id_hi = _mm256_add_epi64(bit_id_hi, pos_after_encoding);
Review Comment:
I seem to have already seen this kind of code in other PRs, perhaps we can
add helpers to make it more readable (perhaps in a `avx2_internal.h` header?):
```c++
inline std::pair<__m256i, __m256i> WidenInt32to64(__m256i x) {
  __m256i x_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(x));
  __m256i x_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(x, 1));
  return {x_lo, x_hi};
}
// Compute `x * factor + addend` in the 64-bit domain
inline __m256i MulAddInt64(__m256i x, __m256i factor, __m256i addend) {
  return _mm256_add_epi64(addend, _mm256_mul_epi32(x, factor));
}
inline __m256i MulAddInt64(__m256i x, int64_t factor, int64_t addend) {
  return MulAddInt64(x, _mm256_set1_epi64x(factor), _mm256_set1_epi64x(addend));
}
```
then the code above could look like:
```c++
auto [irow_right_lo, irow_right_hi] = WidenInt32to64(irow_right);
auto bit_id_lo =
    MulAddInt64(irow_right_lo, null_mask_num_bytes * 8, pos_after_encoding);
auto bit_id_hi =
    MulAddInt64(irow_right_hi, null_mask_num_bytes * 8, pos_after_encoding);
```
(and in an ideal future, this would be migrated to xsimd :))
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]