Re: [PR] GH-43693: [C++][Acero] Support AVX2 swiss join decoding [arrow]

via GitHub Tue, 19 Nov 2024 08:28:18 -0800


zanmato1984 commented on code in PR #43832:
URL: https://github.com/apache/arrow/pull/43832#discussion_r1848527029



##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const 
RowTableImpl& rows, int column_id,
   const uint8_t* null_masks = rows.null_masks();
   __m256i null_bits_per_row =
       _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
   for (int i = 0; i < num_rows / unroll; ++i) {
     __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const 
__m256i*>(row_ids) + i);
     __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
     __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const 
int*>(null_masks),
                                            _mm256_srli_epi32(bit_id, 3), 1);
     __m256i bit_in_word = _mm256_sllv_epi32(
         _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
     __m256i result =
         _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will 
pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
         
_mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 
1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        
_mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 
1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
 
     process_8_values_fn(i * unroll, null_bytes);
   }
 
   return num_rows - (num_rows % unroll);
 }
 
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* 
row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows 
based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 
1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 
1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If the 64-bit is zero, then we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, 
_mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, 
_mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
+  int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+  int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+  // 8 set bits to 1 set bit.
+  uint8_t is_zero = static_cast<uint8_t>(
+      _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+  *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* 
row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows 
based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 
1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 
1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each 
128-bit
+  // lane.

Review Comment:
   The above trick is not feasible for decoding 8 complete bytes. Note the 
compare instruction above:
   ```
     // If a 64-bit value is zero, then we get 64 set bits.
     __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, 
_mm256_setzero_si256());
     __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, 
_mm256_setzero_si256());
   ```
   It only tells us if the byte is zero or not thus only works for 1-bit case 
(`0` or `1`), whereas for 1-byte case, the whole byte content must be preserved.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-43693: [C++][Acero] Support AVX2 swiss join decoding [arrow]

Reply via email to