zanmato1984 commented on code in PR #43832:
URL: https://github.com/apache/arrow/pull/43832#discussion_r1848638558


##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
   const uint8_t* null_masks = rows.null_masks();
   __m256i null_bits_per_row =
       _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
   for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
     __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                            _mm256_srli_epi32(bit_id, 3), 1);
     __m256i bit_in_word = _mm256_sllv_epi32(
         _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
     __m256i result =
         _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
         _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
 
     process_8_values_fn(i * unroll, null_bytes);
   }
 
   return num_rows - (num_rows % unroll);
 }
 
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);

Review Comment:
   Done.
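   
   As a quick illustration of the pitfall the new `NB` comment guards against (a minimal scalar sketch, not part of the PR's diff): `_mm256_movemask_epi8` returns a signed `int`, so widening it directly to `uint64_t` sign-extends and sets all of the upper 32 bits whenever bit 31 of the mask is set.
   
   ```cpp
   #include <cstdint>
   #include <cstdio>
   
   int main() {
     // A movemask result whose top bit is set, e.g. the highest byte matched.
     int mask = static_cast<int>(0x80000000u);
     uint64_t bad = static_cast<uint64_t>(mask);   // sign-extends: 0xFFFFFFFF80000000
     uint64_t good = static_cast<uint32_t>(mask);  // zero-extends: 0x0000000080000000
     std::printf("%016llx vs %016llx\n", static_cast<unsigned long long>(bad),
                 static_cast<unsigned long long>(good));
     return 0;
   }
   ```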



##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
   const uint8_t* null_masks = rows.null_masks();
   __m256i null_bits_per_row =
       _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
   for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
     __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                            _mm256_srli_epi32(bit_id, 3), 1);
     __m256i bit_in_word = _mm256_sllv_epi32(
         _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
     __m256i result =
         _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
         _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
 
     process_8_values_fn(i * unroll, null_bytes);
   }
 
   return num_rows - (num_rows % unroll);
 }
 
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If the 64-bit is zero, then we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
+  int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+  int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+  // 8 set bits to 1 set bit.
+  uint8_t is_zero = static_cast<uint8_t>(
+      _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+  *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL;
+  const __m256i shuffle_const =
+      _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Get the lower 32-bits (4 8-bit rows) from each 128-bit lane.
+  // NB: Be careful about sign-extension when casting the return value of
+  // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the
+  // higher bits of the following OR.
+  uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0));
+  uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32;
+  *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi;
+}
+
+inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 16 bits of each 32-bit rows to the lower 64 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL;
+  const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1,
+                                                   kByteSequence_0_1_4_5_8_9_12_13, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Swap the second and the third 64-bit lane.
+  row = _mm256_permute4x64_epi64(row, 0xd8);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row));
+}
+
+inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row);
+}
+
+inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  auto row_ptr_base_i64 =
+      reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_ptr_base);
+  // Gather the lower/higher 4 64-bit rows based on the lower/higher 4 64-bit row offsets.
+  __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1);
+  __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi);
+}
+
+inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) {
+  // Copy 32 bytes at a time.

Review Comment:
   Yes. See the code allocating the underlying buffers for `RowTableImpl`:
   
https://github.com/apache/arrow/blob/1302889c01bfbd04dc2dca4995078d2eca9311cd/cpp/src/arrow/compute/row/row_internal.cc#L232-L246
   
   and 
   
   
https://github.com/apache/arrow/blob/1302889c01bfbd04dc2dca4995078d2eca9311cd/cpp/src/arrow/compute/row/row_internal.h#L263
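   
   In other words, the trailing 32-byte copy may read and write past `num_bytes`, and that is only safe because those buffers are allocated with padding. A hedged scalar sketch of the idea (function name hypothetical, not the PR's code):
   
   ```cpp
   #include <cstdint>
   #include <cstring>
   
   // Copies num_bytes rounded up to whole 32-byte chunks. The final chunk can
   // touch up to 31 bytes past num_bytes, so both buffers must carry at least
   // that much padding, as the linked RowTableImpl allocation code provides.
   void CopyPadded(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) {
     constexpr uint32_t kChunk = 32;  // one __m256i worth of bytes
     for (uint32_t i = 0; i < num_bytes; i += kChunk) {
       std::memcpy(output + i, row_ptr + i, kChunk);
     }
   }
   ```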


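   One more illustrative note on `Decode8FixedLength0_avx2` above: the hedged snippet below (mask values made up for the example, not from the PR) shows how the second movemask compresses the 8-bits-per-row masks down to 1 bit per row, since `_mm_movemask_epi8` keeps only the most significant bit of each byte.
   
   ```cpp
   #include <immintrin.h>
   #include <cstdint>
   #include <cstdio>
   
   int main() {
     // Example 32-bit masks as produced by _mm256_movemask_epi8 over 4x 64-bit
     // compare results: each zero row contributes 8 consecutive set bits.
     int is_zero_lo_8 = 0x00FF00FF;  // rows 0 and 2 of the low half were zero
     int is_zero_hi_8 = 0xFF0000FF;  // rows 4 and 7 of the high half were zero
     // Packing both masks into one vector and taking movemask again keeps the
     // MSB of each byte, i.e. exactly one bit per original row.
     uint8_t is_zero = static_cast<uint8_t>(
         _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
     std::printf("0x%02x\n", is_zero);  // prints 0x95 = 0b10010101 (rows 0,2,4,7)
     return 0;
   }
   ```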
