michalursa commented on a change in pull request #12067:
URL: https://github.com/apache/arrow/pull/12067#discussion_r821444975
##########
File path: cpp/src/arrow/compute/exec/key_hash_avx2.cc
##########
@@ -18,248 +18,302 @@
#include <immintrin.h>
#include "arrow/compute/exec/key_hash.h"
+#include "arrow/util/bit_util.h"
namespace arrow {
namespace compute {
#if defined(ARROW_HAVE_AVX2)
-void Hashing::avalanche_avx2(uint32_t num_keys, uint32_t* hashes) {
+inline __m256i Hashing32::avalanche_avx2(__m256i hash) {
+ hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15));
+ hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2));
+ hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 13));
+ hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_3));
+ hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 16));
+ return hash;
+}
+
+inline __m256i Hashing32::combine_hashes_avx2(__m256i previous_hash, __m256i hash) {
+ // previous_hash ^= acc + kCombineConst + (previous_hash << 6) +
+ // (previous_hash >> 2);
+ //
+ __m256i x = _mm256_add_epi32(_mm256_slli_epi32(previous_hash, 6),
+ _mm256_srli_epi32(previous_hash, 2));
+ __m256i y = _mm256_add_epi32(hash, _mm256_set1_epi32(kCombineConst));
+ __m256i new_hash = _mm256_xor_si256(previous_hash, _mm256_add_epi32(x, y));
+ return new_hash;
+}
+
+template <bool T_COMBINE_HASHES>
+void Hashing32::avalanche_all_avx2(uint32_t num_rows_to_process, uint32_t* hashes,
+ const uint32_t* hashes_temp_for_combine) {
constexpr int unroll = 8;
- ARROW_DCHECK(num_keys % unroll == 0);
- for (uint32_t i = 0; i < num_keys / unroll; ++i) {
- __m256i hash = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(hashes) + i);
- hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15));
- hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2));
- hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 13));
- hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_3));
- hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 16));
- _mm256_storeu_si256((reinterpret_cast<__m256i*>(hashes)) + i, hash);
+ for (uint32_t i = 0; i < num_rows_to_process / unroll; ++i) {
+ __m256i acc;
+ if (T_COMBINE_HASHES) {
+ acc = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(hashes_temp_for_combine) +
+ i);
+ } else {
+ acc = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(hashes) + i);
+ }
+ acc = avalanche_avx2(acc);
+ if (T_COMBINE_HASHES) {
+ __m256i previous_hash =
+ _mm256_loadu_si256(reinterpret_cast<const __m256i*>(hashes) + i);
+ acc = combine_hashes_avx2(previous_hash, acc);
+ }
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(hashes) + i, acc);
}
+ for (uint32_t i = num_rows_to_process - (num_rows_to_process % unroll);
+ i < num_rows_to_process; ++i) {
+ if (T_COMBINE_HASHES) {
+ hashes[i] = combine_hashes(hashes[i], avalanche(hashes_temp_for_combine[i]));
+ } else {
+ hashes[i] = avalanche(hashes[i]);
+ }
+ }
+}
+
+inline __m256i Hashing32::round_avx2(__m256i acc, __m256i input) {
+ acc = _mm256_add_epi32(acc, _mm256_mullo_epi32(input, _mm256_set1_epi32(PRIME32_2)));
+ acc = _mm256_or_si256(_mm256_slli_epi32(acc, 13), _mm256_srli_epi32(acc, 32 - 13));
+ acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_1));
+ return acc;
}
-inline uint64_t Hashing::combine_accumulators_avx2(__m256i acc) {
- acc = _mm256_or_si256(
- _mm256_sllv_epi32(acc, _mm256_setr_epi32(1, 7, 12, 18, 1, 7, 12, 18)),
- _mm256_srlv_epi32(acc, _mm256_setr_epi32(32 - 1, 32 - 7, 32 - 12, 32 - 18, 32 - 1,
- 32 - 7, 32 - 12, 32 - 18)));
+inline uint64_t Hashing32::combine_accumulators_avx2(__m256i acc) {
+ // Each 128-bit lane of input represents a set of 4 accumulators related to
+ // a single hash (we process here two hashes together).
+ //
+ __m256i rotate_const_left = _mm256_setr_epi32(1, 7, 12, 18, 1, 7, 12, 18);
+ __m256i rotate_const_right = _mm256_setr_epi32(32 - 1, 32 - 7, 32 - 12, 32 - 18, 32 - 1,
+ 32 - 7, 32 - 12, 32 - 18);
+
+ acc = _mm256_or_si256(_mm256_sllv_epi32(acc, rotate_const_left),
+ _mm256_srlv_epi32(acc, rotate_const_right));
acc = _mm256_add_epi32(acc, _mm256_shuffle_epi32(acc, 0xee)); // 0b11101110
acc = _mm256_add_epi32(acc, _mm256_srli_epi64(acc, 32));
acc = _mm256_permutevar8x32_epi32(acc, _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0));
uint64_t result = _mm256_extract_epi64(acc, 0);
return result;
}
-void Hashing::helper_stripes_avx2(uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash) {
+inline __m256i Hashing32::stripe_mask_avx2(int i, int j) {
+ // Return two 16 byte masks, where the first i/j bytes are 0xff and the
+ // remaining ones are 0x00
+ //
+ ARROW_DCHECK(i >= 0 && i <= kStripeSize && j >= 0 && j <= kStripeSize);
+ return _mm256_cmpgt_epi8(
+ _mm256_blend_epi32(_mm256_set1_epi8(i), _mm256_set1_epi8(j), 0xf0),
+ _mm256_setr_epi64x(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL));
+}
+
+template <bool two_equal_lengths>
+inline __m256i Hashing32::process_stripes_avx2(int64_t num_stripes_A,
+ int64_t num_stripes_B,
+ __m256i mask_last_stripe,
+ const uint8_t* keys, int64_t offset_A,
+ int64_t offset_B) {
+ ARROW_DCHECK(num_stripes_A > 0 && num_stripes_B > 0);
+
+ __m256i acc = _mm256_setr_epi32(
+ static_cast<uint32_t>((static_cast<uint64_t>(PRIME32_1) + PRIME32_2) & 0xffffffff),
+ PRIME32_2, 0, static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1)),
+ static_cast<uint32_t>((static_cast<uint64_t>(PRIME32_1) + PRIME32_2) & 0xffffffff),
+ PRIME32_2, 0, static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1)));
+
+ // Constant for permutexvar8x32 instruction that conditionally swaps two
+ // 128-bit lanes if and only if num_stripes_B > num_stripes_A.
+ //
+ __m256i swap_permute = _mm256_setzero_si256();
+ int64_t offset_shorter, offset_longer;
+ int64_t num_stripes_shorter, num_stripes_longer;
+
+ if (!two_equal_lengths) {
+ int64_t swap_mask = num_stripes_B > num_stripes_A ? ~0LL : 0LL;
+ swap_permute = _mm256_xor_si256(_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),
+ _mm256_set1_epi32(swap_mask & 4));
+ offset_shorter = (offset_A & swap_mask) | (offset_B & ~swap_mask);
+ offset_longer = (offset_A & ~swap_mask) | (offset_B & swap_mask);
+ num_stripes_shorter = (num_stripes_A & swap_mask) | (num_stripes_B & ~swap_mask);
+ num_stripes_longer = (num_stripes_A & ~swap_mask) | (num_stripes_B & swap_mask);
+ } else {
+ ARROW_DCHECK(num_stripes_A == num_stripes_B);
+ offset_longer = offset_A;
+ offset_shorter = offset_B;
+ num_stripes_longer = num_stripes_A;
+ num_stripes_shorter = num_stripes_A;
+ }
+
+ int64_t istripe = 0;
+ for (; istripe + 1 < num_stripes_shorter; ++istripe) {
+ __m256i stripe = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ reinterpret_cast<const __m128i*>(keys + offset_longer) + istripe)),
+ _mm_loadu_si128(reinterpret_cast<const __m128i*>(keys + offset_shorter) +
+ istripe),
+ 1);
+ acc = round_avx2(acc, stripe);
+ }
+ __m256i stripe = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ reinterpret_cast<const __m128i*>(keys + offset_longer) + istripe)),
+ _mm_loadu_si128(reinterpret_cast<const __m128i*>(keys + offset_shorter) + istripe),
+ 1);
+ if (!two_equal_lengths) {
+ __m256i acc_copy = acc;
+ for (; istripe + 1 < num_stripes_longer; ++istripe) {
+ acc = round_avx2(acc, stripe);
+ stripe = _mm256_inserti128_si256(
+ stripe,
+ _mm_loadu_si128(reinterpret_cast<const __m128i*>(keys + offset_longer) +
+ istripe + 1),
+ 0);
+ }
+ acc = _mm256_blend_epi32(acc, acc_copy, 0xf0);
+ mask_last_stripe = _mm256_permutevar8x32_epi32(mask_last_stripe, swap_permute);
+ }
+ stripe = _mm256_and_si256(stripe, mask_last_stripe);
+ acc = round_avx2(acc, stripe);
+ if (!two_equal_lengths) {
+ acc = _mm256_permutevar8x32_epi32(acc, swap_permute);
+ }
+ return acc;
+}
+
+template <bool combine_hashes>
+uint32_t Hashing32::hash_fixedlen_imp_avx2(uint32_t num_rows, uint64_t length,
+ const uint8_t* keys, uint32_t* hashes,
+ uint32_t* hashes_temp_for_combine) {
constexpr int unroll = 2;
- ARROW_DCHECK(num_keys % unroll == 0);
-
- constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL;
- constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL;
-
- const __m256i mask_last_stripe =
- (key_length % 16) <= 8
- ? _mm256_set1_epi8(static_cast<char>(0xffU))
- : _mm256_cmpgt_epi8(_mm256_set1_epi8(key_length % 16),
- _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15,
- kByteSequence0To7, kByteSequence8To15));
-
- // If length modulo stripe length is less than or equal 8, round down to the nearest 16B
- // boundary (8B ending will be processed in a separate function), otherwise round up.
- const uint32_t num_stripes = (key_length + 7) / 16;
- for (uint32_t i = 0; i < num_keys / unroll; ++i) {
- __m256i acc = _mm256_setr_epi32(
- static_cast<uint32_t>((static_cast<uint64_t>(PRIME32_1) + PRIME32_2) &
- 0xffffffff),
- PRIME32_2, 0, static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1)),
- static_cast<uint32_t>((static_cast<uint64_t>(PRIME32_1) + PRIME32_2) &
- 0xffffffff),
- PRIME32_2, 0, static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1)));
- auto key0 = reinterpret_cast<const __m128i*>(keys + key_length * 2 * i);
- auto key1 = reinterpret_cast<const __m128i*>(keys + key_length * 2 * i + key_length);
- for (uint32_t stripe = 0; stripe < num_stripes - 1; ++stripe) {
- auto key_stripe =
- _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128(key0 + stripe)),
- _mm_loadu_si128(key1 + stripe), 1);
- acc = _mm256_add_epi32(
- acc, _mm256_mullo_epi32(key_stripe, _mm256_set1_epi32(PRIME32_2)));
- acc = _mm256_or_si256(_mm256_slli_epi32(acc, 13), _mm256_srli_epi32(acc, 32 - 13));
- acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_1));
+
+ // Do not process rows that could read past the end of the buffer using 16
+ // byte loads. Round down number of rows to process to multiple of 2.
+ //
+ uint64_t num_rows_to_skip = bit_util::CeilDiv(length, kStripeSize);
+ uint32_t num_rows_to_process =
+ (num_rows_to_skip > num_rows)
+ ? 0
+ : (num_rows - static_cast<uint32_t>(num_rows_to_skip));
+ num_rows_to_process -= (num_rows_to_process % unroll);
+
+ uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize);
+ int num_tail_bytes = ((length - 1) & (kStripeSize - 1)) + 1;
+ __m256i mask_last_stripe = stripe_mask_avx2(num_tail_bytes, num_tail_bytes);
+
+ for (uint32_t i = 0; i < num_rows_to_process / unroll; ++i) {
+ __m256i acc = process_stripes_avx2</*two_equal_lengths=*/true>(
+ num_stripes, num_stripes, mask_last_stripe, keys,
+ static_cast<int64_t>(i) * unroll * length,
+ static_cast<int64_t>(i) * unroll * length + length);
+
+ if (combine_hashes) {
+ reinterpret_cast<uint64_t*>(hashes_temp_for_combine)[i] =
+ combine_accumulators_avx2(acc);
+ } else {
+ reinterpret_cast<uint64_t*>(hashes)[i] = combine_accumulators_avx2(acc);
}
- auto key_stripe = _mm256_inserti128_si256(
- _mm256_castsi128_si256(_mm_loadu_si128(key0 + num_stripes - 1)),
- _mm_loadu_si128(key1 + num_stripes - 1), 1);
- key_stripe = _mm256_and_si256(key_stripe, mask_last_stripe);
- acc = _mm256_add_epi32(acc,
- _mm256_mullo_epi32(key_stripe, _mm256_set1_epi32(PRIME32_2)));
- acc = _mm256_or_si256(_mm256_slli_epi32(acc, 13), _mm256_srli_epi32(acc, 32 - 13));
- acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_1));
- uint64_t result = combine_accumulators_avx2(acc);
- reinterpret_cast<uint64_t*>(hash)[i] = result;
}
+
+ avalanche_all_avx2<combine_hashes>(num_rows_to_process, hashes,
+ hashes_temp_for_combine);
+
+ return num_rows_to_process;
}
-void Hashing::helper_tails_avx2(uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash) {
- constexpr int unroll = 8;
- ARROW_DCHECK(num_keys % unroll == 0);
- auto keys_i64 = reinterpret_cast<arrow::util::int64_for_gather_t*>(keys);
-
- // Process between 1 and 8 last bytes of each key, starting from 16B boundary.
- // The caller needs to make sure that there are no more than 8 bytes to process after
- // that 16B boundary.
- uint32_t first_offset = key_length - (key_length % 16);
- __m256i mask = _mm256_set1_epi64x((~0ULL) >> (8 * (8 - (key_length % 16))));
- __m256i offset =
- _mm256_setr_epi32(0, key_length, key_length * 2, key_length * 3, key_length * 4,
- key_length * 5, key_length * 6, key_length * 7);
- offset = _mm256_add_epi32(offset, _mm256_set1_epi32(first_offset));
- __m256i offset_incr = _mm256_set1_epi32(key_length * 8);
-
- for (uint32_t i = 0; i < num_keys / unroll; ++i) {
- auto v1 = _mm256_i32gather_epi64(keys_i64, _mm256_castsi256_si128(offset), 1);
- auto v2 = _mm256_i32gather_epi64(keys_i64, _mm256_extracti128_si256(offset, 1), 1);
- v1 = _mm256_and_si256(v1, mask);
- v2 = _mm256_and_si256(v2, mask);
- v1 = _mm256_permutevar8x32_epi32(v1, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
- v2 = _mm256_permutevar8x32_epi32(v2, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
- auto x1 = _mm256_permute2x128_si256(v1, v2, 0x20);
- auto x2 = _mm256_permute2x128_si256(v1, v2, 0x31);
- __m256i acc = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(hash)) + i);
-
- acc = _mm256_add_epi32(acc, _mm256_mullo_epi32(x1, _mm256_set1_epi32(PRIME32_3)));
- acc = _mm256_or_si256(_mm256_slli_epi32(acc, 17), _mm256_srli_epi32(acc, 32 - 17));
- acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_4));
-
- acc = _mm256_add_epi32(acc, _mm256_mullo_epi32(x2, _mm256_set1_epi32(PRIME32_3)));
- acc = _mm256_or_si256(_mm256_slli_epi32(acc, 17), _mm256_srli_epi32(acc, 32 - 17));
- acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_4));
-
- _mm256_storeu_si256((reinterpret_cast<__m256i*>(hash)) + i, acc);
-
- offset = _mm256_add_epi32(offset, offset_incr);
+uint32_t Hashing32::hash_fixedlen_avx2(bool combine_hashes, uint32_t num_rows,
+ uint64_t length, const uint8_t* keys,
+ uint32_t* hashes,
+ uint32_t* hashes_temp_for_combine) {
+ if (combine_hashes) {
+ return hash_fixedlen_imp_avx2<true>(num_rows, length, keys, hashes,
+ hashes_temp_for_combine);
+ } else {
+ return hash_fixedlen_imp_avx2<false>(num_rows, length, keys, hashes,
+ hashes_temp_for_combine);
}
}
-void Hashing::hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets,
- const uint8_t* concatenated_keys,
- uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
- uint32_t* hashes) {
- constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL;
- constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL;
+template <typename T, bool combine_hashes>
+uint32_t Hashing32::hash_varlen_imp_avx2(uint32_t num_rows, const T* offsets,
+ const uint8_t* concatenated_keys,
+ uint32_t* hashes,
+ uint32_t* hashes_temp_for_combine) {
+ constexpr int unroll = 2;
- const __m128i sequence = _mm_set_epi64x(kByteSequence8To15, kByteSequence0To7);
- const __m128i acc_init = _mm_setr_epi32(
- static_cast<uint32_t>((static_cast<uint64_t>(PRIME32_1) + PRIME32_2) & 0xffffffff),
- PRIME32_2, 0, static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1)));
+ // Do not process rows that could read past the end of the buffer using 16
+ // byte loads. Round down number of rows to process to multiple of 2.
+ //
+ uint32_t num_rows_to_process = num_rows;
+ while (num_rows_to_process > 0 &&
+ offsets[num_rows_to_process] + kStripeSize > offsets[num_rows]) {
+ --num_rows_to_process;
+ }
+ num_rows_to_process -= (num_rows_to_process % unroll);
- // Variable length keys are always processed as a sequence of 16B stripes,
- // with the last stripe, if extending past the end of the key, having extra bytes set to
- // 0 on the fly.
- for (uint32_t ikey = 0; ikey < num_rows; ++ikey) {
- uint32_t begin = offsets[ikey];
- uint32_t end = offsets[ikey + 1];
- uint32_t length = end - begin;
- const uint8_t* base = concatenated_keys + begin;
-
- __m128i acc = acc_init;
-
- if (length) {
- uint32_t i;
- for (i = 0; i < (length - 1) / 16; ++i) {
- __m128i key_stripe = _mm_loadu_si128(reinterpret_cast<const __m128i*>(base) + i);
- acc = _mm_add_epi32(acc, _mm_mullo_epi32(key_stripe, _mm_set1_epi32(PRIME32_2)));
- acc = _mm_or_si128(_mm_slli_epi32(acc, 13), _mm_srli_epi32(acc, 32 - 13));
- acc = _mm_mullo_epi32(acc, _mm_set1_epi32(PRIME32_1));
- }
- __m128i key_stripe = _mm_loadu_si128(reinterpret_cast<const __m128i*>(base) + i);
- __m128i mask = _mm_cmpgt_epi8(_mm_set1_epi8(((length - 1) % 16) + 1), sequence);
- key_stripe = _mm_and_si128(key_stripe, mask);
- acc = _mm_add_epi32(acc, _mm_mullo_epi32(key_stripe, _mm_set1_epi32(PRIME32_2)));
- acc = _mm_or_si128(_mm_slli_epi32(acc, 13), _mm_srli_epi32(acc, 32 - 13));
- acc = _mm_mullo_epi32(acc, _mm_set1_epi32(PRIME32_1));
- }
+ for (uint32_t i = 0; i < num_rows_to_process / unroll; ++i) {
+ T offset_A = offsets[unroll * i + 0];
+ T offset_B = offsets[unroll * i + 1];
+ T offset_end = offsets[unroll * i + 2];
- _mm_storeu_si128(reinterpret_cast<__m128i*>(temp_buffer) + ikey, acc);
- }
+ T length = offset_B - offset_A;
+ int is_non_empty = length == 0 ? 0 : 1;
+ int64_t num_stripes_A =
+ static_cast<int64_t>(bit_util::CeilDiv(length, kStripeSize)) + (1 - is_non_empty);
+ int num_tail_bytes_A = ((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty;
- // Combine accumulators and perform avalanche
- constexpr int unroll = 8;
- for (uint32_t i = 0; i < num_rows / unroll; ++i) {
- __m256i accA =
- _mm256_loadu_si256(reinterpret_cast<const __m256i*>(temp_buffer) + 4 * i + 0);
- __m256i accB =
- _mm256_loadu_si256(reinterpret_cast<const __m256i*>(temp_buffer) + 4 * i + 1);
- __m256i accC =
- _mm256_loadu_si256(reinterpret_cast<const __m256i*>(temp_buffer) + 4 * i + 2);
- __m256i accD =
- _mm256_loadu_si256(reinterpret_cast<const __m256i*>(temp_buffer) + 4 * i + 3);
- // Transpose 2x 4x4 32-bit matrices
- __m256i r0 = _mm256_unpacklo_epi32(accA, accB);
- __m256i r1 = _mm256_unpackhi_epi32(accA, accB);
- __m256i r2 = _mm256_unpacklo_epi32(accC, accD);
- __m256i r3 = _mm256_unpackhi_epi32(accC, accD);
- accA = _mm256_unpacklo_epi64(r0, r2);
- accB = _mm256_unpackhi_epi64(r0, r2);
- accC = _mm256_unpacklo_epi64(r1, r3);
- accD = _mm256_unpackhi_epi64(r1, r3);
- // _rotl(accA, 1)
- // _rotl(accB, 7)
- // _rotl(accC, 12)
- // _rotl(accD, 18)
- accA = _mm256_or_si256(_mm256_slli_epi32(accA, 1), _mm256_srli_epi32(accA, 32 - 1));
- accB = _mm256_or_si256(_mm256_slli_epi32(accB, 7), _mm256_srli_epi32(accB, 32 - 7));
- accC = _mm256_or_si256(_mm256_slli_epi32(accC, 12), _mm256_srli_epi32(accC, 32 - 12));
- accD = _mm256_or_si256(_mm256_slli_epi32(accD, 18), _mm256_srli_epi32(accD, 32 - 18));
- accA = _mm256_add_epi32(_mm256_add_epi32(accA, accB), _mm256_add_epi32(accC, accD));
- // avalanche
- __m256i hash = accA;
- hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15));
- hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2));
- hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 13));
- hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_3));
- hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 16));
- // Store.
- // At this point, because of way 2x 4x4 transposition was done, output hashes are in
- // order: 0, 2, 4, 6, 1, 3, 5, 7. Bring back the original order.
- _mm256_storeu_si256(
- reinterpret_cast<__m256i*>(hashes) + i,
- _mm256_permutevar8x32_epi32(hash, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)));
+ length = offset_end - offset_B;
+ is_non_empty = length == 0 ? 0 : 1;
+ int64_t num_stripes_B =
+ static_cast<int64_t>(bit_util::CeilDiv(length, kStripeSize)) + (1 - is_non_empty);
+ int num_tail_bytes_B = ((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty;
+
+ __m256i mask_last_stripe = stripe_mask_avx2(num_tail_bytes_A, num_tail_bytes_B);
+
+ __m256i acc = process_stripes_avx2</*two_equal_lengths=*/false>(
+ num_stripes_A, num_stripes_B, mask_last_stripe, concatenated_keys,
+ static_cast<int64_t>(offset_A), static_cast<int64_t>(offset_B));
+
+ if (combine_hashes) {
Review comment:
I renamed the template arguments and helper functions to avoid name collisions.
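
For reference, a scalar sketch of the finalization steps that avalanche_avx2 and combine_hashes_avx2 vectorize above. The prime values are the standard xxHash32 primes and the combine constant is assumed to be the usual golden-ratio value; the function and constant names here are illustrative only, the real definitions live in key_hash.h:

```cpp
#include <cstdint>

// Assumed values: standard xxHash32 primes; the combine constant is taken here
// as the boost-style golden-ratio constant (see key_hash.h for the actual ones).
constexpr uint32_t kPrime32_2 = 0x85EBCA77u;
constexpr uint32_t kPrime32_3 = 0xC2B2AE3Du;
constexpr uint32_t kCombineConstAssumed = 0x9E3779B9u;

// Scalar equivalent of avalanche_avx2: the xxHash32 finalization mix.
inline uint32_t AvalancheScalar(uint32_t hash) {
  hash ^= hash >> 15;
  hash *= kPrime32_2;
  hash ^= hash >> 13;
  hash *= kPrime32_3;
  hash ^= hash >> 16;
  return hash;
}

// Scalar equivalent of combine_hashes_avx2, following the comment in the diff:
//   previous_hash ^= hash + kCombineConst + (previous_hash << 6) + (previous_hash >> 2)
inline uint32_t CombineHashesScalar(uint32_t previous_hash, uint32_t hash) {
  uint32_t x = (previous_hash << 6) + (previous_hash >> 2);
  uint32_t y = hash + kCombineConstAssumed;
  return previous_hash ^ (x + y);
}
```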