pitrou commented on code in PR #45515:
URL: https://github.com/apache/arrow/pull/45515#discussion_r1952944366


##########
cpp/src/arrow/compute/key_map_internal_avx2.cc:
##########
@@ -385,33 +382,52 @@ int SwissTable::extract_group_ids_avx2(const int 
num_keys, const uint32_t* hashe
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_group_ids) + i, 
group_id);
     }
   } else {
+    int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+    int num_groupid_bytes = num_groupid_bits / 8;
+    uint32_t mask = num_groupid_bytes == 1   ? 0xFF
+                    : num_groupid_bytes == 2 ? 0xFFFF
+                                             : 0xFFFFFFFF;
+    int num_block_bytes = 
num_block_bytes_from_num_groupid_bits(num_groupid_bits);
+    const int* slots_base =
+        reinterpret_cast<const int*>(blocks_->data() + bytes_status_in_block_);
+
     for (int i = 0; i < num_keys / unroll; ++i) {
       __m256i hash = _mm256_loadu_si256(reinterpret_cast<const 
__m256i*>(hashes) + i);
-      // Extend hash and local_slot to 64-bit to compute 64-bit group id 
offsets to
-      // gather from. This is to prevent index overflow issues in GH-44513.
-      // NB: Use zero-extend conversion for unsigned hash.
-      __m256i hash_lo = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(hash));
-      __m256i hash_hi = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(hash, 
1));
+      __m256i block_id =
+          _mm256_srlv_epi32(hash, _mm256_set1_epi32(bits_hash_ - log_blocks_));
+
       __m256i local_slot =
           _mm256_set1_epi64x(reinterpret_cast<const 
uint64_t*>(local_slots)[i]);
+
+      // Extend block_id and local_slot to 64-bit to compute 64-bit group id 
offsets to
+      // gather from. This is to prevent index overflow issues in GH-44513.
       __m256i local_slot_lo = _mm256_shuffle_epi8(
           local_slot, _mm256_setr_epi32(0x80808000, 0x80808080, 0x80808001, 
0x80808080,
                                         0x80808002, 0x80808080, 0x80808003, 
0x80808080));
       __m256i local_slot_hi = _mm256_shuffle_epi8(
           local_slot, _mm256_setr_epi32(0x80808004, 0x80808080, 0x80808005, 
0x80808080,
                                         0x80808006, 0x80808080, 0x80808007, 
0x80808080));
-      local_slot_lo = _mm256_mul_epu32(local_slot_lo, 
_mm256_set1_epi32(byte_size));
-      local_slot_hi = _mm256_mul_epu32(local_slot_hi, 
_mm256_set1_epi32(byte_size));
-      __m256i pos_lo = _mm256_srli_epi64(hash_lo, bits_hash_ - log_blocks_);
-      __m256i pos_hi = _mm256_srli_epi64(hash_hi, bits_hash_ - log_blocks_);
-      pos_lo = _mm256_mul_epu32(pos_lo, _mm256_set1_epi32(byte_multiplier));
-      pos_hi = _mm256_mul_epu32(pos_hi, _mm256_set1_epi32(byte_multiplier));
-      pos_lo = _mm256_add_epi64(pos_lo, local_slot_lo);
-      pos_hi = _mm256_add_epi64(pos_hi, local_slot_hi);
-      __m128i group_id_lo = _mm256_i64gather_epi32(elements, pos_lo, 1);
-      __m128i group_id_hi = _mm256_i64gather_epi32(elements, pos_hi, 1);
+      local_slot_lo =
+          _mm256_mul_epu32(local_slot_lo, 
_mm256_set1_epi32(num_groupid_bytes));

Review Comment:
   Ah, my bad, `_mm256_mul_epu32` is actually a 64-bit multiply.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to