pitrou commented on code in PR #43832:
URL: https://github.com/apache/arrow/pull/43832#discussion_r1848016340
##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -437,16 +281,118 @@ void RowArray::DebugPrintToFile(const char* filename,
bool print_sorted) const {
}
}
+void RowArray::DecodeFixedLength(ResizableArrayData* output, int output_start_row,
+                                 int column_id, uint32_t fixed_length,
+                                 int num_rows_to_append, const uint32_t* row_ids) const {
+  switch (fixed_length) {
+    case 0:
+      RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                              [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+                                bit_util::SetBitTo(output->mutable_data(1),
+                                                   output_start_row + i, *ptr != 0);
+                              });
+      break;
+    case 1:
+      RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                              [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+                                output->mutable_data(1)[output_start_row + i] = *ptr;
+                              });
+      break;
+    case 2:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint16_t*>(output->mutable_data(1))[output_start_row + i] =
Review Comment:
I realize this is probably copy-pasted, but it would be nice to be able to
write:
```suggestion
output->mutable_data_as<uint16_t>(1)[output_start_row + i] =
```
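
For reference, a minimal sketch of the accessor this suggestion presupposes, assuming a typed `mutable_data_as<T>` helper were added to `ResizableArrayData` (hypothetical, not part of this PR):

```cpp
// Hypothetical helper on ResizableArrayData; it would simply wrap the
// reinterpret_cast currently repeated at every call site above.
template <typename T>
T* mutable_data_as(int buffer_id) {
  return reinterpret_cast<T*>(mutable_data(buffer_id));
}
```

The `case 4` and `case 8` branches could then use `mutable_data_as<uint32_t>` and `mutable_data_as<uint64_t>` in the same way.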
##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -437,16 +281,118 @@ void RowArray::DebugPrintToFile(const char* filename,
bool print_sorted) const {
}
}
+void RowArray::DecodeFixedLength(ResizableArrayData* output, int output_start_row,
+                                 int column_id, uint32_t fixed_length,
+                                 int num_rows_to_append, const uint32_t* row_ids) const {
+  switch (fixed_length) {
+    case 0:
+      RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                              [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+                                bit_util::SetBitTo(output->mutable_data(1),
+                                                   output_start_row + i, *ptr != 0);
+                              });
+      break;
+    case 1:
+      RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                              [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+                                output->mutable_data(1)[output_start_row + i] = *ptr;
+                              });
+      break;
+    case 2:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint16_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint16_t*>(ptr);
+          });
+      break;
+    case 4:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint32_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint32_t*>(ptr);
+          });
+      break;
+    case 8:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint64_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint64_t*>(ptr);
+          });
+      break;
+    default:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            uint64_t* dst = reinterpret_cast<uint64_t*>(
+                output->mutable_data(1) + num_bytes * (output_start_row + i));
+            const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr);
+            for (uint32_t word_id = 0;
+                 word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) {
+              arrow::util::SafeStore<uint64_t>(dst + word_id,
+                                               arrow::util::SafeLoad(src + word_id));
+            }
+          });
+      break;
+  }
+}
+
+void RowArray::DecodeOffsets(ResizableArrayData* output, int output_start_row,
+                             int column_id, int num_rows_to_append,
+                             const uint32_t* row_ids) const {
+  uint32_t* offsets =
+      reinterpret_cast<uint32_t*>(output->mutable_data(1)) + output_start_row;
+  uint32_t sum = (output_start_row == 0) ? 0 : offsets[0];
+  RowArrayAccessor::Visit(
+      rows_, column_id, num_rows_to_append, row_ids,
+      [&](int i, const uint8_t* ptr, uint32_t num_bytes) { offsets[i] = num_bytes; });
+  for (int i = 0; i < num_rows_to_append; ++i) {
+    uint32_t length = offsets[i];
+    offsets[i] = sum;
+    sum += length;
+  }
+  offsets[num_rows_to_append] = sum;
+}
+
+void RowArray::DecodeVarLength(ResizableArrayData* output, int output_start_row,
+                               int column_id, int num_rows_to_append,
+                               const uint32_t* row_ids) const {
+  RowArrayAccessor::Visit(
+      rows_, column_id, num_rows_to_append, row_ids,
+      [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+        uint64_t* dst = reinterpret_cast<uint64_t*>(
+            output->mutable_data(2) + reinterpret_cast<const uint32_t*>(
+                                          output->mutable_data(1))[output_start_row + i]);
+        const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr);
+        for (uint32_t word_id = 0;
+             word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) {
+          arrow::util::SafeStore<uint64_t>(dst + word_id,
+                                           arrow::util::SafeLoad(src + word_id));
+        }
+      });
+}
+
+void RowArray::DecodeNulls(ResizableArrayData* output, int output_start_row,
+                           int column_id, int num_rows_to_append,
+                           const uint32_t* row_ids) const {
+  RowArrayAccessor::VisitNulls(
+      rows_, column_id, num_rows_to_append, row_ids, [&](int i, uint8_t value) {
+        bit_util::SetBitTo(output->mutable_data(0), output_start_row + i, value == 0);
Review Comment:
Er, so the convention is that the null bytes in the RowArray store a 1 for a
null value, and 0 for a non-null value?
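
If that reading is right, the decode above flips the sense when building the Arrow validity bitmap. A minimal scalar illustration (the 1-means-null convention is an assumption read off the code, not confirmed):

```cpp
uint8_t row_table_null_byte = 1;                       // assumed: 1 = value is null
bool arrow_validity_bit = (row_table_null_byte == 0);  // Arrow: 1 = value is valid
// Hence the `value == 0` in the SetBitTo call above.
```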
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
Review Comment:
```suggestion
// `result` will contain one 32-bit word per tested null bit,
// either 0xffffffff if the null bit was set or 0 if it was unset.
__m256i result =
```
##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -437,16 +281,118 @@ void RowArray::DebugPrintToFile(const char* filename,
bool print_sorted) const {
}
}
+void RowArray::DecodeFixedLength(ResizableArrayData* output, int output_start_row,
+                                 int column_id, uint32_t fixed_length,
+                                 int num_rows_to_append, const uint32_t* row_ids) const {
+  switch (fixed_length) {
+    case 0:
+      RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                              [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+                                bit_util::SetBitTo(output->mutable_data(1),
+                                                   output_start_row + i, *ptr != 0);
+                              });
+      break;
+    case 1:
+      RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                              [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+                                output->mutable_data(1)[output_start_row + i] = *ptr;
+                              });
+      break;
+    case 2:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint16_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint16_t*>(ptr);
+          });
+      break;
+    case 4:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint32_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint32_t*>(ptr);
+          });
+      break;
+    case 8:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint64_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint64_t*>(ptr);
+          });
+      break;
+    default:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            uint64_t* dst = reinterpret_cast<uint64_t*>(
+                output->mutable_data(1) + num_bytes * (output_start_row + i));
+            const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr);
+            for (uint32_t word_id = 0;
+                 word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) {
+              arrow::util::SafeStore<uint64_t>(dst + word_id,
+                                               arrow::util::SafeLoad(src + word_id));
+            }
Review Comment:
So this is a crude hand-written memcpy that overshoots the copy length?
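
If so, a sketch of what the loop amounts to, assuming both buffers carry enough tail padding for the word-sized overshoot:

```cpp
#include <cstring>
// The copy length is rounded up to whole 64-bit words, so up to 7 bytes past
// num_bytes are read and written; e.g. num_bytes == 3 still copies 8 bytes.
uint32_t num_words = bit_util::CeilDiv(num_bytes, sizeof(uint64_t));
std::memcpy(dst, src, num_words * sizeof(uint64_t));
```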
##########
cpp/src/arrow/acero/swiss_join.cc:
##########
@@ -437,16 +281,118 @@ void RowArray::DebugPrintToFile(const char* filename,
bool print_sorted) const {
}
}
+void RowArray::DecodeFixedLength(ResizableArrayData* output, int output_start_row,
+                                 int column_id, uint32_t fixed_length,
+                                 int num_rows_to_append, const uint32_t* row_ids) const {
+  switch (fixed_length) {
+    case 0:
+      RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                              [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+                                bit_util::SetBitTo(output->mutable_data(1),
+                                                   output_start_row + i, *ptr != 0);
+                              });
+      break;
+    case 1:
+      RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                              [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+                                output->mutable_data(1)[output_start_row + i] = *ptr;
+                              });
+      break;
+    case 2:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint16_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint16_t*>(ptr);
+          });
+      break;
+    case 4:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint32_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint32_t*>(ptr);
+          });
+      break;
+    case 8:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            reinterpret_cast<uint64_t*>(output->mutable_data(1))[output_start_row + i] =
+                *reinterpret_cast<const uint64_t*>(ptr);
+          });
+      break;
+    default:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+            uint64_t* dst = reinterpret_cast<uint64_t*>(
+                output->mutable_data(1) + num_bytes * (output_start_row + i));
+            const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr);
+            for (uint32_t word_id = 0;
+                 word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) {
+              arrow::util::SafeStore<uint64_t>(dst + word_id,
+                                               arrow::util::SafeLoad(src + word_id));
+            }
+          });
+      break;
+  }
+}
+
+void RowArray::DecodeOffsets(ResizableArrayData* output, int output_start_row,
+                             int column_id, int num_rows_to_append,
+                             const uint32_t* row_ids) const {
+  uint32_t* offsets =
+      reinterpret_cast<uint32_t*>(output->mutable_data(1)) + output_start_row;
+  uint32_t sum = (output_start_row == 0) ? 0 : offsets[0];
+  RowArrayAccessor::Visit(
+      rows_, column_id, num_rows_to_append, row_ids,
+      [&](int i, const uint8_t* ptr, uint32_t num_bytes) { offsets[i] = num_bytes; });
+  for (int i = 0; i < num_rows_to_append; ++i) {
+    uint32_t length = offsets[i];
+    offsets[i] = sum;
+    sum += length;
+  }
+  offsets[num_rows_to_append] = sum;
+}
+
+void RowArray::DecodeVarLength(ResizableArrayData* output, int output_start_row,
+                               int column_id, int num_rows_to_append,
+                               const uint32_t* row_ids) const {
+  RowArrayAccessor::Visit(
+      rows_, column_id, num_rows_to_append, row_ids,
+      [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+        uint64_t* dst = reinterpret_cast<uint64_t*>(
+            output->mutable_data(2) + reinterpret_cast<const uint32_t*>(
+                                          output->mutable_data(1))[output_start_row + i]);
+        const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr);
+        for (uint32_t word_id = 0;
+             word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) {
+          arrow::util::SafeStore<uint64_t>(dst + word_id,
+                                           arrow::util::SafeLoad(src + word_id));
+        }
Review Comment:
Same question re memcpy.
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If the 64-bit is zero, then we get 64 set bits.
Review Comment:
```suggestion
// If a 64-bit value is zero, then we get 64 set bits.
```
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
Review Comment:
Can we use `reinterpret_cast` throughout instead of C-style casts?
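
E.g. for the gather above, with identical semantics:

```cpp
__m128i row_lo = _mm256_i64gather_epi32(
    reinterpret_cast<const int*>(row_ptr_base), offset_lo, 1);
```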
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If the 64-bit is zero, then we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
+  int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+  int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+  // 8 set bits to 1 set bit.
+  uint8_t is_zero = static_cast<uint8_t>(
+      _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+  *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each 128-bit
+  // lane.
Review Comment:
By the way, can't we use the same trick for `Decode8FixedLength0_avx2`? Or
is shuffle costlier than the sequence of logical operations above?
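
For the record, a rough, untested sketch of the shuffle-based variant being asked about (hypothetical name; it borrows the byte-shuffle constant from `Decode8FixedLength1_avx2`, and whether it actually beats the cmpeq/movemask chain above would need measuring):

```cpp
inline void Decode8FixedLength0Shuffle_avx2(uint8_t* output,
                                            const uint8_t* row_ptr_base,
                                            __m256i offset_lo, __m256i offset_hi) {
  __m128i row_lo = _mm256_i64gather_epi32(
      reinterpret_cast<const int*>(row_ptr_base), offset_lo, 1);
  __m128i row_hi = _mm256_i64gather_epi32(
      reinterpret_cast<const int*>(row_ptr_base), offset_hi, 1);
  __m256i row = _mm256_set_m128i(row_hi, row_lo);
  // Bring byte 0 of each 32-bit value to the low 4 bytes of each 128-bit lane.
  constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL;
  const __m256i shuffle_const =
      _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1);
  row = _mm256_shuffle_epi8(row, shuffle_const);
  // Move the low dword of the upper lane next to the low dword of the lower
  // lane, so the 8 interesting bytes sit in the low 64 bits.
  row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 4, 1, 2, 3, 5, 6, 7));
  // One byte-compare against zero; the low 8 movemask bits are the 8 rows.
  __m256i is_zero = _mm256_cmpeq_epi8(row, _mm256_setzero_si256());
  uint8_t is_zero_bits = static_cast<uint8_t>(_mm256_movemask_epi8(is_zero));
  *output = static_cast<uint8_t>(~is_zero_bits);
}
```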
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
Review Comment:
```suggestion
  // Gather the lower/higher 4 32-bit (only lower 1 bit interesting) values based on the
  // lower/higher 4 64-bit row offsets.
```
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
Review Comment:
```suggestion
// Keep the first 8 bits in each 64-bit value, as the other bits belong
// to other columns.
```
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If the 64-bit is zero, then we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
Review Comment:
```suggestion
// 64 set bits per value to 8 set bits (one byte) per value.
```
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If the 64-bit is zero, then we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
+  int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+  int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+  // 8 set bits to 1 set bit.
+  uint8_t is_zero = static_cast<uint8_t>(
+      _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+  *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL;
+  const __m256i shuffle_const =
+      _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Get the lower 32-bits (4 8-bit rows) from each 128-bit lane.
+  // NB: Be careful about sign-extension when casting the return value of
+  // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the
+  // higher bits of the following OR.
+  uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0));
+  uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32;
+  *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi;
+}
+
+inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 16 bits of each 32-bit rows to the lower 64 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL;
+  const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1,
+                                                   kByteSequence_0_1_4_5_8_9_12_13, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Swap the second and the third 64-bit lane.
Review Comment:
```suggestion
// Swap the second and the third 64-bit lane, so that all
// 16-bit values end up in the lower half of `row`.
// (0xd8 = 0b 11 01 10 00)
```
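
Worked decode of the selector, since it is easy to misread:

```cpp
// _mm256_permute4x64_epi64 selector 0xd8 = 0b11'01'10'00, two bits per output
// lane, low bits first: dst[0] = src[0], dst[1] = src[2], dst[2] = src[1],
// dst[3] = src[3]; i.e. the two middle 64-bit lanes swap, compacting both
// lanes' results into the low 128 bits read by _mm256_castsi256_si128 below.
```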
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -194,17 +202,19 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu
    //
    const uint8_t* row_ptr_base = rows.data(2);
    const RowTableImpl::offset_type* row_offsets = rows.offsets();
+    auto row_offsets_i64 =
+        reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_offsets);
    for (int i = 0; i < num_rows / unroll; ++i) {
Review Comment:
Can we take the opportunity to rename all these `unroll` constants to
`kUnroll`?
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If the 64-bit is zero, then we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
+  int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+  int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+  // 8 set bits to 1 set bit.
+  uint8_t is_zero = static_cast<uint8_t>(
+      _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+  *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL;
+  const __m256i shuffle_const =
+      _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Get the lower 32-bits (4 8-bit rows) from each 128-bit lane.
+  // NB: Be careful about sign-extension when casting the return value of
+  // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the
+  // higher bits of the following OR.
+  uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0));
+  uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32;
+  *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi;
+}
+
+inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 16 bits of each 32-bit rows to the lower 64 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL;
+  const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1,
+                                                   kByteSequence_0_1_4_5_8_9_12_13, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Swap the second and the third 64-bit lane.
+  row = _mm256_permute4x64_epi64(row, 0xd8);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row));
+}
+
+inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row);
+}
+
+inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  auto row_ptr_base_i64 =
+      reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_ptr_base);
+  // Gather the lower/higher 4 64-bit rows based on the lower/higher 4 64-bit row offsets.
+  __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1);
+  __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi);
+}
+
+inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) {
+  // Copy 32 bytes at a time.
+  __m256i* output_i256 = reinterpret_cast<__m256i*>(output);
+  const __m256i* row_ptr_i256 = reinterpret_cast<const __m256i*>(row_ptr);
+  for (int istripe = 0; istripe < bit_util::CeilDiv(num_bytes, 32); ++istripe) {
+    _mm256_storeu_si256(output_i256 + istripe,
+                        _mm256_loadu_si256(row_ptr_i256 + istripe));
+  }
+}
+
+inline uint32_t Decode8Offset_avx2(uint32_t* output, uint32_t current_length,
+                                   __m256i num_bytes) {
+  uint32_t num_bytes_last = static_cast<uint32_t>(_mm256_extract_epi32(num_bytes, 7));
+  // Init every offset with the current length.
+  __m256i offsets = _mm256_set1_epi32(current_length);
+  // We keep right-shifting the length and accumulate the offset by adding the length.
Review Comment:
```suggestion
  // We keep left-shifting the length and accumulate the offset by adding the length.
```
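
A scalar restatement of what the rotate-and-add loop computes, as I read it (untested): an exclusive prefix sum of the 8 lane lengths, seeded with `current_length`.

```cpp
// Hypothetical scalar spelling: lengths[j] stands for 32-bit lane j of num_bytes.
uint32_t lengths[8] = {3, 5, 2, 8, 1, 4, 7, 6};  // example lane values
uint32_t offsets_scalar[8];
uint32_t acc = current_length;
for (int j = 0; j < 8; ++j) {
  offsets_scalar[j] = acc;  // offset where value j starts
  acc += lengths[j];
}
// The vector version likewise returns acc, i.e. offsets[7] + num_bytes_last.
```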
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign-extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep the first 8 bits in each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If the 64-bit is zero, then we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
+  int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+  int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+  // 8 set bits to 1 set bit.
+  uint8_t is_zero = static_cast<uint8_t>(
+      _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+  *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL;
+  const __m256i shuffle_const =
+      _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Get the lower 32-bits (4 8-bit rows) from each 128-bit lane.
+  // NB: Be careful about sign-extension when casting the return value of
+  // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the
+  // higher bits of the following OR.
+  uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0));
+  uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32;
+  *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi;
+}
+
+inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) rows based on the
+  // lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 16 bits of each 32-bit rows to the lower 64 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL;
+  const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1,
+                                                   kByteSequence_0_1_4_5_8_9_12_13, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Swap the second and the third 64-bit lane.
+  row = _mm256_permute4x64_epi64(row, 0xd8);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row));
+}
+
+inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row);
+}
+
+inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  auto row_ptr_base_i64 =
+      reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_ptr_base);
+  // Gather the lower/higher 4 64-bit rows based on the lower/higher 4 64-bit row offsets.
+  __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1);
+  __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi);
+}
+
+inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) {
+  // Copy 32 bytes at a time.
Review Comment:
Are we sure that rows and output have enough padding at the end to accommodate the overshooting here?
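
Worked numbers for the overshoot in question:

```cpp
// For num_bytes == 33: CeilDiv(33, 32) == 2 stripes, so 64 bytes are loaded
// from row_ptr and stored to output; worst case the loop touches 31 bytes
// past num_bytes on both the read and the write side.
uint32_t num_bytes = 33;
uint32_t stripes = (num_bytes + 31) / 32;       // == bit_util::CeilDiv(num_bytes, 32)
uint32_t overshoot = stripes * 32 - num_bytes;  // == 31
```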
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const
RowTableImpl& rows, int column_id,
const uint8_t* null_masks = rows.null_masks();
__m256i null_bits_per_row =
_mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+ __m256i pos_after_encoding =
+ _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
for (int i = 0; i < num_rows / unroll; ++i) {
__m256i row_id = _mm256_loadu_si256(reinterpret_cast<const
__m256i*>(row_ids) + i);
__m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
- bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+ bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
__m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const
int*>(null_masks),
_mm256_srli_epi32(bit_id, 3), 1);
__m256i bit_in_word = _mm256_sllv_epi32(
_mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
__m256i result =
_mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
- uint64_t null_bytes = static_cast<uint64_t>(
+ // NB: Be careful about sign-extension when casting the return value of
+ // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will
pollute the
+ // higher bits of the following OR.
+ uint32_t null_bytes_lo = static_cast<uint32_t>(
_mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
- null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
- _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result,
1))))
- << 32;
+ uint64_t null_bytes_hi =
+
_mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result,
1)));
+ uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
process_8_values_fn(i * unroll, null_bytes);
}
return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t*
row_ptr_base,
+ __m256i offset_lo, __m256i offset_hi) {
+ // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows
based on the
+ // lower/higher 4 64-bit row offsets.
+ __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo,
1);
+ __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi,
1);
+ // Extend to 64-bit.
+ __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+ __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+ // Keep the first 8 bits in each 64-bit row.
+ row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+ row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+ // If the 64-bit is zero, then we get 64 set bits.
+ __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64,
_mm256_setzero_si256());
+ __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64,
_mm256_setzero_si256());
+ // 64 set bits to 8 set bits.
+ int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+ int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+ // 8 set bits to 1 set bit.
+ uint8_t is_zero = static_cast<uint8_t>(
+ _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+ *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t*
row_ptr_base,
+ __m256i offset_lo, __m256i offset_hi) {
+ // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows
based on the
+ // lower/higher 4 64-bit row offsets.
+ __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo,
1);
+ __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi,
1);
+ __m256i row = _mm256_set_m128i(row_hi, row_lo);
+ // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each
128-bit
+ // lane.
+ constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL;
+ const __m256i shuffle_const =
+ _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12,
-1);
+ row = _mm256_shuffle_epi8(row, shuffle_const);
+ // Get the lower 32-bits (4 8-bit rows) from each 128-bit lane.
+ // NB: Be careful about sign-extension when casting the return value of
+ // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will
pollute the
+ // higher bits of the following OR.
+ uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row,
0));
+ uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row,
4)) << 32;
+ *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi;
+}
+
+inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t*
row_ptr_base,
+ __m256i offset_lo, __m256i offset_hi) {
+ // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) rows
based on the
+ // lower/higher 4 64-bit row offsets.
+ __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo,
1);
+ __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi,
1);
+ __m256i row = _mm256_set_m128i(row_hi, row_lo);
+ // Shuffle the lower 16 bits of each 32-bit rows to the lower 64 bits of
each 128-bit
+ // lane.
+ constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL;
+ const __m256i shuffle_const =
_mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1,
+
kByteSequence_0_1_4_5_8_9_12_13, -1);
+ row = _mm256_shuffle_epi8(row, shuffle_const);
+ // Swap the second and the third 64-bit lane.
+ row = _mm256_permute4x64_epi64(row, 0xd8);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output),
_mm256_castsi256_si128(row));
+}
+
+inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t*
row_ptr_base,
+ __m256i offset_lo, __m256i offset_hi) {
+ // Gather the lower/higher 4 32-bit rows based on the lower/higher 4 64-bit
row offsets.
+ __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo,
1);
+ __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi,
1);
+ __m256i row = _mm256_set_m128i(row_hi, row_lo);
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row);
+}
+
+inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t*
row_ptr_base,
+ __m256i offset_lo, __m256i offset_hi) {
+ auto row_ptr_base_i64 =
+ reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_ptr_base);
+ // Gather the lower/higher 4 64-bit rows based on the lower/higher 4 64-bit
row offsets.
+ __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1);
+ __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1);
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo);
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi);
+}
+
+inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t
num_bytes) {
+ // Copy 32 bytes at a time.
+ __m256i* output_i256 = reinterpret_cast<__m256i*>(output);
+ const __m256i* row_ptr_i256 = reinterpret_cast<const __m256i*>(row_ptr);
+ for (int istripe = 0; istripe < bit_util::CeilDiv(num_bytes, 32); ++istripe)
{
+ _mm256_storeu_si256(output_i256 + istripe,
+ _mm256_loadu_si256(row_ptr_i256 + istripe));
+ }
+}
+
+inline uint32_t Decode8Offset_avx2(uint32_t* output, uint32_t current_length,
+ __m256i num_bytes) {
+ uint32_t num_bytes_last =
static_cast<uint32_t>(_mm256_extract_epi32(num_bytes, 7));
+ // Init every offset with the current length.
+ __m256i offsets = _mm256_set1_epi32(current_length);
+ // We keep right-shifting the length and accumulate the offset by adding the
length.
+ __m256i length =
+ _mm256_permutevar8x32_epi32(num_bytes, _mm256_setr_epi32(7, 0, 1, 2, 3,
4, 5, 6));
+ length = _mm256_insert_epi32(length, 0, 0);
+ for (int i = 0; i < 7; ++i) {
+ offsets = _mm256_add_epi32(offsets, length);
+ length =
+ _mm256_permutevar8x32_epi32(length, _mm256_setr_epi32(7, 0, 1, 2, 3,
4, 5, 6));
+ length = _mm256_insert_epi32(length, 0, 0);
+ }
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), offsets);
+ return _mm256_extract_epi32(offsets, 7) + num_bytes_last;
+}
+
+inline void Decode8Null_avx2(uint8_t* output, uint64_t null_bytes) {
+  uint8_t null_bits =
+      static_cast<uint8_t>(_mm256_movemask_epi8(_mm256_set1_epi64x(null_bytes)));
+  *output = ~null_bits;
+}
+
+} // namespace
+
+int RowArray::DecodeFixedLength_avx2(ResizableArrayData* output, int output_start_row,
+                                     int column_id, uint32_t fixed_length,
+                                     int num_rows_to_append,
+                                     const uint32_t* row_ids) const {
+  DCHECK_EQ(output_start_row % 8, 0);
+
+  int num_rows_processed = 0;
+  switch (fixed_length) {
+    case 0:
+      num_rows_processed = RowArrayAccessor::Visit_avx2(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi,
+              __m256i num_bytes) {
+            Decode8FixedLength0_avx2(output->mutable_data(1) +
+                                         (output_start_row + i) / 8,
Review Comment:
```suggestion
            DCHECK_EQ(i % 8, 0);
            Decode8FixedLength0_avx2(output->mutable_data(1) +
                                         (output_start_row + i) / 8,
```
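
   For context on the suggested `DCHECK_EQ(i % 8, 0)`: like `VisitNulls_avx2` below, `Visit_avx2` is unrolled by 8, so the lambda only ever sees `i` in steps of 8 and `(output_start_row + i) / 8` addresses a whole bitmap byte. A minimal sketch of the invariant being asserted (the helper name is illustrative only, not part of the PR):
   ```cpp
   #include <cassert>
   #include <cstdint>

   // Illustrative only: the byte that holds validity bit (output_start_row + i)
   // is addressed exactly (no intra-byte remainder) precisely when both terms
   // are multiples of 8, which is what the DCHECKs assert.
   inline uint8_t* Bitmap8Dest(uint8_t* bitmap_base, int output_start_row, int i) {
     assert(output_start_row % 8 == 0);
     assert(i % 8 == 0);
     return bitmap_base + (output_start_row + i) / 8;
   }
   ```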
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows (only the lower 8 bits are interesting)
+  // based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep only the first 8 bits of each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If a 64-bit row is zero, we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
+  int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+  int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+  // 8 set bits to 1 set bit.
+  uint8_t is_zero = static_cast<uint8_t>(
+      _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+  *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows (only the lower 8 bits are interesting)
+  // based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 8 bits of each 32-bit row to the lower 32 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL;
+  const __m256i shuffle_const =
+      _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Get the lower 32 bits (4 8-bit rows) from each 128-bit lane.
+  // NB: Be careful about sign extension when casting the return value of
+  // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the
+  // higher bits of the following OR.
+  uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0));
+  uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32;
+  *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi;
+}
+
+inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows (only the lower 16 bits are interesting)
+  // based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 16 bits of each 32-bit row to the lower 64 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL;
+  const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1,
+                                                   kByteSequence_0_1_4_5_8_9_12_13, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Swap the second and the third 64-bit lanes.
+  row = _mm256_permute4x64_epi64(row, 0xd8);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row));
+}
+
+inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows based on the lower/higher 4 64-bit row
+  // offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row);
+}
+
+inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  auto row_ptr_base_i64 =
+      reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_ptr_base);
+  // Gather the lower/higher 4 64-bit rows based on the lower/higher 4 64-bit row
+  // offsets.
+  __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1);
+  __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi);
+}
+
+inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) {
+  // Copy 32 bytes at a time.
+  __m256i* output_i256 = reinterpret_cast<__m256i*>(output);
+  const __m256i* row_ptr_i256 = reinterpret_cast<const __m256i*>(row_ptr);
+  for (int istripe = 0; istripe < bit_util::CeilDiv(num_bytes, 32); ++istripe) {
+    _mm256_storeu_si256(output_i256 + istripe,
+                        _mm256_loadu_si256(row_ptr_i256 + istripe));
+  }
+}
+
+inline uint32_t Decode8Offset_avx2(uint32_t* output, uint32_t current_length,
+                                   __m256i num_bytes) {
+  uint32_t num_bytes_last = static_cast<uint32_t>(_mm256_extract_epi32(num_bytes, 7));
+  // Init every offset with the current length.
+  __m256i offsets = _mm256_set1_epi32(current_length);
+  // We keep shifting the lengths one lane to the right and accumulating them into the
+  // offsets.
+  __m256i length =
+      _mm256_permutevar8x32_epi32(num_bytes, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6));
+  length = _mm256_insert_epi32(length, 0, 0);
+  for (int i = 0; i < 7; ++i) {
Review Comment:
```suggestion
// `length` is now a sequence of 32-bit words such as:
// - length[0] = 0
// - length[1] = num_bytes[0]
// ...
// - length[7] = num_bytes[6]
// (note that num_bytes[7] is kept in `num_bytes_last`)
for (int i = 0; i < 7; ++i) {
```
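
   Put differently, the add/rotate loop computes an exclusive prefix sum of the 8 lengths, seeded with `current_length`. A scalar model (illustrative only, not part of the PR) that produces the same 8 outputs and return value:
   ```cpp
   #include <cstdint>

   // Scalar model of Decode8Offset_avx2: writes the 8 exclusive prefix sums and
   // returns the total, i.e. the starting length for the next chunk of 8 rows.
   inline uint32_t Decode8OffsetScalar(uint32_t* output, uint32_t current_length,
                                       const uint32_t num_bytes[8]) {
     for (int j = 0; j < 8; ++j) {
       output[j] = current_length;      // offset of row j
       current_length += num_bytes[j];  // accumulate its length
     }
     return current_length;  // matches offsets[7] + num_bytes_last above
   }
   ```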
##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -233,27 +243,270 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id,
  const uint8_t* null_masks = rows.null_masks();
  __m256i null_bits_per_row =
      _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row);
+  __m256i pos_after_encoding =
+      _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id));
  for (int i = 0; i < num_rows / unroll; ++i) {
    __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
    __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row);
-    bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id));
+    bit_id = _mm256_add_epi32(bit_id, pos_after_encoding);
    __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks),
                                           _mm256_srli_epi32(bit_id, 3), 1);
    __m256i bit_in_word = _mm256_sllv_epi32(
        _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7)));
    __m256i result =
        _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word);
-    uint64_t null_bytes = static_cast<uint64_t>(
+    // NB: Be careful about sign extension when casting the return value of
+    // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the
+    // higher bits of the following OR.
+    uint32_t null_bytes_lo = static_cast<uint32_t>(
        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result))));
-    null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8(
-                      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))))
-                  << 32;
+    uint64_t null_bytes_hi =
+        _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)));
+    uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32);
    process_8_values_fn(i * unroll, null_bytes);
  }
  return num_rows - (num_rows % unroll);
}
+namespace {
+
+inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows (only the lower 8 bits are interesting)
+  // based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  // Extend to 64-bit.
+  __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo);
+  __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi);
+  // Keep only the first 8 bits of each 64-bit row.
+  row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF));
+  row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF));
+  // If a 64-bit row is zero, we get 64 set bits.
+  __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256());
+  __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256());
+  // 64 set bits to 8 set bits.
+  int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64);
+  int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64);
+  // 8 set bits to 1 set bit.
+  uint8_t is_zero = static_cast<uint8_t>(
+      _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8)));
+  *output = static_cast<uint8_t>(~is_zero);
+}
+
+inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows (only the lower 8 bits are interesting)
+  // based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 8 bits of each 32-bit row to the lower 32 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL;
+  const __m256i shuffle_const =
+      _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Get the lower 32 bits (4 8-bit rows) from each 128-bit lane.
+  // NB: Be careful about sign extension when casting the return value of
+  // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the
+  // higher bits of the following OR.
+  uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0));
+  uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32;
+  *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi;
+}
+
+inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows (only the lower 16 bits are interesting)
+  // based on the lower/higher 4 64-bit row offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  // Shuffle the lower 16 bits of each 32-bit row to the lower 64 bits of each 128-bit
+  // lane.
+  constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL;
+  const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1,
+                                                   kByteSequence_0_1_4_5_8_9_12_13, -1);
+  row = _mm256_shuffle_epi8(row, shuffle_const);
+  // Swap the second and the third 64-bit lanes.
+  row = _mm256_permute4x64_epi64(row, 0xd8);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row));
+}
+
+inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  // Gather the lower/higher 4 32-bit rows based on the lower/higher 4 64-bit row
+  // offsets.
+  __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1);
+  __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1);
+  __m256i row = _mm256_set_m128i(row_hi, row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row);
+}
+
+inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base,
+                                     __m256i offset_lo, __m256i offset_hi) {
+  auto row_ptr_base_i64 =
+      reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_ptr_base);
+  // Gather the lower/higher 4 64-bit rows based on the lower/higher 4 64-bit row
+  // offsets.
+  __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1);
+  __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi);
+}
+
+inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) {
+  // Copy 32 bytes at a time.
+  __m256i* output_i256 = reinterpret_cast<__m256i*>(output);
+  const __m256i* row_ptr_i256 = reinterpret_cast<const __m256i*>(row_ptr);
+  for (int istripe = 0; istripe < bit_util::CeilDiv(num_bytes, 32); ++istripe) {
+    _mm256_storeu_si256(output_i256 + istripe,
+                        _mm256_loadu_si256(row_ptr_i256 + istripe));
+  }
+}
+
+inline uint32_t Decode8Offset_avx2(uint32_t* output, uint32_t current_length,
+                                   __m256i num_bytes) {
+  uint32_t num_bytes_last = static_cast<uint32_t>(_mm256_extract_epi32(num_bytes, 7));
+  // Init every offset with the current length.
+  __m256i offsets = _mm256_set1_epi32(current_length);
+  // We keep shifting the lengths one lane to the right and accumulating them into the
+  // offsets.
+  __m256i length =
+      _mm256_permutevar8x32_epi32(num_bytes, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6));
+  length = _mm256_insert_epi32(length, 0, 0);
+  for (int i = 0; i < 7; ++i) {
+    offsets = _mm256_add_epi32(offsets, length);
+    length =
+        _mm256_permutevar8x32_epi32(length, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6));
+    length = _mm256_insert_epi32(length, 0, 0);
+  }
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), offsets);
+  return _mm256_extract_epi32(offsets, 7) + num_bytes_last;
+}
+
+inline void Decode8Null_avx2(uint8_t* output, uint64_t null_bytes) {
+  uint8_t null_bits =
+      static_cast<uint8_t>(_mm256_movemask_epi8(_mm256_set1_epi64x(null_bytes)));
+  *output = ~null_bits;
+}
+
+} // namespace
+
+int RowArray::DecodeFixedLength_avx2(ResizableArrayData* output, int output_start_row,
+                                     int column_id, uint32_t fixed_length,
+                                     int num_rows_to_append,
+                                     const uint32_t* row_ids) const {
+  DCHECK_EQ(output_start_row % 8, 0);
+
+  int num_rows_processed = 0;
+  switch (fixed_length) {
+    case 0:
+      num_rows_processed = RowArrayAccessor::Visit_avx2(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi,
+              __m256i num_bytes) {
+            Decode8FixedLength0_avx2(output->mutable_data(1) +
+                                         (output_start_row + i) / 8,
+                                     row_ptr_base, offset_lo, offset_hi);
+          });
+      break;
+    case 1:
+      num_rows_processed = RowArrayAccessor::Visit_avx2(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi,
+              __m256i num_bytes) {
+            Decode8FixedLength1_avx2(output->mutable_data(1) + output_start_row + i,
+                                     row_ptr_base, offset_lo, offset_hi);
+          });
+      break;
+    case 2:
+      num_rows_processed = RowArrayAccessor::Visit_avx2(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi,
+              __m256i num_bytes) {
+            Decode8FixedLength2_avx2(
+                reinterpret_cast<uint16_t*>(output->mutable_data(1)) + output_start_row +
+                    i,
+                row_ptr_base, offset_lo, offset_hi);
+          });
+      break;
+    case 4:
+      num_rows_processed = RowArrayAccessor::Visit_avx2(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi,
+              __m256i num_bytes) {
+            Decode8FixedLength4_avx2(
+                reinterpret_cast<uint32_t*>(output->mutable_data(1)) + output_start_row +
+                    i,
+                row_ptr_base, offset_lo, offset_hi);
+          });
+      break;
+    case 8:
+      num_rows_processed = RowArrayAccessor::Visit_avx2(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi,
+              __m256i num_bytes) {
+            Decode8FixedLength8_avx2(
+                reinterpret_cast<uint64_t*>(output->mutable_data(1)) + output_start_row +
+                    i,
+                row_ptr_base, offset_lo, offset_hi);
+          });
+      break;
+    default:
+      RowArrayAccessor::Visit(
+          rows_, column_id, num_rows_to_append, row_ids,
+          [&](int i, const uint8_t* row_ptr, uint32_t num_bytes) {
+            Decode1_avx2(output->mutable_data(1) + num_bytes * (output_start_row + i),
+                         row_ptr, num_bytes);
+          });
+      num_rows_processed = num_rows_to_append;
+      break;
+  }
+
+  return num_rows_processed;
+}
+
+int RowArray::DecodeOffsets_avx2(ResizableArrayData* output, int output_start_row,
+                                 int column_id, int num_rows_to_append,
+                                 const uint32_t* row_ids) const {
+  uint32_t* offsets =
+      reinterpret_cast<uint32_t*>(output->mutable_data(1)) + output_start_row;
+  uint32_t current_length = (output_start_row == 0) ? 0 : offsets[0];
+  int num_rows_processed = RowArrayAccessor::Visit_avx2(
+      rows_, column_id, num_rows_to_append, row_ids,
+      [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi,
+          __m256i num_bytes) {
+        current_length = Decode8Offset_avx2(offsets + i, current_length, num_bytes);
+      });
+  offsets[num_rows_processed] = current_length;
+  return num_rows_processed;
+}
+
+int RowArray::DecodeVarLength_avx2(ResizableArrayData* output, int output_start_row,
+                                   int column_id, int num_rows_to_append,
+                                   const uint32_t* row_ids) const {
+  RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids,
+                          [&](int i, const uint8_t* row_ptr, uint32_t num_bytes) {
+                            uint8_t* dst =
+                                output->mutable_data(2) +
+                                reinterpret_cast<const uint32_t*>(
+                                    output->mutable_data(1))[output_start_row + i];
+                            Decode1_avx2(dst, row_ptr, num_bytes);
+                          });
+  return num_rows_to_append;
+}
+
+int RowArray::DecodeNulls_avx2(ResizableArrayData* output, int output_start_row,
+                               int column_id, int num_rows_to_append,
+                               const uint32_t* row_ids) const {
+  DCHECK_EQ(output_start_row % 8, 0);
+
+  return RowArrayAccessor::VisitNulls_avx2(
+      rows_, column_id, num_rows_to_append, row_ids, [&](int i, uint64_t null_bytes) {
+        Decode8Null_avx2(output->mutable_data(0) + (output_start_row + i) / 8,
Review Comment:
```suggestion
DCHECK_EQ(i % 8, 0);
        Decode8Null_avx2(output->mutable_data(0) + (output_start_row + i) / 8,
```
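
   One more note on the sign-extension `NB` comments in this hunk: a self-contained illustration (made-up value, not part of the PR) of why the low half must go through `uint32_t` before the OR, while the half that gets shifted left by 32 is safe either way:
   ```cpp
   #include <cstdint>

   int main() {
     // _mm256_movemask_epi8 returns a signed 32-bit int. If bit 31 is set,
     // widening it straight to 64 bits sign-extends into the high half.
     int mask = static_cast<int>(0x80000001u);         // stand-in for a movemask result
     uint64_t polluted = static_cast<uint64_t>(mask);  // 0xFFFFFFFF80000001
     uint64_t clean = static_cast<uint32_t>(mask);     // 0x0000000080000001 (zero-extended)
     // Shifting left by 32 discards the sign-extended high bits, so the high
     // half of the combined mask is unaffected by the extension.
     uint64_t shifted = static_cast<uint64_t>(mask) << 32;  // 0x8000000100000000
     return (polluted != clean && (shifted >> 32) == 0x80000001u) ? 0 : 1;
   }
   ```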