This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 422618c  ARROW-8401: [C++] Add byte-stream-split AVX2/AVX512 implementation
422618c is described below

commit 422618cac557b12131b1320bebba66b03508c0d6
Author: Frank Du <[email protected]>
AuthorDate: Tue Apr 14 17:07:48 2020 +0200

    ARROW-8401: [C++] Add byte-stream-split AVX2/AVX512 implementation
    
    For decode, the AVX512 and AVX2 paths use an implementation similar to
    the SSE version.

    For encode, the float path is implemented for AVX2 and AVX512, and the
    double path for AVX512. The AVX2 double path falls back to SSE, since
    AVX2 currently has no epi16 permute support.

    BM_ByteStreamSplit results show speedups of up to 3x for AVX512 and 2x
    for AVX2.
    
    Signed-off-by: Frank Du <[email protected]>
    
    Closes #6899 from jianxind/split-avx-intrinsics
    
    Authored-by: Frank Du <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/util/byte_stream_split.h | 492 ++++++++++++++++++++++++++++++---
 cpp/src/parquet/encoding.cc            |  31 +--
 cpp/src/parquet/encoding_benchmark.cc  |  78 +++++-
 cpp/src/parquet/encoding_test.cc       |  28 +-
 4 files changed, 552 insertions(+), 77 deletions(-)
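
For reference, byte-stream-split scatters the k-th byte of every value into the k-th of
sizeof(T) byte streams; the SSE/AVX2/AVX512 kernels in this patch are vectorized versions
of that transpose. The sketch below is a minimal scalar illustration of the same
semantics, loosely mirroring the ByteStreamSplitEncodeScalar/ByteStreamSplitDecodeScalar
templates in byte_stream_split.h. The function names are illustrative and the code is not
part of the commit:

    // Scalar reference for byte-stream-split; illustrative only.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    template <typename T>
    void EncodeScalar(const uint8_t* raw_values, size_t num_values, uint8_t* out) {
      constexpr size_t kNumStreams = sizeof(T);
      for (size_t i = 0; i < num_values; ++i) {
        for (size_t j = 0; j < kNumStreams; ++j) {
          // Byte j of value i lands at position i of stream j.
          out[j * num_values + i] = raw_values[i * kNumStreams + j];
        }
      }
    }

    template <typename T>
    void DecodeScalar(const uint8_t* data, size_t num_values, size_t stride, T* out) {
      constexpr size_t kNumStreams = sizeof(T);
      for (size_t i = 0; i < num_values; ++i) {
        uint8_t gathered[kNumStreams];
        for (size_t b = 0; b < kNumStreams; ++b) {
          gathered[b] = data[b * stride + i];  // Re-gather byte b of value i.
        }
        std::memcpy(&out[i], gathered, kNumStreams);  // Reassemble the value.
      }
    }

Grouping bytes of equal significance into separate streams makes floating-point data much
more compressible, which is what Parquet's BYTE_STREAM_SPLIT encoding exploits.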

diff --git a/cpp/src/arrow/util/byte_stream_split.h b/cpp/src/arrow/util/byte_stream_split.h
index 8cf89ef..eff595e 100644
--- a/cpp/src/arrow/util/byte_stream_split.h
+++ b/cpp/src/arrow/util/byte_stream_split.h
@@ -15,8 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef ARROW_UTIL_BYTE_STREAM_SPLIT_H
-#define ARROW_UTIL_BYTE_STREAM_SPLIT_H
+#pragma once
 
 #include "arrow/util/sse_util.h"
 #include "arrow/util/ubsan.h"
@@ -24,28 +23,36 @@
 #include <stdint.h>
 #include <algorithm>
 
+#ifdef ARROW_HAVE_AVX2
+#include <immintrin.h>
+#endif  // ARROW_HAVE_AVX2
+
+#ifdef ARROW_HAVE_SSE4_2
+// Enable SIMD for the ByteStreamSplit encoder/decoder
+#define ARROW_HAVE_SIMD_SPLIT
+#endif  // ARROW_HAVE_SSE4_2
+
 namespace arrow {
 namespace util {
 namespace internal {
 
 #if defined(ARROW_HAVE_SSE4_2)
-
 template <typename T>
-void ByteStreamSplitDecodeSSE2(const uint8_t* data, int64_t num_values, int64_t stride,
+void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride,
                                T* out) {
   constexpr size_t kNumStreams = sizeof(T);
   static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
   constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
 
   const int64_t size = num_values * sizeof(T);
-  const int64_t block_size = sizeof(__m128i) * kNumStreams;
-  const int64_t num_blocks = size / block_size;
+  constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
+  const int64_t num_blocks = size / kBlockSize;
   uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
 
   // First handle suffix.
   // This helps catch if the simd-based processing overflows into the suffix
   // since almost surely a test would fail.
-  const int64_t num_processed_elements = (num_blocks * block_size) / kNumStreams;
+  const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
   for (int64_t i = num_processed_elements; i < num_values; ++i) {
     uint8_t gathered_byte_data[kNumStreams];
     for (size_t b = 0; b < kNumStreams; ++b) {
@@ -61,7 +68,7 @@ void ByteStreamSplitDecodeSSE2(const uint8_t* data, int64_t num_values, int64_t
   // Stage 2: ACAC ACAC BDBD BDBD
   // Stage 3: ABCD ABCD ABCD ABCD
   __m128i stage[kNumStreamsLog2 + 1U][kNumStreams];
-  const size_t half = kNumStreams / 2U;
+  constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
 
   for (int64_t i = 0; i < num_blocks; ++i) {
     for (size_t j = 0; j < kNumStreams; ++j) {
@@ -69,11 +76,11 @@ void ByteStreamSplitDecodeSSE2(const uint8_t* data, int64_t num_values, int64_t
           reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * stride]));
     }
     for (size_t step = 0; step < kNumStreamsLog2; ++step) {
-      for (size_t j = 0; j < half; ++j) {
+      for (size_t j = 0; j < kNumStreamsHalf; ++j) {
         stage[step + 1U][j * 2] =
-            _mm_unpacklo_epi8(stage[step][j], stage[step][half + j]);
+            _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
         stage[step + 1U][j * 2 + 1U] =
-            _mm_unpackhi_epi8(stage[step][j], stage[step][half + j]);
+            _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
       }
     }
     for (size_t j = 0; j < kNumStreams; ++j) {
@@ -85,27 +92,28 @@ void ByteStreamSplitDecodeSSE2(const uint8_t* data, int64_t num_values, int64_t
 }
 
 template <typename T>
-void ByteStreamSplitEncodeSSE2(const uint8_t* raw_values, const size_t num_values,
+void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_values,
                                uint8_t* output_buffer_raw) {
-  constexpr size_t num_streams = sizeof(T);
-  static_assert(num_streams == 4U || num_streams == 8U, "Invalid number of streams.");
-  __m128i stage[3][num_streams];
-  __m128i final_result[num_streams];
+  constexpr size_t kNumStreams = sizeof(T);
+  static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+  __m128i stage[3][kNumStreams];
+  __m128i final_result[kNumStreams];
 
   const size_t size = num_values * sizeof(T);
-  const size_t block_size = sizeof(__m128i) * num_streams;
-  const size_t num_blocks = size / block_size;
+  constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
+  const size_t num_blocks = size / kBlockSize;
   const __m128i* raw_values_sse = reinterpret_cast<const __m128i*>(raw_values);
-  __m128i* output_buffer_streams[num_streams];
-  for (size_t i = 0; i < num_streams; ++i) {
+  __m128i* output_buffer_streams[kNumStreams];
+  for (size_t i = 0; i < kNumStreams; ++i) {
     output_buffer_streams[i] =
         reinterpret_cast<__m128i*>(&output_buffer_raw[num_values * i]);
   }
 
-  const size_t num_processed_elements = (num_blocks * block_size) / sizeof(T);
+  // First handle suffix.
+  const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
   for (size_t i = num_processed_elements; i < num_values; ++i) {
-    for (size_t j = 0U; j < num_streams; ++j) {
-      const uint8_t byte_in_value = raw_values[i * num_streams + j];
+    for (size_t j = 0U; j < kNumStreams; ++j) {
+      const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
       output_buffer_raw[j * num_values + i] = byte_in_value;
     }
   }
@@ -124,22 +132,22 @@ void ByteStreamSplitEncodeSSE2(const uint8_t* raw_values, const size_t num_value
   //   0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ...
   for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
     // First copy the data to stage 0.
-    for (size_t i = 0; i < num_streams; ++i) {
-      stage[0][i] = _mm_loadu_si128(&raw_values_sse[block_index * num_streams + i]);
+    for (size_t i = 0; i < kNumStreams; ++i) {
+      stage[0][i] = _mm_loadu_si128(&raw_values_sse[block_index * kNumStreams + i]);
     }
 
     // The shuffling of bytes is performed through the unpack intrinsics.
     // In my measurements this gives better performance than an implementation
     // which uses the shuffle intrinsics.
     for (size_t stage_lvl = 0; stage_lvl < 2U; ++stage_lvl) {
-      for (size_t i = 0; i < num_streams / 2U; ++i) {
+      for (size_t i = 0; i < kNumStreams / 2U; ++i) {
         stage[stage_lvl + 1][i * 2] =
             _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
         stage[stage_lvl + 1][i * 2 + 1] =
             _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
       }
     }
-    if (num_streams == 8U) {
+    if (kNumStreams == 8U) {
       // This is the path for double.
       __m128i tmp[8];
       for (size_t i = 0; i < 4; ++i) {
@@ -163,21 +171,421 @@ void ByteStreamSplitEncodeSSE2(const uint8_t* raw_values, const size_t num_value
         final_result[i * 2 + 1] = _mm_unpackhi_epi64(tmp[i], tmp[i + 2]);
       }
     }
-    for (size_t i = 0; i < num_streams; ++i) {
+    for (size_t i = 0; i < kNumStreams; ++i) {
       _mm_storeu_si128(&output_buffer_streams[i][block_index], final_result[i]);
     }
   }
 }
+#endif  // ARROW_HAVE_SSE4_2
+
+#if defined(ARROW_HAVE_AVX2)
+template <typename T>
+void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t stride,
+                               T* out) {
+  constexpr size_t kNumStreams = sizeof(T);
+  static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+  constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+  const int64_t size = num_values * sizeof(T);
+  constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
+  if (size < kBlockSize)  // Back to SSE for small size
+    return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
+  const int64_t num_blocks = size / kBlockSize;
+  uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+  // First handle suffix.
+  const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+  for (int64_t i = num_processed_elements; i < num_values; ++i) {
+    uint8_t gathered_byte_data[kNumStreams];
+    for (size_t b = 0; b < kNumStreams; ++b) {
+      const size_t byte_index = b * stride + i;
+      gathered_byte_data[b] = data[byte_index];
+    }
+    out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+  }
+
+  // Processed hierarchically using unpack intrinsics, then permute intrinsics.
+  __m256i stage[kNumStreamsLog2 + 1U][kNumStreams];
+  __m256i final_result[kNumStreams];
+  constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+  for (int64_t i = 0; i < num_blocks; ++i) {
+    for (size_t j = 0; j < kNumStreams; ++j) {
+      stage[0][j] = _mm256_loadu_si256(
+          reinterpret_cast<const __m256i*>(&data[i * sizeof(__m256i) + j * stride]));
+    }
+
+    for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+      for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+        stage[step + 1U][j * 2] =
+            _mm256_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+        stage[step + 1U][j * 2 + 1U] =
+            _mm256_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+      }
+    }
+
+    if (kNumStreams == 8U) {
+      // path for double, 128i index:
+      //   {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B},
+      //   {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F},
+      final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+                                                  stage[kNumStreamsLog2][1], 0b00100000);
+      final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+                                                  stage[kNumStreamsLog2][3], 0b00100000);
+      final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
+                                                  stage[kNumStreamsLog2][5], 0b00100000);
+      final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
+                                                  stage[kNumStreamsLog2][7], 0b00100000);
+      final_result[4] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+                                                  stage[kNumStreamsLog2][1], 0b00110001);
+      final_result[5] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+                                                  stage[kNumStreamsLog2][3], 0b00110001);
+      final_result[6] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
+                                                  stage[kNumStreamsLog2][5], 0b00110001);
+      final_result[7] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
+                                                  stage[kNumStreamsLog2][7], 0b00110001);
+    } else {
+      // path for float, 128i index:
+      //   {0x00, 0x04}, {0x01, 0x05}, {0x02, 0x06}, {0x03, 0x07}
+      final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+                                                  stage[kNumStreamsLog2][1], 0b00100000);
+      final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+                                                  stage[kNumStreamsLog2][3], 0b00100000);
+      final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+                                                  stage[kNumStreamsLog2][1], 0b00110001);
+      final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+                                                  stage[kNumStreamsLog2][3], 0b00110001);
+    }
+
+    for (size_t j = 0; j < kNumStreams; ++j) {
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(
+                              &output_data[(i * kNumStreams + j) * sizeof(__m256i)]),
+                          final_result[j]);
+    }
+  }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_values,
+                               uint8_t* output_buffer_raw) {
+  constexpr size_t kNumStreams = sizeof(T);
+  static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+  if (kNumStreams == 8U)  // Back to SSE, currently no path for double.
+    return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+
+  const size_t size = num_values * sizeof(T);
+  constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
+  if (size < kBlockSize)  // Back to SSE for small size
+    return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+  const size_t num_blocks = size / kBlockSize;
+  const __m256i* raw_values_simd = reinterpret_cast<const __m256i*>(raw_values);
+  __m256i* output_buffer_streams[kNumStreams];
+
+  for (size_t i = 0; i < kNumStreams; ++i) {
+    output_buffer_streams[i] =
+        reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]);
+  }
+
+  // First handle suffix.
+  const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+  for (size_t i = num_processed_elements; i < num_values; ++i) {
+    for (size_t j = 0U; j < kNumStreams; ++j) {
+      const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+      output_buffer_raw[j * num_values + i] = byte_in_value;
+    }
+  }
+
+  // Path for float.
+  // 1. Processed hierarchically to 32i blocks using the unpack intrinsics.
+  // 2. Pack 128i blocks using _mm256_permutevar8x32_epi32.
+  // 3. Pack the final 256i block with _mm256_permute2x128_si256.
+  constexpr size_t kNumUnpack = 3U;
+  __m256i stage[kNumUnpack + 1][kNumStreams];
+  static const __m256i kPermuteMask =
+      _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+  __m256i permute[kNumStreams];
+  __m256i final_result[kNumStreams];
+
+  for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+    for (size_t i = 0; i < kNumStreams; ++i) {
+      stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]);
+    }
+
+    for (size_t stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) {
+      for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+        stage[stage_lvl + 1][i * 2] =
+            _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+        stage[stage_lvl + 1][i * 2 + 1] =
+            _mm256_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+      }
+    }
+
+    for (size_t i = 0; i < kNumStreams; ++i) {
+      permute[i] = _mm256_permutevar8x32_epi32(stage[kNumUnpack][i], kPermuteMask);
+    }
+
+    final_result[0] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00100000);
+    final_result[1] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00110001);
+    final_result[2] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00100000);
+    final_result[3] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00110001);
+
+    for (size_t i = 0; i < kNumStreams; ++i) {
+      _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]);
+    }
+  }
+}
+#endif  // ARROW_HAVE_AVX2
+
+#if defined(ARROW_HAVE_AVX512)
+template <typename T>
+void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride,
+                                 T* out) {
+  constexpr size_t kNumStreams = sizeof(T);
+  static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+  constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+  const int64_t size = num_values * sizeof(T);
+  constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
+  if (size < kBlockSize)  // Back to AVX2 for small size
+    return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
+  const int64_t num_blocks = size / kBlockSize;
+  uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+  // First handle suffix.
+  const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+  for (int64_t i = num_processed_elements; i < num_values; ++i) {
+    uint8_t gathered_byte_data[kNumStreams];
+    for (size_t b = 0; b < kNumStreams; ++b) {
+      const size_t byte_index = b * stride + i;
+      gathered_byte_data[b] = data[byte_index];
+    }
+    out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+  }
+
+  // Processed hierarchically using the unpack intrinsics, then two shuffles.
+  __m512i stage[kNumStreamsLog2 + 1U][kNumStreams];
+  __m512i shuffle[kNumStreams];
+  __m512i final_result[kNumStreams];
+  constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+  for (int64_t i = 0; i < num_blocks; ++i) {
+    for (size_t j = 0; j < kNumStreams; ++j) {
+      stage[0][j] = _mm512_loadu_si512(
+          reinterpret_cast<const __m512i*>(&data[i * sizeof(__m512i) + j * stride]));
+    }
+
+    for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+      for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+        stage[step + 1U][j * 2] =
+            _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+        stage[step + 1U][j * 2 + 1U] =
+            _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+      }
+    }
+
+    if (kNumStreams == 8U) {
+      // path for double, 128i index:
+      // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
+      // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
+      // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
+      // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
+      shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+                                        stage[kNumStreamsLog2][1], 0b01000100);
+      shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+                                        stage[kNumStreamsLog2][3], 0b01000100);
+      shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
+                                        stage[kNumStreamsLog2][5], 0b01000100);
+      shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
+                                        stage[kNumStreamsLog2][7], 0b01000100);
+      shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+                                        stage[kNumStreamsLog2][1], 0b11101110);
+      shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+                                        stage[kNumStreamsLog2][3], 0b11101110);
+      shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
+                                        stage[kNumStreamsLog2][5], 0b11101110);
+      shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
+                                        stage[kNumStreamsLog2][7], 0b11101110);
+
+      final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+      final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+      final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+      final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+      final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
+      final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
+      final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
+      final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
+    } else {
+      // path for float, 128i index:
+      // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
+      // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
+      shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+                                        stage[kNumStreamsLog2][1], 0b01000100);
+      shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+                                        stage[kNumStreamsLog2][3], 0b01000100);
+      shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+                                        stage[kNumStreamsLog2][1], 0b11101110);
+      shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+                                        stage[kNumStreamsLog2][3], 0b11101110);
+
+      final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+      final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+      final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+      final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+    }
+
+    for (size_t j = 0; j < kNumStreams; ++j) {
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(
+                              &output_data[(i * kNumStreams + j) * sizeof(__m512i)]),
+                          final_result[j]);
+    }
+  }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_values,
+                                 uint8_t* output_buffer_raw) {
+  constexpr size_t kNumStreams = sizeof(T);
+  static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+  const size_t size = num_values * sizeof(T);
+  constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams;
+  if (size < kBlockSize)  // Back to AVX2 for small size
+    return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
+
+  const size_t num_blocks = size / kBlockSize;
+  const __m512i* raw_values_simd = reinterpret_cast<const __m512i*>(raw_values);
+  __m512i* output_buffer_streams[kNumStreams];
+  for (size_t i = 0; i < kNumStreams; ++i) {
+    output_buffer_streams[i] =
+        reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]);
+  }
 
+  // First handle suffix.
+  const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+  for (size_t i = num_processed_elements; i < num_values; ++i) {
+    for (size_t j = 0U; j < kNumStreams; ++j) {
+      const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+      output_buffer_raw[j * num_values + i] = byte_in_value;
+    }
+  }
+
+  constexpr size_t KNumUnpack = (kNumStreams == 8U) ? 2U : 3U;
+  __m512i final_result[kNumStreams];
+  __m512i unpack[KNumUnpack + 1][kNumStreams];
+  __m512i permutex[kNumStreams];
+  __m512i permutex_mask;
+  if (kNumStreams == 8U) {
+    // Use _mm512_set_epi32 here; some older gcc versions have no _mm512_set_epi16.
+    permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006,
+                                     0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004,
+                                     0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002,
+                                     0x00190011, 0x00090001, 0x00180010, 0x00080000);
+  } else {
+    permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D,
+                                     0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00);
+  }
+
+  for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+    for (size_t i = 0; i < kNumStreams; ++i) {
+      unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]);
+    }
+
+    for (size_t unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) {
+      for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+        unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8(
+            unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
+        unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8(
+            unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
+      }
+    }
+
+    if (kNumStreams == 8U) {
+      // path for double
+      // 1. unpack to epi16 block
+      // 2. permutexvar_epi16 to 128i block
+      // 3. shuffle 128i to final 512i target, index:
+      //   {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
+      //   {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
+      //   {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
+      //   {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
+      for (size_t i = 0; i < kNumStreams; ++i)
+        permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]);
+
+      __m512i shuffle[kNumStreams];
+      shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
+      shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100);
+      shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
+      shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110);
+      shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
+      shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100);
+      shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
+      shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110);
+
+      final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+      final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+      final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+      final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+      final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
+      final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
+      final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
+      final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
+    } else {
+      // Path for float.
+      // 1. Processed hierarchically to 32i blocks using the unpack intrinsics.
+      // 2. Pack 128i blocks using _mm512_permutexvar_epi32.
+      // 3. Pack the final 512i block with _mm512_shuffle_i32x4.
+      for (size_t i = 0; i < kNumStreams; ++i)
+        permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]);
+
+      final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
+      final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
+      final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
+      final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
+    }
+
+    for (size_t i = 0; i < kNumStreams; ++i) {
+      _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]);
+    }
+  }
+}
+#endif  // ARROW_HAVE_AVX512
+
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+template <typename T>
+void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
+                                      int64_t stride, T* out) {
+#if defined(ARROW_HAVE_AVX512)
+  return ByteStreamSplitDecodeAvx512(data, num_values, stride, out);
+#elif defined(ARROW_HAVE_AVX2)
+  return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
+#elif defined(ARROW_HAVE_SSE4_2)
+  return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
+#else
+#error "ByteStreamSplitDecodeSimd not implemented"
+#endif
+}
+
+template <typename T>
+void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const size_t num_values,
+                                      uint8_t* output_buffer_raw) {
+#if defined(ARROW_HAVE_AVX512)
+  return ByteStreamSplitEncodeAvx512<T>(raw_values, num_values, output_buffer_raw);
+#elif defined(ARROW_HAVE_AVX2)
+  return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
+#elif defined(ARROW_HAVE_SSE4_2)
+  return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+#else
+#error "ByteStreamSplitEncodeSimd not implemented"
+#endif
+}
 #endif
 
 template <typename T>
 void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const size_t num_values,
                                  uint8_t* output_buffer_raw) {
-  constexpr size_t num_streams = sizeof(T);
+  constexpr size_t kNumStreams = sizeof(T);
   for (size_t i = 0U; i < num_values; ++i) {
-    for (size_t j = 0U; j < num_streams; ++j) {
-      const uint8_t byte_in_value = raw_values[i * num_streams + j];
+    for (size_t j = 0U; j < kNumStreams; ++j) {
+      const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
       output_buffer_raw[j * num_values + i] = byte_in_value;
     }
   }
@@ -198,8 +606,26 @@ void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_
   }
 }
 
+template <typename T>
+void inline ByteStreamSplitEncode(const uint8_t* raw_values, const size_t num_values,
+                                  uint8_t* output_buffer_raw) {
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+  return ByteStreamSplitEncodeSimd<T>(raw_values, num_values, output_buffer_raw);
+#else
+  return ByteStreamSplitEncodeScalar<T>(raw_values, num_values, output_buffer_raw);
+#endif
+}
+
+template <typename T>
+void inline ByteStreamSplitDecode(const uint8_t* data, int64_t num_values, int64_t stride,
+                                  T* out) {
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+  return ByteStreamSplitDecodeSimd(data, num_values, stride, out);
+#else
+  return ByteStreamSplitDecodeScalar(data, num_values, stride, out);
+#endif
+}
+
 }  // namespace internal
 }  // namespace util
 }  // namespace arrow
-
-#endif  // ARROW_UTIL_BYTE_STREAM_SPLIT_H
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 1988c36..f7df986 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -861,20 +861,15 @@ std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
   uint8_t* output_buffer_raw = output_buffer->mutable_data();
   const size_t num_values = values_.length();
   const uint8_t* raw_values = reinterpret_cast<const uint8_t*>(values_.data());
-#if defined(ARROW_HAVE_SSE4_2)
-  arrow::util::internal::ByteStreamSplitEncodeSSE2<T>(raw_values, num_values,
-                                                      output_buffer_raw);
-#else
-  arrow::util::internal::ByteStreamSplitEncodeScalar<T>(raw_values, num_values,
-                                                        output_buffer_raw);
-#endif
+  arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values,
+                                                  output_buffer_raw);
   values_.Reset();
   return std::move(output_buffer);
 }
 
 template <typename DType>
 void ByteStreamSplitEncoder<DType>::Put(const T* buffer, int num_values) {
-  PARQUET_THROW_NOT_OK(values_.Append(buffer, num_values));
+  if (num_values > 0) PARQUET_THROW_NOT_OK(values_.Append(buffer, num_values));
 }
 
 template <typename DType>
@@ -2344,13 +2339,8 @@ int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
   const int num_decoded_previously = num_values_in_buffer_ - num_values_;
   const uint8_t* data = data_ + num_decoded_previously;
 
-#if defined(ARROW_HAVE_SSE4_2)
-  arrow::util::internal::ByteStreamSplitDecodeSSE2<T>(data, values_to_decode,
-                                                      num_values_in_buffer_, buffer);
-#else
-  arrow::util::internal::ByteStreamSplitDecodeScalar<T>(data, values_to_decode,
-                                                        num_values_in_buffer_, buffer);
-#endif
+  arrow::util::internal::ByteStreamSplitDecode<T>(data, values_to_decode,
+                                                  num_values_in_buffer_, buffer);
   num_values_ -= values_to_decode;
   len_ -= sizeof(T) * values_to_decode;
   return values_to_decode;
@@ -2372,12 +2362,12 @@ int ByteStreamSplitDecoder<DType>::DecodeArrow(
   const uint8_t* data = data_ + num_decoded_previously;
   int offset = 0;
 
-#if defined(ARROW_HAVE_SSE4_2)
+#if defined(ARROW_HAVE_SIMD_SPLIT)
   // Use fast decoding into intermediate buffer.  This will also decode
   // some null values, but it's fast enough that we don't care.
   T* decode_out = EnsureDecodeBuffer(values_decoded);
-  arrow::util::internal::ByteStreamSplitDecodeSSE2<T>(data, values_decoded,
-                                                      num_values_in_buffer_, decode_out);
+  arrow::util::internal::ByteStreamSplitDecode<T>(data, values_decoded,
+                                                  num_values_in_buffer_, decode_out);
 
   // XXX If null_count is 0, we could even append in bulk or decode directly into
   // builder
@@ -2389,9 +2379,6 @@ int ByteStreamSplitDecoder<DType>::DecodeArrow(
       builder->UnsafeAppendNull();
     }
   };
-
-  VisitNullBitmapInline(valid_bits, valid_bits_offset, num_values, null_count,
-                        std::move(decode_value));
 #else
   auto decode_value = [&](bool is_valid) {
     if (is_valid) {
@@ -2406,10 +2393,10 @@ int ByteStreamSplitDecoder<DType>::DecodeArrow(
       builder->UnsafeAppendNull();
     }
   };
+#endif
 
   VisitNullBitmapInline(valid_bits, valid_bits_offset, num_values, null_count,
                         std::move(decode_value));
-#endif
 
   num_values_ -= values_decoded;
   len_ -= sizeof(T) * values_decoded;
diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc
index c272aa7..c8e6322 100644
--- a/cpp/src/parquet/encoding_benchmark.cc
+++ b/cpp/src/parquet/encoding_benchmark.cc
@@ -252,30 +252,84 @@ BENCHMARK(BM_ByteStreamSplitEncode_Float_Scalar)->Range(MIN_RANGE, MAX_RANGE);
 BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE);
 
 #if defined(ARROW_HAVE_SSE4_2)
-static void BM_ByteStreamSplitDecode_Float_SSE2(benchmark::State& state) {
+static void BM_ByteStreamSplitDecode_Float_Sse2(benchmark::State& state) {
   BM_ByteStreamSplitDecode<float>(
-      state, arrow::util::internal::ByteStreamSplitDecodeSSE2<float>);
+      state, arrow::util::internal::ByteStreamSplitDecodeSse2<float>);
 }
 
-static void BM_ByteStreamSplitDecode_Double_SSE2(benchmark::State& state) {
+static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) {
   BM_ByteStreamSplitDecode<double>(
-      state, arrow::util::internal::ByteStreamSplitDecodeSSE2<double>);
+      state, arrow::util::internal::ByteStreamSplitDecodeSse2<double>);
 }
 
-static void BM_ByteStreamSplitEncode_Float_SSE2(benchmark::State& state) {
+static void BM_ByteStreamSplitEncode_Float_Sse2(benchmark::State& state) {
   BM_ByteStreamSplitEncode<float>(
-      state, arrow::util::internal::ByteStreamSplitEncodeSSE2<float>);
+      state, arrow::util::internal::ByteStreamSplitEncodeSse2<float>);
 }
 
-static void BM_ByteStreamSplitEncode_Double_SSE2(benchmark::State& state) {
+static void BM_ByteStreamSplitEncode_Double_Sse2(benchmark::State& state) {
   BM_ByteStreamSplitEncode<double>(
-      state, arrow::util::internal::ByteStreamSplitEncodeSSE2<double>);
+      state, arrow::util::internal::ByteStreamSplitEncodeSse2<double>);
 }
 
-BENCHMARK(BM_ByteStreamSplitDecode_Float_SSE2)->Range(MIN_RANGE, MAX_RANGE);
-BENCHMARK(BM_ByteStreamSplitDecode_Double_SSE2)->Range(MIN_RANGE, MAX_RANGE);
-BENCHMARK(BM_ByteStreamSplitEncode_Float_SSE2)->Range(MIN_RANGE, MAX_RANGE);
-BENCHMARK(BM_ByteStreamSplitEncode_Double_SSE2)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitDecode_Float_Sse2)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitDecode_Double_Sse2)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitEncode_Float_Sse2)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Range(MIN_RANGE, MAX_RANGE);
+#endif
+
+#if defined(ARROW_HAVE_AVX2)
+static void BM_ByteStreamSplitDecode_Float_Avx2(benchmark::State& state) {
+  BM_ByteStreamSplitDecode<float>(
+      state, arrow::util::internal::ByteStreamSplitDecodeAvx2<float>);
+}
+
+static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) {
+  BM_ByteStreamSplitDecode<double>(
+      state, arrow::util::internal::ByteStreamSplitDecodeAvx2<double>);
+}
+
+static void BM_ByteStreamSplitEncode_Float_Avx2(benchmark::State& state) {
+  BM_ByteStreamSplitEncode<float>(
+      state, arrow::util::internal::ByteStreamSplitEncodeAvx2<float>);
+}
+
+static void BM_ByteStreamSplitEncode_Double_Avx2(benchmark::State& state) {
+  BM_ByteStreamSplitEncode<double>(
+      state, arrow::util::internal::ByteStreamSplitEncodeAvx2<double>);
+}
+
+BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE);
+#endif
+
+#if defined(ARROW_HAVE_AVX512)
+static void BM_ByteStreamSplitDecode_Float_Avx512(benchmark::State& state) {
+  BM_ByteStreamSplitDecode<float>(
+      state, arrow::util::internal::ByteStreamSplitDecodeAvx512<float>);
+}
+
+static void BM_ByteStreamSplitDecode_Double_Avx512(benchmark::State& state) {
+  BM_ByteStreamSplitDecode<double>(
+      state, arrow::util::internal::ByteStreamSplitDecodeAvx512<double>);
+}
+
+static void BM_ByteStreamSplitEncode_Float_Avx512(benchmark::State& state) {
+  BM_ByteStreamSplitEncode<float>(
+      state, arrow::util::internal::ByteStreamSplitEncodeAvx512<float>);
+}
+
+static void BM_ByteStreamSplitEncode_Double_Avx512(benchmark::State& state) {
+  BM_ByteStreamSplitEncode<double>(
+      state, arrow::util::internal::ByteStreamSplitEncodeAvx512<double>);
+}
+
+BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE);
 #endif
 
 template <typename Type>
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index 216da3e..6acc912 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -1091,19 +1091,27 @@ typedef ::testing::Types<FloatType, DoubleType> ByteStreamSplitTypes;
 TYPED_TEST_SUITE(TestByteStreamSplitEncoding, ByteStreamSplitTypes);
 
 TYPED_TEST(TestByteStreamSplitEncoding, BasicRoundTrip) {
-  // We need to test with different sizes to guarantee that the SIMD implementation
-  // can handle both inputs with size divisible by 4/8 and sizes which would
-  // require a scalar loop for the suffix.
-
-  // Exercise only the SIMD loop.
-  ASSERT_NO_FATAL_FAILURE(this->Execute(256, 1));
-
-  // Exercise both.
-  ASSERT_NO_FATAL_FAILURE(this->Execute(1337, 1));
-
   for (int values = 0; values < 32; ++values) {
     ASSERT_NO_FATAL_FAILURE(this->Execute(values, 1));
   }
+
+  // We need to test with different sizes to guarantee that the SIMD implementation
+  // can handle both inputs with size divisible by 4/8 and sizes which would
+  // require a scalar loop for the suffix.
+  constexpr size_t kSuffixSize = 7;
+  constexpr size_t kAvx2Size = 32;    // sizeof(__m256i) for AVX2
+  constexpr size_t kAvx512Size = 64;  // sizeof(__m512i) for AVX512
+  constexpr size_t kMultiSimdSize = kAvx512Size * 7;
+
+  // Exercise only one SIMD loop. SSE and AVX2 are covered by the loop above.
+  ASSERT_NO_FATAL_FAILURE(this->Execute(kAvx512Size, 1));
+  // Exercise one SIMD loop with a suffix. SSE is covered by the loop above.
+  ASSERT_NO_FATAL_FAILURE(this->Execute(kAvx2Size + kSuffixSize, 1));
+  ASSERT_NO_FATAL_FAILURE(this->Execute(kAvx512Size + kSuffixSize, 1));
+  // Exercise multi SIMD loop.
+  ASSERT_NO_FATAL_FAILURE(this->Execute(kMultiSimdSize, 1));
+  // Exercise multi SIMD loop with suffix.
+  ASSERT_NO_FATAL_FAILURE(this->Execute(kMultiSimdSize + kSuffixSize, 1));
 }
 
 TYPED_TEST(TestByteStreamSplitEncoding, RoundTripSingleElement) {
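
For completeness, here is a minimal sketch of how the dispatch helpers added in
byte_stream_split.h could be exercised in a round trip. It assumes an Arrow C++ build
with the header on the include path; in the library itself the ByteStreamSplitEncoder
and ByteStreamSplitDecoder changes in encoding.cc above are the actual entry points,
so this is only an illustration:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    #include "arrow/util/byte_stream_split.h"

    int main() {
      const size_t num_values = 1000;  // Large enough to reach the SIMD block loops.
      std::vector<float> values(num_values);
      for (size_t i = 0; i < num_values; ++i) values[i] = 0.5f * static_cast<float>(i);

      // Encode: scatter each value's bytes across sizeof(float) streams.
      std::vector<uint8_t> encoded(num_values * sizeof(float));
      arrow::util::internal::ByteStreamSplitEncode<float>(
          reinterpret_cast<const uint8_t*>(values.data()), num_values, encoded.data());

      // Decode: the stride is the length of one byte stream, i.e. the value count,
      // matching how encoding.cc passes num_values_in_buffer_.
      std::vector<float> decoded(num_values);
      arrow::util::internal::ByteStreamSplitDecode<float>(
          encoded.data(), static_cast<int64_t>(num_values),
          static_cast<int64_t>(num_values), decoded.data());

      assert(decoded == values);
      return 0;
    }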
