Re: [PR] GH-38560: [C++][Parquet] Rewrite BYTE_STREAM_SPLIT SSE optimizations using xsimd [arrow]

via GitHub Fri, 08 Mar 2024 06:39:28 -0800


pitrou commented on code in PR #40335:
URL: https://github.com/apache/arrow/pull/40335#discussion_r1517793488



##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -25,23 +25,26 @@
 #include <cassert>
 #include <cstdint>
 
-#ifdef ARROW_HAVE_SSE4_2
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+#include <xsimd/xsimd.hpp>
 #define ARROW_HAVE_SIMD_SPLIT
-#endif  // ARROW_HAVE_SSE4_2
+#endif
 
 namespace arrow::util::internal {
 
 //
 // SIMD implementations
 //
 
-#if defined(ARROW_HAVE_SSE4_2)
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
 template <int kNumStreams>
-void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, 
int64_t stride,
+void ByteStreamSplitDecode128B(const uint8_t* data, int64_t num_values, 
int64_t stride,

Review Comment:
   Ok, nit: rename this to `ByteStreamSplitDecodeSimd128`?



##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -139,40 +146,72 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, 
const int64_t num_valu
     for (int stage_lvl = 0; stage_lvl < 2; ++stage_lvl) {
       for (int i = 0; i < kNumStreams / 2; ++i) {
         stage[stage_lvl + 1][i * 2] =
-            _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 
+ 1]);
+            xsimd::zip_lo(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 
1]);
         stage[stage_lvl + 1][i * 2 + 1] =
-            _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 
+ 1]);
+            xsimd::zip_hi(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 
1]);
       }
     }
     if constexpr (kNumStreams == 8) {
-      // This is the path for double.
-      __m128i tmp[8];
+      // This is the path for 64bits data.
+      simd_batch tmp[8];
+      using int32_batch = xsimd::batch<int32_t, simd_arch>;
+      // This is a workaround, see: 
https://github.com/xtensor-stack/xsimd/issues/735
+      auto from_int32_batch = [](int32_batch from) -> simd_batch {
+        simd_batch dest;
+        memcpy(&dest, &from, sizeof(simd_batch));
+        return dest;
+      };
+      auto to_int32_batch = [](simd_batch from) -> int32_batch {
+        int32_batch dest;
+        memcpy(&dest, &from, sizeof(simd_batch));
+        return dest;
+      };
       for (int i = 0; i < 4; ++i) {
-        tmp[i * 2] = _mm_unpacklo_epi32(stage[2][i], stage[2][i + 4]);
-        tmp[i * 2 + 1] = _mm_unpackhi_epi32(stage[2][i], stage[2][i + 4]);
+        tmp[i * 2] = from_int32_batch(
+            xsimd::zip_lo(to_int32_batch(stage[2][i]), 
to_int32_batch(stage[2][i + 4])));
+        tmp[i * 2 + 1] = from_int32_batch(
+            xsimd::zip_hi(to_int32_batch(stage[2][i]), 
to_int32_batch(stage[2][i + 4])));
       }
       for (int i = 0; i < 4; ++i) {
-        final_result[i * 2] = _mm_unpacklo_epi32(tmp[i], tmp[i + 4]);
-        final_result[i * 2 + 1] = _mm_unpackhi_epi32(tmp[i], tmp[i + 4]);
+        final_result[i * 2] = from_int32_batch(
+            xsimd::zip_lo(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4])));
+        final_result[i * 2 + 1] = from_int32_batch(
+            xsimd::zip_hi(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4])));
       }
     } else {
-      // this is the path for float.
-      __m128i tmp[4];
+      // This is the path for 32bits data.
+      using int64_batch = xsimd::batch<int64_t, simd_arch>;

Review Comment:
   Similar question: can it just be `xsimd::batch<int64_t, 2>`?



##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -139,40 +146,72 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, 
const int64_t num_valu
     for (int stage_lvl = 0; stage_lvl < 2; ++stage_lvl) {
       for (int i = 0; i < kNumStreams / 2; ++i) {
         stage[stage_lvl + 1][i * 2] =
-            _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 
+ 1]);
+            xsimd::zip_lo(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 
1]);
         stage[stage_lvl + 1][i * 2 + 1] =
-            _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 
+ 1]);
+            xsimd::zip_hi(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 
1]);
       }
     }
     if constexpr (kNumStreams == 8) {
-      // This is the path for double.
-      __m128i tmp[8];
+      // This is the path for 64bits data.
+      simd_batch tmp[8];
+      using int32_batch = xsimd::batch<int32_t, simd_arch>;

Review Comment:
   Do you need `simd_arch` or can it just be `xsimd::batch<int32_t, 4>`?



##########
cpp/src/arrow/util/byte_stream_split_test.cc:
##########
@@ -75,8 +75,8 @@ class TestByteStreamSplitSpecialized : public ::testing::Test 
{
     decode_funcs_.push_back({"simd", &ByteStreamSplitDecodeSimd<kWidth>});
 #endif
 #if defined(ARROW_HAVE_SSE4_2)
-    encode_funcs_.push_back({"sse2", &ByteStreamSplitEncodeSse2<kWidth>});
-    decode_funcs_.push_back({"sse2", &ByteStreamSplitDecodeSse2<kWidth>});
+    encode_funcs_.push_back({"sse2", &ByteStreamSplitEncode128B<kWidth>});

Review Comment:
   Replace "sse2" with "simd128"?



##########
cpp/src/arrow/util/byte_stream_split_test.cc:
##########
@@ -75,8 +75,8 @@ class TestByteStreamSplitSpecialized : public ::testing::Test 
{
     decode_funcs_.push_back({"simd", &ByteStreamSplitDecodeSimd<kWidth>});
 #endif
 #if defined(ARROW_HAVE_SSE4_2)

Review Comment:
   Should also be enabled for NEON, no?



##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -117,20 +122,22 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, 
const int64_t num_valu
   // The current shuffling algorithm diverges for float and double types but 
the compiler
   // should be able to remove the branch since only one path is taken for each 
template
   // instantiation.
-  // Example run for floats:
+  // Example run for 32-bit variables:
   // Step 0, copy:
   //   0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ...
-  // Step 1: _mm_unpacklo_epi8 and mm_unpackhi_epi8:
+  // Step 1: simd_batch<int8_t, 8>::xip_lo and simd_batch<int8_t, 8>::xip_hi:

Review Comment:
   "zip_lo", not "xip_lo"?



##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -64,46 +67,48 @@ void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t 
num_values, int64_t
   // Stage 1: AAAA BBBB CCCC DDDD
   // Stage 2: ACAC ACAC BDBD BDBD
   // Stage 3: ABCD ABCD ABCD ABCD
-  __m128i stage[kNumStreamsLog2 + 1][kNumStreams];
+  simd_batch stage[kNumStreamsLog2 + 1][kNumStreams];
   constexpr int kNumStreamsHalf = kNumStreams / 2U;
 
   for (int64_t i = 0; i < num_blocks; ++i) {
     for (int j = 0; j < kNumStreams; ++j) {
-      stage[0][j] = _mm_loadu_si128(
-          reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * 
stride]));
+      stage[0][j] =
+          simd_batch::load_unaligned(&data[i * sizeof(simd_batch) + j * 
stride]);
     }
     for (int step = 0; step < kNumStreamsLog2; ++step) {
       for (int j = 0; j < kNumStreamsHalf; ++j) {
         stage[step + 1U][j * 2] =
-            _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + 
j]);
+            xsimd::zip_lo(stage[step][j], stage[step][kNumStreamsHalf + j]);
         stage[step + 1U][j * 2 + 1U] =
-            _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + 
j]);
+            xsimd::zip_hi(stage[step][j], stage[step][kNumStreamsHalf + j]);
       }
     }
     for (int j = 0; j < kNumStreams; ++j) {
-      _mm_storeu_si128(
-          reinterpret_cast<__m128i*>(out + (i * kNumStreams + j) * 
sizeof(__m128i)),
+      xsimd::store_unaligned(
+          reinterpret_cast<int8_t*>(out + (i * kNumStreams + j) * 
sizeof(simd_batch)),
           stage[kNumStreamsLog2][j]);
     }
   }
 }
 
 template <int kNumStreams>
-void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const int64_t 
num_values,
+void ByteStreamSplitEncode128B(const uint8_t* raw_values, const int64_t 
num_values,

Review Comment:
   Same here.



##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -117,20 +122,22 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, 
const int64_t num_valu
   // The current shuffling algorithm diverges for float and double types but 
the compiler
   // should be able to remove the branch since only one path is taken for each 
template
   // instantiation.
-  // Example run for floats:
+  // Example run for 32-bit variables:
   // Step 0, copy:
   //   0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ...
-  // Step 1: _mm_unpacklo_epi8 and mm_unpackhi_epi8:
+  // Step 1: simd_batch<int8_t, 8>::xip_lo and simd_batch<int8_t, 8>::xip_hi:
   //   0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ...
   //   0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ...

Review Comment:
   Can you add the missing comment for step 2?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-38560: [C++][Parquet] Rewrite BYTE_STREAM_SPLIT SSE optimizations using xsimd [arrow]

Reply via email to