mapleFU commented on code in PR #40335:
URL: https://github.com/apache/arrow/pull/40335#discussion_r1515512722
##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -64,46 +67,47 @@ void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t
// Stage 1: AAAA BBBB CCCC DDDD
// Stage 2: ACAC ACAC BDBD BDBD
// Stage 3: ABCD ABCD ABCD ABCD
- __m128i stage[kNumStreamsLog2 + 1][kNumStreams];
+ simd_batch stage[kNumStreamsLog2 + 1][kNumStreams];
constexpr int kNumStreamsHalf = kNumStreams / 2U;
for (int64_t i = 0; i < num_blocks; ++i) {
for (int j = 0; j < kNumStreams; ++j) {
-      stage[0][j] = _mm_loadu_si128(
-          reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * stride]));
+      stage[0][j] =
+          simd_batch::load_unaligned(&data[i * sizeof(simd_batch) + j * stride]);
}
for (int step = 0; step < kNumStreamsLog2; ++step) {
for (int j = 0; j < kNumStreamsHalf; ++j) {
stage[step + 1U][j * 2] =
-            _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ xsimd::zip_lo(stage[step][j], stage[step][kNumStreamsHalf + j]);
stage[step + 1U][j * 2 + 1U] =
-            _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ xsimd::zip_hi(stage[step][j], stage[step][kNumStreamsHalf + j]);
}
}
for (int j = 0; j < kNumStreams; ++j) {
-    _mm_storeu_si128(
-        reinterpret_cast<__m128i*>(out + (i * kNumStreams + j) * sizeof(__m128i)),
+    xsimd::store_unaligned(
+        reinterpret_cast<int8_t*>(out + (i * kNumStreams + j) * sizeof(simd_batch)),
Review Comment:
The output pointer is cast to signed `int8_t*` for `store_unaligned`, matching the batch's `int8_t` element type.
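For context, a minimal standalone sketch (mine, not the PR's code) of the load/zip/store pattern above. It assumes xsimd is on the include path and that a 128-bit SIMD target is enabled (otherwise `make_sized_batch_t<int8_t, 16>` is unavailable):

```cpp
#include <xsimd/xsimd.hpp>

#include <cstdint>
#include <cstdio>

int main() {
  using simd_batch = xsimd::make_sized_batch_t<int8_t, 16>;
  // Two 16-byte "streams": all 'A' bytes, then all 'B' bytes,
  // mirroring stage 1 of the diagram in the diff.
  uint8_t data[32];
  for (int i = 0; i < 16; ++i) {
    data[i] = 'A';
    data[16 + i] = 'B';
  }
  simd_batch a = simd_batch::load_unaligned(reinterpret_cast<const int8_t*>(data));
  simd_batch b = simd_batch::load_unaligned(reinterpret_cast<const int8_t*>(data + 16));
  // zip_lo/zip_hi interleave the two streams byte-wise (ABAB...),
  // analogous to _mm_unpacklo_epi8/_mm_unpackhi_epi8 on SSE.
  uint8_t out[32];
  // The free function store_unaligned takes a pointer to the batch's
  // element type, hence the cast to int8_t*.
  xsimd::store_unaligned(reinterpret_cast<int8_t*>(out), xsimd::zip_lo(a, b));
  xsimd::store_unaligned(reinterpret_cast<int8_t*>(out + 16), xsimd::zip_hi(a, b));
  std::printf("%.32s\n", reinterpret_cast<const char*>(out));  // ABABABAB...
}
```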
##########
cpp/src/parquet/CMakeLists.txt:
##########
@@ -230,6 +230,11 @@ endif()
set(PARQUET_SHARED_LINK_LIBS)
set(PARQUET_SHARED_PRIVATE_LINK_LIBS)
+if(ARROW_USE_XSIMD)
Review Comment:
Should we add the xsimd dependency here, in the Parquet link libraries, or does it belong somewhere else?
##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -25,23 +25,26 @@
#include <cassert>
#include <cstdint>
-#ifdef ARROW_HAVE_SSE4_2
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+#include <xsimd/xsimd.hpp>
#define ARROW_HAVE_SIMD_SPLIT
-#endif // ARROW_HAVE_SSE4_2
+#endif
namespace arrow::util::internal {
//
// SIMD implementations
//
-#if defined(ARROW_HAVE_SSE4_2)
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
template <int kNumStreams>
-void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride,
+void ByteStreamSplitDecode128B(const uint8_t* data, int64_t num_values, int64_t stride,
                                uint8_t* out) {
+ using simd_batch = xsimd::make_sized_batch_t<int8_t, 16>;
+
  static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams.");
Review Comment:
@pitrou It seems this only supports 4-byte and 8-byte streams. Can we add fp16 (2-byte) support later?
##########
cpp/src/arrow/util/byte_stream_split_internal.h:
##########
@@ -139,39 +145,68 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const int64_t num_valu
for (int stage_lvl = 0; stage_lvl < 2; ++stage_lvl) {
for (int i = 0; i < kNumStreams / 2; ++i) {
stage[stage_lvl + 1][i * 2] =
-          _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+          xsimd::zip_lo(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
stage[stage_lvl + 1][i * 2 + 1] =
-          _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+          xsimd::zip_hi(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
}
}
if constexpr (kNumStreams == 8) {
- // This is the path for double.
- __m128i tmp[8];
+    // This is the path for 64-bit data.
+ simd_batch tmp[8];
+
for (int i = 0; i < 4; ++i) {
- tmp[i * 2] = _mm_unpacklo_epi32(stage[2][i], stage[2][i + 4]);
- tmp[i * 2 + 1] = _mm_unpackhi_epi32(stage[2][i], stage[2][i + 4]);
+ tmp[i * 2] = xsimd::bitwise_cast<int8_t>(
+ xsimd::zip_lo(xsimd::bitwise_cast<int32_t>(stage[2][i]),
+ xsimd::bitwise_cast<int32_t>(stage[2][i + 4])));
+ tmp[i * 2 + 1] = xsimd::bitwise_cast<int8_t>(
+ xsimd::zip_hi(xsimd::bitwise_cast<int32_t>(stage[2][i]),
+ xsimd::bitwise_cast<int32_t>(stage[2][i + 4])));
}
for (int i = 0; i < 4; ++i) {
- final_result[i * 2] = _mm_unpacklo_epi32(tmp[i], tmp[i + 4]);
- final_result[i * 2 + 1] = _mm_unpackhi_epi32(tmp[i], tmp[i + 4]);
+ final_result[i * 2] = xsimd::bitwise_cast<int8_t>(
+ xsimd::zip_lo(xsimd::bitwise_cast<int32_t>(tmp[i]),
+ xsimd::bitwise_cast<int32_t>(tmp[i + 4])));
+ final_result[i * 2 + 1] = xsimd::bitwise_cast<int8_t>(
+ xsimd::zip_hi(xsimd::bitwise_cast<int32_t>(tmp[i]),
+ xsimd::bitwise_cast<int32_t>(tmp[i + 4])));
}
} else {
- // this is the path for float.
- __m128i tmp[4];
+    // This is the path for 32-bit data.
+ simd_batch tmp[4];
for (int i = 0; i < 2; ++i) {
- tmp[i * 2] = _mm_unpacklo_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
-      tmp[i * 2 + 1] = _mm_unpackhi_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
+ tmp[i * 2] = xsimd::zip_lo(stage[2][i * 2], stage[2][i * 2 + 1]);
+ tmp[i * 2 + 1] = xsimd::zip_hi(stage[2][i * 2], stage[2][i * 2 + 1]);
}
for (int i = 0; i < 2; ++i) {
- final_result[i * 2] = _mm_unpacklo_epi64(tmp[i], tmp[i + 2]);
- final_result[i * 2 + 1] = _mm_unpackhi_epi64(tmp[i], tmp[i + 2]);
+ final_result[i * 2] = xsimd::bitwise_cast<int8_t>(
+ xsimd::zip_lo(xsimd::bitwise_cast<int64_t>(tmp[i]),
+ xsimd::bitwise_cast<int64_t>(tmp[i + 2])));
+ final_result[i * 2 + 1] = xsimd::bitwise_cast<int8_t>(
+ xsimd::zip_hi(xsimd::bitwise_cast<int64_t>(tmp[i]),
+ xsimd::bitwise_cast<int64_t>(tmp[i + 2])));
}
}
for (int i = 0; i < kNumStreams; ++i) {
-    _mm_storeu_si128(&output_buffer_streams[i][block_index], final_result[i]);
+    xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)],
+                           final_result[i]);
}
}
}
+
+#endif
+
+#if defined(ARROW_HAVE_SSE4_2)
+template <int kNumStreams>
+void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride,
Review Comment:
I'm not sure about keeping this: should we just use `ByteStreamSplitDecode128B` directly?
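If so, a hypothetical one-line forwarder (my sketch, not the PR's code) would keep the old entry point while delegating to the generic 128-bit path:

```cpp
#include <cstdint>

// Declared earlier in this header in the PR; repeated here only so
// the sketch compiles on its own.
template <int kNumStreams>
void ByteStreamSplitDecode128B(const uint8_t* data, int64_t num_values,
                               int64_t stride, uint8_t* out);

// Hypothetical forwarder: the SSE2 name simply delegates to the
// generic xsimd 128-bit implementation.
template <int kNumStreams>
void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values,
                               int64_t stride, uint8_t* out) {
  ByteStreamSplitDecode128B<kNumStreams>(data, num_values, stride, out);
}
```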
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]