This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1b65c55  ARROW-5458: [C++] Apache Arrow parallel CRC32c computation optimization
1b65c55 is described below

commit 1b65c5562fc792c9343ab6bcd91dfa68508136a4
Author: Yuqi Gu <[email protected]>
AuthorDate: Wed Jul 10 14:37:17 2019 -0500

    ARROW-5458: [C++] Apache Arrow parallel CRC32c computation optimization
    
    ARMv8 defines the VMULL/PMULL crypto instructions.
    This patch optimizes the CRC32c calculation with these instructions when
    available, rather than with the original linear CRC instructions (a serial
    baseline is sketched after the diffstat below).
    
    Author: Yuqi Gu <[email protected]>
    
    Closes #4427 from guyuqi/Arm64-parallel-CRC32c and squashes the following commits:
    
    16e22af03 <Yuqi Gu> Use C++-style casts
    be0955038 <Yuqi Gu> Re-implemented with the algorithm derived from the Intel whitepaper; removed the unused cache-prefetching code.
    a3e2bacc1 <Yuqi Gu> Fix code style format-check
    349903a8d <Yuqi Gu> Fix style rules
    1cdb28380 <Yuqi Gu> ARROW-5458: Apache Arrow parallel CRC32c computation optimization
---
 cpp/cmake_modules/SetupCxxFlags.cmake |  7 ++-
 cpp/src/arrow/util/hash-util.h        | 90 ++++++++++++++++++++++++++++++++++-
 cpp/src/arrow/util/hashing.h          |  4 ++
 cpp/src/arrow/util/neon-util.h        | 12 ++++-
 4 files changed, 108 insertions(+), 5 deletions(-)
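
For contrast with the parallel path in the diff below, the "original linear crc
instructions" mentioned in the commit message amount to a serial chain of CRC32C
steps in which each instruction consumes the previous result. A minimal sketch,
assuming the __crc32cd/__crc32cb intrinsics from arm_acle.h (which the
ARMCE_crc32_u64/ARMCE_crc32_u8 wrappers in neon-util.h are understood to wrap):

    #include <arm_acle.h>  // __crc32cd/__crc32cb; requires -march=armv8-a+crc
    #include <cstdint>
    #include <cstring>

    // Serial CRC32C baseline: each __crc32cd depends on the previous crc,
    // so consecutive steps cannot overlap in the pipeline.
    static uint32_t SerialCrc32c(const void* data, int32_t nbytes, uint32_t crc) {
      const uint8_t* p = static_cast<const uint8_t*>(data);
      while (nbytes >= 8) {
        uint64_t v;
        std::memcpy(&v, p, sizeof(v));  // safe unaligned load
        crc = __crc32cd(crc, v);
        p += 8;
        nbytes -= 8;
      }
      while (nbytes-- > 0) crc = __crc32cb(crc, *p++);
      return crc;
    }

The patch keeps this shape for the tail, but splits each 1024-byte chunk into
three independent CRC streams whose chains can issue in parallel before being
folded together with PMULL.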

diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index 496904b..9eba9e8 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -24,6 +24,7 @@ check_cxx_compiler_flag("-msse4.2" CXX_SUPPORTS_SSE4_2)
 check_cxx_compiler_flag("-maltivec" CXX_SUPPORTS_ALTIVEC)
 # Arm64 compiler flags
 check_cxx_compiler_flag("-march=armv8-a+crc" CXX_SUPPORTS_ARMCRC)
+check_cxx_compiler_flag("-march=armv8-a+crc+crypto" CXX_SUPPORTS_ARMV8_CRC_CRYPTO)
 
 # Support C11
 set(CMAKE_C_STANDARD 11)
@@ -265,7 +266,11 @@ if(CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
 endif()
 
 if(CXX_SUPPORTS_ARMCRC)
-  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc")
+  if(CXX_SUPPORTS_ARMV8_CRC_CRYPTO)
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc+crypto")
+  else()
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc")
+  endif()
 endif()
 
 if(ARROW_USE_SIMD)
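
To see which of the two -march variants a toolchain will actually use, one can
probe the ACLE feature macros these flags define; the neon-util.h changes further
down key off exactly these macros. A small sketch (probe.cc is a hypothetical
file name):

    // Compile with: g++ -march=armv8-a+crc+crypto probe.cc
    #include <cstdio>

    int main() {
    #ifdef __ARM_FEATURE_CRC32
      std::printf("+crc enabled: CRC32 instructions available\n");
    #endif
    #ifdef __ARM_FEATURE_CRYPTO
      std::printf("+crypto enabled: PMULL (vmull_p64) available\n");
    #endif
      return 0;
    }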
diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h
index 7aed3c1..4d33786 100644
--- a/cpp/src/arrow/util/hash-util.h
+++ b/cpp/src/arrow/util/hash-util.h
@@ -71,6 +71,86 @@ class HashUtil {
   static constexpr bool have_hardware_crc32 = false;
 #endif
 
+#ifdef ARROW_HAVE_ARMV8_CRYPTO
+/* CRC32c parallel computation.
+ *   The algorithm comes from the Intel whitepaper:
+ *   "crc-iscsi-polynomial-crc32-instruction-paper"
+ *
+ * The input data is divided into three equal-sized blocks:
+ *   three parallel streams (crc0, crc1, crc2) per 1024 bytes;
+ *   one block = 42 (BLK_LENGTH) * 8 (step length: crc32c_u64) bytes.
+ */
+#define BLK_LENGTH 42
+  static uint32_t Armv8CrcHashParallel(const void* data, int32_t nbytes, uint32_t crc) {
+    const uint8_t* buf8;
+    const uint64_t* buf64 = reinterpret_cast<const uint64_t*>(data);
+    int32_t length = nbytes;
+
+    while (length >= 1024) {
+      uint64_t t0, t1;
+      uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
+
+      /* Precomputed folding constants (x^n mod the CRC32c polynomial):
+       *   k0 = CRC32(x^(42 * 8 * 8 * 2 - 1));
+       *   k1 = CRC32(x^(42 * 8 * 8 - 1));
+       */
+      uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+      /* First 8 bytes for better pipelining */
+      crc0 = ARMCE_crc32_u64(crc, *buf64++);
+
+      /* 3-block parallel crc32c computation:
+       *
+       *   42 * 8 * 3 = 1008 bytes
+       */
+      for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
+        crc0 = ARMCE_crc32_u64(crc0, *buf64);
+        crc1 = ARMCE_crc32_u64(crc1, *(buf64 + BLK_LENGTH));
+        crc2 = ARMCE_crc32_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
+      }
+      buf64 += (BLK_LENGTH * 2);
+
+      /* Last 8 bytes */
+      crc = ARMCE_crc32_u64(crc2, *buf64++);
+
+      t0 = static_cast<uint64_t>(vmull_p64(crc0, k0));
+      t1 = static_cast<uint64_t>(vmull_p64(crc1, k1));
+
+      /* Merge (crc0, crc1, crc2) -> crc */
+      crc1 = ARMCE_crc32_u64(0, t1);
+      crc ^= crc1;
+      crc0 = ARMCE_crc32_u64(0, t0);
+      crc ^= crc0;
+
+      length -= 1024;
+    }
+
+    buf8 = reinterpret_cast<const uint8_t*>(buf64);
+    while (length >= 8) {
+      crc = ARMCE_crc32_u64(crc, *reinterpret_cast<const uint64_t*>(buf8));
+      buf8 += 8;
+      length -= 8;
+    }
+
+    /* The following is more efficient than a byte-at-a-time loop */
+    if (length >= 4) {
+      crc = ARMCE_crc32_u32(crc, *reinterpret_cast<const uint32_t*>(buf8));
+      buf8 += 4;
+      length -= 4;
+    }
+
+    if (length >= 2) {
+      crc = ARMCE_crc32_u16(crc, *reinterpret_cast<const uint16_t*>(buf8));
+      buf8 += 2;
+      length -= 2;
+    }
+
+    if (length >= 1) crc = ARMCE_crc32_u8(crc, *(buf8));
+
+    return crc;
+  }
+#endif
+
   /// Compute the Crc32 hash for data using SSE4/ArmCRC instructions.  The input hash
   /// parameter is the current hash/seed value.
   /// This should only be called if SSE/ArmCRC is supported.
@@ -295,8 +375,14 @@ inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed)
     return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
   else
 #endif
-    // Double CRC
-    return static_cast<int>(HashUtil::DoubleCrcHash(data, bytes, seed));
+
+#ifdef ARROW_HAVE_ARMV8_CRYPTO
+    // Arm64 parallel crc32
+    return static_cast<int>(HashUtil::Armv8CrcHashParallel(data, bytes, seed));
+#else
+    // Double CRC
+    return static_cast<int>(HashUtil::DoubleCrcHash(data, bytes, seed));
+#endif
 }
 
 // Murmur Hash
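
Since the folding must reproduce exactly the CRC a serial computation would give,
a natural sanity check is to compare Armv8CrcHashParallel against a byte-at-a-time
reference on a buffer larger than 1024 bytes, so the folding path is exercised.
A hypothetical test sketch (CheckParallelCrc is not part of the patch; it assumes
HashUtil lives in namespace arrow as in hash-util.h):

    #ifdef ARROW_HAVE_ARMV8_CRYPTO
    #include <cstdint>
    #include <vector>

    // True if the parallel and byte-serial CRC32C agree on buf.
    static bool CheckParallelCrc(const std::vector<uint8_t>& buf) {
      uint32_t serial = 0xffffffff;
      for (uint8_t b : buf) serial = ARMCE_crc32_u8(serial, b);
      uint32_t parallel = arrow::HashUtil::Armv8CrcHashParallel(
          buf.data(), static_cast<int32_t>(buf.size()), 0xffffffff);
      return serial == parallel;
    }
    #endif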
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
index be2d4cf..bad2b49 100644
--- a/cpp/src/arrow/util/hashing.h
+++ b/cpp/src/arrow/util/hashing.h
@@ -167,8 +167,12 @@ hash_t ComputeStringHash(const void* data, int64_t length) {
   }
 
   if (HashUtil::have_hardware_crc32) {
+#ifdef ARROW_HAVE_ARMV8_CRYPTO
+    auto h = HashUtil::Armv8CrcHashParallel(data, static_cast<int32_t>(length), AlgNum);
+#else
     // DoubleCrcHash is faster than Murmur2.
     auto h = HashUtil::DoubleCrcHash(data, static_cast<int32_t>(length), AlgNum);
+#endif
     return ScalarHelper<uint64_t, AlgNum>::ComputeHash(h);
   } else {
     // Fall back on 64-bit Murmur2 for longer strings.
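
With this change, callers of ComputeStringHash transparently pick up the parallel
path on crypto-capable Arm cores; nothing changes at call sites. A hypothetical
call, assuming the AlgNum template parameter declared in hashing.h:

    #include <cstring>

    const char* key = "dictionary-key";
    // AlgNum = 0 picks one of the independent hash-function variants.
    auto h = arrow::internal::ComputeStringHash<0>(key, std::strlen(key));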
diff --git a/cpp/src/arrow/util/neon-util.h b/cpp/src/arrow/util/neon-util.h
index 714d232..4c28aa3 100644
--- a/cpp/src/arrow/util/neon-util.h
+++ b/cpp/src/arrow/util/neon-util.h
@@ -20,11 +20,19 @@
 namespace arrow {
 
 #if defined(__aarch64__) || defined(__AARCH64__)
+
 #ifdef __ARM_FEATURE_CRC32
 #define ARROW_HAVE_ARM_CRC
 #include <arm_acle.h>
-#endif
-#endif
+
+#ifdef __ARM_FEATURE_CRYPTO
+#include <arm_neon.h>
+#define ARROW_HAVE_ARMV8_CRYPTO
+#endif  // __ARM_FEATURE_CRYPTO
+
+#endif  // __ARM_FEATURE_CRC32
+
+#endif  // defined(__aarch64__) || defined(__AARCH64__)
 
 #if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
 
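One caveat: the __ARM_FEATURE_* macros only say what the compiler was allowed to
emit; whether the running CPU implements CRC32 and PMULL is a runtime question.
On Linux/AArch64 a common probe (a sketch, not part of this patch) reads the
hardware capability bits:

    #include <asm/hwcap.h>  // HWCAP_CRC32, HWCAP_PMULL
    #include <sys/auxv.h>   // getauxval

    static bool CpuHasCrc32() { return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0; }
    static bool CpuHasPmull() { return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0; }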
