This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1b65c55 ARROW-5458: [C++] Apache Arrow parallel CRC32c computation optimization
1b65c55 is described below
commit 1b65c5562fc792c9343ab6bcd91dfa68508136a4
Author: Yuqi Gu <[email protected]>
AuthorDate: Wed Jul 10 14:37:17 2019 -0500
ARROW-5458: [C++] Apache Arrow parallel CRC32c computation optimization
ARMv8 defines the VMULL/PMULL crypto instructions.
This patch optimizes the CRC32c calculation by using these instructions, when
available, instead of the original linear CRC instructions.
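For illustration only (not part of the commit), here is a minimal C++ sketch of
the PMULL-based merge step, reusing the lane length and fold constants from the
patch below. The helper name MergeThreeLanes is hypothetical, and the snippet
assumes an AArch64 toolchain invoked with -march=armv8-a+crc+crypto:

    #include <arm_acle.h>  // __crc32cd
    #include <arm_neon.h>  // vmull_p64
    #include <cstdint>

    // Combine three per-lane CRC32c values, each covering a 42 x 8-byte block.
    // crc0 and crc1 are shifted into position with a carryless multiply
    // (PMULL) and folded into crc2, the lane that ends at the buffer tail.
    uint32_t MergeThreeLanes(uint32_t crc0, uint32_t crc1, uint32_t crc2) {
      // Fold constants quoted from the patch:
      // k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1)), k1 = CRC32(x ^ (42 * 8 * 8 - 1))
      const uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
      const uint64_t t0 = (uint64_t)vmull_p64(crc0, k0);
      const uint64_t t1 = (uint64_t)vmull_p64(crc1, k1);
      return crc2 ^ __crc32cd(0, t0) ^ __crc32cd(0, t1);
    }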
Author: Yuqi Gu <[email protected]>
Closes #4427 from guyuqi/Arm64-parallel-CRC32c and squashes the following commits:
16e22af03 <Yuqi Gu> Use C++-style casts
be0955038 <Yuqi Gu> Re-implemented with the algorithm derived from Intel whitepaper. Remove the useless code of cache prefetching.
a3e2bacc1 <Yuqi Gu> Fix code style format-check
349903a8d <Yuqi Gu> Fix style rules
1cdb28380 <Yuqi Gu> ARROW-5458: Apache Arrow parallel CRC32c computation optimization
---
cpp/cmake_modules/SetupCxxFlags.cmake | 7 ++-
cpp/src/arrow/util/hash-util.h | 90 ++++++++++++++++++++++++++++++++++-
cpp/src/arrow/util/hashing.h | 4 ++
cpp/src/arrow/util/neon-util.h | 12 ++++-
4 files changed, 108 insertions(+), 5 deletions(-)
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index 496904b..9eba9e8 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -24,6 +24,7 @@ check_cxx_compiler_flag("-msse4.2" CXX_SUPPORTS_SSE4_2)
check_cxx_compiler_flag("-maltivec" CXX_SUPPORTS_ALTIVEC)
# Arm64 compiler flags
check_cxx_compiler_flag("-march=armv8-a+crc" CXX_SUPPORTS_ARMCRC)
+check_cxx_compiler_flag("-march=armv8-a+crc+crypto" CXX_SUPPORTS_ARMV8_CRC_CRYPTO)
# Support C11
set(CMAKE_C_STANDARD 11)
@@ -265,7 +266,11 @@ if(CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
endif()
if(CXX_SUPPORTS_ARMCRC)
- set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc")
+ if(CXX_SUPPORTS_ARMV8_CRC_CRYPTO)
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc+crypto")
+ else()
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc")
+ endif()
endif()
if(ARROW_USE_SIMD)
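The effect of the stronger flag is visible through the standard ACLE feature
macros the compiler predefines (the same macros neon-util.h tests below). A
small probe, illustrative and not part of the commit: built with
-march=armv8-a+crc it prints only the first line, while
-march=armv8-a+crc+crypto should print both:

    #include <cstdio>

    int main() {
    #ifdef __ARM_FEATURE_CRC32
      std::puts("CRC32 instructions available");
    #endif
    #ifdef __ARM_FEATURE_CRYPTO
      std::puts("PMULL/crypto instructions available");
    #endif
      return 0;
    }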
diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h
index 7aed3c1..4d33786 100644
--- a/cpp/src/arrow/util/hash-util.h
+++ b/cpp/src/arrow/util/hash-util.h
@@ -71,6 +71,86 @@ class HashUtil {
static constexpr bool have_hardware_crc32 = false;
#endif
+#ifdef ARROW_HAVE_ARMV8_CRYPTO
+/* Crc32c Parallel computation
+ * Algorithm comes from Intel whitepaper:
+ * crc-iscsi-polynomial-crc32-instruction-paper
+ *
+ * Input data is divided into three equal-sized blocks
+ * Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
+ * One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
+ */
+#define BLK_LENGTH 42
+ static uint32_t Armv8CrcHashParallel(const void* data, int32_t nbytes, uint32_t crc) {
+ const uint8_t* buf8;
+ const uint64_t* buf64 = reinterpret_cast<const uint64_t*>(data);
+ int32_t length = nbytes;
+
+ while (length >= 1024) {
+ uint64_t t0, t1;
+ uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
+
+ /* parallel computation params:
+ * k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
+ * k1 = CRC32(x ^ (42 * 8 * 8 - 1));
+ */
+ uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+ /* First 8 byte for better pipelining */
+ crc0 = ARMCE_crc32_u64(crc, *buf64++);
+
+ /* 3 blocks crc32c parallel computation
+ *
+ * 42 * 8 * 3 = 1008 (bytes)
+ */
+ for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
+ crc0 = ARMCE_crc32_u64(crc0, *buf64);
+ crc1 = ARMCE_crc32_u64(crc1, *(buf64 + BLK_LENGTH));
+ crc2 = ARMCE_crc32_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
+ }
+ buf64 += (BLK_LENGTH * 2);
+
+ /* Last 8 bytes */
+ crc = ARMCE_crc32_u64(crc2, *buf64++);
+
+ t0 = (uint64_t)vmull_p64(crc0, k0);
+ t1 = (uint64_t)vmull_p64(crc1, k1);
+
+ /* Merge (crc0, crc1, crc2) -> crc */
+ crc1 = ARMCE_crc32_u64(0, t1);
+ crc ^= crc1;
+ crc0 = ARMCE_crc32_u64(0, t0);
+ crc ^= crc0;
+
+ length -= 1024;
+ }
+
+ buf8 = reinterpret_cast<const uint8_t*>(buf64);
+ while (length >= 8) {
+ crc = ARMCE_crc32_u64(crc, *reinterpret_cast<const uint64_t*>(buf8));
+ buf8 += 8;
+ length -= 8;
+ }
+
+ /* The following is more efficient than the straight loop */
+ if (length >= 4) {
+ crc = ARMCE_crc32_u32(crc, *reinterpret_cast<const uint32_t*>(buf8));
+ buf8 += 4;
+ length -= 4;
+ }
+
+ if (length >= 2) {
+ crc = ARMCE_crc32_u16(crc, *reinterpret_cast<const uint16_t*>(buf8));
+ buf8 += 2;
+ length -= 2;
+ }
+
+ if (length >= 1) crc = ARMCE_crc32_u8(crc, *(buf8));
+
+ return crc;
+ }
+#endif
+
/// Compute the Crc32 hash for data using SSE4/ArmCRC instructions. The input hash
/// parameter is the current hash/seed value.
/// This should only be called if SSE/ArmCRC is supported.
@@ -295,8 +375,14 @@ inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed)
return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
else
#endif
- // Double CRC
- return static_cast<int>(HashUtil::DoubleCrcHash(data, bytes, seed));
+
+#ifdef ARROW_HAVE_ARMV8_CRYPTO
+ // Arm64 parallel crc32
+ return static_cast<int>(HashUtil::Armv8CrcHashParallel(data, bytes, seed));
+#else
+ // Double CRC
+ return static_cast<int>(HashUtil::DoubleCrcHash(data, bytes, seed));
+#endif
}
// Murmur Hash
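Because the parallel routine is meant to produce exactly the same value as a
sequential CRC32c, a byte-at-a-time reference makes for a quick self-check.
A sketch, not part of the commit, assuming Arrow's headers are on the include
path and the build defines ARROW_HAVE_ARMV8_CRYPTO:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include <arm_acle.h>  // __crc32cb

    #include "arrow/util/hash-util.h"

    int main() {
      std::vector<uint8_t> buf(4096);
      for (size_t i = 0; i < buf.size(); ++i) {
        buf[i] = static_cast<uint8_t>(i * 131);
      }

      const uint32_t seed = 0x12345678;

      // Byte-at-a-time reference using the plain CRC32c instruction.
      uint32_t linear = seed;
      for (uint8_t b : buf) linear = __crc32cb(linear, b);

      const uint32_t parallel = arrow::HashUtil::Armv8CrcHashParallel(
          buf.data(), static_cast<int32_t>(buf.size()), seed);

      std::printf("%s\n", parallel == linear ? "match" : "MISMATCH");
      return 0;
    }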
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
index be2d4cf..bad2b49 100644
--- a/cpp/src/arrow/util/hashing.h
+++ b/cpp/src/arrow/util/hashing.h
@@ -167,8 +167,12 @@ hash_t ComputeStringHash(const void* data, int64_t length) {
}
if (HashUtil::have_hardware_crc32) {
+#ifdef ARROW_HAVE_ARMV8_CRYPTO
+ auto h = HashUtil::Armv8CrcHashParallel(data, static_cast<int32_t>(length), AlgNum);
+#else
// DoubleCrcHash is faster than Murmur2.
auto h = HashUtil::DoubleCrcHash(data, static_cast<int32_t>(length), AlgNum);
+#endif
return ScalarHelper<uint64_t, AlgNum>::ComputeHash(h);
} else {
// Fall back on 64-bit Murmur2 for longer strings.
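Callers of ComputeStringHash are unaffected by this dispatch; only the CRC
kernel underneath changes. A usage sketch, illustrative only, assuming Arrow's
headers are on the include path and the arrow::internal namespace used by this
file:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    #include "arrow/util/hashing.h"

    int main() {
      const std::string s = "apache arrow";
      // The AlgNum template parameter selects an independent hash stream.
      const auto h = arrow::internal::ComputeStringHash<0>(
          s.data(), static_cast<int64_t>(s.size()));
      std::printf("%016llx\n", static_cast<unsigned long long>(h));
      return 0;
    }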
diff --git a/cpp/src/arrow/util/neon-util.h b/cpp/src/arrow/util/neon-util.h
index 714d232..4c28aa3 100644
--- a/cpp/src/arrow/util/neon-util.h
+++ b/cpp/src/arrow/util/neon-util.h
@@ -20,11 +20,19 @@
namespace arrow {
#if defined(__aarch64__) || defined(__AARCH64__)
+
#ifdef __ARM_FEATURE_CRC32
#define ARROW_HAVE_ARM_CRC
#include <arm_acle.h>
-#endif
-#endif
+
+#ifdef __ARM_FEATURE_CRYPTO
+#include <arm_neon.h>
+#define ARROW_HAVE_ARMV8_CRYPTO
+#endif // __ARM_FEATURE_CRYPTO
+
+#endif // __ARM_FEATURE_CRC32
+
+#endif // defined(__aarch64__) || defined(__AARCH64__)
#if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
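Note that the nested guards mean ARROW_HAVE_ARMV8_CRYPTO is only ever defined
when ARROW_HAVE_ARM_CRC is as well. A sketch, illustrative and not part of the
commit, of how a consumer might key a byte-step kernel off these macros, with a
portable bitwise fallback:

    #include <cstdint>
    #include <cstdio>

    #include "arrow/util/neon-util.h"

    static uint32_t Crc32cByte(uint32_t crc, uint8_t b) {
    #ifdef ARROW_HAVE_ARM_CRC
      // Hardware CRC32c byte step; when ARROW_HAVE_ARMV8_CRYPTO is also
      // defined, the PMULL-merged bulk path above is available too.
      return ARMCE_crc32_u8(crc, b);
    #else
      // Portable fallback: reflected CRC32c polynomial 0x82f63b78.
      crc ^= b;
      for (int i = 0; i < 8; ++i) {
        crc = (crc >> 1) ^ ((crc & 1u) ? 0x82f63b78u : 0u);
      }
      return crc;
    #endif
    }

    int main() {
      const uint8_t msg[] = {'a', 'b', 'c'};
      uint32_t crc = 0;
      for (uint8_t c : msg) crc = Crc32cByte(crc, c);
      std::printf("%08x\n", crc);
      return 0;
    }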