This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new f5d2899 KUDU-3007 (3/3): Support building and running Kudu on aarch64
platform
f5d2899 is described below
commit f5d2899a6483fc08a17ade8d9b26b3e6930da4e4
Author: liusheng <[email protected]>
AuthorDate: Sat Jan 11 16:17:19 2020 +0800
KUDU-3007 (3/3): Support building and running Kudu on aarch64 platform
This change modifies the Kudu code to support one more platform:
ARM64/aarch64. With this patch, Kudu can be built and run on both x86_64
and aarch64 platforms.
Change-Id: I2953519c5d28de17e6b2bb7094abab0c1cd12c97
Reviewed-on: http://gerrit.cloudera.org:8080/14964
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <[email protected]>
---
CMakeLists.txt | 30 ++++++++++++++++------
README.adoc | 3 +--
build-support/jenkins/build-and-test.sh | 7 +++++
.../test/cluster/TestKuduBinaryJarExtractor.java | 4 ++-
src/kudu/cfile/binary_plain_block.cc | 7 ++++-
src/kudu/cfile/bitshuffle_arch_wrapper.cc | 3 ++-
src/kudu/codegen/codegen-test.cc | 4 +++
src/kudu/common/columnar_serialization.cc | 6 ++++-
src/kudu/common/key_encoder.h | 6 +++++
src/kudu/common/zp7.cc | 6 +++++
src/kudu/gutil/CMakeLists.txt | 7 ++++-
src/kudu/gutil/cpu.cc | 5 ++++
src/kudu/gutil/cycleclock-inl.h | 15 ++++++++++-
src/kudu/gutil/dynamic_annotations.h | 1 +
src/kudu/gutil/port.h | 2 ++
src/kudu/gutil/spinlock.h | 9 +++++++
src/kudu/gutil/spinlock_linux-inl.h | 3 +++
src/kudu/rpc/rpc-test-base.h | 2 +-
src/kudu/util/block_bloom_filter.cc | 11 ++++++++
src/kudu/util/debug-util.cc | 4 +++
src/kudu/util/debug-util.h | 4 +++
src/kudu/util/group_varint-inl.h | 9 ++++++-
src/kudu/util/group_varint-test.cc | 4 +++
src/kudu/util/init.cc | 3 +++
src/kudu/util/memory/memory.cc | 7 ++++-
src/kudu/util/notification.h | 4 +++
src/kudu/util/striped64.cc | 6 +++++
thirdparty/build-definitions.sh | 3 ++-
28 files changed, 155 insertions(+), 20 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 077305b..d5f9eb7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -151,9 +151,15 @@ include(CompilerInfo)
# compiler flags that are common across debug/release builds
-# -msse4.2: Enable sse4.2 compiler intrinsics.
-execute_process(COMMAND uname -p OUTPUT_VARIABLE ARCH_NAME)
-if (NOT "${ARCH_NAME}" MATCHES "aarch64")
+execute_process(COMMAND uname -m OUTPUT_VARIABLE ARCH_NAME)
+if("${ARCH_NAME}" MATCHES "aarch64")
+ # Certain platforms such as ARM do not use signed chars by default
+ # which causes issues with certain bounds checks.
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -fsigned-char")
+ # Turn off fp-contract on aarch64 to avoid differences in multiply-add results.
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -ffp-contract=off")
+else()
+ # -msse4.2: Enable sse4.2 compiler intrinsics.
set(CXX_COMMON_FLAGS "-msse4.2")
endif()
# -Wall: Enable all warnings.
@@ -409,11 +415,19 @@ if (${KUDU_USE_TSAN})
# require all code to be position independent, and the easiest way to
# guarantee that is via dynamic linking (not all 3rd party archives are
# compiled with -fPIC e.g. boost).
- if("${KUDU_LINK}" STREQUAL "a")
- message("Using dynamic linking for TSAN")
- set(KUDU_LINK "d")
- elseif("${KUDU_LINK}" STREQUAL "s")
- message(SEND_ERROR "Cannot use TSAN with static linking")
+ if(NOT "${ARCH_NAME}" MATCHES "aarch64")
+ if("${KUDU_LINK}" STREQUAL "a")
+ message("Using dynamic linking for TSAN")
+ set(KUDU_LINK "d")
+ elseif("${KUDU_LINK}" STREQUAL "s")
+ message(SEND_ERROR "Cannot use TSAN with static linking")
+ endif()
+ else()
+ # Workaround for github.com/google/sanitizers/issues/1208:
+ # TSAN with dynamic linking causes all test cases to fail on aarch64.
+ # ENABLE_DIST_TEST is not applied on aarch64, so use static linking directly.
+ message("Using static linking for TSAN on aarch64")
+ set(KUDU_LINK "s")
endif()
endif()
diff --git a/README.adoc b/README.adoc
index bad43b2..22036a0 100644
--- a/README.adoc
+++ b/README.adoc
@@ -384,8 +384,7 @@ linking the kudu binaries and unit tests. The full range of
options for `KUDU_LI
`static`, `dynamic`, and `auto`. The default is `auto` and only the first
letter
matters for the purpose of matching.
-NOTE: Dynamic linking is incompatible with ASAN and static linking is
incompatible
-with TSAN.
+NOTE: Static linking is incompatible with TSAN, except on aarch64, where TSAN builds always use static linking.
== Developing Kudu in Eclipse
diff --git a/build-support/jenkins/build-and-test.sh
b/build-support/jenkins/build-and-test.sh
index f1d860f..21eaa49 100755
--- a/build-support/jenkins/build-and-test.sh
+++ b/build-support/jenkins/build-and-test.sh
@@ -194,6 +194,7 @@ if [ -n "$BUILD_ID" ]; then
trap cleanup EXIT
fi
+ARTIFACT_ARCH=$(uname -m)
# Configure the build
#
# ASAN/TSAN can't build the Python bindings because the exported Kudu client
@@ -202,6 +203,12 @@ if [ "$BUILD_TYPE" = "ASAN" ]; then
USE_CLANG=1
CMAKE_BUILD=fastdebug
EXTRA_BUILD_FLAGS="-DKUDU_USE_ASAN=1 -DKUDU_USE_UBSAN=1"
+ # Workaround for github.com/google/sanitizers/issues/1208:
+ # ASAN with dynamic linking causes all tests to fail on aarch64.
+ # ENABLE_DIST_TEST is not applied on aarch64, so use static linking.
+ if [ "$ARTIFACT_ARCH" = "aarch64" ]; then
+ EXTRA_BUILD_FLAGS="$EXTRA_BUILD_FLAGS -DKUDU_LINK=static"
+ fi
BUILD_PYTHON=0
BUILD_PYTHON3=0
elif [ "$BUILD_TYPE" = "TSAN" ]; then
diff --git
a/java/kudu-test-utils/src/test/java/org/apache/kudu/test/cluster/TestKuduBinaryJarExtractor.java
b/java/kudu-test-utils/src/test/java/org/apache/kudu/test/cluster/TestKuduBinaryJarExtractor.java
index 4604cb8..ad56b87 100644
---
a/java/kudu-test-utils/src/test/java/org/apache/kudu/test/cluster/TestKuduBinaryJarExtractor.java
+++
b/java/kudu-test-utils/src/test/java/org/apache/kudu/test/cluster/TestKuduBinaryJarExtractor.java
@@ -40,6 +40,7 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
+import com.google.gradle.osdetector.OsDetector;
import org.junit.Rule;
import org.junit.Test;
import org.slf4j.Logger;
@@ -49,6 +50,7 @@ import org.apache.kudu.test.junit.RetryRule;
public class TestKuduBinaryJarExtractor {
+ private static final OsDetector DETECTOR = new OsDetector();
private static final Logger LOG =
LoggerFactory.getLogger(TestKuduBinaryJarExtractor.class);
@Rule
@@ -110,7 +112,7 @@ public class TestKuduBinaryJarExtractor {
properties.setProperty("artifact.version", "1.9.0-SNAPSHOT");
properties.setProperty("artifact.prefix", "apache-kudu-1.9.0-SNAPSHOT");
properties.setProperty("artifact.os", os);
- properties.setProperty("artifact.arch", "x86_64");
+ properties.setProperty("artifact.arch", DETECTOR.getArch());
properties.store(out, "test");
}
diff --git a/src/kudu/cfile/binary_plain_block.cc
b/src/kudu/cfile/binary_plain_block.cc
index 18ec85d..fce2810 100644
--- a/src/kudu/cfile/binary_plain_block.cc
+++ b/src/kudu/cfile/binary_plain_block.cc
@@ -201,12 +201,17 @@ Status BinaryPlainBlockDecoder::ParseHeader() {
size_t rem = num_elems_;
while (rem >= 4) {
if (PREDICT_TRUE(p + 16 < limit)) {
+ #ifndef __aarch64__
p = coding::DecodeGroupVarInt32_SSE(
p, &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_ptr[3]);
-
+ #else
+ p = coding::DecodeGroupVarInt32_SlowButSafe(
+ p, &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_ptr[3]);
+ #endif //__aarch64__
// The above function should add at most 17 (4 32-bit ints plus a selector byte) to
// 'p'. Thus, since we checked that (p + 16 < limit) above, we are guaranteed that
// (p <= limit) now.
+
DCHECK_LE(p, limit);
} else {
p = coding::DecodeGroupVarInt32_SlowButSafe(
diff --git a/src/kudu/cfile/bitshuffle_arch_wrapper.cc
b/src/kudu/cfile/bitshuffle_arch_wrapper.cc
index 4799cd5..8d21f87 100644
--- a/src/kudu/cfile/bitshuffle_arch_wrapper.cc
+++ b/src/kudu/cfile/bitshuffle_arch_wrapper.cc
@@ -35,6 +35,7 @@
#undef bshuf_decompress_lz4
#include "kudu/gutil/cpu.h"
+// IWYU pragma: no_forward_declare base::CPU
using base::CPU;
@@ -57,7 +58,7 @@ decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4;
// the cost of a 'std::once' call.
__attribute__((constructor))
void SelectBitshuffleFunctions() {
-#ifndef __APPLE__
+#if !defined(__APPLE__) && !defined(__aarch64__)
if (CPU().has_avx2()) {
g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2;
g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2;
diff --git a/src/kudu/codegen/codegen-test.cc b/src/kudu/codegen/codegen-test.cc
index c42e095..2a91e25 100644
--- a/src/kudu/codegen/codegen-test.cc
+++ b/src/kudu/codegen/codegen-test.cc
@@ -380,7 +380,11 @@ TEST_F(CodegenTest, TestDumpMC) {
const vector<string>& msgs = sink.logged_msgs();
ASSERT_EQ(msgs.size(), 1);
+ #ifndef __aarch64__
EXPECT_THAT(msgs[0], testing::ContainsRegex("retq"));
+ #else
+ EXPECT_THAT(msgs[0], testing::ContainsRegex("ret"));
+ #endif //__aarch64__
}
// Basic test for the CompilationManager code cache.
diff --git a/src/kudu/common/columnar_serialization.cc
b/src/kudu/common/columnar_serialization.cc
index 6c481d9..3fd6b12 100644
--- a/src/kudu/common/columnar_serialization.cc
+++ b/src/kudu/common/columnar_serialization.cc
@@ -17,12 +17,16 @@
#include "kudu/common/columnar_serialization.h"
+#ifdef __aarch64__
+#include "kudu/util/sse2neon.h" // IWYU pragma: keep
+#else
#include <emmintrin.h>
#include <immintrin.h>
+#endif
#include <cstring>
#include <ostream>
-#include <string>
+#include <string> // IWYU pragma: keep
#include <vector>
#include <glog/logging.h>
diff --git a/src/kudu/common/key_encoder.h b/src/kudu/common/key_encoder.h
index 5b2cd76..214cc57 100644
--- a/src/kudu/common/key_encoder.h
+++ b/src/kudu/common/key_encoder.h
@@ -18,8 +18,10 @@
#ifndef KUDU_COMMON_KEYENCODER_H
#define KUDU_COMMON_KEYENCODER_H
+#ifndef __aarch64__
#include <emmintrin.h>
#include <smmintrin.h>
+#endif
#include <climits>
#include <cstdint>
@@ -244,6 +246,9 @@ struct KeyEncoderTraits<BINARY, Buffer> {
// REQUIRES: len == 16 or 8
template<int LEN>
static bool SSEEncodeChunk(const uint8_t** srcp, uint8_t** dstp) {
+ #ifdef __aarch64__
+ return false;
+ #else
COMPILE_ASSERT(LEN == 16 || LEN == 8, invalid_length);
__m128i data;
if (LEN == 16) {
@@ -280,6 +285,7 @@ struct KeyEncoderTraits<BINARY, Buffer> {
*dstp += LEN;
*srcp += LEN;
return true;
+ #endif //__aarch64__
}
// Non-SSE loop which encodes 'len' bytes from 'srcp' into 'dst'.
diff --git a/src/kudu/common/zp7.cc b/src/kudu/common/zp7.cc
index de817d7..9bf2835 100644
--- a/src/kudu/common/zp7.cc
+++ b/src/kudu/common/zp7.cc
@@ -38,13 +38,17 @@
#ifdef __x86_64__
#include <emmintrin.h>
+#elif defined(__aarch64__)
+#include "kudu/util/sse2neon.h"
#endif
+#ifndef __aarch64__
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5
#define USE_INLINE_ASM_CLMUL
#else
#include <wmmintrin.h>
#endif
+#endif
#define N_BITS (6)
@@ -85,7 +89,9 @@ static inline __m128i asm_mm_clmulepi64_si128(__m128i a,
__m128i b) {
// the same masks).
//
// This variant depends on the CLMUL instruction.
+#ifndef __aarch64__
__attribute__((target("pclmul")))
+#endif // __aarch64__
ATTRIBUTE_NO_SANITIZE_INTEGER
static zp7_masks_64_t zp7_ppp_64_clmul(uint64_t mask) {
zp7_masks_64_t r;
diff --git a/src/kudu/gutil/CMakeLists.txt b/src/kudu/gutil/CMakeLists.txt
index 53bf62d..02808d9 100644
--- a/src/kudu/gutil/CMakeLists.txt
+++ b/src/kudu/gutil/CMakeLists.txt
@@ -16,7 +16,6 @@
# under the License.
set(GUTIL_SRCS
- atomicops-internals-x86.cc
bits.cc
cpu.cc
dynamic_annotations.c
@@ -51,6 +50,12 @@ set(GUTIL_SRCS
utf/rune.c
walltime.cc)
+if (NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+set(GUTIL_SRCS
+ atomicops-internals-x86.cc
+ ${GUTIL_SRCS})
+endif()
+
set(GUTIL_LIBS
glog
protobuf)
diff --git a/src/kudu/gutil/cpu.cc b/src/kudu/gutil/cpu.cc
index e108304..318d397 100644
--- a/src/kudu/gutil/cpu.cc
+++ b/src/kudu/gutil/cpu.cc
@@ -4,10 +4,12 @@
#include "kudu/gutil/cpu.h"
+#ifndef __aarch64__
#include <cstring>
#include <utility>
#include "kudu/gutil/integral_types.h"
+#endif //__aarch64__
#if defined(__x86_64__)
#if defined(_MSC_VER)
@@ -274,6 +276,9 @@ void CPU::Initialize() {
#elif defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) ||
defined(OS_LINUX))
cpu_brand_.assign(g_lazy_cpuinfo.Get().brand());
has_broken_neon_ = g_lazy_cpuinfo.Get().has_broken_neon();
+#elif defined(__aarch64__)
+ cpu_brand_.assign("ARM64");
+ has_broken_neon_ = false;
#else
#error unknown architecture
#endif
diff --git a/src/kudu/gutil/cycleclock-inl.h b/src/kudu/gutil/cycleclock-inl.h
index c913d51..a7b4059 100644
--- a/src/kudu/gutil/cycleclock-inl.h
+++ b/src/kudu/gutil/cycleclock-inl.h
@@ -147,7 +147,20 @@ inline int64 CycleClock::Now() {
}
// ----------------------------------------------------------------
-#elif defined(ARMV6) // V6 is the earliest arm that has a standard cyclecount
+#elif defined(__aarch64__)
+#include "kudu/gutil/sysinfo.h"
+inline int64 CycleClock::Now() {
+ // The system timer of ARMv8 runs at a different frequency than the CPU.
+ // The frequency is fixed, typically in the range 1-50MHz, and can be
+ // read from the CNTFRQ special register. We assume the OS has set up
+ // the virtual timer properly.
+ int64_t virtual_timer_value;
+ asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
+ return virtual_timer_value;
+}
+
+// ----------------------------------------------------------------
+#elif defined(ARMV6) // V6 is the earliest arm that has a standard cyclecount
#include "kudu/gutil/sysinfo.h"
inline int64 CycleClock::Now() {
uint32 pmccntr;
diff --git a/src/kudu/gutil/dynamic_annotations.h
b/src/kudu/gutil/dynamic_annotations.h
index 7e03d45..5cc5aba 100644
--- a/src/kudu/gutil/dynamic_annotations.h
+++ b/src/kudu/gutil/dynamic_annotations.h
@@ -57,6 +57,7 @@
#ifndef __DYNAMIC_ANNOTATIONS_H__
#define __DYNAMIC_ANNOTATIONS_H__
+#include <stddef.h>
#ifndef DYNAMIC_ANNOTATIONS_ENABLED
# define DYNAMIC_ANNOTATIONS_ENABLED 0
#endif
diff --git a/src/kudu/gutil/port.h b/src/kudu/gutil/port.h
index f0b06d7..f5155b2 100644
--- a/src/kudu/gutil/port.h
+++ b/src/kudu/gutil/port.h
@@ -324,6 +324,8 @@ inline void* memrchr(const void* bytes, int find_char,
size_t len) {
// TODO(user) This is the L1 D-cache line size of our Power7 machines.
// Need to check if this is appropriate for other PowerPC64 systems.
#define CACHELINE_SIZE 128
+#elif defined(__aarch64__)
+#define CACHELINE_SIZE 64
#elif defined(__arm__)
// Cache line sizes for ARM: These values are not strictly correct since
// cache line sizes depend on implementations, not architectures. There
diff --git a/src/kudu/gutil/spinlock.h b/src/kudu/gutil/spinlock.h
index eced3ef..eb85d10 100644
--- a/src/kudu/gutil/spinlock.h
+++ b/src/kudu/gutil/spinlock.h
@@ -76,6 +76,9 @@ class LOCKABLE SpinLock {
SlowLock();
}
ANNOTATE_RWLOCK_ACQUIRED(this, 1);
+#ifdef __aarch64__
+ __asm__ __volatile__ ("dmb ish" ::: "memory");
+#endif //__aarch64__
}
// Try to acquire this SpinLock without blocking and return true if the
@@ -89,6 +92,9 @@ class LOCKABLE SpinLock {
if (res) {
ANNOTATE_RWLOCK_ACQUIRED(this, 1);
}
+#ifdef __aarch64__
+ __asm__ __volatile__ ("dmb ish" ::: "memory");
+#endif //__aarch64__
return res;
}
@@ -105,6 +111,9 @@ class LOCKABLE SpinLock {
// for the lock.
SlowUnlock(wait_cycles);
}
+#ifdef __aarch64__
+ __asm__ __volatile__ ("dmb ish" ::: "memory");
+#endif //__aarch64__
}
// Determine if the lock is held. When the lock is held by the invoking
diff --git a/src/kudu/gutil/spinlock_linux-inl.h
b/src/kudu/gutil/spinlock_linux-inl.h
index 54e16a7..ebfe570 100644
--- a/src/kudu/gutil/spinlock_linux-inl.h
+++ b/src/kudu/gutil/spinlock_linux-inl.h
@@ -42,6 +42,9 @@
#define FUTEX_WAKE 1
#define FUTEX_PRIVATE_FLAG 128
+// Note: Instead of making direct system calls that are inlined, we rely
+// on the syscall() function in glibc to do the right thing.
+
static bool have_futex;
static int futex_private_flag = FUTEX_PRIVATE_FLAG;
diff --git a/src/kudu/rpc/rpc-test-base.h b/src/kudu/rpc/rpc-test-base.h
index 5d19137..fb88a79 100644
--- a/src/kudu/rpc/rpc-test-base.h
+++ b/src/kudu/rpc/rpc-test-base.h
@@ -416,7 +416,7 @@ class RpcTestBase : public KuduTest {
public:
RpcTestBase()
: n_worker_threads_(3),
- service_queue_length_(100),
+ service_queue_length_(200),
n_server_reactor_threads_(3),
keepalive_time_ms_(1000),
metric_entity_(METRIC_ENTITY_server.Instantiate(&metric_registry_,
"test.rpc_test")) {
diff --git a/src/kudu/util/block_bloom_filter.cc
b/src/kudu/util/block_bloom_filter.cc
index 2c80f79..16a5b7b 100644
--- a/src/kudu/util/block_bloom_filter.cc
+++ b/src/kudu/util/block_bloom_filter.cc
@@ -17,8 +17,12 @@
#include "kudu/util/block_bloom_filter.h"
+#ifdef __aarch64__
+#include "kudu/util/sse2neon.h"
+#else //__aarch64__
#include <emmintrin.h>
#include <mm_malloc.h>
+#endif
#include <algorithm>
#include <cmath>
@@ -183,10 +187,17 @@ void BlockBloomFilter::BucketInsert(const uint32_t
bucket_idx, const uint32_t ha
new_bucket[i] = 1U << new_bucket[i];
}
for (int i = 0; i < 2; ++i) {
+#ifdef __aarch64__
+ // IWYU pragma: no_include <arm_neon.h>
+ uint8x16_t new_bucket_neon = vreinterpretq_u8_u32(vld1q_u32(new_bucket + 4
* i));
+ uint8x16_t* existing_bucket =
reinterpret_cast<uint8x16_t*>(&directory_[bucket_idx][4 * i]);
+ *existing_bucket = vorrq_u8(*existing_bucket, new_bucket_neon);
+#else
__m128i new_bucket_sse =
_mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i));
__m128i* existing_bucket = reinterpret_cast<__m128i*>(
&DCHECK_NOTNULL(directory_)[bucket_idx][4 * i]);
*existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse);
+#endif
}
}
diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc
index 7bcb787..44a44aa 100644
--- a/src/kudu/util/debug-util.cc
+++ b/src/kudu/util/debug-util.cc
@@ -43,7 +43,11 @@
#include <glog/raw_logging.h>
#ifdef __linux__
#define UNW_LOCAL_ONLY
+#ifdef __aarch64__
+#include <libunwind-aarch64.h>
+#else
#include <libunwind.h>
+#endif //__aarch64__
#endif
#include "kudu/gutil/basictypes.h"
diff --git a/src/kudu/util/debug-util.h b/src/kudu/util/debug-util.h
index e8c94ea..db4e8d1 100644
--- a/src/kudu/util/debug-util.h
+++ b/src/kudu/util/debug-util.h
@@ -31,6 +31,10 @@
#include "kudu/gutil/strings/fastmem.h"
#include "kudu/util/status.h"
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_PRIVATE_FLAG 128
+
namespace kudu {
template <typename T> class ArrayView;
diff --git a/src/kudu/util/group_varint-inl.h b/src/kudu/util/group_varint-inl.h
index 27e289f..598881d 100644
--- a/src/kudu/util/group_varint-inl.h
+++ b/src/kudu/util/group_varint-inl.h
@@ -17,13 +17,18 @@
#ifndef KUDU_UTIL_GROUP_VARINT_INL_H
#define KUDU_UTIL_GROUP_VARINT_INL_H
-#include <emmintrin.h>
#ifdef __linux__
#include <endian.h>
#endif
+
+#ifdef __aarch64__
+#include "kudu/util/sse2neon.h"
+#else
+#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
+#endif //__aarch64__
#include <cstdint>
#include <cstring>
@@ -123,6 +128,7 @@ inline const uint8_t *DecodeGroupVarInt32_SlowButSafe(
}
+#ifndef __aarch64__
inline void DoExtractM128(__m128i results,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
#define SSE_USE_EXTRACT_PS
@@ -202,6 +208,7 @@ inline const uint8_t *DecodeGroupVarInt32_SSE_Add(
return src;
}
+#endif //__aarch64__
// Append a set of group-varint encoded integers to the given faststring.
inline void AppendGroupVarInt32(
diff --git a/src/kudu/util/group_varint-test.cc
b/src/kudu/util/group_varint-test.cc
index 505da77..ac15ef1 100644
--- a/src/kudu/util/group_varint-test.cc
+++ b/src/kudu/util/group_varint-test.cc
@@ -72,6 +72,7 @@ static void DoTestRoundTripGVI32(
const uint8_t *end;
+#ifndef __aarch64__
if (use_sse) {
end = DecodeGroupVarInt32_SSE(
buf.data(), &ret[0], &ret[1], &ret[2], &ret[3]);
@@ -79,6 +80,9 @@ static void DoTestRoundTripGVI32(
end = DecodeGroupVarInt32(
buf.data(), &ret[0], &ret[1], &ret[2], &ret[3]);
}
+#else
+ end = DecodeGroupVarInt32(buf.data(), &ret[0], &ret[1], &ret[2], &ret[3]);
+#endif //__aarch64__
ASSERT_EQ(a, ret[0]);
ASSERT_EQ(b, ret[1]);
diff --git a/src/kudu/util/init.cc b/src/kudu/util/init.cc
index 5267730..b6c8197 100644
--- a/src/kudu/util/init.cc
+++ b/src/kudu/util/init.cc
@@ -66,6 +66,9 @@ void CheckStandardFds() {
Status CheckCPUFlags() {
base::CPU cpu;
+ if (!cpu.has_broken_neon() && cpu.cpu_brand()=="ARM64") {
+ return Status::OK();
+ }
if (!cpu.has_sse42()) {
return BadCPUStatus(cpu, "SSE4.2");
}
diff --git a/src/kudu/util/memory/memory.cc b/src/kudu/util/memory/memory.cc
index b3964df..58924f8 100644
--- a/src/kudu/util/memory/memory.cc
+++ b/src/kudu/util/memory/memory.cc
@@ -20,7 +20,12 @@
#include "kudu/util/memory/memory.h"
+#ifdef __aarch64__
+#define _mm_free(p) free(p)
+#define _mm_malloc(a, b) malloc(a)
+#else
#include <mm_malloc.h>
+#endif //__aarch64__
#include <algorithm>
#include <cstdlib>
@@ -30,8 +35,8 @@
#include "kudu/util/alignment.h"
#include "kudu/util/flag_tags.h"
-#include "kudu/util/memory/overwrite.h"
#include "kudu/util/mem_tracker.h"
+#include "kudu/util/memory/overwrite.h"
using std::copy;
using std::min;
diff --git a/src/kudu/util/notification.h b/src/kudu/util/notification.h
index 968afbb..6b38b8d 100644
--- a/src/kudu/util/notification.h
+++ b/src/kudu/util/notification.h
@@ -25,6 +25,10 @@
#include "kudu/util/countdown_latch.h"
#endif
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_PRIVATE_FLAG 128
+
namespace kudu {
// This class defines a `Notification` abstraction, which allows threads
diff --git a/src/kudu/util/striped64.cc b/src/kudu/util/striped64.cc
index 789a395..6f60ff4 100644
--- a/src/kudu/util/striped64.cc
+++ b/src/kudu/util/striped64.cc
@@ -17,7 +17,13 @@
#include "kudu/util/striped64.h"
+#ifdef __aarch64__
+#define _mm_free(p) free(p)
+#define _mm_malloc(a, b) malloc(a)
+#else
#include <mm_malloc.h>
+#endif //__aarch64__
+
#include <unistd.h>
#include <cstdlib>
diff --git a/thirdparty/build-definitions.sh b/thirdparty/build-definitions.sh
index 19ba003..cb93032 100644
--- a/thirdparty/build-definitions.sh
+++ b/thirdparty/build-definitions.sh
@@ -454,7 +454,8 @@ build_gperftools() {
$GPERFTOOLS_SOURCE/configure \
--enable-frame-pointers \
--with-pic \
- --prefix=$PREFIX
+ --prefix=$PREFIX \
+ --enable-emergency-malloc
fixup_libtool
make -j$PARALLEL $EXTRA_MAKEFLAGS install
popd