This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 366e8083a2 GH-35116: [CI][C++] Enable compile-time AVX2 on some CI
platforms (#36662)
366e8083a2 is described below
commit 366e8083a2bd6d24ad371548699ef936fb7bb468
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jul 19 11:19:49 2023 +0200
GH-35116: [CI][C++] Enable compile-time AVX2 on some CI platforms (#36662)
AVX2 became mainline on Intel and AMD server CPUs around 2015, so it is very
likely to be available on current cloud platforms:
https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#CPUs_with_AVX2
Enable it on at least one Windows and one Linux CI platform.
x86 macOS is a legacy platform, so it is less interesting to exercise AVX2 there
(and I'm not sure the old CPUs in x86 Macs actually support AVX2).
Also, fix the buggy AVX2 activation logic in Acero and avoid force-testing
AVX2 on incompatible systems.
* Closes: #35116
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
.github/workflows/cpp.yml | 14 +++-
ci/scripts/cpp_build.sh | 1 +
cpp/cmake_modules/SetupCxxFlags.cmake | 49 ++++++------
cpp/src/arrow/CMakeLists.txt | 22 +++---
cpp/src/arrow/acero/CMakeLists.txt | 6 +-
cpp/src/arrow/acero/bloom_filter.cc | 8 +-
cpp/src/arrow/acero/bloom_filter.h | 5 +-
cpp/src/arrow/acero/bloom_filter_avx2.cc | 5 +-
cpp/src/arrow/acero/bloom_filter_test.cc | 85 +++++++++-----------
cpp/src/arrow/acero/swiss_join_avx2.cc | 4 -
cpp/src/arrow/acero/swiss_join_internal.h | 2 +-
cpp/src/arrow/acero/test_util_internal.cc | 9 +++
cpp/src/arrow/acero/test_util_internal.h | 12 +--
cpp/src/arrow/compute/kernels/CMakeLists.txt | 54 ++++++++++---
cpp/src/arrow/compute/key_hash.cc | 6 +-
cpp/src/arrow/compute/key_hash.h | 4 +-
cpp/src/arrow/compute/key_hash_avx2.cc | 4 -
cpp/src/arrow/compute/key_hash_test.cc | 92 +++++++++++-----------
cpp/src/arrow/compute/key_map.cc | 4 +-
cpp/src/arrow/compute/key_map.h | 2 +-
cpp/src/arrow/compute/key_map_avx2.cc | 4 -
cpp/src/arrow/compute/row/compare_internal.cc | 8 +-
cpp/src/arrow/compute/row/compare_internal.h | 2 +-
cpp/src/arrow/compute/row/compare_internal_avx2.cc | 4 -
cpp/src/arrow/compute/row/encode_internal.cc | 10 +--
cpp/src/arrow/compute/row/encode_internal.h | 6 +-
cpp/src/arrow/compute/row/encode_internal_avx2.cc | 4 -
cpp/src/arrow/compute/util.cc | 10 +--
cpp/src/arrow/compute/util.h | 2 +-
cpp/src/arrow/compute/util_avx2.cc | 8 +-
cpp/src/arrow/testing/util.cc | 15 ++++
cpp/src/arrow/testing/util.h | 6 ++
cpp/src/arrow/util/byte_stream_split.h | 28 ++++---
docker-compose.yml | 30 ++++---
34 files changed, 289 insertions(+), 236 deletions(-)
diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 67435566ce..63a16c8c11 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -67,7 +67,8 @@ jobs:
image: conda-cpp
llvm: "14"
runs-on: ubuntu-latest
- title: AMD64 Conda C++
+ simd-level: AVX2
+ title: AMD64 Conda C++ AVX2
ubuntu: "22.04"
- arch: amd64
clang-tools: "14"
@@ -85,6 +86,7 @@ jobs:
ubuntu: "20.04"
env:
ARCH: ${{ matrix.arch }}
+ ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
CLANG_TOOLS: ${{ matrix.clang-tools }}
LLVM: ${{ matrix.llvm }}
UBUNTU: ${{ matrix.ubuntu }}
@@ -175,6 +177,10 @@ jobs:
ARROW_WITH_ZSTD: ON
GTest_SOURCE: BUNDLED
steps:
+ - name: CPU Info
+ run: |
+ sysctl -a | grep cpu
+ sysctl -a | grep "hw.optional"
- name: Checkout Arrow
uses: actions/checkout@v3
with:
@@ -220,7 +226,7 @@ jobs:
ci/scripts/cpp_test.sh $(pwd) $(pwd)/build
windows:
- name: AMD64 ${{ matrix.name }} C++17
+ name: ${{ matrix.title }}
runs-on: ${{ matrix.os }}
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 60
@@ -231,7 +237,8 @@ jobs:
- windows-2019
include:
- os: windows-2019
- name: Windows 2019
+ simd-level: AVX2
+ title: AMD64 Windows 2019 C++17 AVX2
env:
ARROW_BOOST_USE_SHARED: OFF
ARROW_BUILD_BENCHMARKS: ON
@@ -246,6 +253,7 @@ jobs:
ARROW_MIMALLOC: ON
ARROW_ORC: ON
ARROW_PARQUET: ON
+ ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
ARROW_USE_GLOG: OFF
ARROW_VERBOSE_THIRDPARTY_BUILD: OFF
ARROW_WITH_BROTLI: OFF
diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh
index f0f893c419..e53b3fa460 100755
--- a/ci/scripts/cpp_build.sh
+++ b/ci/scripts/cpp_build.sh
@@ -126,6 +126,7 @@ cmake \
-DARROW_PARQUET=${ARROW_PARQUET:-OFF} \
-DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \
-DARROW_S3=${ARROW_S3:-OFF} \
+ -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \
-DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \
-DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-ON} \
-DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index 076c2e7450..6b47fcb717 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -62,29 +62,32 @@ if(ARROW_CPU_FLAG STREQUAL "x86")
"${ARROW_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq
-mavx512bw")
check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2)
endif()
- check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2)
- if(MINGW)
- # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
- message(STATUS "Disable AVX512 support on MINGW for now")
- else()
- # Check for AVX512 support in the compiler.
- set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS})
- set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}")
- check_cxx_source_compiles("
- #ifdef _MSC_VER
- #include <intrin.h>
- #else
- #include <immintrin.h>
- #endif
-
- int main() {
- __m512i mask = _mm512_set1_epi32(0x1);
- char out[32];
- _mm512_storeu_si512(out, mask);
- return 0;
- }"
- CXX_SUPPORTS_AVX512)
- set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS})
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ # Check for AVX extensions on 64-bit systems only, as 32-bit support seems iffy
+ check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2)
+ if(MINGW)
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
+ message(STATUS "Disable AVX512 support on MINGW for now")
+ else()
+ # Check for AVX512 support in the compiler.
+ set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+ set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}")
+ check_cxx_source_compiles("
+ #ifdef _MSC_VER
+ #include <intrin.h>
+ #else
+ #include <immintrin.h>
+ #endif
+
+ int main() {
+ __m512i mask = _mm512_set1_epi32(0x1);
+ char out[32];
+ _mm512_storeu_si512(out, mask);
+ return 0;
+ }"
+ CXX_SUPPORTS_AVX512)
+ set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS})
+ endif()
endif()
# Runtime SIMD level it can get from compiler and ARROW_RUNTIME_SIMD_LEVEL
if(CXX_SUPPORTS_SSE4_2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index fccff6c8cf..a398e790de 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -119,7 +119,7 @@ function(ADD_ARROW_BENCHMARK REL_TEST_NAME)
${ARG_UNPARSED_ARGUMENTS})
endfunction()
-macro(append_avx2_src SRC)
+macro(append_runtime_avx2_src SRC)
if(ARROW_HAVE_RUNTIME_AVX2)
list(APPEND ARROW_SRCS ${SRC})
set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
@@ -127,7 +127,7 @@ macro(append_avx2_src SRC)
endif()
endmacro()
-macro(append_avx512_src SRC)
+macro(append_runtime_avx512_src SRC)
if(ARROW_HAVE_RUNTIME_AVX512)
list(APPEND ARROW_SRCS ${SRC})
set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
@@ -254,8 +254,8 @@ if(ARROW_JEMALLOC)
PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
endif()
-append_avx2_src(util/bpacking_avx2.cc)
-append_avx512_src(util/bpacking_avx512.cc)
+append_runtime_avx2_src(util/bpacking_avx2.cc)
+append_runtime_avx512_src(util/bpacking_avx512.cc)
if(ARROW_HAVE_NEON)
list(APPEND ARROW_SRCS util/bpacking_neon.cc)
@@ -425,11 +425,11 @@ list(APPEND
compute/row/row_internal.cc
compute/util.cc)
-append_avx2_src(compute/key_hash_avx2.cc)
-append_avx2_src(compute/key_map_avx2.cc)
-append_avx2_src(compute/row/compare_internal_avx2.cc)
-append_avx2_src(compute/row/encode_internal_avx2.cc)
-append_avx2_src(compute/util_avx2.cc)
+append_runtime_avx2_src(compute/key_hash_avx2.cc)
+append_runtime_avx2_src(compute/key_map_avx2.cc)
+append_runtime_avx2_src(compute/row/compare_internal_avx2.cc)
+append_runtime_avx2_src(compute/row/encode_internal_avx2.cc)
+append_runtime_avx2_src(compute/util_avx2.cc)
if(ARROW_COMPUTE)
# Include the remaining kernels
@@ -464,8 +464,8 @@ if(ARROW_COMPUTE)
compute/kernels/vector_select_k.cc
compute/kernels/vector_sort.cc)
- append_avx2_src(compute/kernels/aggregate_basic_avx2.cc)
- append_avx512_src(compute/kernels/aggregate_basic_avx512.cc)
+ append_runtime_avx2_src(compute/kernels/aggregate_basic_avx2.cc)
+ append_runtime_avx512_src(compute/kernels/aggregate_basic_avx512.cc)
endif()
if(ARROW_FILESYSTEM)
diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt
index 287884432b..c2c91db58d 100644
--- a/cpp/src/arrow/acero/CMakeLists.txt
+++ b/cpp/src/arrow/acero/CMakeLists.txt
@@ -19,7 +19,7 @@ add_custom_target(arrow_acero)
arrow_install_all_headers("arrow/acero")
-macro(append_acero_avx2_src SRC)
+macro(append_acero_runtime_avx2_src SRC)
if(ARROW_HAVE_RUNTIME_AVX2)
list(APPEND ARROW_ACERO_SRCS ${SRC})
set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
@@ -56,8 +56,8 @@ set(ARROW_ACERO_SRCS
union_node.cc
util.cc)
-append_acero_avx2_src(bloom_filter_avx2.cc)
-append_acero_avx2_src(swiss_join_avx2.cc)
+append_acero_runtime_avx2_src(bloom_filter_avx2.cc)
+append_acero_runtime_avx2_src(swiss_join_avx2.cc)
set(ARROW_ACERO_SHARED_LINK_LIBS)
set(ARROW_ACERO_STATIC_LINK_LIBS)
diff --git a/cpp/src/arrow/acero/bloom_filter.cc b/cpp/src/arrow/acero/bloom_filter.cc
index ad5e66ded0..b9855ee506 100644
--- a/cpp/src/arrow/acero/bloom_filter.cc
+++ b/cpp/src/arrow/acero/bloom_filter.cc
@@ -123,7 +123,7 @@ void BlockedBloomFilter::InsertImp(int64_t num_rows, const
T* hashes) {
void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows,
const uint32_t* hashes) {
int64_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
num_processed = Insert_avx2(num_rows, hashes);
}
@@ -134,7 +134,7 @@ void BlockedBloomFilter::Insert(int64_t hardware_flags,
int64_t num_rows,
void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows,
const uint64_t* hashes) {
int64_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
num_processed = Insert_avx2(num_rows, hashes);
}
@@ -181,7 +181,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags,
int64_t num_rows,
bool enable_prefetch) const {
int64_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (!(enable_prefetch && UsePrefetch()) &&
(hardware_flags & arrow::internal::CpuInfo::AVX2)) {
num_processed = Find_avx2(num_rows, hashes, result_bit_vector);
@@ -202,7 +202,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags,
int64_t num_rows,
bool enable_prefetch) const {
int64_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (!(enable_prefetch && UsePrefetch()) &&
(hardware_flags & arrow::internal::CpuInfo::AVX2)) {
num_processed = Find_avx2(num_rows, hashes, result_bit_vector);
diff --git a/cpp/src/arrow/acero/bloom_filter.h b/cpp/src/arrow/acero/bloom_filter.h
index b8f7f8cd25..50d07bfd94 100644
--- a/cpp/src/arrow/acero/bloom_filter.h
+++ b/cpp/src/arrow/acero/bloom_filter.h
@@ -17,13 +17,14 @@
#pragma once
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
#include <immintrin.h>
#endif
#include <atomic>
#include <cstdint>
#include <memory>
+
#include "arrow/acero/partition_util.h"
#include "arrow/acero/util.h"
#include "arrow/memory_pool.h"
@@ -203,7 +204,7 @@ class ARROW_ACERO_EXPORT BlockedBloomFilter {
void SingleFold(int num_folds);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
inline __m256i mask_avx2(__m256i hash) const;
inline __m256i block_id_avx2(__m256i hash) const;
int64_t Insert_avx2(int64_t num_rows, const uint32_t* hashes);
diff --git a/cpp/src/arrow/acero/bloom_filter_avx2.cc b/cpp/src/arrow/acero/bloom_filter_avx2.cc
index b6c281276d..5816bb4fc0 100644
--- a/cpp/src/arrow/acero/bloom_filter_avx2.cc
+++ b/cpp/src/arrow/acero/bloom_filter_avx2.cc
@@ -16,14 +16,13 @@
// under the License.
#include <immintrin.h>
+
#include "arrow/acero/bloom_filter.h"
#include "arrow/util/bit_util.h"
namespace arrow {
namespace acero {
-#if defined(ARROW_HAVE_AVX2)
-
inline __m256i BlockedBloomFilter::mask_avx2(__m256i hash) const {
// AVX2 translation of mask() method
//
@@ -132,7 +131,5 @@ int64_t BlockedBloomFilter::Insert_avx2(int64_t num_rows,
const uint64_t* hashes
return InsertImp_avx2(num_rows, hashes);
}
-#endif
-
} // namespace acero
} // namespace arrow
diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc
index de433ac68c..95375e277e 100644
--- a/cpp/src/arrow/acero/bloom_filter_test.cc
+++ b/cpp/src/arrow/acero/bloom_filter_test.cc
@@ -22,13 +22,13 @@
#include <condition_variable>
#include <thread>
#include <unordered_set>
+
#include "arrow/acero/bloom_filter.h"
#include "arrow/acero/task_util.h"
#include "arrow/acero/test_util_internal.h"
#include "arrow/acero/util.h"
#include "arrow/compute/key_hash.h"
#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/cpu_info.h"
namespace arrow {
@@ -171,9 +171,7 @@ void TestBloomSmallHashHelper(int64_t num_input_hashes,
const T* input_hashes,
// Output FPR and build and probe cost.
//
void TestBloomSmall(BloomFilterBuildStrategy strategy, int64_t num_build,
- int num_build_copies, bool use_simd, bool enable_prefetch)
{
- int64_t hardware_flags = use_simd ? ::arrow::internal::CpuInfo::AVX2 : 0;
-
+ int num_build_copies, int64_t hardware_flags, bool
enable_prefetch) {
// Generate input keys
//
int64_t num_probe = 4 * num_build;
@@ -324,10 +322,8 @@ void TestBloomLargeHashHelper(int64_t hardware_flags,
int64_t block,
// Test with larger size Bloom filters (use large prime with arithmetic
// sequence modulo 2^64).
//
-void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, bool
use_simd,
- bool enable_prefetch) {
- int64_t hardware_flags = use_simd ? ::arrow::internal::CpuInfo::AVX2 : 0;
-
+void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build,
+ int64_t hardware_flags, bool enable_prefetch) {
// Largest 63-bit prime
constexpr uint64_t prime = 0x7FFFFFFFFFFFFFE7ULL;
@@ -458,42 +454,40 @@ TEST(BloomFilter, Basic) {
num_build.push_back(1LL << log_large);
#endif
- constexpr int num_param_sets = 3;
- struct {
- bool use_avx2;
+ struct TestParam {
+ int64_t hardware_flags;
bool enable_prefetch;
bool insert_multiple_copies;
- } params[num_param_sets];
- for (int i = 0; i < num_param_sets; ++i) {
- params[i].use_avx2 = (i == 1);
- params[i].enable_prefetch = (i == 2);
- params[i].insert_multiple_copies = (i == 3);
+ };
+ std::vector<TestParam> test_params;
+ for (const auto hardware_flags : HardwareFlagsForTesting()) {
+ test_params.push_back({hardware_flags, false, false});
}
+ test_params.push_back({0, true, false});
+ test_params.push_back({0, false, true});
- std::vector<BloomFilterBuildStrategy> strategy;
- strategy.push_back(BloomFilterBuildStrategy::SINGLE_THREADED);
+ std::vector<BloomFilterBuildStrategy> strategies;
+ strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED);
#ifndef ARROW_VALGRIND
- strategy.push_back(BloomFilterBuildStrategy::PARALLEL);
+ strategies.push_back(BloomFilterBuildStrategy::PARALLEL);
#endif
static constexpr int64_t min_rows_for_large = 2 * 1024 * 1024;
- for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) {
- for (int iparam_set = 0; iparam_set < num_param_sets; ++iparam_set) {
- ARROW_SCOPED_TRACE("%s ", params[iparam_set].use_avx2 ?
"AVX2"
- : params[iparam_set].enable_prefetch ?
"PREFETCH"
- : params[iparam_set].insert_multiple_copies ?
"FOLDING"
- :
"REGULAR");
- for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build)
{
- ARROW_SCOPED_TRACE("num_build ",
static_cast<int>(num_build[inum_build]));
- if (num_build[inum_build] >= min_rows_for_large) {
- TestBloomLarge(strategy[istrategy], num_build[inum_build],
- params[iparam_set].use_avx2,
params[iparam_set].enable_prefetch);
+ for (const auto& strategy : strategies) {
+ for (const auto& test_param : test_params) {
+ ARROW_SCOPED_TRACE("hardware_flags = ", test_param.hardware_flags,
+ test_param.enable_prefetch ? " PREFETCH" : "",
+ test_param.insert_multiple_copies ? " FOLDING" :
"REGULAR");
+ for (const auto n : num_build) {
+ ARROW_SCOPED_TRACE("num_build ", n);
+ if (n >= min_rows_for_large) {
+ TestBloomLarge(strategy, n, test_param.hardware_flags,
+ test_param.enable_prefetch);
} else {
- TestBloomSmall(strategy[istrategy], num_build[inum_build],
- params[iparam_set].insert_multiple_copies ? 8 : 1,
- params[iparam_set].use_avx2,
params[iparam_set].enable_prefetch);
+ TestBloomSmall(strategy, n, test_param.insert_multiple_copies ? 8 :
1,
+ test_param.hardware_flags,
test_param.enable_prefetch);
}
}
}
@@ -506,19 +500,18 @@ TEST(BloomFilter, Scaling) {
num_build.push_back(1000000);
num_build.push_back(4000000);
- std::vector<BloomFilterBuildStrategy> strategy;
- strategy.push_back(BloomFilterBuildStrategy::PARALLEL);
-
- for (bool use_avx2 : {false, true}) {
- for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) {
- for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build)
{
- ARROW_SCOPED_TRACE("num_build = ",
static_cast<int>(num_build[inum_build]));
- ARROW_SCOPED_TRACE("strategy = ",
- strategy[istrategy] ==
BloomFilterBuildStrategy::PARALLEL
- ? "PARALLEL"
- : "SINGLE_THREADED");
- ARROW_SCOPED_TRACE("avx2 = ", use_avx2 ? "AVX2" : "SCALAR");
- TestBloomLarge(strategy[istrategy], num_build[inum_build], use_avx2,
+ std::vector<BloomFilterBuildStrategy> strategies;
+ strategies.push_back(BloomFilterBuildStrategy::PARALLEL);
+
+ for (const auto hardware_flags : HardwareFlagsForTesting()) {
+ for (const auto& strategy : strategies) {
+ for (const auto n : num_build) {
+ ARROW_SCOPED_TRACE("num_build = ", n);
+ ARROW_SCOPED_TRACE("strategy = ", strategy ==
BloomFilterBuildStrategy::PARALLEL
+ ? "PARALLEL"
+ : "SINGLE_THREADED");
+ ARROW_SCOPED_TRACE("hardware_flags = ", hardware_flags);
+ TestBloomLarge(strategy, n, hardware_flags,
/*enable_prefetch=*/false);
}
}
diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc
index d5c0b7817f..0888dd8938 100644
--- a/cpp/src/arrow/acero/swiss_join_avx2.cc
+++ b/cpp/src/arrow/acero/swiss_join_avx2.cc
@@ -23,8 +23,6 @@
namespace arrow {
namespace acero {
-#if defined(ARROW_HAVE_AVX2)
-
template <class PROCESS_8_VALUES_FN>
int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int
num_rows,
const uint32_t* row_ids,
@@ -191,7 +189,5 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl&
rows, int column_id,
return num_rows - (num_rows % unroll);
}
-#endif
-
} // namespace acero
} // namespace arrow
diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h
index cd12b34a0c..88b80f06f5 100644
--- a/cpp/src/arrow/acero/swiss_join_internal.h
+++ b/cpp/src/arrow/acero/swiss_join_internal.h
@@ -80,7 +80,7 @@ class RowArrayAccessor {
const uint32_t* row_ids, PROCESS_VALUE_FN
process_value_fn);
private:
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
// This is equivalent to Visit method, but processing 8 rows at a time in a
// loop.
// Returns the number of processed rows, which may be less than requested (up
diff --git a/cpp/src/arrow/acero/test_util_internal.cc b/cpp/src/arrow/acero/test_util_internal.cc
index 2042650be6..f50ca92238 100644
--- a/cpp/src/arrow/acero/test_util_internal.cc
+++ b/cpp/src/arrow/acero/test_util_internal.cc
@@ -45,8 +45,10 @@
#include "arrow/testing/builder.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
+#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/util/async_generator.h"
+#include "arrow/util/cpu_info.h"
#include "arrow/util/iterator.h"
#include "arrow/util/logging.h"
#include "arrow/util/unreachable.h"
@@ -54,6 +56,7 @@
namespace arrow {
+using arrow::internal::CpuInfo;
using arrow::internal::Executor;
using compute::SortKey;
@@ -62,6 +65,7 @@ using compute::Take;
namespace acero {
namespace {
+
void ValidateOutputImpl(const ArrayData& output) {
ASSERT_OK(::arrow::internal::ValidateArrayFull(output));
TestInitialized(output);
@@ -116,6 +120,11 @@ void ValidateOutput(const Datum& output) {
}
}
+std::vector<int64_t> HardwareFlagsForTesting() {
+ // Acero currently only has AVX2 optimizations
+ return arrow::GetSupportedHardwareFlags({CpuInfo::AVX2});
+}
+
namespace {
struct DummyNode : ExecNode {
diff --git a/cpp/src/arrow/acero/test_util_internal.h b/cpp/src/arrow/acero/test_util_internal.h
index 03f4170286..569fb1254d 100644
--- a/cpp/src/arrow/acero/test_util_internal.h
+++ b/cpp/src/arrow/acero/test_util_internal.h
@@ -20,6 +20,7 @@
#include "arrow/testing/gtest_util.h"
#include "arrow/util/vector.h"
+#include <cstdint>
#include <functional>
#include <random>
#include <string>
@@ -33,12 +34,14 @@
#include "arrow/util/async_generator.h"
#include "arrow/util/pcg_random.h"
-namespace arrow {
-
-namespace acero {
+namespace arrow::acero {
void ValidateOutput(const Datum& output);
+// Enumerate all hardware flags that can be tested on this platform
+// and would lead to different code paths being tested in Acero.
+std::vector<int64_t> HardwareFlagsForTesting();
+
using StartProducingFunc = std::function<Status(ExecNode*)>;
using StopProducingFunc = std::function<void(ExecNode*)>;
@@ -204,5 +207,4 @@ struct TableGenerationProperties {
Result<std::shared_ptr<Table>> MakeRandomTimeSeriesTable(
const TableGenerationProperties& properties);
-} // namespace acero
-} // namespace arrow
+} // namespace arrow::acero
diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index a17d6275a7..0bd6fe8613 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -18,11 +18,20 @@
# ----------------------------------------------------------------------
# Tests that don't require the full kernel library
+# Define arrow_compute_testing object library for common test files
+if(ARROW_TESTING)
+ add_library(arrow_compute_kernels_testing OBJECT test_util.cc)
+ # Even though this is still just an object library we still need to "link" our
+ # dependencies so that include paths are configured correctly
+ target_link_libraries(arrow_compute_kernels_testing ${ARROW_GTEST_GTEST})
+endif()
+
add_arrow_test(scalar_cast_test
${ARROW_COMPUTE_TEST_ARGS}
SOURCES
scalar_cast_test.cc
- test_util.cc)
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
# ----------------------------------------------------------------------
# Scalar kernels
@@ -32,25 +41,36 @@ add_arrow_compute_test(scalar_type_test
scalar_boolean_test.cc
scalar_nested_test.cc
scalar_string_test.cc
- test_util.cc)
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
-add_arrow_compute_test(scalar_if_else_test SOURCES scalar_if_else_test.cc
test_util.cc)
+add_arrow_compute_test(scalar_if_else_test
+ SOURCES
+ scalar_if_else_test.cc
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
-add_arrow_compute_test(scalar_temporal_test SOURCES scalar_temporal_test.cc
test_util.cc)
+add_arrow_compute_test(scalar_temporal_test
+ SOURCES
+ scalar_temporal_test.cc
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
add_arrow_compute_test(scalar_math_test
SOURCES
scalar_arithmetic_test.cc
scalar_compare_test.cc
scalar_round_arithmetic_test.cc
- test_util.cc)
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
add_arrow_compute_test(scalar_utility_test
SOURCES
scalar_random_test.cc
scalar_set_lookup_test.cc
scalar_validity_test.cc
- test_util.cc)
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute")
add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute")
@@ -75,12 +95,20 @@ add_arrow_compute_test(vector_test
vector_replace_test.cc
vector_run_end_encode_test.cc
select_k_test.cc
- test_util.cc)
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
-add_arrow_compute_test(vector_sort_test SOURCES vector_sort_test.cc
test_util.cc)
+add_arrow_compute_test(vector_sort_test
+ SOURCES
+ vector_sort_test.cc
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
-add_arrow_compute_test(vector_selection_test SOURCES vector_selection_test.cc
- test_util.cc)
+add_arrow_compute_test(vector_selection_test
+ SOURCES
+ vector_selection_test.cc
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute")
add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute")
@@ -94,7 +122,11 @@ add_arrow_benchmark(vector_selection_benchmark PREFIX
"arrow-compute")
# Aggregates
-add_arrow_compute_test(aggregate_test SOURCES aggregate_test.cc test_util.cc)
+add_arrow_compute_test(aggregate_test
+ SOURCES
+ aggregate_test.cc
+ EXTRA_LINK_LIBS
+ arrow_compute_kernels_testing)
# ----------------------------------------------------------------------
# Utilities
diff --git a/cpp/src/arrow/compute/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc
index 3fcfbf3d83..f5867b405e 100644
--- a/cpp/src/arrow/compute/key_hash.cc
+++ b/cpp/src/arrow/compute/key_hash.cc
@@ -236,7 +236,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool
combine_hashes, uint32_t
const uint32_t* offsets, const uint8_t*
concatenated_keys,
uint32_t* hashes, uint32_t*
hashes_temp_for_combine) {
uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets,
concatenated_keys,
hashes, hashes_temp_for_combine);
@@ -255,7 +255,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool
combine_hashes, uint32_t
const uint64_t* offsets, const uint8_t*
concatenated_keys,
uint32_t* hashes, uint32_t*
hashes_temp_for_combine) {
uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets,
concatenated_keys,
hashes, hashes_temp_for_combine);
@@ -361,7 +361,7 @@ void Hashing32::HashFixed(int64_t hardware_flags, bool
combine_hashes, uint32_t
}
uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
num_processed = HashFixedLen_avx2(combine_hashes, num_rows, length, keys,
hashes,
hashes_temp_for_combine);
diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h
index e43d7b8df5..b193716c9b 100644
--- a/cpp/src/arrow/compute/key_hash.h
+++ b/cpp/src/arrow/compute/key_hash.h
@@ -17,7 +17,7 @@
#pragma once
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
#include <immintrin.h>
#endif
@@ -115,7 +115,7 @@ class ARROW_EXPORT Hashing32 {
static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t
length_key,
const uint8_t* keys, uint32_t* hashes);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
static inline __m256i Avalanche_avx2(__m256i hash);
static inline __m256i CombineHashesImp_avx2(__m256i previous_hash, __m256i
hash);
template <bool T_COMBINE_HASHES>
diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_avx2.cc
index f30c3460bd..1b444b5767 100644
--- a/cpp/src/arrow/compute/key_hash_avx2.cc
+++ b/cpp/src/arrow/compute/key_hash_avx2.cc
@@ -23,8 +23,6 @@
namespace arrow {
namespace compute {
-#if defined(ARROW_HAVE_AVX2)
-
inline __m256i Hashing32::Avalanche_avx2(__m256i hash) {
hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15));
hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2));
@@ -315,7 +313,5 @@ uint32_t Hashing32::HashVarLen_avx2(bool combine_hashes,
uint32_t num_rows,
}
}
-#endif
-
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc
index d10645391b..3e6d41525c 100644
--- a/cpp/src/arrow/compute/key_hash_test.cc
+++ b/cpp/src/arrow/compute/key_hash_test.cc
@@ -21,18 +21,26 @@
#include <map>
#include <random>
#include <unordered_set>
+
#include "arrow/array/builder_binary.h"
#include "arrow/compute/key_hash.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
#include "arrow/util/cpu_info.h"
#include "arrow/util/pcg_random.h"
namespace arrow {
using internal::checked_pointer_cast;
+using internal::CpuInfo;
namespace compute {
+std::vector<int64_t> HardwareFlagsForTesting() {
+ // Our key-hash and key-map routines currently only have AVX2 optimizations
+ return GetSupportedHardwareFlags({CpuInfo::AVX2});
+}
+
class TestVectorHash {
private:
template <typename Type, typename ArrayType = typename
TypeTraits<Type>::ArrayType>
@@ -131,85 +139,79 @@ class TestVectorHash {
const offset_t* key_offsets =
reinterpret_cast<const offset_t*>(keys_array->raw_value_offsets());
- std::vector<uint32_t> hashes_scalar32;
- std::vector<uint64_t> hashes_scalar64;
- hashes_scalar32.resize(num_rows);
- hashes_scalar64.resize(num_rows);
- std::vector<uint32_t> hashes_simd32;
- std::vector<uint64_t> hashes_simd64;
- hashes_simd32.resize(num_rows);
- hashes_simd64.resize(num_rows);
-
- int64_t hardware_flags_scalar = 0LL;
- int64_t hardware_flags_simd = ::arrow::internal::CpuInfo::AVX2;
+ // For each tested hardware flags, we will compute the hashes and check
+ // them for consistency.
+ const auto hardware_flags_for_testing = HardwareFlagsForTesting();
+ ASSERT_GT(hardware_flags_for_testing.size(), 0);
+ std::vector<std::vector<uint32_t>>
hashes32(hardware_flags_for_testing.size());
+ std::vector<std::vector<uint64_t>>
hashes64(hardware_flags_for_testing.size());
+ for (auto& h : hashes32) {
+ h.resize(num_rows);
+ }
+ for (auto& h : hashes64) {
+ h.resize(num_rows);
+ }
constexpr int mini_batch_size = 1024;
std::vector<uint32_t> temp_buffer;
temp_buffer.resize(mini_batch_size * 4);
- for (bool use_simd : {false, true}) {
+ for (int i = 0; i < static_cast<int>(hardware_flags_for_testing.size());
++i) {
+ const auto hardware_flags = hardware_flags_for_testing[i];
if (use_32bit_hash) {
if (!use_varlen_input) {
- Hashing32::HashFixed(use_simd ? hardware_flags_simd :
hardware_flags_scalar,
+ Hashing32::HashFixed(hardware_flags,
/*combine_hashes=*/false, num_rows,
fixed_length, keys,
- use_simd ? hashes_simd32.data() :
hashes_scalar32.data(),
- temp_buffer.data());
+ hashes32[i].data(), temp_buffer.data());
} else {
for (int first_row = 0; first_row < num_rows;) {
int batch_size_next = std::min(num_rows - first_row,
mini_batch_size);
- Hashing32::HashVarLen(
- use_simd ? hardware_flags_simd : hardware_flags_scalar,
- /*combine_hashes=*/false, batch_size_next, key_offsets +
first_row, keys,
- (use_simd ? hashes_simd32.data() : hashes_scalar32.data()) +
first_row,
- temp_buffer.data());
+ Hashing32::HashVarLen(hardware_flags,
+ /*combine_hashes=*/false, batch_size_next,
+ key_offsets + first_row, keys,
+ hashes32[i].data() + first_row,
temp_buffer.data());
first_row += batch_size_next;
}
}
+ for (int j = 0; j < num_rows; ++j) {
+ hashes64[i][j] = hashes32[i][j];
+ }
} else {
if (!use_varlen_input) {
Hashing64::HashFixed(
- /*combine_hashes=*/false, num_rows, fixed_length, keys,
- use_simd ? hashes_simd64.data() : hashes_scalar64.data());
+ /*combine_hashes=*/false, num_rows, fixed_length, keys,
hashes64[i].data());
} else {
Hashing64::HashVarLen(
- /*combine_hashes=*/false, num_rows, key_offsets, keys,
- use_simd ? hashes_simd64.data() : hashes_scalar64.data());
+ /*combine_hashes=*/false, num_rows, key_offsets, keys,
hashes64[i].data());
}
}
}
- if (use_32bit_hash) {
- for (int i = 0; i < num_rows; ++i) {
- hashes_scalar64[i] = hashes_scalar32[i];
- hashes_simd64[i] = hashes_simd32[i];
- }
- }
-
- // Verify that both scalar and AVX2 implementations give the same hashes
+ // Verify that all implementations (scalar, SIMD) give the same hashes
//
- for (int i = 0; i < num_rows; ++i) {
- ASSERT_EQ(hashes_scalar64[i], hashes_simd64[i])
- << "scalar and simd approaches yielded different hashes";
+ const auto& hashes_scalar64 = hashes64[0];
+ for (int i = 0; i < static_cast<int>(hardware_flags_for_testing.size());
++i) {
+ for (int j = 0; j < num_rows; ++j) {
+ ASSERT_EQ(hashes64[i][j], hashes_scalar64[j])
+ << "scalar and simd approaches yielded different hashes";
+ }
}
// Verify that the same key appearing multiple times generates the same hash
// each time. Measure the number of unique hashes and compare to the number
// of unique keys.
//
- std::map<int, uint64_t> unique_key_to_hash;
- std::set<uint64_t> unique_hashes;
+ std::unordered_map<int, uint64_t> unique_key_to_hash;
+ std::unordered_set<uint64_t> unique_hashes;
for (int i = 0; i < num_rows; ++i) {
- std::map<int, uint64_t>::iterator iter =
unique_key_to_hash.find(row_ids[i]);
- if (iter == unique_key_to_hash.end()) {
- unique_key_to_hash.insert(std::make_pair(row_ids[i],
hashes_scalar64[i]));
- } else {
- ASSERT_EQ(iter->second, hashes_scalar64[i]);
- }
- if (unique_hashes.find(hashes_scalar64[i]) == unique_hashes.end()) {
- unique_hashes.insert(hashes_scalar64[i]);
+ auto [it, inserted] =
+ unique_key_to_hash.try_emplace(row_ids[i], hashes_scalar64[i]);
+ if (!inserted) {
+ ASSERT_EQ(it->second, hashes_scalar64[i]);
}
+ unique_hashes.insert(hashes_scalar64[i]);
}
float percent_hash_collisions =
100.0f * static_cast<float>(num_unique - unique_hashes.size()) /
diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc
index fd5c404a07..71ca56c91a 100644
--- a/cpp/src/arrow/compute/key_map.cc
+++ b/cpp/src/arrow/compute/key_map.cc
@@ -133,7 +133,7 @@ void SwissTable::extract_group_ids(const int num_keys,
const uint16_t* optional_
// Optimistically use simplified lookup involving only a start block to find
// a single group id candidate for every input.
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
int num_group_id_bytes = num_group_id_bits / 8;
if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) &&
!optional_selection) {
num_processed = extract_group_ids_avx2(num_keys, hashes, local_slots,
out_group_ids,
@@ -301,7 +301,7 @@ void SwissTable::early_filter(const int num_keys, const
uint32_t* hashes,
// Optimistically use simplified lookup involving only a start block to find
// a single group id candidate for every input.
int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
if (log_blocks_ <= 4) {
num_processed = early_filter_imp_avx2_x32(num_keys, hashes,
out_match_bitvector,
diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h
index 7ab48470f2..95fb3be274 100644
--- a/cpp/src/arrow/compute/key_map.h
+++ b/cpp/src/arrow/compute/key_map.h
@@ -163,7 +163,7 @@ class ARROW_EXPORT SwissTable {
//
void early_filter_imp(const int num_keys, const uint32_t* hashes,
uint8_t* out_match_bitvector, uint8_t*
out_local_slots) const;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
int early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* hashes,
uint8_t* out_match_bitvector,
uint8_t* out_local_slots) const;
diff --git a/cpp/src/arrow/compute/key_map_avx2.cc
b/cpp/src/arrow/compute/key_map_avx2.cc
index eb318ff188..7315535110 100644
--- a/cpp/src/arrow/compute/key_map_avx2.cc
+++ b/cpp/src/arrow/compute/key_map_avx2.cc
@@ -23,8 +23,6 @@
namespace arrow {
namespace compute {
-#if defined(ARROW_HAVE_AVX2)
-
// This is more or less translation of equivalent scalar code, adjusted for a
// different instruction set (e.g. missing leading zero count instruction).
//
@@ -412,7 +410,5 @@ int SwissTable::extract_group_ids_avx2(const int num_keys,
const uint32_t* hashe
return num_keys - (num_keys % unroll);
}
-#endif
-
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/row/compare_internal.cc
b/cpp/src/arrow/compute/row/compare_internal.cc
index 39ac33932b..7c402e7a23 100644
--- a/cpp/src/arrow/compute/row/compare_internal.cc
+++ b/cpp/src/arrow/compute/row/compare_internal.cc
@@ -42,7 +42,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col,
uint32_t num_rows_to_com
return;
}
uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (ctx->has_avx2()) {
num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col,
num_rows_to_compare,
sel_left_maybe_null,
left_to_right_map,
@@ -130,7 +130,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t
offset_within_row,
const RowTableImpl& rows,
uint8_t* match_bytevector) {
uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (ctx->has_avx2()) {
num_processed = CompareBinaryColumnToRow_avx2(
use_selection, offset_within_row, num_rows_to_compare,
sel_left_maybe_null,
@@ -297,7 +297,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t
id_varbinary_col,
const RowTableImpl& rows,
uint8_t* match_bytevector) {
uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (ctx->has_avx2()) {
num_processed = CompareVarBinaryColumnToRow_avx2(
use_selection, is_first_varbinary_col, id_varbinary_col,
num_rows_to_compare,
@@ -313,7 +313,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t
id_varbinary_col,
void KeyCompare::AndByteVectors(LightContext* ctx, uint32_t num_elements,
uint8_t* bytevector_A, const uint8_t*
bytevector_B) {
uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (ctx->has_avx2()) {
num_processed = AndByteVectors_avx2(num_elements, bytevector_A,
bytevector_B);
}
diff --git a/cpp/src/arrow/compute/row/compare_internal.h
b/cpp/src/arrow/compute/row/compare_internal.h
index 638b8c2ec7..db953fbe11 100644
--- a/cpp/src/arrow/compute/row/compare_internal.h
+++ b/cpp/src/arrow/compute/row/compare_internal.h
@@ -86,7 +86,7 @@ class ARROW_EXPORT KeyCompare {
static void AndByteVectors(LightContext* ctx, uint32_t num_elements,
uint8_t* bytevector_A, const uint8_t*
bytevector_B);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
template <bool use_selection>
static uint32_t NullUpdateColumnToRowImp_avx2(
diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc
b/cpp/src/arrow/compute/row/compare_internal_avx2.cc
index 95f37ab617..ff407c51b8 100644
--- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc
+++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc
@@ -24,8 +24,6 @@
namespace arrow {
namespace compute {
-#if defined(ARROW_HAVE_AVX2)
-
inline __m256i set_first_n_bytes_avx2(int n) {
constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL;
constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL;
@@ -670,7 +668,5 @@ uint32_t KeyCompare::CompareVarBinaryColumnToRow_avx2(
return num_rows_to_compare;
}
-#endif
-
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/row/encode_internal.cc
b/cpp/src/arrow/compute/row/encode_internal.cc
index 3a6a85b027..01d552ef82 100644
--- a/cpp/src/arrow/compute/row/encode_internal.cc
+++ b/cpp/src/arrow/compute/row/encode_internal.cc
@@ -455,7 +455,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t
num_rows,
bool is_row_fixed_length = rows.metadata().is_fixed_length;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (ctx->has_avx2()) {
DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows,
offset_within_row, rows,
col);
@@ -466,7 +466,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t
num_rows,
} else {
DecodeImp<false>(start_row, num_rows, offset_within_row, rows, col);
}
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
}
#endif
@@ -524,7 +524,7 @@ void EncoderBinaryPair::Decode(uint32_t start_row, uint32_t
num_rows,
bool is_row_fixed_length = rows.metadata().is_fixed_length;
uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (ctx->has_avx2() && col_width1 == col_width2) {
num_processed =
DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows,
@@ -772,7 +772,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t
num_rows,
KeyColumnArray* col, LightContext* ctx) {
// Output column varbinary buffer needs an extra 32B
// at the end in avx2 version and 8B otherwise.
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (ctx->has_avx2()) {
DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col);
} else {
@@ -782,7 +782,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t
num_rows,
} else {
DecodeImp<false>(start_row, num_rows, varbinary_col_id, rows, col);
}
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
}
#endif
}
diff --git a/cpp/src/arrow/compute/row/encode_internal.h
b/cpp/src/arrow/compute/row/encode_internal.h
index b83767b694..6091fb6698 100644
--- a/cpp/src/arrow/compute/row/encode_internal.h
+++ b/cpp/src/arrow/compute/row/encode_internal.h
@@ -187,7 +187,7 @@ class EncoderBinary {
template <bool is_row_fixed_length>
static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t
offset_within_row,
const RowTableImpl& rows, KeyColumnArray* col);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row,
uint32_t num_rows, uint32_t offset_within_row,
const RowTableImpl& rows, KeyColumnArray* col);
@@ -213,7 +213,7 @@ class EncoderBinaryPair {
static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row,
uint32_t num_rows,
uint32_t offset_within_row, const RowTableImpl& rows,
KeyColumnArray* col1, KeyColumnArray* col2);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t
col_width,
uint32_t start_row, uint32_t num_rows,
uint32_t offset_within_row, const
RowTableImpl& rows,
@@ -300,7 +300,7 @@ class EncoderVarBinary {
template <bool first_varbinary_col>
static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t
varbinary_col_id,
const RowTableImpl& rows, KeyColumnArray* col);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows,
uint32_t varbinary_col_id, const RowTableImpl&
rows,
KeyColumnArray* col);
diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc
b/cpp/src/arrow/compute/row/encode_internal_avx2.cc
index 02ba310bde..50969c7bd6 100644
--- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc
+++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc
@@ -22,8 +22,6 @@
namespace arrow {
namespace compute {
-#if defined(ARROW_HAVE_AVX2)
-
void EncoderBinary::DecodeHelper_avx2(bool is_row_fixed_length, uint32_t
start_row,
uint32_t num_rows, uint32_t
offset_within_row,
const RowTableImpl& rows,
KeyColumnArray* col) {
@@ -230,7 +228,5 @@ void EncoderVarBinary::DecodeImp_avx2(uint32_t start_row,
uint32_t num_rows,
});
}
-#endif
-
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc
index f69f60a5af..faf3e0c87e 100644
--- a/cpp/src/arrow/compute/util.cc
+++ b/cpp/src/arrow/compute/util.cc
@@ -118,7 +118,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const
int num_bits,
// 64 bits at a time
constexpr int unroll = 64;
int tail = num_bits % unroll;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
if (filter_input_indexes) {
avx2::bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits,
input_indexes,
@@ -141,7 +141,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const
int num_bits,
bits_to_indexes_helper(word, i * 64 + base_index, num_indexes,
indexes);
}
}
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
}
#endif
// Optionally process the last partial word with masking out bits outside range
@@ -253,7 +253,7 @@ void bits_to_bytes(int64_t hardware_flags, const int
num_bits, const uint8_t* bi
}
int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
// The function call below processes whole 32 bit chunks together.
num_processed = num_bits - (num_bits % 32);
@@ -309,7 +309,7 @@ void bytes_to_bits(int64_t hardware_flags, const int
num_bits, const uint8_t* by
}
int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
// The function call below processes whole 32 bit chunks together.
num_processed = num_bits - (num_bits % 32);
@@ -339,7 +339,7 @@ void bytes_to_bits(int64_t hardware_flags, const int
num_bits, const uint8_t* by
bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
uint32_t num_bytes) {
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
return avx2::are_all_bytes_zero_avx2(bytes, num_bytes);
}
diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h
index 489139eab8..730e59f346 100644
--- a/cpp/src/arrow/compute/util.h
+++ b/cpp/src/arrow/compute/util.h
@@ -168,7 +168,7 @@ ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags,
const int num_bits,
ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t*
bytes,
uint32_t num_bytes);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
namespace avx2 {
ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int
num_bits,
diff --git a/cpp/src/arrow/compute/util_avx2.cc
b/cpp/src/arrow/compute/util_avx2.cc
index 89ec6aa97a..0191ab06f9 100644
--- a/cpp/src/arrow/compute/util_avx2.cc
+++ b/cpp/src/arrow/compute/util_avx2.cc
@@ -21,9 +21,7 @@
#include "arrow/util/bit_util.h"
#include "arrow/util/logging.h"
-#if defined(ARROW_HAVE_AVX2)
-
-namespace arrow::util::avx2 {
+namespace arrow::util::bit_util::avx2 {
template <int bit_to_search>
void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, int*
num_indexes,
@@ -211,6 +209,4 @@ bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t
num_bytes) {
return result_or32 == 0;
}
-} // namespace arrow::util::avx2
-
-#endif // ARROW_HAVE_AVX2
+} // namespace arrow::util::bit_util::avx2
diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc
index b598544807..e8a782575e 100644
--- a/cpp/src/arrow/testing/util.cc
+++ b/cpp/src/arrow/testing/util.cc
@@ -43,6 +43,7 @@
#include "arrow/table.h"
#include "arrow/testing/random.h"
#include "arrow/type.h"
+#include "arrow/util/cpu_info.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/pcg_random.h"
@@ -211,4 +212,18 @@ const std::vector<std::shared_ptr<DataType>>&
all_dictionary_index_types() {
return types;
}
+std::vector<int64_t> GetSupportedHardwareFlags(
+ const std::vector<int64_t>& candidate_flags) {
+ std::vector<int64_t> hardware_flags;
+ // Always test fallback codepaths
+ hardware_flags.push_back(0);
+ for (const int64_t candidate_flag : candidate_flags) {
+ if (candidate_flag != 0 &&
+ internal::CpuInfo::GetInstance()->IsSupported(candidate_flag)) {
+ hardware_flags.push_back(candidate_flag);
+ }
+ }
+ return hardware_flags;
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h
index 4f4b03438f..b4b2785a36 100644
--- a/cpp/src/arrow/testing/util.h
+++ b/cpp/src/arrow/testing/util.h
@@ -131,4 +131,10 @@ ARROW_TESTING_EXPORT std::string GetListenAddress();
ARROW_TESTING_EXPORT
const std::vector<std::shared_ptr<DataType>>& all_dictionary_index_types();
+// Get a list of supported hardware flags from the given candidates.
+// The result will always contain 0, meaning no optional CPU feature enabled at all.
+ARROW_TESTING_EXPORT
+std::vector<int64_t> GetSupportedHardwareFlags(
+ const std::vector<int64_t>& candidate_flags);
+
} // namespace arrow
diff --git a/cpp/src/arrow/util/byte_stream_split.h
b/cpp/src/arrow/util/byte_stream_split.h
index 28dcce52bb..d428df0659 100644
--- a/cpp/src/arrow/util/byte_stream_split.h
+++ b/cpp/src/arrow/util/byte_stream_split.h
@@ -39,9 +39,9 @@ void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t
num_values, int64_t
constexpr size_t kNumStreams = sizeof(T);
static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of
streams.");
constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+ constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
const int64_t num_blocks = size / kBlockSize;
uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
@@ -92,11 +92,12 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values,
const size_t num_value
uint8_t* output_buffer_raw) {
constexpr size_t kNumStreams = sizeof(T);
static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of
streams.");
+ constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
+
__m128i stage[3][kNumStreams];
__m128i final_result[kNumStreams];
const size_t size = num_values * sizeof(T);
- constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
const size_t num_blocks = size / kBlockSize;
const __m128i* raw_values_sse = reinterpret_cast<const __m128i*>(raw_values);
__m128i* output_buffer_streams[kNumStreams];
@@ -143,7 +144,7 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values,
const size_t num_value
_mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2
+ 1]);
}
}
- if (kNumStreams == 8U) {
+ if constexpr (kNumStreams == 8U) {
// This is the path for double.
__m128i tmp[8];
for (size_t i = 0; i < 4; ++i) {
@@ -181,9 +182,9 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t
num_values, int64_t
constexpr size_t kNumStreams = sizeof(T);
static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of
streams.");
constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+ constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
if (size < kBlockSize) // Back to SSE for small size
return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
const int64_t num_blocks = size / kBlockSize;
@@ -220,7 +221,7 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t
num_values, int64_t
}
}
- if (kNumStreams == 8U) {
+ if constexpr (kNumStreams == 8U) {
// path for double, 128i index:
// {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B},
// {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F},
@@ -266,11 +267,12 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values,
const size_t num_value
uint8_t* output_buffer_raw) {
constexpr size_t kNumStreams = sizeof(T);
static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of
streams.");
- if (kNumStreams == 8U) // Back to SSE, currently no path for double.
+ constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
+
+ if constexpr (kNumStreams == 8U) // Back to SSE, currently no path for double.
return ByteStreamSplitEncodeSse2<T>(raw_values, num_values,
output_buffer_raw);
const size_t size = num_values * sizeof(T);
- constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
if (size < kBlockSize) // Back to SSE for small size
return ByteStreamSplitEncodeSse2<T>(raw_values, num_values,
output_buffer_raw);
const size_t num_blocks = size / kBlockSize;
@@ -339,9 +341,9 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data,
int64_t num_values, int64_
constexpr size_t kNumStreams = sizeof(T);
static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of
streams.");
constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+ constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
if (size < kBlockSize) // Back to AVX2 for small size
return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
const int64_t num_blocks = size / kBlockSize;
@@ -379,7 +381,7 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data,
int64_t num_values, int64_
}
}
- if (kNumStreams == 8U) {
+ if constexpr (kNumStreams == 8U) {
// path for double, 128i index:
// {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
// {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
@@ -442,8 +444,10 @@ void ByteStreamSplitEncodeAvx512(const uint8_t*
raw_values, const size_t num_val
uint8_t* output_buffer_raw) {
constexpr size_t kNumStreams = sizeof(T);
static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of
streams.");
- const size_t size = num_values * sizeof(T);
constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams;
+
+ const size_t size = num_values * sizeof(T);
+
if (size < kBlockSize) // Back to AVX2 for small size
return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values,
output_buffer_raw);
@@ -469,7 +473,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values,
const size_t num_val
__m512i unpack[KNumUnpack + 1][kNumStreams];
__m512i permutex[kNumStreams];
__m512i permutex_mask;
- if (kNumStreams == 8U) {
+ if constexpr (kNumStreams == 8U) {
// use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version.
permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016,
0x000E0006,
0x001D0015, 0x000D0005, 0x001C0014,
0x000C0004,
@@ -494,7 +498,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values,
const size_t num_val
}
}
- if (kNumStreams == 8U) {
+ if constexpr (kNumStreams == 8U) {
// path for double
// 1. unpack to epi16 block
// 2. permutexvar_epi16 to 128i block
diff --git a/docker-compose.yml b/docker-compose.yml
index fbb879b2bc..8727aded2c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -72,6 +72,10 @@ x-sccache: &sccache
SCCACHE_REGION:
SCCACHE_S3_KEY_PREFIX: ${SCCACHE_S3_KEY_PREFIX:-sccache}
+x-cpp: &cpp
+ ARROW_RUNTIME_SIMD_LEVEL:
+ ARROW_SIMD_LEVEL:
+
# CPU/memory limit presets to pass to Docker.
#
# Usage: archery docker run --resource-limit=github <image>
@@ -227,7 +231,7 @@ services:
ulimits: &ulimits
core: ${ULIMIT_CORE}
environment:
- <<: [*common, *ccache]
+ <<: [*common, *ccache, *cpp]
ARROW_ENABLE_TIMING_TESTS: # inherit
ARROW_MIMALLOC: "ON"
volumes: &alpine-linux-volumes
@@ -278,7 +282,7 @@ services:
shm_size: *shm-size
ulimits: *ulimits
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_BUILD_BENCHMARKS: "ON"
ARROW_BUILD_EXAMPLES: "ON"
ARROW_ENABLE_TIMING_TESTS: # inherit
@@ -313,7 +317,7 @@ services:
arch: ${ARCH}
shm_size: *shm-size
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
# Shrink test runtime by enabling minimal optimizations
ARROW_C_FLAGS_DEBUG: "-g1 -Og"
ARROW_CXX_FLAGS_DEBUG: "-g1 -Og"
@@ -349,7 +353,7 @@ services:
shm_size: *shm-size
ulimits: *ulimits
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_ENABLE_TIMING_TESTS: # inherit
ARROW_MIMALLOC: "ON"
volumes: &debian-volumes
@@ -390,7 +394,7 @@ services:
- apparmor:unconfined
ulimits: *ulimits
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_ENABLE_TIMING_TESTS: # inherit
ARROW_MIMALLOC: "ON"
volumes: &ubuntu-volumes
@@ -426,7 +430,7 @@ services:
- apparmor:unconfined
ulimits: *ulimits
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_HOME: /arrow
ARROW_DEPENDENCY_SOURCE: BUNDLED
LIBARROW_MINIMAL: "false"
@@ -448,7 +452,7 @@ services:
volumes:
- .:/arrow:delegated
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_DEPENDENCY_SOURCE: BUNDLED
ARROW_HOME: /arrow
LIBARROW_MINIMAL: "false"
@@ -470,7 +474,7 @@ services:
shm_size: *shm-size
ulimits: *ulimits
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_DEPENDENCY_SOURCE: BUNDLED
CMAKE_GENERATOR: "Unix Makefiles"
volumes: *ubuntu-volumes
@@ -491,7 +495,7 @@ services:
shm_size: *shm-size
ulimits: *ulimits
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_BUILD_UTILITIES: "OFF"
ARROW_COMPUTE: "OFF"
ARROW_CSV: "OFF"
@@ -538,7 +542,7 @@ services:
shm_size: *shm-size
ulimits: *ulimits
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_BUILD_UTILITIES: "OFF"
ARROW_COMPUTE: "OFF"
ARROW_CSV: "OFF"
@@ -588,7 +592,7 @@ services:
shm_size: *shm-size
volumes: *ubuntu-volumes
environment:
- <<: [*common, *ccache]
+ <<: [*common, *ccache, *cpp]
CC: clang-${CLANG_TOOLS}
CXX: clang++-${CLANG_TOOLS}
# Avoid creating huge static libraries
@@ -630,7 +634,7 @@ services:
shm_size: *shm-size
volumes: *ubuntu-volumes
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
CC: clang-${CLANG_TOOLS}
CXX: clang++-${CLANG_TOOLS}
ARROW_BUILD_STATIC: "OFF"
@@ -662,7 +666,7 @@ services:
shm_size: *shm-size
ulimits: *ulimits
environment:
- <<: [*common, *ccache, *sccache]
+ <<: [*common, *ccache, *sccache, *cpp]
ARROW_ENABLE_TIMING_TESTS: # inherit
ARROW_MIMALLOC: "ON"
Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15