[arrow] branch main updated: GH-35116: [CI][C++] Enable compile-time AVX2 on some CI platforms (#36662)

apitrou Wed, 19 Jul 2023 02:20:04 -0700

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 366e8083a2 GH-35116: [CI][C++] Enable compile-time AVX2 on some CI platforms (#36662)
366e8083a2 is described below

commit 366e8083a2bd6d24ad371548699ef936fb7bb468
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jul 19 11:19:49 2023 +0200

    GH-35116: [CI][C++] Enable compile-time AVX2 on some CI platforms (#36662)
    
    AVX2 became mainline on Intel and AMD server CPUs around 2015, so it's unlikely to be unavailable on current cloud platforms:
    https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#CPUs_with_AVX2
    
    Enable it at least on one Windows and one Linux CI platform.
    
    x86 macOS is a legacy platform, so less interesting to exercise there (and I'm not sure the old CPUs in x86 Macs actually support AVX2).
    
    Also, fix the buggy AVX2 activation logic in Acero and avoid force-testing AVX2 on incompatible systems.
    
    * Closes: #35116
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 .github/workflows/cpp.yml                          | 14 +++-
 ci/scripts/cpp_build.sh                            |  1 +
 cpp/cmake_modules/SetupCxxFlags.cmake              | 49 ++++++------
 cpp/src/arrow/CMakeLists.txt                       | 22 +++---
 cpp/src/arrow/acero/CMakeLists.txt                 |  6 +-
 cpp/src/arrow/acero/bloom_filter.cc                |  8 +-
 cpp/src/arrow/acero/bloom_filter.h                 |  5 +-
 cpp/src/arrow/acero/bloom_filter_avx2.cc           |  5 +-
 cpp/src/arrow/acero/bloom_filter_test.cc           | 85 +++++++++-----------
 cpp/src/arrow/acero/swiss_join_avx2.cc             |  4 -
 cpp/src/arrow/acero/swiss_join_internal.h          |  2 +-
 cpp/src/arrow/acero/test_util_internal.cc          |  9 +++
 cpp/src/arrow/acero/test_util_internal.h           | 12 +--
 cpp/src/arrow/compute/kernels/CMakeLists.txt       | 54 ++++++++++---
 cpp/src/arrow/compute/key_hash.cc                  |  6 +-
 cpp/src/arrow/compute/key_hash.h                   |  4 +-
 cpp/src/arrow/compute/key_hash_avx2.cc             |  4 -
 cpp/src/arrow/compute/key_hash_test.cc             | 92 +++++++++++-----------
 cpp/src/arrow/compute/key_map.cc                   |  4 +-
 cpp/src/arrow/compute/key_map.h                    |  2 +-
 cpp/src/arrow/compute/key_map_avx2.cc              |  4 -
 cpp/src/arrow/compute/row/compare_internal.cc      |  8 +-
 cpp/src/arrow/compute/row/compare_internal.h       |  2 +-
 cpp/src/arrow/compute/row/compare_internal_avx2.cc |  4 -
 cpp/src/arrow/compute/row/encode_internal.cc       | 10 +--
 cpp/src/arrow/compute/row/encode_internal.h        |  6 +-
 cpp/src/arrow/compute/row/encode_internal_avx2.cc  |  4 -
 cpp/src/arrow/compute/util.cc                      | 10 +--
 cpp/src/arrow/compute/util.h                       |  2 +-
 cpp/src/arrow/compute/util_avx2.cc                 |  8 +-
 cpp/src/arrow/testing/util.cc                      | 15 ++++
 cpp/src/arrow/testing/util.h                       |  6 ++
 cpp/src/arrow/util/byte_stream_split.h             | 28 ++++---
 docker-compose.yml                                 | 30 ++++---
 34 files changed, 289 insertions(+), 236 deletions(-)

diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 67435566ce..63a16c8c11 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -67,7 +67,8 @@ jobs:
             image: conda-cpp
             llvm: "14"
             runs-on: ubuntu-latest
-            title: AMD64 Conda C++
+            simd-level: AVX2
+            title: AMD64 Conda C++ AVX2
             ubuntu: "22.04"
           - arch: amd64
             clang-tools: "14"
@@ -85,6 +86,7 @@ jobs:
             ubuntu: "20.04"
     env:
       ARCH: ${{ matrix.arch }}
+      ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
       CLANG_TOOLS: ${{ matrix.clang-tools }}
       LLVM: ${{ matrix.llvm }}
       UBUNTU: ${{ matrix.ubuntu }}
@@ -175,6 +177,10 @@ jobs:
       ARROW_WITH_ZSTD: ON
       GTest_SOURCE: BUNDLED
     steps:
+      - name: CPU Info
+        run: |
+          sysctl -a | grep cpu
+          sysctl -a | grep "hw.optional"
       - name: Checkout Arrow
         uses: actions/checkout@v3
         with:
@@ -220,7 +226,7 @@ jobs:
           ci/scripts/cpp_test.sh $(pwd) $(pwd)/build
 
   windows:
-    name: AMD64 ${{ matrix.name }} C++17
+    name: ${{ matrix.title }}
     runs-on: ${{ matrix.os }}
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
     timeout-minutes: 60
@@ -231,7 +237,8 @@ jobs:
           - windows-2019
         include:
           - os: windows-2019
-            name: Windows 2019
+            simd-level: AVX2
+            title: AMD64 Windows 2019 C++17 AVX2
     env:
       ARROW_BOOST_USE_SHARED: OFF
       ARROW_BUILD_BENCHMARKS: ON
@@ -246,6 +253,7 @@ jobs:
       ARROW_MIMALLOC: ON
       ARROW_ORC: ON
       ARROW_PARQUET: ON
+      ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
       ARROW_USE_GLOG: OFF
       ARROW_VERBOSE_THIRDPARTY_BUILD: OFF
       ARROW_WITH_BROTLI: OFF
diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh
index f0f893c419..e53b3fa460 100755
--- a/ci/scripts/cpp_build.sh
+++ b/ci/scripts/cpp_build.sh
@@ -126,6 +126,7 @@ cmake \
   -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \
   -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \
   -DARROW_S3=${ARROW_S3:-OFF} \
+  -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \
   -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \
   -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-ON} \
   -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake 
b/cpp/cmake_modules/SetupCxxFlags.cmake
index 076c2e7450..6b47fcb717 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -62,29 +62,32 @@ if(ARROW_CPU_FLAG STREQUAL "x86")
         "${ARROW_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq 
-mavx512bw")
     check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2)
   endif()
-  check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2)
-  if(MINGW)
-    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
-    message(STATUS "Disable AVX512 support on MINGW for now")
-  else()
-    # Check for AVX512 support in the compiler.
-    set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS})
-    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}")
-    check_cxx_source_compiles("
-      #ifdef _MSC_VER
-      #include <intrin.h>
-      #else
-      #include <immintrin.h>
-      #endif
-
-      int main() {
-        __m512i mask = _mm512_set1_epi32(0x1);
-        char out[32];
-        _mm512_storeu_si512(out, mask);
-        return 0;
-      }"
-                              CXX_SUPPORTS_AVX512)
-    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS})
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+    # Check for AVX extensions on 64-bit systems only, as 32-bit support seems 
iffy
+    check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2)
+    if(MINGW)
+      # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
+      message(STATUS "Disable AVX512 support on MINGW for now")
+    else()
+      # Check for AVX512 support in the compiler.
+      set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}")
+      check_cxx_source_compiles("
+        #ifdef _MSC_VER
+        #include <intrin.h>
+        #else
+        #include <immintrin.h>
+        #endif
+
+        int main() {
+          __m512i mask = _mm512_set1_epi32(0x1);
+          char out[32];
+          _mm512_storeu_si512(out, mask);
+          return 0;
+        }"
+                                CXX_SUPPORTS_AVX512)
+      set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS})
+    endif()
   endif()
   # Runtime SIMD level it can get from compiler and ARROW_RUNTIME_SIMD_LEVEL
   if(CXX_SUPPORTS_SSE4_2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index fccff6c8cf..a398e790de 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -119,7 +119,7 @@ function(ADD_ARROW_BENCHMARK REL_TEST_NAME)
                 ${ARG_UNPARSED_ARGUMENTS})
 endfunction()
 
-macro(append_avx2_src SRC)
+macro(append_runtime_avx2_src SRC)
   if(ARROW_HAVE_RUNTIME_AVX2)
     list(APPEND ARROW_SRCS ${SRC})
     set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
@@ -127,7 +127,7 @@ macro(append_avx2_src SRC)
   endif()
 endmacro()
 
-macro(append_avx512_src SRC)
+macro(append_runtime_avx512_src SRC)
   if(ARROW_HAVE_RUNTIME_AVX512)
     list(APPEND ARROW_SRCS ${SRC})
     set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
@@ -254,8 +254,8 @@ if(ARROW_JEMALLOC)
                               PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
 endif()
 
-append_avx2_src(util/bpacking_avx2.cc)
-append_avx512_src(util/bpacking_avx512.cc)
+append_runtime_avx2_src(util/bpacking_avx2.cc)
+append_runtime_avx512_src(util/bpacking_avx512.cc)
 
 if(ARROW_HAVE_NEON)
   list(APPEND ARROW_SRCS util/bpacking_neon.cc)
@@ -425,11 +425,11 @@ list(APPEND
      compute/row/row_internal.cc
      compute/util.cc)
 
-append_avx2_src(compute/key_hash_avx2.cc)
-append_avx2_src(compute/key_map_avx2.cc)
-append_avx2_src(compute/row/compare_internal_avx2.cc)
-append_avx2_src(compute/row/encode_internal_avx2.cc)
-append_avx2_src(compute/util_avx2.cc)
+append_runtime_avx2_src(compute/key_hash_avx2.cc)
+append_runtime_avx2_src(compute/key_map_avx2.cc)
+append_runtime_avx2_src(compute/row/compare_internal_avx2.cc)
+append_runtime_avx2_src(compute/row/encode_internal_avx2.cc)
+append_runtime_avx2_src(compute/util_avx2.cc)
 
 if(ARROW_COMPUTE)
   # Include the remaining kernels
@@ -464,8 +464,8 @@ if(ARROW_COMPUTE)
        compute/kernels/vector_select_k.cc
        compute/kernels/vector_sort.cc)
 
-  append_avx2_src(compute/kernels/aggregate_basic_avx2.cc)
-  append_avx512_src(compute/kernels/aggregate_basic_avx512.cc)
+  append_runtime_avx2_src(compute/kernels/aggregate_basic_avx2.cc)
+  append_runtime_avx512_src(compute/kernels/aggregate_basic_avx512.cc)
 endif()
 
 if(ARROW_FILESYSTEM)
diff --git a/cpp/src/arrow/acero/CMakeLists.txt 
b/cpp/src/arrow/acero/CMakeLists.txt
index 287884432b..c2c91db58d 100644
--- a/cpp/src/arrow/acero/CMakeLists.txt
+++ b/cpp/src/arrow/acero/CMakeLists.txt
@@ -19,7 +19,7 @@ add_custom_target(arrow_acero)
 
 arrow_install_all_headers("arrow/acero")
 
-macro(append_acero_avx2_src SRC)
+macro(append_acero_runtime_avx2_src SRC)
   if(ARROW_HAVE_RUNTIME_AVX2)
     list(APPEND ARROW_ACERO_SRCS ${SRC})
     set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
@@ -56,8 +56,8 @@ set(ARROW_ACERO_SRCS
     union_node.cc
     util.cc)
 
-append_acero_avx2_src(bloom_filter_avx2.cc)
-append_acero_avx2_src(swiss_join_avx2.cc)
+append_acero_runtime_avx2_src(bloom_filter_avx2.cc)
+append_acero_runtime_avx2_src(swiss_join_avx2.cc)
 
 set(ARROW_ACERO_SHARED_LINK_LIBS)
 set(ARROW_ACERO_STATIC_LINK_LIBS)
diff --git a/cpp/src/arrow/acero/bloom_filter.cc 
b/cpp/src/arrow/acero/bloom_filter.cc
index ad5e66ded0..b9855ee506 100644
--- a/cpp/src/arrow/acero/bloom_filter.cc
+++ b/cpp/src/arrow/acero/bloom_filter.cc
@@ -123,7 +123,7 @@ void BlockedBloomFilter::InsertImp(int64_t num_rows, const 
T* hashes) {
 void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows,
                                 const uint32_t* hashes) {
   int64_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     num_processed = Insert_avx2(num_rows, hashes);
   }
@@ -134,7 +134,7 @@ void BlockedBloomFilter::Insert(int64_t hardware_flags, 
int64_t num_rows,
 void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows,
                                 const uint64_t* hashes) {
   int64_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     num_processed = Insert_avx2(num_rows, hashes);
   }
@@ -181,7 +181,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags, 
int64_t num_rows,
                               bool enable_prefetch) const {
   int64_t num_processed = 0;
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (!(enable_prefetch && UsePrefetch()) &&
       (hardware_flags & arrow::internal::CpuInfo::AVX2)) {
     num_processed = Find_avx2(num_rows, hashes, result_bit_vector);
@@ -202,7 +202,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags, 
int64_t num_rows,
                               bool enable_prefetch) const {
   int64_t num_processed = 0;
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (!(enable_prefetch && UsePrefetch()) &&
       (hardware_flags & arrow::internal::CpuInfo::AVX2)) {
     num_processed = Find_avx2(num_rows, hashes, result_bit_vector);
diff --git a/cpp/src/arrow/acero/bloom_filter.h 
b/cpp/src/arrow/acero/bloom_filter.h
index b8f7f8cd25..50d07bfd94 100644
--- a/cpp/src/arrow/acero/bloom_filter.h
+++ b/cpp/src/arrow/acero/bloom_filter.h
@@ -17,13 +17,14 @@
 
 #pragma once
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
 #include <immintrin.h>
 #endif
 
 #include <atomic>
 #include <cstdint>
 #include <memory>
+
 #include "arrow/acero/partition_util.h"
 #include "arrow/acero/util.h"
 #include "arrow/memory_pool.h"
@@ -203,7 +204,7 @@ class ARROW_ACERO_EXPORT BlockedBloomFilter {
 
   void SingleFold(int num_folds);
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   inline __m256i mask_avx2(__m256i hash) const;
   inline __m256i block_id_avx2(__m256i hash) const;
   int64_t Insert_avx2(int64_t num_rows, const uint32_t* hashes);
diff --git a/cpp/src/arrow/acero/bloom_filter_avx2.cc 
b/cpp/src/arrow/acero/bloom_filter_avx2.cc
index b6c281276d..5816bb4fc0 100644
--- a/cpp/src/arrow/acero/bloom_filter_avx2.cc
+++ b/cpp/src/arrow/acero/bloom_filter_avx2.cc
@@ -16,14 +16,13 @@
 // under the License.
 
 #include <immintrin.h>
+
 #include "arrow/acero/bloom_filter.h"
 #include "arrow/util/bit_util.h"
 
 namespace arrow {
 namespace acero {
 
-#if defined(ARROW_HAVE_AVX2)
-
 inline __m256i BlockedBloomFilter::mask_avx2(__m256i hash) const {
   // AVX2 translation of mask() method
   //
@@ -132,7 +131,5 @@ int64_t BlockedBloomFilter::Insert_avx2(int64_t num_rows, 
const uint64_t* hashes
   return InsertImp_avx2(num_rows, hashes);
 }
 
-#endif
-
 }  // namespace acero
 }  // namespace arrow
diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc 
b/cpp/src/arrow/acero/bloom_filter_test.cc
index de433ac68c..95375e277e 100644
--- a/cpp/src/arrow/acero/bloom_filter_test.cc
+++ b/cpp/src/arrow/acero/bloom_filter_test.cc
@@ -22,13 +22,13 @@
 #include <condition_variable>
 #include <thread>
 #include <unordered_set>
+
 #include "arrow/acero/bloom_filter.h"
 #include "arrow/acero/task_util.h"
 #include "arrow/acero/test_util_internal.h"
 #include "arrow/acero/util.h"
 #include "arrow/compute/key_hash.h"
 #include "arrow/util/bitmap_ops.h"
-#include "arrow/util/cpu_info.h"
 
 namespace arrow {
 
@@ -171,9 +171,7 @@ void TestBloomSmallHashHelper(int64_t num_input_hashes, 
const T* input_hashes,
 // Output FPR and build and probe cost.
 //
 void TestBloomSmall(BloomFilterBuildStrategy strategy, int64_t num_build,
-                    int num_build_copies, bool use_simd, bool enable_prefetch) 
{
-  int64_t hardware_flags = use_simd ? ::arrow::internal::CpuInfo::AVX2 : 0;
-
+                    int num_build_copies, int64_t hardware_flags, bool 
enable_prefetch) {
   // Generate input keys
   //
   int64_t num_probe = 4 * num_build;
@@ -324,10 +322,8 @@ void TestBloomLargeHashHelper(int64_t hardware_flags, 
int64_t block,
 // Test with larger size Bloom filters (use large prime with arithmetic
 // sequence modulo 2^64).
 //
-void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, bool 
use_simd,
-                    bool enable_prefetch) {
-  int64_t hardware_flags = use_simd ? ::arrow::internal::CpuInfo::AVX2 : 0;
-
+void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build,
+                    int64_t hardware_flags, bool enable_prefetch) {
   // Largest 63-bit prime
   constexpr uint64_t prime = 0x7FFFFFFFFFFFFFE7ULL;
 
@@ -458,42 +454,40 @@ TEST(BloomFilter, Basic) {
   num_build.push_back(1LL << log_large);
 #endif
 
-  constexpr int num_param_sets = 3;
-  struct {
-    bool use_avx2;
+  struct TestParam {
+    int64_t hardware_flags;
     bool enable_prefetch;
     bool insert_multiple_copies;
-  } params[num_param_sets];
-  for (int i = 0; i < num_param_sets; ++i) {
-    params[i].use_avx2 = (i == 1);
-    params[i].enable_prefetch = (i == 2);
-    params[i].insert_multiple_copies = (i == 3);
+  };
+  std::vector<TestParam> test_params;
+  for (const auto hardware_flags : HardwareFlagsForTesting()) {
+    test_params.push_back({hardware_flags, false, false});
   }
+  test_params.push_back({0, true, false});
+  test_params.push_back({0, false, true});
 
-  std::vector<BloomFilterBuildStrategy> strategy;
-  strategy.push_back(BloomFilterBuildStrategy::SINGLE_THREADED);
+  std::vector<BloomFilterBuildStrategy> strategies;
+  strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED);
 #ifndef ARROW_VALGRIND
-  strategy.push_back(BloomFilterBuildStrategy::PARALLEL);
+  strategies.push_back(BloomFilterBuildStrategy::PARALLEL);
 #endif
 
   static constexpr int64_t min_rows_for_large = 2 * 1024 * 1024;
 
-  for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) {
-    for (int iparam_set = 0; iparam_set < num_param_sets; ++iparam_set) {
-      ARROW_SCOPED_TRACE("%s ", params[iparam_set].use_avx2                 ? 
"AVX2"
-                                : params[iparam_set].enable_prefetch        ? 
"PREFETCH"
-                                : params[iparam_set].insert_multiple_copies ? 
"FOLDING"
-                                                                            : 
"REGULAR");
-      for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build) 
{
-        ARROW_SCOPED_TRACE("num_build ", 
static_cast<int>(num_build[inum_build]));
-        if (num_build[inum_build] >= min_rows_for_large) {
-          TestBloomLarge(strategy[istrategy], num_build[inum_build],
-                         params[iparam_set].use_avx2, 
params[iparam_set].enable_prefetch);
+  for (const auto& strategy : strategies) {
+    for (const auto& test_param : test_params) {
+      ARROW_SCOPED_TRACE("hardware_flags = ", test_param.hardware_flags,
+                         test_param.enable_prefetch ? " PREFETCH" : "",
+                         test_param.insert_multiple_copies ? " FOLDING" : 
"REGULAR");
+      for (const auto n : num_build) {
+        ARROW_SCOPED_TRACE("num_build ", n);
+        if (n >= min_rows_for_large) {
+          TestBloomLarge(strategy, n, test_param.hardware_flags,
+                         test_param.enable_prefetch);
 
         } else {
-          TestBloomSmall(strategy[istrategy], num_build[inum_build],
-                         params[iparam_set].insert_multiple_copies ? 8 : 1,
-                         params[iparam_set].use_avx2, 
params[iparam_set].enable_prefetch);
+          TestBloomSmall(strategy, n, test_param.insert_multiple_copies ? 8 : 
1,
+                         test_param.hardware_flags, 
test_param.enable_prefetch);
         }
       }
     }
@@ -506,19 +500,18 @@ TEST(BloomFilter, Scaling) {
   num_build.push_back(1000000);
   num_build.push_back(4000000);
 
-  std::vector<BloomFilterBuildStrategy> strategy;
-  strategy.push_back(BloomFilterBuildStrategy::PARALLEL);
-
-  for (bool use_avx2 : {false, true}) {
-    for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) {
-      for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build) 
{
-        ARROW_SCOPED_TRACE("num_build = ", 
static_cast<int>(num_build[inum_build]));
-        ARROW_SCOPED_TRACE("strategy = ",
-                           strategy[istrategy] == 
BloomFilterBuildStrategy::PARALLEL
-                               ? "PARALLEL"
-                               : "SINGLE_THREADED");
-        ARROW_SCOPED_TRACE("avx2 = ", use_avx2 ? "AVX2" : "SCALAR");
-        TestBloomLarge(strategy[istrategy], num_build[inum_build], use_avx2,
+  std::vector<BloomFilterBuildStrategy> strategies;
+  strategies.push_back(BloomFilterBuildStrategy::PARALLEL);
+
+  for (const auto hardware_flags : HardwareFlagsForTesting()) {
+    for (const auto& strategy : strategies) {
+      for (const auto n : num_build) {
+        ARROW_SCOPED_TRACE("num_build = ", n);
+        ARROW_SCOPED_TRACE("strategy = ", strategy == 
BloomFilterBuildStrategy::PARALLEL
+                                              ? "PARALLEL"
+                                              : "SINGLE_THREADED");
+        ARROW_SCOPED_TRACE("hardware_flags = ", hardware_flags);
+        TestBloomLarge(strategy, n, hardware_flags,
                        /*enable_prefetch=*/false);
       }
     }
diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc 
b/cpp/src/arrow/acero/swiss_join_avx2.cc
index d5c0b7817f..0888dd8938 100644
--- a/cpp/src/arrow/acero/swiss_join_avx2.cc
+++ b/cpp/src/arrow/acero/swiss_join_avx2.cc
@@ -23,8 +23,6 @@
 namespace arrow {
 namespace acero {
 
-#if defined(ARROW_HAVE_AVX2)
-
 template <class PROCESS_8_VALUES_FN>
 int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int 
num_rows,
                                  const uint32_t* row_ids,
@@ -191,7 +189,5 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& 
rows, int column_id,
   return num_rows - (num_rows % unroll);
 }
 
-#endif
-
 }  // namespace acero
 }  // namespace arrow
diff --git a/cpp/src/arrow/acero/swiss_join_internal.h 
b/cpp/src/arrow/acero/swiss_join_internal.h
index cd12b34a0c..88b80f06f5 100644
--- a/cpp/src/arrow/acero/swiss_join_internal.h
+++ b/cpp/src/arrow/acero/swiss_join_internal.h
@@ -80,7 +80,7 @@ class RowArrayAccessor {
                          const uint32_t* row_ids, PROCESS_VALUE_FN 
process_value_fn);
 
  private:
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   // This is equivalent to Visit method, but processing 8 rows at a time in a
   // loop.
   // Returns the number of processed rows, which may be less than requested (up
diff --git a/cpp/src/arrow/acero/test_util_internal.cc 
b/cpp/src/arrow/acero/test_util_internal.cc
index 2042650be6..f50ca92238 100644
--- a/cpp/src/arrow/acero/test_util_internal.cc
+++ b/cpp/src/arrow/acero/test_util_internal.cc
@@ -45,8 +45,10 @@
 #include "arrow/testing/builder.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/random.h"
+#include "arrow/testing/util.h"
 #include "arrow/type.h"
 #include "arrow/util/async_generator.h"
+#include "arrow/util/cpu_info.h"
 #include "arrow/util/iterator.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/unreachable.h"
@@ -54,6 +56,7 @@
 
 namespace arrow {
 
+using arrow::internal::CpuInfo;
 using arrow::internal::Executor;
 
 using compute::SortKey;
@@ -62,6 +65,7 @@ using compute::Take;
 namespace acero {
 
 namespace {
+
 void ValidateOutputImpl(const ArrayData& output) {
   ASSERT_OK(::arrow::internal::ValidateArrayFull(output));
   TestInitialized(output);
@@ -116,6 +120,11 @@ void ValidateOutput(const Datum& output) {
   }
 }
 
+std::vector<int64_t> HardwareFlagsForTesting() {
+  // Acero currently only has AVX2 optimizations
+  return arrow::GetSupportedHardwareFlags({CpuInfo::AVX2});
+}
+
 namespace {
 
 struct DummyNode : ExecNode {
diff --git a/cpp/src/arrow/acero/test_util_internal.h 
b/cpp/src/arrow/acero/test_util_internal.h
index 03f4170286..569fb1254d 100644
--- a/cpp/src/arrow/acero/test_util_internal.h
+++ b/cpp/src/arrow/acero/test_util_internal.h
@@ -20,6 +20,7 @@
 #include "arrow/testing/gtest_util.h"
 #include "arrow/util/vector.h"
 
+#include <cstdint>
 #include <functional>
 #include <random>
 #include <string>
@@ -33,12 +34,14 @@
 #include "arrow/util/async_generator.h"
 #include "arrow/util/pcg_random.h"
 
-namespace arrow {
-
-namespace acero {
+namespace arrow::acero {
 
 void ValidateOutput(const Datum& output);
 
+// Enumerate all hardware flags that can be tested on this platform
+// and would lead to different code paths being tested in Acero.
+std::vector<int64_t> HardwareFlagsForTesting();
+
 using StartProducingFunc = std::function<Status(ExecNode*)>;
 using StopProducingFunc = std::function<void(ExecNode*)>;
 
@@ -204,5 +207,4 @@ struct TableGenerationProperties {
 Result<std::shared_ptr<Table>> MakeRandomTimeSeriesTable(
     const TableGenerationProperties& properties);
 
-}  // namespace acero
-}  // namespace arrow
+}  // namespace arrow::acero
diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt 
b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index a17d6275a7..0bd6fe8613 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -18,11 +18,20 @@
 # ----------------------------------------------------------------------
 # Tests that don't require the full kernel library
 
+# Define arrow_compute_testing object library for common test files
+if(ARROW_TESTING)
+  add_library(arrow_compute_kernels_testing OBJECT test_util.cc)
+  # Even though this is still just an object library we still need to "link" 
our
+  # dependencies so that include paths are configured correctly
+  target_link_libraries(arrow_compute_kernels_testing ${ARROW_GTEST_GTEST})
+endif()
+
 add_arrow_test(scalar_cast_test
                ${ARROW_COMPUTE_TEST_ARGS}
                SOURCES
                scalar_cast_test.cc
-               test_util.cc)
+               EXTRA_LINK_LIBS
+               arrow_compute_kernels_testing)
 
 # ----------------------------------------------------------------------
 # Scalar kernels
@@ -32,25 +41,36 @@ add_arrow_compute_test(scalar_type_test
                        scalar_boolean_test.cc
                        scalar_nested_test.cc
                        scalar_string_test.cc
-                       test_util.cc)
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
-add_arrow_compute_test(scalar_if_else_test SOURCES scalar_if_else_test.cc 
test_util.cc)
+add_arrow_compute_test(scalar_if_else_test
+                       SOURCES
+                       scalar_if_else_test.cc
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
-add_arrow_compute_test(scalar_temporal_test SOURCES scalar_temporal_test.cc 
test_util.cc)
+add_arrow_compute_test(scalar_temporal_test
+                       SOURCES
+                       scalar_temporal_test.cc
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
 add_arrow_compute_test(scalar_math_test
                        SOURCES
                        scalar_arithmetic_test.cc
                        scalar_compare_test.cc
                        scalar_round_arithmetic_test.cc
-                       test_util.cc)
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
 add_arrow_compute_test(scalar_utility_test
                        SOURCES
                        scalar_random_test.cc
                        scalar_set_lookup_test.cc
                        scalar_validity_test.cc
-                       test_util.cc)
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
 add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute")
@@ -75,12 +95,20 @@ add_arrow_compute_test(vector_test
                        vector_replace_test.cc
                        vector_run_end_encode_test.cc
                        select_k_test.cc
-                       test_util.cc)
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
-add_arrow_compute_test(vector_sort_test SOURCES vector_sort_test.cc 
test_util.cc)
+add_arrow_compute_test(vector_sort_test
+                       SOURCES
+                       vector_sort_test.cc
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
-add_arrow_compute_test(vector_selection_test SOURCES vector_selection_test.cc
-                       test_util.cc)
+add_arrow_compute_test(vector_selection_test
+                       SOURCES
+                       vector_selection_test.cc
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
 add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute")
@@ -94,7 +122,11 @@ add_arrow_benchmark(vector_selection_benchmark PREFIX 
"arrow-compute")
 
 # Aggregates
 
-add_arrow_compute_test(aggregate_test SOURCES aggregate_test.cc test_util.cc)
+add_arrow_compute_test(aggregate_test
+                       SOURCES
+                       aggregate_test.cc
+                       EXTRA_LINK_LIBS
+                       arrow_compute_kernels_testing)
 
 # ----------------------------------------------------------------------
 # Utilities
diff --git a/cpp/src/arrow/compute/key_hash.cc 
b/cpp/src/arrow/compute/key_hash.cc
index 3fcfbf3d83..f5867b405e 100644
--- a/cpp/src/arrow/compute/key_hash.cc
+++ b/cpp/src/arrow/compute/key_hash.cc
@@ -236,7 +236,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool 
combine_hashes, uint32_t
                            const uint32_t* offsets, const uint8_t* 
concatenated_keys,
                            uint32_t* hashes, uint32_t* 
hashes_temp_for_combine) {
   uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets, 
concatenated_keys,
                                     hashes, hashes_temp_for_combine);
@@ -255,7 +255,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool 
combine_hashes, uint32_t
                            const uint64_t* offsets, const uint8_t* 
concatenated_keys,
                            uint32_t* hashes, uint32_t* 
hashes_temp_for_combine) {
   uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets, 
concatenated_keys,
                                     hashes, hashes_temp_for_combine);
@@ -361,7 +361,7 @@ void Hashing32::HashFixed(int64_t hardware_flags, bool 
combine_hashes, uint32_t
   }
 
   uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     num_processed = HashFixedLen_avx2(combine_hashes, num_rows, length, keys, 
hashes,
                                       hashes_temp_for_combine);
diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h
index e43d7b8df5..b193716c9b 100644
--- a/cpp/src/arrow/compute/key_hash.h
+++ b/cpp/src/arrow/compute/key_hash.h
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
 #include <immintrin.h>
 #endif
 
@@ -115,7 +115,7 @@ class ARROW_EXPORT Hashing32 {
   static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t 
length_key,
                       const uint8_t* keys, uint32_t* hashes);
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   static inline __m256i Avalanche_avx2(__m256i hash);
   static inline __m256i CombineHashesImp_avx2(__m256i previous_hash, __m256i 
hash);
   template <bool T_COMBINE_HASHES>
diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_avx2.cc
index f30c3460bd..1b444b5767 100644
--- a/cpp/src/arrow/compute/key_hash_avx2.cc
+++ b/cpp/src/arrow/compute/key_hash_avx2.cc
@@ -23,8 +23,6 @@
 namespace arrow {
 namespace compute {
 
-#if defined(ARROW_HAVE_AVX2)
-
 inline __m256i Hashing32::Avalanche_avx2(__m256i hash) {
   hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15));
   hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2));
@@ -315,7 +313,5 @@ uint32_t Hashing32::HashVarLen_avx2(bool combine_hashes, 
uint32_t num_rows,
   }
 }
 
-#endif
-
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc
index d10645391b..3e6d41525c 100644
--- a/cpp/src/arrow/compute/key_hash_test.cc
+++ b/cpp/src/arrow/compute/key_hash_test.cc
@@ -21,18 +21,26 @@
 #include <map>
 #include <random>
 #include <unordered_set>
+
 #include "arrow/array/builder_binary.h"
 #include "arrow/compute/key_hash.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
 #include "arrow/util/cpu_info.h"
 #include "arrow/util/pcg_random.h"
 
 namespace arrow {
 
 using internal::checked_pointer_cast;
+using internal::CpuInfo;
 
 namespace compute {
 
+std::vector<int64_t> HardwareFlagsForTesting() {
+  // Our key-hash and key-map routines currently only have AVX2 optimizations
+  return GetSupportedHardwareFlags({CpuInfo::AVX2});
+}
+
 class TestVectorHash {
  private:
   template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
@@ -131,85 +139,79 @@ class TestVectorHash {
     const offset_t* key_offsets =
         reinterpret_cast<const offset_t*>(keys_array->raw_value_offsets());
 
-    std::vector<uint32_t> hashes_scalar32;
-    std::vector<uint64_t> hashes_scalar64;
-    hashes_scalar32.resize(num_rows);
-    hashes_scalar64.resize(num_rows);
-    std::vector<uint32_t> hashes_simd32;
-    std::vector<uint64_t> hashes_simd64;
-    hashes_simd32.resize(num_rows);
-    hashes_simd64.resize(num_rows);
-
-    int64_t hardware_flags_scalar = 0LL;
-    int64_t hardware_flags_simd = ::arrow::internal::CpuInfo::AVX2;
+    // For each tested hardware flags, we will compute the hashes and check
+    // them for consistency.
+    const auto hardware_flags_for_testing = HardwareFlagsForTesting();
+    ASSERT_GT(hardware_flags_for_testing.size(), 0);
+    std::vector<std::vector<uint32_t>> hashes32(hardware_flags_for_testing.size());
+    std::vector<std::vector<uint64_t>> hashes64(hardware_flags_for_testing.size());
+    for (auto& h : hashes32) {
+      h.resize(num_rows);
+    }
+    for (auto& h : hashes64) {
+      h.resize(num_rows);
+    }
 
     constexpr int mini_batch_size = 1024;
     std::vector<uint32_t> temp_buffer;
     temp_buffer.resize(mini_batch_size * 4);
 
-    for (bool use_simd : {false, true}) {
+    for (int i = 0; i < static_cast<int>(hardware_flags_for_testing.size()); ++i) {
+      const auto hardware_flags = hardware_flags_for_testing[i];
       if (use_32bit_hash) {
         if (!use_varlen_input) {
-          Hashing32::HashFixed(use_simd ? hardware_flags_simd : hardware_flags_scalar,
+          Hashing32::HashFixed(hardware_flags,
                                /*combine_hashes=*/false, num_rows, fixed_length, keys,
-                               use_simd ? hashes_simd32.data() : hashes_scalar32.data(),
-                               temp_buffer.data());
+                               hashes32[i].data(), temp_buffer.data());
         } else {
           for (int first_row = 0; first_row < num_rows;) {
             int batch_size_next = std::min(num_rows - first_row, mini_batch_size);
 
-            Hashing32::HashVarLen(
-                use_simd ? hardware_flags_simd : hardware_flags_scalar,
-                /*combine_hashes=*/false, batch_size_next, key_offsets + first_row, keys,
-                (use_simd ? hashes_simd32.data() : hashes_scalar32.data()) + first_row,
-                temp_buffer.data());
+            Hashing32::HashVarLen(hardware_flags,
+                                  /*combine_hashes=*/false, batch_size_next,
+                                  key_offsets + first_row, keys,
+                                  hashes32[i].data() + first_row, temp_buffer.data());
 
             first_row += batch_size_next;
           }
         }
+        for (int j = 0; j < num_rows; ++j) {
+          hashes64[i][j] = hashes32[i][j];
+        }
       } else {
         if (!use_varlen_input) {
           Hashing64::HashFixed(
-              /*combine_hashes=*/false, num_rows, fixed_length, keys,
-              use_simd ? hashes_simd64.data() : hashes_scalar64.data());
+              /*combine_hashes=*/false, num_rows, fixed_length, keys, hashes64[i].data());
         } else {
           Hashing64::HashVarLen(
-              /*combine_hashes=*/false, num_rows, key_offsets, keys,
-              use_simd ? hashes_simd64.data() : hashes_scalar64.data());
+              /*combine_hashes=*/false, num_rows, key_offsets, keys, hashes64[i].data());
         }
       }
     }
 
-    if (use_32bit_hash) {
-      for (int i = 0; i < num_rows; ++i) {
-        hashes_scalar64[i] = hashes_scalar32[i];
-        hashes_simd64[i] = hashes_simd32[i];
-      }
-    }
-
-    // Verify that both scalar and AVX2 implementations give the same hashes
+    // Verify that all implementations (scalar, SIMD) give the same hashes
     //
-    for (int i = 0; i < num_rows; ++i) {
-      ASSERT_EQ(hashes_scalar64[i], hashes_simd64[i])
-          << "scalar and simd approaches yielded different hashes";
+    const auto& hashes_scalar64 = hashes64[0];
+    for (int i = 0; i < static_cast<int>(hardware_flags_for_testing.size()); ++i) {
+      for (int j = 0; j < num_rows; ++j) {
+        ASSERT_EQ(hashes64[i][j], hashes_scalar64[j])
+            << "scalar and simd approaches yielded different hashes";
+      }
     }
 
     // Verify that the same key appearing multiple times generates the same hash
     // each time. Measure the number of unique hashes and compare to the number
     // of unique keys.
     //
-    std::map<int, uint64_t> unique_key_to_hash;
-    std::set<uint64_t> unique_hashes;
+    std::unordered_map<int, uint64_t> unique_key_to_hash;
+    std::unordered_set<uint64_t> unique_hashes;
     for (int i = 0; i < num_rows; ++i) {
-      std::map<int, uint64_t>::iterator iter = unique_key_to_hash.find(row_ids[i]);
-      if (iter == unique_key_to_hash.end()) {
-        unique_key_to_hash.insert(std::make_pair(row_ids[i], hashes_scalar64[i]));
-      } else {
-        ASSERT_EQ(iter->second, hashes_scalar64[i]);
-      }
-      if (unique_hashes.find(hashes_scalar64[i]) == unique_hashes.end()) {
-        unique_hashes.insert(hashes_scalar64[i]);
+      auto [it, inserted] =
+          unique_key_to_hash.try_emplace(row_ids[i], hashes_scalar64[i]);
+      if (!inserted) {
+        ASSERT_EQ(it->second, hashes_scalar64[i]);
       }
+      unique_hashes.insert(hashes_scalar64[i]);
     }
     float percent_hash_collisions =
         100.0f * static_cast<float>(num_unique - unique_hashes.size()) /
diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc
index fd5c404a07..71ca56c91a 100644
--- a/cpp/src/arrow/compute/key_map.cc
+++ b/cpp/src/arrow/compute/key_map.cc
@@ -133,7 +133,7 @@ void SwissTable::extract_group_ids(const int num_keys, 
const uint16_t* optional_
 
   // Optimistically use simplified lookup involving only a start block to find
   // a single group id candidate for every input.
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   int num_group_id_bytes = num_group_id_bits / 8;
   if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) && 
!optional_selection) {
     num_processed = extract_group_ids_avx2(num_keys, hashes, local_slots, 
out_group_ids,
@@ -301,7 +301,7 @@ void SwissTable::early_filter(const int num_keys, const 
uint32_t* hashes,
   // Optimistically use simplified lookup involving only a start block to find
   // a single group id candidate for every input.
   int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
     if (log_blocks_ <= 4) {
       num_processed = early_filter_imp_avx2_x32(num_keys, hashes, 
out_match_bitvector,
diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h
index 7ab48470f2..95fb3be274 100644
--- a/cpp/src/arrow/compute/key_map.h
+++ b/cpp/src/arrow/compute/key_map.h
@@ -163,7 +163,7 @@ class ARROW_EXPORT SwissTable {
   //
   void early_filter_imp(const int num_keys, const uint32_t* hashes,
                         uint8_t* out_match_bitvector, uint8_t* 
out_local_slots) const;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   int early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* hashes,
                                uint8_t* out_match_bitvector,
                                uint8_t* out_local_slots) const;
diff --git a/cpp/src/arrow/compute/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_avx2.cc
index eb318ff188..7315535110 100644
--- a/cpp/src/arrow/compute/key_map_avx2.cc
+++ b/cpp/src/arrow/compute/key_map_avx2.cc
@@ -23,8 +23,6 @@
 namespace arrow {
 namespace compute {
 
-#if defined(ARROW_HAVE_AVX2)
-
 // This is more or less translation of equivalent scalar code, adjusted for a
 // different instruction set (e.g. missing leading zero count instruction).
 //
@@ -412,7 +410,5 @@ int SwissTable::extract_group_ids_avx2(const int num_keys, 
const uint32_t* hashe
   return num_keys - (num_keys % unroll);
 }
 
-#endif
-
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc
index 39ac33932b..7c402e7a23 100644
--- a/cpp/src/arrow/compute/row/compare_internal.cc
+++ b/cpp/src/arrow/compute/row/compare_internal.cc
@@ -42,7 +42,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, 
uint32_t num_rows_to_com
     return;
   }
   uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (ctx->has_avx2()) {
     num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, 
num_rows_to_compare,
                                                sel_left_maybe_null, 
left_to_right_map,
@@ -130,7 +130,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t 
offset_within_row,
                                           const RowTableImpl& rows,
                                           uint8_t* match_bytevector) {
   uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (ctx->has_avx2()) {
     num_processed = CompareBinaryColumnToRow_avx2(
         use_selection, offset_within_row, num_rows_to_compare, 
sel_left_maybe_null,
@@ -297,7 +297,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t 
id_varbinary_col,
                                              const RowTableImpl& rows,
                                              uint8_t* match_bytevector) {
   uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (ctx->has_avx2()) {
     num_processed = CompareVarBinaryColumnToRow_avx2(
         use_selection, is_first_varbinary_col, id_varbinary_col, 
num_rows_to_compare,
@@ -313,7 +313,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t 
id_varbinary_col,
 void KeyCompare::AndByteVectors(LightContext* ctx, uint32_t num_elements,
                                 uint8_t* bytevector_A, const uint8_t* 
bytevector_B) {
   uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (ctx->has_avx2()) {
     num_processed = AndByteVectors_avx2(num_elements, bytevector_A, 
bytevector_B);
   }
diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h
index 638b8c2ec7..db953fbe11 100644
--- a/cpp/src/arrow/compute/row/compare_internal.h
+++ b/cpp/src/arrow/compute/row/compare_internal.h
@@ -86,7 +86,7 @@ class ARROW_EXPORT KeyCompare {
   static void AndByteVectors(LightContext* ctx, uint32_t num_elements,
                              uint8_t* bytevector_A, const uint8_t* 
bytevector_B);
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
 
   template <bool use_selection>
   static uint32_t NullUpdateColumnToRowImp_avx2(
diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc
index 95f37ab617..ff407c51b8 100644
--- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc
+++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc
@@ -24,8 +24,6 @@
 namespace arrow {
 namespace compute {
 
-#if defined(ARROW_HAVE_AVX2)
-
 inline __m256i set_first_n_bytes_avx2(int n) {
   constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL;
   constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL;
@@ -670,7 +668,5 @@ uint32_t KeyCompare::CompareVarBinaryColumnToRow_avx2(
   return num_rows_to_compare;
 }
 
-#endif
-
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc
index 3a6a85b027..01d552ef82 100644
--- a/cpp/src/arrow/compute/row/encode_internal.cc
+++ b/cpp/src/arrow/compute/row/encode_internal.cc
@@ -455,7 +455,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t 
num_rows,
 
     bool is_row_fixed_length = rows.metadata().is_fixed_length;
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
     if (ctx->has_avx2()) {
       DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, 
offset_within_row, rows,
                         col);
@@ -466,7 +466,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t 
num_rows,
       } else {
         DecodeImp<false>(start_row, num_rows, offset_within_row, rows, col);
       }
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
     }
 #endif
 
@@ -524,7 +524,7 @@ void EncoderBinaryPair::Decode(uint32_t start_row, uint32_t 
num_rows,
   bool is_row_fixed_length = rows.metadata().is_fixed_length;
 
   uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (ctx->has_avx2() && col_width1 == col_width2) {
     num_processed =
         DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows,
@@ -772,7 +772,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t 
num_rows,
                               KeyColumnArray* col, LightContext* ctx) {
   // Output column varbinary buffer needs an extra 32B
   // at the end in avx2 version and 8B otherwise.
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (ctx->has_avx2()) {
     DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col);
   } else {
@@ -782,7 +782,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t 
num_rows,
     } else {
       DecodeImp<false>(start_row, num_rows, varbinary_col_id, rows, col);
     }
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   }
 #endif
 }
diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h
index b83767b694..6091fb6698 100644
--- a/cpp/src/arrow/compute/row/encode_internal.h
+++ b/cpp/src/arrow/compute/row/encode_internal.h
@@ -187,7 +187,7 @@ class EncoderBinary {
   template <bool is_row_fixed_length>
   static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t 
offset_within_row,
                         const RowTableImpl& rows, KeyColumnArray* col);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row,
                                 uint32_t num_rows, uint32_t offset_within_row,
                                 const RowTableImpl& rows, KeyColumnArray* col);
@@ -213,7 +213,7 @@ class EncoderBinaryPair {
   static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, 
uint32_t num_rows,
                         uint32_t offset_within_row, const RowTableImpl& rows,
                         KeyColumnArray* col1, KeyColumnArray* col2);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t 
col_width,
                                     uint32_t start_row, uint32_t num_rows,
                                     uint32_t offset_within_row, const 
RowTableImpl& rows,
@@ -300,7 +300,7 @@ class EncoderVarBinary {
   template <bool first_varbinary_col>
   static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t 
varbinary_col_id,
                         const RowTableImpl& rows, KeyColumnArray* col);
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows,
                                 uint32_t varbinary_col_id, const RowTableImpl& 
rows,
                                 KeyColumnArray* col);
diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc b/cpp/src/arrow/compute/row/encode_internal_avx2.cc
index 02ba310bde..50969c7bd6 100644
--- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc
+++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc
@@ -22,8 +22,6 @@
 namespace arrow {
 namespace compute {
 
-#if defined(ARROW_HAVE_AVX2)
-
 void EncoderBinary::DecodeHelper_avx2(bool is_row_fixed_length, uint32_t 
start_row,
                                       uint32_t num_rows, uint32_t 
offset_within_row,
                                       const RowTableImpl& rows, 
KeyColumnArray* col) {
@@ -230,7 +228,5 @@ void EncoderVarBinary::DecodeImp_avx2(uint32_t start_row, 
uint32_t num_rows,
       });
 }
 
-#endif
-
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc
index f69f60a5af..faf3e0c87e 100644
--- a/cpp/src/arrow/compute/util.cc
+++ b/cpp/src/arrow/compute/util.cc
@@ -118,7 +118,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const 
int num_bits,
   // 64 bits at a time
   constexpr int unroll = 64;
   int tail = num_bits % unroll;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     if (filter_input_indexes) {
       avx2::bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, 
input_indexes,
@@ -141,7 +141,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const 
int num_bits,
         bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, 
indexes);
       }
     }
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   }
 #endif
   // Optionally process the last partial word with masking out bits outside range
@@ -253,7 +253,7 @@ void bits_to_bytes(int64_t hardware_flags, const int 
num_bits, const uint8_t* bi
   }
 
   int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     // The function call below processes whole 32 bit chunks together.
     num_processed = num_bits - (num_bits % 32);
@@ -309,7 +309,7 @@ void bytes_to_bits(int64_t hardware_flags, const int 
num_bits, const uint8_t* by
   }
 
   int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     // The function call below processes whole 32 bit chunks together.
     num_processed = num_bits - (num_bits % 32);
@@ -339,7 +339,7 @@ void bytes_to_bits(int64_t hardware_flags, const int 
num_bits, const uint8_t* by
 
 bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
                         uint32_t num_bytes) {
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     return avx2::are_all_bytes_zero_avx2(bytes, num_bytes);
   }
diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h
index 489139eab8..730e59f346 100644
--- a/cpp/src/arrow/compute/util.h
+++ b/cpp/src/arrow/compute/util.h
@@ -168,7 +168,7 @@ ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, 
const int num_bits,
 ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* 
bytes,
                                      uint32_t num_bytes);
 
-#if defined(ARROW_HAVE_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
 
 namespace avx2 {
 ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int 
num_bits,
diff --git a/cpp/src/arrow/compute/util_avx2.cc b/cpp/src/arrow/compute/util_avx2.cc
index 89ec6aa97a..0191ab06f9 100644
--- a/cpp/src/arrow/compute/util_avx2.cc
+++ b/cpp/src/arrow/compute/util_avx2.cc
@@ -21,9 +21,7 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/logging.h"
 
-#if defined(ARROW_HAVE_AVX2)
-
-namespace arrow::util::avx2 {
+namespace arrow::util::bit_util::avx2 {
 
 template <int bit_to_search>
 void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, int* 
num_indexes,
@@ -211,6 +209,4 @@ bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t 
num_bytes) {
   return result_or32 == 0;
 }
 
-}  // namespace arrow::util::avx2
-
-#endif  // ARROW_HAVE_AVX2
+}  // namespace arrow::util::bit_util::avx2
diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc
index b598544807..e8a782575e 100644
--- a/cpp/src/arrow/testing/util.cc
+++ b/cpp/src/arrow/testing/util.cc
@@ -43,6 +43,7 @@
 #include "arrow/table.h"
 #include "arrow/testing/random.h"
 #include "arrow/type.h"
+#include "arrow/util/cpu_info.h"
 #include "arrow/util/io_util.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/pcg_random.h"
@@ -211,4 +212,18 @@ const std::vector<std::shared_ptr<DataType>>& 
all_dictionary_index_types() {
   return types;
 }
 
+std::vector<int64_t> GetSupportedHardwareFlags(
+    const std::vector<int64_t>& candidate_flags) {
+  std::vector<int64_t> hardware_flags;
+  // Always test fallback codepaths
+  hardware_flags.push_back(0);
+  for (const int64_t candidate_flag : candidate_flags) {
+    if (candidate_flag != 0 &&
+        internal::CpuInfo::GetInstance()->IsSupported(candidate_flag)) {
+      hardware_flags.push_back(candidate_flag);
+    }
+  }
+  return hardware_flags;
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h
index 4f4b03438f..b4b2785a36 100644
--- a/cpp/src/arrow/testing/util.h
+++ b/cpp/src/arrow/testing/util.h
@@ -131,4 +131,10 @@ ARROW_TESTING_EXPORT std::string GetListenAddress();
 ARROW_TESTING_EXPORT
 const std::vector<std::shared_ptr<DataType>>& all_dictionary_index_types();
 
+// Get a list of supported hardware flags from the given candidates.
+// The result will always contain 0, meaning no optional CPU feature enabled at all.
+ARROW_TESTING_EXPORT
+std::vector<int64_t> GetSupportedHardwareFlags(
+    const std::vector<int64_t>& candidate_flags);
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/byte_stream_split.h b/cpp/src/arrow/util/byte_stream_split.h
index 28dcce52bb..d428df0659 100644
--- a/cpp/src/arrow/util/byte_stream_split.h
+++ b/cpp/src/arrow/util/byte_stream_split.h
@@ -39,9 +39,9 @@ void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t 
num_values, int64_t
   constexpr size_t kNumStreams = sizeof(T);
   static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of 
streams.");
   constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+  constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
 
   const int64_t size = num_values * sizeof(T);
-  constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
   const int64_t num_blocks = size / kBlockSize;
   uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
 
@@ -92,11 +92,12 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, 
const size_t num_value
                                uint8_t* output_buffer_raw) {
   constexpr size_t kNumStreams = sizeof(T);
   static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of 
streams.");
+  constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
+
   __m128i stage[3][kNumStreams];
   __m128i final_result[kNumStreams];
 
   const size_t size = num_values * sizeof(T);
-  constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
   const size_t num_blocks = size / kBlockSize;
   const __m128i* raw_values_sse = reinterpret_cast<const __m128i*>(raw_values);
   __m128i* output_buffer_streams[kNumStreams];
@@ -143,7 +144,7 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, 
const size_t num_value
             _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
       }
     }
-    if (kNumStreams == 8U) {
+    if constexpr (kNumStreams == 8U) {
       // This is the path for double.
       __m128i tmp[8];
       for (size_t i = 0; i < 4; ++i) {
@@ -181,9 +182,9 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t 
num_values, int64_t
   constexpr size_t kNumStreams = sizeof(T);
   static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of 
streams.");
   constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+  constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
 
   const int64_t size = num_values * sizeof(T);
-  constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
   if (size < kBlockSize)  // Back to SSE for small size
     return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
   const int64_t num_blocks = size / kBlockSize;
@@ -220,7 +221,7 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t 
num_values, int64_t
       }
     }
 
-    if (kNumStreams == 8U) {
+    if constexpr (kNumStreams == 8U) {
       // path for double, 128i index:
       //   {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B},
       //   {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F},
@@ -266,11 +267,12 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, 
const size_t num_value
                                uint8_t* output_buffer_raw) {
   constexpr size_t kNumStreams = sizeof(T);
   static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of 
streams.");
-  if (kNumStreams == 8U)  // Back to SSE, currently no path for double.
+  constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
+
+  if constexpr (kNumStreams == 8U)  // Back to SSE, currently no path for double.
     return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
 
   const size_t size = num_values * sizeof(T);
-  constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
   if (size < kBlockSize)  // Back to SSE for small size
     return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, 
output_buffer_raw);
   const size_t num_blocks = size / kBlockSize;
@@ -339,9 +341,9 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data, 
int64_t num_values, int64_
   constexpr size_t kNumStreams = sizeof(T);
   static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of 
streams.");
   constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+  constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
 
   const int64_t size = num_values * sizeof(T);
-  constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
   if (size < kBlockSize)  // Back to AVX2 for small size
     return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
   const int64_t num_blocks = size / kBlockSize;
@@ -379,7 +381,7 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data, 
int64_t num_values, int64_
       }
     }
 
-    if (kNumStreams == 8U) {
+    if constexpr (kNumStreams == 8U) {
       // path for double, 128i index:
       // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
       // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
@@ -442,8 +444,10 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* 
raw_values, const size_t num_val
                                  uint8_t* output_buffer_raw) {
   constexpr size_t kNumStreams = sizeof(T);
   static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of 
streams.");
-  const size_t size = num_values * sizeof(T);
   constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams;
+
+  const size_t size = num_values * sizeof(T);
+
   if (size < kBlockSize)  // Back to AVX2 for small size
     return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, 
output_buffer_raw);
 
@@ -469,7 +473,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, 
const size_t num_val
   __m512i unpack[KNumUnpack + 1][kNumStreams];
   __m512i permutex[kNumStreams];
   __m512i permutex_mask;
-  if (kNumStreams == 8U) {
+  if constexpr (kNumStreams == 8U) {
     // use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version.
     permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 
0x000E0006,
                                      0x001D0015, 0x000D0005, 0x001C0014, 
0x000C0004,
@@ -494,7 +498,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, 
const size_t num_val
       }
     }
 
-    if (kNumStreams == 8U) {
+    if constexpr (kNumStreams == 8U) {
       // path for double
       // 1. unpack to epi16 block
       // 2. permutexvar_epi16 to 128i block
diff --git a/docker-compose.yml b/docker-compose.yml
index fbb879b2bc..8727aded2c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -72,6 +72,10 @@ x-sccache: &sccache
   SCCACHE_REGION:
   SCCACHE_S3_KEY_PREFIX: ${SCCACHE_S3_KEY_PREFIX:-sccache}
 
+x-cpp: &cpp
+  ARROW_RUNTIME_SIMD_LEVEL:
+  ARROW_SIMD_LEVEL:
+
 # CPU/memory limit presets to pass to Docker.
 #
 # Usage: archery docker run --resource-limit=github <image>
@@ -227,7 +231,7 @@ services:
     ulimits: &ulimits
       core: ${ULIMIT_CORE}
     environment:
-      <<: [*common, *ccache]
+      <<: [*common, *ccache, *cpp]
       ARROW_ENABLE_TIMING_TESTS:  # inherit
       ARROW_MIMALLOC: "ON"
     volumes: &alpine-linux-volumes
@@ -278,7 +282,7 @@ services:
     shm_size: *shm-size
     ulimits: *ulimits
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_BUILD_BENCHMARKS: "ON"
       ARROW_BUILD_EXAMPLES: "ON"
       ARROW_ENABLE_TIMING_TESTS:  # inherit
@@ -313,7 +317,7 @@ services:
         arch: ${ARCH}
     shm_size: *shm-size
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       # Shrink test runtime by enabling minimal optimizations
       ARROW_C_FLAGS_DEBUG: "-g1 -Og"
       ARROW_CXX_FLAGS_DEBUG: "-g1 -Og"
@@ -349,7 +353,7 @@ services:
     shm_size: *shm-size
     ulimits: *ulimits
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_ENABLE_TIMING_TESTS:  # inherit
       ARROW_MIMALLOC: "ON"
     volumes: &debian-volumes
@@ -390,7 +394,7 @@ services:
       - apparmor:unconfined
     ulimits: *ulimits
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_ENABLE_TIMING_TESTS:  # inherit
       ARROW_MIMALLOC: "ON"
     volumes: &ubuntu-volumes
@@ -426,7 +430,7 @@ services:
       - apparmor:unconfined
     ulimits: *ulimits
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_HOME: /arrow
       ARROW_DEPENDENCY_SOURCE: BUNDLED
       LIBARROW_MINIMAL: "false"
@@ -448,7 +452,7 @@ services:
     volumes:
       - .:/arrow:delegated
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_DEPENDENCY_SOURCE: BUNDLED
       ARROW_HOME: /arrow
       LIBARROW_MINIMAL: "false"
@@ -470,7 +474,7 @@ services:
     shm_size: *shm-size
     ulimits: *ulimits
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_DEPENDENCY_SOURCE: BUNDLED
       CMAKE_GENERATOR: "Unix Makefiles"
     volumes: *ubuntu-volumes
@@ -491,7 +495,7 @@ services:
     shm_size: *shm-size
     ulimits: *ulimits
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_BUILD_UTILITIES: "OFF"
       ARROW_COMPUTE: "OFF"
       ARROW_CSV: "OFF"
@@ -538,7 +542,7 @@ services:
     shm_size: *shm-size
     ulimits: *ulimits
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_BUILD_UTILITIES: "OFF"
       ARROW_COMPUTE: "OFF"
       ARROW_CSV: "OFF"
@@ -588,7 +592,7 @@ services:
     shm_size: *shm-size
     volumes: *ubuntu-volumes
     environment:
-      <<: [*common, *ccache]
+      <<: [*common, *ccache, *cpp]
       CC: clang-${CLANG_TOOLS}
       CXX: clang++-${CLANG_TOOLS}
       # Avoid creating huge static libraries
@@ -630,7 +634,7 @@ services:
     shm_size: *shm-size
     volumes: *ubuntu-volumes
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       CC: clang-${CLANG_TOOLS}
       CXX: clang++-${CLANG_TOOLS}
       ARROW_BUILD_STATIC: "OFF"
@@ -662,7 +666,7 @@ services:
     shm_size: *shm-size
     ulimits: *ulimits
     environment:
-      <<: [*common, *ccache, *sccache]
+      <<: [*common, *ccache, *sccache, *cpp]
       ARROW_ENABLE_TIMING_TESTS:  # inherit
       ARROW_MIMALLOC: "ON"
       Protobuf_SOURCE: "BUNDLED"  # Need Protobuf >= 3.15

[arrow] branch main updated: GH-35116: [CI][C++] Enable compile-time AVX2 on some CI platforms (#36662)

Reply via email to