This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 50015f0848 GH-37017: [C++] Guard unexpected uses of BMI2 instructions 
(#37610)
50015f0848 is described below

commit 50015f084846faaa5bf867346d045499e3f68316
Author: Antoine Pitrou <[email protected]>
AuthorDate: Fri Sep 8 09:08:12 2023 +0200

    GH-37017: [C++] Guard unexpected uses of BMI2 instructions (#37610)
    
    ### Rationale for this change
    
    Some functions introduced with Acero only check for AVX2 availability, but 
they actually invoke BMI2 instructions.
    This can have two negative consequences:
    * compiling BMI2 intrinsics may fail because BMI2 was not explicitly 
enabled on the compiler (gh-37017)
    * some rare CPUs (Via CPUs perhaps) may support AVX2 but not BMI2; other 
CPUs by AMD have a very inefficient implementation of some BMI2 instructions
    
    ### What changes are included in this PR?
    
    1. Ensure that the suitable compiler flag is passed when compiling code 
with BMI2 intrinsics
    2. Make sure the CPU supports BMI2 adequately before invoking functions 
featuring BMI2 instructions
    
    ### Are these changes tested?
    
    Yes, assuming CI covers enough diversity of target platforms.
    
    ### Are there any user-facing changes?
    
    No, but performance might change (positively or negatively) depending on 
the CPU and platform.
    
    * Closes: #37017
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/cmake_modules/SetupCxxFlags.cmake  | 10 ++++++++--
 cpp/src/arrow/CMakeLists.txt           | 13 +++++++++++--
 cpp/src/arrow/compute/key_map.cc       | 10 ++++++----
 cpp/src/arrow/compute/key_map.h        |  3 ++-
 cpp/src/arrow/compute/util.cc          | 19 ++++++++++---------
 cpp/src/arrow/compute/util.h           |  3 ++-
 cpp/src/parquet/CMakeLists.txt         | 12 ++++++++----
 cpp/src/parquet/level_conversion_inc.h |  2 ++
 8 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake 
b/cpp/cmake_modules/SetupCxxFlags.cmake
index 6b47fcb717..a5f5659723 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -49,13 +49,16 @@ if(ARROW_CPU_FLAG STREQUAL "x86")
   if(MSVC)
     set(ARROW_SSE4_2_FLAG "")
     set(ARROW_AVX2_FLAG "/arch:AVX2")
+    # MSVC has no specific flag for BMI2, it seems to be enabled with AVX2
+    set(ARROW_BMI2_FLAG "/arch:AVX2")
     set(ARROW_AVX512_FLAG "/arch:AVX512")
     set(CXX_SUPPORTS_SSE4_2 TRUE)
   else()
     set(ARROW_SSE4_2_FLAG "-msse4.2")
     set(ARROW_AVX2_FLAG "-march=haswell")
+    set(ARROW_BMI2_FLAG "-mbmi2")
     # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ
-    set(ARROW_AVX512_FLAG "-march=skylake-avx512 -mbmi2")
+    set(ARROW_AVX512_FLAG "-march=skylake-avx512")
     # Append the avx2/avx512 subset option also, fix issue ARROW-9877 for 
homebrew-cpp
     set(ARROW_AVX2_FLAG "${ARROW_AVX2_FLAG} -mavx2")
     set(ARROW_AVX512_FLAG
@@ -95,13 +98,16 @@ if(ARROW_CPU_FLAG STREQUAL "x86")
     set(ARROW_HAVE_RUNTIME_SSE4_2 ON)
     add_definitions(-DARROW_HAVE_RUNTIME_SSE4_2)
   endif()
+  # Note: for now we assume that AVX2 support should also enable BMI2 support,
+  # at least at compile-time (more care may be required for runtime dispatch).
   if(CXX_SUPPORTS_AVX2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES 
"^(AVX2|AVX512|MAX)$")
     set(ARROW_HAVE_RUNTIME_AVX2 ON)
+    set(ARROW_HAVE_RUNTIME_BMI2 ON)
     add_definitions(-DARROW_HAVE_RUNTIME_AVX2 -DARROW_HAVE_RUNTIME_BMI2)
   endif()
   if(CXX_SUPPORTS_AVX512 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$")
     set(ARROW_HAVE_RUNTIME_AVX512 ON)
-    add_definitions(-DARROW_HAVE_RUNTIME_AVX512 -DARROW_HAVE_RUNTIME_BMI2)
+    add_definitions(-DARROW_HAVE_RUNTIME_AVX512)
   endif()
   if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT")
     set(ARROW_SIMD_LEVEL "SSE4_2")
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 427463bb16..f474d0c517 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -127,6 +127,15 @@ macro(append_runtime_avx2_src SRC)
   endif()
 endmacro()
 
+macro(append_runtime_avx2_bmi2_src SRC)
+  if(ARROW_HAVE_RUNTIME_AVX2 AND ARROW_HAVE_RUNTIME_BMI2)
+    list(APPEND ARROW_SRCS ${SRC})
+    set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
+    set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS
+                                                  "${ARROW_AVX2_FLAG} 
${ARROW_BMI2_FLAG}")
+  endif()
+endmacro()
+
 macro(append_runtime_avx512_src SRC)
   if(ARROW_HAVE_RUNTIME_AVX512)
     list(APPEND ARROW_SRCS ${SRC})
@@ -433,10 +442,10 @@ list(APPEND
      compute/util.cc)
 
 append_runtime_avx2_src(compute/key_hash_avx2.cc)
-append_runtime_avx2_src(compute/key_map_avx2.cc)
+append_runtime_avx2_bmi2_src(compute/key_map_avx2.cc)
 append_runtime_avx2_src(compute/row/compare_internal_avx2.cc)
 append_runtime_avx2_src(compute/row/encode_internal_avx2.cc)
-append_runtime_avx2_src(compute/util_avx2.cc)
+append_runtime_avx2_bmi2_src(compute/util_avx2.cc)
 
 if(ARROW_COMPUTE)
   # Include the remaining kernels
diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc
index 71ca56c91a..525dae850f 100644
--- a/cpp/src/arrow/compute/key_map.cc
+++ b/cpp/src/arrow/compute/key_map.cc
@@ -28,6 +28,7 @@
 namespace arrow {
 
 using bit_util::CountLeadingZeros;
+using internal::CpuInfo;
 
 namespace compute {
 
@@ -133,9 +134,10 @@ void SwissTable::extract_group_ids(const int num_keys, 
const uint16_t* optional_
 
   // Optimistically use simplified lookup involving only a start block to find
   // a single group id candidate for every input.
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
   int num_group_id_bytes = num_group_id_bits / 8;
-  if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) && 
!optional_selection) {
+  if ((hardware_flags_ & CpuInfo::AVX2) && 
CpuInfo::GetInstance()->HasEfficientBmi2() &&
+      !optional_selection) {
     num_processed = extract_group_ids_avx2(num_keys, hashes, local_slots, 
out_group_ids,
                                            sizeof(uint64_t), 8 + 8 * 
num_group_id_bytes,
                                            num_group_id_bytes);
@@ -301,8 +303,8 @@ void SwissTable::early_filter(const int num_keys, const 
uint32_t* hashes,
   // Optimistically use simplified lookup involving only a start block to find
   // a single group id candidate for every input.
   int num_processed = 0;
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
-  if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
+  if ((hardware_flags_ & CpuInfo::AVX2) && 
CpuInfo::GetInstance()->HasEfficientBmi2()) {
     if (log_blocks_ <= 4) {
       num_processed = early_filter_imp_avx2_x32(num_keys, hashes, 
out_match_bitvector,
                                                 out_local_slots);
diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h
index 95fb3be274..85ef9029d6 100644
--- a/cpp/src/arrow/compute/key_map.h
+++ b/cpp/src/arrow/compute/key_map.h
@@ -163,7 +163,8 @@ class ARROW_EXPORT SwissTable {
   //
   void early_filter_imp(const int num_keys, const uint32_t* hashes,
                         uint8_t* out_match_bitvector, uint8_t* 
out_local_slots) const;
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
+  // The functions below use BMI2 instructions, be careful before calling!
   int early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* hashes,
                                uint8_t* out_match_bitvector,
                                uint8_t* out_local_slots) const;
diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc
index faf3e0c87e..c55143af0c 100644
--- a/cpp/src/arrow/compute/util.cc
+++ b/cpp/src/arrow/compute/util.cc
@@ -27,6 +27,7 @@
 namespace arrow {
 
 using bit_util::CountTrailingZeros;
+using internal::CpuInfo;
 
 namespace util {
 
@@ -118,8 +119,8 @@ void bits_to_indexes_internal(int64_t hardware_flags, const 
int num_bits,
   // 64 bits at a time
   constexpr int unroll = 64;
   int tail = num_bits % unroll;
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
-  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
+  if ((hardware_flags & CpuInfo::AVX2) && 
CpuInfo::GetInstance()->HasEfficientBmi2()) {
     if (filter_input_indexes) {
       avx2::bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, 
input_indexes,
                                      num_indexes, indexes);
@@ -141,7 +142,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const 
int num_bits,
         bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, 
indexes);
       }
     }
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
   }
 #endif
   // Optionally process the last partial word with masking out bits outside 
range
@@ -253,8 +254,8 @@ void bits_to_bytes(int64_t hardware_flags, const int 
num_bits, const uint8_t* bi
   }
 
   int num_processed = 0;
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
-  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
+  if ((hardware_flags & CpuInfo::AVX2) && 
CpuInfo::GetInstance()->HasEfficientBmi2()) {
     // The function call below processes whole 32 bit chunks together.
     num_processed = num_bits - (num_bits % 32);
     avx2::bits_to_bytes_avx2(num_processed, bits, bytes);
@@ -309,8 +310,8 @@ void bytes_to_bits(int64_t hardware_flags, const int 
num_bits, const uint8_t* by
   }
 
   int num_processed = 0;
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
-  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
+  if ((hardware_flags & CpuInfo::AVX2) && 
CpuInfo::GetInstance()->HasEfficientBmi2()) {
     // The function call below processes whole 32 bit chunks together.
     num_processed = num_bits - (num_bits % 32);
     avx2::bytes_to_bits_avx2(num_processed, bytes, bits);
@@ -339,8 +340,8 @@ void bytes_to_bits(int64_t hardware_flags, const int 
num_bits, const uint8_t* by
 
 bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
                         uint32_t num_bytes) {
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
-  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
+  if ((hardware_flags & CpuInfo::AVX2) && 
CpuInfo::GetInstance()->HasEfficientBmi2()) {
     return avx2::are_all_bytes_zero_avx2(bytes, num_bytes);
   }
 #endif
diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h
index 730e59f346..e2a2e71b85 100644
--- a/cpp/src/arrow/compute/util.h
+++ b/cpp/src/arrow/compute/util.h
@@ -168,7 +168,8 @@ ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, 
const int num_bits,
 ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* 
bytes,
                                      uint32_t num_bytes);
 
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
+// The functions below use BMI2 instructions, be careful before calling!
 
 namespace avx2 {
 ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int 
num_bits,
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index eabfcea586..8ffeab4dfb 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -212,10 +212,14 @@ if(ARROW_HAVE_RUNTIME_AVX2)
   #
   # TODO: Use COMPILE_OPTIONS instead of COMPILE_FLAGS when we require
   # CMake 3.11 or later.
-  set(BMI2_FLAGS "${AVX2_FLAGS} -DARROW_HAVE_BMI2 -mbmi2")
-  set_source_files_properties(level_conversion_bmi2.cc
-                              PROPERTIES SKIP_PRECOMPILE_HEADERS ON 
COMPILE_FLAGS
-                                                                    
"${BMI2_FLAGS}")
+  if(ARROW_HAVE_RUNTIME_BMI2)
+    # Need to pass ARROW_HAVE_BMI2 for level_conversion_inc.h to compile
+    # the BMI2 path.
+    set(BMI2_FLAGS "${AVX2_FLAGS} ${ARROW_BMI2_FLAG} -DARROW_HAVE_BMI2")
+    set_source_files_properties(level_conversion_bmi2.cc
+                                PROPERTIES SKIP_PRECOMPILE_HEADERS ON 
COMPILE_FLAGS
+                                                                      
"${BMI2_FLAGS}")
+  endif()
 endif()
 
 if(PARQUET_REQUIRE_ENCRYPTION)
diff --git a/cpp/src/parquet/level_conversion_inc.h 
b/cpp/src/parquet/level_conversion_inc.h
index 0bcdbccb34..d1ccedabfd 100644
--- a/cpp/src/parquet/level_conversion_inc.h
+++ b/cpp/src/parquet/level_conversion_inc.h
@@ -29,9 +29,11 @@
 #include "arrow/util/simd.h"
 #include "parquet/exception.h"
 #include "parquet/level_comparison.h"
+
 #ifndef PARQUET_IMPL_NAMESPACE
 #error "PARQUET_IMPL_NAMESPACE must be defined"
 #endif
+
 namespace parquet::internal::PARQUET_IMPL_NAMESPACE {
 
 // clang-format off

Reply via email to