Hi,
The patch ( http://gcc.gnu.org/ml/gcc-patches/2011-02/txt00059.txt )
introduced splitting of avx256 unaligned loads.
However, we found that it causes significant regressions for cpu2006 (
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49089 ).
In this work, we introduce a tuning option that enables splitting of
unaligned loads by default only for those CPUs on which the splitting
is beneficial.
The patch passed bootstrapping and regression tests on x86_64-unknown-linux-gnu
system.
Is it OK to commit?
Thanks,
Changpeng
From 415012803abf2cac95c067394504c55dd968f4f5 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <chfang@huainan.(none)>
Date: Mon, 13 Jun 2011 13:13:32 -0700
Subject: [PATCH] pr49089: enable avx256 splitting unaligned load only when beneficial
* config/i386/i386.h (ix86_tune_indices): Introduce
X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL.
(TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL): New definition.
* config/i386/i386.c (ix86_tune_features): Add entry for
X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL.
(ix86_option_override_internal): Enable avx256 unaligned load splitting
only when TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL is set.
---
gcc/config/i386/i386.c | 9 +++++++--
gcc/config/i386/i386.h | 3 +++
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7b266b9..d5f358f 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2088,7 +2088,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_SOFTARE_PREFETCHING_BENEFICIAL: Enable software prefetching
at -O3. For the moment, the prefetching seems badly tuned for Intel
chips. */
- m_K6_GEODE | m_AMD_MULTIPLE
+ m_K6_GEODE | m_AMD_MULTIPLE,
+
+  /* X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL: Enable splitting 256-bit
+     unaligned loads.  The splitting hurts performance on Bulldozer.  */
+ m_COREI7
};
/* Feature tests against the various architecture variations. */
@@ -4194,7 +4198,8 @@ ix86_option_override_internal (bool main_args_p)
if (flag_expensive_optimizations
&& !(target_flags_explicit & MASK_VZEROUPPER))
target_flags |= MASK_VZEROUPPER;
- if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
+ if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL
+ && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8badcbb..b2a1bc8 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -312,6 +312,7 @@ enum ix86_tune_indices {
X86_TUNE_OPT_AGU,
X86_TUNE_VECTORIZE_DOUBLE,
X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL,
+ X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL,
X86_TUNE_LAST
};
@@ -410,6 +411,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
#define TARGET_SOFTWARE_PREFETCHING_BENEFICIAL \
ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
+#define TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL \
+ ix86_tune_features[X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
--
1.7.0.4