Add support for a vector permute cost, since various permutes can expand into
a complex sequence of instructions.  This fixes major performance regressions
caused by recent changes in the SLP vectorizer (which now vectorizes more
aggressively and emits many complex permutes).

Set the cost to > 1 for all microarchitectures so that the number of permutes
is usually zero and the regressions disappear.  An example of the kind of code
that might be emitted for VEC_PERM_EXPR {0, 3} where the registers happen to be
in the wrong order:

        adrp    x4, .LC16
        ldr     q5, [x4, #:lo12:.LC16]
        eor     v1.16b, v1.16b, v0.16b
        eor     v0.16b, v1.16b, v0.16b
        eor     v1.16b, v1.16b, v0.16b
        tbl     v0.16b, {v0.16b - v1.16b}, v5.16b

Regression tests pass.  This fixes regressions that were introduced recently,
so OK for commit?


ChangeLog:
2015-12-15  Wilco Dijkstra  <wdijk...@arm.com>

        * gcc/config/aarch64/aarch64.c (generic_vector_cost):
        Set vec_permute_cost.
        (cortexa57_vector_cost): Likewise.
        (exynosm1_vector_cost): Likewise.
        (xgene1_vector_cost): Likewise.
        (aarch64_builtin_vectorization_cost): Use vec_permute_cost.
        * gcc/config/aarch64/aarch64-protos.h (cpu_vector_cost):
        Add vec_permute_cost entry.


diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 549a89d1f691b32efbc74359f045b5df74765f0e..1bc812a4d01e8b9895c11cefde3148429397e95a 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -156,9 +156,10 @@ struct cpu_vector_cost
   const int scalar_load_cost;           /* Cost of scalar load.  */
   const int scalar_store_cost;          /* Cost of scalar store.  */
   const int vec_stmt_cost;              /* Cost of any vector operation,
-                                           excluding load, store,
+                                           excluding load, store, permute,
                                            vector-to-scalar and
                                            scalar-to-vector operation.  */
+  const int vec_permute_cost;           /* Cost of permute operation.  */
  const int vec_to_scalar_cost;                 /* Cost of vec-to-scalar operation.  */
   const int scalar_to_vec_cost;                 /* Cost of scalar-to-vector
                                            operation.  */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 10754c88c0973d8ef3c847195b727f02b193bbd8..2584f16d345b3d015d577dd28c08a73ee3e0b0fb 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -314,6 +314,7 @@ static const struct cpu_vector_cost generic_vector_cost =
   1, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
   1, /* vec_stmt_cost  */
+  2, /* vec_permute_cost  */
   1, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
   1, /* vec_align_load_cost  */
@@ -331,6 +332,7 @@ static const struct cpu_vector_cost cortexa57_vector_cost =
   4, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
   3, /* vec_stmt_cost  */
+  3, /* vec_permute_cost  */
   8, /* vec_to_scalar_cost  */
   8, /* scalar_to_vec_cost  */
   5, /* vec_align_load_cost  */
@@ -347,6 +349,7 @@ static const struct cpu_vector_cost exynosm1_vector_cost =
   5, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
   3, /* vec_stmt_cost  */
+  3, /* vec_permute_cost  */
   3, /* vec_to_scalar_cost  */
   3, /* scalar_to_vec_cost  */
   5, /* vec_align_load_cost  */
@@ -364,6 +367,7 @@ static const struct cpu_vector_cost xgene1_vector_cost =
   5, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
   2, /* vec_stmt_cost  */
+  2, /* vec_permute_cost  */
   4, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
   10, /* vec_align_load_cost  */
@@ -7555,6 +7559,8 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
        return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
 
       case vec_perm:
+       return aarch64_tune_params.vec_costs->vec_permute_cost;
+
       case vec_promote_demote:
        return aarch64_tune_params.vec_costs->vec_stmt_cost;


Reply via email to