On ThunderX, I found that aligning functions/loops/jumps to an 8 byte boundary gives slightly better performance because the hardware issue and dispatch match what GCC's scheduler has created.
I set generic, cortex-a53 and cortex-a57 to be 8 byte aligned also. Someone might want to change the cortex-a57 number to be more correct for that processor. Understanding that cortex-a53 is dual issue, it made sense to set 8 byte alignment, but I don't know if it really makes sense. Built and tested for aarch64-elf with no regressions. ChangeLog: * config/aarch64/aarch64-protos.h (tune_params): Add align field. * config/aarch64/aarch64.c (generic_tunings): Specify align. (cortexa53_tunings): Likewise. (cortexa57_tunings): Likewise. (thunderx_tunings): Likewise. (aarch64_override_options): Set align_loops, align_jumps, align_functions based on the align field of the tuning struct. --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 9e0ff8c..3e70495 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -171,6 +171,7 @@ struct tune_params const int memmov_cost; const int issue_rate; const unsigned int fuseable_ops; + const unsigned int align; }; HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned); diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 5216ac0..9214332 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -317,7 +317,8 @@ static const struct tune_params generic_tunings = &generic_vector_cost, NAMED_PARAM (memmov_cost, 4), NAMED_PARAM (issue_rate, 2), - NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING) + NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING), + NAMED_PARAM (align, 8) }; static const struct tune_params cortexa53_tunings = @@ -328,7 +329,8 @@ static const struct tune_params cortexa53_tunings = &generic_vector_cost, NAMED_PARAM (memmov_cost, 4), NAMED_PARAM (issue_rate, 2), - NAMED_PARAM (fuseable_ops, AARCH64_FUSE_MOV_MOVK) + NAMED_PARAM (fuseable_ops, 
AARCH64_FUSE_MOV_MOVK), + NAMED_PARAM (align, 8) }; static const struct tune_params cortexa57_tunings = @@ -339,7 +341,8 @@ static const struct tune_params cortexa57_tunings = &cortexa57_vector_cost, NAMED_PARAM (memmov_cost, 4), NAMED_PARAM (issue_rate, 3), - NAMED_PARAM (fuseable_ops, AARCH64_FUSE_MOV_MOVK) + NAMED_PARAM (fuseable_ops, AARCH64_FUSE_MOV_MOVK), + NAMED_PARAM (align, 8) }; static const struct tune_params thunderx_tunings = @@ -350,7 +353,8 @@ static const struct tune_params thunderx_tunings = &generic_vector_cost, NAMED_PARAM (memmov_cost, 6), NAMED_PARAM (issue_rate, 2), - NAMED_PARAM (fuseable_ops, AARCH64_FUSE_CMP_BRANCH) + NAMED_PARAM (fuseable_ops, AARCH64_FUSE_CMP_BRANCH), + NAMED_PARAM (align, 8) }; /* A processor implementing AArch64. */ @@ -6501,6 +6505,18 @@ aarch64_override_options (void) #endif } + /* If not optimizing for size, set the default + alignment to what the target wants.  */ + if (!optimize_size) + { + if (align_loops <= 0) + align_loops = aarch64_tune_params->align; + if (align_jumps <= 0) + align_jumps = aarch64_tune_params->align; + if (align_functions <= 0) + align_functions = aarch64_tune_params->align; + } + aarch64_override_options_after_change (); } -- 1.7.2.5