On ThunderX, I found that aligning functions/loops/jumps to an 8 byte boundary gives slightly better performance because the hardware issue and dispatch match what GCC's scheduler has created.
I set generic, cortex-a53 and cortex-a57 to be 8 byte aligned also. Someone might want to change the cortex-a57 number to be more correct for that processor. Understanding that cortex-a53 is dual issue, it made sense to set 8 byte alignment, but I don't know if it really makes sense. Built and tested for aarch64-elf with no regressions. ChangeLog: * config/aarch64/aarch64-protos.h (tune_params): Add align field. * config/aarch64/aarch64.c (generic_tunings): Specify align. (cortexa53_tunings): Likewise. (cortexa57_tunings): Likewise. (thunderx_tunings): Likewise. (aarch64_override_options): Set align_loops, align_jumps, align_functions based on the align field of the tuning struct. --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 9e0ff8c..3e70495 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -171,6 +171,7 @@ struct tune_params const int memmov_cost; const int issue_rate; const unsigned int fuseable_ops; + const unsigned int align; }; HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned); diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 5216ac0..9214332 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -317,7 +317,8 @@ static const struct tune_params generic_tunings = &generic_vector_cost, NAMED_PARAM (memmov_cost, 4), NAMED_PARAM (issue_rate, 2), - NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING) + NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING), + NAMED_PARAM (align, 8) }; static const struct tune_params cortexa53_tunings = @@ -328,7 +329,8 @@ static const struct tune_params cortexa53_tunings = &generic_vector_cost, NAMED_PARAM (memmov_cost, 4), NAMED_PARAM (issue_rate, 2), - NAMED_PARAM (fuseable_ops, AARCH64_FUSE_MOV_MOVK) + NAMED_PARAM (fuseable_ops, 
AARCH64_FUSE_MOV_MOVK), + NAMED_PARAM (align, 8) }; static const struct tune_params cortexa57_tunings = @@ -339,7 +341,8 @@ static const struct tune_params cortexa57_tunings = &cortexa57_vector_cost, NAMED_PARAM (memmov_cost, 4), NAMED_PARAM (issue_rate, 3), - NAMED_PARAM (fuseable_ops, AARCH64_FUSE_MOV_MOVK) + NAMED_PARAM (fuseable_ops, AARCH64_FUSE_MOV_MOVK), + NAMED_PARAM (align, 8) }; static const struct tune_params thunderx_tunings = @@ -350,7 +353,8 @@ static const struct tune_params thunderx_tunings = &generic_vector_cost, NAMED_PARAM (memmov_cost, 6), NAMED_PARAM (issue_rate, 2), - NAMED_PARAM (fuseable_ops, AARCH64_FUSE_CMP_BRANCH) + NAMED_PARAM (fuseable_ops, AARCH64_FUSE_CMP_BRANCH), + NAMED_PARAM (align, 8) }; /* A processor implementing AArch64. */ @@ -6501,6 +6505,18 @@ aarch64_override_options (void) #endif } + /* If not optimizing for size, set the default + alignment to what the target wants.  */ + if (!optimize_size) + { + if (align_loops <= 0) + align_loops = aarch64_tune_params->align; + if (align_jumps <= 0) + align_jumps = aarch64_tune_params->align; + if (align_functions <= 0) + align_functions = aarch64_tune_params->align; + } + aarch64_override_options_after_change (); } -- 1.7.2.5