Hi,
This patch enables -fprefetch-loop-arrays for -mcpu=thunderxt88 and
-mcpu=thunderxt88p1. I filled out the tuning structures for both
thunderx and thunderx2t99. No other core currently enables software
prefetching, so I set the new fields to 0 for those cores, which does
not change the default parameters.
OK? Bootstrapped and tested on both ThunderX2 CN99xx and ThunderX
CN88xx with no regressions. I got a 2x improvement for 462.libquantum
on CN88xx, overall a 10% improvement on SPEC INT on CN88xx at -Ofast.
CN99xx's SPEC did not change.
Thanks,
Andrew Pinski
ChangeLog:
* config/aarch64/aarch64-protos.h (struct tune_params): Add
prefetch_latency, simultaneous_prefetches, l1_cache_size, and
l2_cache_size fields.
(enum aarch64_autoprefetch_model): Add AUTOPREFETCHER_SW.
* config/aarch64/aarch64.c (generic_tunings): Update to include
prefetch_latency, simultaneous_prefetches, l1_cache_size, and
l2_cache_size fields to 0.
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(cortexa73_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Fill out some of the new fields.
(thunderxt88_tunings): New variable.
(xgene1_tunings): Update to include prefetch_latency,
simultaneous_prefetches, l1_cache_size, and l2_cache_size fields to 0.
(qdf24xx_tunings): Likewise.
(thunderx2t99_tunings): Fill out some of the new fields.
(aarch64_override_options_internal): Consider AUTOPREFETCHER_SW like
AUTOPREFETCHER_OFF.
Set param values if the fields are non-zero. Turn on
prefetch-loop-arrays if the model is AUTOPREFETCHER_SW and the
optimization level is at least 3 or profile use is enabled.
* config/aarch64/aarch64-cores.def (thunderxt88p1): Use thunderxt88 tuning.
(thunderxt88): Likewise.
Index: config/aarch64/aarch64-cores.def
===================================================================
--- config/aarch64/aarch64-cores.def (revision 244917)
+++ config/aarch64/aarch64-cores.def (working copy)
@@ -63,8 +63,8 @@ AARCH64_CORE("qdf24xx", qdf24xx, c
AARCH64_CORE("thunderx", thunderx, thunderx, 8A,
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE,
thunderx, 0x43, 0x0a0, -1)
/* Do not swap around "thunderxt88p1" and "thunderxt88",
this order is required to handle variant correctly. */
-AARCH64_CORE("thunderxt88p1", thunderxt88p1, thunderx, 8A,
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO,
thunderx, 0x43, 0x0a1, 0)
-AARCH64_CORE("thunderxt88", thunderxt88, thunderx, 8A,
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE,
thunderx, 0x43, 0x0a1, -1)
+AARCH64_CORE("thunderxt88p1", thunderxt88p1, thunderx, 8A,
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO,
thunderxt88, 0x43, 0x0a1, 0)
+AARCH64_CORE("thunderxt88", thunderxt88, thunderx, 8A,
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE,
thunderxt88, 0x43, 0x0a1, -1)
AARCH64_CORE("thunderxt81", thunderxt81, thunderx, 8_1A,
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE,
thunderx, 0x43, 0x0a2, -1)
AARCH64_CORE("thunderxt83", thunderxt83, thunderx, 8_1A,
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE,
thunderx, 0x43, 0x0a3, -1)
Index: config/aarch64/aarch64-protos.h
===================================================================
--- config/aarch64/aarch64-protos.h (revision 244917)
+++ config/aarch64/aarch64-protos.h (working copy)
@@ -220,10 +220,19 @@ struct tune_params
unsigned int max_case_values;
/* Value for PARAM_L1_CACHE_LINE_SIZE; or 0 to use the default. */
unsigned int cache_line_size;
+ /* Value for PARAM_PREFETCH_LATENCY; or 0 to use the default. */
+ unsigned int prefetch_latency;
+ /* Value for PARAM_SIMULTANEOUS_PREFETCHES; or 0 to use the default. */
+ unsigned int simultaneous_prefetches;
+ /* Value for PARAM_L1_CACHE_SIZE; or 0 to use the default. */
+ unsigned int l1_cache_size;
+ /* Value for PARAM_L2_CACHE_SIZE; or 0 to use the default. */
+ unsigned int l2_cache_size;
/* An enum specifying how to take into account CPU autoprefetch capabilities
during instruction scheduling:
- AUTOPREFETCHER_OFF: Do not take autoprefetch capabilities into account.
+ - AUTOPREFETCHER_SW: Turn on software based prefetching.
- AUTOPREFETCHER_WEAK: Attempt to sort sequences of loads/store in order of
offsets but allow the pipeline hazard recognizer to alter that order to
maximize multi-issue opportunities.
@@ -233,6 +242,7 @@ struct tune_params
enum aarch64_autoprefetch_model
{
AUTOPREFETCHER_OFF,
+ AUTOPREFETCHER_SW,
AUTOPREFETCHER_WEAK,
AUTOPREFETCHER_STRONG
} autoprefetcher_model;
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c (revision 244917)
+++ config/aarch64/aarch64.c (working copy)
@@ -535,6 +535,10 @@ static const struct tune_params generic_
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
0, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -561,6 +565,10 @@ static const struct tune_params cortexa3
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
0, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -587,6 +595,10 @@ static const struct tune_params cortexa5
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
0, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -613,6 +625,10 @@ static const struct tune_params cortexa5
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
0, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
};
@@ -639,6 +655,10 @@ static const struct tune_params cortexa7
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
0, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -665,6 +685,10 @@ static const struct tune_params cortexa7
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
0, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -690,6 +714,10 @@ static const struct tune_params exynosm1
2, /* min_div_recip_mul_df. */
48, /* max_case_values. */
64, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -714,11 +742,45 @@ static const struct tune_params thunderx
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
+ 128, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 8, /* simultaneous_prefetches. */
+ 32, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
};
+/* Tunings for ThunderX CN88xx. */
+static const struct tune_params thunderxt88_tunings =
+{
+ &thunderx_extra_costs,
+ &generic_addrcost_table,
+ &thunderx_regmove_cost,
+ &thunderx_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ 6, /* memmov_cost */
+ 2, /* issue_rate */
+ AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
+ 8, /* function_align. */
+ 8, /* jump_align. */
+ 8, /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ 128, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 8, /* simultaneous_prefetches. */
+ 32, /* l1_cache_size. */
+ 16*1024, /* l2_cache_size. */
+ tune_params::AUTOPREFETCHER_SW, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
+};
+
static const struct tune_params xgene1_tunings =
{
&xgene1_extra_costs,
@@ -740,6 +802,10 @@ static const struct tune_params xgene1_t
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
0, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -766,6 +832,10 @@ static const struct tune_params qdf24xx_
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
64, /* cache_line_size. */
+ 0, /* prefetch_latency. */
+ 0, /* simultaneous_prefetches. */
+ 0, /* l1_cache_size. */
+ 0, /* l2_cache_size. */
tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -791,7 +861,11 @@ static const struct tune_params thunderx
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
64, /* cache_line_size. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ 0, /* prefetch_latency. */
+ 8, /* simultaneous_prefetches. */
+ 32, /* l1_cache_size. */
+ 256, /* l2_cache_size. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
@@ -8646,6 +8720,7 @@ aarch64_override_options_internal (struc
switch (aarch64_tune_params.autoprefetcher_model)
{
case tune_params::AUTOPREFETCHER_OFF:
+ case tune_params::AUTOPREFETCHER_SW:
queue_depth = -1;
break;
case tune_params::AUTOPREFETCHER_WEAK:
@@ -8672,6 +8747,42 @@ aarch64_override_options_internal (struc
opts->x_param_values,
global_options_set.x_param_values);
+ /* Set the prefetch latency. */
+ if (selected_cpu->tune->prefetch_latency != 0)
+ maybe_set_param_value (PARAM_PREFETCH_LATENCY,
+ selected_cpu->tune->prefetch_latency,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+
+ /* Set the simultaneous prefetches. */
+ if (selected_cpu->tune->simultaneous_prefetches != 0)
+ maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+ selected_cpu->tune->simultaneous_prefetches,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+
+ /* Set the l1 cache size. */
+ if (selected_cpu->tune->l1_cache_size != 0)
+ maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+ selected_cpu->tune->l1_cache_size,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+
+ /* Set the l2 cache size. */
+ if (selected_cpu->tune->l2_cache_size != 0)
+ maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+ selected_cpu->tune->l2_cache_size,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+
+
+ /* Enable software prefetching at -O3 for CPUs where prefetching helps. */
+ if (opts->x_flag_prefetch_loop_arrays < 0
+ && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
+ && !opts->x_optimize_size
+ && aarch64_tune_params.autoprefetcher_model ==
tune_params::AUTOPREFETCHER_SW)
+ opts->x_flag_prefetch_loop_arrays = 1;
+
aarch64_override_options_after_change_1 (opts);
}