Change the RACPREFDATA(x) setting to prefetch the next 256-byte line after 4 consecutive lines have been used, instead of after 2 consecutive lines. This does improve the synthetic memcpy benchmark by an additional +0.5% on top of the previous change for Cortex-A72 CPUs.
Signed-off-by: Florian Fainelli <f.faine...@gmail.com> --- drivers/soc/bcm/brcmstb/biuctrl.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/soc/bcm/brcmstb/biuctrl.c b/drivers/soc/bcm/brcmstb/biuctrl.c index 28f69cc0df51..63864b6dea2e 100644 --- a/drivers/soc/bcm/brcmstb/biuctrl.c +++ b/drivers/soc/bcm/brcmstb/biuctrl.c @@ -23,7 +23,9 @@ #define DPREF_LINE_2_SHIFT 24 #define DPREF_LINE_2_MASK 0xff -/* Bitmask to enable instruction and data prefetching with a 256-bytes stride */ +/* Bitmask to enable instruction and data prefetching with a 256-bytes stride, + * prefetch next 256-byte line after 4 consecutive lines used + */ #define RAC_DATA_INST_EN_MASK (1 << RACPREFINST_SHIFT | \ RACENPREF_MASK << RACENINST_SHIFT | \ 1 << RACPREFDATA_SHIFT | \ @@ -174,7 +176,7 @@ static const u32 a72_b53_mach_compat[] = { static void __init a72_b53_rac_enable_all(struct device_node *np) { unsigned int cpu; - u32 enable = 0, pref_dist; + u32 enable = 0, pref_dist, shift; if (IS_ENABLED(CONFIG_CACHE_B15_RAC)) return; @@ -184,9 +186,13 @@ static void __init a72_b53_rac_enable_all(struct device_node *np) pref_dist = cbc_readl(RAC_CONFIG1_REG); for_each_possible_cpu(cpu) { + shift = cpu * RAC_CPU_SHIFT + RACPREFDATA_SHIFT; enable |= RAC_DATA_INST_EN_MASK << (cpu * RAC_CPU_SHIFT); - if (cpubiuctrl_regs == a72_cpubiuctrl_regs) + if (cpubiuctrl_regs == a72_cpubiuctrl_regs) { + enable &= ~(RACENPREF_MASK << shift); + enable |= 3 << shift; pref_dist |= 1 << (cpu + DPREF_LINE_2_SHIFT); + } } cbc_writel(enable, RAC_CONFIG0_REG); -- 2.25.1