On some x86 systems (old AMD Athlons, Intel Lunar Lake) we have the problem that changing the caching flags of system memory requires changing the global MTRR/PAT tables, since those CPUs can't handle aliasing caching attributes.
But on most modern x86 systems (e.g. AMD CPUs after 2004) we actually don't need that any more and can update the caching flags directly in the PTEs of the userspace and kernel mappings. We have already been doing this with encryption on 64-bit x86 for quite a while, and all other supported platforms (SPARC, PowerPC, ARM, MIPS, LoongArch) as well as the i915 driver have never done anything different either.

So stop changing the global caching flags on CPUs which don't need it and just insert a clflush to be on the safe side, so that we never return memory with dirty cache lines. Testing on a Ryzen 5 and 7 shows that the clflush has absolutely no performance impact, but I'm still waiting for CI systems to confirm functional correctness.

v2: drop the pool only on AMD CPUs for now

Signed-off-by: Christian König <christian.koe...@amd.com>
---
 drivers/gpu/drm/ttm/ttm_pool.c | 37 +++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 83b10706ba89..3f830fb2aea5 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -45,6 +45,7 @@
 #include <drm/ttm/ttm_pool.h>
 #include <drm/ttm/ttm_tt.h>
 #include <drm/ttm/ttm_bo.h>
+#include <drm/drm_cache.h>
 
 #include "ttm_module.h"
 
@@ -119,6 +120,8 @@ module_param(page_pool_size, ulong, 0644);
 
 static atomic_long_t allocated_pages;
 
+static bool skip_caching_adjustment;
+
 static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS];
 static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS];
 
@@ -195,7 +198,8 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
 	/* We don't care that set_pages_wb is inefficient here. This is only
 	 * used when we have to shrink and CPU overhead is irrelevant then.
 	 */
-	if (caching != ttm_cached && !PageHighMem(p))
+	if (!skip_caching_adjustment &&
+	    caching != ttm_cached && !PageHighMem(p))
 		set_pages_wb(p, 1 << order);
 #endif
 
@@ -223,13 +227,19 @@ static int ttm_pool_apply_caching(struct ttm_pool_alloc_state *alloc)
 	if (!num_pages)
 		return 0;
 
-	switch (alloc->tt_caching) {
-	case ttm_cached:
-		break;
-	case ttm_write_combined:
-		return set_pages_array_wc(alloc->caching_divide, num_pages);
-	case ttm_uncached:
-		return set_pages_array_uc(alloc->caching_divide, num_pages);
+	if (skip_caching_adjustment) {
+		drm_clflush_pages(alloc->caching_divide, num_pages);
+	} else {
+		switch (alloc->tt_caching) {
+		case ttm_cached:
+			break;
+		case ttm_write_combined:
+			return set_pages_array_wc(alloc->caching_divide,
+						  num_pages);
+		case ttm_uncached:
+			return set_pages_array_uc(alloc->caching_divide,
+						  num_pages);
+		}
 	}
 #endif
 	alloc->caching_divide = alloc->pages;
@@ -342,6 +352,9 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
 		return &pool->caching[caching].orders[order];
 
 #ifdef CONFIG_X86
+	if (skip_caching_adjustment)
+		return NULL;
+
 	switch (caching) {
 	case ttm_write_combined:
 		if (pool->nid != NUMA_NO_NODE)
@@ -981,7 +994,7 @@ long ttm_pool_backup(struct ttm_pool *pool, struct ttm_tt *tt,
 
 #ifdef CONFIG_X86
 	/* Anything returned to the system needs to be cached. */
-	if (tt->caching != ttm_cached)
+	if (!skip_caching_adjustment && tt->caching != ttm_cached)
 		set_pages_array_wb(tt->pages, tt->num_pages);
 #endif
 
@@ -1296,6 +1309,12 @@ int ttm_pool_mgr_init(unsigned long num_pages)
 	spin_lock_init(&shrinker_lock);
 	INIT_LIST_HEAD(&shrinker_list);
 
+#ifdef CONFIG_X86
+	skip_caching_adjustment =
+		(boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+		static_cpu_has(X86_FEATURE_CLFLUSH);
+#endif
+
 	for (i = 0; i < NR_PAGE_ORDERS; ++i) {
 		ttm_pool_type_init(&global_write_combined[i], NULL,
 				   ttm_write_combined, i);
-- 
2.43.0