From: tiancyin <[email protected]> [Why] On some servers equipped with huge system memory at multi-terabyte scale, the PCI bus physical address alignment policy may assign GPUs very large bus addresses that exceed 44 bits. This causes DMA address overflow errors:
[ 83.216803] amdgpu 0000:43:00.0: DMA addr 0x0000210b39000000+8388608 overflow (mask fffffffffff, bus limit 0).

[How]
Enlarge the DMA mask from 44-bit to 48-bit to accommodate larger
physical addresses.

Signed-off-by: tiancyin <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 24 +++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 25 +++++++++++++++++++-----
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index a1f8141f28c9..60393e311537 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -21,6 +21,7 @@
  *
  */
 #include <linux/firmware.h>
+#include <linux/processor.h>
 #include <linux/pci.h>
 
 #include <drm/drm_cache.h>
@@ -726,7 +727,7 @@ static int gmc_v11_0_gart_init(struct amdgpu_device *adev)
 
 static int gmc_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
 {
-	int r, vram_width = 0, vram_type = 0, vram_vendor = 0;
+	int r, vram_width = 0, vram_type = 0, vram_vendor = 0, dma_mask;
 	struct amdgpu_device *adev = ip_block->adev;
 
 	adev->mmhub.funcs->init(adev);
@@ -805,13 +806,26 @@ static int gmc_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
 	 */
 	adev->gmc.mc_mask = 0xffffffffffffULL; /* 48 bit MC */
 
-	r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(44));
+#if defined CONFIG_X86 && defined CONFIG_PHYS_ADDR_T_64BIT
+	dma_mask = boot_cpu_data.x86_phys_bits >= 48 ? 48 : 44;
+#else
+	dma_mask = 44;
+#endif
+	r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(dma_mask));
 	if (r) {
-		dev_warn(adev->dev, "amdgpu: No suitable DMA available.\n");
-		return r;
+		dev_notice(adev->dev,
+			   "amdgpu: %d bit DMA is not available, fallback to 44 bit.\n",
+			   dma_mask);
+		dma_mask = 44;
+		r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(dma_mask));
+		if (r) {
+			dev_warn(adev->dev,
+				 "amdgpu: No suitable DMA available.\n");
+			return r;
+		}
 	}
-	adev->need_swiotlb = drm_need_swiotlb(44);
+	adev->need_swiotlb = drm_need_swiotlb(dma_mask);
 
 	r = gmc_v11_0_mc_init(adev);
 	if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index f4a19357ccbc..5ca3d1141cb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -21,6 +21,7 @@
  *
  */
 #include <linux/firmware.h>
+#include <linux/processor.h>
 #include <linux/pci.h>
 
 #include <drm/drm_cache.h>
@@ -742,7 +743,7 @@ static int gmc_v12_0_gart_init(struct amdgpu_device *adev)
 
 static int gmc_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
 {
-	int r, vram_width = 0, vram_type = 0, vram_vendor = 0;
+	int r, vram_width = 0, vram_type = 0, vram_vendor = 0, dma_mask;
 	struct amdgpu_device *adev = ip_block->adev;
 
 	adev->mmhub.funcs->init(adev);
@@ -802,13 +803,27 @@ static int gmc_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
 	 */
 	adev->gmc.mc_mask = 0xffffffffffffULL; /* 48 bit MC */
 
-	r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(44));
+#if defined CONFIG_X86 && defined CONFIG_PHYS_ADDR_T_64BIT
+	dma_mask = boot_cpu_data.x86_phys_bits >= 48 ? 48 : 44;
+#else
+	dma_mask = 44;
+#endif
+	r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(dma_mask));
 	if (r) {
-		printk(KERN_WARNING "amdgpu: No suitable DMA available.\n");
-		return r;
+		printk(KERN_NOTICE
+		       "amdgpu: %d bit DMA is not available, fallback to 44 bit.\n",
+		       dma_mask);
+		dma_mask = 44;
+		r = dma_set_mask_and_coherent(adev->dev,
+					      DMA_BIT_MASK(dma_mask));
+		if (r) {
+			printk(KERN_WARNING
+			       "amdgpu: No suitable DMA available.\n");
+			return r;
+		}
 	}
-	adev->need_swiotlb = drm_need_swiotlb(44);
+	adev->need_swiotlb = drm_need_swiotlb(dma_mask);
 
 	r = gmc_v12_0_mc_init(adev);
 	if (r)
-- 
2.34.1
