From: tiancyin <[email protected]>

[Why]
On some servers with multi-terabyte system memory, the PCI bus physical
address alignment policy may assign GPUs bus addresses that exceed 44
bits. This causes DMA address overflow errors:
[ 83.216803] amdgpu 0000:43:00.0: DMA addr 0x0000210b39000000+8388608 overflow (mask fffffffffff, bus limit 0).

[How]
Enlarge the DMA mask from 44-bit to 48-bit to accommodate larger physical
addresses.

Signed-off-by: tiancyin <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 26 +++++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 25 ++++++++++++++++++++-----
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index a1f8141f28c9..7efc3880eed8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -21,6 +21,7 @@
  *
  */
 #include <linux/firmware.h>
+#include <linux/processor.h>
 #include <linux/pci.h>
 
 #include <drm/drm_cache.h>
@@ -726,7 +727,7 @@ static int gmc_v11_0_gart_init(struct amdgpu_device *adev)
 
 static int gmc_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
 {
-	int r, vram_width = 0, vram_type = 0, vram_vendor = 0;
+	int r, vram_width = 0, vram_type = 0, vram_vendor = 0, dma_mask;
 	struct amdgpu_device *adev = ip_block->adev;
 
 	adev->mmhub.funcs->init(adev);
@@ -805,13 +806,28 @@ static int gmc_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
 	 */
 	adev->gmc.mc_mask = 0xffffffffffffULL; /* 48 bit MC */
 
-	r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(44));
+#if defined CONFIG_X86 && defined CONFIG_PHYS_ADDR_T_64BIT
+	dma_mask = boot_cpu_data.x86_phys_bits >= 48 ? 48 : 44;
+#else
+	dma_mask = 44;
+#endif
+fallback_dma_mask:
+	r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(dma_mask));
 	if (r) {
-		dev_warn(adev->dev, "amdgpu: No suitable DMA available.\n");
-		return r;
+		if (dma_mask > 44) {
+			dev_notice(
+				adev->dev,
+				"amdgpu: %d bit DMA is not available, fallback to 44 bit.\n",
+				dma_mask);
+			dma_mask = 44;
+			goto fallback_dma_mask;
+		} else {
+			dev_warn(adev->dev, "amdgpu: No suitable DMA available.\n");
+			return r;
+		}
 	}
 
-	adev->need_swiotlb = drm_need_swiotlb(44);
+	adev->need_swiotlb = drm_need_swiotlb(dma_mask);
 
 	r = gmc_v11_0_mc_init(adev);
 	if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index f4a19357ccbc..e1dd99e1151f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -21,6 +21,7 @@
  *
  */
 #include <linux/firmware.h>
+#include <linux/processor.h>
 #include <linux/pci.h>
 
 #include <drm/drm_cache.h>
@@ -742,7 +743,7 @@ static int gmc_v12_0_gart_init(struct amdgpu_device *adev)
 
 static int gmc_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
 {
-	int r, vram_width = 0, vram_type = 0, vram_vendor = 0;
+	int r, vram_width = 0, vram_type = 0, vram_vendor = 0, dma_mask;
 	struct amdgpu_device *adev = ip_block->adev;
 
 	adev->mmhub.funcs->init(adev);
@@ -802,13 +803,27 @@ static int gmc_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
 	 */
 	adev->gmc.mc_mask = 0xffffffffffffULL; /* 48 bit MC */
 
-	r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(44));
+#if defined CONFIG_X86 && defined CONFIG_PHYS_ADDR_T_64BIT
+	dma_mask = boot_cpu_data.x86_phys_bits >= 48 ? 48 : 44;
+#else
+	dma_mask = 44;
+#endif
+fallback_dma_mask:
+	r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(dma_mask));
 	if (r) {
-		printk(KERN_WARNING "amdgpu: No suitable DMA available.\n");
-		return r;
+		if (dma_mask > 44) {
+			printk(KERN_NOTICE
+			       "amdgpu: %d bit DMA is not available, fallback to 44 bit.\n",
+			       dma_mask);
+			dma_mask = 44;
+			goto fallback_dma_mask;
+		} else {
+			printk(KERN_WARNING "amdgpu: No suitable DMA available.\n");
+			return r;
+		}
 	}
 
-	adev->need_swiotlb = drm_need_swiotlb(44);
+	adev->need_swiotlb = drm_need_swiotlb(dma_mask);
 
 	r = gmc_v12_0_mc_init(adev);
 	if (r)
-- 
2.34.1
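
For context, a minimal standalone sketch (not part of the patch) of why the
bus address in the quoted dmesg line trips a 44-bit DMA mask but fits within
a 48-bit one. The mask macro below mirrors the kernel's DMA_BIT_MASK() from
include/linux/dma-mapping.h; the address and length are taken from the log
above, and the program is plain userspace C rather than driver code:

    #include <stdint.h>
    #include <stdio.h>

    /* Same shape as the kernel macro in include/linux/dma-mapping.h. */
    #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

    int main(void)
    {
            uint64_t bus_addr = 0x0000210b39000000ULL; /* address from the dmesg line */
            uint64_t len = 8388608;                    /* 8 MiB mapping from the dmesg line */
            uint64_t last = bus_addr + len - 1;        /* last byte of the mapping */

            /* Highest set bit of bus_addr is bit 45, so 44 bits are not enough. */
            printf("44-bit mask %#llx, overflow: %d\n",
                   DMA_BIT_MASK(44), last > DMA_BIT_MASK(44)); /* prints 1 */
            printf("48-bit mask %#llx, overflow: %d\n",
                   DMA_BIT_MASK(48), last > DMA_BIT_MASK(48)); /* prints 0 */
            return 0;
    }

Note that the patch only widens the mask on CPUs reporting at least 48
physical address bits, and drops back to the original 44-bit mask if the
wider one is rejected, so the previous behavior is preserved elsewhere.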
