RE: [PATCH] drm/amdgpu: update GC golden setting for navy_flounder
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Tao Zhou > -Original Message- > From: Jiansong Chen > Sent: Wednesday, July 29, 2020 12:02 PM > To: amd-gfx@lists.freedesktop.org > Cc: Zhou1, Tao ; Chen, Jiansong (Simon) > > Subject: [PATCH] drm/amdgpu: update GC golden setting for navy_flounder > > Update GC golden setting for navy_flounder. > > Signed-off-by: Jiansong Chen > Change-Id: Ia7e82616b0be48f397c73b015823ac10ef907f08 > --- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index db9f1e89a0f8..ca16f01956d3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -3127,7 +3127,7 @@ static const struct soc15_reg_golden > golden_settings_gc_10_3_2[] = > SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_SPI_RA0_CLK_CTRL, > 0xff7f0fff, 0x3100), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_SPI_RA1_CLK_CTRL, > 0xff7f0fff, 0x7e000100), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmCPF_GCR_CNTL, 0x0007, > 0xc000), > -SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, > 0x0200), > +SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, > 0x0280), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, > 0x0080), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_EXCEPTION_CONTROL, > 0x7fff0f1f, 0x00b8), > SOC15_REG_GOLDEN_VALUE(GC, 0, > mmGCR_GENERAL_CNTL_Sienna_Cichlid, 0x1ff1, 0x0500), @@ -3158,7 > +3158,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_2[] > = > SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER7_SELECT, > 0xf0f001ff, 0x), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER8_SELECT, > 0xf0f001ff, 0x), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER9_SELECT, > 0xf0f001ff, 0x), > -SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0x, > 0x010b), > +SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0xfff7, > 0x0103), > SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0xffbf, > 0x00a0), > 
SOC15_REG_GOLDEN_VALUE(GC, 0, mmVGT_GS_MAX_WAVE_ID, > 0x0fff, 0x03ff) }; > -- > 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: update GC golden setting for navy_flounder
Update GC golden setting for navy_flounder. Signed-off-by: Jiansong Chen Change-Id: Ia7e82616b0be48f397c73b015823ac10ef907f08 --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index db9f1e89a0f8..ca16f01956d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3127,7 +3127,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_2[] = SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_SPI_RA0_CLK_CTRL, 0xff7f0fff, 0x3100), SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_SPI_RA1_CLK_CTRL, 0xff7f0fff, 0x7e000100), SOC15_REG_GOLDEN_VALUE(GC, 0, mmCPF_GCR_CNTL, 0x0007, 0xc000), - SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0200), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280), SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080), SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_EXCEPTION_CONTROL, 0x7fff0f1f, 0x00b8), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Sienna_Cichlid, 0x1ff1, 0x0500), @@ -3158,7 +3158,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_2[] = SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER7_SELECT, 0xf0f001ff, 0x), SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER8_SELECT, 0xf0f001ff, 0x), SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER9_SELECT, 0xf0f001ff, 0x), - SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0x, 0x010b), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0xfff7, 0x0103), SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0xffbf, 0x00a0), SOC15_REG_GOLDEN_VALUE(GC, 0, mmVGT_GS_MAX_WAVE_ID, 0x0fff, 0x03ff) }; -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 11/12] drm/amdgpu: disable page reservation when amdgpu_bad_page_threshold = 0
When amdgpu_bad_page_threshold = 0, bad page reservation is skipped both in the UMC ECC irq path and in the page retirement call from the sync flood isr. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0b7317349bde..f47909d6a95b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1678,7 +1678,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) int ret = 0; /* no bad page record, skip eeprom access */ - if (!control->num_recs) + if (!control->num_recs || (amdgpu_bad_page_threshold == 0)) return ret; bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); @@ -1782,7 +1782,8 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) struct amdgpu_bo *bo = NULL; int i, ret = 0; - if (!con || !con->eh_data) + /* Not reserve bad page when amdgpu_bad_page_threshold == 0. */ + if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0)) return 0; mutex_lock(>recovery_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index af1b1ccf613c..262baf0f61ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -125,8 +125,9 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, "detected in UMC block\n", err_data->ue_count); - if (err_data->err_addr_cnt && - amdgpu_ras_add_bad_pages(adev, err_data->err_addr, + if ((amdgpu_bad_page_threshold != 0) && + err_data->err_addr_cnt && + amdgpu_ras_add_bad_pages(adev, err_data->err_addr, err_data->err_addr_cnt)) dev_warn(adev->dev, "Failed to add ras bad page!\n"); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 09/12] drm/amdgpu: add one definition for RAS's sysfs/debugfs name
Add one definition for the RAS module's FS name. It's used in both debugfs and sysfs cases. v2: Use static variable instead of macro definition. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c519948ebcff..0328f7882199 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -34,6 +34,8 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +static const char *RAS_FS_NAME = "ras"; + const char *ras_error_string[] = { "none", "parity", @@ -1037,7 +1039,7 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) NULL }; struct attribute_group group = { - .name = "ras", + .name = RAS_FS_NAME, .attrs = attrs, #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) .bin_attrs = bin_attrs, @@ -1080,7 +1082,7 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) NULL }; struct attribute_group group = { - .name = "ras", + .name = RAS_FS_NAME, .attrs = attrs, #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) .bin_attrs = bin_attrs, @@ -1117,7 +1119,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, if (sysfs_add_file_to_group(>dev->kobj, >sysfs_attr.attr, - "ras")) { + RAS_FS_NAME)) { put_obj(obj); return -EINVAL; } @@ -1137,7 +1139,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, sysfs_remove_file_from_group(>dev->kobj, >sysfs_attr.attr, - "ras"); + RAS_FS_NAME); obj->attr_inuse = 0; put_obj(obj); @@ -1183,7 +1185,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct drm_minor *minor = adev->ddev->primary; - con->dir = debugfs_create_dir("ras", minor->debugfs_root); + con->dir = debugfs_create_dir(RAS_FS_NAME, + minor->debugfs_root); debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, adev, 
_ras_debugfs_ctrl_ops); debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 12/12] drm/amdgpu: update eeprom once specifying one bigger threshold
During driver probe, when a bad gpu tag is found in the eeprom i2c init call (the tag was set when the number of reported bad pages reached the bad page threshold in the driver's last working loop), there are several strategies to deal with the cases: 1. When the module parameter amdgpu_bad_page_threshold = 0, the page retirement feature is disabled, so just resetting the eeprom is fine. 2. When amdgpu_bad_page_threshold is not 0 and the user sets a bigger valid value so that the current boot succeeds, correct the eeprom header tag and do not break booting. 3. For other cases, the driver's probe will be aborted. v2: Just update eeprom header tag instead of resetting the whole table header when user sets one bigger threshold data. Signed-off-by: Guchun Chen --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 30 +-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index be895dc2d739..c6c47c665f6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -216,6 +216,24 @@ static bool __validate_tbl_checksum(struct amdgpu_ras_eeprom_control *control, return true; } +static int amdgpu_ras_eeprom_correct_header_tag( + struct amdgpu_ras_eeprom_control *control, + uint32_t header) +{ + unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE]; + struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr; + int ret = 0; + + memset(buff, 0, EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE); + + mutex_lock(>tbl_mutex); + hdr->header = header; + ret = __update_table_header(control, buff); + mutex_unlock(>tbl_mutex); + + return ret; +} + int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) { unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 }; @@ -248,6 +266,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, struct amdgpu_device *adev = 
to_amdgpu_device(control); unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 }; struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); struct i2c_msg msg = { .addr = 0, .flags = I2C_M_RD, @@ -287,9 +306,16 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) && (amdgpu_bad_page_threshold != 0)) { - *exceed_err_limit = true; - DRM_ERROR("Exceeding the bad_page_threshold parameter, " + if (ras->bad_page_cnt_threshold > control->num_recs) { + DRM_INFO("Using one valid bigger bad page threshold " + "and correcting eeprom header tag.\n"); + ret = amdgpu_ras_eeprom_correct_header_tag(control, + EEPROM_TABLE_HDR_VAL); + } else { + *exceed_err_limit = true; + DRM_ERROR("Exceeding the bad_page_threshold parameter, " "disabling the GPU.\n"); + } } else { DRM_INFO("Creating new EEPROM table"); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 08/12] drm/amdgpu: restore ras flags when user resets eeprom
RAS flags need to be cleaned as well when the user requests a clean eeprom. v2: RAS flags shall be restored after eeprom reset succeeds. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index fab6f8d6bee6..c519948ebcff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -368,12 +368,19 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; + struct amdgpu_device *adev = + (struct amdgpu_device *)file_inode(f)->i_private; int ret; - ret = amdgpu_ras_eeprom_reset_table(>psp.ras.ras->eeprom_control); + ret = amdgpu_ras_eeprom_reset_table( + &(amdgpu_ras_get_context(adev)->eeprom_control)); - return ret == 1 ? size : -EIO; + if (ret == 1) { + amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; + return size; + } else { + return -EIO; + } } static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 10/12] drm/amdgpu: decouple sysfs creating of bad page node
Bad page information should not be exposed by sysfs when bad page retirement is disabled, so decouple it from ras sysfs group creating, and add one guard before creating. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 71 - 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0328f7882199..0b7317349bde 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1027,6 +1027,35 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); } +static void amdgpu_ras_sysfs_add_badpage_node(struct amdgpu_device *adev) +{ +#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct attribute_group group; + struct bin_attribute *bin_attrs[] = { + >badpages_attr, + NULL, + }; + + con->badpages_attr = (struct bin_attribute) { + .attr = { + .name = "gpu_vram_bad_pages", + .mode = S_IRUGO, + }, + .size = 0, + .private = NULL, + .read = amdgpu_ras_sysfs_badpages_read, + }; + + group.name = RAS_FS_NAME; + group.bin_attrs = bin_attrs; + + sysfs_bin_attr_init(bin_attrs[0]); + + sysfs_update_group(>dev->kobj, ); +#endif +} + static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1034,16 +1063,9 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) >features_attr.attr, NULL }; - struct bin_attribute *bin_attrs[] = { - >badpages_attr, - NULL - }; struct attribute_group group = { .name = RAS_FS_NAME, .attrs = attrs, -#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) - .bin_attrs = bin_attrs, -#endif }; con->features_attr = (struct device_attribute) { @@ -1054,22 +1076,22 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) .show = amdgpu_ras_sysfs_features_read, }; - 
con->badpages_attr = (struct bin_attribute) { - .attr = { - .name = "gpu_vram_bad_pages", - .mode = S_IRUGO, - }, - .size = 0, - .private = NULL, - .read = amdgpu_ras_sysfs_badpages_read, - }; - sysfs_attr_init(attrs[0]); - sysfs_bin_attr_init(bin_attrs[0]); return sysfs_create_group(>dev->kobj, ); } +static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) +{ +#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + sysfs_remove_file_from_group(>dev->kobj, + >badpages_attr.attr, + RAS_FS_NAME); +#endif +} + static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1077,16 +1099,9 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) >features_attr.attr, NULL }; - struct bin_attribute *bin_attrs[] = { - >badpages_attr, - NULL - }; struct attribute_group group = { .name = RAS_FS_NAME, .attrs = attrs, -#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) - .bin_attrs = bin_attrs, -#endif }; sysfs_remove_group(>dev->kobj, ); @@ -1155,6 +1170,9 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) amdgpu_ras_sysfs_remove(adev, >head); } + if (amdgpu_bad_page_threshold != 0) + amdgpu_ras_sysfs_remove_bad_page_node(adev); + amdgpu_ras_sysfs_remove_feature_node(adev); return 0; @@ -1283,6 +1301,9 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) { amdgpu_ras_sysfs_create_feature_node(adev); + if (amdgpu_bad_page_threshold != 0) + amdgpu_ras_sysfs_add_badpage_node(adev); + return 0; } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 02/12] drm/amdgpu: validate bad page threshold in ras
Bad page threshold value should be valid in the range between -1 and max records length of eeprom. It could determine when saved bad pages exceed threshold value, and proceed corresponding actions. v2: When using the default typical value, it should be min value between typical value and eeprom max records length. v3: drop the case of setting bad_page_cnt_threshold to be 0x, as it confuses user. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 48 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 5 ++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 2 + 4 files changed, 58 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6f06e1214622..3c4c142e9d8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -69,6 +69,9 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #defineRAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ +#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -1700,6 +1703,47 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, return ret; } +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, + uint32_t max_length) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int tmp_threshold = amdgpu_bad_page_threshold; + u64 val; + + /* +* Justification of value bad_page_cnt_threshold in ras structure +* +* Generally, -1 <= amdgpu_bad_page_threshold <= max record length +* in eeprom, and introduce two scenarios accordingly. +* +* Bad page retirement enablement: +*- If amdgpu_bad_page_threshold = -1, +* bad_page_cnt_threshold = typical value by formula. +* +*- When the value from user is 0 < amdgpu_bad_page_threshold < +* max record length in eeprom, use it directly. 
+* +* Bad page retirement disablement: +*- If amdgpu_bad_page_threshold = 0, bad page retirement +* functionality is disabled, and bad_page_cnt_threshold will +* take no effect. +*/ + + if (tmp_threshold < -1) + tmp_threshold = -1; + else if (tmp_threshold > max_length) + tmp_threshold = max_length; + + if (tmp_threshold == -1) { + val = adev->gmc.mc_vram_size; + do_div(val, RAS_BAD_PAGE_RATE); + con->bad_page_cnt_threshold = min(lower_32_bits(val), + max_length); + } else { + con->bad_page_cnt_threshold = tmp_threshold; + } +} + /* called in gpu recovery/init */ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ -1777,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; + uint32_t max_eeprom_records_len = 0; int ret; if (con) @@ -1795,6 +1840,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(>in_recovery, 0); con->adev = adev; + max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + ret = amdgpu_ras_eeprom_init(>eeprom_control); if (ret) goto free; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index b2667342cf67..4672649a9293 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -336,6 +336,9 @@ struct amdgpu_ras { struct amdgpu_ras_eeprom_control eeprom_control; bool error_query_ready; + + /* bad page count threshold */ + uint32_t bad_page_cnt_threshold; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index c0096097bbcf..a2c982b1eac6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -499,6 +499,11 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, return ret == num ? 
0 : -EIO; } +inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) +{ + return EEPROM_MAX_RECORD_NUM; +} + /* Used for testing if bugs encountered */ #if 0 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
[PATCH 06/12] drm/amdgpu: schedule ras recovery when reaching bad page threshold
Once the bad page saved to eeprom reaches the configured threshold, ras recovery will be issued to notify user. v2: Fix spelling typo. Signed-off-by: Guchun Chen --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 37 ++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 67995b66d7d4..d24bf65f6dd7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, int i, ret = 0; struct i2c_msg *msgs, *msg; unsigned char *buffs, *buff; + bool sched_ras_recovery = false; struct eeprom_table_record *record; struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS) return 0; @@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, goto free_buff; } + /* +* If saved bad pages number exceeds the bad page threshold for +* the whole VRAM, update table header to mark the BAD GPU tag +* and schedule one ras recovery after eeprom write is done, +* this can avoid the missing for latest records. +* +* This new header will be picked up and checked in the bootup +* by ras recovery, which may break bootup process to notify +* user this GPU is in bad state and to retire such GPU for +* further check. 
+*/ + if (write && (amdgpu_bad_page_threshold != 0) && + ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) { + dev_warn(adev->dev, + "Saved bad pages(%d) reaches threshold value(%d).\n", + control->num_recs + num, ras->bad_page_cnt_threshold); + control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD; + sched_ras_recovery = true; + } + /* In case of overflow just start from beginning to not lose newest records */ if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES)) control->next_addr = EEPROM_RECORD_START; - /* * TODO Currently makes EEPROM writes for each record, this creates * internal fragmentation. Optimized the code to do full page write of @@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, __update_tbl_checksum(control, records, num, old_hdr_byte_sum); __update_table_header(control, buffs); + + if (sched_ras_recovery) { + /* +* Before scheduling ras recovery, assert the related +* flag first, which shall bypass common bad page +* reservation execution in amdgpu_ras_reset_gpu. +*/ + amdgpu_ras_get_context(adev)->flags |= + AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV; + + dev_warn(adev->dev, "Conduct ras recovery due to bad " + "page threshold reached.\n"); + amdgpu_ras_reset_gpu(adev); + } } else if (!__validate_tbl_checksum(control, records, num)) { DRM_WARN("EEPROM Table checksum mismatch!"); /* TODO Uncomment when EEPROM read/write is relliable */ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU
When retrieving bad gpu tag from eeprom, GPU init should fail as the GPU needs to be retired for further check. v2: Fix spelling typo, correct the condition to detect bad gpu tag and refine error message. v3: Refine function argument name. v4: Fix missing check of returning value of i2c initialization error case. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 ++- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2662cd7c8685..30af0dfee1a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) * it should be called after amdgpu_device_ip_hw_init_phase2 since * for some ASICs the RAS EEPROM code relies on SMU fully functioning * for I2C communication which only true at this point. -* recovery_init may fail, but it can free all resources allocated by -* itself and its failure should not stop amdgpu init process. +* +* amdgpu_ras_recovery_init may fail, but the upper only cares the +* failure from bad gpu situation and stop amdgpu init process +* accordingly. For other failed cases, it will still release all +* the resource and print error message, rather than returning one +* negative value to upper level. 
* * Note: theoretically, this should be called before all vram allocations * to protect retired page from abusing */ - amdgpu_ras_recovery_init(adev); + r = amdgpu_ras_recovery_init(adev); + if (r) + goto init_failed; if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3c4c142e9d8a..67d9d65b069e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; uint32_t max_eeprom_records_len = 0; + bool exc_err_limit = false; int ret; if (con) @@ -1843,8 +1844,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); - ret = amdgpu_ras_eeprom_init(>eeprom_control); - if (ret) + ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit); + /* +* This calling fails when exc_err_limit is true or +* ret != 0. +*/ + if (exc_err_limit || ret) goto free; if (con->eeprom_control.num_recs) { @@ -1868,6 +1873,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) out: dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); + /* +* Except error threshold exceeding case, other failure cases in this +* function would not fail amdgpu driver init. 
+*/ + if (!exc_err_limit) + ret = 0; + else + ret = -EINVAL; + return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 35c0c849d49b..67995b66d7d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit) { int ret = 0; struct amdgpu_device *adev = to_amdgpu_device(control); @@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) .buf= buff, }; + *exceed_err_limit = false; + /* Verify i2c adapter is initialized */ if (!adev->pm.smu_i2c.algo) return -ENOENT; @@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", control->num_recs); + } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) && + (amdgpu_bad_page_threshold != 0)) { + *exceed_err_limit = true; +
[PATCH 07/12] drm/amdgpu: break GPU recovery once it's in bad state
When the GPU executes recovery and retrieves the bad GPU tag from the external eeprom device, the recovery will be aborted and an error message is printed as well for the user's awareness. v2: Refine warning message in threshold reaching case, and fix spelling typo. v3: Fix explicit calling of bad gpu. v4: Rename function names. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 20 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 + .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 40 +++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 4 ++ 5 files changed, 79 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 30af0dfee1a1..c893d9adbab7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4139,8 +4139,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, amdgpu_fbdev_set_suspend(tmp_adev, 0); - /* must succeed. */ - amdgpu_ras_resume(tmp_adev); + /* +* The GPU enters bad state once faulty pages +* by ECC has reached the threshold, and ras +* recovery is scheduled next. So add one check +* here to break recovery if it indeed exceeds +* bad page threshold, and remind user to +* retire this GPU or setting one bigger +* bad_page_threshold value to fix this once +* probing driver again. +*/ + if (!amdgpu_ras_check_err_threshold(tmp_adev)) { + /* must succeed. 
*/ + amdgpu_ras_resume(tmp_adev); + } else { + r = -EINVAL; + goto out; + } /* Update PSP FW topology after reset */ if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) @@ -4148,7 +4163,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } } - out: if (!r) { amdgpu_irq_gpu_reset_resume_helper(tmp_adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 64ae0742f385..fab6f8d6bee6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2206,3 +2206,19 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) return false; } + +bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + bool exc_err_limit = false; + + if (con && (amdgpu_bad_page_threshold != 0)) + amdgpu_ras_eeprom_check_err_threshold(>eeprom_control, + _err_limit); + + /* +* We are only interested in variable exc_err_limit, +* as it says if GPU is in bad state or not. 
+*/ + return exc_err_limit; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index cf9f60202334..70a6fca73617 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev); unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, bool is_ce); +bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev); + /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index d24bf65f6dd7..be895dc2d739 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address) return curr_address; } +int amdgpu_ras_eeprom_check_err_threshold( + struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + unsigned char buff[EEPROM_ADDRESS_SIZE + + EEPROM_TABLE_HEADER_SIZE] = { 0 }; + struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr; + struct i2c_msg msg = {
[PATCH 05/12] drm/amdgpu: skip bad page reservation once issuing from eeprom write
Once the ras recovery is issued from the eeprom write itself, bad page reservation should be skipped; otherwise, writing to the eeprom would be triggered recursively. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 14 +++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 67d9d65b069e..64ae0742f385 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -62,8 +62,6 @@ const char *ras_block_string[] = { #define ras_err_str(i) (ras_error_string[ffs(i)]) #define ras_block_str(i) (ras_block_string[i]) -#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 -#define AMDGPU_RAS_FLAG_INIT_NEED_RESET2 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) /* inject address is 52 bits */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 4672649a9293..cf9f60202334 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -31,6 +31,10 @@ #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" +#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) +#define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1) +#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV (0x1 << 2) + enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, AMDGPU_RAS_BLOCK__SDMA, @@ -503,10 +507,14 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - /* save bad page to eeprom before gpu reset, -* i2c may be unstable in gpu reset + /* +* Save bad page to eeprom before gpu reset, i2c may be unstable +* in gpu reset. +* +* Also, exclude the case when ras recovery issuer is +* eeprom page write itself. 
*/ - if (in_task()) + if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) && in_task()) amdgpu_ras_reserve_bad_pages(adev); if (atomic_cmpxchg(>in_recovery, 0, 1) == 0) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 03/12] drm/amdgpu: add bad gpu tag definition
This tag will be used for bad GPU detection during eeprom access. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index a2c982b1eac6..35c0c849d49b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -46,6 +46,9 @@ #define EEPROM_TABLE_HDR_VAL 0x414d4452 #define EEPROM_TABLE_VER 0x0001 +/* Bad GPU tag ‘BADG’ */ +#define EEPROM_TABLE_HDR_BAD 0x42414447 + /* Assume 2 Mbit size */ #define EEPROM_SIZE_BYTES 256000 #define EEPROM_PAGE__SIZE_BYTES 256 -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 00/12] BAD GPU retirement policy by total bad pages
This series makes it possible to enable/disable the bad page feature and applies a different bad page reservation strategy depending on the configured bad page threshold. When the number of saved bad pages written to eeprom reaches the threshold, a ras recovery is issued immediately, and the recovery fails on purpose to tell the user that the GPU is BAD and needs to be retired for further checking, or that a valid, bigger threshold value must be set at the next driver probe to skip the corresponding check. During bootup, a similar bad page threshold check is conducted when the eeprom gets initialized, and it may break boot up to make the user aware of the problem. When the user sets bad_page_threshold=0 when probing the driver, the bad page retirement feature is completely disabled, and the driver has no chance to process bad page information records and write them to eeprom. Guchun Chen (12): drm/amdgpu: add bad page count threshold in module parameter drm/amdgpu: validate bad page threshold in ras drm/amdgpu: add bad gpu tag definition drm/amdgpu: break driver init process when it's bad GPU drm/amdgpu: skip bad page reservation once issuing from eeprom write drm/amdgpu: schedule ras recovery when reaching bad page threshold drm/amdgpu: break GPU recovery once it's in bad state drm/amdgpu: restore ras flags when user resets eeprom drm/amdgpu: add one definition for RAS's sysfs/debugfs name drm/amdgpu: decouple sysfs creating of bad page node drm/amdgpu: disable page reservation when amdgpu_bad_page_threshold = 0 drm/amdgpu: update eeprom once specifying one bigger threshold drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 32 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 186 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 19 +- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 121 +++- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 9 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +- 8 files changed, 331 insertions(+), 53 deletions(-) -- 2.17.1 ___ amd-gfx mailing list 
amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 01/12] drm/amdgpu: add bad page count threshold in module parameter
bad_page_threshold can be configured to enable/disable the associated bad page retirement feature in RAS. When it's -1, ras will use the typical bad page failure value to handle bad page retirement. When it's 0, bad page retirement is disabled, and no bad page will be recorded and saved. For any other valid value, the driver will use this manual value as the threshold value of total bad pages. v2: correct documentation of this parameter. v3: remove confusing statement in documentation. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++ 2 files changed, 12 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 06bfb8658dec..bb83ffb5e26a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -181,6 +181,7 @@ extern uint amdgpu_dm_abm_level; extern struct amdgpu_mgpu_info mgpu_info; extern int amdgpu_ras_enable; extern uint amdgpu_ras_mask; +extern int amdgpu_bad_page_threshold; extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; extern int amdgpu_discovery; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d28b95f721c4..820a28c9e957 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -161,6 +161,7 @@ struct amdgpu_mgpu_info mgpu_info = { }; int amdgpu_ras_enable = -1; uint amdgpu_ras_mask = 0x; +int amdgpu_bad_page_threshold = -1; /** * DOC: vramlimit (int) @@ -801,6 +802,16 @@ module_param_named(tmz, amdgpu_tmz, int, 0444); MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)"); module_param_named(reset_method, amdgpu_reset_method, int, 0444); +/** + * DOC: bad_page_threshold (int) + * Bad page threshold is to specify the threshold value of faulty pages + * detected by RAS ECC, that may result in GPU entering bad status if total + * faulty 
pages by ECC exceed threshold value and leave it for user's further + * check. + */ +MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default typical value), 0 = disable bad page retirement)"); +module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); + static const struct pci_device_id pciidlist[] = { #ifdef CONFIG_DRM_AMDGPU_SI {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit
[AMD Official Use Only - Internal Distribution Only] Thanks very much Lunben and Guchun! Regards, Rico From: Tuikov, Luben Sent: Wednesday, July 29, 2020 2:44 To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander ; Zhang, Hawking ; Xu, Feifei ; Hesik, Christopher ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth Subject: Re: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit On 2020-07-28 1:27 a.m., Tianci Yin wrote: > From: "Tianci.Yin" > > On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, " are lost " > reconfigure the golden settings after GFXOFF exit. " so reconfigure ..." > > Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9 > Signed-off-by: Tianci.Yin > --- > drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++ > 1 file changed, 11 insertions(+) > > diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > index 55463e7a11e2..5da0436d41e0 100644 > --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle, > >struct smu_context *smu = (struct smu_context*)(handle); >struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm); > + struct amdgpu_device *adev = smu->adev; > >if (!smu->is_apu && !smu_dpm_ctx->dpm_context) >return -EINVAL; > @@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle, >amdgpu_device_ip_set_clockgating_state(smu->adev, > > AMD_IP_BLOCK_TYPE_GFX, > > AMD_CG_STATE_UNGATE); > + > + if (adev->asic_type >= CHIP_NAVI10 && > + adev->asic_type <= CHIP_NAVI12 && > + (adev->pm.pp_feature & PP_GFXOFF_MASK)) { > + if (adev->gfx.funcs->init_spm_golden) { > + dev_dbg(adev->dev,"GFXOFF exited, > re-init SPM golden settings\n"); Space after comma is required. > + amdgpu_gfx_init_spm_golden(adev); > + } else > + dev_warn(adev->dev,"Callback > init_spm_golden is NULL\n"); Space after comma is required. 
Please add braces to the single statement of the "else". The reason for this is that it complements the braces of the "if ( ) {" of the multi-line statement and closes the block. "checkpatch" calls it "unbalanced braces". With these three fixed, this patch is Reviewed-by: Luben Tuikov Regards, Luben > + } >} >} else { >/* exit umd pstate, restore level, enable gfx cg*/ > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm/amdgpu: add interface amdgpu_gfx_init_spm_golden for Navi1x
[AMD Official Use Only - Internal Distribution Only] Thanks very much Luben! Regards, Rico From: Tuikov, Luben Sent: Wednesday, July 29, 2020 2:29 To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander ; Zhang, Hawking ; Xu, Feifei ; Hesik, Christopher ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth Subject: Re: [PATCH 1/2] drm/amdgpu: add interface amdgpu_gfx_init_spm_golden for Navi1x On 2020-07-28 1:27 a.m., Tianci Yin wrote: > From: "Tianci.Yin" > > On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, Use present tense:... " are lost after " > reconfiguration is needed. Make the configuration code as an interface for Add "so a reconfiguration is needed. " > future use. > If the lines of your commit message are too long, then "git push" complains about them. Sixty char wide is perfect, since "git log" indents them when displaying them. With this fixed, then Reviewed-by: Luben Tuikov Regards, Luben > Change-Id: I172f3dc7f59da69b0364052dcad75a9c9aab019e > Signed-off-by: Tianci.Yin > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 34 ++--- > 2 files changed, 27 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > index 1e7a2b0997c5..a611e78dd4ba 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > @@ -216,6 +216,7 @@ struct amdgpu_gfx_funcs { >int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if); >int (*query_ras_error_count) (struct amdgpu_device *adev, void > *ras_error_status); >void (*reset_ras_error_count) (struct amdgpu_device *adev); > + void (*init_spm_golden)(struct amdgpu_device *adev); > }; > > struct sq_work { > @@ -324,6 +325,7 @@ struct amdgpu_gfx { > #define amdgpu_gfx_get_gpu_clock_counter(adev) > (adev)->gfx.funcs->get_gpu_clock_counter((adev)) > #define amdgpu_gfx_select_se_sh(adev, se, sh, instance) > 
(adev)->gfx.funcs->select_se_sh((adev), (se), (sh), (instance)) > #define amdgpu_gfx_select_me_pipe_q(adev, me, pipe, q, vmid) > (adev)->gfx.funcs->select_me_pipe_q((adev), (me), (pipe), (q), (vmid)) > +#define amdgpu_gfx_init_spm_golden(adev) > (adev)->gfx.funcs->init_spm_golden((adev)) > > /** > * amdgpu_gfx_create_bitmask - create a bitmask > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index db9f1e89a0f8..da21ad04ac0f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -3307,6 +3307,29 @@ static void gfx_v10_0_set_kiq_pm4_funcs(struct > amdgpu_device *adev) >adev->gfx.kiq.pmf = _v10_0_kiq_pm4_funcs; > } > > +static void gfx_v10_0_init_spm_golden_registers(struct amdgpu_device *adev) > +{ > + switch (adev->asic_type) { > + case CHIP_NAVI10: > + soc15_program_register_sequence(adev, > + > golden_settings_gc_rlc_spm_10_0_nv10, > + (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10)); > + break; > + case CHIP_NAVI14: > + soc15_program_register_sequence(adev, > + > golden_settings_gc_rlc_spm_10_1_nv14, > + (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_nv14)); > + break; > + case CHIP_NAVI12: > + soc15_program_register_sequence(adev, > + > golden_settings_gc_rlc_spm_10_1_2_nv12, > + (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_2_nv12)); > + break; > + default: > + break; > + } > +} > + > static void gfx_v10_0_init_golden_registers(struct amdgpu_device *adev) > { >switch (adev->asic_type) { > @@ -3317,9 +3340,6 @@ static void gfx_v10_0_init_golden_registers(struct > amdgpu_device *adev) >soc15_program_register_sequence(adev, >golden_settings_gc_10_0_nv10, >(const > u32)ARRAY_SIZE(golden_settings_gc_10_0_nv10)); > - soc15_program_register_sequence(adev, > - > golden_settings_gc_rlc_spm_10_0_nv10, > - (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10)); >break; >case CHIP_NAVI14: >soc15_program_register_sequence(adev, > @@ 
-3328,9 +3348,6 @@ static void gfx_v10_0_init_golden_registers(struct > amdgpu_device *adev) >soc15_program_register_sequence(adev, >
Re: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit(v2)
[AMD Public Use] Hi Alex, amdgpu_gfx_off_ctrl() invoked by a few other functions, like amdgpu_info_ioctl() , putting the code into amdgpu_gfx_off_ctrl() will cost more meaningless time on SPM golden reconfiguration. amdgpu_gfx_off_ctrl(adev, false); amdgpu_asic_read_register(adev, se_num, sh_num, info->read_mmr_reg.dword_offset + i, [i]); amdgpu_gfx_off_ctrl(adev, true); In most cases, we don't care about the SPM, so I think smu_enable_umd_pstate is a better place. Thanks very much! Rico From: Deucher, Alexander Sent: Tuesday, July 28, 2020 22:16 To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org Cc: Tuikov, Luben ; Zhang, Hawking ; Xu, Feifei ; Hesik, Christopher ; Swamy, Manjunatha ; Quan, Evan ; Chen, Guchun ; Feng, Kenneth Subject: Re: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit(v2) [AMD Public Use] Would it be better to put this code into amdgpu_gfx_off_ctrl()? Then we'll handle this in all cases where we disable gfx off. Alex From: Tianci Yin Sent: Tuesday, July 28, 2020 3:04 AM To: amd-gfx@lists.freedesktop.org Cc: Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Xu, Feifei ; Hesik, Christopher ; Swamy, Manjunatha ; Quan, Evan ; Chen, Guchun ; Feng, Kenneth ; Yin, Tianci (Rico) Subject: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit(v2) From: "Tianci.Yin" On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, reconfigure the golden settings after GFXOFF exit. 
Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9 Reviewed-by: Feifei Xu Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 55463e7a11e2..41487123c207 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle, struct smu_context *smu = (struct smu_context*)(handle); struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm); + struct amdgpu_device *adev = smu->adev; if (!smu->is_apu && !smu_dpm_ctx->dpm_context) return -EINVAL; @@ -1318,12 +1319,22 @@ static int smu_enable_umd_pstate(void *handle, if (*level & profile_mode_mask) { smu_dpm_ctx->saved_dpm_level = smu_dpm_ctx->dpm_level; smu_dpm_ctx->enable_umd_pstate = true; - amdgpu_device_ip_set_powergating_state(smu->adev, + amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX, AMD_PG_STATE_UNGATE); - amdgpu_device_ip_set_clockgating_state(smu->adev, + amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX, AMD_CG_STATE_UNGATE); + + if (adev->asic_type >= CHIP_NAVI10 && + adev->asic_type <= CHIP_NAVI12 && + (adev->pm.pp_feature & PP_GFXOFF_MASK)) { + if (adev->gfx.funcs->init_spm_golden) { + dev_dbg(adev->dev,"GFXOFF exited, re-init SPM golden settings\n"); + amdgpu_gfx_init_spm_golden(adev); + } else + dev_warn(adev->dev,"Callback init_spm_golden is NULL\n"); + } } } else { /* exit umd pstate, restore level, enable gfx cg*/ @@ -1331,10 +1342,10 @@ static int smu_enable_umd_pstate(void *handle, if (*level == AMD_DPM_FORCED_LEVEL_PROFILE_EXIT) *level = smu_dpm_ctx->saved_dpm_level; smu_dpm_ctx->enable_umd_pstate = false; - amdgpu_device_ip_set_clockgating_state(smu->adev, + amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX, AMD_CG_STATE_GATE); - 
amdgpu_device_ip_set_powergating_state(smu->adev, + amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX, AMD_PG_STATE_GATE); } -- 2.17.1
RE: [PATCH] drm/amd/powerplay: update driver if version for navy_flounder
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Tao Zhou > -Original Message- > From: Jiansong Chen > Sent: Tuesday, July 28, 2020 7:21 PM > To: amd-gfx@lists.freedesktop.org > Cc: Zhou1, Tao ; Feng, Kenneth > ; Chen, Jiansong (Simon) > Subject: [PATCH] drm/amd/powerplay: update driver if version for > navy_flounder > > It's in accordance with pmfw 65.5.0 for navy_flounder. > > Signed-off-by: Jiansong Chen > Change-Id: I984a1147030264adbc02230e2e1dd416d4ad63b0 > --- > drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h > b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h > index 9504f9954fd3..6a42331aba8a 100644 > --- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h > +++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h > @@ -31,7 +31,7 @@ > #define SMU11_DRIVER_IF_VERSION_NV12 0x33 #define > SMU11_DRIVER_IF_VERSION_NV14 0x36 #define > SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x34 -#define > SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x2 > +#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x3 > > /* MP Apertures */ > #define MP0_Public0x0380 > -- > 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 01/14] drm/amdgpu: handle bo size 0 in amdgpu_bo_create_kernel_at
Am 2020-07-28 um 6:45 p.m. schrieb Alex Deucher: > Just return early to match other bo_create functions. > > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 + > 1 file changed, 5 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > index 5ac7b5561475..16a37caa654a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > @@ -366,6 +366,11 @@ int amdgpu_bo_create_kernel_at(struct amdgpu_device > *adev, > unsigned int i; > int r; > > + if (!size) { > + amdgpu_bo_unref(bo_ptr); I was going to say, unreffing the bo_ptr before allocating anything seems weird. But amdgpu_bo_create_reserved, which is called just below, does the same thing. So this doesn't really change anything. Never mind. Regards, Felix > + return 0; > + } > + > offset &= PAGE_MASK; > size = ALIGN(size, PAGE_SIZE); > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 14/14] drm/amdgpu/gmc: disable keep_stolen_vga_memory on arcturus
Am 2020-07-28 um 6:46 p.m. schrieb Alex Deucher: > I suspect the only reason this was set was to avoid touching > the display related registers on arcturus. Someone should > double check this on arcturus with S3. Sounds reasonable, given that the other offenders here are all APUs. AFAIK, we haven't tried S3 on Arcturus. Doesn't seem like something one would do on a server. See one more comment on patch 1. Other than that the series is Reviewed-by: Felix Kuehling Regards, Felix > > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > index fc9e18aaa76e..0bd7b3797534 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > @@ -448,7 +448,6 @@ void amdgpu_gmc_get_vbios_allocations(struct > amdgpu_device *adev) > switch (adev->asic_type) { > case CHIP_VEGA10: > case CHIP_RAVEN: > - case CHIP_ARCTURUS: > case CHIP_RENOIR: > adev->gmc.keep_stolen_vga_memory = true; > break; ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 01/14] drm/amdgpu: handle bo size 0 in amdgpu_bo_create_kernel_at
Just return early to match other bo_create functions. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 5ac7b5561475..16a37caa654a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -366,6 +366,11 @@ int amdgpu_bo_create_kernel_at(struct amdgpu_device *adev, unsigned int i; int r; + if (!size) { + amdgpu_bo_unref(bo_ptr); + return 0; + } + offset &= PAGE_MASK; size = ALIGN(size, PAGE_SIZE); -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 13/14] drm/amdgpu: drop the CPU pointers for the stolen vga bos
We never use them. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 16 +--- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ec975251b171..3df9d5a53741 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1915,7 +1915,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) uint64_t gtt_size; int r; u64 vis_vram_limit; - void *stolen_vga_buf, *stolen_extended_buf; mutex_init(>mman.gtt_window_lock); @@ -1982,14 +1981,14 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) r = amdgpu_bo_create_kernel_at(adev, 0, adev->gmc.stolen_vga_size, AMDGPU_GEM_DOMAIN_VRAM, >gmc.stolen_vga_memory, - _vga_buf); + NULL); if (r) return r; r = amdgpu_bo_create_kernel_at(adev, adev->gmc.stolen_vga_size, adev->gmc.stolen_extended_size, AMDGPU_GEM_DOMAIN_VRAM, >gmc.stolen_extended_memory, - _extended_buf); + NULL); if (r) return r; @@ -2048,13 +2047,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) */ void amdgpu_ttm_late_init(struct amdgpu_device *adev) { - void *stolen_vga_buf, *stolen_extended_buf; - /* return the VGA stolen memory (if any) back to VRAM */ if (!adev->gmc.keep_stolen_vga_memory) - amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, _vga_buf); - amdgpu_bo_free_kernel(>gmc.stolen_extended_memory, NULL, - _extended_buf); + amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, NULL); + amdgpu_bo_free_kernel(>gmc.stolen_extended_memory, NULL, NULL); } /** @@ -2062,15 +2058,13 @@ void amdgpu_ttm_late_init(struct amdgpu_device *adev) */ void amdgpu_ttm_fini(struct amdgpu_device *adev) { - void *stolen_vga_buf; - if (!adev->mman.initialized) return; amdgpu_ttm_training_reserve_vram_fini(adev); /* return the stolen vga memory back to VRAM */ if (adev->gmc.keep_stolen_vga_memory) - amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, _vga_buf); + amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, NULL); /* 
return the IP Discovery TMR memory back to VRAM */ amdgpu_bo_free_kernel(>discovery_memory, NULL, NULL); amdgpu_ttm_fw_reserve_vram_fini(adev); -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 03/14] drm/amdgpu: use a define for the memory size of the vga emulator
Rather than open coding it everywhere. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h| 2 ++ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 ++-- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 58e39429395f..2a7fbe21619d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -212,6 +212,8 @@ extern int amdgpu_cik_support; #define AMDGPUFB_CONN_LIMIT4 #define AMDGPU_BIOS_NUM_SCRATCH16 +#define AMDGPU_VBIOS_VGA_ALLOCATION(9 * 1024 * 1024) /* reserve 8MB for vga emulator and 1 MB for FB */ + /* hard reset data */ #define AMDGPU_ASIC_RESET_DATA 0x39d5e86b diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 1a78073c2f05..040220e97cf3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -707,7 +707,7 @@ static unsigned gmc_v10_0_get_vbios_fb_size(struct amdgpu_device *adev) unsigned size; if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { - size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 MB for FB */ + size = AMDGPU_VBIOS_VGA_ALLOCATION; } else { u32 viewport; u32 pitch; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index 538e7ee35cdf..4de996868d32 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -805,7 +805,7 @@ static unsigned gmc_v6_0_get_vbios_fb_size(struct amdgpu_device *adev) unsigned size; if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { - size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 MB for FB */ + size = AMDGPU_VBIOS_VGA_ALLOCATION; } else { u32 viewport = RREG32(mmVIEWPORT_SIZE); size 
= (REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_HEIGHT) * diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index 0f8e8aff9114..4113f2d33b75 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -970,7 +970,7 @@ static unsigned gmc_v7_0_get_vbios_fb_size(struct amdgpu_device *adev) unsigned size; if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { - size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 MB for FB */ + size = AMDGPU_VBIOS_VGA_ALLOCATION; } else { u32 viewport = RREG32(mmVIEWPORT_SIZE); size = (REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_HEIGHT) * diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index abe64010f0d5..f29ff9afcc10 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1087,7 +1087,7 @@ static unsigned gmc_v8_0_get_vbios_fb_size(struct amdgpu_device *adev) unsigned size; if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { - size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 MB for FB */ + size = AMDGPU_VBIOS_VGA_ALLOCATION; } else { u32 viewport = RREG32(mmVIEWPORT_SIZE); size = (REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_HEIGHT) * diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index c5f94bab4a01..ac15d7678d24 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1077,11 +1077,11 @@ static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev) * Check related code in gmc_v9_0_sw_fini * */ if (gmc_v9_0_keep_stolen_memory(adev)) - return 9 * 1024 * 1024; + return AMDGPU_VBIOS_VGA_ALLOCATION; d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL); if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { - size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 MB for FB */ + size = 
AMDGPU_VBIOS_VGA_ALLOCATION; } else { u32 viewport; -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 05/14] drm/amdgpu: move keep stolen memory check into gmc core
Rather than leaving this as a gmc v9 specific hack. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 9 - drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 11 +++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index ddb64be670c2..0cf18f01e67a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -215,6 +215,7 @@ struct amdgpu_gmc { boolprt_warning; uint64_tstolen_vga_size; struct amdgpu_bo*stolen_vga_memory; + boolkeep_stolen_vga_memory; uint32_tsdpif_register; /* apertures */ u64 shared_aperture_start; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 39781127d059..fd61769202b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -2042,8 +2042,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) void amdgpu_ttm_late_init(struct amdgpu_device *adev) { void *stolen_vga_buf; + /* return the VGA stolen memory (if any) back to VRAM */ - amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, _vga_buf); + if (!adev->gmc.keep_stolen_vga_memory) + amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, _vga_buf); } /** @@ -2051,10 +2053,15 @@ void amdgpu_ttm_late_init(struct amdgpu_device *adev) */ void amdgpu_ttm_fini(struct amdgpu_device *adev) { + void *stolen_vga_buf; + if (!adev->mman.initialized) return; amdgpu_ttm_training_reserve_vram_fini(adev); + /* return the stolen vga memory back to VRAM */ + if (adev->gmc.keep_stolen_vga_memory) + amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, _vga_buf); /* return the IP Discovery TMR memory back to VRAM */ amdgpu_bo_free_kernel(>discovery_memory, NULL, NULL); amdgpu_ttm_fw_reserve_vram_fini(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index b66c60680dba..c5d2e4390fba 100644 
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -932,8 +932,7 @@ static int gmc_v9_0_late_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int r; - if (!gmc_v9_0_keep_stolen_memory(adev)) - amdgpu_bo_late_init(adev); + amdgpu_bo_late_init(adev); r = amdgpu_gmc_allocate_vm_inv_eng(adev); if (r) @@ -1076,7 +1075,7 @@ static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev) * TODO Remove once GART corruption is resolved * Check related code in gmc_v9_0_sw_fini * */ - if (gmc_v9_0_keep_stolen_memory(adev)) + if (adev->gmc.keep_stolen_vga_memory) return AMDGPU_VBIOS_VGA_ALLOCATION; d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL); @@ -1243,6 +1242,7 @@ static int gmc_v9_0_sw_init(void *handle) if (r) return r; + adev->gmc.keep_stolen_vga_memory = gmc_v9_0_keep_stolen_memory(adev); adev->gmc.stolen_vga_size = gmc_v9_0_get_vbios_fb_size(adev); /* Memory manager */ @@ -1275,15 +1275,10 @@ static int gmc_v9_0_sw_init(void *handle) static int gmc_v9_0_sw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - void *stolen_vga_buf; amdgpu_gmc_ras_fini(adev); amdgpu_gem_force_release(adev); amdgpu_vm_manager_fini(adev); - - if (gmc_v9_0_keep_stolen_memory(adev)) - amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, _vga_buf); - amdgpu_gart_table_vram_free(adev); amdgpu_bo_fini(adev); amdgpu_gart_fini(adev); -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 00/14] rework stolen pre-OS fb allocation handling
Split the allocations into two so we can still support the S3 workarounds required on some platforms while also avoiding any artifacts when transitioning from bios to driver. In the future we could integrate handling of the ip discovery data and other vbios allocations into this helper function to consolidate handling of all of the vbios reservations. Alex Deucher (14): drm/amdgpu: handle bo size 0 in amdgpu_bo_create_kernel_at drm/amdgpu: use create_at for the stolen pre-OS buffer drm/amdgpu: use a define for the memory size of the vga emulator drm/amdgpu: move stolen vga bo from amdgpu to amdgpu.gmc drm/amdgpu: move keep stolen memory check into gmc core drm/amdgpu: add support for extended stolen vga memory drm/amdgpu/gmc: add new helper to get the FB size used by pre-OS console drm/amdgpu/gmc6: switch to using amdgpu_gmc_get_vbios_allocations drm/amdgpu/gmc7: switch to using amdgpu_gmc_get_vbios_allocations drm/amdgpu/gmc8: switch to using amdgpu_gmc_get_vbios_allocations drm/amdgpu/gmc9: switch to using amdgpu_gmc_get_vbios_allocations drm/amdgpu/gmc10: switch to using amdgpu_gmc_get_vbios_allocations drm/amdgpu: drop the CPU pointers for the stolen vga bos drm/amdgpu/gmc: disable keep_stolen_vga_memory on arcturus drivers/gpu/drm/amd/amdgpu/amdgpu.h| 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c| 42 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h| 11 +- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c| 24 +++-- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 57 +- drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 8 +- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 11 +- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 11 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 119 +++-- 10 files changed, 153 insertions(+), 138 deletions(-) -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 12/14] drm/amdgpu/gmc10: switch to using amdgpu_gmc_get_vbios_allocations
The new helper centralizes the logic in one place. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 57 +++--- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 337d70503970..fcde302d3eb0 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -553,6 +553,28 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev, } } +static unsigned gmc_v10_0_get_vbios_fb_size(struct amdgpu_device *adev) +{ + u32 d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL); + unsigned size; + + if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { + size = AMDGPU_VBIOS_VGA_ALLOCATION; + } else { + u32 viewport; + u32 pitch; + + viewport = RREG32_SOC15(DCE, 0, mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION); + pitch = RREG32_SOC15(DCE, 0, mmHUBPREQ0_DCSURF_SURFACE_PITCH); + size = (REG_GET_FIELD(viewport, + HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_HEIGHT) * + REG_GET_FIELD(pitch, HUBPREQ0_DCSURF_SURFACE_PITCH, PITCH) * + 4); + } + + return size; +} + static const struct amdgpu_gmc_funcs gmc_v10_0_gmc_funcs = { .flush_gpu_tlb = gmc_v10_0_flush_gpu_tlb, .flush_gpu_tlb_pasid = gmc_v10_0_flush_gpu_tlb_pasid, @@ -560,7 +582,8 @@ static const struct amdgpu_gmc_funcs gmc_v10_0_gmc_funcs = { .emit_pasid_mapping = gmc_v10_0_emit_pasid_mapping, .map_mtype = gmc_v10_0_map_mtype, .get_vm_pde = gmc_v10_0_get_vm_pde, - .get_vm_pte = gmc_v10_0_get_vm_pte + .get_vm_pte = gmc_v10_0_get_vm_pte, + .get_vbios_fb_size = gmc_v10_0_get_vbios_fb_size, }; static void gmc_v10_0_set_gmc_funcs(struct amdgpu_device *adev) @@ -701,36 +724,6 @@ static int gmc_v10_0_gart_init(struct amdgpu_device *adev) return amdgpu_gart_table_vram_alloc(adev); } -static unsigned gmc_v10_0_get_vbios_fb_size(struct amdgpu_device *adev) -{ - u32 d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL); - unsigned size; - - if 
(REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { - size = AMDGPU_VBIOS_VGA_ALLOCATION; - } else { - u32 viewport; - u32 pitch; - - viewport = RREG32_SOC15(DCE, 0, mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION); - pitch = RREG32_SOC15(DCE, 0, mmHUBPREQ0_DCSURF_SURFACE_PITCH); - size = (REG_GET_FIELD(viewport, - HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_HEIGHT) * - REG_GET_FIELD(pitch, HUBPREQ0_DCSURF_SURFACE_PITCH, PITCH) * - 4); - } - /* return 0 if the pre-OS buffer uses up most of vram */ - if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024)) { - DRM_ERROR("Warning: pre-OS buffer uses most of vram, \ - be aware of gart table overwrite\n"); - return 0; - } - - return size; -} - - - static int gmc_v10_0_sw_init(void *handle) { int r, vram_width = 0, vram_type = 0, vram_vendor = 0; @@ -812,7 +805,7 @@ static int gmc_v10_0_sw_init(void *handle) if (r) return r; - adev->gmc.stolen_vga_size = gmc_v10_0_get_vbios_fb_size(adev); + amdgpu_gmc_get_vbios_allocations(adev); /* Memory manager */ r = amdgpu_bo_init(adev); -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 11/14] drm/amdgpu/gmc9: switch to using amdgpu_gmc_get_vbios_allocations
The new helper centralizes the logic in one place. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 112 +- 1 file changed, 38 insertions(+), 74 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index c5d2e4390fba..65488ddc34c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -827,6 +827,41 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev, *flags |= AMDGPU_PTE_SNOOPED; } +static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev) +{ + u32 d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL); + unsigned size; + + if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { + size = AMDGPU_VBIOS_VGA_ALLOCATION; + } else { + u32 viewport; + + switch (adev->asic_type) { + case CHIP_RAVEN: + case CHIP_RENOIR: + viewport = RREG32_SOC15(DCE, 0, mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION); + size = (REG_GET_FIELD(viewport, + HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_HEIGHT) * + REG_GET_FIELD(viewport, + HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_WIDTH) * + 4); + break; + case CHIP_VEGA10: + case CHIP_VEGA12: + case CHIP_VEGA20: + default: + viewport = RREG32_SOC15(DCE, 0, mmSCL0_VIEWPORT_SIZE); + size = (REG_GET_FIELD(viewport, SCL0_VIEWPORT_SIZE, VIEWPORT_HEIGHT) * + REG_GET_FIELD(viewport, SCL0_VIEWPORT_SIZE, VIEWPORT_WIDTH) * + 4); + break; + } + } + + return size; +} + static const struct amdgpu_gmc_funcs gmc_v9_0_gmc_funcs = { .flush_gpu_tlb = gmc_v9_0_flush_gpu_tlb, .flush_gpu_tlb_pasid = gmc_v9_0_flush_gpu_tlb_pasid, @@ -834,7 +869,8 @@ static const struct amdgpu_gmc_funcs gmc_v9_0_gmc_funcs = { .emit_pasid_mapping = gmc_v9_0_emit_pasid_mapping, .map_mtype = gmc_v9_0_map_mtype, .get_vm_pde = gmc_v9_0_get_vm_pde, - .get_vm_pte = gmc_v9_0_get_vm_pte + .get_vm_pte = gmc_v9_0_get_vm_pte, + .get_vbios_fb_size = gmc_v9_0_get_vbios_fb_size, }; static void gmc_v9_0_set_gmc_funcs(struct amdgpu_device *adev) 
@@ -902,31 +938,6 @@ static int gmc_v9_0_early_init(void *handle) return 0; } -static bool gmc_v9_0_keep_stolen_memory(struct amdgpu_device *adev) -{ - - /* -* TODO: -* Currently there is a bug where some memory client outside -* of the driver writes to first 8M of VRAM on S3 resume, -* this overrides GART which by default gets placed in first 8M and -* causes VM_FAULTS once GTT is accessed. -* Keep the stolen memory reservation until the while this is not solved. -* Also check code in gmc_v9_0_get_vbios_fb_size and gmc_v9_0_late_init -*/ - switch (adev->asic_type) { - case CHIP_VEGA10: - case CHIP_RAVEN: - case CHIP_ARCTURUS: - case CHIP_RENOIR: - return true; - case CHIP_VEGA12: - case CHIP_VEGA20: - default: - return false; - } -} - static int gmc_v9_0_late_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -1066,52 +1077,6 @@ static int gmc_v9_0_gart_init(struct amdgpu_device *adev) return amdgpu_gart_table_vram_alloc(adev); } -static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev) -{ - u32 d1vga_control; - unsigned size; - - /* -* TODO Remove once GART corruption is resolved -* Check related code in gmc_v9_0_sw_fini -* */ - if (adev->gmc.keep_stolen_vga_memory) - return AMDGPU_VBIOS_VGA_ALLOCATION; - - d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL); - if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) { - size = AMDGPU_VBIOS_VGA_ALLOCATION; - } else { - u32 viewport; - - switch (adev->asic_type) { - case CHIP_RAVEN: - case CHIP_RENOIR: - viewport = RREG32_SOC15(DCE, 0, mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION); - size = (REG_GET_FIELD(viewport, - HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_HEIGHT) * - REG_GET_FIELD(viewport, - HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_WIDTH) * - 4); - break; -
[PATCH 04/14] drm/amdgpu: move stolen vga bo from amdgpu to amdgpu.gmc
Since that is where we store the other data related to the stolen vga memory. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 ++-- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 2a7fbe21619d..899664357015 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -767,7 +767,6 @@ struct amdgpu_device { boolis_atom_fw; uint8_t *bios; uint32_tbios_size; - struct amdgpu_bo*stolen_vga_memory; uint32_tbios_scratch_reg_offset; uint32_tbios_scratch[AMDGPU_BIOS_NUM_SCRATCH]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 9d58c56f6cfc..ddb64be670c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -213,7 +213,8 @@ struct amdgpu_gmc { uint8_t vram_vendor; uint32_tsrbm_soft_reset; boolprt_warning; - uint64_tstolen_size; + uint64_tstolen_vga_size; + struct amdgpu_bo*stolen_vga_memory; uint32_tsdpif_register; /* apertures */ u64 shared_aperture_start; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index a188216bccc2..39781127d059 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1979,9 +1979,9 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) * This is used for VGA emulation and pre-OS scanout buffers to * avoid display artifacts while transitioning between pre-OS * and driver. 
*/ - r = amdgpu_bo_create_kernel_at(adev, 0, adev->gmc.stolen_size, + r = amdgpu_bo_create_kernel_at(adev, 0, adev->gmc.stolen_vga_size, AMDGPU_GEM_DOMAIN_VRAM, - &adev->stolen_vga_memory, + &adev->gmc.stolen_vga_memory, &stolen_vga_buf); if (r) return r; @@ -2043,7 +2043,7 @@ void amdgpu_ttm_late_init(struct amdgpu_device *adev) { void *stolen_vga_buf; /* return the VGA stolen memory (if any) back to VRAM */ - amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf); + amdgpu_bo_free_kernel(&adev->gmc.stolen_vga_memory, NULL, &stolen_vga_buf); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 040220e97cf3..337d70503970 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -812,7 +812,7 @@ static int gmc_v10_0_sw_init(void *handle) if (r) return r; - adev->gmc.stolen_size = gmc_v10_0_get_vbios_fb_size(adev); + adev->gmc.stolen_vga_size = gmc_v10_0_get_vbios_fb_size(adev); /* Memory manager */ r = amdgpu_bo_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index 4de996868d32..28ddb41a78c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -862,7 +862,7 @@ static int gmc_v6_0_sw_init(void *handle) if (r) return r; - adev->gmc.stolen_size = gmc_v6_0_get_vbios_fb_size(adev); + adev->gmc.stolen_vga_size = gmc_v6_0_get_vbios_fb_size(adev); r = amdgpu_bo_init(adev); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index 4113f2d33b75..8b8ecbb99d84 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -1035,7 +1035,7 @@ static int gmc_v7_0_sw_init(void *handle) if (r) return r; - adev->gmc.stolen_size = gmc_v7_0_get_vbios_fb_size(adev); + adev->gmc.stolen_vga_size = gmc_v7_0_get_vbios_fb_size(adev); /* Memory manager */ r = amdgpu_bo_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index f29ff9afcc10..8e3763ec268f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1160,7 +1160,7 @@ static int gmc_v8_0_sw_init(void *handle) if (r) return r; - adev->gmc.stolen_size =
[PATCH 09/14] drm/amdgpu/gmc7: switch to using amdgpu_gmc_get_vbios_allocations
The new helper centralizes the logic in one place. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index 8b8ecbb99d84..80c146df338a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -977,9 +977,7 @@ static unsigned gmc_v7_0_get_vbios_fb_size(struct amdgpu_device *adev) REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_WIDTH) * 4); } - /* return 0 if the pre-OS buffer uses up most of vram */ - if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024)) - return 0; + return size; } @@ -1035,7 +1033,7 @@ static int gmc_v7_0_sw_init(void *handle) if (r) return r; - adev->gmc.stolen_vga_size = gmc_v7_0_get_vbios_fb_size(adev); + amdgpu_gmc_get_vbios_allocations(adev); /* Memory manager */ r = amdgpu_bo_init(adev); @@ -1372,7 +1370,8 @@ static const struct amdgpu_gmc_funcs gmc_v7_0_gmc_funcs = { .emit_pasid_mapping = gmc_v7_0_emit_pasid_mapping, .set_prt = gmc_v7_0_set_prt, .get_vm_pde = gmc_v7_0_get_vm_pde, - .get_vm_pte = gmc_v7_0_get_vm_pte + .get_vm_pte = gmc_v7_0_get_vm_pte, + .get_vbios_fb_size = gmc_v7_0_get_vbios_fb_size, }; static const struct amdgpu_irq_src_funcs gmc_v7_0_irq_funcs = { -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 06/14] drm/amdgpu: add support for extended stolen vga memory
This will allow us to split the allocation for systems where we have to keep the stolen memory around to avoid S3 issues. This way we don't waste as much memory and still avoid any screen artifacts during the bios to driver transition. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 13 +++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 0cf18f01e67a..8f4af955d72c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -215,6 +215,8 @@ struct amdgpu_gmc { boolprt_warning; uint64_tstolen_vga_size; struct amdgpu_bo*stolen_vga_memory; + uint64_tstolen_extended_size; + struct amdgpu_bo*stolen_extended_memory; boolkeep_stolen_vga_memory; uint32_tsdpif_register; /* apertures */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index fd61769202b3..ec975251b171 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1915,7 +1915,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) uint64_t gtt_size; int r; u64 vis_vram_limit; - void *stolen_vga_buf; + void *stolen_vga_buf, *stolen_extended_buf; mutex_init(>mman.gtt_window_lock); @@ -1985,6 +1985,13 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) _vga_buf); if (r) return r; + r = amdgpu_bo_create_kernel_at(adev, adev->gmc.stolen_vga_size, + adev->gmc.stolen_extended_size, + AMDGPU_GEM_DOMAIN_VRAM, + >gmc.stolen_extended_memory, + _extended_buf); + if (r) + return r; DRM_INFO("amdgpu: %uM of VRAM memory ready\n", (unsigned) (adev->gmc.real_vram_size / (1024 * 1024))); @@ -2041,11 +2048,13 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) */ void amdgpu_ttm_late_init(struct amdgpu_device *adev) { - void *stolen_vga_buf; + void *stolen_vga_buf, *stolen_extended_buf; /* return the VGA stolen memory (if any) back to 
VRAM */ if (!adev->gmc.keep_stolen_vga_memory) amdgpu_bo_free_kernel(&adev->gmc.stolen_vga_memory, NULL, &stolen_vga_buf); + amdgpu_bo_free_kernel(&adev->gmc.stolen_extended_memory, NULL, + &stolen_extended_buf); } /** -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 02/14] drm/amdgpu: use create_at for the stolen pre-OS buffer
Should be functionally the same since nothing else is allocated at that point, but let's be exact. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 605d266754f6..a188216bccc2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1979,10 +1979,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) * This is used for VGA emulation and pre-OS scanout buffers to * avoid display artifacts while transitioning between pre-OS * and driver. */ - r = amdgpu_bo_create_kernel(adev, adev->gmc.stolen_size, PAGE_SIZE, - AMDGPU_GEM_DOMAIN_VRAM, - &adev->stolen_vga_memory, - NULL, &stolen_vga_buf); + r = amdgpu_bo_create_kernel_at(adev, 0, adev->gmc.stolen_size, + AMDGPU_GEM_DOMAIN_VRAM, + &adev->stolen_vga_memory, + &stolen_vga_buf); if (r) return r; -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 14/14] drm/amdgpu/gmc: disable keep_stolen_vga_memory on arcturus
I suspect the only reason this was set was to avoid touching the display related registers on arcturus. Someone should double check this on arcturus with S3. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index fc9e18aaa76e..0bd7b3797534 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -448,7 +448,6 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev) switch (adev->asic_type) { case CHIP_VEGA10: case CHIP_RAVEN: - case CHIP_ARCTURUS: case CHIP_RENOIR: adev->gmc.keep_stolen_vga_memory = true; break; -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 08/14] drm/amdgpu/gmc6: switch to using amdgpu_gmc_get_vbios_allocations
The new helper centralizes the logic in one place. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index 28ddb41a78c8..95a9117e9564 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -812,9 +812,6 @@ static unsigned gmc_v6_0_get_vbios_fb_size(struct amdgpu_device *adev) REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_WIDTH) * 4); } - /* return 0 if the pre-OS buffer uses up most of vram */ - if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024)) - return 0; return size; } @@ -862,7 +859,7 @@ static int gmc_v6_0_sw_init(void *handle) if (r) return r; - adev->gmc.stolen_vga_size = gmc_v6_0_get_vbios_fb_size(adev); + amdgpu_gmc_get_vbios_allocations(adev); r = amdgpu_bo_init(adev); if (r) @@ -1136,6 +1133,7 @@ static const struct amdgpu_gmc_funcs gmc_v6_0_gmc_funcs = { .set_prt = gmc_v6_0_set_prt, .get_vm_pde = gmc_v6_0_get_vm_pde, .get_vm_pte = gmc_v6_0_get_vm_pte, + .get_vbios_fb_size = gmc_v6_0_get_vbios_fb_size, }; static const struct amdgpu_irq_src_funcs gmc_v6_0_irq_funcs = { -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 07/14] drm/amdgpu/gmc: add new helper to get the FB size used by pre-OS console
This adds a new gmc callback to get the size reserved by the pre-OS console and provides a helper function for use by gmc IP drivers. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 43 + drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 5 +++ 2 files changed, 48 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 21d2c8543f85..fc9e18aaa76e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -27,6 +27,7 @@ #include #include "amdgpu.h" +#include "amdgpu_gmc.h" #include "amdgpu_ras.h" #include "amdgpu_xgmi.h" @@ -431,3 +432,45 @@ void amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device *adev, int hub_type, WREG32(reg, tmp); } } + +void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev) +{ + unsigned size; + + /* +* TODO: +* Currently there is a bug where some memory client outside +* of the driver writes to first 8M of VRAM on S3 resume, +* this overrides GART which by default gets placed in first 8M and +* causes VM_FAULTS once GTT is accessed. +* Keep the stolen memory reservation until the while this is not solved. 
+*/ + switch (adev->asic_type) { + case CHIP_VEGA10: + case CHIP_RAVEN: + case CHIP_ARCTURUS: + case CHIP_RENOIR: + adev->gmc.keep_stolen_vga_memory = true; + break; + default: + adev->gmc.keep_stolen_vga_memory = false; + break; + } + + if (!amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_DCE)) + size = 0; + else + size = amdgpu_gmc_get_vbios_fb_size(adev); + + /* set to 0 if the pre-OS buffer uses up most of vram */ + if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024)) + size = 0; + + if (size > AMDGPU_VBIOS_VGA_ALLOCATION) { + adev->gmc.stolen_vga_size = AMDGPU_VBIOS_VGA_ALLOCATION; + adev->gmc.stolen_extended_size = size - adev->gmc.stolen_vga_size; + } else { + adev->gmc.stolen_vga_size = size; + adev->gmc.stolen_extended_size = 0; + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 8f4af955d72c..c2a85d0b1546 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -131,6 +131,8 @@ struct amdgpu_gmc_funcs { void (*get_vm_pte)(struct amdgpu_device *adev, struct amdgpu_bo_va_mapping *mapping, uint64_t *flags); + /* get the amount of memory used by the vbios for pre-OS console */ + unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev); }; struct amdgpu_xgmi { @@ -253,6 +255,7 @@ struct amdgpu_gmc { #define amdgpu_gmc_map_mtype(adev, flags) (adev)->gmc.gmc_funcs->map_mtype((adev),(flags)) #define amdgpu_gmc_get_vm_pde(adev, level, dst, flags) (adev)->gmc.gmc_funcs->get_vm_pde((adev), (level), (dst), (flags)) #define amdgpu_gmc_get_vm_pte(adev, mapping, flags) (adev)->gmc.gmc_funcs->get_vm_pte((adev), (mapping), (flags)) +#define amdgpu_gmc_get_vbios_fb_size(adev) (adev)->gmc.gmc_funcs->get_vbios_fb_size((adev)) /** * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR @@ -307,4 +310,6 @@ extern void amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device *adev, int hub_type, bool enable); +void 
amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev); + #endif -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 10/14] drm/amdgpu/gmc8: switch to using amdgpu_gmc_get_vbios_allocations
The new helper centralizes the logic in one place. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 8e3763ec268f..9ab65ca7df77 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1094,9 +1094,7 @@ static unsigned gmc_v8_0_get_vbios_fb_size(struct amdgpu_device *adev) REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_WIDTH) * 4); } - /* return 0 if the pre-OS buffer uses up most of vram */ - if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024)) - return 0; + return size; } @@ -1160,7 +1158,7 @@ static int gmc_v8_0_sw_init(void *handle) if (r) return r; - adev->gmc.stolen_vga_size = gmc_v8_0_get_vbios_fb_size(adev); + amdgpu_gmc_get_vbios_allocations(adev); /* Memory manager */ r = amdgpu_bo_init(adev); @@ -1739,7 +1737,8 @@ static const struct amdgpu_gmc_funcs gmc_v8_0_gmc_funcs = { .emit_pasid_mapping = gmc_v8_0_emit_pasid_mapping, .set_prt = gmc_v8_0_set_prt, .get_vm_pde = gmc_v8_0_get_vm_pde, - .get_vm_pte = gmc_v8_0_get_vm_pte + .get_vm_pte = gmc_v8_0_get_vm_pte, + .get_vbios_fb_size = gmc_v8_0_get_vbios_fb_size, }; static const struct amdgpu_irq_src_funcs gmc_v8_0_irq_funcs = { -- 2.25.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] amdgpu_dm: fix nonblocking atomic commit use-after-free
On Tue, Jul 28, 2020 at 01:07:13PM -0400, Kazlauskas, Nicholas wrote: > On 2020-07-28 5:22 a.m., Paul Menzel wrote: > > Dear Linux folks, > > > > > > Am 25.07.20 um 07:20 schrieb Mazin Rezk: > > > On Saturday, July 25, 2020 12:59 AM, Duncan wrote: > > > > > > > On Sat, 25 Jul 2020 03:03:52 + Mazin Rezk wrote: > > > > > > > > > > Am 24.07.20 um 19:33 schrieb Kees Cook: > > > > > > > > > > > > > There was a fix to disable the async path for this driver that > > > > > > > worked around the bug too, yes? That seems like a safer and more > > > > > > > focused change that doesn't revert the SLUB defense for all > > > > > > > users, and would actually provide a complete, I think, workaround > > > > > > > > > > That said, I haven't seen the async disabling patch. If you could > > > > > link to it, I'd be glad to test it out and perhaps we can use that > > > > > instead. > > > > > > > > I'm confused. Not to put words in Kees' mouth; /I/ am confused (which > > > > admittedly could well be just because I make no claims to be a > > > > coder and am simply reading the bug and thread, but I'd appreciate some > > > > "unconfusing" anyway). > > > > > > > > My interpretation of the "async disabling" reference was that it was to > > > > comment #30 on the bug: > > > > > > > > https://bugzilla.kernel.org/show_bug.cgi?id=207383#c30 > > > > > > > > > > > > ... which (if I'm not confused on this point too) appears to be yours. > > > > There it was stated... > > > > > > > > I've also found that this bug exclusively occurs when commit_work is on > > > > the workqueue. After forcing drm_atomic_helper_commit to run all of the > > > > commits without adding to the workqueue and running the OS, the issue > > > > seems to have disappeared. > > > > > > > > > > > > Would not forcing all commits to run directly, without placing them on > > > > the workqueue, be "async disabling"? That's what I /thought/ he was > > > > referencing. > > > > > > Oh, I thought he was referring to a different patch. 
Kees, could I get > > > your confirmation on this? > > > > > > The change I made actually affected all of the DRM code, although > > > this could > > > easily be changed to be specific to amdgpu. (By forcing blocking on > > > amdgpu_dm's non-blocking commit code) > > > > > > That said, I'd still need to test further because I only did test it > > > for a > > > couple of hours then. Although it should work in theory. > > > > > > > OTOH your base/context swap idea sounds like a possibly "less > > > > disturbance" workaround, if it works, and given the point in the > > > > commit cycle... (But if it's out Sunday it's likely too late to test > > > > and get it in now anyway; if it's another week, tho...) > > > > > > The base/context swap idea should make the use-after-free behave how it > > > did in 5.6. Since the bug doesn't cause an issue in 5.6, it's less of a > > > "less disturbance" workaround and more of a "no disturbance" workaround. > > > > Sorry for bothering, but is there now a solution, besides reverting the > > commits, to avoid freezes/crashes *without* performance regressions? > > > > > > Kind regards, > > > > Paul > > Mazin's "drm/amd/display: Clear dm_state for fast updates" change > accomplishes this, at least as a temporary hack. Yeah I guess it's horrible, but better than nothing. Reverting the old amdgpu change to a private state object is probably a lot more invasive. > I've started work on a more large scale fix that we could get in after. Does that include a fix for the "stuff needed by irq handler"? Either way pls cc dri-devel, I think this is something worthy of a bit wider discussion. Feels like unsolved homework from the entire "make DC integrate into linux" saga ... -Daniel -- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 4/4] radeon: fall back to ACPI EDID retrieval
On 7/28/20 1:50 AM, Christian König wrote: Am 27.07.20 um 22:53 schrieb Daniel Dadap: Fall back to retrieving the EDID via the ACPI _DDC method, when present for notebook internal panels, when retrieving BIOS-embedded EDIDs. Signed-off-by: Daniel Dadap --- drivers/gpu/drm/radeon/radeon_combios.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c index c3e49c973812..de801d9fca54 100644 --- a/drivers/gpu/drm/radeon/radeon_combios.c +++ b/drivers/gpu/drm/radeon/radeon_combios.c @@ -401,9 +401,8 @@ bool radeon_combios_check_hardcoded_edid(struct radeon_device *rdev) struct edid * radeon_bios_get_hardcoded_edid(struct radeon_device *rdev) { - struct edid *edid; - if (rdev->mode_info.bios_hardcoded_edid) { + struct edid *edid; That's an unrelated and incorrect style change. You need a blank line after declaration. Ah, yes, that doesn't really need to be changed. I'll remove it from this patch. Would a separate patch to change the scope of that declaration (with a blank line after) be welcome, or should I just leave it alone? edid = kmalloc(rdev->mode_info.bios_hardcoded_edid_size, GFP_KERNEL); if (edid) { memcpy((unsigned char *)edid, @@ -412,7 +411,8 @@ radeon_bios_get_hardcoded_edid(struct radeon_device *rdev) return edid; } } - return NULL; + + return drm_get_edid_acpi(); In general a good idea, but I'm wondering if we should really do this so unconditionally here. I'm not personally aware of any AMD notebook designs that require the ACPI _DDC EDID retrieval. I've only seen it on NVIDIA+Intel hybrid systems and on a small number of NVIDIA discrete-only systems. I just figured I'd update the radeon DRM-KMS driver while updating i915 and Nouveau, for completeness, as it could be helpful should such a design exist. As for whether there should be some condition around this, I suppose that's reasonable, but I'm not really sure what would make sense as a condition. 
As it stands, drm_get_edid_acpi() only returns a value if at least one of the VGA or 3D controllers on the system provides an ACPI _DDC method, and if that ACPI method successfully returns an EDID. On the caller's end, it's currently part of the path where the radeon driver is already trying to fall back to a hardcoded EDID provided by the system. Perhaps we could instead call it within the LVDS || eDP condition here? if (rdev->is_atom_bios) { /* some laptops provide a hardcoded edid in rom for LCDs */ if (((connector->connector_type == DRM_MODE_CONNECTOR_LVDS) || (connector->connector_type == DRM_MODE_CONNECTOR_eDP))) radeon_connector->edid = radeon_bios_get_hardcoded_edid(rdev); } else { /* some servers provide a hardcoded edid in rom for KVMs */ radeon_connector->edid = radeon_bios_get_hardcoded_edid(rdev); } That would be more in line with the changes in this patchset for i915 and nouveau. Regards, Christian. } static struct radeon_i2c_bus_rec combios_setup_i2c_bus(struct radeon_device *rdev, ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[Linux-kernel-mentees] [PATCH] drm/amdgpu: Prevent kernel-infoleak in amdgpu_info_ioctl()
Compiler leaves a 4-byte hole near the end of `dev_info`, causing amdgpu_info_ioctl() to copy uninitialized kernel stack memory to userspace when `size` is greater than 356. In 2015 we tried to fix this issue by doing `= {};` on `dev_info`, which unfortunately does not initialize that 4-byte hole. Fix it by using memset() instead. Cc: sta...@vger.kernel.org Fixes: c193fa91b918 ("drm/amdgpu: information leak in amdgpu_info_ioctl()") Fixes: d38ceaf99ed0 ("drm/amdgpu: add core driver (v4)") Suggested-by: Dan Carpenter Signed-off-by: Peilin Ye --- $ pahole -C "drm_amdgpu_info_device" drivers/gpu/drm/amd/amdgpu/amdgpu_kms.o struct drm_amdgpu_info_device { __u32 device_id;/* 0 4 */ __u32 chip_rev; /* 4 4 */ __u32 external_rev; /* 8 4 */ __u32 pci_rev; /*12 4 */ __u32 family; /*16 4 */ __u32 num_shader_engines; /*20 4 */ __u32 num_shader_arrays_per_engine; /*24 4 */ __u32 gpu_counter_freq; /*28 4 */ __u64 max_engine_clock; /*32 8 */ __u64 max_memory_clock; /*40 8 */ __u32 cu_active_number; /*48 4 */ __u32 cu_ao_mask; /*52 4 */ __u32 cu_bitmap[4][4]; /*5664 */ /* --- cacheline 1 boundary (64 bytes) was 56 bytes ago --- */ __u32 enabled_rb_pipes_mask; /* 120 4 */ __u32 num_rb_pipes; /* 124 4 */ /* --- cacheline 2 boundary (128 bytes) --- */ __u32 num_hw_gfx_contexts; /* 128 4 */ __u32 _pad; /* 132 4 */ __u64 ids_flags;/* 136 8 */ __u64 virtual_address_offset; /* 144 8 */ __u64 virtual_address_max; /* 152 8 */ __u32 virtual_address_alignment; /* 160 4 */ __u32 pte_fragment_size;/* 164 4 */ __u32 gart_page_size; /* 168 4 */ __u32 ce_ram_size; /* 172 4 */ __u32 vram_type;/* 176 4 */ __u32 vram_bit_width; /* 180 4 */ __u32 vce_harvest_config; /* 184 4 */ __u32 gc_double_offchip_lds_buf; /* 188 4 */ /* --- cacheline 3 boundary (192 bytes) --- */ __u64 prim_buf_gpu_addr;/* 192 8 */ __u64 pos_buf_gpu_addr; /* 200 8 */ __u64 cntl_sb_buf_gpu_addr; /* 208 8 */ __u64 param_buf_gpu_addr; /* 216 8 */ __u32 prim_buf_size;/* 224 4 */ __u32 pos_buf_size; /* 228 4 */ __u32 cntl_sb_buf_size; 
/* 232 4 */ __u32 param_buf_size; /* 236 4 */ __u32 wave_front_size; /* 240 4 */ __u32 num_shader_visible_vgprs; /* 244 4 */ __u32 num_cu_per_sh;/* 248 4 */ __u32 num_tcc_blocks; /* 252 4 */ /* --- cacheline 4 boundary (256 bytes) --- */ __u32 gs_vgt_table_depth; /* 256 4 */ __u32 gs_prim_buffer_depth; /* 260 4 */ __u32 max_gs_waves_per_vgt; /* 264 4 */ __u32 _pad1;/* 268 4 */ __u32 cu_ao_bitmap[4][4]; /* 27264 */ /* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */ __u64 high_va_offset; /* 336 8 */ __u64 high_va_max; /* 344 8 */ __u32 pa_sc_tile_steering_override; /* 352 4 */ /* XXX 4 bytes hole, try to pack */ __u64 tcc_disabled_mask;/* 360 8 */ /* size: 368, cachelines: 6, members: 49 */ /* sum members: 364, holes: 1, sum holes: 4 */ /* last cacheline: 48 bytes */ }; drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git
Re: [PATCH] drm/amdgpu: fix PSP autoload twice in FLR
On 2020-07-28 2:04 p.m., Luben Tuikov wrote: > Thanks for removing the braces. > > On 2020-07-27 10:29 p.m., Liu ChengZhe wrote: >> the block->status.hw = false assignment will overwrite PSP's previous > ^^ > You want to start a sentence here. Capitalize "The". > Also don't use future tense in commit descriptions (and commit titles). > Simply use present tense. Using future tense makes it confusing if > this is what the code used to do before this change or if the code > is doing this right now as someone is reading the commit in the future with > "git log". > >> hw status, which will cause PSP execute resume operation after hw init. > > I've found it best to describe what's being done as if telling a story. > Break it down into "tell what's happening" and "tell what's fixed and > how it affects the rest of the system". Something like this: > > Assigning false to block->status.hw overwrites PSP's previous > hardware status, which causes the PSP to resume operation after > hardware init. > > Remove this assignment and let the PSP start when it is told to. > > Check if the above rendition of your change is correct, and use it if so. Double checking now, since "resume" is an op, you should capitalize it. "... which causes the PSP to execute Resume operation right after hardware init. Remove this assignment and let the PSP execute Resume operation when it is told to do so." Or something to that effect. Regards, Luben > > Regards, > Luben > >> >> v2: (R)remove the braces(.) 
> > > >> >> Signed-off-by: Liu ChengZhe >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- >> 1 file changed, 3 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 62ecac97fbd2..5d9affa1d35a 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2574,6 +2574,9 @@ static int amdgpu_device_ip_reinit_early_sriov(struct >> amdgpu_device *adev) >> AMD_IP_BLOCK_TYPE_IH, >> }; >> >> +for (i = 0; i < adev->num_ip_blocks; i++) >> +adev->ip_blocks[i].status.hw = false; >> + >> for (i = 0; i < ARRAY_SIZE(ip_order); i++) { >> int j; >> struct amdgpu_ip_block *block; >> @@ -2581,7 +2584,6 @@ static int amdgpu_device_ip_reinit_early_sriov(struct >> amdgpu_device *adev) >> for (j = 0; j < adev->num_ip_blocks; j++) { >> block = >ip_blocks[j]; >> >> -block->status.hw = false; >> if (block->version->type != ip_order[i] || >> !block->status.valid) >> continue; >> > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit
On 2020-07-28 1:27 a.m., Tianci Yin wrote: > From: "Tianci.Yin" > > On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, " are lost " > reconfigure the golden settings after GFXOFF exit. " so reconfigure ..." > > Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9 > Signed-off-by: Tianci.Yin > --- > drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++ > 1 file changed, 11 insertions(+) > > diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > index 55463e7a11e2..5da0436d41e0 100644 > --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle, > > struct smu_context *smu = (struct smu_context*)(handle); > struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm); > + struct amdgpu_device *adev = smu->adev; > > if (!smu->is_apu && !smu_dpm_ctx->dpm_context) > return -EINVAL; > @@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle, > amdgpu_device_ip_set_clockgating_state(smu->adev, > > AMD_IP_BLOCK_TYPE_GFX, > > AMD_CG_STATE_UNGATE); > + > + if (adev->asic_type >= CHIP_NAVI10 && > + adev->asic_type <= CHIP_NAVI12 && > + (adev->pm.pp_feature & PP_GFXOFF_MASK)) { > + if (adev->gfx.funcs->init_spm_golden) { > + dev_dbg(adev->dev,"GFXOFF exited, > re-init SPM golden settings\n"); Space after comma is required. > + amdgpu_gfx_init_spm_golden(adev); > + } else > + dev_warn(adev->dev,"Callback > init_spm_golden is NULL\n"); Space after comma is required. Please add braces to the single statement of the "else". The reason for this is that it complements the braces of the "if ( ) {" of the multi-line statement and closes the block. "checkpatch" calls it "unbalanced braces". 
With these three fixed, this patch is Reviewed-by: Luben Tuikov Regards, Luben > + } > } > } else { > /* exit umd pstate, restore level, enable gfx cg*/ > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm/amdgpu: add interface amdgpu_gfx_init_spm_golden for Navi1x
On 2020-07-28 1:27 a.m., Tianci Yin wrote: > From: "Tianci.Yin" > > On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, Use present tense:... " are lost after " > reconfiguration is needed. Make the configuration code as an interface for Add "so a reconfiguration is needed. " > future use. > If the lines of your commit message are too long, then "git push" complains about them. Sixty char wide is perfect, since "git log" indents them when displaying them. With this fixed, then Reviewed-by: Luben Tuikov Regards, Luben > Change-Id: I172f3dc7f59da69b0364052dcad75a9c9aab019e > Signed-off-by: Tianci.Yin > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 34 ++--- > 2 files changed, 27 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > index 1e7a2b0997c5..a611e78dd4ba 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > @@ -216,6 +216,7 @@ struct amdgpu_gfx_funcs { > int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if); > int (*query_ras_error_count) (struct amdgpu_device *adev, void > *ras_error_status); > void (*reset_ras_error_count) (struct amdgpu_device *adev); > + void (*init_spm_golden)(struct amdgpu_device *adev); > }; > > struct sq_work { > @@ -324,6 +325,7 @@ struct amdgpu_gfx { > #define amdgpu_gfx_get_gpu_clock_counter(adev) > (adev)->gfx.funcs->get_gpu_clock_counter((adev)) > #define amdgpu_gfx_select_se_sh(adev, se, sh, instance) > (adev)->gfx.funcs->select_se_sh((adev), (se), (sh), (instance)) > #define amdgpu_gfx_select_me_pipe_q(adev, me, pipe, q, vmid) > (adev)->gfx.funcs->select_me_pipe_q((adev), (me), (pipe), (q), (vmid)) > +#define amdgpu_gfx_init_spm_golden(adev) > (adev)->gfx.funcs->init_spm_golden((adev)) > > /** > * amdgpu_gfx_create_bitmask - create a bitmask > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index db9f1e89a0f8..da21ad04ac0f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -3307,6 +3307,29 @@ static void gfx_v10_0_set_kiq_pm4_funcs(struct > amdgpu_device *adev) > adev->gfx.kiq.pmf = _v10_0_kiq_pm4_funcs; > } > > +static void gfx_v10_0_init_spm_golden_registers(struct amdgpu_device *adev) > +{ > + switch (adev->asic_type) { > + case CHIP_NAVI10: > + soc15_program_register_sequence(adev, > + > golden_settings_gc_rlc_spm_10_0_nv10, > + (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10)); > + break; > + case CHIP_NAVI14: > + soc15_program_register_sequence(adev, > + > golden_settings_gc_rlc_spm_10_1_nv14, > + (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_nv14)); > + break; > + case CHIP_NAVI12: > + soc15_program_register_sequence(adev, > + > golden_settings_gc_rlc_spm_10_1_2_nv12, > + (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_2_nv12)); > + break; > + default: > + break; > + } > +} > + > static void gfx_v10_0_init_golden_registers(struct amdgpu_device *adev) > { > switch (adev->asic_type) { > @@ -3317,9 +3340,6 @@ static void gfx_v10_0_init_golden_registers(struct > amdgpu_device *adev) > soc15_program_register_sequence(adev, > golden_settings_gc_10_0_nv10, > (const > u32)ARRAY_SIZE(golden_settings_gc_10_0_nv10)); > - soc15_program_register_sequence(adev, > - > golden_settings_gc_rlc_spm_10_0_nv10, > - (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10)); > break; > case CHIP_NAVI14: > soc15_program_register_sequence(adev, > @@ -3328,9 +3348,6 @@ static void gfx_v10_0_init_golden_registers(struct > amdgpu_device *adev) > soc15_program_register_sequence(adev, > golden_settings_gc_10_1_nv14, > (const > u32)ARRAY_SIZE(golden_settings_gc_10_1_nv14)); > - soc15_program_register_sequence(adev, > - > golden_settings_gc_rlc_spm_10_1_nv14, > - (const > u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_nv14)); > break; > case 
CHIP_NAVI12: >
[PATCH 1/1] drm/ttm: fix offset in VMAs with a pg_offs in ttm_bo_vm_access
VMAs with a pg_offs that's offset from the start of the vma_node need to adjust the offset within the BO accordingly. This matches the offset calculation in ttm_bo_vm_fault_reserved. Signed-off-by: Felix Kuehling Tested-by: Laurent Morichetti --- drivers/gpu/drm/ttm/ttm_bo_vm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 389128b8c4dd..60b41447bec8 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -405,8 +405,10 @@ static int ttm_bo_vm_access_kmap(struct ttm_buffer_object *bo, int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { - unsigned long offset = (addr) - vma->vm_start; struct ttm_buffer_object *bo = vma->vm_private_data; + unsigned long offset = (addr) - vma->vm_start + + ((vma->vm_pgoff - drm_vma_node_start(&bo->base.vma_node)) +<< PAGE_SHIFT); int ret; if (len < 1 || (offset + len) >> PAGE_SHIFT > bo->num_pages) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm amdgpu: Skip tmr load for SRIOV
On 2020-07-28 1:36 a.m., Liu ChengZhe wrote: > 1. For Navi12, CHIP_SIENNA_CICHLID, skip tmr load operation; > 2. Check pointer before release firmware. > > v2: use CHIP_SIENNA_CICHLID instead > v3: remove local "bool ret"; fix grammer issue > v4: use my name instead of "root" > Don't indent any lines. > Signed-off-by: Liu ChengZhe > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 35 - > 1 file changed, 29 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > index a053b7af0680..7f18286a0cc2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > @@ -193,12 +193,18 @@ static int psp_sw_fini(void *handle) > struct amdgpu_device *adev = (struct amdgpu_device *)handle; > > psp_memory_training_fini(>psp); > - release_firmware(adev->psp.sos_fw); > - adev->psp.sos_fw = NULL; > - release_firmware(adev->psp.asd_fw); > - adev->psp.asd_fw = NULL; > - release_firmware(adev->psp.ta_fw); > - adev->psp.ta_fw = NULL; > + if (adev->psp.sos_fw) { > + release_firmware(adev->psp.sos_fw); > + adev->psp.sos_fw = NULL; > + } > + if (adev->psp.asd_fw) { > + release_firmware(adev->psp.asd_fw); > + adev->psp.asd_fw = NULL; > + } > + if (adev->psp.ta_fw) { > + release_firmware(adev->psp.ta_fw); > + adev->psp.ta_fw = NULL; > + } > > if (adev->asic_type == CHIP_NAVI10) > psp_sysfs_fini(adev); > @@ -409,11 +415,28 @@ static int psp_clear_vf_fw(struct psp_context *psp) > return ret; > } > > +static bool psp_skip_tmr(struct psp_context *psp) > +{ > + switch (psp->adev->asic_type) { > + case CHIP_NAVI12: > + case CHIP_SIENNA_CICHLID: > + return true; > + default: > + return false; > + } > +} Yeah, that's very nice now. > + > static int psp_tmr_load(struct psp_context *psp) > { > int ret; > struct psp_gfx_cmd_resp *cmd; > Fix this: > + /* (F)for Navi12 and CHIP_SIENNA_CICHLID SRIOV, do not set up TMR(.) > + * (A)already set( )up by host driver(.) 
Thanks, Luben > + */ > + if (amdgpu_sriov_vf(psp->adev) && psp_skip_tmr(psp)) > + return 0; > + > cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); > if (!cmd) > return -ENOMEM; > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm amdgpu: Skip tmr load for SRIOV
Thanks for this patch. On 2020-07-28 1:12 a.m., Liu ChengZhe wrote: > From: root You should fix your Git setup to show proper user name, not "root". I've prepared a Confluence page which shows a way to do it, and a few other things along the way: http://confluence.amd.com/display/~ltuikov/Git+Setup > > 1. For Navi12, CHIP_SIENNA_CICHLID, skip tmr load operation; > 2. Check pointer before release firmware. > > v2: use CHIP_SIENNA_CICHLID instead > v3: remove local "bool ret"; fix grammer issue > Signed-off-by: root You're missing an empty line between your commit message and the Signed-off-by: line. Also please do not indent your commit message. "git log" already indents it and it would look too indented to the right. Below for more: > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 35 - > 1 file changed, 29 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > index a053b7af0680..7f18286a0cc2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > @@ -193,12 +193,18 @@ static int psp_sw_fini(void *handle) > struct amdgpu_device *adev = (struct amdgpu_device *)handle; > > psp_memory_training_fini(>psp); > - release_firmware(adev->psp.sos_fw); > - adev->psp.sos_fw = NULL; > - release_firmware(adev->psp.asd_fw); > - adev->psp.asd_fw = NULL; > - release_firmware(adev->psp.ta_fw); > - adev->psp.ta_fw = NULL; > + if (adev->psp.sos_fw) { > + release_firmware(adev->psp.sos_fw); > + adev->psp.sos_fw = NULL; > + } > + if (adev->psp.asd_fw) { > + release_firmware(adev->psp.asd_fw); > + adev->psp.asd_fw = NULL; > + } > + if (adev->psp.ta_fw) { > + release_firmware(adev->psp.ta_fw); > + adev->psp.ta_fw = NULL; > + } > > if (adev->asic_type == CHIP_NAVI10) > psp_sysfs_fini(adev); > @@ -409,11 +415,28 @@ static int psp_clear_vf_fw(struct psp_context *psp) > return ret; > } > > +static bool psp_skip_tmr(struct psp_context *psp) > +{ > + switch 
(psp->adev->asic_type) { > + case CHIP_NAVI12: > + case CHIP_SIENNA_CICHLID: > + return true; > + default: > + return false; > + } > +} > + > static int psp_tmr_load(struct psp_context *psp) > { > int ret; > struct psp_gfx_cmd_resp *cmd; > > + /* for Navi12 and CHIP_SIENNA_CICHLID SRIOV, do not set up TMR > + * (already setup by host driver) Thanks for fixing noun "setup" to verb "set up". But there is another "already setup by" should be "already set up by the host driver". Thanks and regards, Luben > + */ > + if (amdgpu_sriov_vf(psp->adev) && psp_skip_tmr(psp)) > + return 0; > + > cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); > if (!cmd) > return -ENOMEM; > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu: fix PSP autoload twice in FLR
Thanks for removing the braces. On 2020-07-27 10:29 p.m., Liu ChengZhe wrote: > the block->status.hw = false assignment will overwrite PSP's previous ^^ You want to start a sentence here. Capitalize "The". Also don't use future tense in commit descriptions (and commit titles). Simply use present tense. Using future tense makes it confusing if this is what the code used to do before this change or if the code is doing this right now as someone is reading the commit in the future with "git log". > hw status, which will cause PSP execute resume operation after hw init. I've found it best to describe what's being done as if telling a story. Break it down into "tell what's happening" and "tell what's fixed and how it affects the rest of the system". Something like this: Assigning false to block->status.hw overwrites PSP's previous hardware status, which causes the PSP to resume operation after hardware init. Remove this assignment and let the PSP start when it is told to. Check if the above rendition of your change is correct, and use it if so. Regards, Luben > > v2: (R)remove the braces(.) 
> > Signed-off-by: Liu ChengZhe > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 62ecac97fbd2..5d9affa1d35a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2574,6 +2574,9 @@ static int amdgpu_device_ip_reinit_early_sriov(struct > amdgpu_device *adev) > AMD_IP_BLOCK_TYPE_IH, > }; > > + for (i = 0; i < adev->num_ip_blocks; i++) > + adev->ip_blocks[i].status.hw = false; > + > for (i = 0; i < ARRAY_SIZE(ip_order); i++) { > int j; > struct amdgpu_ip_block *block; > @@ -2581,7 +2584,6 @@ static int amdgpu_device_ip_reinit_early_sriov(struct > amdgpu_device *adev) > for (j = 0; j < adev->num_ip_blocks; j++) { > block = &adev->ip_blocks[j]; > > - block->status.hw = false; > if (block->version->type != ip_order[i] || > !block->status.valid) > continue; > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] amdgpu_dm: fix nonblocking atomic commit use-after-free
On 2020-07-28 5:22 a.m., Paul Menzel wrote: Dear Linux folks, Am 25.07.20 um 07:20 schrieb Mazin Rezk: On Saturday, July 25, 2020 12:59 AM, Duncan wrote: On Sat, 25 Jul 2020 03:03:52 + Mazin Rezk wrote: Am 24.07.20 um 19:33 schrieb Kees Cook: There was a fix to disable the async path for this driver that worked around the bug too, yes? That seems like a safer and more focused change that doesn't revert the SLUB defense for all users, and would actually provide a complete, I think, workaround That said, I haven't seen the async disabling patch. If you could link to it, I'd be glad to test it out and perhaps we can use that instead. I'm confused. Not to put words in Kees' mouth; /I/ am confused (which admittedly could well be just because I make no claims to be a coder and am simply reading the bug and thread, but I'd appreciate some "unconfusing" anyway). My interpretation of the "async disabling" reference was that it was to comment #30 on the bug: https://bugzilla.kernel.org/show_bug.cgi?id=207383#c30 ... which (if I'm not confused on this point too) appears to be yours. There it was stated... I've also found that this bug exclusively occurs when commit_work is on the workqueue. After forcing drm_atomic_helper_commit to run all of the commits without adding to the workqueue and running the OS, the issue seems to have disappeared. Would not forcing all commits to run directly, without placing them on the workqueue, be "async disabling"? That's what I /thought/ he was referencing. Oh, I thought he was referring to a different patch. Kees, could I get your confirmation on this? The change I made actually affected all of the DRM code, although this could easily be changed to be specific to amdgpu. (By forcing blocking on amdgpu_dm's non-blocking commit code) That said, I'd still need to test further because I only did test it for a couple of hours then. Although it should work in theory. 
OTOH your base/context swap idea sounds like a possibly "less disturbance" workaround, if it works, and given the point in the commit cycle... (But if it's out Sunday it's likely too late to test and get it in now anyway; if it's another week, tho...) The base/context swap idea should make the use-after-free behave how it did in 5.6. Since the bug doesn't cause an issue in 5.6, it's less of a "less disturbance" workaround and more of a "no disturbance" workaround. Sorry for bothering, but is there now a solution, besides reverting the commits, to avoid freezes/crashes *without* performance regressions? Kind regards, Paul Mazin's "drm/amd/display: Clear dm_state for fast updates" change accomplishes this, at least as a temporary hack. I've started work on a more large scale fix that we could get in in after. Regards, Nicholas Kazlauskas ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amd/display: parse ta firmware for navy_flounder
On Tue, Jul 28, 2020 at 11:43 AM Bhawanpreet Lakha wrote: > > Use the same case as sienna_cichlid > > Signed-off-by: Bhawanpreet Lakha Reviewed-by: Alex Deucher > --- > drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c > index d488d250805d..e16874f30d5d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c > @@ -179,12 +179,11 @@ static int psp_v11_0_init_microcode(struct psp_context > *psp) > } > break; > case CHIP_SIENNA_CICHLID: > + case CHIP_NAVY_FLOUNDER: > err = psp_init_ta_microcode(&adev->psp, chip_name); > if (err) > return err; > break; > - case CHIP_NAVY_FLOUNDER: > - break; > default: > BUG(); > } > -- > 2.17.1 > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amd/display: parse ta firmware for navy_flounder
Use the same case as sienna_cichlid Signed-off-by: Bhawanpreet Lakha --- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index d488d250805d..e16874f30d5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -179,12 +179,11 @@ static int psp_v11_0_init_microcode(struct psp_context *psp) } break; case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: err = psp_init_ta_microcode(&adev->psp, chip_name); if (err) return err; break; - case CHIP_NAVY_FLOUNDER: - break; default: BUG(); } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] amdgpu_dm: fix nonblocking atomic commit use-after-free
Dear Linux folks, Am 25.07.20 um 07:20 schrieb Mazin Rezk: On Saturday, July 25, 2020 12:59 AM, Duncan wrote: On Sat, 25 Jul 2020 03:03:52 + Mazin Rezk wrote: Am 24.07.20 um 19:33 schrieb Kees Cook: There was a fix to disable the async path for this driver that worked around the bug too, yes? That seems like a safer and more focused change that doesn't revert the SLUB defense for all users, and would actually provide a complete, I think, workaround That said, I haven't seen the async disabling patch. If you could link to it, I'd be glad to test it out and perhaps we can use that instead. I'm confused. Not to put words in Kees' mouth; /I/ am confused (which admittedly could well be just because I make no claims to be a coder and am simply reading the bug and thread, but I'd appreciate some "unconfusing" anyway). My interpretation of the "async disabling" reference was that it was to comment #30 on the bug: https://bugzilla.kernel.org/show_bug.cgi?id=207383#c30 ... which (if I'm not confused on this point too) appears to be yours. There it was stated... I've also found that this bug exclusively occurs when commit_work is on the workqueue. After forcing drm_atomic_helper_commit to run all of the commits without adding to the workqueue and running the OS, the issue seems to have disappeared. Would not forcing all commits to run directly, without placing them on the workqueue, be "async disabling"? That's what I /thought/ he was referencing. Oh, I thought he was referring to a different patch. Kees, could I get your confirmation on this? The change I made actually affected all of the DRM code, although this could easily be changed to be specific to amdgpu. (By forcing blocking on amdgpu_dm's non-blocking commit code) That said, I'd still need to test further because I only did test it for a couple of hours then. Although it should work in theory. 
OTOH your base/context swap idea sounds like a possibly "less disturbance" workaround, if it works, and given the point in the commit cycle... (But if it's out Sunday it's likely too late to test and get it in now anyway; if it's another week, tho...) The base/context swap idea should make the use-after-free behave how it did in 5.6. Since the bug doesn't cause an issue in 5.6, it's less of a "less disturbance" workaround and more of a "no disturbance" workaround. Sorry for bothering, but is there now a solution, besides reverting the commits, to avoid freezes/crashes *without* performance regressions? Kind regards, Paul ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)
Am 2020-07-28 um 5:00 a.m. schrieb Monk Liu: > what: > the MQD's save and restore of KCQ (kernel compute queue) > cost lots of clocks during world switch which impacts a lot > to multi-VF performance > > how: > introduce a paramter to control the number of KCQ to avoid > performance drop if there is no kernel compute queue needed > > notes: > this paramter only affects gfx 8/9/10 > > v2: > refine namings > > v3: > choose queues for each ring to that try best to cross pipes evenly. Thanks. Some more suggestions for simplifications inline. > > TODO: > in the future we will let hypervisor driver to set this paramter > automatically thus no need for user to configure it through > modprobe in virtual machine > > Signed-off-by: Monk Liu > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h| 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 4 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 58 > +++--- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 29 +++ > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 31 > 7 files changed, 87 insertions(+), 71 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index e97c088..de11136 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -201,6 +201,7 @@ extern int amdgpu_si_support; > #ifdef CONFIG_DRM_AMDGPU_CIK > extern int amdgpu_cik_support; > #endif > +extern int amdgpu_num_kcq; > > #define AMDGPU_VM_MAX_NUM_CTX4096 > #define AMDGPU_SG_THRESHOLD (256*1024*1024) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 62ecac9..cf445bab 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct > amdgpu_device *adev) > > amdgpu_gmc_tmz_set(adev); > > + if (amdgpu_num_kcq > 8 || 
amdgpu_num_kcq < 0) { > + amdgpu_num_kcq = 8; > + dev_warn(adev->dev, "set kernel compute queue number to 8 due > to invalid paramter provided by user\n"); > + } > + > return 0; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 6291f5f..b545c40 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -150,6 +150,7 @@ int amdgpu_noretry; > int amdgpu_force_asic_type = -1; > int amdgpu_tmz = 0; > int amdgpu_reset_method = -1; /* auto */ > +int amdgpu_num_kcq = -1; > > struct amdgpu_mgpu_info mgpu_info = { > .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), > @@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444); > MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = > legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)"); > module_param_named(reset_method, amdgpu_reset_method, int, 0444); > > +MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup > (8 if set to greater than 8 or less than 0, only affect gfx 8+)"); > +module_param_named(num_kcq, amdgpu_num_kcq, int, 0444); > + > static const struct pci_device_id pciidlist[] = { > #ifdef CONFIG_DRM_AMDGPU_SI > {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index 8eff017..f83a9a7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -202,40 +202,42 @@ bool amdgpu_gfx_is_high_priority_compute_queue(struct > amdgpu_device *adev, > > void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev) > { > - int i, queue, pipe, mec; > + int i, queue, pipe; > bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev); > + int max_queues_per_mec = min(adev->gfx.mec.num_pipe_per_mec * > + > adev->gfx.mec.num_queue_per_pipe, > + > adev->gfx.num_compute_rings); Indentation looks wrong. 
Did you use the wrong TAB size? > + > + if (multipipe_policy) { > + /* policy: make queues evenly cross all pipes on MEC1 only */ > + for (i = 0; i < max_queues_per_mec; i++) { > + pipe = i % adev->gfx.mec.num_pipe_per_mec; > + queue = (i / adev->gfx.mec.num_pipe_per_mec) % > + adev->gfx.mec.num_queue_per_pipe; > + > + set_bit(pipe * adev->gfx.mec.num_queue_per_pipe + queue, > + adev->gfx.mec.queue_bitmap); > + } > + } else { > + int
Re: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit(v2)
[AMD Public Use] Would it be better to put this code into amdgpu_gfx_off_ctrl()? Then we'll handle this in all cases where we disable gfx off. Alex From: Tianci Yin Sent: Tuesday, July 28, 2020 3:04 AM To: amd-gfx@lists.freedesktop.org Cc: Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Xu, Feifei ; Hesik, Christopher ; Swamy, Manjunatha ; Quan, Evan ; Chen, Guchun ; Feng, Kenneth ; Yin, Tianci (Rico) Subject: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit(v2) From: "Tianci.Yin" On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, reconfigure the golden settings after GFXOFF exit. Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9 Reviewed-by: Feifei Xu Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 55463e7a11e2..41487123c207 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle, struct smu_context *smu = (struct smu_context*)(handle); struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm); + struct amdgpu_device *adev = smu->adev; if (!smu->is_apu && !smu_dpm_ctx->dpm_context) return -EINVAL; @@ -1318,12 +1319,22 @@ static int smu_enable_umd_pstate(void *handle, if (*level & profile_mode_mask) { smu_dpm_ctx->saved_dpm_level = smu_dpm_ctx->dpm_level; smu_dpm_ctx->enable_umd_pstate = true; - amdgpu_device_ip_set_powergating_state(smu->adev, + amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX, AMD_PG_STATE_UNGATE); - amdgpu_device_ip_set_clockgating_state(smu->adev, + amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX, AMD_CG_STATE_UNGATE); + + if (adev->asic_type >= CHIP_NAVI10 && + adev->asic_type <= CHIP_NAVI12 && + (adev->pm.pp_feature & PP_GFXOFF_MASK)) { + if 
(adev->gfx.funcs->init_spm_golden) { + dev_dbg(adev->dev,"GFXOFF exited, re-init SPM golden settings\n"); + amdgpu_gfx_init_spm_golden(adev); + } else + dev_warn(adev->dev,"Callback init_spm_golden is NULL\n"); + } } } else { /* exit umd pstate, restore level, enable gfx cg*/ @@ -1331,10 +1342,10 @@ static int smu_enable_umd_pstate(void *handle, if (*level == AMD_DPM_FORCED_LEVEL_PROFILE_EXIT) *level = smu_dpm_ctx->saved_dpm_level; smu_dpm_ctx->enable_umd_pstate = false; - amdgpu_device_ip_set_clockgating_state(smu->adev, + amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX, AMD_CG_STATE_GATE); - amdgpu_device_ip_set_powergating_state(smu->adev, + amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX, AMD_PG_STATE_GATE); } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU
[AMD Public Use] Hi Dennis, Please check my response after yours. Regards, Guchun -Original Message- From: Li, Dennis Sent: Tuesday, July 28, 2020 5:43 PM To: Chen, Guchun ; amd-gfx@lists.freedesktop.org; Deucher, Alexander ; Zhang, Hawking ; Grodzovsky, Andrey ; Zhou1, Tao ; Clements, John ; Lazar, Lijo ; Koenig, Christian ; Yang, Stanley Subject: RE: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU [AMD Official Use Only - Internal Distribution Only] Hi, Guchun, Please see my below comments. Best Regards Dennis Li -Original Message- From: Chen, Guchun Sent: Tuesday, July 28, 2020 3:49 PM To: amd-gfx@lists.freedesktop.org; Deucher, Alexander ; Zhang, Hawking ; Li, Dennis ; Grodzovsky, Andrey ; Zhou1, Tao ; Clements, John ; Lazar, Lijo ; Koenig, Christian ; Yang, Stanley Cc: Chen, Guchun Subject: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU When retrieving bad gpu tag from eeprom, GPU init should fail as the GPU needs to be retired for further check. v2: Fix spelling typo, correct the condition to detect bad gpu tag and refine error message. v3: Refine function argument name. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 ++- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2662cd7c8685..30af0dfee1a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) * it should be called after amdgpu_device_ip_hw_init_phase2 since * for some ASICs the RAS EEPROM code relies on SMU fully functioning * for I2C communication which only true at this point. 
-* recovery_init may fail, but it can free all resources allocated by -* itself and its failure should not stop amdgpu init process. +* +* amdgpu_ras_recovery_init may fail, but the upper only cares the +* failure from bad gpu situation and stop amdgpu init process +* accordingly. For other failed cases, it will still release all +* the resource and print error message, rather than returning one +* negative value to upper level. * * Note: theoretically, this should be called before all vram allocations * to protect retired page from abusing */ - amdgpu_ras_recovery_init(adev); + r = amdgpu_ras_recovery_init(adev); + if (r) + goto init_failed; if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3c4c142e9d8a..56e1aeba2d64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; uint32_t max_eeprom_records_len = 0; + bool exc_err_limit = false; int ret; if (con) @@ -1843,9 +1844,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); - ret = amdgpu_ras_eeprom_init(>eeprom_control); - if (ret) + ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit); + /* +* We only fail this calling and halt booting up +* when exc_err_limit is true. +*/ + if (exc_err_limit) { + ret = -EINVAL; goto free; + } [Dennis Li] Compared with old codes, new change miss checking ret. [Guchun] Yeah, this hits me that another if condition is that ret should be checked as well when exc_err_limit is false, that means there is some problem with eeprom i2c functionality. It will be addressed in next patch set. 
if (con->eeprom_control.num_recs) { ret = amdgpu_ras_load_bad_pages(adev); @@ -1868,6 +1875,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) out: dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); + /* +* Except error threshold exceeding case, other failure cases in this +* function would not fail amdgpu driver init. +*/ + if (!exc_err_limit) + ret = 0; + return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index
Re: [PATCH] drm/amd/powerplay: update driver if version for navy_flounder
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Kenneth Feng 在 2020/7/28 下午7:21,“Jiansong Chen” 写入: It's in accordance with pmfw 65.5.0 for navy_flounder. Signed-off-by: Jiansong Chen Change-Id: I984a1147030264adbc02230e2e1dd416d4ad63b0 --- drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h index 9504f9954fd3..6a42331aba8a 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h +++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h @@ -31,7 +31,7 @@ #define SMU11_DRIVER_IF_VERSION_NV12 0x33 #define SMU11_DRIVER_IF_VERSION_NV14 0x36 #define SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x34 -#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x2 +#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x3 /* MP Apertures */ #define MP0_Public0x0380 -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amd/display: Clear dm_state for fast updates
On Monday, July 27, 2020 7:42 PM, Mazin Rezk wrote: > On Monday, July 27, 2020 5:32 PM, Daniel Vetter wrote: > > > On Mon, Jul 27, 2020 at 11:11 PM Mazin Rezk wrote: > > > > > > On Monday, July 27, 2020 4:29 PM, Daniel Vetter wrote: > > > > > > > On Mon, Jul 27, 2020 at 9:28 PM Christian König > > > > wrote: > > > > > > > > > > Am 27.07.20 um 16:05 schrieb Kazlauskas, Nicholas: > > > > > > On 2020-07-27 9:39 a.m., Christian König wrote: > > > > > >> Am 27.07.20 um 07:40 schrieb Mazin Rezk: > > > > > >>> This patch fixes a race condition that causes a use-after-free > > > > > >>> during > > > > > >>> amdgpu_dm_atomic_commit_tail. This can occur when 2 non-blocking > > > > > >>> commits > > > > > >>> are requested and the second one finishes before the first. > > > > > >>> Essentially, > > > > > >>> this bug occurs when the following sequence of events happens: > > > > > >>> > > > > > >>> 1. Non-blocking commit #1 is requested w/ a new dm_state #1 and is > > > > > >>> deferred to the workqueue. > > > > > >>> > > > > > >>> 2. Non-blocking commit #2 is requested w/ a new dm_state #2 and is > > > > > >>> deferred to the workqueue. > > > > > >>> > > > > > >>> 3. Commit #2 starts before commit #1, dm_state #1 is used in the > > > > > >>> commit_tail and commit #2 completes, freeing dm_state #1. > > > > > >>> > > > > > >>> 4. Commit #1 starts after commit #2 completes, uses the freed > > > > > >>> dm_state > > > > > >>> 1 and dereferences a freelist pointer while setting the context. > > > > > >> > > > > > >> Well I only have a one mile high view on this, but why don't you > > > > > >> let > > > > > >> the work items execute in order? > > > > > >> > > > > > >> That would be better anyway cause this way we don't trigger a cache > > > > > >> line ping pong between CPUs. > > > > > >> > > > > > >> Christian. 
> > > > > > > > > > > > We use the DRM helpers for managing drm_atomic_commit_state and > > > > > > those > > > > > > helpers internally push non-blocking commit work into the system > > > > > > unbound work queue. > > > > > > > > > > Mhm, well if you send those helper atomic commits in the order A,B and > > > > > they execute it in the order B,A I would call that a bug :) > > > > > > > > The way it works is it pushes all commits into unbound work queue, but > > > > then forces serialization as needed. We do _not_ want e.g. updates on > > > > different CRTC to be serialized, that would result in lots of judder. > > > > And hw is funny enough that there's all kinds of dependencies. > > > > > > > > The way you force synchronization is by adding other CRTC state > > > > objects. So if DC is busted and can only handle a single update per > > > > work item, then I guess you always need all CRTC states and everything > > > > will be run in order. But that also totally kills modern multi-screen > > > > compositors. Xorg isn't modern, just in case that's not clear :-) > > > > > > > > Lucking at the code it seems like you indeed have only a single dm > > > > state, so yeah global sync is what you'll need as immediate fix, and > > > > then maybe fix up DM to not be quite so silly ... or at least only do > > > > the dm state stuff when really needed. > > > > > > > > We could also sprinkle the drm_crtc_commit structure around a bit > > > > (it's the glue that provides the synchronization across commits), but > > > > since your dm state is global just grabbing all crtc states > > > > unconditionally as part of that is probably best. > > > > > > > > > > While we could duplicate a copy of that code with nothing but the > > > > > > workqueue changed that isn't something I'd really like to maintain > > > > > > going forward. > > > > > > > > > > I'm not talking about duplicating the code, I'm talking about fixing > > > > > the > > > > > helpers. 
I don't know that code well, but from the outside it sounds > > > > > like a bug there. > > > > > > > > > > And executing work items in the order they are submitted is trivial. > > > > > > > > > > Had anybody pinged Daniel or other people familiar with the helper > > > > > code > > > > > about it? > > > > > > > > Yeah something is wrong here, and the fix looks horrible :-) > > > > > > > > Aside, I've also seen some recent discussion flare up about > > > > drm_atomic_state_get/put used to paper over some other use-after-free, > > > > but this time related to interrupt handlers. Maybe a few rules about > > > > that: > > > > - dont > > > > - especially not when it's interrupt handlers, because you can't call > > > > drm_atomic_state_put from interrupt handlers. > > > > > > > > Instead have an spin_lock_irq to protect the shared date with your > > > > interrupt handler, and _copy_ the date over. This is e.g. what > > > > drm_crtc_arm_vblank_event does. > > > > > > Nicholas wrote a patch that attempted to resolve the issue by adding every > > > CRTC into the commit to use use the stall checks. [1] While this forces > > > synchronisation on commits, it's kind of a hacky method that
Re: [PATCH] drm/amd/display: Clear dm_state for fast updates
On Monday, July 27, 2020 4:29 PM, Daniel Vetter wrote: > On Mon, Jul 27, 2020 at 9:28 PM Christian König > wrote: > > > > Am 27.07.20 um 16:05 schrieb Kazlauskas, Nicholas: > > > On 2020-07-27 9:39 a.m., Christian König wrote: > > >> Am 27.07.20 um 07:40 schrieb Mazin Rezk: > > >>> This patch fixes a race condition that causes a use-after-free during > > >>> amdgpu_dm_atomic_commit_tail. This can occur when 2 non-blocking > > >>> commits > > >>> are requested and the second one finishes before the first. > > >>> Essentially, > > >>> this bug occurs when the following sequence of events happens: > > >>> > > >>> 1. Non-blocking commit #1 is requested w/ a new dm_state #1 and is > > >>> deferred to the workqueue. > > >>> > > >>> 2. Non-blocking commit #2 is requested w/ a new dm_state #2 and is > > >>> deferred to the workqueue. > > >>> > > >>> 3. Commit #2 starts before commit #1, dm_state #1 is used in the > > >>> commit_tail and commit #2 completes, freeing dm_state #1. > > >>> > > >>> 4. Commit #1 starts after commit #2 completes, uses the freed dm_state > > >>> 1 and dereferences a freelist pointer while setting the context. > > >> > > >> Well I only have a one mile high view on this, but why don't you let > > >> the work items execute in order? > > >> > > >> That would be better anyway cause this way we don't trigger a cache > > >> line ping pong between CPUs. > > >> > > >> Christian. > > > > > > We use the DRM helpers for managing drm_atomic_commit_state and those > > > helpers internally push non-blocking commit work into the system > > > unbound work queue. > > > > Mhm, well if you send those helper atomic commits in the order A,B and > > they execute it in the order B,A I would call that a bug :) > > The way it works is it pushes all commits into unbound work queue, but > then forces serialization as needed. We do _not_ want e.g. updates on > different CRTC to be serialized, that would result in lots of judder. 
> And hw is funny enough that there's all kinds of dependencies. > > The way you force synchronization is by adding other CRTC state > objects. So if DC is busted and can only handle a single update per > work item, then I guess you always need all CRTC states and everything > will be run in order. But that also totally kills modern multi-screen > compositors. Xorg isn't modern, just in case that's not clear :-) > > Lucking at the code it seems like you indeed have only a single dm > state, so yeah global sync is what you'll need as immediate fix, and > then maybe fix up DM to not be quite so silly ... or at least only do > the dm state stuff when really needed. > > We could also sprinkle the drm_crtc_commit structure around a bit > (it's the glue that provides the synchronization across commits), but > since your dm state is global just grabbing all crtc states > unconditionally as part of that is probably best. > > > > While we could duplicate a copy of that code with nothing but the > > > workqueue changed that isn't something I'd really like to maintain > > > going forward. > > > > I'm not talking about duplicating the code, I'm talking about fixing the > > helpers. I don't know that code well, but from the outside it sounds > > like a bug there. > > > > And executing work items in the order they are submitted is trivial. > > > > Had anybody pinged Daniel or other people familiar with the helper code > > about it? > > Yeah something is wrong here, and the fix looks horrible :-) > > Aside, I've also seen some recent discussion flare up about > drm_atomic_state_get/put used to paper over some other use-after-free, > but this time related to interrupt handlers. Maybe a few rules about > that: > - dont > - especially not when it's interrupt handlers, because you can't call > drm_atomic_state_put from interrupt handlers. > > Instead have an spin_lock_irq to protect the shared date with your > interrupt handler, and _copy_ the date over. This is e.g. 
what > drm_crtc_arm_vblank_event does. Nicholas wrote a patch that attempted to resolve the issue by adding every CRTC into the commit to use use the stall checks. [1] While this forces synchronisation on commits, it's kind of a hacky method that may take a toll on performance. Is it possible to have a DRM helper that forces synchronisation on some commits without having to add every CRTC into the commit? Also, is synchronisation really necessary for fast updates in amdgpu? I'll admit, the idea of eliminating the use-after-free bug by eliminating the use entirely doesn't seem ideal; but is forcing synchronisation on these updates that much better? [1] https://bugzilla.kernel.org/show_bug.cgi?id=207383#c96 Thanks, Mazin Rezk > > Cheers, Daniel > > > > > Regards, > > Christian. > > > > > > > > Regards, > > > Nicholas Kazlauskas > > > > > >> > > >>> > > >>> Since this bug has only been spotted with fast commits, this patch > > >>> fixes > > >>> the bug by clearing the dm_state instead of using the old dc_state for > > >>> fast updates. In
Re: [PATCH] drm/amd/display: Clear dm_state for fast updates
On Monday, July 27, 2020 5:32 PM, Daniel Vetter wrote: > On Mon, Jul 27, 2020 at 11:11 PM Mazin Rezk wrote: > > > > On Monday, July 27, 2020 4:29 PM, Daniel Vetter wrote: > > > > > On Mon, Jul 27, 2020 at 9:28 PM Christian König > > > wrote: > > > > > > > > Am 27.07.20 um 16:05 schrieb Kazlauskas, Nicholas: > > > > > On 2020-07-27 9:39 a.m., Christian König wrote: > > > > >> Am 27.07.20 um 07:40 schrieb Mazin Rezk: > > > > >>> This patch fixes a race condition that causes a use-after-free > > > > >>> during > > > > >>> amdgpu_dm_atomic_commit_tail. This can occur when 2 non-blocking > > > > >>> commits > > > > >>> are requested and the second one finishes before the first. > > > > >>> Essentially, > > > > >>> this bug occurs when the following sequence of events happens: > > > > >>> > > > > >>> 1. Non-blocking commit #1 is requested w/ a new dm_state #1 and is > > > > >>> deferred to the workqueue. > > > > >>> > > > > >>> 2. Non-blocking commit #2 is requested w/ a new dm_state #2 and is > > > > >>> deferred to the workqueue. > > > > >>> > > > > >>> 3. Commit #2 starts before commit #1, dm_state #1 is used in the > > > > >>> commit_tail and commit #2 completes, freeing dm_state #1. > > > > >>> > > > > >>> 4. Commit #1 starts after commit #2 completes, uses the freed > > > > >>> dm_state > > > > >>> 1 and dereferences a freelist pointer while setting the context. > > > > >> > > > > >> Well I only have a one mile high view on this, but why don't you let > > > > >> the work items execute in order? > > > > >> > > > > >> That would be better anyway cause this way we don't trigger a cache > > > > >> line ping pong between CPUs. > > > > >> > > > > >> Christian. > > > > > > > > > > We use the DRM helpers for managing drm_atomic_commit_state and those > > > > > helpers internally push non-blocking commit work into the system > > > > > unbound work queue. 
> > > > > > > > Mhm, well if you send those helper atomic commits in the order A,B and > > > > they execute it in the order B,A I would call that a bug :) > > > > > > The way it works is it pushes all commits into unbound work queue, but > > > then forces serialization as needed. We do _not_ want e.g. updates on > > > different CRTC to be serialized, that would result in lots of judder. > > > And hw is funny enough that there's all kinds of dependencies. > > > > > > The way you force synchronization is by adding other CRTC state > > > objects. So if DC is busted and can only handle a single update per > > > work item, then I guess you always need all CRTC states and everything > > > will be run in order. But that also totally kills modern multi-screen > > > compositors. Xorg isn't modern, just in case that's not clear :-) > > > > > > Lucking at the code it seems like you indeed have only a single dm > > > state, so yeah global sync is what you'll need as immediate fix, and > > > then maybe fix up DM to not be quite so silly ... or at least only do > > > the dm state stuff when really needed. > > > > > > We could also sprinkle the drm_crtc_commit structure around a bit > > > (it's the glue that provides the synchronization across commits), but > > > since your dm state is global just grabbing all crtc states > > > unconditionally as part of that is probably best. > > > > > > > > While we could duplicate a copy of that code with nothing but the > > > > > workqueue changed that isn't something I'd really like to maintain > > > > > going forward. > > > > > > > > I'm not talking about duplicating the code, I'm talking about fixing the > > > > helpers. I don't know that code well, but from the outside it sounds > > > > like a bug there. > > > > > > > > And executing work items in the order they are submitted is trivial. > > > > > > > > Had anybody pinged Daniel or other people familiar with the helper code > > > > about it? 
> > > > > > Yeah something is wrong here, and the fix looks horrible :-) > > > > > > Aside, I've also seen some recent discussion flare up about > > > drm_atomic_state_get/put used to paper over some other use-after-free, > > > but this time related to interrupt handlers. Maybe a few rules about > > > that: > > > - dont > > > - especially not when it's interrupt handlers, because you can't call > > > drm_atomic_state_put from interrupt handlers. > > > > > > Instead have an spin_lock_irq to protect the shared date with your > > > interrupt handler, and _copy_ the date over. This is e.g. what > > > drm_crtc_arm_vblank_event does. > > > > Nicholas wrote a patch that attempted to resolve the issue by adding every > > CRTC into the commit to use use the stall checks. [1] While this forces > > synchronisation on commits, it's kind of a hacky method that may take a > > toll on performance. > > > > Is it possible to have a DRM helper that forces synchronisation on some > > commits without having to add every CRTC into the commit? > > > > Also, is synchronisation really necessary for fast updates in amdgpu? > > I'll admit, the idea of eliminating the
[PATCH] drm/amd/powerplay: update driver if version for navy_flounder
It's in accordance with pmfw 65.5.0 for navy_flounder. Signed-off-by: Jiansong Chen Change-Id: I984a1147030264adbc02230e2e1dd416d4ad63b0 --- drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h index 9504f9954fd3..6a42331aba8a 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h +++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h @@ -31,7 +31,7 @@ #define SMU11_DRIVER_IF_VERSION_NV12 0x33 #define SMU11_DRIVER_IF_VERSION_NV14 0x36 #define SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x34 -#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x2 +#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x3 /* MP Apertures */ #define MP0_Public 0x0380 -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 4/4] drm/amdgpu: assign the cpu/gpu address of fence from ring
Am 28.07.20 um 12:21 schrieb Jack Xiao: assign the cpu/gpu address of fence for the normal or mes ring from ring structure. Signed-off-by: Jack Xiao Reviewed-by: Hawking Zhang Acked-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 58d4c219178a..0be3e2007387 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -407,8 +407,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, uint64_t index; if (ring->funcs->type != AMDGPU_RING_TYPE_UVD) { - ring->fence_drv.cpu_addr = &adev->wb.wb[ring->fence_offs]; - ring->fence_drv.gpu_addr = adev->wb.gpu_addr + (ring->fence_offs * 4); + ring->fence_drv.cpu_addr = ring->fence_cpu_addr; + ring->fence_drv.gpu_addr = ring->fence_gpu_addr; That doesn't look like a good idea to me. We should probably rather remove ring->fence_offs and move all the handling here instead. Christian. } else { /* put fence directly behind firmware */ index = ALIGN(adev->uvd.fw->size, 8); ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/4] drm/amdgpu: initialize the cpu/gpu address of rptr/wptr/fence
Am 28.07.20 um 12:21 schrieb Jack Xiao: Initialize the cpu/gpu address of rptr/wptr/fence. Signed-off-by: Jack Xiao --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 37 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 13ea8ebc421c..ff63ecc861bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -150,6 +150,12 @@ void amdgpu_ring_undo(struct amdgpu_ring *ring) ring->funcs->end_use(ring); } +#define amdgpu_ring_get_gpu_addr(ring, offset) \ + (ring->adev->wb.gpu_addr + offset * 4) + +#define amdgpu_ring_get_cpu_addr(ring, offset) \ + (&ring->adev->wb.wb[offset]) Those are not ring functions, but rather wb functions. Please clean that up. Christian. + /** * amdgpu_ring_init - init driver ring struct. * @@ -217,17 +223,38 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, "(%d) ring trail_fence_offs wb alloc failed\n", r); return r; } - ring->trail_fence_gpu_addr = - adev->wb.gpu_addr + (ring->trail_fence_offs * 4); - ring->trail_fence_cpu_addr = &adev->wb.wb[ring->trail_fence_offs]; r = amdgpu_device_wb_get(adev, &ring->cond_exe_offs); if (r) { dev_err(adev->dev, "(%d) ring cond_exec_polling wb alloc failed\n", r); return r; } - ring->cond_exe_gpu_addr = adev->wb.gpu_addr + (ring->cond_exe_offs * 4); - ring->cond_exe_cpu_addr = &adev->wb.wb[ring->cond_exe_offs]; + + ring->fence_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->fence_offs); + ring->fence_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->fence_offs); + + ring->rptr_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->rptr_offs); + ring->rptr_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->rptr_offs); + + ring->wptr_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->wptr_offs); + ring->wptr_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->wptr_offs); + + ring->trail_fence_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->trail_fence_offs); +
ring->trail_fence_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->trail_fence_offs); + + ring->cond_exe_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->cond_exe_offs); + ring->cond_exe_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->cond_exe_offs); + /* always set cond_exec_polling to CONTINUE */ *ring->cond_exe_cpu_addr = 1; ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 4/4] drm/amdgpu: assign the cpu/gpu address of fence from ring
[AMD Public Use] Series is Reviewed-by: Le Ma Regards, Ma Le -Original Message- From: Xiao, Jack Sent: Tuesday, July 28, 2020 6:22 PM To: amd-gfx@lists.freedesktop.org; Deucher, Alexander ; Zhang, Hawking ; Koenig, Christian ; Ma, Le Cc: Xiao, Jack ; Koenig, Christian Subject: [PATCH 4/4] drm/amdgpu: assign the cpu/gpu address of fence from ring assign the cpu/gpu address of fence for the normal or mes ring from ring structure. Signed-off-by: Jack Xiao Reviewed-by: Hawking Zhang Acked-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 58d4c219178a..0be3e2007387 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -407,8 +407,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, uint64_t index; if (ring->funcs->type != AMDGPU_RING_TYPE_UVD) { - ring->fence_drv.cpu_addr = &adev->wb.wb[ring->fence_offs]; - ring->fence_drv.gpu_addr = adev->wb.gpu_addr + (ring->fence_offs * 4); + ring->fence_drv.cpu_addr = ring->fence_cpu_addr; + ring->fence_drv.gpu_addr = ring->fence_gpu_addr; } else { /* put fence directly behind firmware */ index = ALIGN(adev->uvd.fw->size, 8); -- 2.26.2 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/4] drm/amdgpu: initialize the cpu/gpu address of rptr/wptr/fence
Initialize the cpu/gpu address of rptr/wptr/fence. Signed-off-by: Jack Xiao --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 37 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 13ea8ebc421c..ff63ecc861bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -150,6 +150,12 @@ void amdgpu_ring_undo(struct amdgpu_ring *ring) ring->funcs->end_use(ring); } +#define amdgpu_ring_get_gpu_addr(ring, offset) \ + (ring->adev->wb.gpu_addr + offset * 4) + +#define amdgpu_ring_get_cpu_addr(ring, offset) \ + (&ring->adev->wb.wb[offset]) + /** * amdgpu_ring_init - init driver ring struct. * @@ -217,17 +223,38 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, "(%d) ring trail_fence_offs wb alloc failed\n", r); return r; } - ring->trail_fence_gpu_addr = - adev->wb.gpu_addr + (ring->trail_fence_offs * 4); - ring->trail_fence_cpu_addr = &adev->wb.wb[ring->trail_fence_offs]; r = amdgpu_device_wb_get(adev, &ring->cond_exe_offs); if (r) { dev_err(adev->dev, "(%d) ring cond_exec_polling wb alloc failed\n", r); return r; } - ring->cond_exe_gpu_addr = adev->wb.gpu_addr + (ring->cond_exe_offs * 4); - ring->cond_exe_cpu_addr = &adev->wb.wb[ring->cond_exe_offs]; + + ring->fence_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->fence_offs); + ring->fence_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->fence_offs); + + ring->rptr_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->rptr_offs); + ring->rptr_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->rptr_offs); + + ring->wptr_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->wptr_offs); + ring->wptr_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->wptr_offs); + + ring->trail_fence_gpu_addr = + amdgpu_ring_get_gpu_addr(ring, ring->trail_fence_offs); + ring->trail_fence_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->trail_fence_offs); + + ring->cond_exe_gpu_addr = +
amdgpu_ring_get_gpu_addr(ring, ring->cond_exe_offs); + ring->cond_exe_cpu_addr = + amdgpu_ring_get_cpu_addr(ring, ring->cond_exe_offs); + /* always set cond_exec_polling to CONTINUE */ *ring->cond_exe_cpu_addr = 1; -- 2.26.2 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 3/4] drm/amdgpu: use ring structure to access rptr/wptr v2
Use ring structure to access the cpu/gpu address of rptr/wptr. v2: merge gfx10/sdma5/sdma5.2 patches Signed-off-by: Jack Xiao Reviewed-by: Christian König Reviewed-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 8 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 37 +- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 8 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 - drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 20 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 25 + drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c | 4 +-- drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c | 4 +-- drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c | 4 +-- drivers/gpu/drm/amd/amdgpu/mes_v10_1.c | 11 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c | 8 +++--- drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 16 +-- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 28 --- drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 16 +-- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 16 +-- drivers/gpu/drm/amd/amdgpu/si_dma.c| 4 +-- drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 6 ++--- drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 6 ++--- drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c | 12 - drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 12 - drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 12 - 21 files changed, 126 insertions(+), 143 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c index 20f108818b2b..a6a7aa9e9aec 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c @@ -164,7 +164,7 @@ static uint64_t cik_sdma_ring_get_rptr(struct amdgpu_ring *ring) { u32 rptr; - rptr = ring->adev->wb.wb[ring->rptr_offs]; + rptr = *ring->rptr_cpu_addr; return (rptr & 0x3fffc) >> 2; } @@ -432,12 +432,10 @@ static int cik_sdma_gfx_resume(struct amdgpu_device *adev) struct amdgpu_ring *ring; u32 rb_cntl, ib_cntl; u32 rb_bufsz; - u32 wb_offset; int i, j, r; for (i = 0; i < adev->sdma.num_instances; i++) { ring = >sdma.instance[i].ring; - wb_offset = (ring->rptr_offs * 4); mutex_lock(>srbm_mutex); for (j = 0; j < 16; j++) { @@ 
-473,9 +471,9 @@ static int cik_sdma_gfx_resume(struct amdgpu_device *adev) /* set the wb address whether it's enabled or not */ WREG32(mmSDMA0_GFX_RB_RPTR_ADDR_HI + sdma_offsets[i], - upper_32_bits(adev->wb.gpu_addr + wb_offset) & 0x); + upper_32_bits(ring->rptr_gpu_addr) & 0x); WREG32(mmSDMA0_GFX_RB_RPTR_ADDR_LO + sdma_offsets[i], - ((adev->wb.gpu_addr + wb_offset) & 0xFFFC)); + ((ring->rptr_gpu_addr) & 0xFFFC)); rb_cntl |= SDMA0_GFX_RB_CNTL__RPTR_WRITEBACK_ENABLE_MASK; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index db9f1e89a0f8..7036e286b627 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3205,9 +3205,8 @@ static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t queue static void gfx10_kiq_map_queues(struct amdgpu_ring *kiq_ring, struct amdgpu_ring *ring) { - struct amdgpu_device *adev = kiq_ring->adev; uint64_t mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj); - uint64_t wptr_addr = adev->wb.gpu_addr + (ring->wptr_offs * 4); + uint64_t wptr_addr = ring->wptr_gpu_addr; uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 
4 : 0; amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5)); @@ -5835,12 +5834,12 @@ static int gfx_v10_0_cp_gfx_resume(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, mmCP_RB0_WPTR_HI, upper_32_bits(ring->wptr)); /* set the wb address wether it's enabled or not */ - rptr_addr = adev->wb.gpu_addr + (ring->rptr_offs * 4); + rptr_addr = ring->rptr_gpu_addr; WREG32_SOC15(GC, 0, mmCP_RB0_RPTR_ADDR, lower_32_bits(rptr_addr)); WREG32_SOC15(GC, 0, mmCP_RB0_RPTR_ADDR_HI, upper_32_bits(rptr_addr) & CP_RB_RPTR_ADDR_HI__RB_RPTR_ADDR_HI_MASK); - wptr_gpu_addr = adev->wb.gpu_addr + (ring->wptr_offs * 4); + wptr_gpu_addr = ring->wptr_gpu_addr; WREG32_SOC15(GC, 0, mmCP_RB_WPTR_POLL_ADDR_LO, lower_32_bits(wptr_gpu_addr)); WREG32_SOC15(GC, 0, mmCP_RB_WPTR_POLL_ADDR_HI, @@ -5873,11 +5872,11 @@ static int gfx_v10_0_cp_gfx_resume(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, mmCP_RB1_WPTR, lower_32_bits(ring->wptr)); WREG32_SOC15(GC, 0, mmCP_RB1_WPTR_HI, upper_32_bits(ring->wptr)); /* Set the wb address wether it's enabled or not */ - rptr_addr =
[PATCH 4/4] drm/amdgpu: assign the cpu/gpu address of fence from ring
assign the cpu/gpu address of fence for the normal or mes ring from ring structure. Signed-off-by: Jack Xiao Reviewed-by: Hawking Zhang Acked-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 58d4c219178a..0be3e2007387 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -407,8 +407,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, uint64_t index; if (ring->funcs->type != AMDGPU_RING_TYPE_UVD) { - ring->fence_drv.cpu_addr = >wb.wb[ring->fence_offs]; - ring->fence_drv.gpu_addr = adev->wb.gpu_addr + (ring->fence_offs * 4); + ring->fence_drv.cpu_addr = ring->fence_cpu_addr; + ring->fence_drv.gpu_addr = ring->fence_gpu_addr; } else { /* put fence directly behind firmware */ index = ALIGN(adev->uvd.fw->size, 8); -- 2.26.2 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/4] drm/amdgpu: define ring structure to access rptr/wptr/fence
Define ring structure to access the cpu/gpu address of rptr/wptr/fence instead of dynamic calculation. Cc: Christian König Suggested-by: Christian König Signed-off-by: Jack Xiao Reviewed-by: Hawking Zhang Acked-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index da871d84b742..940618d1bd4d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -208,6 +208,8 @@ struct amdgpu_ring { struct amdgpu_bo*ring_obj; volatile uint32_t *ring; unsignedrptr_offs; + u64 rptr_gpu_addr; + volatile u32*rptr_cpu_addr; u64 wptr; u64 wptr_old; unsignedring_size; @@ -228,7 +230,11 @@ struct amdgpu_ring { booluse_doorbell; booluse_pollmem; unsignedwptr_offs; + u64 wptr_gpu_addr; + volatile u32*wptr_cpu_addr; unsignedfence_offs; + u64 fence_gpu_addr; + volatile u32*fence_cpu_addr; uint64_tcurrent_ctx; charname[16]; u32 trail_seq; -- 2.26.2 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU
[AMD Official Use Only - Internal Distribution Only] Hi, Guchun, Please see my below comments. Best Regards Dennis Li -Original Message- From: Chen, Guchun Sent: Tuesday, July 28, 2020 3:49 PM To: amd-gfx@lists.freedesktop.org; Deucher, Alexander ; Zhang, Hawking ; Li, Dennis ; Grodzovsky, Andrey ; Zhou1, Tao ; Clements, John ; Lazar, Lijo ; Koenig, Christian ; Yang, Stanley Cc: Chen, Guchun Subject: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU When retrieving bad gpu tag from eeprom, GPU init should fail as the GPU needs to be retired for further check. v2: Fix spelling typo, correct the condition to detect bad gpu tag and refine error message. v3: Refine function argument name. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 ++- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2662cd7c8685..30af0dfee1a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) * it should be called after amdgpu_device_ip_hw_init_phase2 since * for some ASICs the RAS EEPROM code relies on SMU fully functioning * for I2C communication which only true at this point. -* recovery_init may fail, but it can free all resources allocated by -* itself and its failure should not stop amdgpu init process. +* +* amdgpu_ras_recovery_init may fail, but the upper only cares the +* failure from bad gpu situation and stop amdgpu init process +* accordingly. For other failed cases, it will still release all +* the resource and print error message, rather than returning one +* negative value to upper level. 
* * Note: theoretically, this should be called before all vram allocations * to protect retired page from abusing */ - amdgpu_ras_recovery_init(adev); + r = amdgpu_ras_recovery_init(adev); + if (r) + goto init_failed; if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3c4c142e9d8a..56e1aeba2d64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; uint32_t max_eeprom_records_len = 0; + bool exc_err_limit = false; int ret; if (con) @@ -1843,9 +1844,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); - ret = amdgpu_ras_eeprom_init(>eeprom_control); - if (ret) + ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit); + /* +* We only fail this calling and halt booting up +* when exc_err_limit is true. +*/ + if (exc_err_limit) { + ret = -EINVAL; goto free; + } [Dennis Li] Compared with old codes, new change miss checking ret. if (con->eeprom_control.num_recs) { ret = amdgpu_ras_load_bad_pages(adev); @@ -1868,6 +1875,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) out: dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); + /* +* Except error threshold exceeding case, other failure cases in this +* function would not fail amdgpu driver init. 
+*/ + if (!exc_err_limit) + ret = 0; + return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 35c0c849d49b..67995b66d7d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit) { int ret = 0; struct amdgpu_device *adev = to_amdgpu_device(control); @@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) .buf= buff, }; +
RE: FW: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)
[AMD Official Use Only - Internal Distribution Only] I repeated the patch broadcast through git-send-email _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: Koenig, Christian Sent: Tuesday, July 28, 2020 5:04 PM To: Liu, Monk ; amd-...@freedesktop.org Cc: Kuehling, Felix Subject: Re: FW: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3) The patch looks totally mangled to me, e.g. some spaces and new lines are missing. Probably because it was forwarded. Christian. Am 28.07.20 um 10:59 schrieb Liu, Monk: > [AMD Official Use Only - Internal Distribution Only] > > -Original Message- > From: Monk Liu > Sent: Tuesday, July 28, 2020 2:59 PM > To: amd-gfx@lists.freedesktop.org > Cc: Liu, Monk > Subject: [PATCH] drm/amdgpu: introduce a new parameter to configure > how many KCQ we want(v3) > > what: > the MQD's save and restore of KCQ (kernel compute queue) cost lots of > clocks during world switch which impacts a lot to multi-VF performance > > how: > introduce a paramter to control the number of KCQ to avoid performance > drop if there is no kernel compute queue needed > > notes: > this paramter only affects gfx 8/9/10 > > v2: > refine namings > > v3: > choose queues for each ring to that try best to cross pipes evenly. 
> > TODO: > in the future we will let hypervisor driver to set this paramter > automatically thus no need for user to configure it through modprobe > in virtual machine > > Signed-off-by: Monk Liu > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h| 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 4 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 58 > +++--- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 29 +++ > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 31 > 7 files changed, 87 insertions(+), 71 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index e97c088..de11136 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -201,6 +201,7 @@ extern int amdgpu_si_support; #ifdef > CONFIG_DRM_AMDGPU_CIK extern int amdgpu_cik_support; #endif > +extern int amdgpu_num_kcq; > > #define AMDGPU_VM_MAX_NUM_CTX4096 > #define AMDGPU_SG_THRESHOLD(256*1024*1024) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 62ecac9..cf445bab 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct > amdgpu_device *adev) > > amdgpu_gmc_tmz_set(adev); > > +if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) { amdgpu_num_kcq = 8; > +dev_warn(adev->dev, "set kernel compute queue number to 8 due to > +invalid paramter provided by user\n"); } > + > return 0; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 6291f5f..b545c40 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -150,6 +150,7 @@ int amdgpu_noretry; > int amdgpu_force_asic_type = -1; > int amdgpu_tmz = 0; > int amdgpu_reset_method = -1; /* auto */ > +int 
amdgpu_num_kcq = -1; > > struct amdgpu_mgpu_info mgpu_info = { > .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), > @@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444); > MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), > 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)"); > module_param_named(reset_method, amdgpu_reset_method, int, 0444); > > +MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want > +to setup (8 if set to greater than 8 or less than 0, only affect gfx > +8+)"); module_param_named(num_kcq, amdgpu_num_kcq, int, 0444); > + > static const struct pci_device_id pciidlist[] = { #ifdef > CONFIG_DRM_AMDGPU_SI > {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff > --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index 8eff017..f83a9a7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -202,40 +202,42 @@ bool > amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev, > > void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev) { > -int i, queue, pipe, mec; > +int i, queue, pipe; > bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev); > +int max_queues_per_mec = min(adev->gfx.mec.num_pipe_per_mec * > + adev->gfx.mec.num_queue_per_pipe, > + adev->gfx.num_compute_rings); > + > +if (multipipe_policy) { > +/* policy: make queues evenly cross all pipes on MEC1 only */ for (i > +=
Re: [PATCH] drm/amd/display: Clear dm_state for fast updates
On Mon, Jul 27, 2020 at 10:49:48PM -0400, Kazlauskas, Nicholas wrote: > On 2020-07-27 5:32 p.m., Daniel Vetter wrote: > > On Mon, Jul 27, 2020 at 11:11 PM Mazin Rezk wrote: > > > > > > On Monday, July 27, 2020 4:29 PM, Daniel Vetter wrote: > > > > > > > On Mon, Jul 27, 2020 at 9:28 PM Christian König > > > > wrote: > > > > > > > > > > Am 27.07.20 um 16:05 schrieb Kazlauskas, Nicholas: > > > > > > On 2020-07-27 9:39 a.m., Christian König wrote: > > > > > > > Am 27.07.20 um 07:40 schrieb Mazin Rezk: > > > > > > > > This patch fixes a race condition that causes a use-after-free > > > > > > > > during > > > > > > > > amdgpu_dm_atomic_commit_tail. This can occur when 2 non-blocking > > > > > > > > commits > > > > > > > > are requested and the second one finishes before the first. > > > > > > > > Essentially, > > > > > > > > this bug occurs when the following sequence of events happens: > > > > > > > > > > > > > > > > 1. Non-blocking commit #1 is requested w/ a new dm_state #1 and > > > > > > > > is > > > > > > > > deferred to the workqueue. > > > > > > > > > > > > > > > > 2. Non-blocking commit #2 is requested w/ a new dm_state #2 and > > > > > > > > is > > > > > > > > deferred to the workqueue. > > > > > > > > > > > > > > > > 3. Commit #2 starts before commit #1, dm_state #1 is used in the > > > > > > > > commit_tail and commit #2 completes, freeing dm_state #1. > > > > > > > > > > > > > > > > 4. Commit #1 starts after commit #2 completes, uses the freed > > > > > > > > dm_state > > > > > > > > 1 and dereferences a freelist pointer while setting the context. > > > > > > > > > > > > > > Well I only have a one mile high view on this, but why don't you > > > > > > > let > > > > > > > the work items execute in order? > > > > > > > > > > > > > > That would be better anyway cause this way we don't trigger a > > > > > > > cache > > > > > > > line ping pong between CPUs. > > > > > > > > > > > > > > Christian. 
> > > > > > > > > > > > We use the DRM helpers for managing drm_atomic_commit_state and > > > > > > those > > > > > > helpers internally push non-blocking commit work into the system > > > > > > unbound work queue. > > > > > > > > > > Mhm, well if you send those helper atomic commits in the order A,B and > > > > > they execute it in the order B,A I would call that a bug :) > > > > > > > > The way it works is it pushes all commits into unbound work queue, but > > > > then forces serialization as needed. We do _not_ want e.g. updates on > > > > different CRTC to be serialized, that would result in lots of judder. > > > > And hw is funny enough that there's all kinds of dependencies. > > > > > > > > The way you force synchronization is by adding other CRTC state > > > > objects. So if DC is busted and can only handle a single update per > > > > work item, then I guess you always need all CRTC states and everything > > > > will be run in order. But that also totally kills modern multi-screen > > > > compositors. Xorg isn't modern, just in case that's not clear :-) > > > > > > > > Looking at the code it seems like you indeed have only a single dm > > > > state, so yeah global sync is what you'll need as immediate fix, and > > > > then maybe fix up DM to not be quite so silly ... or at least only do > > > > the dm state stuff when really needed. > > > > > > > > We could also sprinkle the drm_crtc_commit structure around a bit > > > > (it's the glue that provides the synchronization across commits), but > > > > since your dm state is global just grabbing all crtc states > > > > unconditionally as part of that is probably best. > > > > > > > > > > While we could duplicate a copy of that code with nothing but the > > > > > > workqueue changed that isn't something I'd really like to maintain > > > > > > going forward. > > > > > > > > > > I'm not talking about duplicating the code, I'm talking about fixing > > > > > the > > > > > helpers. 
I don't know that code well, but from the outside it sounds > > > > > like a bug there. > > > > > > > > > > And executing work items in the order they are submitted is trivial. > > > > > > > > > > Had anybody pinged Daniel or other people familiar with the helper > > > > > code > > > > > about it? > > > > > > > > Yeah something is wrong here, and the fix looks horrible :-) > > > > > > > > Aside, I've also seen some recent discussion flare up about > > > > drm_atomic_state_get/put used to paper over some other use-after-free, > > > > but this time related to interrupt handlers. Maybe a few rules about > > > > that: > > > > - don't > > > > - especially not when it's interrupt handlers, because you can't call > > > > drm_atomic_state_put from interrupt handlers. > > > > > > > > Instead have a spin_lock_irq to protect the shared data with your > > > > interrupt handler, and _copy_ the data over. This is e.g. what > > > > drm_crtc_arm_vblank_event does. > > > > > > Nicholas wrote a patch that attempted to resolve the issue by adding every > > > CRTC into the
Re: FW: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)
The patch looks totally mangled to me, e.g. some spaces and new lines are missing. Probably because it was forwarded. Christian. Am 28.07.20 um 10:59 schrieb Liu, Monk: [AMD Official Use Only - Internal Distribution Only] -Original Message- From: Monk Liu Sent: Tuesday, July 28, 2020 2:59 PM To: amd-gfx@lists.freedesktop.org Cc: Liu, Monk Subject: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3) what: the MQD's save and restore of KCQ (kernel compute queue) cost lots of clocks during world switch which impacts a lot to multi-VF performance how: introduce a paramter to control the number of KCQ to avoid performance drop if there is no kernel compute queue needed notes: this paramter only affects gfx 8/9/10 v2: refine namings v3: choose queues for each ring to that try best to cross pipes evenly. TODO: in the future we will let hypervisor driver to set this paramter automatically thus no need for user to configure it through modprobe in virtual machine Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu.h| 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 4 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 58 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 29 +++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 31 7 files changed, 87 insertions(+), 71 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e97c088..de11136 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -201,6 +201,7 @@ extern int amdgpu_si_support; #ifdef CONFIG_DRM_AMDGPU_CIK extern int amdgpu_cik_support; #endif +extern int amdgpu_num_kcq; #define AMDGPU_VM_MAX_NUM_CTX4096 #define AMDGPU_SG_THRESHOLD(256*1024*1024) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62ecac9..cf445bab 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_gmc_tmz_set(adev); +if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) { +amdgpu_num_kcq = 8; +dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid paramter provided by user\n"); +} + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 6291f5f..b545c40 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -150,6 +150,7 @@ int amdgpu_noretry; int amdgpu_force_asic_type = -1; int amdgpu_tmz = 0; int amdgpu_reset_method = -1; /* auto */ +int amdgpu_num_kcq = -1; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444); MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)"); module_param_named(reset_method, amdgpu_reset_method, int, 0444); +MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to +setup (8 if set to greater than 8 or less than 0, only affect gfx +8+)"); module_param_named(num_kcq, amdgpu_num_kcq, int, 0444); + static const struct pci_device_id pciidlist[] = { #ifdef CONFIG_DRM_AMDGPU_SI {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 8eff017..f83a9a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -202,40 +202,42 @@ bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev, void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev) { -int i, queue, pipe, mec; +int i, queue, pipe; bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev); +int max_queues_per_mec = 
min(adev->gfx.mec.num_pipe_per_mec * + adev->gfx.mec.num_queue_per_pipe, + adev->gfx.num_compute_rings); + +if (multipipe_policy) { +/* policy: make queues evenly cross all pipes on MEC1 only */ +for (i = 0; i < max_queues_per_mec; i++) { +pipe = i % adev->gfx.mec.num_pipe_per_mec; +queue = (i / adev->gfx.mec.num_pipe_per_mec) % +adev->gfx.mec.num_queue_per_pipe; + +set_bit(pipe * adev->gfx.mec.num_queue_per_pipe + queue, +adev->gfx.mec.queue_bitmap); +} +} else { +int mec; -/* policy for amdgpu compute queue ownership */ -for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) { -queue = i % adev->gfx.mec.num_queue_per_pipe; -pipe = (i / adev->gfx.mec.num_queue_per_pipe) -% adev->gfx.mec.num_pipe_per_mec; -mec = (i / adev->gfx.mec.num_queue_per_pipe) -/ adev->gfx.mec.num_pipe_per_mec; - -/* we've run out of HW */ -if (mec >=
[PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)
what: the MQD's save and restore of KCQ (kernel compute queue) cost lots of clocks during world switch which impacts a lot to multi-VF performance how: introduce a parameter to control the number of KCQ to avoid performance drop if there is no kernel compute queue needed notes: this parameter only affects gfx 8/9/10 v2: refine namings v3: choose queues for each ring so that they are spread across pipes as evenly as possible. TODO: in the future we will let hypervisor driver to set this parameter automatically thus no need for user to configure it through modprobe in virtual machine Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu.h| 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 4 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 58 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 29 +++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 31 7 files changed, 87 insertions(+), 71 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index e97c088..de11136 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -201,6 +201,7 @@ extern int amdgpu_si_support; #ifdef CONFIG_DRM_AMDGPU_CIK extern int amdgpu_cik_support; #endif +extern int amdgpu_num_kcq; #define AMDGPU_VM_MAX_NUM_CTX 4096 #define AMDGPU_SG_THRESHOLD(256*1024*1024) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62ecac9..cf445bab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_gmc_tmz_set(adev); + if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) { + amdgpu_num_kcq = 8; + dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid paramter provided by user\n"); + } + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 6291f5f..b545c40 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -150,6 +150,7 @@ int amdgpu_noretry; int amdgpu_force_asic_type = -1; int amdgpu_tmz = 0; int amdgpu_reset_method = -1; /* auto */ +int amdgpu_num_kcq = -1; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444); MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)"); module_param_named(reset_method, amdgpu_reset_method, int, 0444); +MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)"); +module_param_named(num_kcq, amdgpu_num_kcq, int, 0444); + static const struct pci_device_id pciidlist[] = { #ifdef CONFIG_DRM_AMDGPU_SI {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 8eff017..f83a9a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -202,40 +202,42 @@ bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev, void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev) { - int i, queue, pipe, mec; + int i, queue, pipe; bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev); + int max_queues_per_mec = min(adev->gfx.mec.num_pipe_per_mec * + adev->gfx.mec.num_queue_per_pipe, + adev->gfx.num_compute_rings); + + if (multipipe_policy) { + /* policy: make queues evenly cross all pipes on MEC1 only */ + for (i = 0; i < max_queues_per_mec; i++) { + pipe = i % adev->gfx.mec.num_pipe_per_mec; + queue = (i / adev->gfx.mec.num_pipe_per_mec) % + adev->gfx.mec.num_queue_per_pipe; + + set_bit(pipe * adev->gfx.mec.num_queue_per_pipe + queue, + adev->gfx.mec.queue_bitmap); 
+ } + } else { + int mec; - /* policy for amdgpu compute queue ownership */ - for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) { - queue = i % adev->gfx.mec.num_queue_per_pipe; - pipe = (i / adev->gfx.mec.num_queue_per_pipe) - % adev->gfx.mec.num_pipe_per_mec; - mec = (i / adev->gfx.mec.num_queue_per_pipe)
RE: [PATCH 09/12] drm/amdgpu: define one macro for RAS's sysfs/debugfs name
[AMD Public Use] Thanks Christian. Your suggestion looks better, let me improve it. Regards, Guchun -Original Message- From: Koenig, Christian Sent: Tuesday, July 28, 2020 3:55 PM To: Chen, Guchun ; amd-gfx@lists.freedesktop.org; Deucher, Alexander ; Zhang, Hawking ; Li, Dennis ; Grodzovsky, Andrey ; Zhou1, Tao ; Clements, John ; Lazar, Lijo ; Yang, Stanley Subject: Re: [PATCH 09/12] drm/amdgpu: define one macro for RAS's sysfs/debugfs name Am 28.07.20 um 09:49 schrieb Guchun Chen: > Add one definition for the RAS module's FS name. It's used in both > debugfs and sysfs case. Maybe better do this with a "static const char*". Christian. > > Signed-off-by: Guchun Chen > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 - > 1 file changed, 8 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 2cc09aa67423..c1ed0074a52b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -34,6 +34,8 @@ > #include "amdgpu_xgmi.h" > #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" > > +#define AMDGPU_RAS_FS_NAME "ras" > + > const char *ras_error_string[] = { > "none", > "parity", > @@ -1037,7 +1039,7 @@ static int amdgpu_ras_sysfs_create_feature_node(struct > amdgpu_device *adev) > NULL > }; > struct attribute_group group = { > - .name = "ras", > + .name = AMDGPU_RAS_FS_NAME, > .attrs = attrs, > #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) > .bin_attrs = bin_attrs, > @@ -1080,7 +1082,7 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct > amdgpu_device *adev) > NULL > }; > struct attribute_group group = { > - .name = "ras", > + .name = AMDGPU_RAS_FS_NAME, > .attrs = attrs, > #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) > .bin_attrs = bin_attrs, > @@ -1117,7 +1119,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device > *adev, > > if (sysfs_add_file_to_group(>dev->kobj, > >sysfs_attr.attr, > - "ras")) { > + AMDGPU_RAS_FS_NAME)) { > put_obj(obj); > 
return -EINVAL; > } > @@ -1137,7 +1139,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device > *adev, > > sysfs_remove_file_from_group(>dev->kobj, > >sysfs_attr.attr, > - "ras"); > + AMDGPU_RAS_FS_NAME); > obj->attr_inuse = 0; > put_obj(obj); > > @@ -1183,7 +1185,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct > amdgpu_device *adev) > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct drm_minor *minor = adev->ddev->primary; > > - con->dir = debugfs_create_dir("ras", minor->debugfs_root); > + con->dir = debugfs_create_dir(AMDGPU_RAS_FS_NAME, > + minor->debugfs_root); > debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, > adev, _ras_debugfs_ctrl_ops); > debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, > con->dir, ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 09/12] drm/amdgpu: define one macro for RAS's sysfs/debugfs name
Am 28.07.20 um 09:49 schrieb Guchun Chen: Add one definition for the RAS module's FS name. It's used in both debugfs and sysfs case. Maybe better do this with a "static const char*". Christian. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2cc09aa67423..c1ed0074a52b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -34,6 +34,8 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#define AMDGPU_RAS_FS_NAME "ras" + const char *ras_error_string[] = { "none", "parity", @@ -1037,7 +1039,7 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) NULL }; struct attribute_group group = { - .name = "ras", + .name = AMDGPU_RAS_FS_NAME, .attrs = attrs, #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) .bin_attrs = bin_attrs, @@ -1080,7 +1082,7 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) NULL }; struct attribute_group group = { - .name = "ras", + .name = AMDGPU_RAS_FS_NAME, .attrs = attrs, #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) .bin_attrs = bin_attrs, @@ -1117,7 +1119,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, if (sysfs_add_file_to_group(>dev->kobj, >sysfs_attr.attr, - "ras")) { + AMDGPU_RAS_FS_NAME)) { put_obj(obj); return -EINVAL; } @@ -1137,7 +1139,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, sysfs_remove_file_from_group(>dev->kobj, >sysfs_attr.attr, - "ras"); + AMDGPU_RAS_FS_NAME); obj->attr_inuse = 0; put_obj(obj); @@ -1183,7 +1185,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct drm_minor *minor = adev->ddev->primary; - con->dir = debugfs_create_dir("ras", minor->debugfs_root); + con->dir = debugfs_create_dir(AMDGPU_RAS_FS_NAME, + 
minor->debugfs_root); debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, adev, _ras_debugfs_ctrl_ops); debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 09/12] drm/amdgpu: define one macro for RAS's sysfs/debugfs name
Add one definition for the RAS module's FS name. It's used in both debugfs and sysfs case. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2cc09aa67423..c1ed0074a52b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -34,6 +34,8 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#define AMDGPU_RAS_FS_NAME "ras" + const char *ras_error_string[] = { "none", "parity", @@ -1037,7 +1039,7 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) NULL }; struct attribute_group group = { - .name = "ras", + .name = AMDGPU_RAS_FS_NAME, .attrs = attrs, #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) .bin_attrs = bin_attrs, @@ -1080,7 +1082,7 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) NULL }; struct attribute_group group = { - .name = "ras", + .name = AMDGPU_RAS_FS_NAME, .attrs = attrs, #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) .bin_attrs = bin_attrs, @@ -1117,7 +1119,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, if (sysfs_add_file_to_group(>dev->kobj, >sysfs_attr.attr, - "ras")) { + AMDGPU_RAS_FS_NAME)) { put_obj(obj); return -EINVAL; } @@ -1137,7 +1139,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, sysfs_remove_file_from_group(>dev->kobj, >sysfs_attr.attr, - "ras"); + AMDGPU_RAS_FS_NAME); obj->attr_inuse = 0; put_obj(obj); @@ -1183,7 +1185,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct drm_minor *minor = adev->ddev->primary; - con->dir = debugfs_create_dir("ras", minor->debugfs_root); + con->dir = debugfs_create_dir(AMDGPU_RAS_FS_NAME, + minor->debugfs_root); debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, adev, _ras_debugfs_ctrl_ops); 
debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 05/12] drm/amdgpu: skip bad page reservation once issuing from eeprom write
Once the ras recovery is issued from eeprom write itself, bad page reservation should be ignored, otherwise, recursive calling of writing to eeprom would happen. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 14 +++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 56e1aeba2d64..3c4b9127660d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -62,8 +62,6 @@ const char *ras_block_string[] = { #define ras_err_str(i) (ras_error_string[ffs(i)]) #define ras_block_str(i) (ras_block_string[i]) -#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 -#define AMDGPU_RAS_FLAG_INIT_NEED_RESET2 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) /* inject address is 52 bits */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 4672649a9293..cf9f60202334 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -31,6 +31,10 @@ #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" +#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) +#define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1) +#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV (0x1 << 2) + enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, AMDGPU_RAS_BLOCK__SDMA, @@ -503,10 +507,14 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - /* save bad page to eeprom before gpu reset, -* i2c may be unstable in gpu reset + /* +* Save bad page to eeprom before gpu reset, i2c may be unstable +* in gpu reset. +* +* Also, exclude the case when ras recovery issuer is +* eeprom page write itself. 
*/ - if (in_task()) + if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) && in_task()) amdgpu_ras_reserve_bad_pages(adev); if (atomic_cmpxchg(>in_recovery, 0, 1) == 0) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 11/12] drm/amdgpu: disable page reservation when amdgpu_bad_page_threshold = 0
When amdgpu_bad_page_threshold = 0, bad page reservation is skipped in either the UMC ECC irq or the page retirement call of the sync flood isr. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ddcf2470a20b..bbff89caf8c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1678,7 +1678,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) int ret = 0; /* no bad page record, skip eeprom access */ - if (!control->num_recs) + if (!control->num_recs || (amdgpu_bad_page_threshold == 0)) return ret; bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); @@ -1782,7 +1782,8 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) struct amdgpu_bo *bo = NULL; int i, ret = 0; - if (!con || !con->eh_data) + /* Not reserve bad page when amdgpu_bad_page_threshold == 0. */ + if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0)) return 0; mutex_lock(>recovery_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index af1b1ccf613c..262baf0f61ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -125,8 +125,9 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, "detected in UMC block\n", err_data->ue_count); - if (err_data->err_addr_cnt && - amdgpu_ras_add_bad_pages(adev, err_data->err_addr, + if ((amdgpu_bad_page_threshold != 0) && + err_data->err_addr_cnt && + amdgpu_ras_add_bad_pages(adev, err_data->err_addr, err_data->err_addr_cnt)) dev_warn(adev->dev, "Failed to add ras bad page!\n"); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 10/12] drm/amdgpu: decouple sysfs creating of bad page node
Bad page information should not be exposed by sysfs when bad page retirement is disabled, so decouple it from ras sysfs group creating, and add one guard before creating. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 71 - 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c1ed0074a52b..ddcf2470a20b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1027,6 +1027,35 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); } +static void amdgpu_ras_sysfs_add_badpage_node(struct amdgpu_device *adev) +{ +#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct attribute_group group; + struct bin_attribute *bin_attrs[] = { + >badpages_attr, + NULL, + }; + + con->badpages_attr = (struct bin_attribute) { + .attr = { + .name = "gpu_vram_bad_pages", + .mode = S_IRUGO, + }, + .size = 0, + .private = NULL, + .read = amdgpu_ras_sysfs_badpages_read, + }; + + group.name = AMDGPU_RAS_FS_NAME; + group.bin_attrs = bin_attrs; + + sysfs_bin_attr_init(bin_attrs[0]); + + sysfs_update_group(>dev->kobj, ); +#endif +} + static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1034,16 +1063,9 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) >features_attr.attr, NULL }; - struct bin_attribute *bin_attrs[] = { - >badpages_attr, - NULL - }; struct attribute_group group = { .name = AMDGPU_RAS_FS_NAME, .attrs = attrs, -#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) - .bin_attrs = bin_attrs, -#endif }; con->features_attr = (struct device_attribute) { @@ -1054,22 +1076,22 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) .show = 
amdgpu_ras_sysfs_features_read, }; - con->badpages_attr = (struct bin_attribute) { - .attr = { - .name = "gpu_vram_bad_pages", - .mode = S_IRUGO, - }, - .size = 0, - .private = NULL, - .read = amdgpu_ras_sysfs_badpages_read, - }; - sysfs_attr_init(attrs[0]); - sysfs_bin_attr_init(bin_attrs[0]); return sysfs_create_group(>dev->kobj, ); } +static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) +{ +#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + sysfs_remove_file_from_group(>dev->kobj, + >badpages_attr.attr, + AMDGPU_RAS_FS_NAME); +#endif +} + static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1077,16 +1099,9 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) >features_attr.attr, NULL }; - struct bin_attribute *bin_attrs[] = { - >badpages_attr, - NULL - }; struct attribute_group group = { .name = AMDGPU_RAS_FS_NAME, .attrs = attrs, -#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS) - .bin_attrs = bin_attrs, -#endif }; sysfs_remove_group(>dev->kobj, ); @@ -1155,6 +1170,9 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) amdgpu_ras_sysfs_remove(adev, >head); } + if (amdgpu_bad_page_threshold != 0) + amdgpu_ras_sysfs_remove_bad_page_node(adev); + amdgpu_ras_sysfs_remove_feature_node(adev); return 0; @@ -1283,6 +1301,9 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) { amdgpu_ras_sysfs_create_feature_node(adev); + if (amdgpu_bad_page_threshold != 0) + amdgpu_ras_sysfs_add_badpage_node(adev); + return 0; } -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 03/12] drm/amdgpu: add bad gpu tag definition
This tag will be used for bad gpu detection during eeprom access. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index a2c982b1eac6..35c0c849d49b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -46,6 +46,9 @@ #define EEPROM_TABLE_HDR_VAL 0x414d4452 #define EEPROM_TABLE_VER 0x0001 +/* Bad GPU tag ‘BADG’ */ +#define EEPROM_TABLE_HDR_BAD 0x42414447 + /* Assume 2 Mbit size */ #define EEPROM_SIZE_BYTES 256000 #define EEPROM_PAGE__SIZE_BYTES 256 -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 12/12] drm/amdgpu: reset eeprom once specifying one bigger threshold
During driver's probe, when it hits bad gpu tag in eeprom i2c init calling(the tag was set when reported bad page reaches bad page threshold in last driver's working loop), there are some strategies to deal with the cases: 1. When the module parameter amdgpu_bad_page_threshold = 0, that means page retirement feature is disabled, so just resetting the eeprom is fine. 2. When amdgpu_bad_page_threshold is not 0, and moreover, user sets one bigger valid value in order to make current boot up succeed, reset the eeprom data and do not break booting. 3. For other cases, driver's probe will be broken. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index be895dc2d739..02933050081b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -248,6 +248,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, struct amdgpu_device *adev = to_amdgpu_device(control); unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 }; struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); struct i2c_msg msg = { .addr = 0, .flags = I2C_M_RD, @@ -287,9 +288,15 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) && (amdgpu_bad_page_threshold != 0)) { - *exceed_err_limit = true; - DRM_ERROR("Exceeding the bad_page_threshold parameter, " + if (ras->bad_page_cnt_threshold > control->num_recs) { + DRM_INFO("One valid bigger bad page threshold is " + "used, reset eeprom.\n"); + ret = amdgpu_ras_eeprom_reset_table(control); + } else { + *exceed_err_limit = true; + DRM_ERROR("Exceeding the bad_page_threshold parameter, " "disabling the GPU.\n"); + } } else { DRM_INFO("Creating new EEPROM 
table"); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 08/12] drm/amdgpu: restore ras flags when user resets eeprom
RAS flags need to be cleaned as well when the user requests a clean eeprom. v2: RAS flags shall be restored after eeprom reset succeeds. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index dcb84f2ca078..2cc09aa67423 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -368,12 +368,19 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; + struct amdgpu_device *adev = + (struct amdgpu_device *)file_inode(f)->i_private; int ret; - ret = amdgpu_ras_eeprom_reset_table(>psp.ras.ras->eeprom_control); + ret = amdgpu_ras_eeprom_reset_table( + &(amdgpu_ras_get_context(adev)->eeprom_control)); - return ret == 1 ? size : -EIO; + if (ret == 1) { + amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; + return size; + } else { + return -EIO; + } } static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 07/12] drm/amdgpu: break GPU recovery once it's in bad state
When GPU executes recovery and retrieves bad GPU tag from external eeprom device, the recovery will be broken and an error message is printed as well for user's awareness. v2: Refine warning message in threshold reaching case, and fix spelling typo. v3: Fix explicit calling of bad gpu. v4: Rename function names. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 20 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 + .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 40 +++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 4 ++ 5 files changed, 79 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 30af0dfee1a1..c893d9adbab7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4139,8 +4139,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, amdgpu_fbdev_set_suspend(tmp_adev, 0); - /* must succeed. */ - amdgpu_ras_resume(tmp_adev); + /* +* The GPU enters bad state once faulty pages +* by ECC has reached the threshold, and ras +* recovery is scheduled next. So add one check +* here to break recovery if it indeed exceeds +* bad page threshold, and remind user to +* retire this GPU or setting one bigger +* bad_page_threshold value to fix this once +* probing driver again. +*/ + if (!amdgpu_ras_check_err_threshold(tmp_adev)) { + /* must succeed. 
*/ + amdgpu_ras_resume(tmp_adev); + } else { + r = -EINVAL; + goto out; + } /* Update PSP FW topology after reset */ if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) @@ -4148,7 +4163,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } } - out: if (!r) { amdgpu_irq_gpu_reset_resume_helper(tmp_adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3c4b9127660d..dcb84f2ca078 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2206,3 +2206,19 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) return false; } + +bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + bool exc_err_limit = false; + + if (con && (amdgpu_bad_page_threshold != 0)) + amdgpu_ras_eeprom_check_err_threshold(>eeprom_control, + _err_limit); + + /* +* We are only interested in variable exc_err_limit, +* as it says if GPU is in bad state or not. 
+*/ + return exc_err_limit; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index cf9f60202334..70a6fca73617 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev); unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, bool is_ce); +bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev); + /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index d24bf65f6dd7..be895dc2d739 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address) return curr_address; } +int amdgpu_ras_eeprom_check_err_threshold( + struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + unsigned char buff[EEPROM_ADDRESS_SIZE + + EEPROM_TABLE_HEADER_SIZE] = { 0 }; + struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr; + struct i2c_msg msg = {
[PATCH 06/12] drm/amdgpu: schedule ras recovery when reaching bad page threshold
Once the bad page saved to eeprom reaches the configured threshold, ras recovery will be issued to notify user. v2: Fix spelling typo. Signed-off-by: Guchun Chen --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 37 ++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 67995b66d7d4..d24bf65f6dd7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, int i, ret = 0; struct i2c_msg *msgs, *msg; unsigned char *buffs, *buff; + bool sched_ras_recovery = false; struct eeprom_table_record *record; struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS) return 0; @@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, goto free_buff; } + /* +* If saved bad pages number exceeds the bad page threshold for +* the whole VRAM, update table header to mark the BAD GPU tag +* and schedule one ras recovery after eeprom write is done, +* this can avoid the missing for latest records. +* +* This new header will be picked up and checked in the bootup +* by ras recovery, which may break bootup process to notify +* user this GPU is in bad state and to retire such GPU for +* further check. 
+*/ + if (write && (amdgpu_bad_page_threshold != 0) && + ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) { + dev_warn(adev->dev, + "Saved bad pages(%d) reaches threshold value(%d).\n", + control->num_recs + num, ras->bad_page_cnt_threshold); + control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD; + sched_ras_recovery = true; + } + /* In case of overflow just start from beginning to not lose newest records */ if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES)) control->next_addr = EEPROM_RECORD_START; - /* * TODO Currently makes EEPROM writes for each record, this creates * internal fragmentation. Optimized the code to do full page write of @@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, __update_tbl_checksum(control, records, num, old_hdr_byte_sum); __update_table_header(control, buffs); + + if (sched_ras_recovery) { + /* +* Before scheduling ras recovery, assert the related +* flag first, which shall bypass common bad page +* reservation execution in amdgpu_ras_reset_gpu. +*/ + amdgpu_ras_get_context(adev)->flags |= + AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV; + + dev_warn(adev->dev, "Conduct ras recovery due to bad " + "page threshold reached.\n"); + amdgpu_ras_reset_gpu(adev); + } } else if (!__validate_tbl_checksum(control, records, num)) { DRM_WARN("EEPROM Table checksum mismatch!"); /* TODO Uncomment when EEPROM read/write is relliable */ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU
When retrieving bad gpu tag from eeprom, GPU init should fail as the GPU needs to be retired for further check. v2: Fix spelling typo, correct the condition to detect bad gpu tag and refine error message. v3: Refine function argument name. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 ++- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2662cd7c8685..30af0dfee1a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) * it should be called after amdgpu_device_ip_hw_init_phase2 since * for some ASICs the RAS EEPROM code relies on SMU fully functioning * for I2C communication which only true at this point. -* recovery_init may fail, but it can free all resources allocated by -* itself and its failure should not stop amdgpu init process. +* +* amdgpu_ras_recovery_init may fail, but the upper only cares the +* failure from bad gpu situation and stop amdgpu init process +* accordingly. For other failed cases, it will still release all +* the resource and print error message, rather than returning one +* negative value to upper level. 
* * Note: theoretically, this should be called before all vram allocations * to protect retired page from abusing */ - amdgpu_ras_recovery_init(adev); + r = amdgpu_ras_recovery_init(adev); + if (r) + goto init_failed; if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3c4c142e9d8a..56e1aeba2d64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; uint32_t max_eeprom_records_len = 0; + bool exc_err_limit = false; int ret; if (con) @@ -1843,9 +1844,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); - ret = amdgpu_ras_eeprom_init(>eeprom_control); - if (ret) + ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit); + /* +* We only fail this calling and halt booting up +* when exc_err_limit is true. +*/ + if (exc_err_limit) { + ret = -EINVAL; goto free; + } if (con->eeprom_control.num_recs) { ret = amdgpu_ras_load_bad_pages(adev); @@ -1868,6 +1875,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) out: dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); + /* +* Except error threshold exceeding case, other failure cases in this +* function would not fail amdgpu driver init. 
+*/ + if (!exc_err_limit) + ret = 0; + return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 35c0c849d49b..67995b66d7d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit) { int ret = 0; struct amdgpu_device *adev = to_amdgpu_device(control); @@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) .buf= buff, }; + *exceed_err_limit = false; + /* Verify i2c adapter is initialized */ if (!adev->pm.smu_i2c.algo) return -ENOENT; @@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", control->num_recs); + } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) && + (amdgpu_bad_page_threshold != 0)) { + *exceed_err_limit = true; + DRM_ERROR("Exceeding the
[PATCH 00/12] BAD GPU retirement policy by total bad pages
The series is to enable/disable bad page feature and apply different bad page reservation strategy by different bad page threshold configurations. When the saved bad pages written to eeprom reach the threshold, one ras recovery will be issued immediately and the recovery will fail to tell user that the GPU is BAD and needs to be retired for further check or setting one valid bigger threshold value in next driver's probe to skip corresponding check. During bootup, similar bad page threshold check is conducted as well when eeprom get initialized, and it will possibly break boot up for user's awareness. When user sets bad_page_threshold=0 once probing driver, bad page retirement feature is completely disabled, and driver has no chance to process bad page information record and write it to eeprom. Guchun Chen (12): drm/amdgpu: add bad page count threshold in module parameter drm/amdgpu: validate bad page threshold in ras drm/amdgpu: add bad gpu tag definition drm/amdgpu: break driver init process when it's bad GPU drm/amdgpu: skip bad page reservation once issuing from eeprom write drm/amdgpu: schedule ras recovery when reaching bad page threshold drm/amdgpu: break GPU recovery once it's in bad state drm/amdgpu: restore ras flags when user resets eeprom drm/amdgpu: define one macro for RAS's sysfs/debugfs name drm/amdgpu: decouple sysfs creating of bad page node drm/amdgpu: disable page reservation when amdgpu_bad_page_threshold = 0 drm/amdgpu: reset eeprom once specifying one bigger threshold drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 32 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 186 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 19 +- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 102 +- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 9 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +- 8 files changed, 312 insertions(+), 53 deletions(-) -- 2.17.1 ___ amd-gfx mailing list 
amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 02/12] drm/amdgpu: validate bad page threshold in ras
Bad page threshold value should be valid in the range between -1 and max records length of eeprom. It could determine when saved bad pages exceed threshold value, and proceed corresponding actions. v2: When using the default typical value, it should be min value between typical value and eeprom max records length. v3: drop the case of setting bad_page_cnt_threshold to be 0x, as it confuses user. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 48 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 5 ++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 2 + 4 files changed, 58 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6f06e1214622..3c4c142e9d8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -69,6 +69,9 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #defineRAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ +#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -1700,6 +1703,47 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, return ret; } +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, + uint32_t max_length) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int tmp_threshold = amdgpu_bad_page_threshold; + u64 val; + + /* +* Justification of value bad_page_cnt_threshold in ras structure +* +* Generally, -1 <= amdgpu_bad_page_threshold <= max record length +* in eeprom, and introduce two scenarios accordingly. +* +* Bad page retirement enablement: +*- If amdgpu_bad_page_threshold = -1, +* bad_page_cnt_threshold = typical value by formula. +* +*- When the value from user is 0 < amdgpu_bad_page_threshold < +* max record length in eeprom, use it directly. 
+* +* Bad page retirement disablement: +*- If amdgpu_bad_page_threshold = 0, bad page retirement +* functionality is disabled, and bad_page_cnt_threshold will +* take no effect. +*/ + + if (tmp_threshold < -1) + tmp_threshold = -1; + else if (tmp_threshold > max_length) + tmp_threshold = max_length; + + if (tmp_threshold == -1) { + val = adev->gmc.mc_vram_size; + do_div(val, RAS_BAD_PAGE_RATE); + con->bad_page_cnt_threshold = min(lower_32_bits(val), + max_length); + } else { + con->bad_page_cnt_threshold = tmp_threshold; + } +} + /* called in gpu recovery/init */ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ -1777,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; + uint32_t max_eeprom_records_len = 0; int ret; if (con) @@ -1795,6 +1840,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(>in_recovery, 0); con->adev = adev; + max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + ret = amdgpu_ras_eeprom_init(>eeprom_control); if (ret) goto free; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index b2667342cf67..4672649a9293 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -336,6 +336,9 @@ struct amdgpu_ras { struct amdgpu_ras_eeprom_control eeprom_control; bool error_query_ready; + + /* bad page count threshold */ + uint32_t bad_page_cnt_threshold; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index c0096097bbcf..a2c982b1eac6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -499,6 +499,11 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, return ret == num ? 
0 : -EIO; } +inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) +{ + return EEPROM_MAX_RECORD_NUM; +} + /* Used for testing if bugs encountered */ #if 0 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
[PATCH 01/12] drm/amdgpu: add bad page count threshold in module parameter
bad_page_threshold could be configured to enable/disable the associated bad page retirement feature in RAS. When it's -1, ras will use typical bad page failure value to handle bad page retirement. When it's 0, disable bad page retirement, and no bad page will be recorded and saved. For other valid value, driver will use this manual value as the threshold value of total bad pages. v2: correct documentation of this parameter. v3: remove confused statement in documentation. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++ 2 files changed, 12 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 06bfb8658dec..bb83ffb5e26a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -181,6 +181,7 @@ extern uint amdgpu_dm_abm_level; extern struct amdgpu_mgpu_info mgpu_info; extern int amdgpu_ras_enable; extern uint amdgpu_ras_mask; +extern int amdgpu_bad_page_threshold; extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; extern int amdgpu_discovery; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d28b95f721c4..820a28c9e957 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -161,6 +161,7 @@ struct amdgpu_mgpu_info mgpu_info = { }; int amdgpu_ras_enable = -1; uint amdgpu_ras_mask = 0x; +int amdgpu_bad_page_threshold = -1; /** * DOC: vramlimit (int) @@ -801,6 +802,16 @@ module_param_named(tmz, amdgpu_tmz, int, 0444); MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)"); module_param_named(reset_method, amdgpu_reset_method, int, 0444); +/** + * DOC: bad_page_threshold (int) + * Bad page threshold is to specify the threshold value of faulty pages + * detected by RAS ECC, that may result in GPU entering bad status if total + * faulty 
pages by ECC exceed threshold value and leave it for user's further + * check. + */ +MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default typical value), 0 = disable bad page retirement)"); +module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); + static const struct pci_device_id pciidlist[] = { #ifdef CONFIG_DRM_AMDGPU_SI {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/4] drm: retrieve EDID via ACPI _DDC method
Some notebook computer systems expose the EDID for the internal panel via the ACPI _DDC method. On some systems this is because the panel does not populate the hardware DDC lines, and on some systems with dynamic display muxes, _DDC is implemented to allow the internal panel's EDID to be read at any time, regardless of how the mux is switched. The _DDC method can be implemented for each individual display output, so there could be an arbitrary number of outputs exposing their EDIDs via _DDC; however, in practice, this has only been implemented so far on systems with a single panel, so the current implementation of drm_get_edid_acpi() walks the outputs listed by each GPU's ACPI _DOD method and returns the first EDID successfully retrieved by any attached _DDC method. It may be necessary in the future to allow for the retrieval of distinct EDIDs for different output devices, but in order to do so, it will first be necessary to develop a way to correlate individual DRM outputs with their corresponding entities in ACPI. 
Signed-off-by: Daniel Dadap --- drivers/gpu/drm/drm_edid.c | 161 + include/drm/drm_edid.h | 1 + 2 files changed, 162 insertions(+) diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 116451101426..f66d6bf048c6 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2058,6 +2059,166 @@ struct edid *drm_get_edid_switcheroo(struct drm_connector *connector, } EXPORT_SYMBOL(drm_get_edid_switcheroo); +#if defined(CONFIG_ACPI) && defined(CONFIG_PCI) +static u64 *get_dod_entries(acpi_handle handle, int *count) +{ + acpi_status status; + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *dod; + int i; + u64 *ret = NULL; + + *count = 0; + + status = acpi_evaluate_object(handle, "_DOD", NULL, ); + + if (ACPI_FAILURE(status)) + return NULL; + + dod = buf.pointer; + + if (dod == NULL || dod->type != ACPI_TYPE_PACKAGE) + goto done; + + ret = kmalloc_array(dod->package.count, sizeof(*ret), GFP_KERNEL); + if (ret == NULL) + goto done; + + for (i = 0; i < dod->package.count; i++) { + if (dod->package.elements[i].type != ACPI_TYPE_INTEGER) + continue; + ret[*count] = dod->package.elements[i].integer.value; + (*count)++; + } + +done: + kfree(buf.pointer); + return ret; +} + +static void *do_acpi_ddc(acpi_handle handle) +{ + int i; + void *ret = NULL; + + /* +* The _DDC spec defines an integer argument for specifying the size of +* the EDID to be retrieved. A value of 1 requests a 128-byte EDID, and +* a value of 2 requests a 256-byte EDID. Attempt the larger read first. 
+*/ + for (i = 2; i >= 1; i--) { + struct acpi_buffer out = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object arg = { ACPI_TYPE_INTEGER }; + struct acpi_object_list in = { 1, }; + union acpi_object *edid; + acpi_status status; + + arg.integer.value = i; + status = acpi_evaluate_object(handle, "_DDC", , ); + edid = out.pointer; + + if (ACPI_SUCCESS(status)) + ret = edid->buffer.pointer; + + kfree(edid); + + if (ret) + break; + } + + return ret; +} + +static struct edid *first_edid_from_acpi_ddc(struct pci_dev *pdev) +{ + acpi_handle handle; + acpi_status status; + struct acpi_device *device = NULL; + struct edid *ret = NULL; + int num_dod_entries; + u64 *dod_entries = NULL; + struct list_head *node, *next; + + handle = ACPI_HANDLE(>dev); + if (handle == NULL) + return NULL; + + dod_entries = get_dod_entries(handle, _dod_entries); + if (dod_entries == NULL || num_dod_entries == 0) + goto done; + + status = acpi_bus_get_device(handle, ); + if (ACPI_FAILURE(status) || device == NULL) + goto done; + + list_for_each_safe(node, next, >children) { + struct acpi_device *child; + u64 adr; + int i; + + child = list_entry(node, struct acpi_device, node); + if (child == NULL) + continue; + + status = acpi_evaluate_integer(child->handle, "_ADR", NULL, + ); + if (ACPI_FAILURE(status)) + continue; + + for (i = 0; i < num_dod_entries; i++) { + if (adr == dod_entries[i]) { + ret = do_acpi_ddc(child->handle); + + if
[PATCH 0/4] drm: add support for retrieving EDID via ACPI _DDC
Some notebook systems provide the EDID for the internal panel via the _DDC method in ACPI, instead of or in addition to providing the EDID via DDC on LVDS/eDP. Add a DRM helper to search for an ACPI _DDC method under the ACPI namespace for each VGA/3D controller, and return the first EDID successfully retrieved via _DDC. Update the i915, nouveau, and radeon DRM-KMS drivers to fall back to retrieving the EDID via ACPI _DDC on notebook internal display panels after failing to retrieve an EDID via other means. This is useful for retrieving an internal panel's EDID both on hybrid graphics systems with muxed display output, when the display is muxed away, as well as on a small number of non-muxed and/or non-hybrid systems where ACPI _DDC is the only means of accessing the EDID for the internal panel. Daniel Dadap (4): drm: retrieve EDID via ACPI _DDC method i915: fall back to ACPI EDID retrieval nouveau: fall back to ACPI EDID retrieval radeon: fall back to ACPI EDID retrieval drivers/gpu/drm/drm_edid.c | 161 drivers/gpu/drm/i915/display/intel_dp.c | 8 +- drivers/gpu/drm/i915/display/intel_lvds.c | 4 + drivers/gpu/drm/nouveau/nouveau_connector.c | 6 + drivers/gpu/drm/radeon/radeon_combios.c | 6 +- include/drm/drm_edid.h | 1 + 6 files changed, 182 insertions(+), 4 deletions(-) -- 2.18.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/4] i915: fall back to ACPI EDID retrieval
Fall back to retrieving the EDID via the ACPI _DDC method, when present for notebook internal panels, when EDID retrieval via the standard EDID paths is unsuccessful. Signed-off-by: Daniel Dadap --- drivers/gpu/drm/i915/display/intel_dp.c | 8 +++- drivers/gpu/drm/i915/display/intel_lvds.c | 4 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 804b1d966f66..ff402cef8183 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -5657,6 +5657,7 @@ static struct edid * intel_dp_get_edid(struct intel_dp *intel_dp) { struct intel_connector *intel_connector = intel_dp->attached_connector; + struct edid *edid; /* use cached edid if we have one */ if (intel_connector->edid) { @@ -5666,8 +5667,13 @@ intel_dp_get_edid(struct intel_dp *intel_dp) return drm_edid_duplicate(intel_connector->edid); } else - return drm_get_edid(_connector->base, + edid = drm_get_edid(_connector->base, _dp->aux.ddc); + + if (!edid && intel_dp_is_edp(intel_dp)) + edid = drm_get_edid_acpi(); + + return edid; } static void diff --git a/drivers/gpu/drm/i915/display/intel_lvds.c b/drivers/gpu/drm/i915/display/intel_lvds.c index 9a067effcfa0..811eea3f5d9f 100644 --- a/drivers/gpu/drm/i915/display/intel_lvds.c +++ b/drivers/gpu/drm/i915/display/intel_lvds.c @@ -946,6 +946,10 @@ void intel_lvds_init(struct drm_i915_private *dev_priv) else edid = drm_get_edid(connector, intel_gmbus_get_adapter(dev_priv, pin)); + + if (!edid) + edid = drm_get_edid_acpi(); + if (edid) { if (drm_add_edid_modes(connector, edid)) { drm_connector_update_edid_property(connector, -- 2.18.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 4/4] radeon: fall back to ACPI EDID retrieval
Fall back to retrieving the EDID via the ACPI _DDC method, when present for notebook internal panels, when retrieving BIOS-embedded EDIDs. Signed-off-by: Daniel Dadap --- drivers/gpu/drm/radeon/radeon_combios.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c index c3e49c973812..de801d9fca54 100644 --- a/drivers/gpu/drm/radeon/radeon_combios.c +++ b/drivers/gpu/drm/radeon/radeon_combios.c @@ -401,9 +401,8 @@ bool radeon_combios_check_hardcoded_edid(struct radeon_device *rdev) struct edid * radeon_bios_get_hardcoded_edid(struct radeon_device *rdev) { - struct edid *edid; - if (rdev->mode_info.bios_hardcoded_edid) { + struct edid *edid; edid = kmalloc(rdev->mode_info.bios_hardcoded_edid_size, GFP_KERNEL); if (edid) { memcpy((unsigned char *)edid, @@ -412,7 +411,8 @@ radeon_bios_get_hardcoded_edid(struct radeon_device *rdev) return edid; } } - return NULL; + + return drm_get_edid_acpi(); } static struct radeon_i2c_bus_rec combios_setup_i2c_bus(struct radeon_device *rdev, -- 2.18.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 3/4] nouveau: fall back to ACPI EDID retrieval
Fall back to retrieving the EDID via the ACPI _DDC method, when present for notebook internal panels, when EDID retrieval via the standard EDID paths is unsuccessful. Signed-off-by: Daniel Dadap --- drivers/gpu/drm/nouveau/nouveau_connector.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index 9a9a7f5003d3..95836a02a06b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -581,6 +581,12 @@ nouveau_connector_detect(struct drm_connector *connector, bool force) else nv_connector->edid = drm_get_edid(connector, i2c); + if (!nv_connector->edid && + (nv_connector->type == DCB_CONNECTOR_LVDS || + nv_connector->type == DCB_CONNECTOR_eDP)) { + nv_connector->edid = drm_get_edid_acpi(); + } + drm_connector_update_edid_property(connector, nv_connector->edid); if (!nv_connector->edid) { -- 2.18.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit
[AMD Public Use] Hi Tianci, My point is that, since the new patch introduces a local adev variable, the other places in the same function where smu->adev is used should be replaced by the new local adev as well. Otherwise, the code looks inconsistent from a coding style perspective. Regards, Guchun -Original Message- From: Yin, Tianci (Rico) Sent: Tuesday, July 28, 2020 2:48 PM To: Chen, Guchun ; amd-gfx@lists.freedesktop.org Cc: Xu, Feifei ; Tuikov, Luben ; Hesik, Christopher ; Deucher, Alexander ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth ; Zhang, Hawking Subject: RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit Hi Guchun, Since the adev variable invoked a few times, local adev make the code more concise. Thanks! Rico -Original Message- From: Chen, Guchun Sent: Tuesday, July 28, 2020 2:26 PM To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org Cc: Xu, Feifei ; Yin, Tianci (Rico) ; Tuikov, Luben ; Hesik, Christopher ; Deucher, Alexander ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth ; Zhang, Hawking Subject: RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit [AMD Public Use] One minor comment. Regards, Guchun -Original Message- From: amd-gfx On Behalf Of Tianci Yin Sent: Tuesday, July 28, 2020 1:27 PM To: amd-gfx@lists.freedesktop.org Cc: Xu, Feifei ; Yin, Tianci (Rico) ; Tuikov, Luben ; Hesik, Christopher ; Deucher, Alexander ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth ; Zhang, Hawking Subject: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit From: "Tianci.Yin" On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, reconfigure the golden settings after GFXOFF exit. 
Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9 Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 55463e7a11e2..5da0436d41e0 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle, struct smu_context *smu = (struct smu_context*)(handle); struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm); + struct amdgpu_device *adev = smu->adev; if (!smu->is_apu && !smu_dpm_ctx->dpm_context) return -EINVAL; @@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle, amdgpu_device_ip_set_clockgating_state(smu->adev, [Guchun]Use the local adev instead of smu->adev? AMD_IP_BLOCK_TYPE_GFX, AMD_CG_STATE_UNGATE); + + if (adev->asic_type >= CHIP_NAVI10 && + adev->asic_type <= CHIP_NAVI12 && + (adev->pm.pp_feature & PP_GFXOFF_MASK)) { + if (adev->gfx.funcs->init_spm_golden) { + dev_dbg(adev->dev,"GFXOFF exited, re-init SPM golden settings\n"); + amdgpu_gfx_init_spm_golden(adev); + } else + dev_warn(adev->dev,"Callback init_spm_golden is NULL\n"); + } } } else { /* exit umd pstate, restore level, enable gfx cg*/ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cguchun.chen%40amd.com%7Ce6176c766fe747a6fe1a08d832b6ee3c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637315108573876551sdata=43GHkX%2FCRLsfMmNxurLMIZy4l4ElUB2tnslPyfT7NJg%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu/dc: Stop dma_resv_lock inversion in commit_tail
Am 27.07.20 um 23:30 schrieb Daniel Vetter: Trying to grab dma_resv_lock while in commit_tail before we've done all the code that leads to the eventual signalling of the vblank event (which can be a dma_fence) is deadlock-y. Don't do that. Here the solution is easy because just grabbing locks to read something races anyway. We don't need to bother, READ_ONCE is equivalent. And avoids the locking issue. v2: Also take into account tmz_surface boolean, plus just delete the old code. Cc: linux-me...@vger.kernel.org Cc: linaro-mm-...@lists.linaro.org Cc: linux-r...@vger.kernel.org Cc: amd-gfx@lists.freedesktop.org Cc: intel-...@lists.freedesktop.org Cc: Chris Wilson Cc: Maarten Lankhorst Cc: Christian König Signed-off-by: Daniel Vetter --- DC-folks, I think this split out patch from my series here https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F20200707201229.472834-1-daniel.vetter%40ffwll.ch%2Fdata=02%7C01%7Cchristian.koenig%40amd.com%7C8a4f5736682a4b5c943e08d832747ab1%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637314823145521840sdata=qd7Nrox62Lr%2FXWbJJFVskg9RYL4%2FoRVCFjR6rUDMA5E%3Dreserved=0 should be ready for review/merging. 
I fixed it up a bit so that it's not just a gross hack :-) Cheers, Daniel --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 19 ++- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 21ec64fe5527..a20b62b1f2ef 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6959,20 +6959,13 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, DRM_ERROR("Waiting for fences timed out!"); /* -* TODO This might fail and hence better not used, wait -* explicitly on fences instead -* and in general should be called for -* blocking commit to as per framework helpers +* We cannot reserve buffers here, which means the normal flag +* access functions don't work. Paper over this with READ_ONCE, +* but maybe the flags are invariant enough that not even that +* would be needed. */ - r = amdgpu_bo_reserve(abo, true); - if (unlikely(r != 0)) - DRM_ERROR("failed to reserve buffer before flip\n"); - - amdgpu_bo_get_tiling_flags(abo, _flags); - - tmz_surface = amdgpu_bo_encrypted(abo); - - amdgpu_bo_unreserve(abo); + tiling_flags = READ_ONCE(abo->tiling_flags); + tmz_surface = READ_ONCE(abo->flags) & AMDGPU_GEM_CREATE_ENCRYPTED; Yeah, the abo->flags are mostly fixed after creation, especially the encrypted flag can't change or we corrupt page table tables. So that should work fine. Anybody who picks this up feel free to add an Reviewed-by: Christian König . Regards, Christian. fill_dc_plane_info_and_addr( dm->adev, new_plane_state, tiling_flags, ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v2)
[AMD Official Use Only - Internal Distribution Only] Thanks Felix I reworked my patch with your suggestion and I can get queues evenly cross pipes, e.g.: modprobe amdgpu num_kcq=6 [ 409.878557] amdgpu :00:07.0: amdgpu: ring comp_1.0.0 uses VM inv eng 1 on hub 0 [ 409.878559] amdgpu :00:07.0: amdgpu: ring comp_1.1.0 uses VM inv eng 4 on hub 0 [ 409.878561] amdgpu :00:07.0: amdgpu: ring comp_1.2.0 uses VM inv eng 5 on hub 0 [ 409.878563] amdgpu :00:07.0: amdgpu: ring comp_1.3.0 uses VM inv eng 6 on hub 0 [ 409.878565] amdgpu :00:07.0: amdgpu: ring comp_1.0.1 uses VM inv eng 7 on hub 0 [ 409.878567] amdgpu :00:07.0: amdgpu: ring comp_1.1.1 uses VM inv eng 8 on hub 0 [ 409.878568] amdgpu :00:07.0: amdgpu: ring kiq_2.1.0 uses VM inv eng 9 on hub 0 Please review my patch upcoming _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: Kuehling, Felix Sent: Tuesday, July 28, 2020 7:33 AM To: amd-gfx@lists.freedesktop.org; Liu, Monk Subject: Re: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v2) Am 2020-07-27 um 6:47 a.m. 
schrieb Monk Liu: > what: > the MQD's save and restore of kernel compute queues cost lots of > clocks during world switch which impacts a lot to multi-VF performance > > how: > introduce a paramter to control the number of kernel compute queues to > avoid performance drop if there is no kernel compute queue needed > > notes: > this paramter only affects gfx 8/9/10 > > TODO: > in the future we will let hypervisor driver to set this paramter > automatically thus no need for user to configure it through modprobe > in virtual machine > > Signed-off-by: Monk Liu > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h| 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 + > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 4 > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 27 +- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 +++-- > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 29 ++-- > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 31 > +++--- > 7 files changed, 71 insertions(+), 56 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index e97c088..71a3d6a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -201,6 +201,7 @@ extern int amdgpu_si_support; #ifdef > CONFIG_DRM_AMDGPU_CIK extern int amdgpu_cik_support; #endif > +extern int amdgpu_num_kcq_user_set; > > #define AMDGPU_VM_MAX_NUM_CTX4096 > #define AMDGPU_SG_THRESHOLD(256*1024*1024) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 62ecac9..18b93ef 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct > amdgpu_device *adev) > > amdgpu_gmc_tmz_set(adev); > > +if (amdgpu_num_kcq_user_set > 8 || amdgpu_num_kcq_user_set < 0) { > +amdgpu_num_kcq_user_set = 8; > +dev_warn(adev-dev, "set KCQ number to 8 due to invalid paramter provided by > user\n"); > +} > + > return 
0; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 6291f5f..03a94e9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -150,6 +150,7 @@ int amdgpu_noretry; int amdgpu_force_asic_type = > -1; int amdgpu_tmz = 0; int amdgpu_reset_method = -1; /* auto */ > +int amdgpu_num_kcq_user_set = 8; > > struct amdgpu_mgpu_info mgpu_info = { > .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), > @@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444); > MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), > 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)"); > module_param_named(reset_method, amdgpu_reset_method, int, 0444); > > +MODULE_PARM_DESC(num_kcq, "number of KCQ user want to setup (8 if set > +to greater than 8 or less than 0, only affect gfx 8+)"); > +module_param_named(num_kcq, amdgpu_num_kcq_user_set, int, 0444); > + > static const struct pci_device_id pciidlist[] = { #ifdef > CONFIG_DRM_AMDGPU_SI > {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff > --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index 8eff017..0b59049 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -202,7 +202,7 @@ bool > amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev, > > void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev) { > -int i, queue, pipe, mec; > +int i, queue, pipe, mec, j = 0; > bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev); > > /*
Re: [PATCH 4/4] radeon: fall back to ACPI EDID retrieval
Am 27.07.20 um 22:53 schrieb Daniel Dadap: Fall back to retrieving the EDID via the ACPI _DDC method, when present for notebook internal panels, when retrieving BIOS-embedded EDIDs. Signed-off-by: Daniel Dadap --- drivers/gpu/drm/radeon/radeon_combios.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c index c3e49c973812..de801d9fca54 100644 --- a/drivers/gpu/drm/radeon/radeon_combios.c +++ b/drivers/gpu/drm/radeon/radeon_combios.c @@ -401,9 +401,8 @@ bool radeon_combios_check_hardcoded_edid(struct radeon_device *rdev) struct edid * radeon_bios_get_hardcoded_edid(struct radeon_device *rdev) { - struct edid *edid; - if (rdev->mode_info.bios_hardcoded_edid) { + struct edid *edid; That's an unrelated and incorrect style change. You need a blank line after declaration. edid = kmalloc(rdev->mode_info.bios_hardcoded_edid_size, GFP_KERNEL); if (edid) { memcpy((unsigned char *)edid, @@ -412,7 +411,8 @@ radeon_bios_get_hardcoded_edid(struct radeon_device *rdev) return edid; } } - return NULL; + + return drm_get_edid_acpi(); In general a good idea, but I'm wondering if we should really do this so unconditionally here. Regards, Christian. } static struct radeon_i2c_bus_rec combios_setup_i2c_bus(struct radeon_device *rdev, ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit
Hi Guchun, Since the adev variable invoked a few times, local adev make the code more concise. Thanks! Rico -Original Message- From: Chen, Guchun Sent: Tuesday, July 28, 2020 2:26 PM To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org Cc: Xu, Feifei ; Yin, Tianci (Rico) ; Tuikov, Luben ; Hesik, Christopher ; Deucher, Alexander ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth ; Zhang, Hawking Subject: RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit [AMD Public Use] One minor comment. Regards, Guchun -Original Message- From: amd-gfx On Behalf Of Tianci Yin Sent: Tuesday, July 28, 2020 1:27 PM To: amd-gfx@lists.freedesktop.org Cc: Xu, Feifei ; Yin, Tianci (Rico) ; Tuikov, Luben ; Hesik, Christopher ; Deucher, Alexander ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth ; Zhang, Hawking Subject: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit From: "Tianci.Yin" On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, reconfigure the golden settings after GFXOFF exit. Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9 Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 55463e7a11e2..5da0436d41e0 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle, struct smu_context *smu = (struct smu_context*)(handle); struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm); + struct amdgpu_device *adev = smu->adev; if (!smu->is_apu && !smu_dpm_ctx->dpm_context) return -EINVAL; @@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle, amdgpu_device_ip_set_clockgating_state(smu->adev, [Guchun]Use the local adev instead of smu->adev? 
AMD_IP_BLOCK_TYPE_GFX, AMD_CG_STATE_UNGATE); + + if (adev->asic_type >= CHIP_NAVI10 && + adev->asic_type <= CHIP_NAVI12 && + (adev->pm.pp_feature & PP_GFXOFF_MASK)) { + if (adev->gfx.funcs->init_spm_golden) { + dev_dbg(adev->dev,"GFXOFF exited, re-init SPM golden settings\n"); + amdgpu_gfx_init_spm_golden(adev); + } else + dev_warn(adev->dev,"Callback init_spm_golden is NULL\n"); + } } } else { /* exit umd pstate, restore level, enable gfx cg*/ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cguchun.chen%40amd.com%7Ce6176c766fe747a6fe1a08d832b6ee3c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637315108573876551sdata=43GHkX%2FCRLsfMmNxurLMIZy4l4ElUB2tnslPyfT7NJg%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit
[AMD Public Use] One minor comment. Regards, Guchun -Original Message- From: amd-gfx On Behalf Of Tianci Yin Sent: Tuesday, July 28, 2020 1:27 PM To: amd-gfx@lists.freedesktop.org Cc: Xu, Feifei ; Yin, Tianci (Rico) ; Tuikov, Luben ; Hesik, Christopher ; Deucher, Alexander ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth ; Zhang, Hawking Subject: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit From: "Tianci.Yin" On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, reconfigure the golden settings after GFXOFF exit. Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9 Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 55463e7a11e2..5da0436d41e0 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle, struct smu_context *smu = (struct smu_context*)(handle); struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm); + struct amdgpu_device *adev = smu->adev; if (!smu->is_apu && !smu_dpm_ctx->dpm_context) return -EINVAL; @@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle, amdgpu_device_ip_set_clockgating_state(smu->adev, [Guchun]Use the local adev instead of smu->adev? 
AMD_IP_BLOCK_TYPE_GFX, AMD_CG_STATE_UNGATE); + + if (adev->asic_type >= CHIP_NAVI10 && + adev->asic_type <= CHIP_NAVI12 && + (adev->pm.pp_feature & PP_GFXOFF_MASK)) { + if (adev->gfx.funcs->init_spm_golden) { + dev_dbg(adev->dev,"GFXOFF exited, re-init SPM golden settings\n"); + amdgpu_gfx_init_spm_golden(adev); + } else + dev_warn(adev->dev,"Callback init_spm_golden is NULL\n"); + } } } else { /* exit umd pstate, restore level, enable gfx cg*/ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cguchun.chen%40amd.com%7Ce6176c766fe747a6fe1a08d832b6ee3c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637315108573876551&sdata=43GHkX%2FCRLsfMmNxurLMIZy4l4ElUB2tnslPyfT7NJg%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit
[AMD Official Use Only - Internal Distribution Only] Thanks Feifei! Rico -----Original Message----- From: Xu, Feifei Sent: Tuesday, July 28, 2020 2:21 PM To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org Cc: Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Hesik, Christopher ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth ; Yin, Tianci (Rico) Subject: RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit [AMD Official Use Only - Internal Distribution Only] Series is Reviewed-by: Feifei Xu -----Original Message----- From: Tianci Yin Sent: Tuesday, July 28, 2020 1:27 PM To: amd-gfx@lists.freedesktop.org Cc: Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Xu, Feifei ; Hesik, Christopher ; Swamy, Manjunatha ; Quan, Evan ; Feng, Kenneth ; Yin, Tianci (Rico) Subject: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit From: "Tianci.Yin" On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, reconfigure the golden settings after GFXOFF exit. 
Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9 Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 55463e7a11e2..5da0436d41e0 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle, struct smu_context *smu = (struct smu_context*)(handle); struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm); +struct amdgpu_device *adev = smu->adev; if (!smu->is_apu && !smu_dpm_ctx->dpm_context) return -EINVAL; @@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle, amdgpu_device_ip_set_clockgating_state(smu->adev, AMD_IP_BLOCK_TYPE_GFX, AMD_CG_STATE_UNGATE); + +if (adev->asic_type >= CHIP_NAVI10 && +adev->asic_type <= CHIP_NAVI12 && +(adev->pm.pp_feature & PP_GFXOFF_MASK)) { +if (adev->gfx.funcs->init_spm_golden) { +dev_dbg(adev->dev,"GFXOFF exited, re-init SPM golden settings\n"); +amdgpu_gfx_init_spm_golden(adev); +} else +dev_warn(adev->dev,"Callback init_spm_golden is NULL\n"); +} } } else { /* exit umd pstate, restore level, enable gfx cg*/ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx