RE: [PATCH] drm/amdgpu: update GC golden setting for navy_flounder

2020-07-28 Thread Zhou1, Tao
[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Tao Zhou 

> -----Original Message-----
> From: Jiansong Chen 
> Sent: Wednesday, July 29, 2020 12:02 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao ; Chen, Jiansong (Simon)
> 
> Subject: [PATCH] drm/amdgpu: update GC golden setting for navy_flounder
>
> Update GC golden setting for navy_flounder.
>
> Signed-off-by: Jiansong Chen 
> Change-Id: Ia7e82616b0be48f397c73b015823ac10ef907f08
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index db9f1e89a0f8..ca16f01956d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -3127,7 +3127,7 @@ static const struct soc15_reg_golden
> golden_settings_gc_10_3_2[] =
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_SPI_RA0_CLK_CTRL,
> 0xff7f0fff, 0x3100),
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_SPI_RA1_CLK_CTRL,
> 0xff7f0fff, 0x7e000100),
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmCPF_GCR_CNTL, 0x0007,
> 0xc000),
> -SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x,
> 0x0200),
> +SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x,
> 0x0280),
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x,
> 0x0080),
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_EXCEPTION_CONTROL,
> 0x7fff0f1f, 0x00b8),
>  SOC15_REG_GOLDEN_VALUE(GC, 0,
> mmGCR_GENERAL_CNTL_Sienna_Cichlid, 0x1ff1, 0x0500), @@ -3158,7
> +3158,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3_2[]
> =
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER7_SELECT,
> 0xf0f001ff, 0x),
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER8_SELECT,
> 0xf0f001ff, 0x),
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER9_SELECT,
> 0xf0f001ff, 0x),
> -SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0x,
> 0x010b),
> +SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0xfff7,
> 0x0103),
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0xffbf,
> 0x00a0),
>  SOC15_REG_GOLDEN_VALUE(GC, 0, mmVGT_GS_MAX_WAVE_ID,
> 0x0fff, 0x03ff)  };
> --
> 2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: update GC golden setting for navy_flounder

2020-07-28 Thread Jiansong Chen
Update GC golden setting for navy_flounder.

Signed-off-by: Jiansong Chen 
Change-Id: Ia7e82616b0be48f397c73b015823ac10ef907f08
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index db9f1e89a0f8..ca16f01956d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3127,7 +3127,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_2[] =
SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_SPI_RA0_CLK_CTRL, 0xff7f0fff, 
0x3100),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_SPI_RA1_CLK_CTRL, 0xff7f0fff, 
0x7e000100),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmCPF_GCR_CNTL, 0x0007, 0xc000),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0200),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG3, 0x, 0x0280),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_DEBUG4, 0x, 0x0080),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmDB_EXCEPTION_CONTROL, 0x7fff0f1f, 
0x00b8),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmGCR_GENERAL_CNTL_Sienna_Cichlid, 
0x1ff1, 0x0500),
@@ -3158,7 +3158,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_2[] =
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER7_SELECT, 0xf0f001ff, 
0x),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER8_SELECT, 0xf0f001ff, 
0x),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER9_SELECT, 0xf0f001ff, 
0x),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0x, 0x010b),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0xfff7, 0x0103),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0xffbf, 0x00a0),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmVGT_GS_MAX_WAVE_ID, 0x0fff, 
0x03ff)
 };
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 11/12] drm/amdgpu: disable page reservation when amdgpu_bad_page_threshold = 0

2020-07-28 Thread Guchun Chen
When amdgpu_bad_page_threshold = 0, bad page reservation is
skipped both in the UMC ECC IRQ handler and in the page retirement
path called from the sync flood ISR.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0b7317349bde..f47909d6a95b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1678,7 +1678,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device 
*adev)
int ret = 0;
 
/* no bad page record, skip eeprom access */
-   if (!control->num_recs)
+   if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
return ret;
 
bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
@@ -1782,7 +1782,8 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device 
*adev)
struct amdgpu_bo *bo = NULL;
int i, ret = 0;
 
-   if (!con || !con->eh_data)
+   /* Not reserve bad page when amdgpu_bad_page_threshold == 0. */
+   if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0))
return 0;
 
mutex_lock(>recovery_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index af1b1ccf613c..262baf0f61ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -125,8 +125,9 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device 
*adev,
"detected in UMC block\n",
err_data->ue_count);
 
-   if (err_data->err_addr_cnt &&
-   amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+   if ((amdgpu_bad_page_threshold != 0) &&
+   err_data->err_addr_cnt &&
+   amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
err_data->err_addr_cnt))
dev_warn(adev->dev, "Failed to add ras bad page!\n");
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 09/12] drm/amdgpu: add one definition for RAS's sysfs/debugfs name

2020-07-28 Thread Guchun Chen
Add one definition for the RAS module's FS name. It's used
in both debugfs and sysfs cases.

v2: Use static variable instead of macro definition.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c519948ebcff..0328f7882199 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,8 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 
+static const char *RAS_FS_NAME = "ras";
+
 const char *ras_error_string[] = {
"none",
"parity",
@@ -1037,7 +1039,7 @@ static int amdgpu_ras_sysfs_create_feature_node(struct 
amdgpu_device *adev)
NULL
};
struct attribute_group group = {
-   .name = "ras",
+   .name = RAS_FS_NAME,
.attrs = attrs,
 #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
.bin_attrs = bin_attrs,
@@ -1080,7 +1082,7 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct 
amdgpu_device *adev)
NULL
};
struct attribute_group group = {
-   .name = "ras",
+   .name = RAS_FS_NAME,
.attrs = attrs,
 #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
.bin_attrs = bin_attrs,
@@ -1117,7 +1119,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
 
if (sysfs_add_file_to_group(>dev->kobj,
>sysfs_attr.attr,
-   "ras")) {
+   RAS_FS_NAME)) {
put_obj(obj);
return -EINVAL;
}
@@ -1137,7 +1139,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
 
sysfs_remove_file_from_group(>dev->kobj,
>sysfs_attr.attr,
-   "ras");
+   RAS_FS_NAME);
obj->attr_inuse = 0;
put_obj(obj);
 
@@ -1183,7 +1185,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct 
amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct drm_minor *minor = adev->ddev->primary;
 
-   con->dir = debugfs_create_dir("ras", minor->debugfs_root);
+   con->dir = debugfs_create_dir(RAS_FS_NAME,
+   minor->debugfs_root);
debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
adev, _ras_debugfs_ctrl_ops);
debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 12/12] drm/amdgpu: update eeprom once specifying one bigger threshold

2020-07-28 Thread Guchun Chen
During driver probe, when the bad GPU tag is found in the eeprom
during i2c init (the tag was set when the reported bad pages reached
the bad page threshold in the driver's previous run), there are
several strategies to deal with the situation:

1. when the module parameter amdgpu_bad_page_threshold = 0,
that means page retirement feature is disabled, so just resetting
the eeprom is fine.
2. When amdgpu_bad_page_threshold is not 0 and the user has
set a bigger valid value so that the current boot can
succeed, correct the eeprom header tag and do not break booting.
3. In all other cases, the driver's probe is aborted.

v2: Just update eeprom header tag instead of resetting the whole
table header when user sets one bigger threshold data.

Signed-off-by: Guchun Chen 
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 30 +--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index be895dc2d739..c6c47c665f6f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -216,6 +216,24 @@ static bool __validate_tbl_checksum(struct 
amdgpu_ras_eeprom_control *control,
return true;
 }
 
+static int amdgpu_ras_eeprom_correct_header_tag(
+   struct amdgpu_ras_eeprom_control *control,
+   uint32_t header)
+{
+   unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE];
+   struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr;
+   int ret = 0;
+
+   memset(buff, 0, EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE);
+
+   mutex_lock(>tbl_mutex);
+   hdr->header = header;
+   ret = __update_table_header(control, buff);
+   mutex_unlock(>tbl_mutex);
+
+   return ret;
+}
+
 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 {
unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 
0 };
@@ -248,6 +266,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control 
*control,
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 
0 };
struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
struct i2c_msg msg = {
.addr   = 0,
.flags  = I2C_M_RD,
@@ -287,9 +306,16 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
 
} else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
(amdgpu_bad_page_threshold != 0)) {
-   *exceed_err_limit = true;
-   DRM_ERROR("Exceeding the bad_page_threshold parameter, "
+   if (ras->bad_page_cnt_threshold > control->num_recs) {
+   DRM_INFO("Using one valid bigger bad page threshold "
+   "and correcting eeprom header tag.\n");
+   ret = amdgpu_ras_eeprom_correct_header_tag(control,
+   EEPROM_TABLE_HDR_VAL);
+   } else {
+   *exceed_err_limit = true;
+   DRM_ERROR("Exceeding the bad_page_threshold parameter, "
"disabling the GPU.\n");
+   }
} else {
DRM_INFO("Creating new EEPROM table");
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 08/12] drm/amdgpu: restore ras flags when user resets eeprom

2020-07-28 Thread Guchun Chen
RAS flags need to be reset as well when the user requests
a clean eeprom.

v2: RAS flags shall be restored after eeprom reset succeeds.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fab6f8d6bee6..c519948ebcff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -368,12 +368,19 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
 static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char 
__user *buf,
size_t size, loff_t *pos)
 {
-   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   struct amdgpu_device *adev =
+   (struct amdgpu_device *)file_inode(f)->i_private;
int ret;
 
-   ret = amdgpu_ras_eeprom_reset_table(>psp.ras.ras->eeprom_control);
+   ret = amdgpu_ras_eeprom_reset_table(
+   &(amdgpu_ras_get_context(adev)->eeprom_control));
 
-   return ret == 1 ? size : -EIO;
+   if (ret == 1) {
+   amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
+   return size;
+   } else {
+   return -EIO;
+   }
 }
 
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 10/12] drm/amdgpu: decouple sysfs creating of bad page node

2020-07-28 Thread Guchun Chen
Bad page information should not be exposed by sysfs when
bad page retirement is disabled, so decouple it from ras
sysfs group creating, and add one guard before creating.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 71 -
 1 file changed, 46 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0328f7882199..0b7317349bde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1027,6 +1027,35 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct 
device *dev,
return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 }
 
+static void amdgpu_ras_sysfs_add_badpage_node(struct amdgpu_device *adev)
+{
+#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct attribute_group group;
+   struct bin_attribute *bin_attrs[] = {
+   >badpages_attr,
+   NULL,
+   };
+
+   con->badpages_attr = (struct bin_attribute) {
+   .attr = {
+   .name = "gpu_vram_bad_pages",
+   .mode = S_IRUGO,
+   },
+   .size = 0,
+   .private = NULL,
+   .read = amdgpu_ras_sysfs_badpages_read,
+   };
+
+   group.name = RAS_FS_NAME;
+   group.bin_attrs = bin_attrs;
+
+   sysfs_bin_attr_init(bin_attrs[0]);
+
+   sysfs_update_group(>dev->kobj, );
+#endif
+}
+
 static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1034,16 +1063,9 @@ static int amdgpu_ras_sysfs_create_feature_node(struct 
amdgpu_device *adev)
>features_attr.attr,
NULL
};
-   struct bin_attribute *bin_attrs[] = {
-   >badpages_attr,
-   NULL
-   };
struct attribute_group group = {
.name = RAS_FS_NAME,
.attrs = attrs,
-#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
-   .bin_attrs = bin_attrs,
-#endif
};
 
con->features_attr = (struct device_attribute) {
@@ -1054,22 +1076,22 @@ static int amdgpu_ras_sysfs_create_feature_node(struct 
amdgpu_device *adev)
.show = amdgpu_ras_sysfs_features_read,
};
 
-   con->badpages_attr = (struct bin_attribute) {
-   .attr = {
-   .name = "gpu_vram_bad_pages",
-   .mode = S_IRUGO,
-   },
-   .size = 0,
-   .private = NULL,
-   .read = amdgpu_ras_sysfs_badpages_read,
-   };
-
sysfs_attr_init(attrs[0]);
-   sysfs_bin_attr_init(bin_attrs[0]);
 
return sysfs_create_group(>dev->kobj, );
 }
 
+static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
+{
+#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   sysfs_remove_file_from_group(>dev->kobj,
+   >badpages_attr.attr,
+   RAS_FS_NAME);
+#endif
+}
+
 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1077,16 +1099,9 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct 
amdgpu_device *adev)
>features_attr.attr,
NULL
};
-   struct bin_attribute *bin_attrs[] = {
-   >badpages_attr,
-   NULL
-   };
struct attribute_group group = {
.name = RAS_FS_NAME,
.attrs = attrs,
-#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
-   .bin_attrs = bin_attrs,
-#endif
};
 
sysfs_remove_group(>dev->kobj, );
@@ -1155,6 +1170,9 @@ static int amdgpu_ras_sysfs_remove_all(struct 
amdgpu_device *adev)
amdgpu_ras_sysfs_remove(adev, >head);
}
 
+   if (amdgpu_bad_page_threshold != 0)
+   amdgpu_ras_sysfs_remove_bad_page_node(adev);
+
amdgpu_ras_sysfs_remove_feature_node(adev);
 
return 0;
@@ -1283,6 +1301,9 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 {
amdgpu_ras_sysfs_create_feature_node(adev);
 
+   if (amdgpu_bad_page_threshold != 0)
+   amdgpu_ras_sysfs_add_badpage_node(adev);
+
return 0;
 }
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 02/12] drm/amdgpu: validate bad page threshold in ras

2020-07-28 Thread Guchun Chen
The bad page threshold value must lie in the range between
-1 and the max record length of the eeprom. It determines when
the saved bad pages exceed the threshold, so that the corresponding
actions can be taken.

v2: When using the default typical value, it should be min
value between typical value and eeprom max records length.

v3: drop the case of setting bad_page_cnt_threshold to
0xFFFFFFFF, as it confuses the user.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 48 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  3 ++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|  5 ++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|  2 +
 4 files changed, 58 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6f06e1214622..3c4c142e9d8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -69,6 +69,9 @@ const char *ras_block_string[] = {
 /* inject address is 52 bits */
 #defineRAS_UMC_INJECT_ADDR_LIMIT   (0x1ULL << 52)
 
+/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
+#define RAS_BAD_PAGE_RATE  (100 * 1024 * 1024ULL)
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1700,6 +1703,47 @@ static bool amdgpu_ras_check_bad_page(struct 
amdgpu_device *adev,
return ret;
 }
 
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+   uint32_t max_length)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int tmp_threshold = amdgpu_bad_page_threshold;
+   u64 val;
+
+   /*
+* Justification of value bad_page_cnt_threshold in ras structure
+*
+* Generally, -1 <= amdgpu_bad_page_threshold <= max record length
+* in eeprom, and introduce two scenarios accordingly.
+*
+* Bad page retirement enablement:
+*- If amdgpu_bad_page_threshold = -1,
+*  bad_page_cnt_threshold = typical value by formula.
+*
+*- When the value from user is 0 < amdgpu_bad_page_threshold <
+*  max record length in eeprom, use it directly.
+*
+* Bad page retirement disablement:
+*- If amdgpu_bad_page_threshold = 0, bad page retirement
+*  functionality is disabled, and bad_page_cnt_threshold will
+*  take no effect.
+*/
+
+   if (tmp_threshold < -1)
+   tmp_threshold = -1;
+   else if (tmp_threshold > max_length)
+   tmp_threshold = max_length;
+
+   if (tmp_threshold == -1) {
+   val = adev->gmc.mc_vram_size;
+   do_div(val, RAS_BAD_PAGE_RATE);
+   con->bad_page_cnt_threshold = min(lower_32_bits(val),
+   max_length);
+   } else {
+   con->bad_page_cnt_threshold = tmp_threshold;
+   }
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 {
@@ -1777,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
+   uint32_t max_eeprom_records_len = 0;
int ret;
 
if (con)
@@ -1795,6 +1840,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(>in_recovery, 0);
con->adev = adev;
 
+   max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+   amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
ret = amdgpu_ras_eeprom_init(>eeprom_control);
if (ret)
goto free;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b2667342cf67..4672649a9293 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -336,6 +336,9 @@ struct amdgpu_ras {
struct amdgpu_ras_eeprom_control eeprom_control;
 
bool error_query_ready;
+
+   /* bad page count threshold */
+   uint32_t bad_page_cnt_threshold;
 };
 
 struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index c0096097bbcf..a2c982b1eac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -499,6 +499,11 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
return ret == num ? 0 : -EIO;
 }
 
+inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
+{
+   return EEPROM_MAX_RECORD_NUM;
+}
+
 /* Used for testing if bugs encountered */
 #if 0
 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h 

[PATCH 06/12] drm/amdgpu: schedule ras recovery when reaching bad page threshold

2020-07-28 Thread Guchun Chen
Once the number of bad pages saved to eeprom reaches the configured
threshold, ras recovery will be issued to notify the user.

v2: Fix spelling typo.

Signed-off-by: Guchun Chen 
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 37 ++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 67995b66d7d4..d24bf65f6dd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
int i, ret = 0;
struct i2c_msg *msgs, *msg;
unsigned char *buffs, *buff;
+   bool sched_ras_recovery = false;
struct eeprom_table_record *record;
struct amdgpu_device *adev = to_amdgpu_device(control);
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS)
return 0;
@@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
goto free_buff;
}
 
+   /*
+* If saved bad pages number exceeds the bad page threshold for
+* the whole VRAM, update table header to mark the BAD GPU tag
+* and schedule one ras recovery after eeprom write is done,
+* this can avoid the missing for latest records.
+*
+* This new header will be picked up and checked in the bootup
+* by ras recovery, which may break bootup process to notify
+* user this GPU is in bad state and to retire such GPU for
+* further check.
+*/
+   if (write && (amdgpu_bad_page_threshold != 0) &&
+   ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
+   dev_warn(adev->dev,
+   "Saved bad pages(%d) reaches threshold value(%d).\n",
+   control->num_recs + num, ras->bad_page_cnt_threshold);
+   control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
+   sched_ras_recovery = true;
+   }
+
/* In case of overflow just start from beginning to not lose newest 
records */
if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > 
EEPROM_SIZE_BYTES))
control->next_addr = EEPROM_RECORD_START;
 
-
/*
 * TODO Currently makes EEPROM writes for each record, this creates
 * internal fragmentation. Optimized the code to do full page write of
@@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
__update_tbl_checksum(control, records, num, old_hdr_byte_sum);
 
__update_table_header(control, buffs);
+
+   if (sched_ras_recovery) {
+   /*
+* Before scheduling ras recovery, assert the related
+* flag first, which shall bypass common bad page
+* reservation execution in amdgpu_ras_reset_gpu.
+*/
+   amdgpu_ras_get_context(adev)->flags |=
+   AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
+
+   dev_warn(adev->dev, "Conduct ras recovery due to bad "
+   "page threshold reached.\n");
+   amdgpu_ras_reset_gpu(adev);
+   }
} else if (!__validate_tbl_checksum(control, records, num)) {
DRM_WARN("EEPROM Table checksum mismatch!");
/* TODO Uncomment when EEPROM read/write is relliable */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU

2020-07-28 Thread Guchun Chen
When retrieving bad gpu tag from eeprom, GPU init should
fail as the GPU needs to be retired for further check.

v2: Fix spelling typo, correct the condition to detect
bad gpu tag and refine error message.

v3: Refine function argument name.

v4: Fix missing check of returning value of i2c
initialization error case.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 ++-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2662cd7c8685..30af0dfee1a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
 * it should be called after amdgpu_device_ip_hw_init_phase2  since
 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 * for I2C communication which only true at this point.
-* recovery_init may fail, but it can free all resources allocated by
-* itself and its failure should not stop amdgpu init process.
+*
+* amdgpu_ras_recovery_init may fail, but the upper only cares the
+* failure from bad gpu situation and stop amdgpu init process
+* accordingly. For other failed cases, it will still release all
+* the resource and print error message, rather than returning one
+* negative value to upper level.
 *
 * Note: theoretically, this should be called before all vram 
allocations
 * to protect retired page from abusing
 */
-   amdgpu_ras_recovery_init(adev);
+   r = amdgpu_ras_recovery_init(adev);
+   if (r)
+   goto init_failed;
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3c4c142e9d8a..67d9d65b069e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0;
+   bool exc_err_limit = false;
int ret;
 
if (con)
@@ -1843,8 +1844,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
-   ret = amdgpu_ras_eeprom_init(>eeprom_control);
-   if (ret)
+   ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit);
+   /*
+* This calling fails when exc_err_limit is true or
+* ret != 0.
+*/
+   if (exc_err_limit || ret)
goto free;
 
if (con->eeprom_control.num_recs) {
@@ -1868,6 +1873,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 out:
dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
 
+   /*
+* Except error threshold exceeding case, other failure cases in this
+* function would not fail amdgpu driver init.
+*/
+   if (!exc_err_limit)
+   ret = 0;
+   else
+   ret = -EINVAL;
+
return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 35c0c849d49b..67995b66d7d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct 
amdgpu_ras_eeprom_control *control)
 
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+   bool *exceed_err_limit)
 {
int ret = 0;
struct amdgpu_device *adev = to_amdgpu_device(control);
@@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control 
*control)
.buf= buff,
};
 
+   *exceed_err_limit = false;
+
/* Verify i2c adapter is initialized */
if (!adev->pm.smu_i2c.algo)
return -ENOENT;
@@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control)
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
 control->num_recs);
 
+   } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
+   (amdgpu_bad_page_threshold != 0)) {
+   *exceed_err_limit = true;
+   

[PATCH 07/12] drm/amdgpu: break GPU recovery once it's in bad state

2020-07-28 Thread Guchun Chen
When the GPU executes recovery and retrieves the bad GPU tag
from the external EEPROM device, the recovery is aborted
and an error message is printed for the user's awareness.

v2: Refine warning message in threshold reaching case, and
fix spelling typo.

v3: Fix explicit calling of bad gpu.

v4: Rename function names.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 20 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 16 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  2 +
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 40 +++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|  4 ++
 5 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 30af0dfee1a1..c893d9adbab7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4139,8 +4139,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
 
amdgpu_fbdev_set_suspend(tmp_adev, 0);
 
-   /* must succeed. */
-   amdgpu_ras_resume(tmp_adev);
+   /*
+* The GPU enters bad state once faulty pages
+* by ECC has reached the threshold, and ras
+* recovery is scheduled next. So add one check
+* here to break recovery if it indeed exceeds
+* bad page threshold, and remind user to
+* retire this GPU or setting one bigger
+* bad_page_threshold value to fix this once
+* probing driver again.
+*/
+   if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+   /* must succeed. */
+   amdgpu_ras_resume(tmp_adev);
+   } else {
+   r = -EINVAL;
+   goto out;
+   }
 
/* Update PSP FW topology after reset */
if (hive && 
tmp_adev->gmc.xgmi.num_physical_nodes > 1)
@@ -4148,7 +4163,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
}
}
 
-
 out:
if (!r) {
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 64ae0742f385..fab6f8d6bee6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2206,3 +2206,19 @@ bool amdgpu_ras_need_emergency_restart(struct 
amdgpu_device *adev)
 
return false;
 }
+
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool exc_err_limit = false;
+
+   if (con && (amdgpu_bad_page_threshold != 0))
+   amdgpu_ras_eeprom_check_err_threshold(>eeprom_control,
+   _err_limit);
+
+   /*
+* We are only interested in variable exc_err_limit,
+* as it says if GPU is in bad state or not.
+*/
+   return exc_err_limit;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cf9f60202334..70a6fca73617 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
bool is_ce);
 
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
+
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index d24bf65f6dd7..be895dc2d739 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t 
curr_address)
return curr_address;
 }
 
+int amdgpu_ras_eeprom_check_err_threshold(
+   struct amdgpu_ras_eeprom_control *control,
+   bool *exceed_err_limit)
+{
+   struct amdgpu_device *adev = to_amdgpu_device(control);
+   unsigned char buff[EEPROM_ADDRESS_SIZE +
+   EEPROM_TABLE_HEADER_SIZE] = { 0 };
+   struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr;
+   struct i2c_msg msg = {

[PATCH 05/12] drm/amdgpu: skip bad page reservation once issuing from eeprom write

2020-07-28 Thread Guchun Chen
Once the ras recovery is issued from eeprom write itself,
bad page reservation should be ignored, otherwise, recursive
calling of writing to eeprom would happen.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 14 +++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 67d9d65b069e..64ae0742f385 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -62,8 +62,6 @@ const char *ras_block_string[] = {
 #define ras_err_str(i) (ras_error_string[ffs(i)])
 #define ras_block_str(i) (ras_block_string[i])
 
-#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  1
-#define AMDGPU_RAS_FLAG_INIT_NEED_RESET2
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
 
 /* inject address is 52 bits */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 4672649a9293..cf9f60202334 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -31,6 +31,10 @@
 #include "ta_ras_if.h"
 #include "amdgpu_ras_eeprom.h"
 
+#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  (0x1 << 0)
+#define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1)
+#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV (0x1 << 2)
+
 enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
AMDGPU_RAS_BLOCK__SDMA,
@@ -503,10 +507,14 @@ static inline int amdgpu_ras_reset_gpu(struct 
amdgpu_device *adev)
 {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-   /* save bad page to eeprom before gpu reset,
-* i2c may be unstable in gpu reset
+   /*
+* Save bad page to eeprom before gpu reset, i2c may be unstable
+* in gpu reset.
+*
+* Also, exclude the case when ras recovery issuer is
+* eeprom page write itself.
 */
-   if (in_task())
+   if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) && in_task())
amdgpu_ras_reserve_bad_pages(adev);
 
if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 03/12] drm/amdgpu: add bad gpu tag definition

2020-07-28 Thread Guchun Chen
This tag will be used for bad gpu detection in eeprom's access.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index a2c982b1eac6..35c0c849d49b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -46,6 +46,9 @@
 #define EEPROM_TABLE_HDR_VAL 0x414d4452
 #define EEPROM_TABLE_VER 0x0001
 
+/* Bad GPU tag ‘BADG’ */
+#define EEPROM_TABLE_HDR_BAD 0x42414447
+
 /* Assume 2 Mbit size */
 #define EEPROM_SIZE_BYTES 256000
 #define EEPROM_PAGE__SIZE_BYTES 256
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 00/12] BAD GPU retirement policy by total bad pages

2020-07-28 Thread Guchun Chen
The series is to enable/disable bad page feature and apply different
bad page reservation strategy by different bad page threshold
configurations.

When the saved bad pages written to eeprom reach the threshold,
one ras recovery will be issued immediately and the recovery will
fail to tell user that the GPU is BAD and needs to be retired for
further check or setting one valid bigger threshold value in next
driver's probe to skip corresponding check.

During bootup, similar bad page threshold check is conducted as
well when eeprom gets initialized, and it will possibly break boot
up for user's awareness.

When user sets bad_page_threshold=0 once probing driver, bad page
retirement feature is completely disabled, and driver has no chance to
process bad page information record and write it to eeprom.

Guchun Chen (12):
  drm/amdgpu: add bad page count threshold in module parameter
  drm/amdgpu: validate bad page threshold in ras
  drm/amdgpu: add bad gpu tag definition
  drm/amdgpu: break driver init process when it's bad GPU
  drm/amdgpu: skip bad page reservation once issuing from eeprom write
  drm/amdgpu: schedule ras recovery when reaching bad page threshold
  drm/amdgpu: break GPU recovery once it's in bad state
  drm/amdgpu: restore ras flags when user resets eeprom
  drm/amdgpu: add one definition for RAS's sysfs/debugfs name
  drm/amdgpu: decouple sysfs creating of bad page node
  drm/amdgpu: disable page reservation when amdgpu_bad_page_threshold =
0
  drm/amdgpu: update eeprom once specifying one bigger threshold

 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  32 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  11 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 186 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  19 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 121 +++-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|   9 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   |   5 +-
 8 files changed, 331 insertions(+), 53 deletions(-)

-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 01/12] drm/amdgpu: add bad page count threshold in module parameter

2020-07-28 Thread Guchun Chen
bad_page_threshold could be configured to enable/disable the
associated bad page retirement feature in RAS.

When it's -1, ras will use typical bad page failure value to
handle bad page retirement.

When it's 0, disable bad page retirement, and no bad page
will be recorded and saved.

For other valid value, driver will use this manual value
as the threshold value of total bad pages.

v2: correct documentation of this parameter.
v3: remove confused statement in documentation.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 06bfb8658dec..bb83ffb5e26a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -181,6 +181,7 @@ extern uint amdgpu_dm_abm_level;
 extern struct amdgpu_mgpu_info mgpu_info;
 extern int amdgpu_ras_enable;
 extern uint amdgpu_ras_mask;
+extern int amdgpu_bad_page_threshold;
 extern int amdgpu_async_gfx_ring;
 extern int amdgpu_mcbp;
 extern int amdgpu_discovery;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d28b95f721c4..820a28c9e957 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -161,6 +161,7 @@ struct amdgpu_mgpu_info mgpu_info = {
 };
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0x;
+int amdgpu_bad_page_threshold = -1;
 
 /**
  * DOC: vramlimit (int)
@@ -801,6 +802,16 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);
 MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = 
legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");
 module_param_named(reset_method, amdgpu_reset_method, int, 0444);
 
+/**
+ * DOC: bad_page_threshold (int)
+ * Bad page threshold is to specify the threshold value of faulty pages
+ * detected by RAS ECC, that may result in GPU entering bad status if total
+ * faulty pages by ECC exceed threshold value and leave it for user's further
+ * check.
+ */
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default 
typical value), 0 = disable bad page retirement)");
+module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
+
 static const struct pci_device_id pciidlist[] = {
 #ifdef  CONFIG_DRM_AMDGPU_SI
{0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit

2020-07-28 Thread Yin, Tianci (Rico)
[AMD Official Use Only - Internal Distribution Only]

Thanks very much Luben and Guchun!

Regards,
Rico

From: Tuikov, Luben 
Sent: Wednesday, July 29, 2020 2:44
To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org 

Cc: Deucher, Alexander ; Zhang, Hawking 
; Xu, Feifei ; Hesik, Christopher 
; Swamy, Manjunatha ; 
Quan, Evan ; Feng, Kenneth 
Subject: Re: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

On 2020-07-28 1:27 a.m., Tianci Yin wrote:
> From: "Tianci.Yin" 
>
> On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit,

" are lost "

> reconfigure the golden settings after GFXOFF exit.

" so reconfigure ..."

>
> Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9
> Signed-off-by: Tianci.Yin 
> ---
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++
>  1 file changed, 11 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index 55463e7a11e2..5da0436d41e0 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle,
>
>struct smu_context *smu = (struct smu_context*)(handle);
>struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm);
> + struct amdgpu_device *adev = smu->adev;
>
>if (!smu->is_apu && !smu_dpm_ctx->dpm_context)
>return -EINVAL;
> @@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle,
>amdgpu_device_ip_set_clockgating_state(smu->adev,
>   
> AMD_IP_BLOCK_TYPE_GFX,
>   
> AMD_CG_STATE_UNGATE);
> +
> + if (adev->asic_type >= CHIP_NAVI10 &&
> + adev->asic_type <= CHIP_NAVI12 &&
> + (adev->pm.pp_feature & PP_GFXOFF_MASK)) {
> + if (adev->gfx.funcs->init_spm_golden) {
> + dev_dbg(adev->dev,"GFXOFF exited, 
> re-init SPM golden settings\n");

Space after comma is required.

> + amdgpu_gfx_init_spm_golden(adev);
> + } else
> + dev_warn(adev->dev,"Callback 
> init_spm_golden is NULL\n");

Space after comma is required.

Please add braces to the single statement of the "else". The reason for this
is that it complements the braces of the "if ( ) {" of the multi-line statement
and closes the block. "checkpatch" calls it "unbalanced braces".

With these three fixed, this patch is
Reviewed-by: Luben Tuikov 

Regards,
Luben

> + }
>}
>} else {
>/* exit umd pstate, restore level, enable gfx cg*/
>

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 1/2] drm/amdgpu: add interface amdgpu_gfx_init_spm_golden for Navi1x

2020-07-28 Thread Yin, Tianci (Rico)
[AMD Official Use Only - Internal Distribution Only]

Thanks very much Luben!

Regards,
Rico

From: Tuikov, Luben 
Sent: Wednesday, July 29, 2020 2:29
To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org 

Cc: Deucher, Alexander ; Zhang, Hawking 
; Xu, Feifei ; Hesik, Christopher 
; Swamy, Manjunatha ; 
Quan, Evan ; Feng, Kenneth 
Subject: Re: [PATCH 1/2] drm/amdgpu: add interface amdgpu_gfx_init_spm_golden 
for Navi1x

On 2020-07-28 1:27 a.m., Tianci Yin wrote:
> From: "Tianci.Yin" 
>
> On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit,

Use present tense:... " are lost after "

> reconfiguration is needed. Make the configuration code as an interface for

Add "so a reconfiguration is needed. "

> future use.
>

If the lines of your commit message are too long, then "git push" complains
about them. Sixty char wide is perfect, since "git log" indents them when
displaying them.

With this fixed, then Reviewed-by: Luben Tuikov 

Regards,
Luben

> Change-Id: I172f3dc7f59da69b0364052dcad75a9c9aab019e
> Signed-off-by: Tianci.Yin 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 34 ++---
>  2 files changed, 27 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 1e7a2b0997c5..a611e78dd4ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -216,6 +216,7 @@ struct amdgpu_gfx_funcs {
>int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
>int (*query_ras_error_count) (struct amdgpu_device *adev, void 
> *ras_error_status);
>void (*reset_ras_error_count) (struct amdgpu_device *adev);
> + void (*init_spm_golden)(struct amdgpu_device *adev);
>  };
>
>  struct sq_work {
> @@ -324,6 +325,7 @@ struct amdgpu_gfx {
>  #define amdgpu_gfx_get_gpu_clock_counter(adev) 
> (adev)->gfx.funcs->get_gpu_clock_counter((adev))
>  #define amdgpu_gfx_select_se_sh(adev, se, sh, instance) 
> (adev)->gfx.funcs->select_se_sh((adev), (se), (sh), (instance))
>  #define amdgpu_gfx_select_me_pipe_q(adev, me, pipe, q, vmid) 
> (adev)->gfx.funcs->select_me_pipe_q((adev), (me), (pipe), (q), (vmid))
> +#define amdgpu_gfx_init_spm_golden(adev) 
> (adev)->gfx.funcs->init_spm_golden((adev))
>
>  /**
>   * amdgpu_gfx_create_bitmask - create a bitmask
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index db9f1e89a0f8..da21ad04ac0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -3307,6 +3307,29 @@ static void gfx_v10_0_set_kiq_pm4_funcs(struct 
> amdgpu_device *adev)
>adev->gfx.kiq.pmf = _v10_0_kiq_pm4_funcs;
>  }
>
> +static void gfx_v10_0_init_spm_golden_registers(struct amdgpu_device *adev)
> +{
> + switch (adev->asic_type) {
> + case CHIP_NAVI10:
> + soc15_program_register_sequence(adev,
> + 
> golden_settings_gc_rlc_spm_10_0_nv10,
> + (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10));
> + break;
> + case CHIP_NAVI14:
> + soc15_program_register_sequence(adev,
> + 
> golden_settings_gc_rlc_spm_10_1_nv14,
> + (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_nv14));
> + break;
> + case CHIP_NAVI12:
> + soc15_program_register_sequence(adev,
> + 
> golden_settings_gc_rlc_spm_10_1_2_nv12,
> + (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_2_nv12));
> + break;
> + default:
> + break;
> + }
> +}
> +
>  static void gfx_v10_0_init_golden_registers(struct amdgpu_device *adev)
>  {
>switch (adev->asic_type) {
> @@ -3317,9 +3340,6 @@ static void gfx_v10_0_init_golden_registers(struct 
> amdgpu_device *adev)
>soc15_program_register_sequence(adev,
>golden_settings_gc_10_0_nv10,
>(const 
> u32)ARRAY_SIZE(golden_settings_gc_10_0_nv10));
> - soc15_program_register_sequence(adev,
> - 
> golden_settings_gc_rlc_spm_10_0_nv10,
> - (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10));
>break;
>case CHIP_NAVI14:
>soc15_program_register_sequence(adev,
> @@ -3328,9 +3348,6 @@ static void gfx_v10_0_init_golden_registers(struct 
> amdgpu_device *adev)
>soc15_program_register_sequence(adev,
>

Re: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit(v2)

2020-07-28 Thread Yin, Tianci (Rico)
[AMD Public Use]

Hi Alex,

amdgpu_gfx_off_ctrl() is invoked by a few other functions, like 
amdgpu_info_ioctl(), so
putting the code into amdgpu_gfx_off_ctrl() would waste time on needless 
SPM golden reconfiguration, for example:
amdgpu_gfx_off_ctrl(adev, false);
amdgpu_asic_read_register(adev, se_num, sh_num, info->read_mmr_reg.dword_offset 
+ i, [i]);
amdgpu_gfx_off_ctrl(adev, true);

In most cases, we don't care about the SPM, so I think smu_enable_umd_pstate is 
a better place.

Thanks very much!
Rico

From: Deucher, Alexander 
Sent: Tuesday, July 28, 2020 22:16
To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org 

Cc: Tuikov, Luben ; Zhang, Hawking 
; Xu, Feifei ; Hesik, Christopher 
; Swamy, Manjunatha ; 
Quan, Evan ; Chen, Guchun ; Feng, 
Kenneth 
Subject: Re: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit(v2)


[AMD Public Use]

Would it be better to put this code into amdgpu_gfx_off_ctrl()?  Then we'll 
handle this in all cases where we disable gfx off.

Alex


From: Tianci Yin 
Sent: Tuesday, July 28, 2020 3:04 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Tuikov, Luben ; Deucher, Alexander 
; Zhang, Hawking ; Xu, Feifei 
; Hesik, Christopher ; Swamy, 
Manjunatha ; Quan, Evan ; Chen, 
Guchun ; Feng, Kenneth ; Yin, Tianci 
(Rico) 
Subject: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after 
GFXOFF exit(v2)

From: "Tianci.Yin" 

On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit,
reconfigure the golden settings after GFXOFF exit.

Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9
Reviewed-by: Feifei Xu 
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 55463e7a11e2..41487123c207 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle,

 struct smu_context *smu = (struct smu_context*)(handle);
 struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm);
+   struct amdgpu_device *adev = smu->adev;

 if (!smu->is_apu && !smu_dpm_ctx->dpm_context)
 return -EINVAL;
@@ -1318,12 +1319,22 @@ static int smu_enable_umd_pstate(void *handle,
 if (*level & profile_mode_mask) {
 smu_dpm_ctx->saved_dpm_level = smu_dpm_ctx->dpm_level;
 smu_dpm_ctx->enable_umd_pstate = true;
-   amdgpu_device_ip_set_powergating_state(smu->adev,
+   amdgpu_device_ip_set_powergating_state(adev,

AMD_IP_BLOCK_TYPE_GFX,

AMD_PG_STATE_UNGATE);
-   amdgpu_device_ip_set_clockgating_state(smu->adev,
+   amdgpu_device_ip_set_clockgating_state(adev,

AMD_IP_BLOCK_TYPE_GFX,

AMD_CG_STATE_UNGATE);
+
+   if (adev->asic_type >= CHIP_NAVI10 &&
+   adev->asic_type <= CHIP_NAVI12 &&
+   (adev->pm.pp_feature & PP_GFXOFF_MASK)) {
+   if (adev->gfx.funcs->init_spm_golden) {
+   dev_dbg(adev->dev,"GFXOFF exited, 
re-init SPM golden settings\n");
+   amdgpu_gfx_init_spm_golden(adev);
+   } else
+   dev_warn(adev->dev,"Callback 
init_spm_golden is NULL\n");
+   }
 }
 } else {
 /* exit umd pstate, restore level, enable gfx cg*/
@@ -1331,10 +1342,10 @@ static int smu_enable_umd_pstate(void *handle,
 if (*level == AMD_DPM_FORCED_LEVEL_PROFILE_EXIT)
 *level = smu_dpm_ctx->saved_dpm_level;
 smu_dpm_ctx->enable_umd_pstate = false;
-   amdgpu_device_ip_set_clockgating_state(smu->adev,
+   amdgpu_device_ip_set_clockgating_state(adev,

AMD_IP_BLOCK_TYPE_GFX,

AMD_CG_STATE_GATE);
-   amdgpu_device_ip_set_powergating_state(smu->adev,
+   amdgpu_device_ip_set_powergating_state(adev,

AMD_IP_BLOCK_TYPE_GFX,

AMD_PG_STATE_GATE);
 }
--
2.17.1


RE: [PATCH] drm/amd/powerplay: update driver if version for navy_flounder

2020-07-28 Thread Zhou1, Tao
[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Jiansong Chen 
> Sent: Tuesday, July 28, 2020 7:21 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao ; Feng, Kenneth
> ; Chen, Jiansong (Simon) 
> Subject: [PATCH] drm/amd/powerplay: update driver if version for
> navy_flounder
>
> It's in accordance with pmfw 65.5.0 for navy_flounder.
>
> Signed-off-by: Jiansong Chen 
> Change-Id: I984a1147030264adbc02230e2e1dd416d4ad63b0
> ---
>  drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> index 9504f9954fd3..6a42331aba8a 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> @@ -31,7 +31,7 @@
>  #define SMU11_DRIVER_IF_VERSION_NV12 0x33  #define
> SMU11_DRIVER_IF_VERSION_NV14 0x36  #define
> SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x34 -#define
> SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x2
> +#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x3
>
>  /* MP Apertures */
>  #define MP0_Public0x0380
> --
> 2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 01/14] drm/amdgpu: handle bo size 0 in amdgpu_bo_create_kernel_at

2020-07-28 Thread Felix Kuehling
Am 2020-07-28 um 6:45 p.m. schrieb Alex Deucher:
> Just return early to match other bo_create functions.
>
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 5ac7b5561475..16a37caa654a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -366,6 +366,11 @@ int amdgpu_bo_create_kernel_at(struct amdgpu_device 
> *adev,
>   unsigned int i;
>   int r;
>  
> + if (!size) {
> + amdgpu_bo_unref(bo_ptr);

I was going to say, unreffing the bo_ptr before allocating anything
seems weird. But amdgpu_bo_create_reserved, which is called just below,
does the same thing. So this doesn't really change anything. Never mind.

Regards,
  Felix


> + return 0;
> + }
> +
>   offset &= PAGE_MASK;
>   size = ALIGN(size, PAGE_SIZE);
>  
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 14/14] drm/amdgpu/gmc: disable keep_stolen_vga_memory on arcturus

2020-07-28 Thread Felix Kuehling
Am 2020-07-28 um 6:46 p.m. schrieb Alex Deucher:
> I suspect the only reason this was set was to avoid touching
> the display related registers on arcturus.  Someone should
> double check this on arcturus with S3.

Sounds reasonable, given that the other offenders here are all APUs.
AFAIK, we haven't tried S3 on Arcturus. Doesn't seem like something one
would do on a server.

See one more comment on patch 1. Other than that the series is

Reviewed-by: Felix Kuehling 

Regards,
  Felix


>
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index fc9e18aaa76e..0bd7b3797534 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -448,7 +448,6 @@ void amdgpu_gmc_get_vbios_allocations(struct 
> amdgpu_device *adev)
>   switch (adev->asic_type) {
>   case CHIP_VEGA10:
>   case CHIP_RAVEN:
> - case CHIP_ARCTURUS:
>   case CHIP_RENOIR:
>   adev->gmc.keep_stolen_vga_memory = true;
>   break;
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 01/14] drm/amdgpu: handle bo size 0 in amdgpu_bo_create_kernel_at

2020-07-28 Thread Alex Deucher
Just return early to match other bo_create functions.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 5ac7b5561475..16a37caa654a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -366,6 +366,11 @@ int amdgpu_bo_create_kernel_at(struct amdgpu_device *adev,
unsigned int i;
int r;
 
+   if (!size) {
+   amdgpu_bo_unref(bo_ptr);
+   return 0;
+   }
+
offset &= PAGE_MASK;
size = ALIGN(size, PAGE_SIZE);
 
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 13/14] drm/amdgpu: drop the CPU pointers for the stolen vga bos

2020-07-28 Thread Alex Deucher
We never use them.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 16 +---
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index ec975251b171..3df9d5a53741 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1915,7 +1915,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
uint64_t gtt_size;
int r;
u64 vis_vram_limit;
-   void *stolen_vga_buf, *stolen_extended_buf;
 
mutex_init(>mman.gtt_window_lock);
 
@@ -1982,14 +1981,14 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
r = amdgpu_bo_create_kernel_at(adev, 0, adev->gmc.stolen_vga_size,
   AMDGPU_GEM_DOMAIN_VRAM,
   >gmc.stolen_vga_memory,
-  _vga_buf);
+  NULL);
if (r)
return r;
r = amdgpu_bo_create_kernel_at(adev, adev->gmc.stolen_vga_size,
   adev->gmc.stolen_extended_size,
   AMDGPU_GEM_DOMAIN_VRAM,
   >gmc.stolen_extended_memory,
-  _extended_buf);
+  NULL);
if (r)
return r;
 
@@ -2048,13 +2047,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
  */
 void amdgpu_ttm_late_init(struct amdgpu_device *adev)
 {
-   void *stolen_vga_buf, *stolen_extended_buf;
-
/* return the VGA stolen memory (if any) back to VRAM */
if (!adev->gmc.keep_stolen_vga_memory)
-   amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, 
_vga_buf);
-   amdgpu_bo_free_kernel(>gmc.stolen_extended_memory, NULL,
- _extended_buf);
+   amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, NULL);
+   amdgpu_bo_free_kernel(>gmc.stolen_extended_memory, NULL, NULL);
 }
 
 /**
@@ -2062,15 +2058,13 @@ void amdgpu_ttm_late_init(struct amdgpu_device *adev)
  */
 void amdgpu_ttm_fini(struct amdgpu_device *adev)
 {
-   void *stolen_vga_buf;
-
if (!adev->mman.initialized)
return;
 
amdgpu_ttm_training_reserve_vram_fini(adev);
/* return the stolen vga memory back to VRAM */
if (adev->gmc.keep_stolen_vga_memory)
-   amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, 
_vga_buf);
+   amdgpu_bo_free_kernel(>gmc.stolen_vga_memory, NULL, NULL);
/* return the IP Discovery TMR memory back to VRAM */
amdgpu_bo_free_kernel(>discovery_memory, NULL, NULL);
amdgpu_ttm_fw_reserve_vram_fini(adev);
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 03/14] drm/amdgpu: use a define for the memory size of the vga emulator

2020-07-28 Thread Alex Deucher
Rather than open coding it everywhere.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h| 2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 4 ++--
 6 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 58e39429395f..2a7fbe21619d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -212,6 +212,8 @@ extern int amdgpu_cik_support;
 #define AMDGPUFB_CONN_LIMIT4
 #define AMDGPU_BIOS_NUM_SCRATCH16
 
+#define AMDGPU_VBIOS_VGA_ALLOCATION(9 * 1024 * 1024) /* reserve 
8MB for vga emulator and 1 MB for FB */
+
 /* hard reset data */
 #define AMDGPU_ASIC_RESET_DATA  0x39d5e86b
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 1a78073c2f05..040220e97cf3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -707,7 +707,7 @@ static unsigned gmc_v10_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
unsigned size;
 
if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
-   size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 
MB for FB */
+   size = AMDGPU_VBIOS_VGA_ALLOCATION;
} else {
u32 viewport;
u32 pitch;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
index 538e7ee35cdf..4de996868d32 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
@@ -805,7 +805,7 @@ static unsigned gmc_v6_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
unsigned size;
 
if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
-   size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 
MB for FB */
+   size = AMDGPU_VBIOS_VGA_ALLOCATION;
} else {
u32 viewport = RREG32(mmVIEWPORT_SIZE);
size = (REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_HEIGHT) 
*
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 0f8e8aff9114..4113f2d33b75 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -970,7 +970,7 @@ static unsigned gmc_v7_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
unsigned size;
 
if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
-   size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 
MB for FB */
+   size = AMDGPU_VBIOS_VGA_ALLOCATION;
} else {
u32 viewport = RREG32(mmVIEWPORT_SIZE);
size = (REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_HEIGHT) 
*
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index abe64010f0d5..f29ff9afcc10 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1087,7 +1087,7 @@ static unsigned gmc_v8_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
unsigned size;
 
if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
-   size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 
MB for FB */
+   size = AMDGPU_VBIOS_VGA_ALLOCATION;
} else {
u32 viewport = RREG32(mmVIEWPORT_SIZE);
size = (REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_HEIGHT) 
*
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index c5f94bab4a01..ac15d7678d24 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1077,11 +1077,11 @@ static unsigned gmc_v9_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
 * Check related code in gmc_v9_0_sw_fini
 * */
if (gmc_v9_0_keep_stolen_memory(adev))
-   return 9 * 1024 * 1024;
+   return AMDGPU_VBIOS_VGA_ALLOCATION;
 
d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL);
if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
-   size = 9 * 1024 * 1024; /* reserve 8MB for vga emulator and 1 
MB for FB */
+   size = AMDGPU_VBIOS_VGA_ALLOCATION;
} else {
u32 viewport;
 
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 05/14] drm/amdgpu: move keep stolen memory check into gmc core

2020-07-28 Thread Alex Deucher
Rather than leaving this as a gmc v9 specific hack.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  9 -
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 +++
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index ddb64be670c2..0cf18f01e67a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -215,6 +215,7 @@ struct amdgpu_gmc {
boolprt_warning;
uint64_tstolen_vga_size;
struct amdgpu_bo*stolen_vga_memory;
+   boolkeep_stolen_vga_memory;
uint32_tsdpif_register;
/* apertures */
u64 shared_aperture_start;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 39781127d059..fd61769202b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -2042,8 +2042,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 void amdgpu_ttm_late_init(struct amdgpu_device *adev)
 {
void *stolen_vga_buf;
+
/* return the VGA stolen memory (if any) back to VRAM */
-   amdgpu_bo_free_kernel(&adev->gmc.stolen_vga_memory, NULL, 
&stolen_vga_buf);
+   if (!adev->gmc.keep_stolen_vga_memory)
+   amdgpu_bo_free_kernel(&adev->gmc.stolen_vga_memory, NULL, 
&stolen_vga_buf);
 }
 
 /**
@@ -2051,10 +2053,15 @@ void amdgpu_ttm_late_init(struct amdgpu_device *adev)
  */
 void amdgpu_ttm_fini(struct amdgpu_device *adev)
 {
+   void *stolen_vga_buf;
+
if (!adev->mman.initialized)
return;
 
amdgpu_ttm_training_reserve_vram_fini(adev);
+   /* return the stolen vga memory back to VRAM */
+   if (adev->gmc.keep_stolen_vga_memory)
+   amdgpu_bo_free_kernel(&adev->gmc.stolen_vga_memory, NULL, 
&stolen_vga_buf);
/* return the IP Discovery TMR memory back to VRAM */
amdgpu_bo_free_kernel(&adev->discovery_memory, NULL, NULL);
amdgpu_ttm_fw_reserve_vram_fini(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index b66c60680dba..c5d2e4390fba 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -932,8 +932,7 @@ static int gmc_v9_0_late_init(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
int r;
 
-   if (!gmc_v9_0_keep_stolen_memory(adev))
-   amdgpu_bo_late_init(adev);
+   amdgpu_bo_late_init(adev);
 
r = amdgpu_gmc_allocate_vm_inv_eng(adev);
if (r)
@@ -1076,7 +1075,7 @@ static unsigned gmc_v9_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
 * TODO Remove once GART corruption is resolved
 * Check related code in gmc_v9_0_sw_fini
 * */
-   if (gmc_v9_0_keep_stolen_memory(adev))
+   if (adev->gmc.keep_stolen_vga_memory)
return AMDGPU_VBIOS_VGA_ALLOCATION;
 
d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL);
@@ -1243,6 +1242,7 @@ static int gmc_v9_0_sw_init(void *handle)
if (r)
return r;
 
+   adev->gmc.keep_stolen_vga_memory = gmc_v9_0_keep_stolen_memory(adev);
adev->gmc.stolen_vga_size = gmc_v9_0_get_vbios_fb_size(adev);
 
/* Memory manager */
@@ -1275,15 +1275,10 @@ static int gmc_v9_0_sw_init(void *handle)
 static int gmc_v9_0_sw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-   void *stolen_vga_buf;
 
amdgpu_gmc_ras_fini(adev);
amdgpu_gem_force_release(adev);
amdgpu_vm_manager_fini(adev);
-
-   if (gmc_v9_0_keep_stolen_memory(adev))
-   amdgpu_bo_free_kernel(&adev->gmc.stolen_vga_memory, NULL, 
&stolen_vga_buf);
-
amdgpu_gart_table_vram_free(adev);
amdgpu_bo_fini(adev);
amdgpu_gart_fini(adev);
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 00/14] rework stolen pre-OS fb allocation handling

2020-07-28 Thread Alex Deucher
Split the allocations into two so we can still support the S3
workarounds required on some platforms while also avoiding
any artifacts when transitioning from bios to driver.

In the future we could integrate handling of the ip discovery
data and other vbios allocations into this helper function
to consolidate handling of all of the vbios reservations.

Alex Deucher (14):
  drm/amdgpu: handle bo size 0 in amdgpu_bo_create_kernel_at
  drm/amdgpu: use create_at for the stolen pre-OS buffer
  drm/amdgpu: use a define for the memory size of the vga emulator
  drm/amdgpu: move stolen vga bo from amdgpu to amdgpu.gmc
  drm/amdgpu: move keep stolen memory check into gmc core
  drm/amdgpu: add support for extended stolen vga memory
  drm/amdgpu/gmc: add new helper to get the FB size used by pre-OS
console
  drm/amdgpu/gmc6: switch to using amdgpu_gmc_get_vbios_allocations
  drm/amdgpu/gmc7: switch to using amdgpu_gmc_get_vbios_allocations
  drm/amdgpu/gmc8: switch to using amdgpu_gmc_get_vbios_allocations
  drm/amdgpu/gmc9: switch to using amdgpu_gmc_get_vbios_allocations
  drm/amdgpu/gmc10: switch to using amdgpu_gmc_get_vbios_allocations
  drm/amdgpu: drop the CPU pointers for the stolen vga bos
  drm/amdgpu/gmc: disable keep_stolen_vga_memory on arcturus

 drivers/gpu/drm/amd/amdgpu/amdgpu.h|   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c|  42 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|  11 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c|  24 +++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  57 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c  |   8 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  |  11 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  |  11 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 119 +++--
 10 files changed, 153 insertions(+), 138 deletions(-)

-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 12/14] drm/amdgpu/gmc10: switch to using amdgpu_gmc_get_vbios_allocations

2020-07-28 Thread Alex Deucher
The new helper centralizes the logic in one place.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 57 +++---
 1 file changed, 25 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 337d70503970..fcde302d3eb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -553,6 +553,28 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device 
*adev,
}
 }
 
+static unsigned gmc_v10_0_get_vbios_fb_size(struct amdgpu_device *adev)
+{
+   u32 d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL);
+   unsigned size;
+
+   if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
+   size = AMDGPU_VBIOS_VGA_ALLOCATION;
+   } else {
+   u32 viewport;
+   u32 pitch;
+
+   viewport = RREG32_SOC15(DCE, 0, 
mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION);
+   pitch = RREG32_SOC15(DCE, 0, mmHUBPREQ0_DCSURF_SURFACE_PITCH);
+   size = (REG_GET_FIELD(viewport,
+   HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, 
PRI_VIEWPORT_HEIGHT) *
+   REG_GET_FIELD(pitch, 
HUBPREQ0_DCSURF_SURFACE_PITCH, PITCH) *
+   4);
+   }
+
+   return size;
+}
+
 static const struct amdgpu_gmc_funcs gmc_v10_0_gmc_funcs = {
.flush_gpu_tlb = gmc_v10_0_flush_gpu_tlb,
.flush_gpu_tlb_pasid = gmc_v10_0_flush_gpu_tlb_pasid,
@@ -560,7 +582,8 @@ static const struct amdgpu_gmc_funcs gmc_v10_0_gmc_funcs = {
.emit_pasid_mapping = gmc_v10_0_emit_pasid_mapping,
.map_mtype = gmc_v10_0_map_mtype,
.get_vm_pde = gmc_v10_0_get_vm_pde,
-   .get_vm_pte = gmc_v10_0_get_vm_pte
+   .get_vm_pte = gmc_v10_0_get_vm_pte,
+   .get_vbios_fb_size = gmc_v10_0_get_vbios_fb_size,
 };
 
 static void gmc_v10_0_set_gmc_funcs(struct amdgpu_device *adev)
@@ -701,36 +724,6 @@ static int gmc_v10_0_gart_init(struct amdgpu_device *adev)
return amdgpu_gart_table_vram_alloc(adev);
 }
 
-static unsigned gmc_v10_0_get_vbios_fb_size(struct amdgpu_device *adev)
-{
-   u32 d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL);
-   unsigned size;
-
-   if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
-   size = AMDGPU_VBIOS_VGA_ALLOCATION;
-   } else {
-   u32 viewport;
-   u32 pitch;
-
-   viewport = RREG32_SOC15(DCE, 0, 
mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION);
-   pitch = RREG32_SOC15(DCE, 0, mmHUBPREQ0_DCSURF_SURFACE_PITCH);
-   size = (REG_GET_FIELD(viewport,
-   HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, 
PRI_VIEWPORT_HEIGHT) *
-   REG_GET_FIELD(pitch, 
HUBPREQ0_DCSURF_SURFACE_PITCH, PITCH) *
-   4);
-   }
-   /* return 0 if the pre-OS buffer uses up most of vram */
-   if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024)) {
-   DRM_ERROR("Warning: pre-OS buffer uses most of vram, \
-   be aware of gart table overwrite\n");
-   return 0;
-   }
-
-   return size;
-}
-
-
-
 static int gmc_v10_0_sw_init(void *handle)
 {
int r, vram_width = 0, vram_type = 0, vram_vendor = 0;
@@ -812,7 +805,7 @@ static int gmc_v10_0_sw_init(void *handle)
if (r)
return r;
 
-   adev->gmc.stolen_vga_size = gmc_v10_0_get_vbios_fb_size(adev);
+   amdgpu_gmc_get_vbios_allocations(adev);
 
/* Memory manager */
r = amdgpu_bo_init(adev);
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 11/14] drm/amdgpu/gmc9: switch to using amdgpu_gmc_get_vbios_allocations

2020-07-28 Thread Alex Deucher
The new helper centralizes the logic in one place.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 112 +-
 1 file changed, 38 insertions(+), 74 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index c5d2e4390fba..65488ddc34c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -827,6 +827,41 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev,
*flags |= AMDGPU_PTE_SNOOPED;
 }
 
+static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev)
+{
+   u32 d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL);
+   unsigned size;
+
+   if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
+   size = AMDGPU_VBIOS_VGA_ALLOCATION;
+   } else {
+   u32 viewport;
+
+   switch (adev->asic_type) {
+   case CHIP_RAVEN:
+   case CHIP_RENOIR:
+   viewport = RREG32_SOC15(DCE, 0, 
mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION);
+   size = (REG_GET_FIELD(viewport,
+ 
HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_HEIGHT) *
+   REG_GET_FIELD(viewport,
+ 
HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_WIDTH) *
+   4);
+   break;
+   case CHIP_VEGA10:
+   case CHIP_VEGA12:
+   case CHIP_VEGA20:
+   default:
+   viewport = RREG32_SOC15(DCE, 0, mmSCL0_VIEWPORT_SIZE);
+   size = (REG_GET_FIELD(viewport, SCL0_VIEWPORT_SIZE, 
VIEWPORT_HEIGHT) *
+   REG_GET_FIELD(viewport, SCL0_VIEWPORT_SIZE, 
VIEWPORT_WIDTH) *
+   4);
+   break;
+   }
+   }
+
+   return size;
+}
+
 static const struct amdgpu_gmc_funcs gmc_v9_0_gmc_funcs = {
.flush_gpu_tlb = gmc_v9_0_flush_gpu_tlb,
.flush_gpu_tlb_pasid = gmc_v9_0_flush_gpu_tlb_pasid,
@@ -834,7 +869,8 @@ static const struct amdgpu_gmc_funcs gmc_v9_0_gmc_funcs = {
.emit_pasid_mapping = gmc_v9_0_emit_pasid_mapping,
.map_mtype = gmc_v9_0_map_mtype,
.get_vm_pde = gmc_v9_0_get_vm_pde,
-   .get_vm_pte = gmc_v9_0_get_vm_pte
+   .get_vm_pte = gmc_v9_0_get_vm_pte,
+   .get_vbios_fb_size = gmc_v9_0_get_vbios_fb_size,
 };
 
 static void gmc_v9_0_set_gmc_funcs(struct amdgpu_device *adev)
@@ -902,31 +938,6 @@ static int gmc_v9_0_early_init(void *handle)
return 0;
 }
 
-static bool gmc_v9_0_keep_stolen_memory(struct amdgpu_device *adev)
-{
-
-   /*
-* TODO:
-* Currently there is a bug where some memory client outside
-* of the driver writes to first 8M of VRAM on S3 resume,
-* this overrides GART which by default gets placed in first 8M and
-* causes VM_FAULTS once GTT is accessed.
-* Keep the stolen memory reservation until the while this is not 
solved.
-* Also check code in gmc_v9_0_get_vbios_fb_size and gmc_v9_0_late_init
-*/
-   switch (adev->asic_type) {
-   case CHIP_VEGA10:
-   case CHIP_RAVEN:
-   case CHIP_ARCTURUS:
-   case CHIP_RENOIR:
-   return true;
-   case CHIP_VEGA12:
-   case CHIP_VEGA20:
-   default:
-   return false;
-   }
-}
-
 static int gmc_v9_0_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1066,52 +1077,6 @@ static int gmc_v9_0_gart_init(struct amdgpu_device *adev)
return amdgpu_gart_table_vram_alloc(adev);
 }
 
-static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev)
-{
-   u32 d1vga_control;
-   unsigned size;
-
-   /*
-* TODO Remove once GART corruption is resolved
-* Check related code in gmc_v9_0_sw_fini
-* */
-   if (adev->gmc.keep_stolen_vga_memory)
-   return AMDGPU_VBIOS_VGA_ALLOCATION;
-
-   d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL);
-   if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
-   size = AMDGPU_VBIOS_VGA_ALLOCATION;
-   } else {
-   u32 viewport;
-
-   switch (adev->asic_type) {
-   case CHIP_RAVEN:
-   case CHIP_RENOIR:
-   viewport = RREG32_SOC15(DCE, 0, 
mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION);
-   size = (REG_GET_FIELD(viewport,
- 
HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_HEIGHT) *
-   REG_GET_FIELD(viewport,
- 
HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_WIDTH) *
-   4);
-   break;
-   

[PATCH 04/14] drm/amdgpu: move stolen vga bo from amdgpu to amdgpu.gmc

2020-07-28 Thread Alex Deucher
Since that is where we store the other data related to
the stolen vga memory.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 4 ++--
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a7fbe21619d..899664357015 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -767,7 +767,6 @@ struct amdgpu_device {
boolis_atom_fw;
uint8_t *bios;
uint32_tbios_size;
-   struct amdgpu_bo*stolen_vga_memory;
uint32_tbios_scratch_reg_offset;
uint32_tbios_scratch[AMDGPU_BIOS_NUM_SCRATCH];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 9d58c56f6cfc..ddb64be670c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -213,7 +213,8 @@ struct amdgpu_gmc {
uint8_t vram_vendor;
uint32_tsrbm_soft_reset;
boolprt_warning;
-   uint64_tstolen_size;
+   uint64_tstolen_vga_size;
+   struct amdgpu_bo*stolen_vga_memory;
uint32_tsdpif_register;
/* apertures */
u64 shared_aperture_start;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index a188216bccc2..39781127d059 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1979,9 +1979,9 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 * This is used for VGA emulation and pre-OS scanout buffers to
 * avoid display artifacts while transitioning between pre-OS
 * and driver.  */
-   r = amdgpu_bo_create_kernel_at(adev, 0, adev->gmc.stolen_size,
+   r = amdgpu_bo_create_kernel_at(adev, 0, adev->gmc.stolen_vga_size,
   AMDGPU_GEM_DOMAIN_VRAM,
-  &adev->stolen_vga_memory,
+  &adev->gmc.stolen_vga_memory,
   &stolen_vga_buf);
if (r)
return r;
@@ -2043,7 +2043,7 @@ void amdgpu_ttm_late_init(struct amdgpu_device *adev)
 {
void *stolen_vga_buf;
/* return the VGA stolen memory (if any) back to VRAM */
-   amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
+   amdgpu_bo_free_kernel(&adev->gmc.stolen_vga_memory, NULL, 
&stolen_vga_buf);
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 040220e97cf3..337d70503970 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -812,7 +812,7 @@ static int gmc_v10_0_sw_init(void *handle)
if (r)
return r;
 
-   adev->gmc.stolen_size = gmc_v10_0_get_vbios_fb_size(adev);
+   adev->gmc.stolen_vga_size = gmc_v10_0_get_vbios_fb_size(adev);
 
/* Memory manager */
r = amdgpu_bo_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
index 4de996868d32..28ddb41a78c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
@@ -862,7 +862,7 @@ static int gmc_v6_0_sw_init(void *handle)
if (r)
return r;
 
-   adev->gmc.stolen_size = gmc_v6_0_get_vbios_fb_size(adev);
+   adev->gmc.stolen_vga_size = gmc_v6_0_get_vbios_fb_size(adev);
 
r = amdgpu_bo_init(adev);
if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 4113f2d33b75..8b8ecbb99d84 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -1035,7 +1035,7 @@ static int gmc_v7_0_sw_init(void *handle)
if (r)
return r;
 
-   adev->gmc.stolen_size = gmc_v7_0_get_vbios_fb_size(adev);
+   adev->gmc.stolen_vga_size = gmc_v7_0_get_vbios_fb_size(adev);
 
/* Memory manager */
r = amdgpu_bo_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index f29ff9afcc10..8e3763ec268f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1160,7 +1160,7 @@ static int gmc_v8_0_sw_init(void *handle)
if (r)
return r;
 
-   adev->gmc.stolen_size = 

[PATCH 09/14] drm/amdgpu/gmc7: switch to using amdgpu_gmc_get_vbios_allocations

2020-07-28 Thread Alex Deucher
The new helper centralizes the logic in one place.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 8b8ecbb99d84..80c146df338a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -977,9 +977,7 @@ static unsigned gmc_v7_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_WIDTH) *
4);
}
-   /* return 0 if the pre-OS buffer uses up most of vram */
-   if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024))
-   return 0;
+
return size;
 }
 
@@ -1035,7 +1033,7 @@ static int gmc_v7_0_sw_init(void *handle)
if (r)
return r;
 
-   adev->gmc.stolen_vga_size = gmc_v7_0_get_vbios_fb_size(adev);
+   amdgpu_gmc_get_vbios_allocations(adev);
 
/* Memory manager */
r = amdgpu_bo_init(adev);
@@ -1372,7 +1370,8 @@ static const struct amdgpu_gmc_funcs gmc_v7_0_gmc_funcs = 
{
.emit_pasid_mapping = gmc_v7_0_emit_pasid_mapping,
.set_prt = gmc_v7_0_set_prt,
.get_vm_pde = gmc_v7_0_get_vm_pde,
-   .get_vm_pte = gmc_v7_0_get_vm_pte
+   .get_vm_pte = gmc_v7_0_get_vm_pte,
+   .get_vbios_fb_size = gmc_v7_0_get_vbios_fb_size,
 };
 
 static const struct amdgpu_irq_src_funcs gmc_v7_0_irq_funcs = {
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 06/14] drm/amdgpu: add support for extended stolen vga memory

2020-07-28 Thread Alex Deucher
This will allow us to split the allocation for systems
where we have to keep the stolen memory around to avoid
S3 issues.  This way we don't waste as much memory and
still avoid any screen artifacts during the bios to
driver transition.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 13 +++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 0cf18f01e67a..8f4af955d72c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -215,6 +215,8 @@ struct amdgpu_gmc {
boolprt_warning;
uint64_tstolen_vga_size;
struct amdgpu_bo*stolen_vga_memory;
+   uint64_tstolen_extended_size;
+   struct amdgpu_bo*stolen_extended_memory;
boolkeep_stolen_vga_memory;
uint32_tsdpif_register;
/* apertures */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index fd61769202b3..ec975251b171 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1915,7 +1915,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
uint64_t gtt_size;
int r;
u64 vis_vram_limit;
-   void *stolen_vga_buf;
+   void *stolen_vga_buf, *stolen_extended_buf;
 
mutex_init(&adev->mman.gtt_window_lock);
 
@@ -1985,6 +1985,13 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
   &stolen_vga_buf);
if (r)
return r;
+   r = amdgpu_bo_create_kernel_at(adev, adev->gmc.stolen_vga_size,
+  adev->gmc.stolen_extended_size,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  &adev->gmc.stolen_extended_memory,
+  &stolen_extended_buf);
+   if (r)
+   return r;
 
DRM_INFO("amdgpu: %uM of VRAM memory ready\n",
 (unsigned) (adev->gmc.real_vram_size / (1024 * 1024)));
@@ -2041,11 +2048,13 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
  */
 void amdgpu_ttm_late_init(struct amdgpu_device *adev)
 {
-   void *stolen_vga_buf;
+   void *stolen_vga_buf, *stolen_extended_buf;
 
/* return the VGA stolen memory (if any) back to VRAM */
if (!adev->gmc.keep_stolen_vga_memory)
amdgpu_bo_free_kernel(&adev->gmc.stolen_vga_memory, NULL, 
&stolen_vga_buf);
+   amdgpu_bo_free_kernel(&adev->gmc.stolen_extended_memory, NULL,
+ &stolen_extended_buf);
 }
 
 /**
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 02/14] drm/amdgpu: use create_at for the stolen pre-OS buffer

2020-07-28 Thread Alex Deucher
Should be functionally the same since nothing else is
allocated at that point, but let's be exact.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 605d266754f6..a188216bccc2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1979,10 +1979,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 * This is used for VGA emulation and pre-OS scanout buffers to
 * avoid display artifacts while transitioning between pre-OS
 * and driver.  */
-   r = amdgpu_bo_create_kernel(adev, adev->gmc.stolen_size, PAGE_SIZE,
-   AMDGPU_GEM_DOMAIN_VRAM,
-   &adev->stolen_vga_memory,
-   NULL, &stolen_vga_buf);
+   r = amdgpu_bo_create_kernel_at(adev, 0, adev->gmc.stolen_size,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  &adev->stolen_vga_memory,
+  &stolen_vga_buf);
if (r)
return r;
 
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 14/14] drm/amdgpu/gmc: disable keep_stolen_vga_memory on arcturus

2020-07-28 Thread Alex Deucher
I suspect the only reason this was set was to avoid touching
the display related registers on arcturus.  Someone should
double check this on arcturus with S3.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index fc9e18aaa76e..0bd7b3797534 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -448,7 +448,6 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device 
*adev)
switch (adev->asic_type) {
case CHIP_VEGA10:
case CHIP_RAVEN:
-   case CHIP_ARCTURUS:
case CHIP_RENOIR:
adev->gmc.keep_stolen_vga_memory = true;
break;
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 08/14] drm/amdgpu/gmc6: switch to using amdgpu_gmc_get_vbios_allocations

2020-07-28 Thread Alex Deucher
The new helper centralizes the logic in one place.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
index 28ddb41a78c8..95a9117e9564 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
@@ -812,9 +812,6 @@ static unsigned gmc_v6_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_WIDTH) *
4);
}
-   /* return 0 if the pre-OS buffer uses up most of vram */
-   if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024))
-   return 0;
return size;
 }
 
@@ -862,7 +859,7 @@ static int gmc_v6_0_sw_init(void *handle)
if (r)
return r;
 
-   adev->gmc.stolen_vga_size = gmc_v6_0_get_vbios_fb_size(adev);
+   amdgpu_gmc_get_vbios_allocations(adev);
 
r = amdgpu_bo_init(adev);
if (r)
@@ -1136,6 +1133,7 @@ static const struct amdgpu_gmc_funcs gmc_v6_0_gmc_funcs = 
{
.set_prt = gmc_v6_0_set_prt,
.get_vm_pde = gmc_v6_0_get_vm_pde,
.get_vm_pte = gmc_v6_0_get_vm_pte,
+   .get_vbios_fb_size = gmc_v6_0_get_vbios_fb_size,
 };
 
 static const struct amdgpu_irq_src_funcs gmc_v6_0_irq_funcs = {
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 07/14] drm/amdgpu/gmc: add new helper to get the FB size used by pre-OS console

2020-07-28 Thread Alex Deucher
This adds a new gmc callback to get the size reserved by the pre-OS
console and provides a helper function for use by gmc IP drivers.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 43 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  5 +++
 2 files changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 21d2c8543f85..fc9e18aaa76e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -27,6 +27,7 @@
 #include 
 
 #include "amdgpu.h"
+#include "amdgpu_gmc.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_xgmi.h"
 
@@ -431,3 +432,45 @@ void amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device 
*adev, int hub_type,
WREG32(reg, tmp);
}
 }
+
+void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev)
+{
+   unsigned size;
+
+   /*
+* TODO:
+* Currently there is a bug where some memory client outside
+* of the driver writes to first 8M of VRAM on S3 resume,
+* this overrides GART which by default gets placed in first 8M and
+* causes VM_FAULTS once GTT is accessed.
+* Keep the stolen memory reservation until this is solved.
+*/
+   switch (adev->asic_type) {
+   case CHIP_VEGA10:
+   case CHIP_RAVEN:
+   case CHIP_ARCTURUS:
+   case CHIP_RENOIR:
+   adev->gmc.keep_stolen_vga_memory = true;
+   break;
+   default:
+   adev->gmc.keep_stolen_vga_memory = false;
+   break;
+   }
+
+   if (!amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_DCE))
+   size = 0;
+   else
+   size = amdgpu_gmc_get_vbios_fb_size(adev);
+
+   /* set to 0 if the pre-OS buffer uses up most of vram */
+   if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024))
+   size = 0;
+
+   if (size > AMDGPU_VBIOS_VGA_ALLOCATION) {
+   adev->gmc.stolen_vga_size = AMDGPU_VBIOS_VGA_ALLOCATION;
+   adev->gmc.stolen_extended_size = size - 
adev->gmc.stolen_vga_size;
+   } else {
+   adev->gmc.stolen_vga_size = size;
+   adev->gmc.stolen_extended_size = 0;
+   }
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 8f4af955d72c..c2a85d0b1546 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -131,6 +131,8 @@ struct amdgpu_gmc_funcs {
void (*get_vm_pte)(struct amdgpu_device *adev,
   struct amdgpu_bo_va_mapping *mapping,
   uint64_t *flags);
+   /* get the amount of memory used by the vbios for pre-OS console */
+   unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_xgmi {
@@ -253,6 +255,7 @@ struct amdgpu_gmc {
 #define amdgpu_gmc_map_mtype(adev, flags) 
(adev)->gmc.gmc_funcs->map_mtype((adev),(flags))
 #define amdgpu_gmc_get_vm_pde(adev, level, dst, flags) 
(adev)->gmc.gmc_funcs->get_vm_pde((adev), (level), (dst), (flags))
 #define amdgpu_gmc_get_vm_pte(adev, mapping, flags) 
(adev)->gmc.gmc_funcs->get_vm_pte((adev), (mapping), (flags))
+#define amdgpu_gmc_get_vbios_fb_size(adev) 
(adev)->gmc.gmc_funcs->get_vbios_fb_size((adev))
 
 /**
  * amdgpu_gmc_vram_full_visible - Check if full VRAM is visible through the BAR
@@ -307,4 +310,6 @@ extern void
 amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device *adev, int hub_type,
  bool enable);
 
+void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev);
+
 #endif
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 10/14] drm/amdgpu/gmc8: switch to using amdgpu_gmc_get_vbios_allocations

2020-07-28 Thread Alex Deucher
The new helper centralizes the logic in one place.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 8e3763ec268f..9ab65ca7df77 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1094,9 +1094,7 @@ static unsigned gmc_v8_0_get_vbios_fb_size(struct 
amdgpu_device *adev)
REG_GET_FIELD(viewport, VIEWPORT_SIZE, VIEWPORT_WIDTH) *
4);
}
-   /* return 0 if the pre-OS buffer uses up most of vram */
-   if ((adev->gmc.real_vram_size - size) < (8 * 1024 * 1024))
-   return 0;
+
return size;
 }
 
@@ -1160,7 +1158,7 @@ static int gmc_v8_0_sw_init(void *handle)
if (r)
return r;
 
-   adev->gmc.stolen_vga_size = gmc_v8_0_get_vbios_fb_size(adev);
+   amdgpu_gmc_get_vbios_allocations(adev);
 
/* Memory manager */
r = amdgpu_bo_init(adev);
@@ -1739,7 +1737,8 @@ static const struct amdgpu_gmc_funcs gmc_v8_0_gmc_funcs = 
{
.emit_pasid_mapping = gmc_v8_0_emit_pasid_mapping,
.set_prt = gmc_v8_0_set_prt,
.get_vm_pde = gmc_v8_0_get_vm_pde,
-   .get_vm_pte = gmc_v8_0_get_vm_pte
+   .get_vm_pte = gmc_v8_0_get_vm_pte,
+   .get_vbios_fb_size = gmc_v8_0_get_vbios_fb_size,
 };
 
 static const struct amdgpu_irq_src_funcs gmc_v8_0_irq_funcs = {
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] amdgpu_dm: fix nonblocking atomic commit use-after-free

2020-07-28 Thread daniel
On Tue, Jul 28, 2020 at 01:07:13PM -0400, Kazlauskas, Nicholas wrote:
> On 2020-07-28 5:22 a.m., Paul Menzel wrote:
> > Dear Linux folks,
> > 
> > 
> > Am 25.07.20 um 07:20 schrieb Mazin Rezk:
> > > On Saturday, July 25, 2020 12:59 AM, Duncan wrote:
> > > 
> > > > On Sat, 25 Jul 2020 03:03:52 +0000 Mazin Rezk wrote:
> > > > 
> > > > > > Am 24.07.20 um 19:33 schrieb Kees Cook:
> > > > > > 
> > > > > > > There was a fix to disable the async path for this driver that
> > > > > > > worked around the bug too, yes? That seems like a safer and more
> > > > > > > focused change that doesn't revert the SLUB defense for all
> > > > > > > users, and would actually provide a complete, I think, workaround
> > > > > 
> > > > > That said, I haven't seen the async disabling patch. If you could
> > > > > link to it, I'd be glad to test it out and perhaps we can use that
> > > > > instead.
> > > > 
> > > > I'm confused. Not to put words in Kees' mouth; /I/ am confused (which
> > > > admittedly could well be just because I make no claims to be a
> > > > coder and am simply reading the bug and thread, but I'd appreciate some
> > > > "unconfusing" anyway).
> > > > 
> > > > My interpretation of the "async disabling" reference was that it was to
> > > > comment #30 on the bug:
> > > > 
> > > > https://bugzilla.kernel.org/show_bug.cgi?id=207383#c30
> > > > 
> > > > 
> > > > ... which (if I'm not confused on this point too) appears to be yours.
> > > > There it was stated...
> > > > 
> > > > I've also found that this bug exclusively occurs when commit_work is on
> > > > the workqueue. After forcing drm_atomic_helper_commit to run all of the
> > > > commits without adding to the workqueue and running the OS, the issue
> > > > seems to have disappeared.
> > > > 
> > > > 
> > > > Would not forcing all commits to run directly, without placing them on
> > > > the workqueue, be "async disabling"? That's what I /thought/ he was
> > > > referencing.
> > > 
> > > Oh, I thought he was referring to a different patch. Kees, could I get
> > > your confirmation on this?
> > > 
> > > The change I made actually affected all of the DRM code, although
> > > this could
> > > easily be changed to be specific to amdgpu. (By forcing blocking on
> > > amdgpu_dm's non-blocking commit code)
> > > 
> > > That said, I'd still need to test further because I only did test it
> > > for a
> > > couple of hours then. Although it should work in theory.
> > > 
> > > > OTOH your base/context swap idea sounds like a possibly "less
> > > > disturbance" workaround, if it works, and given the point in the
> > > > commit cycle... (But if it's out Sunday it's likely too late to test
> > > > and get it in now anyway; if it's another week, tho...)
> > > 
> > > The base/context swap idea should make the use-after-free behave how it
> > > did in 5.6. Since the bug doesn't cause an issue in 5.6, it's less of a
> > > "less disturbance" workaround and more of a "no disturbance" workaround.
> > 
> > Sorry for bothering, but is there now a solution, besides reverting the
> > commits, to avoid freezes/crashes *without* performance regressions?
> > 
> > 
> > Kind regards,
> > 
> > Paul
> 
> Mazin's "drm/amd/display: Clear dm_state for fast updates" change
> accomplishes this, at least as a temporary hack.

Yeah I guess it's horrible, but better than nothing. Reverting the old
amdgpu change to a private state object is probably a lot more invasive.

> I've started work on a more large scale fix that we could get in in after.

Does that include a fix for the "stuff needed by irq handler"? Either way
pls cc dri-devel, I think this is something worthy of a bit wider
discussion. Feels like unsolved homework from the entire "make DC
integrate into linux" saga ...
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 4/4] radeon: fall back to ACPI EDID retrieval

2020-07-28 Thread Daniel Dadap

On 7/28/20 1:50 AM, Christian König wrote:


Am 27.07.20 um 22:53 schrieb Daniel Dadap:

Fall back to retrieving the EDID via the ACPI _DDC method, when present
for notebook internal panels, when retrieving BIOS-embedded EDIDs.

Signed-off-by: Daniel Dadap 
---
  drivers/gpu/drm/radeon/radeon_combios.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_combios.c 
b/drivers/gpu/drm/radeon/radeon_combios.c

index c3e49c973812..de801d9fca54 100644
--- a/drivers/gpu/drm/radeon/radeon_combios.c
+++ b/drivers/gpu/drm/radeon/radeon_combios.c
@@ -401,9 +401,8 @@ bool radeon_combios_check_hardcoded_edid(struct 
radeon_device *rdev)

  struct edid *
  radeon_bios_get_hardcoded_edid(struct radeon_device *rdev)
  {
- struct edid *edid;
-
  if (rdev->mode_info.bios_hardcoded_edid) {
+ struct edid *edid;


That's an unrelated and incorrect style change. You need a blank line
after declaration.



Ah, yes, that doesn't really need to be changed. I'll remove it from 
this patch. Would a separate patch to change the scope of that 
declaration (with a blank line after) be welcome, or should I just leave 
it alone?





  edid = 
kmalloc(rdev->mode_info.bios_hardcoded_edid_size, GFP_KERNEL);

  if (edid) {
  memcpy((unsigned char *)edid,
@@ -412,7 +411,8 @@ radeon_bios_get_hardcoded_edid(struct 
radeon_device *rdev)

  return edid;
  }
  }
- return NULL;
+
+ return drm_get_edid_acpi();


In general a good idea, but I'm wondering if we should really do this so
unconditionally here.



I'm not personally aware of any AMD notebook designs that require the 
ACPI _DDC EDID retrieval. I've only seen it on NVIDIA+Intel hybrid 
systems and on a small number of NVIDIA discrete-only systems. I just 
figured I'd update the radeon DRM-KMS driver while updating i915 and 
Nouveau, for completeness, as it could be helpful should such a design 
exist. As for whether there should be some condition around this, I 
suppose that's reasonable, but I'm not really sure what would make sense 
as a condition. As it stands, drm_get_edid_acpi() only returns a value if at 
least one of the VGA or 3D controllers on the system provides an ACPI 
_DDC method, and if that ACPI method successfully returns an EDID.


On the caller's end, it's currently part of the path where the radeon 
driver is already trying to fall back to a hardcoded EDID provided by 
the system. Perhaps we could call it within the LVDS || eDP 
condition here, instead?



    if (rdev->is_atom_bios) {
    /* some laptops provide a hardcoded edid in rom for LCDs */
    if (((connector->connector_type == DRM_MODE_CONNECTOR_LVDS) ||
 (connector->connector_type == DRM_MODE_CONNECTOR_eDP)))
    radeon_connector->edid = 
radeon_bios_get_hardcoded_edid(rdev);

    } else {
    /* some servers provide a hardcoded edid in rom for KVMs */
    radeon_connector->edid = radeon_bios_get_hardcoded_edid(rdev);
}

That would be more in line with the changes in this patchset for i915 
and nouveau.




Regards,
Christian.


  }

  static struct radeon_i2c_bus_rec combios_setup_i2c_bus(struct 
radeon_device *rdev,



___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[Linux-kernel-mentees] [PATCH] drm/amdgpu: Prevent kernel-infoleak in amdgpu_info_ioctl()

2020-07-28 Thread Peilin Ye
Compiler leaves a 4-byte hole near the end of `dev_info`, causing
amdgpu_info_ioctl() to copy uninitialized kernel stack memory to userspace
when `size` is greater than 356.

In 2015 we tried to fix this issue by doing `= {};` on `dev_info`, which
unfortunately does not initialize that 4-byte hole. Fix it by using
memset() instead.

Cc: sta...@vger.kernel.org
Fixes: c193fa91b918 ("drm/amdgpu: information leak in amdgpu_info_ioctl()")
Fixes: d38ceaf99ed0 ("drm/amdgpu: add core driver (v4)")
Suggested-by: Dan Carpenter 
Signed-off-by: Peilin Ye 
---
$ pahole -C "drm_amdgpu_info_device" drivers/gpu/drm/amd/amdgpu/amdgpu_kms.o
struct drm_amdgpu_info_device {
__u32  device_id;/* 0 4 */
__u32  chip_rev; /* 4 4 */
__u32  external_rev; /* 8 4 */
__u32  pci_rev;  /*12 4 */
__u32  family;   /*16 4 */
__u32  num_shader_engines;   /*20 4 */
__u32  num_shader_arrays_per_engine; /*24 4 
*/
__u32  gpu_counter_freq; /*28 4 */
__u64  max_engine_clock; /*32 8 */
__u64  max_memory_clock; /*40 8 */
__u32  cu_active_number; /*48 4 */
__u32  cu_ao_mask;   /*52 4 */
__u32  cu_bitmap[4][4];  /*5664 */
/* --- cacheline 1 boundary (64 bytes) was 56 bytes ago --- */
__u32  enabled_rb_pipes_mask; /*   120 4 */
__u32  num_rb_pipes; /*   124 4 */
/* --- cacheline 2 boundary (128 bytes) --- */
__u32  num_hw_gfx_contexts;  /*   128 4 */
__u32  _pad; /*   132 4 */
__u64  ids_flags;/*   136 8 */
__u64  virtual_address_offset; /*   144 8 */
__u64  virtual_address_max;  /*   152 8 */
__u32  virtual_address_alignment; /*   160 4 */
__u32  pte_fragment_size;/*   164 4 */
__u32  gart_page_size;   /*   168 4 */
__u32  ce_ram_size;  /*   172 4 */
__u32  vram_type;/*   176 4 */
__u32  vram_bit_width;   /*   180 4 */
__u32  vce_harvest_config;   /*   184 4 */
__u32  gc_double_offchip_lds_buf; /*   188 4 */
/* --- cacheline 3 boundary (192 bytes) --- */
__u64  prim_buf_gpu_addr;/*   192 8 */
__u64  pos_buf_gpu_addr; /*   200 8 */
__u64  cntl_sb_buf_gpu_addr; /*   208 8 */
__u64  param_buf_gpu_addr;   /*   216 8 */
__u32  prim_buf_size;/*   224 4 */
__u32  pos_buf_size; /*   228 4 */
__u32  cntl_sb_buf_size; /*   232 4 */
__u32  param_buf_size;   /*   236 4 */
__u32  wave_front_size;  /*   240 4 */
__u32  num_shader_visible_vgprs; /*   244 4 */
__u32  num_cu_per_sh;/*   248 4 */
__u32  num_tcc_blocks;   /*   252 4 */
/* --- cacheline 4 boundary (256 bytes) --- */
__u32  gs_vgt_table_depth;   /*   256 4 */
__u32  gs_prim_buffer_depth; /*   260 4 */
__u32  max_gs_waves_per_vgt; /*   264 4 */
__u32  _pad1;/*   268 4 */
__u32  cu_ao_bitmap[4][4];   /*   27264 */
/* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */
__u64  high_va_offset;   /*   336 8 */
__u64  high_va_max;  /*   344 8 */
__u32  pa_sc_tile_steering_override; /*   352 4 
*/

/* XXX 4 bytes hole, try to pack */

__u64  tcc_disabled_mask;/*   360 8 */

/* size: 368, cachelines: 6, members: 49 */
/* sum members: 364, holes: 1, sum holes: 4 */
/* last cacheline: 48 bytes */
};

 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git 

Re: [PATCH] drm/amdgpu: fix PSP autoload twice in FLR

2020-07-28 Thread Luben Tuikov
On 2020-07-28 2:04 p.m., Luben Tuikov wrote:
> Thanks for removing the braces.
> 
> On 2020-07-27 10:29 p.m., Liu ChengZhe wrote:
>> the block->status.hw = false assignment will overwrite PSP's previous
> ^^
> You want to start a sentence here. Capitalize "The".
> Also don't use future tense in commit descriptions (and commit titles).
> Simply use present tense. Using future tense makes it confusing if
> this is what the code used to do before this change or if the code
> is doing this right now as someone is reading the commit in the future with 
> "git log".
> 
>> hw status, which will cause PSP execute resume operation after hw init.
> 
> I've found it best to describe what's being done as if telling a story.
> Break it down into "tell what's happening" and "tell what's fixed and
> how it affects the rest of the system". Something like this:
> 
>   Assigning false to block->status.hw overwrites PSP's previous
>   hardware status, which causes the PSP to resume operation after
>   hardware init.
> 
>   Remove this assignment and let the PSP start when it is told to.
> 
> Check if the above rendition of your change is correct, and use it if so.

Double checking now, since "resume" is an op, you should capitalize it.

"... which causes the PSP to execute Resume operation right
 after hardware init.

 Remove this assignment and let the PSP execute Resume operation when it
 is told to do so."

Or something to that effect.

Regards,
Luben

> 
> Regards,
> Luben
> 
>>
>> v2: (R)remove the braces(.)
> 
> 
> 
>>
>> Signed-off-by: Liu ChengZhe 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++-
>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 62ecac97fbd2..5d9affa1d35a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2574,6 +2574,9 @@ static int amdgpu_device_ip_reinit_early_sriov(struct 
>> amdgpu_device *adev)
>>  AMD_IP_BLOCK_TYPE_IH,
>>  };
>>  
>> +for (i = 0; i < adev->num_ip_blocks; i++)
>> +adev->ip_blocks[i].status.hw = false;
>> +
>>  for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
>>  int j;
>>  struct amdgpu_ip_block *block;
>> @@ -2581,7 +2584,6 @@ static int amdgpu_device_ip_reinit_early_sriov(struct 
>> amdgpu_device *adev)
>>  for (j = 0; j < adev->num_ip_blocks; j++) {
>>  block = &adev->ip_blocks[j];
>>  
>> -block->status.hw = false;
>>  if (block->version->type != ip_order[i] ||
>>  !block->status.valid)
>>  continue;
>>
> 

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit

2020-07-28 Thread Luben Tuikov
On 2020-07-28 1:27 a.m., Tianci Yin wrote:
> From: "Tianci.Yin" 
> 
> On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit,

" are lost "

> reconfigure the golden settings after GFXOFF exit.

" so reconfigure ..."

> 
> Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9
> Signed-off-by: Tianci.Yin 
> ---
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index 55463e7a11e2..5da0436d41e0 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle,
>  
>   struct smu_context *smu = (struct smu_context*)(handle);
>   struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm);
> + struct amdgpu_device *adev = smu->adev;
>  
>   if (!smu->is_apu && !smu_dpm_ctx->dpm_context)
>   return -EINVAL;
> @@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle,
>   amdgpu_device_ip_set_clockgating_state(smu->adev,
>  
> AMD_IP_BLOCK_TYPE_GFX,
>  
> AMD_CG_STATE_UNGATE);
> +
> + if (adev->asic_type >= CHIP_NAVI10 &&
> + adev->asic_type <= CHIP_NAVI12 &&
> + (adev->pm.pp_feature & PP_GFXOFF_MASK)) {
> + if (adev->gfx.funcs->init_spm_golden) {
> + dev_dbg(adev->dev,"GFXOFF exited, 
> re-init SPM golden settings\n");

Space after comma is required.

> + amdgpu_gfx_init_spm_golden(adev);
> + } else
> + dev_warn(adev->dev,"Callback 
> init_spm_golden is NULL\n");

Space after comma is required.

Please add braces to the single statement of the "else". The reason for this
is that it complements the braces of the "if ( ) {" of the multi-line statement
and closes the block. "checkpatch" calls it "unbalanced braces".

With these three fixed, this patch is
Reviewed-by: Luben Tuikov 

Regards,
Luben

> + }
>   }
>   } else {
>   /* exit umd pstate, restore level, enable gfx cg*/
> 

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 1/2] drm/amdgpu: add interface amdgpu_gfx_init_spm_golden for Navi1x

2020-07-28 Thread Luben Tuikov
On 2020-07-28 1:27 a.m., Tianci Yin wrote:
> From: "Tianci.Yin" 
> 
> On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit,

Use present tense:... " are lost after "

> reconfiguration is needed. Make the configuration code as an interface for

Add "so a reconfiguration is needed. "

> future use.
> 

If the lines of your commit message are too long, then "git push" complains
about them. Sixty char wide is perfect, since "git log" indents them when
displaying them.

With this fixed, then Reviewed-by: Luben Tuikov 

Regards,
Luben

> Change-Id: I172f3dc7f59da69b0364052dcad75a9c9aab019e
> Signed-off-by: Tianci.Yin 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 34 ++---
>  2 files changed, 27 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 1e7a2b0997c5..a611e78dd4ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -216,6 +216,7 @@ struct amdgpu_gfx_funcs {
>   int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
>   int (*query_ras_error_count) (struct amdgpu_device *adev, void 
> *ras_error_status);
>   void (*reset_ras_error_count) (struct amdgpu_device *adev);
> + void (*init_spm_golden)(struct amdgpu_device *adev);
>  };
>  
>  struct sq_work {
> @@ -324,6 +325,7 @@ struct amdgpu_gfx {
>  #define amdgpu_gfx_get_gpu_clock_counter(adev) 
> (adev)->gfx.funcs->get_gpu_clock_counter((adev))
>  #define amdgpu_gfx_select_se_sh(adev, se, sh, instance) 
> (adev)->gfx.funcs->select_se_sh((adev), (se), (sh), (instance))
>  #define amdgpu_gfx_select_me_pipe_q(adev, me, pipe, q, vmid) 
> (adev)->gfx.funcs->select_me_pipe_q((adev), (me), (pipe), (q), (vmid))
> +#define amdgpu_gfx_init_spm_golden(adev) 
> (adev)->gfx.funcs->init_spm_golden((adev))
>  
>  /**
>   * amdgpu_gfx_create_bitmask - create a bitmask
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index db9f1e89a0f8..da21ad04ac0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -3307,6 +3307,29 @@ static void gfx_v10_0_set_kiq_pm4_funcs(struct 
> amdgpu_device *adev)
>   adev->gfx.kiq.pmf = &gfx_v10_0_kiq_pm4_funcs;
>  }
>  
> +static void gfx_v10_0_init_spm_golden_registers(struct amdgpu_device *adev)
> +{
> + switch (adev->asic_type) {
> + case CHIP_NAVI10:
> + soc15_program_register_sequence(adev,
> + 
> golden_settings_gc_rlc_spm_10_0_nv10,
> + (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10));
> + break;
> + case CHIP_NAVI14:
> + soc15_program_register_sequence(adev,
> + 
> golden_settings_gc_rlc_spm_10_1_nv14,
> + (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_nv14));
> + break;
> + case CHIP_NAVI12:
> + soc15_program_register_sequence(adev,
> + 
> golden_settings_gc_rlc_spm_10_1_2_nv12,
> + (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_2_nv12));
> + break;
> + default:
> + break;
> + }
> +}
> +
>  static void gfx_v10_0_init_golden_registers(struct amdgpu_device *adev)
>  {
>   switch (adev->asic_type) {
> @@ -3317,9 +3340,6 @@ static void gfx_v10_0_init_golden_registers(struct 
> amdgpu_device *adev)
>   soc15_program_register_sequence(adev,
>   golden_settings_gc_10_0_nv10,
>   (const 
> u32)ARRAY_SIZE(golden_settings_gc_10_0_nv10));
> - soc15_program_register_sequence(adev,
> - 
> golden_settings_gc_rlc_spm_10_0_nv10,
> - (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10));
>   break;
>   case CHIP_NAVI14:
>   soc15_program_register_sequence(adev,
> @@ -3328,9 +3348,6 @@ static void gfx_v10_0_init_golden_registers(struct 
> amdgpu_device *adev)
>   soc15_program_register_sequence(adev,
>   golden_settings_gc_10_1_nv14,
>   (const 
> u32)ARRAY_SIZE(golden_settings_gc_10_1_nv14));
> - soc15_program_register_sequence(adev,
> - 
> golden_settings_gc_rlc_spm_10_1_nv14,
> - (const 
> u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_1_nv14));
>   break;
>   case CHIP_NAVI12:
>   

[PATCH 1/1] drm/ttm: fix offset in VMAs with a pg_offs in ttm_bo_vm_access

2020-07-28 Thread Felix Kuehling
VMAs with a pg_offs that's offset from the start of the vma_node need
to adjust the offset within the BO accordingly. This matches the
offset calculation in ttm_bo_vm_fault_reserved.

Signed-off-by: Felix Kuehling 
Tested-by: Laurent Morichetti 
---
 drivers/gpu/drm/ttm/ttm_bo_vm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 389128b8c4dd..60b41447bec8 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -405,8 +405,10 @@ static int ttm_bo_vm_access_kmap(struct ttm_buffer_object 
*bo,
 int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
 void *buf, int len, int write)
 {
-   unsigned long offset = (addr) - vma->vm_start;
struct ttm_buffer_object *bo = vma->vm_private_data;
+   unsigned long offset = (addr) - vma->vm_start +
+   ((vma->vm_pgoff - drm_vma_node_start(&bo->base.vma_node))
+<< PAGE_SHIFT);
int ret;
 
if (len < 1 || (offset + len) >> PAGE_SHIFT > bo->num_pages)
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 1/2] drm amdgpu: Skip tmr load for SRIOV

2020-07-28 Thread Luben Tuikov
On 2020-07-28 1:36 a.m., Liu ChengZhe wrote:
> 1. For Navi12, CHIP_SIENNA_CICHLID, skip tmr load operation;
> 2. Check pointer before release firmware.
> 
> v2: use CHIP_SIENNA_CICHLID instead
> v3: remove local "bool ret"; fix grammar issue
> v4: use my name instead of "root"
> 

Don't indent any lines.

> Signed-off-by: Liu ChengZhe 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 35 -
>  1 file changed, 29 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index a053b7af0680..7f18286a0cc2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -193,12 +193,18 @@ static int psp_sw_fini(void *handle)
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  
>   psp_memory_training_fini(&adev->psp);
> - release_firmware(adev->psp.sos_fw);
> - adev->psp.sos_fw = NULL;
> - release_firmware(adev->psp.asd_fw);
> - adev->psp.asd_fw = NULL;
> - release_firmware(adev->psp.ta_fw);
> - adev->psp.ta_fw = NULL;
> + if (adev->psp.sos_fw) {
> + release_firmware(adev->psp.sos_fw);
> + adev->psp.sos_fw = NULL;
> + }
> + if (adev->psp.asd_fw) {
> + release_firmware(adev->psp.asd_fw);
> + adev->psp.asd_fw = NULL;
> + }
> + if (adev->psp.ta_fw) {
> + release_firmware(adev->psp.ta_fw);
> + adev->psp.ta_fw = NULL;
> + }
>  
>   if (adev->asic_type == CHIP_NAVI10)
>   psp_sysfs_fini(adev);
> @@ -409,11 +415,28 @@ static int psp_clear_vf_fw(struct psp_context *psp)
>   return ret;
>  }
>  
> +static bool psp_skip_tmr(struct psp_context *psp)
> +{
> + switch (psp->adev->asic_type) {
> + case CHIP_NAVI12:
> + case CHIP_SIENNA_CICHLID:
> + return true;
> + default:
> + return false;
> + }
> +}

Yeah, that's very nice now.

> +
>  static int psp_tmr_load(struct psp_context *psp)
>  {
>   int ret;
>   struct psp_gfx_cmd_resp *cmd;
>  

Fix this:

> + /* (F)for Navi12 and CHIP_SIENNA_CICHLID SRIOV, do not set up TMR(.)
> +  * (A)already set( )up by host driver(.)

Thanks,
Luben


> +  */
> + if (amdgpu_sriov_vf(psp->adev) && psp_skip_tmr(psp))
> + return 0;
> +
>   cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
>   if (!cmd)
>   return -ENOMEM;
> 

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 1/2] drm amdgpu: Skip tmr load for SRIOV

2020-07-28 Thread Luben Tuikov
Thanks for this patch.

On 2020-07-28 1:12 a.m., Liu ChengZhe wrote:
> From: root 

You should fix your Git setup to show proper user name,
not "root". I've prepared a Confluence page which shows
a way to do it, and a few other things along the way:

http://confluence.amd.com/display/~ltuikov/Git+Setup

> 
> 1. For Navi12, CHIP_SIENNA_CICHLID, skip tmr load operation;
> 2. Check pointer before release firmware.
> 
> v2: use CHIP_SIENNA_CICHLID instead
> v3: remove local "bool ret"; fix grammar issue
> Signed-off-by: root 

You're missing an empty line between your commit message
and the Signed-off-by: line.

Also please do not indent your commit message. "git log" already
indents it and it would look too indented to the right.

Below for more:

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 35 -
>  1 file changed, 29 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index a053b7af0680..7f18286a0cc2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -193,12 +193,18 @@ static int psp_sw_fini(void *handle)
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  
>   psp_memory_training_fini(&adev->psp);
> - release_firmware(adev->psp.sos_fw);
> - adev->psp.sos_fw = NULL;
> - release_firmware(adev->psp.asd_fw);
> - adev->psp.asd_fw = NULL;
> - release_firmware(adev->psp.ta_fw);
> - adev->psp.ta_fw = NULL;
> + if (adev->psp.sos_fw) {
> + release_firmware(adev->psp.sos_fw);
> + adev->psp.sos_fw = NULL;
> + }
> + if (adev->psp.asd_fw) {
> + release_firmware(adev->psp.asd_fw);
> + adev->psp.asd_fw = NULL;
> + }
> + if (adev->psp.ta_fw) {
> + release_firmware(adev->psp.ta_fw);
> + adev->psp.ta_fw = NULL;
> + }
>  
>   if (adev->asic_type == CHIP_NAVI10)
>   psp_sysfs_fini(adev);
> @@ -409,11 +415,28 @@ static int psp_clear_vf_fw(struct psp_context *psp)
>   return ret;
>  }
>  
> +static bool psp_skip_tmr(struct psp_context *psp)
> +{
> + switch (psp->adev->asic_type) {
> + case CHIP_NAVI12:
> + case CHIP_SIENNA_CICHLID:
> + return true;
> + default:
> + return false;
> + }
> +}
> +
>  static int psp_tmr_load(struct psp_context *psp)
>  {
>   int ret;
>   struct psp_gfx_cmd_resp *cmd;
>  
> + /* for Navi12 and CHIP_SIENNA_CICHLID SRIOV, do not set up TMR
> +  * (already setup by host driver)

Thanks for fixing noun "setup" to verb "set up". But there
is another one: "already setup by" should be "already set up by the host driver".

Thanks and regards,
Luben


> +  */
> + if (amdgpu_sriov_vf(psp->adev) && psp_skip_tmr(psp))
> + return 0;
> +
>   cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
>   if (!cmd)
>   return -ENOMEM;
> 

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: fix PSP autoload twice in FLR

2020-07-28 Thread Luben Tuikov
Thanks for removing the braces.

On 2020-07-27 10:29 p.m., Liu ChengZhe wrote:
> the block->status.hw = false assignment will overwrite PSP's previous
^^
You want to start a sentence here. Capitalize "The".
Also don't use future tense in commit descriptions (and commit titles).
Simply use present tense. Using future tense makes it confusing if
this is what the code used to do before this change or if the code
is doing this right now as someone is reading the commit in the future with 
"git log".

> hw status, which will cause PSP execute resume operation after hw init.

I've found it best to describe what's being done as if telling a story.
Break it down into "tell what's happening" and "tell what's fixed and
how it affects the rest of the system". Something like this:

Assigning false to block->status.hw overwrites PSP's previous
hardware status, which causes the PSP to resume operation after
hardware init.

Remove this assignment and let the PSP start when it is told to.

Check if the above rendition of your change is correct, and use it if so.

Regards,
Luben

> 
> v2: (R)remove the braces(.)



> 
> Signed-off-by: Liu ChengZhe 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 62ecac97fbd2..5d9affa1d35a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2574,6 +2574,9 @@ static int amdgpu_device_ip_reinit_early_sriov(struct 
> amdgpu_device *adev)
>   AMD_IP_BLOCK_TYPE_IH,
>   };
>  
> + for (i = 0; i < adev->num_ip_blocks; i++)
> + adev->ip_blocks[i].status.hw = false;
> +
>   for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
>   int j;
>   struct amdgpu_ip_block *block;
> @@ -2581,7 +2584,6 @@ static int amdgpu_device_ip_reinit_early_sriov(struct 
> amdgpu_device *adev)
>   for (j = 0; j < adev->num_ip_blocks; j++) {
>   block = &adev->ip_blocks[j];
>  
> - block->status.hw = false;
>   if (block->version->type != ip_order[i] ||
>   !block->status.valid)
>   continue;
> 

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] amdgpu_dm: fix nonblocking atomic commit use-after-free

2020-07-28 Thread Kazlauskas, Nicholas

On 2020-07-28 5:22 a.m., Paul Menzel wrote:

Dear Linux folks,


Am 25.07.20 um 07:20 schrieb Mazin Rezk:

On Saturday, July 25, 2020 12:59 AM, Duncan wrote:


On Sat, 25 Jul 2020 03:03:52 +0000 Mazin Rezk wrote:


Am 24.07.20 um 19:33 schrieb Kees Cook:


There was a fix to disable the async path for this driver that
worked around the bug too, yes? That seems like a safer and more
focused change that doesn't revert the SLUB defense for all
users, and would actually provide a complete, I think, workaround


That said, I haven't seen the async disabling patch. If you could
link to it, I'd be glad to test it out and perhaps we can use that
instead.


I'm confused. Not to put words in Kees' mouth; /I/ am confused (which
admittedly could well be just because I make no claims to be a
coder and am simply reading the bug and thread, but I'd appreciate some
"unconfusing" anyway).

My interpretation of the "async disabling" reference was that it was to
comment #30 on the bug:

https://bugzilla.kernel.org/show_bug.cgi?id=207383#c30 



... which (if I'm not confused on this point too) appears to be yours.
There it was stated...

I've also found that this bug exclusively occurs when commit_work is on
the workqueue. After forcing drm_atomic_helper_commit to run all of the
commits without adding to the workqueue and running the OS, the issue
seems to have disappeared.


Would not forcing all commits to run directly, without placing them on
the workqueue, be "async disabling"? That's what I /thought/ he was
referencing.


Oh, I thought he was referring to a different patch. Kees, could I get
your confirmation on this?

The change I made actually affected all of the DRM code, although this 
could

easily be changed to be specific to amdgpu. (By forcing blocking on
amdgpu_dm's non-blocking commit code)

That said, I'd still need to test further because I only did test it 
for a

couple of hours then. Although it should work in theory.


OTOH your base/context swap idea sounds like a possibly "less
disturbance" workaround, if it works, and given the point in the
commit cycle... (But if it's out Sunday it's likely too late to test
and get it in now anyway; if it's another week, tho...)


The base/context swap idea should make the use-after-free behave how it
did in 5.6. Since the bug doesn't cause an issue in 5.6, it's less of a
"less disturbance" workaround and more of a "no disturbance" workaround.


Sorry for bothering, but is there now a solution, besides reverting the 
commits, to avoid freezes/crashes *without* performance regressions?



Kind regards,

Paul


Mazin's "drm/amd/display: Clear dm_state for fast updates" change 
accomplishes this, at least as a temporary hack.


I've started work on a more large scale fix that we could get in in after.

Regards,
Nicholas Kazlauskas
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amd/display: parse ta firmware for navy_flounder

2020-07-28 Thread Alex Deucher
On Tue, Jul 28, 2020 at 11:43 AM Bhawanpreet Lakha
 wrote:
>
> Use the same case as sienna_cichlid
>
> Signed-off-by: Bhawanpreet Lakha 

Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
> index d488d250805d..e16874f30d5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
> @@ -179,12 +179,11 @@ static int psp_v11_0_init_microcode(struct psp_context 
> *psp)
> }
> break;
> case CHIP_SIENNA_CICHLID:
> +   case CHIP_NAVY_FLOUNDER:
> err = psp_init_ta_microcode(&adev->psp, chip_name);
> if (err)
> return err;
> break;
> -   case CHIP_NAVY_FLOUNDER:
> -   break;
> default:
> BUG();
> }
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amd/display: parse ta firmware for navy_flounder

2020-07-28 Thread Bhawanpreet Lakha
Use the same case as sienna_cichlid

Signed-off-by: Bhawanpreet Lakha 
---
 drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
index d488d250805d..e16874f30d5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
@@ -179,12 +179,11 @@ static int psp_v11_0_init_microcode(struct psp_context 
*psp)
}
break;
case CHIP_SIENNA_CICHLID:
+   case CHIP_NAVY_FLOUNDER:
err = psp_init_ta_microcode(&adev->psp, chip_name);
if (err)
return err;
break;
-   case CHIP_NAVY_FLOUNDER:
-   break;
default:
BUG();
}
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] amdgpu_dm: fix nonblocking atomic commit use-after-free

2020-07-28 Thread Paul Menzel

Dear Linux folks,


Am 25.07.20 um 07:20 schrieb Mazin Rezk:

On Saturday, July 25, 2020 12:59 AM, Duncan wrote:


On Sat, 25 Jul 2020 03:03:52 + Mazin Rezk wrote:


Am 24.07.20 um 19:33 schrieb Kees Cook:


There was a fix to disable the async path for this driver that
worked around the bug too, yes? That seems like a safer and more
focused change that doesn't revert the SLUB defense for all
users, and would actually provide a complete, I think, workaround


That said, I haven't seen the async disabling patch. If you could
link to it, I'd be glad to test it out and perhaps we can use that
instead.


I'm confused. Not to put words in Kees' mouth; /I/ am confused (which
admittedly could well be just because I make no claims to be a
coder and am simply reading the bug and thread, but I'd appreciate some
"unconfusing" anyway).

My interpretation of the "async disabling" reference was that it was to
comment #30 on the bug:

https://bugzilla.kernel.org/show_bug.cgi?id=207383#c30

... which (if I'm not confused on this point too) appears to be yours.
There it was stated...

I've also found that this bug exclusively occurs when commit_work is on
the workqueue. After forcing drm_atomic_helper_commit to run all of the
commits without adding to the workqueue and running the OS, the issue
seems to have disappeared.


Would not forcing all commits to run directly, without placing them on
the workqueue, be "async disabling"? That's what I /thought/ he was
referencing.


Oh, I thought he was referring to a different patch. Kees, could I get
your confirmation on this?

The change I made actually affected all of the DRM code, although this could
easily be changed to be specific to amdgpu. (By forcing blocking on
amdgpu_dm's non-blocking commit code)

That said, I'd still need to test further because I only did test it for a
couple of hours then. Although it should work in theory.


OTOH your base/context swap idea sounds like a possibly "less
disturbance" workaround, if it works, and given the point in the
commit cycle... (But if it's out Sunday it's likely too late to test
and get it in now anyway; if it's another week, tho...)


The base/context swap idea should make the use-after-free behave how it
did in 5.6. Since the bug doesn't cause an issue in 5.6, it's less of a
"less disturbance" workaround and more of a "no disturbance" workaround.


Sorry for bothering, but is there now a solution, besides reverting the 
commits, to avoid freezes/crashes *without* performance regressions?



Kind regards,

Paul
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)

2020-07-28 Thread Felix Kuehling
Am 2020-07-28 um 5:00 a.m. schrieb Monk Liu:
> what:
> the MQD's save and restore of KCQ (kernel compute queue)
> cost lots of clocks during world switch which impacts a lot
> to multi-VF performance
>
> how:
> introduce a parameter to control the number of KCQ to avoid
> performance drop if there is no kernel compute queue needed
>
> notes:
> this parameter only affects gfx 8/9/10
>
> v2:
> refine namings
>
> v3:
> choose queues for each ring so that they are spread across pipes as evenly as possible.

Thanks. Some more suggestions for simplifications inline.


>
> TODO:
> in the future we will let the hypervisor driver set this parameter
> automatically thus no need for user to configure it through
> modprobe in virtual machine
>
> Signed-off-by: Monk Liu 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  4 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 58 
> +++---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 29 +++
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 31 
>  7 files changed, 87 insertions(+), 71 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e97c088..de11136 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -201,6 +201,7 @@ extern int amdgpu_si_support;
>  #ifdef CONFIG_DRM_AMDGPU_CIK
>  extern int amdgpu_cik_support;
>  #endif
> +extern int amdgpu_num_kcq;
>  
>  #define AMDGPU_VM_MAX_NUM_CTX4096
>  #define AMDGPU_SG_THRESHOLD  (256*1024*1024)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 62ecac9..cf445bab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct 
> amdgpu_device *adev)
>  
>   amdgpu_gmc_tmz_set(adev);
>  
> + if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
> + amdgpu_num_kcq = 8;
> + dev_warn(adev->dev, "set kernel compute queue number to 8 due 
> to invalid paramter provided by user\n");
> + }
> +
>   return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 6291f5f..b545c40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -150,6 +150,7 @@ int amdgpu_noretry;
>  int amdgpu_force_asic_type = -1;
>  int amdgpu_tmz = 0;
>  int amdgpu_reset_method = -1; /* auto */
> +int amdgpu_num_kcq = -1;
>  
>  struct amdgpu_mgpu_info mgpu_info = {
>   .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
> @@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);
>  MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = 
> legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");
>  module_param_named(reset_method, amdgpu_reset_method, int, 0444);
>  
> +MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup 
> (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
> +module_param_named(num_kcq, amdgpu_num_kcq, int, 0444);
> +
>  static const struct pci_device_id pciidlist[] = {
>  #ifdef  CONFIG_DRM_AMDGPU_SI
>   {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 8eff017..f83a9a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -202,40 +202,42 @@ bool amdgpu_gfx_is_high_priority_compute_queue(struct 
> amdgpu_device *adev,
>  
>  void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)
>  {
> - int i, queue, pipe, mec;
> + int i, queue, pipe;
>   bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev);
> + int max_queues_per_mec = min(adev->gfx.mec.num_pipe_per_mec *
> +  
> adev->gfx.mec.num_queue_per_pipe,
> +  
> adev->gfx.num_compute_rings);

Indentation looks wrong. Did you use the wrong TAB size?


> +
> + if (multipipe_policy) {
> + /* policy: make queues evenly cross all pipes on MEC1 only */
> + for (i = 0; i < max_queues_per_mec; i++) {
> + pipe = i % adev->gfx.mec.num_pipe_per_mec;
> + queue = (i / adev->gfx.mec.num_pipe_per_mec) %
> + adev->gfx.mec.num_queue_per_pipe;
> +
> + set_bit(pipe * adev->gfx.mec.num_queue_per_pipe + queue,
> + adev->gfx.mec.queue_bitmap);
> + }
> + } else {
> + int 

Re: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit(v2)

2020-07-28 Thread Deucher, Alexander
[AMD Public Use]

Would it be better to put this code into amdgpu_gfx_off_ctrl()?  Then we'll 
handle this in all cases where we disable gfx off.

Alex


From: Tianci Yin 
Sent: Tuesday, July 28, 2020 3:04 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Tuikov, Luben ; Deucher, Alexander 
; Zhang, Hawking ; Xu, Feifei 
; Hesik, Christopher ; Swamy, 
Manjunatha ; Quan, Evan ; Chen, 
Guchun ; Feng, Kenneth ; Yin, Tianci 
(Rico) 
Subject: [PATCH] drm/amdgpu: reconfigure spm golden settings on Navi1x after 
GFXOFF exit(v2)

From: "Tianci.Yin" 

On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit,
reconfigure the golden settings after GFXOFF exit.

Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9
Reviewed-by: Feifei Xu 
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 55463e7a11e2..41487123c207 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle,

 struct smu_context *smu = (struct smu_context*)(handle);
 struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm);
+   struct amdgpu_device *adev = smu->adev;

 if (!smu->is_apu && !smu_dpm_ctx->dpm_context)
 return -EINVAL;
@@ -1318,12 +1319,22 @@ static int smu_enable_umd_pstate(void *handle,
 if (*level & profile_mode_mask) {
 smu_dpm_ctx->saved_dpm_level = smu_dpm_ctx->dpm_level;
 smu_dpm_ctx->enable_umd_pstate = true;
-   amdgpu_device_ip_set_powergating_state(smu->adev,
+   amdgpu_device_ip_set_powergating_state(adev,

AMD_IP_BLOCK_TYPE_GFX,

AMD_PG_STATE_UNGATE);
-   amdgpu_device_ip_set_clockgating_state(smu->adev,
+   amdgpu_device_ip_set_clockgating_state(adev,

AMD_IP_BLOCK_TYPE_GFX,

AMD_CG_STATE_UNGATE);
+
+   if (adev->asic_type >= CHIP_NAVI10 &&
+   adev->asic_type <= CHIP_NAVI12 &&
+   (adev->pm.pp_feature & PP_GFXOFF_MASK)) {
+   if (adev->gfx.funcs->init_spm_golden) {
+   dev_dbg(adev->dev,"GFXOFF exited, 
re-init SPM golden settings\n");
+   amdgpu_gfx_init_spm_golden(adev);
+   } else
+   dev_warn(adev->dev,"Callback 
init_spm_golden is NULL\n");
+   }
 }
 } else {
 /* exit umd pstate, restore level, enable gfx cg*/
@@ -1331,10 +1342,10 @@ static int smu_enable_umd_pstate(void *handle,
 if (*level == AMD_DPM_FORCED_LEVEL_PROFILE_EXIT)
 *level = smu_dpm_ctx->saved_dpm_level;
 smu_dpm_ctx->enable_umd_pstate = false;
-   amdgpu_device_ip_set_clockgating_state(smu->adev,
+   amdgpu_device_ip_set_clockgating_state(adev,

AMD_IP_BLOCK_TYPE_GFX,

AMD_CG_STATE_GATE);
-   amdgpu_device_ip_set_powergating_state(smu->adev,
+   amdgpu_device_ip_set_powergating_state(adev,

AMD_IP_BLOCK_TYPE_GFX,

AMD_PG_STATE_GATE);
 }
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU

2020-07-28 Thread Chen, Guchun
[AMD Public Use]

Hi Dennis,

Please check my response after yours.

Regards,
Guchun

-Original Message-
From: Li, Dennis  
Sent: Tuesday, July 28, 2020 5:43 PM
To: Chen, Guchun ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhang, Hawking ; 
Grodzovsky, Andrey ; Zhou1, Tao ; 
Clements, John ; Lazar, Lijo ; 
Koenig, Christian ; Yang, Stanley 

Subject: RE: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad 
GPU

[AMD Official Use Only - Internal Distribution Only]

Hi, Guchun,
  Please see my below comments.

Best Regards
Dennis Li
-Original Message-
From: Chen, Guchun  
Sent: Tuesday, July 28, 2020 3:49 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Zhang, Hawking ; Li, Dennis 
; Grodzovsky, Andrey ; Zhou1, Tao 
; Clements, John ; Lazar, Lijo 
; Koenig, Christian ; Yang, 
Stanley 
Cc: Chen, Guchun 
Subject: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU

When retrieving bad gpu tag from eeprom, GPU init should fail as the GPU needs 
to be retired for further check.

v2: Fix spelling typo, correct the condition to detect
bad gpu tag and refine error message.

v3: Refine function argument name.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 ++-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2662cd7c8685..30af0dfee1a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
 * it should be called after amdgpu_device_ip_hw_init_phase2  since
 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 * for I2C communication which only true at this point.
-* recovery_init may fail, but it can free all resources allocated by
-* itself and its failure should not stop amdgpu init process.
+*
+* amdgpu_ras_recovery_init may fail, but the upper only cares the
+* failure from bad gpu situation and stop amdgpu init process
+* accordingly. For other failed cases, it will still release all
+* the resource and print error message, rather than returning one
+* negative value to upper level.
 *
 * Note: theoretically, this should be called before all vram 
allocations
 * to protect retired page from abusing
 */
-   amdgpu_ras_recovery_init(adev);
+   r = amdgpu_ras_recovery_init(adev);
+   if (r)
+   goto init_failed;
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3c4c142e9d8a..56e1aeba2d64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0;
+   bool exc_err_limit = false;
int ret;
 
if (con)
@@ -1843,9 +1844,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
-   ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-   if (ret)
+   ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+   /*
+* We only fail this calling and halt booting up
+* when exc_err_limit is true.
+*/
+   if (exc_err_limit) {
+   ret = -EINVAL;
goto free;
+   }

[Dennis Li] Compared with the old code, the new change misses checking ret.
[Guchun] Yeah, this reminds me that ret should also be checked when exc_err_limit is false,
since a non-zero ret in that case means there is some problem with the eeprom i2c functionality.
It will be addressed in the next patch set.
 
if (con->eeprom_control.num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
@@ -1868,6 +1875,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 out:
dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
 
+   /*
+* Except error threshold exceeding case, other failure cases in this
+* function would not fail amdgpu driver init.
+*/
+   if (!exc_err_limit)
+   ret = 0;
+
return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 

Re: [PATCH] drm/amd/powerplay: update driver if version for navy_flounder

2020-07-28 Thread Feng, Kenneth
[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Kenneth Feng 


在 2020/7/28 下午7:21,“Jiansong Chen” 写入:

It's in accordance with pmfw 65.5.0 for navy_flounder.

Signed-off-by: Jiansong Chen 
Change-Id: I984a1147030264adbc02230e2e1dd416d4ad63b0
---
 drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h 
b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
index 9504f9954fd3..6a42331aba8a 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
@@ -31,7 +31,7 @@
 #define SMU11_DRIVER_IF_VERSION_NV12 0x33
 #define SMU11_DRIVER_IF_VERSION_NV14 0x36
 #define SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x34
-#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x2
+#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x3

 /* MP Apertures */
 #define MP0_Public0x0380
--
2.17.1


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amd/display: Clear dm_state for fast updates

2020-07-28 Thread Mazin Rezk
On Monday, July 27, 2020 7:42 PM, Mazin Rezk  wrote:

> On Monday, July 27, 2020 5:32 PM, Daniel Vetter  wrote:
>
> > On Mon, Jul 27, 2020 at 11:11 PM Mazin Rezk  wrote:
> > >
> > > On Monday, July 27, 2020 4:29 PM, Daniel Vetter  wrote:
> > >
> > > > On Mon, Jul 27, 2020 at 9:28 PM Christian König
> > > >  wrote:
> > > > >
> > > > > Am 27.07.20 um 16:05 schrieb Kazlauskas, Nicholas:
> > > > > > On 2020-07-27 9:39 a.m., Christian König wrote:
> > > > > >> Am 27.07.20 um 07:40 schrieb Mazin Rezk:
> > > > > >>> This patch fixes a race condition that causes a use-after-free 
> > > > > >>> during
> > > > > >>> amdgpu_dm_atomic_commit_tail. This can occur when 2 non-blocking
> > > > > >>> commits
> > > > > >>> are requested and the second one finishes before the first.
> > > > > >>> Essentially,
> > > > > >>> this bug occurs when the following sequence of events happens:
> > > > > >>>
> > > > > >>> 1. Non-blocking commit #1 is requested w/ a new dm_state #1 and is
> > > > > >>> deferred to the workqueue.
> > > > > >>>
> > > > > >>> 2. Non-blocking commit #2 is requested w/ a new dm_state #2 and is
> > > > > >>> deferred to the workqueue.
> > > > > >>>
> > > > > >>> 3. Commit #2 starts before commit #1, dm_state #1 is used in the
> > > > > >>> commit_tail and commit #2 completes, freeing dm_state #1.
> > > > > >>>
> > > > > >>> 4. Commit #1 starts after commit #2 completes, uses the freed 
> > > > > >>> dm_state
> > > > > >>> 1 and dereferences a freelist pointer while setting the context.
> > > > > >>
> > > > > >> Well I only have a one mile high view on this, but why don't you 
> > > > > >> let
> > > > > >> the work items execute in order?
> > > > > >>
> > > > > >> That would be better anyway cause this way we don't trigger a cache
> > > > > >> line ping pong between CPUs.
> > > > > >>
> > > > > >> Christian.
> > > > > >
> > > > > > We use the DRM helpers for managing drm_atomic_commit_state and 
> > > > > > those
> > > > > > helpers internally push non-blocking commit work into the system
> > > > > > unbound work queue.
> > > > >
> > > > > Mhm, well if you send those helper atomic commits in the order A,B and
> > > > > they execute it in the order B,A I would call that a bug :)
> > > >
> > > > The way it works is it pushes all commits into unbound work queue, but
> > > > then forces serialization as needed. We do _not_ want e.g. updates on
> > > > different CRTC to be serialized, that would result in lots of judder.
> > > > And hw is funny enough that there's all kinds of dependencies.
> > > >
> > > > The way you force synchronization is by adding other CRTC state
> > > > objects. So if DC is busted and can only handle a single update per
> > > > work item, then I guess you always need all CRTC states and everything
> > > > will be run in order. But that also totally kills modern multi-screen
> > > > compositors. Xorg isn't modern, just in case that's not clear :-)
> > > >
> > > > Looking at the code it seems like you indeed have only a single dm
> > > > state, so yeah global sync is what you'll need as immediate fix, and
> > > > then maybe fix up DM to not be quite so silly ... or at least only do
> > > > the dm state stuff when really needed.
> > > >
> > > > We could also sprinkle the drm_crtc_commit structure around a bit
> > > > (it's the glue that provides the synchronization across commits), but
> > > > since your dm state is global just grabbing all crtc states
> > > > unconditionally as part of that is probably best.
> > > >
> > > > > > While we could duplicate a copy of that code with nothing but the
> > > > > > workqueue changed that isn't something I'd really like to maintain
> > > > > > going forward.
> > > > >
> > > > > I'm not talking about duplicating the code, I'm talking about fixing 
> > > > > the
> > > > > helpers. I don't know that code well, but from the outside it sounds
> > > > > like a bug there.
> > > > >
> > > > > And executing work items in the order they are submitted is trivial.
> > > > >
> > > > > Had anybody pinged Daniel or other people familiar with the helper 
> > > > > code
> > > > > about it?
> > > >
> > > > Yeah something is wrong here, and the fix looks horrible :-)
> > > >
> > > > Aside, I've also seen some recent discussion flare up about
> > > > drm_atomic_state_get/put used to paper over some other use-after-free,
> > > > but this time related to interrupt handlers. Maybe a few rules about
> > > > that:
> > > > - dont
> > > > - especially not when it's interrupt handlers, because you can't call
> > > > drm_atomic_state_put from interrupt handlers.
> > > >
> > > > Instead have a spin_lock_irq to protect the shared data with your
> > > > interrupt handler, and _copy_ the data over. This is e.g. what
> > > > drm_crtc_arm_vblank_event does.
> > >
> > > Nicholas wrote a patch that attempted to resolve the issue by adding every
> > > CRTC into the commit to use the stall checks. [1] While this forces
> > > synchronisation on commits, it's kind of a hacky method that 

Re: [PATCH] drm/amd/display: Clear dm_state for fast updates

2020-07-28 Thread Mazin Rezk
On Monday, July 27, 2020 4:29 PM, Daniel Vetter  wrote:

> On Mon, Jul 27, 2020 at 9:28 PM Christian König
>  wrote:
> >
> > Am 27.07.20 um 16:05 schrieb Kazlauskas, Nicholas:
> > > On 2020-07-27 9:39 a.m., Christian König wrote:
> > >> Am 27.07.20 um 07:40 schrieb Mazin Rezk:
> > >>> This patch fixes a race condition that causes a use-after-free during
> > >>> amdgpu_dm_atomic_commit_tail. This can occur when 2 non-blocking
> > >>> commits
> > >>> are requested and the second one finishes before the first.
> > >>> Essentially,
> > >>> this bug occurs when the following sequence of events happens:
> > >>>
> > >>> 1. Non-blocking commit #1 is requested w/ a new dm_state #1 and is
> > >>> deferred to the workqueue.
> > >>>
> > >>> 2. Non-blocking commit #2 is requested w/ a new dm_state #2 and is
> > >>> deferred to the workqueue.
> > >>>
> > >>> 3. Commit #2 starts before commit #1, dm_state #1 is used in the
> > >>> commit_tail and commit #2 completes, freeing dm_state #1.
> > >>>
> > >>> 4. Commit #1 starts after commit #2 completes, uses the freed dm_state
> > >>> 1 and dereferences a freelist pointer while setting the context.
> > >>
> > >> Well I only have a one mile high view on this, but why don't you let
> > >> the work items execute in order?
> > >>
> > >> That would be better anyway cause this way we don't trigger a cache
> > >> line ping pong between CPUs.
> > >>
> > >> Christian.
> > >
> > > We use the DRM helpers for managing drm_atomic_commit_state and those
> > > helpers internally push non-blocking commit work into the system
> > > unbound work queue.
> >
> > Mhm, well if you send those helper atomic commits in the order A,B and
> > they execute it in the order B,A I would call that a bug :)
>
> The way it works is it pushes all commits into unbound work queue, but
> then forces serialization as needed. We do _not_ want e.g. updates on
> different CRTC to be serialized, that would result in lots of judder.
> And hw is funny enough that there's all kinds of dependencies.
>
> The way you force synchronization is by adding other CRTC state
> objects. So if DC is busted and can only handle a single update per
> work item, then I guess you always need all CRTC states and everything
> will be run in order. But that also totally kills modern multi-screen
> compositors. Xorg isn't modern, just in case that's not clear :-)
>
> Looking at the code it seems like you indeed have only a single dm
> state, so yeah global sync is what you'll need as immediate fix, and
> then maybe fix up DM to not be quite so silly ... or at least only do
> the dm state stuff when really needed.
>
> We could also sprinkle the drm_crtc_commit structure around a bit
> (it's the glue that provides the synchronization across commits), but
> since your dm state is global just grabbing all crtc states
> unconditionally as part of that is probably best.
>
> > > While we could duplicate a copy of that code with nothing but the
> > > workqueue changed that isn't something I'd really like to maintain
> > > going forward.
> >
> > I'm not talking about duplicating the code, I'm talking about fixing the
> > helpers. I don't know that code well, but from the outside it sounds
> > like a bug there.
> >
> > And executing work items in the order they are submitted is trivial.
> >
> > Had anybody pinged Daniel or other people familiar with the helper code
> > about it?
>
> Yeah something is wrong here, and the fix looks horrible :-)
>
> Aside, I've also seen some recent discussion flare up about
> drm_atomic_state_get/put used to paper over some other use-after-free,
> but this time related to interrupt handlers. Maybe a few rules about
> that:
> - dont
> - especially not when it's interrupt handlers, because you can't call
> drm_atomic_state_put from interrupt handlers.
>
> Instead have a spin_lock_irq to protect the shared data with your
> interrupt handler, and _copy_ the data over. This is e.g. what
> drm_crtc_arm_vblank_event does.

Nicholas wrote a patch that attempted to resolve the issue by adding every
CRTC into the commit to use the stall checks. [1] While this forces
synchronisation on commits, it's kind of a hacky method that may take a
toll on performance.

Is it possible to have a DRM helper that forces synchronisation on some
commits without having to add every CRTC into the commit?

Also, is synchronisation really necessary for fast updates in amdgpu?
I'll admit, the idea of eliminating the use-after-free bug by eliminating
the use entirely doesn't seem ideal; but is forcing synchronisation on
these updates that much better?

[1] https://bugzilla.kernel.org/show_bug.cgi?id=207383#c96

Thanks,
Mazin Rezk

>
> Cheers, Daniel
>
> >
> > Regards,
> > Christian.
> >
> > >
> > > Regards,
> > > Nicholas Kazlauskas
> > >
> > >>
> > >>>
> > >>> Since this bug has only been spotted with fast commits, this patch
> > >>> fixes
> > >>> the bug by clearing the dm_state instead of using the old dc_state for
> > >>> fast updates. In 

Re: [PATCH] drm/amd/display: Clear dm_state for fast updates

2020-07-28 Thread Mazin Rezk
On Monday, July 27, 2020 5:32 PM, Daniel Vetter  wrote:

> On Mon, Jul 27, 2020 at 11:11 PM Mazin Rezk  wrote:
> >
> > On Monday, July 27, 2020 4:29 PM, Daniel Vetter  wrote:
> >
> > > On Mon, Jul 27, 2020 at 9:28 PM Christian König
> > >  wrote:
> > > >
> > > > Am 27.07.20 um 16:05 schrieb Kazlauskas, Nicholas:
> > > > > On 2020-07-27 9:39 a.m., Christian König wrote:
> > > > >> Am 27.07.20 um 07:40 schrieb Mazin Rezk:
> > > > >>> This patch fixes a race condition that causes a use-after-free 
> > > > >>> during
> > > > >>> amdgpu_dm_atomic_commit_tail. This can occur when 2 non-blocking
> > > > >>> commits
> > > > >>> are requested and the second one finishes before the first.
> > > > >>> Essentially,
> > > > >>> this bug occurs when the following sequence of events happens:
> > > > >>>
> > > > >>> 1. Non-blocking commit #1 is requested w/ a new dm_state #1 and is
> > > > >>> deferred to the workqueue.
> > > > >>>
> > > > >>> 2. Non-blocking commit #2 is requested w/ a new dm_state #2 and is
> > > > >>> deferred to the workqueue.
> > > > >>>
> > > > >>> 3. Commit #2 starts before commit #1, dm_state #1 is used in the
> > > > >>> commit_tail and commit #2 completes, freeing dm_state #1.
> > > > >>>
> > > > >>> 4. Commit #1 starts after commit #2 completes, uses the freed 
> > > > >>> dm_state
> > > > >>> 1 and dereferences a freelist pointer while setting the context.
> > > > >>
> > > > >> Well I only have a one mile high view on this, but why don't you let
> > > > >> the work items execute in order?
> > > > >>
> > > > >> That would be better anyway cause this way we don't trigger a cache
> > > > >> line ping pong between CPUs.
> > > > >>
> > > > >> Christian.
> > > > >
> > > > > We use the DRM helpers for managing drm_atomic_commit_state and those
> > > > > helpers internally push non-blocking commit work into the system
> > > > > unbound work queue.
> > > >
> > > > Mhm, well if you send those helper atomic commits in the order A,B and
> > > > they execute it in the order B,A I would call that a bug :)
> > >
> > > The way it works is it pushes all commits into unbound work queue, but
> > > then forces serialization as needed. We do _not_ want e.g. updates on
> > > different CRTC to be serialized, that would result in lots of judder.
> > > And hw is funny enough that there's all kinds of dependencies.
> > >
> > > The way you force synchronization is by adding other CRTC state
> > > objects. So if DC is busted and can only handle a single update per
> > > work item, then I guess you always need all CRTC states and everything
> > > will be run in order. But that also totally kills modern multi-screen
> > > compositors. Xorg isn't modern, just in case that's not clear :-)
> > >
> > > Looking at the code it seems like you indeed have only a single dm
> > > state, so yeah global sync is what you'll need as immediate fix, and
> > > then maybe fix up DM to not be quite so silly ... or at least only do
> > > the dm state stuff when really needed.
> > >
> > > We could also sprinkle the drm_crtc_commit structure around a bit
> > > (it's the glue that provides the synchronization across commits), but
> > > since your dm state is global just grabbing all crtc states
> > > unconditionally as part of that is probably best.
> > >
> > > > > While we could duplicate a copy of that code with nothing but the
> > > > > workqueue changed that isn't something I'd really like to maintain
> > > > > going forward.
> > > >
> > > > I'm not talking about duplicating the code, I'm talking about fixing the
> > > > helpers. I don't know that code well, but from the outside it sounds
> > > > like a bug there.
> > > >
> > > > And executing work items in the order they are submitted is trivial.
> > > >
> > > > Had anybody pinged Daniel or other people familiar with the helper code
> > > > about it?
> > >
> > > Yeah something is wrong here, and the fix looks horrible :-)
> > >
> > > Aside, I've also seen some recent discussion flare up about
> > > drm_atomic_state_get/put used to paper over some other use-after-free,
> > > but this time related to interrupt handlers. Maybe a few rules about
> > > that:
> > > - dont
> > > - especially not when it's interrupt handlers, because you can't call
> > > drm_atomic_state_put from interrupt handlers.
> > >
> > > Instead have a spin_lock_irq to protect the shared data with your
> > > interrupt handler, and _copy_ the data over. This is e.g. what
> > > drm_crtc_arm_vblank_event does.
> >
> > Nicholas wrote a patch that attempted to resolve the issue by adding every
> > CRTC into the commit to use the stall checks. [1] While this forces
> > synchronisation on commits, it's kind of a hacky method that may take a
> > toll on performance.
> >
> > Is it possible to have a DRM helper that forces synchronisation on some
> > commits without having to add every CRTC into the commit?
> >
> > Also, is synchronisation really necessary for fast updates in amdgpu?
> > I'll admit, the idea of eliminating the 

[PATCH] drm/amd/powerplay: update driver if version for navy_flounder

2020-07-28 Thread Jiansong Chen
It's in accordance with pmfw 65.5.0 for navy_flounder.

Signed-off-by: Jiansong Chen 
Change-Id: I984a1147030264adbc02230e2e1dd416d4ad63b0
---
 drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h 
b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
index 9504f9954fd3..6a42331aba8a 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
@@ -31,7 +31,7 @@
 #define SMU11_DRIVER_IF_VERSION_NV12 0x33
 #define SMU11_DRIVER_IF_VERSION_NV14 0x36
 #define SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x34
-#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x2
+#define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0x3
 
 /* MP Apertures */
 #define MP0_Public 0x0380
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 4/4] drm/amdgpu: assign the cpu/gpu address of fence from ring

2020-07-28 Thread Christian König

Am 28.07.20 um 12:21 schrieb Jack Xiao:

assign the cpu/gpu address of fence for the normal or mes ring
from ring structure.

Signed-off-by: Jack Xiao 
Reviewed-by: Hawking Zhang 
Acked-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 58d4c219178a..0be3e2007387 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -407,8 +407,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
uint64_t index;
  
  	if (ring->funcs->type != AMDGPU_RING_TYPE_UVD) {

-   ring->fence_drv.cpu_addr = &adev->wb.wb[ring->fence_offs];
-   ring->fence_drv.gpu_addr = adev->wb.gpu_addr + 
(ring->fence_offs * 4);
+   ring->fence_drv.cpu_addr = ring->fence_cpu_addr;
+   ring->fence_drv.gpu_addr = ring->fence_gpu_addr;


That doesn't look like a good idea to me. We should probably rather 
remove ring->fence_offs and move all the handling here instead.


Christian.


} else {
/* put fence directly behind firmware */
index = ALIGN(adev->uvd.fw->size, 8);


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 2/4] drm/amdgpu: initialize the cpu/gpu address of rptr/wptr/fence

2020-07-28 Thread Christian König

Am 28.07.20 um 12:21 schrieb Jack Xiao:

Initialize the cpu/gpu address of rptr/wptr/fence.

Signed-off-by: Jack Xiao 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 37 
  1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 13ea8ebc421c..ff63ecc861bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -150,6 +150,12 @@ void amdgpu_ring_undo(struct amdgpu_ring *ring)
ring->funcs->end_use(ring);
  }
  
+#define amdgpu_ring_get_gpu_addr(ring, offset)	\

+   (ring->adev->wb.gpu_addr + offset * 4)
+
+#define amdgpu_ring_get_cpu_addr(ring, offset) \
+   (&ring->adev->wb.wb[offset])


Those are not ring functions, but rather wb functions. Please clean that up.

Christian.


+
  /**
   * amdgpu_ring_init - init driver ring struct.
   *
@@ -217,17 +223,38 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
amdgpu_ring *ring,
"(%d) ring trail_fence_offs wb alloc failed\n", r);
return r;
}
-   ring->trail_fence_gpu_addr =
-   adev->wb.gpu_addr + (ring->trail_fence_offs * 4);
-   ring->trail_fence_cpu_addr = &adev->wb.wb[ring->trail_fence_offs];
  
  	r = amdgpu_device_wb_get(adev, &ring->cond_exe_offs);

if (r) {
dev_err(adev->dev, "(%d) ring cond_exec_polling wb alloc 
failed\n", r);
return r;
}
-   ring->cond_exe_gpu_addr = adev->wb.gpu_addr + (ring->cond_exe_offs * 4);
-   ring->cond_exe_cpu_addr = &adev->wb.wb[ring->cond_exe_offs];
+
+   ring->fence_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->fence_offs);
+   ring->fence_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->fence_offs);
+
+   ring->rptr_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->rptr_offs);
+   ring->rptr_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->rptr_offs);
+
+   ring->wptr_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->wptr_offs);
+   ring->wptr_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->wptr_offs);
+
+   ring->trail_fence_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->trail_fence_offs);
+   ring->trail_fence_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->trail_fence_offs);
+
+   ring->cond_exe_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->cond_exe_offs);
+   ring->cond_exe_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->cond_exe_offs);
+
/* always set cond_exec_polling to CONTINUE */
*ring->cond_exe_cpu_addr = 1;
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 4/4] drm/amdgpu: assign the cpu/gpu address of fence from ring

2020-07-28 Thread Ma, Le
[AMD Public Use]

Series is Reviewed-by: Le Ma 

Regards,
Ma Le

-Original Message-
From: Xiao, Jack  
Sent: Tuesday, July 28, 2020 6:22 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Zhang, Hawking ; Koenig, 
Christian ; Ma, Le 
Cc: Xiao, Jack ; Koenig, Christian 
Subject: [PATCH 4/4] drm/amdgpu: assign the cpu/gpu address of fence from ring

assign the cpu/gpu address of fence for the normal or mes ring from ring 
structure.

Signed-off-by: Jack Xiao 
Reviewed-by: Hawking Zhang 
Acked-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 58d4c219178a..0be3e2007387 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -407,8 +407,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
uint64_t index;
 
if (ring->funcs->type != AMDGPU_RING_TYPE_UVD) {
-   ring->fence_drv.cpu_addr = &adev->wb.wb[ring->fence_offs];
-   ring->fence_drv.gpu_addr = adev->wb.gpu_addr + 
(ring->fence_offs * 4);
+   ring->fence_drv.cpu_addr = ring->fence_cpu_addr;
+   ring->fence_drv.gpu_addr = ring->fence_gpu_addr;
} else {
/* put fence directly behind firmware */
index = ALIGN(adev->uvd.fw->size, 8);
--
2.26.2
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/4] drm/amdgpu: initialize the cpu/gpu address of rptr/wptr/fence

2020-07-28 Thread Jack Xiao
Initialize the cpu/gpu address of rptr/wptr/fence.

Signed-off-by: Jack Xiao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 37 
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 13ea8ebc421c..ff63ecc861bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -150,6 +150,12 @@ void amdgpu_ring_undo(struct amdgpu_ring *ring)
ring->funcs->end_use(ring);
 }
 
+#define amdgpu_ring_get_gpu_addr(ring, offset) \
+   (ring->adev->wb.gpu_addr + offset * 4)
+
+#define amdgpu_ring_get_cpu_addr(ring, offset) \
+   (&ring->adev->wb.wb[offset])
+
 /**
  * amdgpu_ring_init - init driver ring struct.
  *
@@ -217,17 +223,38 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
amdgpu_ring *ring,
"(%d) ring trail_fence_offs wb alloc failed\n", r);
return r;
}
-   ring->trail_fence_gpu_addr =
-   adev->wb.gpu_addr + (ring->trail_fence_offs * 4);
-   ring->trail_fence_cpu_addr = &adev->wb.wb[ring->trail_fence_offs];
 
r = amdgpu_device_wb_get(adev, &ring->cond_exe_offs);
if (r) {
dev_err(adev->dev, "(%d) ring cond_exec_polling wb alloc 
failed\n", r);
return r;
}
-   ring->cond_exe_gpu_addr = adev->wb.gpu_addr + (ring->cond_exe_offs * 4);
-   ring->cond_exe_cpu_addr = &adev->wb.wb[ring->cond_exe_offs];
+
+   ring->fence_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->fence_offs);
+   ring->fence_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->fence_offs);
+
+   ring->rptr_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->rptr_offs);
+   ring->rptr_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->rptr_offs);
+
+   ring->wptr_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->wptr_offs);
+   ring->wptr_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->wptr_offs);
+
+   ring->trail_fence_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->trail_fence_offs);
+   ring->trail_fence_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->trail_fence_offs);
+
+   ring->cond_exe_gpu_addr =
+   amdgpu_ring_get_gpu_addr(ring, ring->cond_exe_offs);
+   ring->cond_exe_cpu_addr =
+   amdgpu_ring_get_cpu_addr(ring, ring->cond_exe_offs);
+
/* always set cond_exec_polling to CONTINUE */
*ring->cond_exe_cpu_addr = 1;
 
-- 
2.26.2

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 3/4] drm/amdgpu: use ring structure to access rptr/wptr v2

2020-07-28 Thread Jack Xiao
Use ring structure to access the cpu/gpu address of rptr/wptr.

v2: merge gfx10/sdma5/sdma5.2 patches

Signed-off-by: Jack Xiao 
Reviewed-by: Christian König 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  8 +++---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 37 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c  |  8 +++---
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c  | 12 -
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 20 +++---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 25 +
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c |  4 +--
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c |  4 +--
 drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c |  4 +--
 drivers/gpu/drm/amd/amdgpu/mes_v10_1.c | 11 
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  8 +++---
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 16 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 28 ---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 16 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 16 +--
 drivers/gpu/drm/amd/amdgpu/si_dma.c|  4 +--
 drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c  |  6 ++---
 drivers/gpu/drm/amd/amdgpu/vce_v4_0.c  |  6 ++---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c  | 12 -
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c  | 12 -
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c  | 12 -
 21 files changed, 126 insertions(+), 143 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index 20f108818b2b..a6a7aa9e9aec 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -164,7 +164,7 @@ static uint64_t cik_sdma_ring_get_rptr(struct amdgpu_ring 
*ring)
 {
u32 rptr;
 
-   rptr = ring->adev->wb.wb[ring->rptr_offs];
+   rptr = *ring->rptr_cpu_addr;
 
return (rptr & 0x3fffc) >> 2;
 }
@@ -432,12 +432,10 @@ static int cik_sdma_gfx_resume(struct amdgpu_device *adev)
struct amdgpu_ring *ring;
u32 rb_cntl, ib_cntl;
u32 rb_bufsz;
-   u32 wb_offset;
int i, j, r;
 
for (i = 0; i < adev->sdma.num_instances; i++) {
ring = &adev->sdma.instance[i].ring;
-   wb_offset = (ring->rptr_offs * 4);
 
mutex_lock(&adev->srbm_mutex);
for (j = 0; j < 16; j++) {
@@ -473,9 +471,9 @@ static int cik_sdma_gfx_resume(struct amdgpu_device *adev)
 
/* set the wb address whether it's enabled or not */
WREG32(mmSDMA0_GFX_RB_RPTR_ADDR_HI + sdma_offsets[i],
-  upper_32_bits(adev->wb.gpu_addr + wb_offset) & 
0x);
+  upper_32_bits(ring->rptr_gpu_addr) & 0x);
WREG32(mmSDMA0_GFX_RB_RPTR_ADDR_LO + sdma_offsets[i],
-  ((adev->wb.gpu_addr + wb_offset) & 0xFFFC));
+  ((ring->rptr_gpu_addr) & 0xFFFC));
 
rb_cntl |= SDMA0_GFX_RB_CNTL__RPTR_WRITEBACK_ENABLE_MASK;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index db9f1e89a0f8..7036e286b627 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3205,9 +3205,8 @@ static void gfx10_kiq_set_resources(struct amdgpu_ring 
*kiq_ring, uint64_t queue
 static void gfx10_kiq_map_queues(struct amdgpu_ring *kiq_ring,
 struct amdgpu_ring *ring)
 {
-   struct amdgpu_device *adev = kiq_ring->adev;
uint64_t mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
-   uint64_t wptr_addr = adev->wb.gpu_addr + (ring->wptr_offs * 4);
+   uint64_t wptr_addr = ring->wptr_gpu_addr;
uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;
 
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
@@ -5835,12 +5834,12 @@ static int gfx_v10_0_cp_gfx_resume(struct amdgpu_device 
*adev)
WREG32_SOC15(GC, 0, mmCP_RB0_WPTR_HI, upper_32_bits(ring->wptr));
 
/* set the wb address wether it's enabled or not */
-   rptr_addr = adev->wb.gpu_addr + (ring->rptr_offs * 4);
+   rptr_addr = ring->rptr_gpu_addr;
WREG32_SOC15(GC, 0, mmCP_RB0_RPTR_ADDR, lower_32_bits(rptr_addr));
WREG32_SOC15(GC, 0, mmCP_RB0_RPTR_ADDR_HI, upper_32_bits(rptr_addr) &
 CP_RB_RPTR_ADDR_HI__RB_RPTR_ADDR_HI_MASK);
 
-   wptr_gpu_addr = adev->wb.gpu_addr + (ring->wptr_offs * 4);
+   wptr_gpu_addr = ring->wptr_gpu_addr;
WREG32_SOC15(GC, 0, mmCP_RB_WPTR_POLL_ADDR_LO,
 lower_32_bits(wptr_gpu_addr));
WREG32_SOC15(GC, 0, mmCP_RB_WPTR_POLL_ADDR_HI,
@@ -5873,11 +5872,11 @@ static int gfx_v10_0_cp_gfx_resume(struct amdgpu_device 
*adev)
WREG32_SOC15(GC, 0, mmCP_RB1_WPTR, lower_32_bits(ring->wptr));
WREG32_SOC15(GC, 0, mmCP_RB1_WPTR_HI, 
upper_32_bits(ring->wptr));
/* Set the wb address wether it's enabled or not */
-   rptr_addr = 

[PATCH 4/4] drm/amdgpu: assign the cpu/gpu address of fence from ring

2020-07-28 Thread Jack Xiao
assign the cpu/gpu address of fence for the normal or mes ring
from ring structure.

Signed-off-by: Jack Xiao 
Reviewed-by: Hawking Zhang 
Acked-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 58d4c219178a..0be3e2007387 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -407,8 +407,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
uint64_t index;
 
if (ring->funcs->type != AMDGPU_RING_TYPE_UVD) {
-   ring->fence_drv.cpu_addr = &adev->wb.wb[ring->fence_offs];
-   ring->fence_drv.gpu_addr = adev->wb.gpu_addr + 
(ring->fence_offs * 4);
+   ring->fence_drv.cpu_addr = ring->fence_cpu_addr;
+   ring->fence_drv.gpu_addr = ring->fence_gpu_addr;
} else {
/* put fence directly behind firmware */
index = ALIGN(adev->uvd.fw->size, 8);
-- 
2.26.2

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/4] drm/amdgpu: define ring structure to access rptr/wptr/fence

2020-07-28 Thread Jack Xiao
Define ring structure to access the cpu/gpu address of rptr/wptr/fence
instead of dynamic calculation.

Cc: Christian König 
Suggested-by: Christian König 
Signed-off-by: Jack Xiao 
Reviewed-by: Hawking Zhang 
Acked-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index da871d84b742..940618d1bd4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -208,6 +208,8 @@ struct amdgpu_ring {
struct amdgpu_bo*ring_obj;
volatile uint32_t   *ring;
unsignedrptr_offs;
+   u64 rptr_gpu_addr;
+   volatile u32*rptr_cpu_addr;
u64 wptr;
u64 wptr_old;
unsignedring_size;
@@ -228,7 +230,11 @@ struct amdgpu_ring {
booluse_doorbell;
booluse_pollmem;
unsignedwptr_offs;
+   u64 wptr_gpu_addr;
+   volatile u32*wptr_cpu_addr;
unsignedfence_offs;
+   u64 fence_gpu_addr;
+   volatile u32*fence_cpu_addr;
uint64_tcurrent_ctx;
charname[16];
u32 trail_seq;
-- 
2.26.2

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU

2020-07-28 Thread Li, Dennis
[AMD Official Use Only - Internal Distribution Only]

Hi, Guchun,
  Please see my comments below.

Best Regards
Dennis Li
-Original Message-
From: Chen, Guchun  
Sent: Tuesday, July 28, 2020 3:49 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Zhang, Hawking ; Li, Dennis 
; Grodzovsky, Andrey ; Zhou1, Tao 
; Clements, John ; Lazar, Lijo 
; Koenig, Christian ; Yang, 
Stanley 
Cc: Chen, Guchun 
Subject: [PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU

When retrieving bad gpu tag from eeprom, GPU init should fail as the GPU needs 
to be retired for further check.

v2: Fix spelling typo, correct the condition to detect
bad gpu tag and refine error message.

v3: Refine function argument name.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 ++-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2662cd7c8685..30af0dfee1a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
 * it should be called after amdgpu_device_ip_hw_init_phase2  since
 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 * for I2C communication which only true at this point.
-* recovery_init may fail, but it can free all resources allocated by
-* itself and its failure should not stop amdgpu init process.
+*
+* amdgpu_ras_recovery_init may fail, but the upper only cares the
+* failure from bad gpu situation and stop amdgpu init process
+* accordingly. For other failed cases, it will still release all
+* the resource and print error message, rather than returning one
+* negative value to upper level.
 *
 * Note: theoretically, this should be called before all vram 
allocations
 * to protect retired page from abusing
 */
-   amdgpu_ras_recovery_init(adev);
+   r = amdgpu_ras_recovery_init(adev);
+   if (r)
+   goto init_failed;
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3c4c142e9d8a..56e1aeba2d64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0;
+   bool exc_err_limit = false;
int ret;
 
if (con)
@@ -1843,9 +1844,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
-   ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-   if (ret)
+   ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+   /*
+* We only fail this calling and halt booting up
+* when exc_err_limit is true.
+*/
+   if (exc_err_limit) {
+   ret = -EINVAL;
goto free;
+   }

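[Dennis Li] Compared with the old code, the new change no longer checks ret: a non-zero return from amdgpu_ras_eeprom_init is now ignored unless exc_err_limit is set.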
 
if (con->eeprom_control.num_recs) {
ret = amdgpu_ras_load_bad_pages(adev); @@ -1868,6 +1875,13 @@ 
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 out:
dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
 
+   /*
+* Except error threshold exceeding case, other failure cases in this
+* function would not fail amdgpu driver init.
+*/
+   if (!exc_err_limit)
+   ret = 0;
+
return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 35c0c849d49b..67995b66d7d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct 
amdgpu_ras_eeprom_control *control)
 
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+   bool *exceed_err_limit)

 {
int ret = 0;
struct amdgpu_device *adev = to_amdgpu_device(control); @@ -254,6 
+255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
.buf= buff,
};
 
+   

RE: FW: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)

2020-07-28 Thread Liu, Monk
[AMD Official Use Only - Internal Distribution Only]

I repeated the patch broadcast through git-send-email

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Koenig, Christian 
Sent: Tuesday, July 28, 2020 5:04 PM
To: Liu, Monk ; amd-...@freedesktop.org
Cc: Kuehling, Felix 
Subject: Re: FW: [PATCH] drm/amdgpu: introduce a new parameter to configure how 
many KCQ we want(v3)

The patch looks totally mangled to me, e.g. some spaces and new lines are 
missing.

Probably because it was forwarded.

Christian.

Am 28.07.20 um 10:59 schrieb Liu, Monk:
> [AMD Official Use Only - Internal Distribution Only]
>
> -Original Message-
> From: Monk Liu 
> Sent: Tuesday, July 28, 2020 2:59 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk 
> Subject: [PATCH] drm/amdgpu: introduce a new parameter to configure
> how many KCQ we want(v3)
>
> what:
> the MQD's save and restore of KCQ (kernel compute queue) cost lots of
> clocks during world switch which impacts a lot to multi-VF performance
>
> how:
> introduce a parameter to control the number of KCQ to avoid performance
> drop if there is no kernel compute queue needed
>
> notes:
> this parameter only affects gfx 8/9/10
>
> v2:
> refine namings
>
> v3:
> choose queues for each ring so that they are spread across pipes as evenly as possible.
>
> TODO:
> in the future we will let the hypervisor driver set this parameter
> automatically thus no need for user to configure it through modprobe
> in virtual machine
>
> Signed-off-by: Monk Liu 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  4 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 58 
> +++---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 29 +++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 31 
>   7 files changed, 87 insertions(+), 71 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e97c088..de11136 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -201,6 +201,7 @@ extern int amdgpu_si_support;  #ifdef
> CONFIG_DRM_AMDGPU_CIK  extern int amdgpu_cik_support;  #endif
> +extern int amdgpu_num_kcq;
>
>   #define AMDGPU_VM_MAX_NUM_CTX4096
>   #define AMDGPU_SG_THRESHOLD(256*1024*1024)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 62ecac9..cf445bab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct
> amdgpu_device *adev)
>
>   amdgpu_gmc_tmz_set(adev);
>
> +if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) { amdgpu_num_kcq = 8;
> +dev_warn(adev->dev, "set kernel compute queue number to 8 due to
> +invalid paramter provided by user\n"); }
> +
>   return 0;
>   }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 6291f5f..b545c40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -150,6 +150,7 @@ int amdgpu_noretry;
>   int amdgpu_force_asic_type = -1;
>   int amdgpu_tmz = 0;
>   int amdgpu_reset_method = -1; /* auto */
> +int amdgpu_num_kcq = -1;
>
>   struct amdgpu_mgpu_info mgpu_info = {
>   .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
> @@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);
> MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default),
> 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");
> module_param_named(reset_method, amdgpu_reset_method, int, 0444);
>
> +MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want
> +to setup (8 if set to greater than 8 or less than 0, only affect gfx
> +8+)"); module_param_named(num_kcq, amdgpu_num_kcq, int, 0444);
> +
>   static const struct pci_device_id pciidlist[] = {  #ifdef  
> CONFIG_DRM_AMDGPU_SI
>   {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 8eff017..f83a9a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -202,40 +202,42 @@ bool
> amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
>
>   void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)  {
> -int i, queue, pipe, mec;
> +int i, queue, pipe;
>   bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev);
> +int max_queues_per_mec = min(adev->gfx.mec.num_pipe_per_mec *
> + adev->gfx.mec.num_queue_per_pipe,
> + adev->gfx.num_compute_rings);
> +
> +if (multipipe_policy) {
> +/* policy: make queues evenly cross all pipes on MEC1 only */ for (i
> += 

Re: [PATCH] drm/amd/display: Clear dm_state for fast updates

2020-07-28 Thread daniel
On Mon, Jul 27, 2020 at 10:49:48PM -0400, Kazlauskas, Nicholas wrote:
> On 2020-07-27 5:32 p.m., Daniel Vetter wrote:
> > On Mon, Jul 27, 2020 at 11:11 PM Mazin Rezk  wrote:
> > > 
> > > On Monday, July 27, 2020 4:29 PM, Daniel Vetter  wrote:
> > > 
> > > > On Mon, Jul 27, 2020 at 9:28 PM Christian König
> > > >  wrote:
> > > > > 
> > > > > Am 27.07.20 um 16:05 schrieb Kazlauskas, Nicholas:
> > > > > > On 2020-07-27 9:39 a.m., Christian König wrote:
> > > > > > > Am 27.07.20 um 07:40 schrieb Mazin Rezk:
> > > > > > > > This patch fixes a race condition that causes a use-after-free 
> > > > > > > > during
> > > > > > > > amdgpu_dm_atomic_commit_tail. This can occur when 2 non-blocking
> > > > > > > > commits
> > > > > > > > are requested and the second one finishes before the first.
> > > > > > > > Essentially,
> > > > > > > > this bug occurs when the following sequence of events happens:
> > > > > > > > 
> > > > > > > > 1. Non-blocking commit #1 is requested w/ a new dm_state #1 and 
> > > > > > > > is
> > > > > > > > deferred to the workqueue.
> > > > > > > > 
> > > > > > > > 2. Non-blocking commit #2 is requested w/ a new dm_state #2 and 
> > > > > > > > is
> > > > > > > > deferred to the workqueue.
> > > > > > > > 
> > > > > > > > 3. Commit #2 starts before commit #1, dm_state #1 is used in the
> > > > > > > > commit_tail and commit #2 completes, freeing dm_state #1.
> > > > > > > > 
> > > > > > > > 4. Commit #1 starts after commit #2 completes, uses the freed 
> > > > > > > > dm_state
> > > > > > > > 1 and dereferences a freelist pointer while setting the context.
> > > > > > > 
> > > > > > > Well I only have a one mile high view on this, but why don't you 
> > > > > > > let
> > > > > > > the work items execute in order?
> > > > > > > 
> > > > > > > That would be better anyway cause this way we don't trigger a 
> > > > > > > cache
> > > > > > > line ping pong between CPUs.
> > > > > > > 
> > > > > > > Christian.
> > > > > > 
> > > > > > We use the DRM helpers for managing drm_atomic_commit_state and 
> > > > > > those
> > > > > > helpers internally push non-blocking commit work into the system
> > > > > > unbound work queue.
> > > > > 
> > > > > Mhm, well if you send those helper atomic commits in the order A,B and
> > > > > they execute it in the order B,A I would call that a bug :)
> > > > 
> > > > The way it works is it pushes all commits into unbound work queue, but
> > > > then forces serialization as needed. We do _not_ want e.g. updates on
> > > > different CRTC to be serialized, that would result in lots of judder.
> > > > And hw is funny enough that there's all kinds of dependencies.
> > > > 
> > > > The way you force synchronization is by adding other CRTC state
> > > > objects. So if DC is busted and can only handle a single update per
> > > > work item, then I guess you always need all CRTC states and everything
> > > > will be run in order. But that also totally kills modern multi-screen
> > > > compositors. Xorg isn't modern, just in case that's not clear :-)
> > > > 
> > > > > Looking at the code it seems like you indeed have only a single dm
> > > > state, so yeah global sync is what you'll need as immediate fix, and
> > > > then maybe fix up DM to not be quite so silly ... or at least only do
> > > > the dm state stuff when really needed.
> > > > 
> > > > We could also sprinkle the drm_crtc_commit structure around a bit
> > > > (it's the glue that provides the synchronization across commits), but
> > > > since your dm state is global just grabbing all crtc states
> > > > unconditionally as part of that is probably best.
> > > > 
> > > > > > While we could duplicate a copy of that code with nothing but the
> > > > > > workqueue changed that isn't something I'd really like to maintain
> > > > > > going forward.
> > > > > 
> > > > > I'm not talking about duplicating the code, I'm talking about fixing 
> > > > > the
> > > > > helpers. I don't know that code well, but from the outside it sounds
> > > > > like a bug there.
> > > > > 
> > > > > And executing work items in the order they are submitted is trivial.
> > > > > 
> > > > > Had anybody pinged Daniel or other people familiar with the helper 
> > > > > code
> > > > > about it?
> > > > 
> > > > Yeah something is wrong here, and the fix looks horrible :-)
> > > > 
> > > > Aside, I've also seen some recent discussion flare up about
> > > > drm_atomic_state_get/put used to paper over some other use-after-free,
> > > > but this time related to interrupt handlers. Maybe a few rules about
> > > > that:
> > > > > - don't
> > > > - especially not when it's interrupt handlers, because you can't call
> > > > drm_atomic_state_put from interrupt handlers.
> > > > 
> > > > > Instead have a spin_lock_irq to protect the shared data with your
> > > > > interrupt handler, and _copy_ the data over. This is e.g. what
> > > > drm_crtc_arm_vblank_event does.
> > > 
> > > Nicholas wrote a patch that attempted to resolve the issue by adding every
> > > CRTC into the 

Re: FW: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)

2020-07-28 Thread Christian König
The patch looks totally mangled to me, e.g. some spaces and new lines 
are missing.


Probably because it was forwarded.

Christian.

Am 28.07.20 um 10:59 schrieb Liu, Monk:

[AMD Official Use Only - Internal Distribution Only]

-Original Message-
From: Monk Liu 
Sent: Tuesday, July 28, 2020 2:59 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk 
Subject: [PATCH] drm/amdgpu: introduce a new parameter to configure how many 
KCQ we want(v3)

what:
the MQD's save and restore of KCQ (kernel compute queue) cost lots of clocks 
during world switch which impacts a lot to multi-VF performance

how:
introduce a parameter to control the number of KCQ to avoid performance drop if 
there is no kernel compute queue needed

notes:
this parameter only affects gfx 8/9/10

v2:
refine namings

v3:
choose queues for each ring so that they are spread across pipes as evenly as possible.

TODO:
in the future we will let the hypervisor driver set this parameter automatically 
thus no need for user to configure it through modprobe in virtual machine

Signed-off-by: Monk Liu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  4 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 58 +++---
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 
  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 29 +++
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 31 
  7 files changed, 87 insertions(+), 71 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e97c088..de11136 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -201,6 +201,7 @@ extern int amdgpu_si_support;  #ifdef CONFIG_DRM_AMDGPU_CIK 
 extern int amdgpu_cik_support;  #endif
+extern int amdgpu_num_kcq;

  #define AMDGPU_VM_MAX_NUM_CTX4096
  #define AMDGPU_SG_THRESHOLD(256*1024*1024)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac9..cf445bab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct 
amdgpu_device *adev)

  amdgpu_gmc_tmz_set(adev);

+if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
+amdgpu_num_kcq = 8;
+dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid paramter 
provided by user\n");
+}
+
  return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 6291f5f..b545c40 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -150,6 +150,7 @@ int amdgpu_noretry;
  int amdgpu_force_asic_type = -1;
  int amdgpu_tmz = 0;
  int amdgpu_reset_method = -1; /* auto */
+int amdgpu_num_kcq = -1;

  struct amdgpu_mgpu_info mgpu_info = {
  .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
@@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);  
MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 
= mode0, 2 = mode1, 3 = mode2, 4 = baco)");  module_param_named(reset_method, 
amdgpu_reset_method, int, 0444);

+MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to
+setup (8 if set to greater than 8 or less than 0, only affect gfx
+8+)"); module_param_named(num_kcq, amdgpu_num_kcq, int, 0444);
+
  static const struct pci_device_id pciidlist[] = {  #ifdef  
CONFIG_DRM_AMDGPU_SI
  {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 8eff017..f83a9a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -202,40 +202,42 @@ bool amdgpu_gfx_is_high_priority_compute_queue(struct 
amdgpu_device *adev,

  void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)  {
-int i, queue, pipe, mec;
+int i, queue, pipe;
  bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev);
+int max_queues_per_mec = min(adev->gfx.mec.num_pipe_per_mec *
+ adev->gfx.mec.num_queue_per_pipe,
+ adev->gfx.num_compute_rings);
+
+if (multipipe_policy) {
+/* policy: make queues evenly cross all pipes on MEC1 only */
+for (i = 0; i < max_queues_per_mec; i++) {
+pipe = i % adev->gfx.mec.num_pipe_per_mec;
+queue = (i / adev->gfx.mec.num_pipe_per_mec) %
+adev->gfx.mec.num_queue_per_pipe;
+
+set_bit(pipe * adev->gfx.mec.num_queue_per_pipe + queue,
+adev->gfx.mec.queue_bitmap);
+}
+} else {
+int mec;

-/* policy for amdgpu compute queue ownership */
-for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
-queue = i % adev->gfx.mec.num_queue_per_pipe;
-pipe = (i / adev->gfx.mec.num_queue_per_pipe)
-% adev->gfx.mec.num_pipe_per_mec;
-mec = (i / adev->gfx.mec.num_queue_per_pipe)
-/ adev->gfx.mec.num_pipe_per_mec;
-
-/* we've run out of HW */
-if (mec >= 

[PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)

2020-07-28 Thread Monk Liu
what:
the MQD's save and restore of KCQ (kernel compute queue)
cost lots of clocks during world switch which impacts a lot
to multi-VF performance

how:
introduce a parameter to control the number of KCQ to avoid
performance drop if there is no kernel compute queue needed

notes:
this parameter only affects gfx 8/9/10

v2:
refine namings

v3:
choose queues for each ring so that they are spread across pipes as evenly as possible.

TODO:
in the future we will let the hypervisor driver set this parameter
automatically, so there is no need for the user to configure it through
modprobe in the virtual machine

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 58 +++---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 29 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 31 
 7 files changed, 87 insertions(+), 71 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e97c088..de11136 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -201,6 +201,7 @@ extern int amdgpu_si_support;
 #ifdef CONFIG_DRM_AMDGPU_CIK
 extern int amdgpu_cik_support;
 #endif
+extern int amdgpu_num_kcq;
 
 #define AMDGPU_VM_MAX_NUM_CTX  4096
 #define AMDGPU_SG_THRESHOLD(256*1024*1024)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac9..cf445bab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct 
amdgpu_device *adev)
 
amdgpu_gmc_tmz_set(adev);
 
+   if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
+   amdgpu_num_kcq = 8;
+   dev_warn(adev->dev, "set kernel compute queue number to 8 due 
to invalid paramter provided by user\n");
+   }
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 6291f5f..b545c40 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -150,6 +150,7 @@ int amdgpu_noretry;
 int amdgpu_force_asic_type = -1;
 int amdgpu_tmz = 0;
 int amdgpu_reset_method = -1; /* auto */
+int amdgpu_num_kcq = -1;
 
 struct amdgpu_mgpu_info mgpu_info = {
.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
@@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);
 MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = 
legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");
 module_param_named(reset_method, amdgpu_reset_method, int, 0444);
 
+MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup 
(8 if set to greater than 8 or less than 0, only affect gfx 8+)");
+module_param_named(num_kcq, amdgpu_num_kcq, int, 0444);
+
 static const struct pci_device_id pciidlist[] = {
 #ifdef  CONFIG_DRM_AMDGPU_SI
{0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 8eff017..f83a9a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -202,40 +202,42 @@ bool amdgpu_gfx_is_high_priority_compute_queue(struct 
amdgpu_device *adev,
 
 void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)
 {
-   int i, queue, pipe, mec;
+   int i, queue, pipe;
bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev);
+   int max_queues_per_mec = min(adev->gfx.mec.num_pipe_per_mec *
+
adev->gfx.mec.num_queue_per_pipe,
+
adev->gfx.num_compute_rings);
+
+   if (multipipe_policy) {
+   /* policy: make queues evenly cross all pipes on MEC1 only */
+   for (i = 0; i < max_queues_per_mec; i++) {
+   pipe = i % adev->gfx.mec.num_pipe_per_mec;
+   queue = (i / adev->gfx.mec.num_pipe_per_mec) %
+   adev->gfx.mec.num_queue_per_pipe;
+
+   set_bit(pipe * adev->gfx.mec.num_queue_per_pipe + queue,
+   adev->gfx.mec.queue_bitmap);
+   }
+   } else {
+   int mec;
 
-   /* policy for amdgpu compute queue ownership */
-   for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
-   queue = i % adev->gfx.mec.num_queue_per_pipe;
-   pipe = (i / adev->gfx.mec.num_queue_per_pipe)
-   % adev->gfx.mec.num_pipe_per_mec;
-   mec = (i / adev->gfx.mec.num_queue_per_pipe)

RE: [PATCH 09/12] drm/amdgpu: define one macro for RAS's sysfs/debugfs name

2020-07-28 Thread Chen, Guchun
[AMD Public Use]

Thanks Christian. Your suggestion looks better, let me improve it.

Regards,
Guchun

-Original Message-
From: Koenig, Christian  
Sent: Tuesday, July 28, 2020 3:55 PM
To: Chen, Guchun ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhang, Hawking ; 
Li, Dennis ; Grodzovsky, Andrey ; 
Zhou1, Tao ; Clements, John ; Lazar, 
Lijo ; Yang, Stanley 
Subject: Re: [PATCH 09/12] drm/amdgpu: define one macro for RAS's sysfs/debugfs 
name

Am 28.07.20 um 09:49 schrieb Guchun Chen:
> Add one definition for the RAS module's FS name. It's used in both 
> debugfs and sysfs case.

Maybe better do this with a "static const char*".

Christian.

>
> Signed-off-by: Guchun Chen 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 -
>   1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2cc09aa67423..c1ed0074a52b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -34,6 +34,8 @@
>   #include "amdgpu_xgmi.h"
>   #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>   
> +#define AMDGPU_RAS_FS_NAME "ras"
> +
>   const char *ras_error_string[] = {
>   "none",
>   "parity",
> @@ -1037,7 +1039,7 @@ static int amdgpu_ras_sysfs_create_feature_node(struct 
> amdgpu_device *adev)
>   NULL
>   };
>   struct attribute_group group = {
> - .name = "ras",
> + .name = AMDGPU_RAS_FS_NAME,
>   .attrs = attrs,
>   #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
>   .bin_attrs = bin_attrs,
> @@ -1080,7 +1082,7 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct 
> amdgpu_device *adev)
>   NULL
>   };
>   struct attribute_group group = {
> - .name = "ras",
> + .name = AMDGPU_RAS_FS_NAME,
>   .attrs = attrs,
>   #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
>   .bin_attrs = bin_attrs,
> @@ -1117,7 +1119,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device 
> *adev,
>   
>   if (sysfs_add_file_to_group(>dev->kobj,
>   >sysfs_attr.attr,
> - "ras")) {
> + AMDGPU_RAS_FS_NAME)) {
>   put_obj(obj);
>   return -EINVAL;
>   }
> @@ -1137,7 +1139,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device 
> *adev,
>   
>   sysfs_remove_file_from_group(>dev->kobj,
>   >sysfs_attr.attr,
> - "ras");
> + AMDGPU_RAS_FS_NAME);
>   obj->attr_inuse = 0;
>   put_obj(obj);
>   
> @@ -1183,7 +1185,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct 
> amdgpu_device *adev)
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   struct drm_minor *minor = adev->ddev->primary;
>   
> - con->dir = debugfs_create_dir("ras", minor->debugfs_root);
> + con->dir = debugfs_create_dir(AMDGPU_RAS_FS_NAME,
> + minor->debugfs_root);
>   debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
>   adev, _ras_debugfs_ctrl_ops);
>   debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, 
> con->dir,
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 09/12] drm/amdgpu: define one macro for RAS's sysfs/debugfs name

2020-07-28 Thread Christian König

Am 28.07.20 um 09:49 schrieb Guchun Chen:

Add one definition for the RAS module's FS name. It's used
in both debugfs and sysfs case.


Maybe better do this with a "static const char*".

Christian.



Signed-off-by: Guchun Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 -
  1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2cc09aa67423..c1ed0074a52b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,8 @@
  #include "amdgpu_xgmi.h"
  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
  
+#define AMDGPU_RAS_FS_NAME "ras"

+
  const char *ras_error_string[] = {
"none",
"parity",
@@ -1037,7 +1039,7 @@ static int amdgpu_ras_sysfs_create_feature_node(struct 
amdgpu_device *adev)
NULL
};
struct attribute_group group = {
-   .name = "ras",
+   .name = AMDGPU_RAS_FS_NAME,
.attrs = attrs,
  #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
.bin_attrs = bin_attrs,
@@ -1080,7 +1082,7 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct 
amdgpu_device *adev)
NULL
};
struct attribute_group group = {
-   .name = "ras",
+   .name = AMDGPU_RAS_FS_NAME,
.attrs = attrs,
  #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
.bin_attrs = bin_attrs,
@@ -1117,7 +1119,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
  
  	if (sysfs_add_file_to_group(>dev->kobj,

>sysfs_attr.attr,
-   "ras")) {
+   AMDGPU_RAS_FS_NAME)) {
put_obj(obj);
return -EINVAL;
}
@@ -1137,7 +1139,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
  
  	sysfs_remove_file_from_group(>dev->kobj,

>sysfs_attr.attr,
-   "ras");
+   AMDGPU_RAS_FS_NAME);
obj->attr_inuse = 0;
put_obj(obj);
  
@@ -1183,7 +1185,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)

struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct drm_minor *minor = adev->ddev->primary;
  
-	con->dir = debugfs_create_dir("ras", minor->debugfs_root);

+   con->dir = debugfs_create_dir(AMDGPU_RAS_FS_NAME,
+   minor->debugfs_root);
debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
adev, _ras_debugfs_ctrl_ops);
debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 09/12] drm/amdgpu: define one macro for RAS's sysfs/debugfs name

2020-07-28 Thread Guchun Chen
Add one definition for the RAS module's FS name. It's used
in both debugfs and sysfs case.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2cc09aa67423..c1ed0074a52b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,8 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 
+#define AMDGPU_RAS_FS_NAME "ras"
+
 const char *ras_error_string[] = {
"none",
"parity",
@@ -1037,7 +1039,7 @@ static int amdgpu_ras_sysfs_create_feature_node(struct 
amdgpu_device *adev)
NULL
};
struct attribute_group group = {
-   .name = "ras",
+   .name = AMDGPU_RAS_FS_NAME,
.attrs = attrs,
 #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
.bin_attrs = bin_attrs,
@@ -1080,7 +1082,7 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct 
amdgpu_device *adev)
NULL
};
struct attribute_group group = {
-   .name = "ras",
+   .name = AMDGPU_RAS_FS_NAME,
.attrs = attrs,
 #if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
.bin_attrs = bin_attrs,
@@ -1117,7 +1119,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
 
if (sysfs_add_file_to_group(>dev->kobj,
>sysfs_attr.attr,
-   "ras")) {
+   AMDGPU_RAS_FS_NAME)) {
put_obj(obj);
return -EINVAL;
}
@@ -1137,7 +1139,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
 
sysfs_remove_file_from_group(>dev->kobj,
>sysfs_attr.attr,
-   "ras");
+   AMDGPU_RAS_FS_NAME);
obj->attr_inuse = 0;
put_obj(obj);
 
@@ -1183,7 +1185,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct 
amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct drm_minor *minor = adev->ddev->primary;
 
-   con->dir = debugfs_create_dir("ras", minor->debugfs_root);
+   con->dir = debugfs_create_dir(AMDGPU_RAS_FS_NAME,
+   minor->debugfs_root);
debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
adev, _ras_debugfs_ctrl_ops);
debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 05/12] drm/amdgpu: skip bad page reservation once issuing from eeprom write

2020-07-28 Thread Guchun Chen
Once the ras recovery is issued from the eeprom write itself,
bad page reservation should be skipped; otherwise, the eeprom
write would be called recursively.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 14 +++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 56e1aeba2d64..3c4b9127660d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -62,8 +62,6 @@ const char *ras_block_string[] = {
 #define ras_err_str(i) (ras_error_string[ffs(i)])
 #define ras_block_str(i) (ras_block_string[i])
 
-#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  1
-#define AMDGPU_RAS_FLAG_INIT_NEED_RESET2
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
 
 /* inject address is 52 bits */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 4672649a9293..cf9f60202334 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -31,6 +31,10 @@
 #include "ta_ras_if.h"
 #include "amdgpu_ras_eeprom.h"
 
+#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  (0x1 << 0)
+#define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1)
+#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV (0x1 << 2)
+
 enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
AMDGPU_RAS_BLOCK__SDMA,
@@ -503,10 +507,14 @@ static inline int amdgpu_ras_reset_gpu(struct 
amdgpu_device *adev)
 {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-   /* save bad page to eeprom before gpu reset,
-* i2c may be unstable in gpu reset
+   /*
+* Save bad page to eeprom before gpu reset, i2c may be unstable
+* in gpu reset.
+*
+* Also, exclude the case when ras recovery issuer is
+* eeprom page write itself.
 */
-   if (in_task())
+   if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) && in_task())
amdgpu_ras_reserve_bad_pages(adev);
 
if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 11/12] drm/amdgpu: disable page reservation when amdgpu_bad_page_threshold = 0

2020-07-28 Thread Guchun Chen
When amdgpu_bad_page_threshold = 0, bad page reservation is
skipped in both the UMC ECC irq and the page retirement call of
the sync flood isr.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ddcf2470a20b..bbff89caf8c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1678,7 +1678,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device 
*adev)
int ret = 0;
 
/* no bad page record, skip eeprom access */
-   if (!control->num_recs)
+   if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
return ret;
 
bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
@@ -1782,7 +1782,8 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device 
*adev)
struct amdgpu_bo *bo = NULL;
int i, ret = 0;
 
-   if (!con || !con->eh_data)
+   /* Not reserve bad page when amdgpu_bad_page_threshold == 0. */
+   if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0))
return 0;
 
mutex_lock(>recovery_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index af1b1ccf613c..262baf0f61ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -125,8 +125,9 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device 
*adev,
"detected in UMC block\n",
err_data->ue_count);
 
-   if (err_data->err_addr_cnt &&
-   amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+   if ((amdgpu_bad_page_threshold != 0) &&
+   err_data->err_addr_cnt &&
+   amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
err_data->err_addr_cnt))
dev_warn(adev->dev, "Failed to add ras bad page!\n");
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 10/12] drm/amdgpu: decouple sysfs creating of bad page node

2020-07-28 Thread Guchun Chen
Bad page information should not be exposed by sysfs when
bad page retirement is disabled, so decouple it from ras
sysfs group creation, and add a guard before creating it.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 71 -
 1 file changed, 46 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c1ed0074a52b..ddcf2470a20b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1027,6 +1027,35 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct 
device *dev,
return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 }
 
+static void amdgpu_ras_sysfs_add_badpage_node(struct amdgpu_device *adev)
+{
+#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct attribute_group group;
+   struct bin_attribute *bin_attrs[] = {
+   >badpages_attr,
+   NULL,
+   };
+
+   con->badpages_attr = (struct bin_attribute) {
+   .attr = {
+   .name = "gpu_vram_bad_pages",
+   .mode = S_IRUGO,
+   },
+   .size = 0,
+   .private = NULL,
+   .read = amdgpu_ras_sysfs_badpages_read,
+   };
+
+   group.name = AMDGPU_RAS_FS_NAME;
+   group.bin_attrs = bin_attrs;
+
+   sysfs_bin_attr_init(bin_attrs[0]);
+
+   sysfs_update_group(>dev->kobj, );
+#endif
+}
+
 static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1034,16 +1063,9 @@ static int amdgpu_ras_sysfs_create_feature_node(struct 
amdgpu_device *adev)
>features_attr.attr,
NULL
};
-   struct bin_attribute *bin_attrs[] = {
-   >badpages_attr,
-   NULL
-   };
struct attribute_group group = {
.name = AMDGPU_RAS_FS_NAME,
.attrs = attrs,
-#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
-   .bin_attrs = bin_attrs,
-#endif
};
 
con->features_attr = (struct device_attribute) {
@@ -1054,22 +1076,22 @@ static int amdgpu_ras_sysfs_create_feature_node(struct 
amdgpu_device *adev)
.show = amdgpu_ras_sysfs_features_read,
};
 
-   con->badpages_attr = (struct bin_attribute) {
-   .attr = {
-   .name = "gpu_vram_bad_pages",
-   .mode = S_IRUGO,
-   },
-   .size = 0,
-   .private = NULL,
-   .read = amdgpu_ras_sysfs_badpages_read,
-   };
-
sysfs_attr_init(attrs[0]);
-   sysfs_bin_attr_init(bin_attrs[0]);
 
return sysfs_create_group(>dev->kobj, );
 }
 
+static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
+{
+#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   sysfs_remove_file_from_group(>dev->kobj,
+   >badpages_attr.attr,
+   AMDGPU_RAS_FS_NAME);
+#endif
+}
+
 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1077,16 +1099,9 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct 
amdgpu_device *adev)
>features_attr.attr,
NULL
};
-   struct bin_attribute *bin_attrs[] = {
-   >badpages_attr,
-   NULL
-   };
struct attribute_group group = {
.name = AMDGPU_RAS_FS_NAME,
.attrs = attrs,
-#if defined(HAVE_ATTRIBUTE_GROUP_BIN_ATTRS)
-   .bin_attrs = bin_attrs,
-#endif
};
 
sysfs_remove_group(>dev->kobj, );
@@ -1155,6 +1170,9 @@ static int amdgpu_ras_sysfs_remove_all(struct 
amdgpu_device *adev)
amdgpu_ras_sysfs_remove(adev, >head);
}
 
+   if (amdgpu_bad_page_threshold != 0)
+   amdgpu_ras_sysfs_remove_bad_page_node(adev);
+
amdgpu_ras_sysfs_remove_feature_node(adev);
 
return 0;
@@ -1283,6 +1301,9 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 {
amdgpu_ras_sysfs_create_feature_node(adev);
 
+   if (amdgpu_bad_page_threshold != 0)
+   amdgpu_ras_sysfs_add_badpage_node(adev);
+
return 0;
 }
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 03/12] drm/amdgpu: add bad gpu tag definition

2020-07-28 Thread Guchun Chen
This tag will be used for bad gpu detection during eeprom access.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index a2c982b1eac6..35c0c849d49b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -46,6 +46,9 @@
 #define EEPROM_TABLE_HDR_VAL 0x414d4452
 #define EEPROM_TABLE_VER 0x0001
 
+/* Bad GPU tag ‘BADG’ */
+#define EEPROM_TABLE_HDR_BAD 0x42414447
+
 /* Assume 2 Mbit size */
 #define EEPROM_SIZE_BYTES 256000
 #define EEPROM_PAGE__SIZE_BYTES 256
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 12/12] drm/amdgpu: reset eeprom once specifying one bigger threshold

2020-07-28 Thread Guchun Chen
During driver probe, when the bad gpu tag is hit in the eeprom i2c
init call (the tag was set when the reported bad pages reached the
bad page threshold during the driver's last run), there are
several strategies to deal with the cases:

1. when the module parameter amdgpu_bad_page_threshold = 0,
that means page retirement feature is disabled, so just resetting
the eeprom is fine.
2. When amdgpu_bad_page_threshold is not 0 and, moreover, the user
sets a bigger valid value so that the current boot can
succeed, reset the eeprom data and do not break booting.
3. For other cases, driver's probe will be broken.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index be895dc2d739..02933050081b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -248,6 +248,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control 
*control,
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 
0 };
struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
struct i2c_msg msg = {
.addr   = 0,
.flags  = I2C_M_RD,
@@ -287,9 +288,15 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
 
} else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
(amdgpu_bad_page_threshold != 0)) {
-   *exceed_err_limit = true;
-   DRM_ERROR("Exceeding the bad_page_threshold parameter, "
+   if (ras->bad_page_cnt_threshold > control->num_recs) {
+   DRM_INFO("One valid bigger bad page threshold is "
+   "used, reset eeprom.\n");
+   ret = amdgpu_ras_eeprom_reset_table(control);
+   } else {
+   *exceed_err_limit = true;
+   DRM_ERROR("Exceeding the bad_page_threshold parameter, "
"disabling the GPU.\n");
+   }
} else {
DRM_INFO("Creating new EEPROM table");
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 08/12] drm/amdgpu: restore ras flags when user resets eeprom

2020-07-28 Thread Guchun Chen
RAS flags need to be cleared as well when the user requests
a clean eeprom.

v2: RAS flags shall be restored after eeprom reset succeeds.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dcb84f2ca078..2cc09aa67423 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -368,12 +368,19 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
 static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char 
__user *buf,
size_t size, loff_t *pos)
 {
-   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   struct amdgpu_device *adev =
+   (struct amdgpu_device *)file_inode(f)->i_private;
int ret;
 
-   ret = amdgpu_ras_eeprom_reset_table(>psp.ras.ras->eeprom_control);
+   ret = amdgpu_ras_eeprom_reset_table(
+   &(amdgpu_ras_get_context(adev)->eeprom_control));
 
-   return ret == 1 ? size : -EIO;
+   if (ret == 1) {
+   amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
+   return size;
+   } else {
+   return -EIO;
+   }
 }
 
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 07/12] drm/amdgpu: break GPU recovery once it's in bad state

2020-07-28 Thread Guchun Chen
When the GPU executes recovery and retrieves the bad GPU tag
from the external eeprom device, the recovery is aborted
and an error message is printed for the user's awareness.

v2: Refine warning message in threshold reaching case, and
fix spelling typo.

v3: Fix explicit calling of bad gpu.

v4: Rename function names.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 20 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 16 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  2 +
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 40 +++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|  4 ++
 5 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 30af0dfee1a1..c893d9adbab7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4139,8 +4139,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
 
amdgpu_fbdev_set_suspend(tmp_adev, 0);
 
-   /* must succeed. */
-   amdgpu_ras_resume(tmp_adev);
+   /*
+* The GPU enters bad state once faulty pages
+* by ECC has reached the threshold, and ras
+* recovery is scheduled next. So add one check
+* here to break recovery if it indeed exceeds
+* bad page threshold, and remind user to
+* retire this GPU or setting one bigger
+* bad_page_threshold value to fix this once
+* probing driver again.
+*/
+   if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+   /* must succeed. */
+   amdgpu_ras_resume(tmp_adev);
+   } else {
+   r = -EINVAL;
+   goto out;
+   }
 
/* Update PSP FW topology after reset */
if (hive && 
tmp_adev->gmc.xgmi.num_physical_nodes > 1)
@@ -4148,7 +4163,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
}
}
 
-
 out:
if (!r) {
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3c4b9127660d..dcb84f2ca078 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2206,3 +2206,19 @@ bool amdgpu_ras_need_emergency_restart(struct 
amdgpu_device *adev)
 
return false;
 }
+
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool exc_err_limit = false;
+
+   if (con && (amdgpu_bad_page_threshold != 0))
+   amdgpu_ras_eeprom_check_err_threshold(>eeprom_control,
+   _err_limit);
+
+   /*
+* We are only interested in variable exc_err_limit,
+* as it says if GPU is in bad state or not.
+*/
+   return exc_err_limit;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cf9f60202334..70a6fca73617 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
bool is_ce);
 
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
+
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index d24bf65f6dd7..be895dc2d739 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t 
curr_address)
return curr_address;
 }
 
+int amdgpu_ras_eeprom_check_err_threshold(
+   struct amdgpu_ras_eeprom_control *control,
+   bool *exceed_err_limit)
+{
+   struct amdgpu_device *adev = to_amdgpu_device(control);
+   unsigned char buff[EEPROM_ADDRESS_SIZE +
+   EEPROM_TABLE_HEADER_SIZE] = { 0 };
+   struct amdgpu_ras_eeprom_table_header *hdr = >tbl_hdr;
+   struct i2c_msg msg = {

[PATCH 06/12] drm/amdgpu: schedule ras recovery when reaching bad page threshold

2020-07-28 Thread Guchun Chen
Once the bad page saved to eeprom reaches the configured
threshold, ras recovery will be issued to notify user.

v2: Fix spelling typo.

Signed-off-by: Guchun Chen 
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 37 ++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 67995b66d7d4..d24bf65f6dd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
int i, ret = 0;
struct i2c_msg *msgs, *msg;
unsigned char *buffs, *buff;
+   bool sched_ras_recovery = false;
struct eeprom_table_record *record;
struct amdgpu_device *adev = to_amdgpu_device(control);
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS)
return 0;
@@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
goto free_buff;
}
 
+   /*
+* If saved bad pages number exceeds the bad page threshold for
+* the whole VRAM, update table header to mark the BAD GPU tag
+* and schedule one ras recovery after eeprom write is done,
+* this can avoid the missing for latest records.
+*
+* This new header will be picked up and checked in the bootup
+* by ras recovery, which may break bootup process to notify
+* user this GPU is in bad state and to retire such GPU for
+* further check.
+*/
+   if (write && (amdgpu_bad_page_threshold != 0) &&
+   ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
+   dev_warn(adev->dev,
+   "Saved bad pages(%d) reaches threshold value(%d).\n",
+   control->num_recs + num, ras->bad_page_cnt_threshold);
+   control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
+   sched_ras_recovery = true;
+   }
+
/* In case of overflow just start from beginning to not lose newest 
records */
if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > 
EEPROM_SIZE_BYTES))
control->next_addr = EEPROM_RECORD_START;
 
-
/*
 * TODO Currently makes EEPROM writes for each record, this creates
 * internal fragmentation. Optimized the code to do full page write of
@@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
__update_tbl_checksum(control, records, num, old_hdr_byte_sum);
 
__update_table_header(control, buffs);
+
+   if (sched_ras_recovery) {
+   /*
+* Before scheduling ras recovery, assert the related
+* flag first, which shall bypass common bad page
+* reservation execution in amdgpu_ras_reset_gpu.
+*/
+   amdgpu_ras_get_context(adev)->flags |=
+   AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
+
+   dev_warn(adev->dev, "Conduct ras recovery due to bad "
+   "page threshold reached.\n");
+   amdgpu_ras_reset_gpu(adev);
+   }
} else if (!__validate_tbl_checksum(control, records, num)) {
DRM_WARN("EEPROM Table checksum mismatch!");
/* TODO Uncomment when EEPROM read/write is relliable */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 04/12] drm/amdgpu: break driver init process when it's bad GPU

2020-07-28 Thread Guchun Chen
When retrieving bad gpu tag from eeprom, GPU init should
fail as the GPU needs to be retired for further check.

v2: Fix spelling typo, correct the condition to detect
bad gpu tag and refine error message.

v3: Refine function argument name.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 ++-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2662cd7c8685..30af0dfee1a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
 * it should be called after amdgpu_device_ip_hw_init_phase2  since
 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 * for I2C communication which only true at this point.
-* recovery_init may fail, but it can free all resources allocated by
-* itself and its failure should not stop amdgpu init process.
+*
+* amdgpu_ras_recovery_init may fail, but the upper only cares the
+* failure from bad gpu situation and stop amdgpu init process
+* accordingly. For other failed cases, it will still release all
+* the resource and print error message, rather than returning one
+* negative value to upper level.
 *
 * Note: theoretically, this should be called before all vram 
allocations
 * to protect retired page from abusing
 */
-   amdgpu_ras_recovery_init(adev);
+   r = amdgpu_ras_recovery_init(adev);
+   if (r)
+   goto init_failed;
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3c4c142e9d8a..56e1aeba2d64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1822,6 +1822,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0;
+   bool exc_err_limit = false;
int ret;
 
if (con)
@@ -1843,9 +1844,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
-   ret = amdgpu_ras_eeprom_init(>eeprom_control);
-   if (ret)
+   ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit);
+   /*
+* We only fail this calling and halt booting up
+* when exc_err_limit is true.
+*/
+   if (exc_err_limit) {
+   ret = -EINVAL;
goto free;
+   }
 
if (con->eeprom_control.num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
@@ -1868,6 +1875,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 out:
dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
 
+   /*
+* Except error threshold exceeding case, other failure cases in this
+* function would not fail amdgpu driver init.
+*/
+   if (!exc_err_limit)
+   ret = 0;
+
return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 35c0c849d49b..67995b66d7d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct 
amdgpu_ras_eeprom_control *control)
 
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+   bool *exceed_err_limit)
 {
int ret = 0;
struct amdgpu_device *adev = to_amdgpu_device(control);
@@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control 
*control)
.buf= buff,
};
 
+   *exceed_err_limit = false;
+
/* Verify i2c adapter is initialized */
if (!adev->pm.smu_i2c.algo)
return -ENOENT;
@@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control)
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
 control->num_recs);
 
+   } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
+   (amdgpu_bad_page_threshold != 0)) {
+   *exceed_err_limit = true;
+   DRM_ERROR("Exceeding the 

[PATCH 00/12] BAD GPU retirement policy by total bad pages

2020-07-28 Thread Guchun Chen
The series is to enable/disable bad page feature and apply different
bad page reservation strategy by different bad page threshold
configurations.

When the saved bad pages written to eeprom reach the threshold,
one ras recovery will be issued immediately, and the recovery will
fail in order to tell the user that the GPU is BAD and either needs to
be retired for further check, or needs a valid bigger threshold value
set at the next driver probe to skip the corresponding check.

During bootup, similar bad page threshold check is conducted as
well when eeprom get initialized, and it will possibly break boot
up for user's awareness.

When the user sets bad_page_threshold=0 when probing the driver, the bad
page retirement feature is completely disabled, and the driver has no
chance to process bad page information records and write them to eeprom.

Guchun Chen (12):
  drm/amdgpu: add bad page count threshold in module parameter
  drm/amdgpu: validate bad page threshold in ras
  drm/amdgpu: add bad gpu tag definition
  drm/amdgpu: break driver init process when it's bad GPU
  drm/amdgpu: skip bad page reservation once issuing from eeprom write
  drm/amdgpu: schedule ras recovery when reaching bad page threshold
  drm/amdgpu: break GPU recovery once it's in bad state
  drm/amdgpu: restore ras flags when user resets eeprom
  drm/amdgpu: define one macro for RAS's sysfs/debugfs name
  drm/amdgpu: decouple sysfs creating of bad page node
  drm/amdgpu: disable page reservation when amdgpu_bad_page_threshold =
0
  drm/amdgpu: reset eeprom once specifying one bigger threshold

 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  32 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  11 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 186 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  19 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 102 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|   9 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   |   5 +-
 8 files changed, 312 insertions(+), 53 deletions(-)

-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 02/12] drm/amdgpu: validate bad page threshold in ras

2020-07-28 Thread Guchun Chen
The bad page threshold value should be valid in the range between
-1 and the max records length of eeprom. It determines when the
saved bad pages exceed the threshold value, so that the corresponding
actions can be taken.

v2: When using the default typical value, it should be min
value between typical value and eeprom max records length.

v3: drop the case of setting bad_page_cnt_threshold to be
0xFFFFFFFF, as it confuses user.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 48 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  3 ++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|  5 ++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|  2 +
 4 files changed, 58 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6f06e1214622..3c4c142e9d8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -69,6 +69,9 @@ const char *ras_block_string[] = {
 /* inject address is 52 bits */
 #defineRAS_UMC_INJECT_ADDR_LIMIT   (0x1ULL << 52)
 
+/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
+#define RAS_BAD_PAGE_RATE  (100 * 1024 * 1024ULL)
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1700,6 +1703,47 @@ static bool amdgpu_ras_check_bad_page(struct 
amdgpu_device *adev,
return ret;
 }
 
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+   uint32_t max_length)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int tmp_threshold = amdgpu_bad_page_threshold;
+   u64 val;
+
+   /*
+* Justification of value bad_page_cnt_threshold in ras structure
+*
+* Generally, -1 <= amdgpu_bad_page_threshold <= max record length
+* in eeprom, and introduce two scenarios accordingly.
+*
+* Bad page retirement enablement:
+*- If amdgpu_bad_page_threshold = -1,
+*  bad_page_cnt_threshold = typical value by formula.
+*
+*- When the value from user is 0 < amdgpu_bad_page_threshold <
+*  max record length in eeprom, use it directly.
+*
+* Bad page retirement disablement:
+*- If amdgpu_bad_page_threshold = 0, bad page retirement
+*  functionality is disabled, and bad_page_cnt_threshold will
+*  take no effect.
+*/
+
+   if (tmp_threshold < -1)
+   tmp_threshold = -1;
+   else if (tmp_threshold > max_length)
+   tmp_threshold = max_length;
+
+   if (tmp_threshold == -1) {
+   val = adev->gmc.mc_vram_size;
+   do_div(val, RAS_BAD_PAGE_RATE);
+   con->bad_page_cnt_threshold = min(lower_32_bits(val),
+   max_length);
+   } else {
+   con->bad_page_cnt_threshold = tmp_threshold;
+   }
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 {
@@ -1777,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
+   uint32_t max_eeprom_records_len = 0;
int ret;
 
if (con)
@@ -1795,6 +1840,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(>in_recovery, 0);
con->adev = adev;
 
+   max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+   amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
ret = amdgpu_ras_eeprom_init(>eeprom_control);
if (ret)
goto free;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b2667342cf67..4672649a9293 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -336,6 +336,9 @@ struct amdgpu_ras {
struct amdgpu_ras_eeprom_control eeprom_control;
 
bool error_query_ready;
+
+   /* bad page count threshold */
+   uint32_t bad_page_cnt_threshold;
 };
 
 struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index c0096097bbcf..a2c982b1eac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -499,6 +499,11 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
return ret == num ? 0 : -EIO;
 }
 
+inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
+{
+   return EEPROM_MAX_RECORD_NUM;
+}
+
 /* Used for testing if bugs encountered */
 #if 0
 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h 

[PATCH 01/12] drm/amdgpu: add bad page count threshold in module parameter

2020-07-28 Thread Guchun Chen
bad_page_threshold could be configured to enable/disable the
associated bad page retirement feature in RAS.

When it's -1, ras will use typical bad page failure value to
handle bad page retirement.

When it's 0, disable bad page retirement, and no bad page
will be recorded and saved.

For other valid value, driver will use this manual value
as the threshold value of total bad pages.

v2: correct documentation of this parameter.
v3: remove confused statement in documentation.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 06bfb8658dec..bb83ffb5e26a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -181,6 +181,7 @@ extern uint amdgpu_dm_abm_level;
 extern struct amdgpu_mgpu_info mgpu_info;
 extern int amdgpu_ras_enable;
 extern uint amdgpu_ras_mask;
+extern int amdgpu_bad_page_threshold;
 extern int amdgpu_async_gfx_ring;
 extern int amdgpu_mcbp;
 extern int amdgpu_discovery;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d28b95f721c4..820a28c9e957 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -161,6 +161,7 @@ struct amdgpu_mgpu_info mgpu_info = {
 };
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0x;
+int amdgpu_bad_page_threshold = -1;
 
 /**
  * DOC: vramlimit (int)
@@ -801,6 +802,16 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);
 MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = 
legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");
 module_param_named(reset_method, amdgpu_reset_method, int, 0444);
 
+/**
+ * DOC: bad_page_threshold (int)
+ * Bad page threshold is to specify the threshold value of faulty pages
+ * detected by RAS ECC, that may result in GPU entering bad status if total
+ * faulty pages by ECC exceed threshold value and leave it for user's further
+ * check.
+ */
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default 
typical value), 0 = disable bad page retirement)");
+module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
+
 static const struct pci_device_id pciidlist[] = {
 #ifdef  CONFIG_DRM_AMDGPU_SI
{0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/4] drm: retrieve EDID via ACPI _DDC method

2020-07-28 Thread Daniel Dadap
Some notebook computer systems expose the EDID for the internal
panel via the ACPI _DDC method. On some systems this is because
the panel does not populate the hardware DDC lines, and on some
systems with dynamic display muxes, _DDC is implemented to allow
the internal panel's EDID to be read at any time, regardless of
how the mux is switched.

The _DDC method can be implemented for each individual display
output, so there could be an arbitrary number of outputs exposing
their EDIDs via _DDC; however, in practice, this has only been
implemented so far on systems with a single panel, so the current
implementation of drm_get_edid_acpi() walks the outputs listed by
each GPU's ACPI _DOD method and returns the first EDID successfully
retrieved by any attached _DDC method. It may be necessary in the
future to allow for the retrieval of distinct EDIDs for different
output devices, but in order to do so, it will first be necessary
to develop a way to correlate individual DRM outputs with their
corresponding entities in ACPI.

Signed-off-by: Daniel Dadap 
---
 drivers/gpu/drm/drm_edid.c | 161 +
 include/drm/drm_edid.h |   1 +
 2 files changed, 162 insertions(+)

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 116451101426..f66d6bf048c6 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -2058,6 +2059,166 @@ struct edid *drm_get_edid_switcheroo(struct 
drm_connector *connector,
 }
 EXPORT_SYMBOL(drm_get_edid_switcheroo);
 
+#if defined(CONFIG_ACPI) && defined(CONFIG_PCI)
+static u64 *get_dod_entries(acpi_handle handle, int *count)
+{
+   acpi_status status;
+   struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
+   union acpi_object *dod;
+   int i;
+   u64 *ret = NULL;
+
+   *count = 0;
+
+   status = acpi_evaluate_object(handle, "_DOD", NULL, );
+
+   if (ACPI_FAILURE(status))
+   return NULL;
+
+   dod = buf.pointer;
+
+   if (dod == NULL || dod->type != ACPI_TYPE_PACKAGE)
+   goto done;
+
+   ret = kmalloc_array(dod->package.count, sizeof(*ret), GFP_KERNEL);
+   if (ret == NULL)
+   goto done;
+
+   for (i = 0; i < dod->package.count; i++) {
+   if (dod->package.elements[i].type != ACPI_TYPE_INTEGER)
+   continue;
+   ret[*count] = dod->package.elements[i].integer.value;
+   (*count)++;
+   }
+
+done:
+   kfree(buf.pointer);
+   return ret;
+}
+
+static void *do_acpi_ddc(acpi_handle handle)
+{
+   int i;
+   void *ret = NULL;
+
+   /*
+* The _DDC spec defines an integer argument for specifying the size of
+* the EDID to be retrieved. A value of 1 requests a 128-byte EDID, and
+* a value of 2 requests a 256-byte EDID. Attempt the larger read first.
+*/
+   for (i = 2; i >= 1; i--) {
+   struct acpi_buffer out = { ACPI_ALLOCATE_BUFFER, NULL };
+   union acpi_object arg = { ACPI_TYPE_INTEGER };
+   struct acpi_object_list in = { 1,  };
+   union acpi_object *edid;
+   acpi_status status;
+
+   arg.integer.value = i;
+   status = acpi_evaluate_object(handle, "_DDC", , );
+   edid = out.pointer;
+
+   if (ACPI_SUCCESS(status))
+   ret = edid->buffer.pointer;
+
+   kfree(edid);
+
+   if (ret)
+   break;
+   }
+
+   return ret;
+}
+
+static struct edid *first_edid_from_acpi_ddc(struct pci_dev *pdev)
+{
+   acpi_handle handle;
+   acpi_status status;
+   struct acpi_device *device = NULL;
+   struct edid *ret = NULL;
+   int num_dod_entries;
+   u64 *dod_entries = NULL;
+   struct list_head *node, *next;
+
+   handle = ACPI_HANDLE(>dev);
+   if (handle == NULL)
+   return NULL;
+
+   dod_entries = get_dod_entries(handle, _dod_entries);
+   if (dod_entries == NULL || num_dod_entries == 0)
+   goto done;
+
+   status = acpi_bus_get_device(handle, );
+   if (ACPI_FAILURE(status) || device == NULL)
+   goto done;
+
+   list_for_each_safe(node, next, >children) {
+   struct acpi_device *child;
+   u64 adr;
+   int i;
+
+   child = list_entry(node, struct acpi_device, node);
+   if (child == NULL)
+   continue;
+
+   status = acpi_evaluate_integer(child->handle, "_ADR", NULL,
+   );
+   if (ACPI_FAILURE(status))
+   continue;
+
+   for (i = 0; i < num_dod_entries; i++) {
+   if (adr == dod_entries[i]) {
+   ret = do_acpi_ddc(child->handle);
+
+   if 

[PATCH 0/4] drm: add support for retrieving EDID via ACPI _DDC

2020-07-28 Thread Daniel Dadap
Some notebook systems provide the EDID for the internal panel via the
_DDC method in ACPI, instead of or in addition to providing the EDID via
DDC on LVDS/eDP. Add a DRM helper to search for an ACPI _DDC method under
the ACPI namespace for each VGA/3D controller, and return the first EDID
successfully retrieved via _DDC. Update the i915, nouveau, and radeon
DRM-KMS drivers to fall back to retrieving the EDID via ACPI _DDC on
notebook internal display panels after failing to retrieve an EDID via
other means.

This is useful for retrieving an internal panel's EDID both on hybrid
graphics systems with muxed display output, when the display is muxed
away, as well as on a small number of non-muxed and/or non-hybrid
systems where ACPI _DDC is the only means of accessing the EDID for
the internal panel.

Daniel Dadap (4):
  drm: retrieve EDID via ACPI _DDC method
  i915: fall back to ACPI EDID retrieval
  nouveau: fall back to ACPI EDID retrieval
  radeon: fall back to ACPI EDID retrieval

 drivers/gpu/drm/drm_edid.c  | 161 
 drivers/gpu/drm/i915/display/intel_dp.c |   8 +-
 drivers/gpu/drm/i915/display/intel_lvds.c   |   4 +
 drivers/gpu/drm/nouveau/nouveau_connector.c |   6 +
 drivers/gpu/drm/radeon/radeon_combios.c |   6 +-
 include/drm/drm_edid.h  |   1 +
 6 files changed, 182 insertions(+), 4 deletions(-)

-- 
2.18.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/4] i915: fall back to ACPI EDID retrieval

2020-07-28 Thread Daniel Dadap
Fall back to retrieving the EDID via the ACPI _DDC method, when present
for notebook internal panels, when EDID retrieval via the standard EDID
paths is unsuccessful.

Signed-off-by: Daniel Dadap 
---
 drivers/gpu/drm/i915/display/intel_dp.c   | 8 +++-
 drivers/gpu/drm/i915/display/intel_lvds.c | 4 
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/display/intel_dp.c 
b/drivers/gpu/drm/i915/display/intel_dp.c
index 804b1d966f66..ff402cef8183 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -5657,6 +5657,7 @@ static struct edid *
 intel_dp_get_edid(struct intel_dp *intel_dp)
 {
struct intel_connector *intel_connector = intel_dp->attached_connector;
+   struct edid *edid;
 
/* use cached edid if we have one */
if (intel_connector->edid) {
@@ -5666,8 +5667,13 @@ intel_dp_get_edid(struct intel_dp *intel_dp)
 
return drm_edid_duplicate(intel_connector->edid);
} else
-   return drm_get_edid(_connector->base,
+   edid = drm_get_edid(_connector->base,
_dp->aux.ddc);
+
+   if (!edid && intel_dp_is_edp(intel_dp))
+   edid = drm_get_edid_acpi();
+
+   return edid;
 }
 
 static void
diff --git a/drivers/gpu/drm/i915/display/intel_lvds.c 
b/drivers/gpu/drm/i915/display/intel_lvds.c
index 9a067effcfa0..811eea3f5d9f 100644
--- a/drivers/gpu/drm/i915/display/intel_lvds.c
+++ b/drivers/gpu/drm/i915/display/intel_lvds.c
@@ -946,6 +946,10 @@ void intel_lvds_init(struct drm_i915_private *dev_priv)
else
edid = drm_get_edid(connector,
intel_gmbus_get_adapter(dev_priv, pin));
+
+   if (!edid)
+   edid = drm_get_edid_acpi();
+
if (edid) {
if (drm_add_edid_modes(connector, edid)) {
drm_connector_update_edid_property(connector,
-- 
2.18.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 4/4] radeon: fall back to ACPI EDID retrieval

2020-07-28 Thread Daniel Dadap
Fall back to retrieving the EDID via the ACPI _DDC method, when present
for notebook internal panels, when retrieving BIOS-embedded EDIDs.

Signed-off-by: Daniel Dadap 
---
 drivers/gpu/drm/radeon/radeon_combios.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_combios.c 
b/drivers/gpu/drm/radeon/radeon_combios.c
index c3e49c973812..de801d9fca54 100644
--- a/drivers/gpu/drm/radeon/radeon_combios.c
+++ b/drivers/gpu/drm/radeon/radeon_combios.c
@@ -401,9 +401,8 @@ bool radeon_combios_check_hardcoded_edid(struct 
radeon_device *rdev)
 struct edid *
 radeon_bios_get_hardcoded_edid(struct radeon_device *rdev)
 {
-   struct edid *edid;
-
if (rdev->mode_info.bios_hardcoded_edid) {
+   struct edid *edid;
edid = kmalloc(rdev->mode_info.bios_hardcoded_edid_size, 
GFP_KERNEL);
if (edid) {
memcpy((unsigned char *)edid,
@@ -412,7 +411,8 @@ radeon_bios_get_hardcoded_edid(struct radeon_device *rdev)
return edid;
}
}
-   return NULL;
+
+   return drm_get_edid_acpi();
 }
 
 static struct radeon_i2c_bus_rec combios_setup_i2c_bus(struct radeon_device 
*rdev,
-- 
2.18.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 3/4] nouveau: fall back to ACPI EDID retrieval

2020-07-28 Thread Daniel Dadap
Fall back to retrieving the EDID via the ACPI _DDC method, when present
for notebook internal panels, when EDID retrieval via the standard EDID
paths is unsuccessful.

Signed-off-by: Daniel Dadap 
---
 drivers/gpu/drm/nouveau/nouveau_connector.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c 
b/drivers/gpu/drm/nouveau/nouveau_connector.c
index 9a9a7f5003d3..95836a02a06b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_connector.c
+++ b/drivers/gpu/drm/nouveau/nouveau_connector.c
@@ -581,6 +581,12 @@ nouveau_connector_detect(struct drm_connector *connector, 
bool force)
else
nv_connector->edid = drm_get_edid(connector, i2c);
 
+   if (!nv_connector->edid &&
+   (nv_connector->type == DCB_CONNECTOR_LVDS ||
+   nv_connector->type == DCB_CONNECTOR_eDP)) {
+   nv_connector->edid = drm_get_edid_acpi();
+   }
+
drm_connector_update_edid_property(connector,
nv_connector->edid);
if (!nv_connector->edid) {
-- 
2.18.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit

2020-07-28 Thread Chen, Guchun
[AMD Public Use]

Hi Tianci,

My point is that, since the new patch introduces a local adev variable, 
the other places in the same function where smu->adev is used should be replaced 
by the new local adev as well.
Otherwise, it does not look ideal from a coding-style perspective.

Regards,
Guchun

-Original Message-
From: Yin, Tianci (Rico)  
Sent: Tuesday, July 28, 2020 2:48 PM
To: Chen, Guchun ; amd-gfx@lists.freedesktop.org
Cc: Xu, Feifei ; Tuikov, Luben ; 
Hesik, Christopher ; Deucher, Alexander 
; Swamy, Manjunatha ; 
Quan, Evan ; Feng, Kenneth ; Zhang, 
Hawking 
Subject: RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

Hi Guchun,

Since the adev variable is used several times, a local adev makes the code more 
concise.

Thanks!
Rico

-Original Message-
From: Chen, Guchun  
Sent: Tuesday, July 28, 2020 2:26 PM
To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org
Cc: Xu, Feifei ; Yin, Tianci (Rico) ; 
Tuikov, Luben ; Hesik, Christopher 
; Deucher, Alexander ; 
Swamy, Manjunatha ; Quan, Evan ; 
Feng, Kenneth ; Zhang, Hawking 
Subject: RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

[AMD Public Use]

One minor comment.

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Tianci Yin
Sent: Tuesday, July 28, 2020 1:27 PM
To: amd-gfx@lists.freedesktop.org
Cc: Xu, Feifei ; Yin, Tianci (Rico) ; 
Tuikov, Luben ; Hesik, Christopher 
; Deucher, Alexander ; 
Swamy, Manjunatha ; Quan, Evan ; 
Feng, Kenneth ; Zhang, Hawking 
Subject: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

From: "Tianci.Yin" 

On Navi1x, the SPM golden settings will be lost after GFXOFF enter/exit, 
reconfigure the golden settings after GFXOFF exit.

Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 55463e7a11e2..5da0436d41e0 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle,
 
struct smu_context *smu = (struct smu_context*)(handle);
struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm);
+   struct amdgpu_device *adev = smu->adev;

if (!smu->is_apu && !smu_dpm_ctx->dpm_context)
return -EINVAL;
@@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle,
amdgpu_device_ip_set_clockgating_state(smu->adev,
[Guchun]Use the local adev instead of smu->adev?

   
AMD_IP_BLOCK_TYPE_GFX,
   
AMD_CG_STATE_UNGATE);

+
+   if (adev->asic_type >= CHIP_NAVI10 &&
+   adev->asic_type <= CHIP_NAVI12 &&
+   (adev->pm.pp_feature & PP_GFXOFF_MASK)) {
+   if (adev->gfx.funcs->init_spm_golden) {
+   dev_dbg(adev->dev,"GFXOFF exited, 
re-init SPM golden settings\n");
+   amdgpu_gfx_init_spm_golden(adev);
+   } else
+   dev_warn(adev->dev,"Callback 
init_spm_golden is NULL\n");
+   }
}
} else {
/* exit umd pstate, restore level, enable gfx cg*/
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cguchun.chen%40amd.com%7Ce6176c766fe747a6fe1a08d832b6ee3c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637315108573876551sdata=43GHkX%2FCRLsfMmNxurLMIZy4l4ElUB2tnslPyfT7NJg%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu/dc: Stop dma_resv_lock inversion in commit_tail

2020-07-28 Thread Christian König

Am 27.07.20 um 23:30 schrieb Daniel Vetter:

Trying to grab dma_resv_lock while in commit_tail before we've done
all the code that leads to the eventual signalling of the vblank event
(which can be a dma_fence) is deadlock-y. Don't do that.

Here the solution is easy because just grabbing locks to read
something races anyway. We don't need to bother, READ_ONCE is
equivalent. And avoids the locking issue.

v2: Also take into account tmz_surface boolean, plus just delete the
old code.

Cc: linux-me...@vger.kernel.org
Cc: linaro-mm-...@lists.linaro.org
Cc: linux-r...@vger.kernel.org
Cc: amd-gfx@lists.freedesktop.org
Cc: intel-...@lists.freedesktop.org
Cc: Chris Wilson 
Cc: Maarten Lankhorst 
Cc: Christian König 
Signed-off-by: Daniel Vetter 
---
DC-folks, I think this split out patch from my series here

https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F20200707201229.472834-1-daniel.vetter%40ffwll.ch%2Fdata=02%7C01%7Cchristian.koenig%40amd.com%7C8a4f5736682a4b5c943e08d832747ab1%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637314823145521840sdata=qd7Nrox62Lr%2FXWbJJFVskg9RYL4%2FoRVCFjR6rUDMA5E%3Dreserved=0

should be ready for review/merging. I fixed it up a bit so that it's not
just a gross hack :-)

Cheers, Daniel


---
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 19 ++-
  1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 21ec64fe5527..a20b62b1f2ef 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6959,20 +6959,13 @@ static void amdgpu_dm_commit_planes(struct 
drm_atomic_state *state,
DRM_ERROR("Waiting for fences timed out!");
  
  		/*

-* TODO This might fail and hence better not used, wait
-* explicitly on fences instead
-* and in general should be called for
-* blocking commit to as per framework helpers
+* We cannot reserve buffers here, which means the normal flag
+* access functions don't work. Paper over this with READ_ONCE,
+* but maybe the flags are invariant enough that not even that
+* would be needed.
 */
-   r = amdgpu_bo_reserve(abo, true);
-   if (unlikely(r != 0))
-   DRM_ERROR("failed to reserve buffer before flip\n");
-
-   amdgpu_bo_get_tiling_flags(abo, _flags);
-
-   tmz_surface = amdgpu_bo_encrypted(abo);
-
-   amdgpu_bo_unreserve(abo);
+   tiling_flags = READ_ONCE(abo->tiling_flags);
+   tmz_surface = READ_ONCE(abo->flags) & 
AMDGPU_GEM_CREATE_ENCRYPTED;


Yeah, the abo->flags are mostly fixed after creation; in particular, the 
encrypted flag can't change or we would corrupt the page tables. So that 
should work fine.


Anybody who picks this up, feel free to add a Reviewed-by: Christian 
König .


Regards,
Christian.

  
  		fill_dc_plane_info_and_addr(

dm->adev, new_plane_state, tiling_flags,


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v2)

2020-07-28 Thread Liu, Monk
[AMD Official Use Only - Internal Distribution Only]

Thanks Felix

I reworked my patch following your suggestion, and the queues are now spread 
evenly across pipes, e.g. with: modprobe amdgpu num_kcq=6

[  409.878557] amdgpu :00:07.0: amdgpu: ring comp_1.0.0 uses VM inv eng 1 
on hub 0
[  409.878559] amdgpu :00:07.0: amdgpu: ring comp_1.1.0 uses VM inv eng 4 
on hub 0
[  409.878561] amdgpu :00:07.0: amdgpu: ring comp_1.2.0 uses VM inv eng 5 
on hub 0
[  409.878563] amdgpu :00:07.0: amdgpu: ring comp_1.3.0 uses VM inv eng 6 
on hub 0
[  409.878565] amdgpu :00:07.0: amdgpu: ring comp_1.0.1 uses VM inv eng 7 
on hub 0
[  409.878567] amdgpu :00:07.0: amdgpu: ring comp_1.1.1 uses VM inv eng 8 
on hub 0
[  409.878568] amdgpu :00:07.0: amdgpu: ring kiq_2.1.0 uses VM inv eng 9 on 
hub 0

Please review my upcoming patch.

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Kuehling, Felix 
Sent: Tuesday, July 28, 2020 7:33 AM
To: amd-gfx@lists.freedesktop.org; Liu, Monk 
Subject: Re: [PATCH] drm/amdgpu: introduce a new parameter to configure how 
many KCQ we want(v2)

Am 2020-07-27 um 6:47 a.m. schrieb Monk Liu:
> what:
> the MQD save and restore of kernel compute queues costs many clock
> cycles during world switch, which significantly hurts multi-VF performance
>
> how:
> introduce a parameter to control the number of kernel compute queues to
> avoid a performance drop if no kernel compute queue is needed
>
> notes:
> this parameter only affects gfx 8/9/10
>
> TODO:
> in the future we will let the hypervisor driver set this parameter
> automatically, so there is no need for the user to configure it through
> modprobe in the virtual machine
>
> Signed-off-by: Monk Liu 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  4 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 27 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 30 +++--
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 29 ++--
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 31 
> +++---
>  7 files changed, 71 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e97c088..71a3d6a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -201,6 +201,7 @@ extern int amdgpu_si_support;  #ifdef
> CONFIG_DRM_AMDGPU_CIK  extern int amdgpu_cik_support;  #endif
> +extern int amdgpu_num_kcq_user_set;
>
>  #define AMDGPU_VM_MAX_NUM_CTX4096
>  #define AMDGPU_SG_THRESHOLD(256*1024*1024)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 62ecac9..18b93ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct
> amdgpu_device *adev)
>
>  amdgpu_gmc_tmz_set(adev);
>
> +if (amdgpu_num_kcq_user_set > 8 || amdgpu_num_kcq_user_set < 0) {
> +amdgpu_num_kcq_user_set = 8;
> +dev_warn(adev-dev, "set KCQ number to 8 due to invalid paramter provided by 
> user\n");
> +}
> +
>  return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 6291f5f..03a94e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -150,6 +150,7 @@ int amdgpu_noretry;  int amdgpu_force_asic_type =
> -1;  int amdgpu_tmz = 0;  int amdgpu_reset_method = -1; /* auto */
> +int amdgpu_num_kcq_user_set = 8;
>
>  struct amdgpu_mgpu_info mgpu_info = {
>  .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
> @@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);
> MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default),
> 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");
> module_param_named(reset_method, amdgpu_reset_method, int, 0444);
>
> +MODULE_PARM_DESC(num_kcq, "number of KCQ user want to setup (8 if set
> +to greater than 8 or less than 0, only affect gfx 8+)");
> +module_param_named(num_kcq, amdgpu_num_kcq_user_set, int, 0444);
> +
>  static const struct pci_device_id pciidlist[] = {  #ifdef
> CONFIG_DRM_AMDGPU_SI
>  {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 8eff017..0b59049 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -202,7 +202,7 @@ bool
> amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
>
>  void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)  {
> -int i, queue, pipe, mec;
> +int i, queue, pipe, mec, j = 0;
>  bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev);
>
>  /* 

Re: [PATCH 4/4] radeon: fall back to ACPI EDID retrieval

2020-07-28 Thread Christian König

Am 27.07.20 um 22:53 schrieb Daniel Dadap:

Fall back to retrieving the EDID via the ACPI _DDC method, when present
for notebook internal panels, when retrieving BIOS-embedded EDIDs.

Signed-off-by: Daniel Dadap 
---
  drivers/gpu/drm/radeon/radeon_combios.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_combios.c 
b/drivers/gpu/drm/radeon/radeon_combios.c
index c3e49c973812..de801d9fca54 100644
--- a/drivers/gpu/drm/radeon/radeon_combios.c
+++ b/drivers/gpu/drm/radeon/radeon_combios.c
@@ -401,9 +401,8 @@ bool radeon_combios_check_hardcoded_edid(struct 
radeon_device *rdev)
  struct edid *
  radeon_bios_get_hardcoded_edid(struct radeon_device *rdev)
  {
-   struct edid *edid;
-
if (rdev->mode_info.bios_hardcoded_edid) {
+   struct edid *edid;


That's an unrelated and incorrect style change. You need a blank line 
after the declaration.



edid = kmalloc(rdev->mode_info.bios_hardcoded_edid_size, 
GFP_KERNEL);
if (edid) {
memcpy((unsigned char *)edid,
@@ -412,7 +411,8 @@ radeon_bios_get_hardcoded_edid(struct radeon_device *rdev)
return edid;
}
}
-   return NULL;
+
+   return drm_get_edid_acpi();


In general a good idea, but I'm wondering if we should really do this so 
unconditionally here.


Regards,
Christian.


  }
  
  static struct radeon_i2c_bus_rec combios_setup_i2c_bus(struct radeon_device *rdev,


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit

2020-07-28 Thread Yin, Tianci (Rico)
Hi Guchun,

Since the adev variable is used several times, a local adev makes the code more 
concise.

Thanks!
Rico

-Original Message-
From: Chen, Guchun  
Sent: Tuesday, July 28, 2020 2:26 PM
To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org
Cc: Xu, Feifei ; Yin, Tianci (Rico) ; 
Tuikov, Luben ; Hesik, Christopher 
; Deucher, Alexander ; 
Swamy, Manjunatha ; Quan, Evan ; 
Feng, Kenneth ; Zhang, Hawking 
Subject: RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

[AMD Public Use]

One minor comment.

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Tianci Yin
Sent: Tuesday, July 28, 2020 1:27 PM
To: amd-gfx@lists.freedesktop.org
Cc: Xu, Feifei ; Yin, Tianci (Rico) ; 
Tuikov, Luben ; Hesik, Christopher 
; Deucher, Alexander ; 
Swamy, Manjunatha ; Quan, Evan ; 
Feng, Kenneth ; Zhang, Hawking 
Subject: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

From: "Tianci.Yin" 

On Navi1x, the SPM golden settings are lost after GFXOFF enter/exit, so 
reconfigure the golden settings after GFXOFF exit.

Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 55463e7a11e2..5da0436d41e0 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle,
 
struct smu_context *smu = (struct smu_context*)(handle);
struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm);
+   struct amdgpu_device *adev = smu->adev;

if (!smu->is_apu && !smu_dpm_ctx->dpm_context)
return -EINVAL;
@@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle,
amdgpu_device_ip_set_clockgating_state(smu->adev,
[Guchun]Use the local adev instead of smu->adev?

   
AMD_IP_BLOCK_TYPE_GFX,
   
AMD_CG_STATE_UNGATE);

+
+   if (adev->asic_type >= CHIP_NAVI10 &&
+   adev->asic_type <= CHIP_NAVI12 &&
+   (adev->pm.pp_feature & PP_GFXOFF_MASK)) {
+   if (adev->gfx.funcs->init_spm_golden) {
+   dev_dbg(adev->dev,"GFXOFF exited, 
re-init SPM golden settings\n");
+   amdgpu_gfx_init_spm_golden(adev);
+   } else
+   dev_warn(adev->dev,"Callback 
init_spm_golden is NULL\n");
+   }
}
} else {
/* exit umd pstate, restore level, enable gfx cg*/
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cguchun.chen%40amd.com%7Ce6176c766fe747a6fe1a08d832b6ee3c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637315108573876551sdata=43GHkX%2FCRLsfMmNxurLMIZy4l4ElUB2tnslPyfT7NJg%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit

2020-07-28 Thread Chen, Guchun
[AMD Public Use]

One minor comment.

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Tianci Yin
Sent: Tuesday, July 28, 2020 1:27 PM
To: amd-gfx@lists.freedesktop.org
Cc: Xu, Feifei ; Yin, Tianci (Rico) ; 
Tuikov, Luben ; Hesik, Christopher 
; Deucher, Alexander ; 
Swamy, Manjunatha ; Quan, Evan ; 
Feng, Kenneth ; Zhang, Hawking 
Subject: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

From: "Tianci.Yin" 

On Navi1x, the SPM golden settings are lost after GFXOFF enter/exit, so 
reconfigure the golden settings after GFXOFF exit.

Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 55463e7a11e2..5da0436d41e0 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle,
 
struct smu_context *smu = (struct smu_context*)(handle);
struct smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm);
+   struct amdgpu_device *adev = smu->adev;

if (!smu->is_apu && !smu_dpm_ctx->dpm_context)
return -EINVAL;
@@ -1324,6 +1325,16 @@ static int smu_enable_umd_pstate(void *handle,
amdgpu_device_ip_set_clockgating_state(smu->adev,
[Guchun]Use the local adev instead of smu->adev?

   
AMD_IP_BLOCK_TYPE_GFX,
   
AMD_CG_STATE_UNGATE);

+
+   if (adev->asic_type >= CHIP_NAVI10 &&
+   adev->asic_type <= CHIP_NAVI12 &&
+   (adev->pm.pp_feature & PP_GFXOFF_MASK)) {
+   if (adev->gfx.funcs->init_spm_golden) {
+   dev_dbg(adev->dev,"GFXOFF exited, 
re-init SPM golden settings\n");
+   amdgpu_gfx_init_spm_golden(adev);
+   } else
+   dev_warn(adev->dev,"Callback 
init_spm_golden is NULL\n");
+   }
}
} else {
/* exit umd pstate, restore level, enable gfx cg*/
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cguchun.chen%40amd.com%7Ce6176c766fe747a6fe1a08d832b6ee3c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637315108573876551sdata=43GHkX%2FCRLsfMmNxurLMIZy4l4ElUB2tnslPyfT7NJg%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x after GFXOFF exit

2020-07-28 Thread Yin, Tianci (Rico)
[AMD Official Use Only - Internal Distribution Only]

Thanks Feifei!

Rico

-Original Message-
From: Xu, Feifei 
Sent: Tuesday, July 28, 2020 2:21 PM
To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org
Cc: Tuikov, Luben ; Deucher, Alexander 
; Zhang, Hawking ; Hesik, 
Christopher ; Swamy, Manjunatha 
; Quan, Evan ; Feng, Kenneth 
; Yin, Tianci (Rico) 
Subject: RE: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

[AMD Official Use Only - Internal Distribution Only]

Series is Reviewed-by: Feifei Xu 

-Original Message-
From: Tianci Yin 
Sent: Tuesday, July 28, 2020 1:27 PM
To: amd-gfx@lists.freedesktop.org
Cc: Tuikov, Luben ; Deucher, Alexander 
; Zhang, Hawking ; Xu, Feifei 
; Hesik, Christopher ; Swamy, 
Manjunatha ; Quan, Evan ; Feng, 
Kenneth ; Yin, Tianci (Rico) 
Subject: [PATCH 2/2] drm/amdgpu: reconfigure spm golden settings on Navi1x 
after GFXOFF exit

From: "Tianci.Yin" 

On Navi1x, the SPM golden settings are lost after GFXOFF enter/exit, so 
reconfigure the golden settings after GFXOFF exit.

Change-Id: I9358ba9c65f241c36f8a35916170b19535148ee9
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 55463e7a11e2..5da0436d41e0 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1309,6 +1309,7 @@ static int smu_enable_umd_pstate(void *handle,

 struct smu_context *smu = (struct smu_context*)(handle);  struct 
smu_dpm_context *smu_dpm_ctx = &(smu->smu_dpm);
+struct amdgpu_device *adev = smu->adev;

 if (!smu->is_apu && !smu_dpm_ctx->dpm_context)  return -EINVAL; @@ -1324,6 
+1325,16 @@ static int smu_enable_umd_pstate(void *handle,  
amdgpu_device_ip_set_clockgating_state(smu->adev,
AMD_IP_BLOCK_TYPE_GFX,
AMD_CG_STATE_UNGATE);
+
+if (adev->asic_type >= CHIP_NAVI10 &&
+adev->asic_type <= CHIP_NAVI12 &&
+(adev->pm.pp_feature & PP_GFXOFF_MASK)) { if
+(adev->gfx.funcs->init_spm_golden) { dev_dbg(adev->dev,"GFXOFF exited,
+re-init SPM golden settings\n"); amdgpu_gfx_init_spm_golden(adev); }
+else dev_warn(adev->dev,"Callback init_spm_golden is NULL\n"); }
 }
 } else {
 /* exit umd pstate, restore level, enable gfx cg*/
--
2.17.1


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx