[AMD Public Use]

amd-gfx-bounces distribution list is not correct. Simply correct to amd-gfx DL 
in mail TO line.

Regards,
Guchun

-----Original Message-----
From: Yang, Stanley <[email protected]> 
Sent: Tuesday, August 25, 2020 9:53 AM
To: Chen, Guchun <[email protected]>; [email protected]
Cc: Zhang, Hawking <[email protected]>; Tye, Tony <[email protected]>; 
Kuehling, Felix <[email protected]>; Li, Dennis <[email protected]>
Subject: RE: [PATCH V3 1/1] drm/amdkfd: fix set kfd node ras properties value

[AMD Public Use]

Hi Guchun,

Thanks, I will fix typos and push it to branch.

Regards,
Stanley
> -----Original Message-----
> From: Chen, Guchun <[email protected]>
> Sent: Monday, August 24, 2020 9:48 PM
> To: Yang, Stanley <[email protected]>; amd-gfx- 
> [email protected]
> Cc: Zhang, Hawking <[email protected]>; Tye, Tony 
> <[email protected]>; Kuehling, Felix <[email protected]>; Li, 
> Dennis <[email protected]>; Yang, Stanley <[email protected]>
> Subject: RE: [PATCH V3 1/1] drm/amdkfd: fix set kfd node ras 
> properties value
> 
> [AMD Public Use]
> 
> Three comments inline.
> 
> With these comments fixed, the patch is:
> Reviewed-by: Guchun Chen <[email protected]>
> 
> Regards,
> Guchun
> 
> -----Original Message-----
> From: Stanley.Yang <[email protected]>
> Sent: Monday, August 24, 2020 6:34 PM
> To: [email protected]
> Cc: Zhang, Hawking <[email protected]>; Tye, Tony 
> <[email protected]>; Kuehling, Felix <[email protected]>; Chen, 
> Guchun <[email protected]>; Li, Dennis <[email protected]>; Yang, 
> Stanley <[email protected]>
> Subject: [PATCH V3 1/1] drm/amdkfd: fix set kfd node ras properties 
> value
> 
> The ctx->features are new RAS implementation which is only available 
> for
> Vega20 and onwards, it is not available for vega10, vega10 should 
> follow legacy ECC implementation.
> 
> Changed from V1:
>     wrap function to initialize kfd node properties
> 
> Changed from V2:
>     remove wrap funcion, remove SRMA ECC check
> 
> [Guchun]Spelling typos, not funcion or SRMA. Moreover, it should be 
> more concise by 'remove wrap function and SDMA SRAM ECC check'
> 
> Change-Id: I1e3ff899bf066611fe5775e67104ce2e0bf8b7d0
> Signed-off-by: Stanley.Yang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 16 ++++++++-------
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 24 
> +++++++++++------------
>  3 files changed, 21 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1f9d97f61aa5..573e2712df35 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -986,6 +986,7 @@ struct amdgpu_device {
> 
>       atomic_t                        throttling_logging_enabled;
>       struct ratelimit_state          throttling_logging_rs;
> +     uint32_t                        ras_features;
>  };
> 
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct 
> ttm_bo_device *bdev) diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index cd1403f83dcf..d462244863f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1974,7 +1974,8 @@ static void amdgpu_ras_check_supported(struct 
> amdgpu_device *adev,
>       *supported = 0;
> 
>       if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
> -         (adev->asic_type != CHIP_VEGA20   &&
> +         (adev->asic_type != CHIP_VEGA10 &&
> +          adev->asic_type != CHIP_VEGA20 &&
>            adev->asic_type != CHIP_ARCTURUS &&
>            adev->asic_type != CHIP_SIENNA_CICHLID)) [Guchun]I suggest 
> moving all ASIC check into one static function. This will benefit 
> later modification if user adds more ASICs to support RAS, and it's 
> indeed more readable.
> 
>               return;
> @@ -1998,6 +1999,7 @@ static void amdgpu_ras_check_supported(struct 
> amdgpu_device *adev,
> 
>       *supported = amdgpu_ras_enable == 0 ?
>                       0 : *hw_supported & amdgpu_ras_mask;
> +     adev->ras_features = *supported;
>  }
> 
>  int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2020,9 +2022,9 @@ 
> int amdgpu_ras_init(struct amdgpu_device *adev)
> 
>       amdgpu_ras_check_supported(adev, &con->hw_supported,
>                       &con->supported);
> -     if (!con->hw_supported) {
> +     if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
>               r = 0;
> -             goto err_out;
> +             goto release_con;
>       }
> 
>       con->features = 0;
> @@ -2033,25 +2035,25 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
>       if (adev->nbio.funcs->init_ras_controller_interrupt) {
>               r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
>               if (r)
> -                     goto err_out;
> +                     goto release_con;
>       }
> 
>       if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
>               r = adev->nbio.funcs-
> >init_ras_err_event_athub_interrupt(adev);
>               if (r)
> -                     goto err_out;
> +                     goto release_con;
>       }
> 
>       if (amdgpu_ras_fs_init(adev)) {
>               r = -EINVAL;
> -             goto err_out;
> +             goto release_con;
>       }
> 
>       dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>                       "hardware ability[%x] ras_mask[%x]\n",
>                       con->hw_supported, con->supported);
>       return 0;
> -err_out:
> +release_con:
>       amdgpu_ras_set_context(adev, NULL);
>       kfree(con);
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index f185f6cbc05c..0ba960a17ead 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1239,7 +1239,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>       void *crat_image = NULL;
>       size_t image_size = 0;
>       int proximity_domain;
> -     struct amdgpu_ras *ctx;
> +     struct amdgpu_device *adev;
> 
>       INIT_LIST_HEAD(&temp_topology_device_list);
> 
> @@ -1404,19 +1404,17 @@ int kfd_topology_add_device(struct kfd_dev
> *gpu)
>               dev->node_props.max_waves_per_simd = 10;
>       }
> 
> -     ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu-
> >kgd));
> -     if (ctx) {
> -             /* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on
> UMC */
> -             dev->node_props.capability |=
> -                     (((ctx->features &
> BIT(AMDGPU_RAS_BLOCK__SDMA)) != 0) ||
> -                      ((ctx->features &
> BIT(AMDGPU_RAS_BLOCK__GFX)) != 0)) ?
> -                     HSA_CAP_SRAM_EDCSUPPORTED : 0;
> -             dev->node_props.capability |= ((ctx->features &
> BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
> -                     HSA_CAP_MEM_EDCSUPPORTED : 0;
> -
> -             dev->node_props.capability |= (ctx->features != 0) ?
> +     adev = (struct amdgpu_device *)(dev->gpu->kgd);
> +     /* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC
> */
> [Guchun]Modify comment to drop SDMA.
> 
> +     dev->node_props.capability |=
> +             ((adev->ras_features &
> BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
> +             HSA_CAP_SRAM_EDCSUPPORTED : 0;
> +     dev->node_props.capability |= ((adev->ras_features &
> BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
> +             HSA_CAP_MEM_EDCSUPPORTED : 0;
> +
> +     if (adev->asic_type != CHIP_VEGA10)
> +             dev->node_props.capability |= (adev->ras_features != 0) ?
>                       HSA_CAP_RASEVENTNOTIFY : 0;
> -     }
> 
>       kfd_debug_print_topology();
> 
> --
> 2.25.1
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to