RE: [PATCH 1/2] drm/amdgpu/mst: Stop ignoring error codes and deadlocking
[Public] > -Original Message- > From: Lyude Paul > Sent: Tuesday, November 15, 2022 5:55 AM > To: Lin, Wayne ; amd-...@lists.freedesktop.org > Cc: Wentland, Harry ; sta...@vger.kernel.org; > Li, Sun peng (Leo) ; Siqueira, Rodrigo > ; Deucher, Alexander > ; Koenig, Christian > ; Pan, Xinhui ; David > Airlie ; Daniel Vetter ; Kazlauskas, > Nicholas ; Pillai, Aurabindo > ; Li, Roman ; Zuo, Jerry > ; Wu, Hersen ; Thomas > Zimmermann ; Mahfooz, Hamza > ; Hung, Alex ; Francis, > David ; Mikita Lipski ; Liu, > Wenjing ; open list:DRM DRIVERS de...@lists.freedesktop.org>; open list > Subject: Re: [PATCH 1/2] drm/amdgpu/mst: Stop ignoring error codes and > deadlocking > > On Wed, 2022-11-09 at 09:48 +, Lin, Wayne wrote: > > > } > > > - if (!drm_dp_mst_atomic_check(state) && !debugfs_overwrite) { > > > + ret = drm_dp_mst_atomic_check(state); > > > + if (ret == 0 && !debugfs_overwrite) { > > > set_dsc_configs_from_fairness_vars(params, vars, count, > > > k); > > > - return true; > > > + return 0; > > > + } else if (ret == -EDEADLK) { > > > + return ret; > > > > I think we should return here whenever there is an error. Not just for > > EDEADLK case. > > Are we sure about this one? I think we may actually want to make this so it > returns on ret != -ENOSPC, since we want the function to continue if there's > no space in the atomic state available so it can try recomputing things with > compression enabled. On ret == 0 it should return early without doing > compression, and on ret == -ENOSPC it should just continue the function > from there > Oh, right.. Thanks for saving me from causing disaster : ) > -- > Cheers, > Lyude Paul (she/her) > Software Engineer at Red Hat -- Regards, Wayne
Re: [PATCH 1/2] drm/amdgpu/mst: Stop ignoring error codes and deadlocking
On Wed, 2022-11-09 at 09:48 +, Lin, Wayne wrote: > > } > > - if (!drm_dp_mst_atomic_check(state) && !debugfs_overwrite) { > > + ret = drm_dp_mst_atomic_check(state); > > + if (ret == 0 && !debugfs_overwrite) { > > set_dsc_configs_from_fairness_vars(params, vars, count, > > k); > > - return true; > > + return 0; > > + } else if (ret == -EDEADLK) { > > + return ret; > > I think we should return here whenever there is an error. Not just for > EDEADLK case. Are we sure about this one? I think we may actually want to make this so it returns on ret != -ENOSPC, since we want the function to continue if there's no space in the atomic state available so it can try recomputing things with compression enabled. On ret == 0 it should return early without doing compression, and on ret == -ENOSPC it should just continue the function from there -- Cheers, Lyude Paul (she/her) Software Engineer at Red Hat
Re: [PATCH 1/2] drm/amdgpu/mst: Stop ignoring error codes and deadlocking
On Wed, 2022-11-09 at 09:48 +, Lin, Wayne wrote: > [Public] > > Thanks, Lyude! > Comments inline. > > > -Original Message- > > From: Lyude Paul > > Sent: Saturday, November 5, 2022 7:59 AM > > To: amd-...@lists.freedesktop.org > > Cc: Wentland, Harry ; sta...@vger.kernel.org; > > Li, Sun peng (Leo) ; Siqueira, Rodrigo > > ; Deucher, Alexander > > ; Koenig, Christian > > ; Pan, Xinhui ; David > > Airlie ; Daniel Vetter ; Kazlauskas, > > Nicholas ; Pillai, Aurabindo > > ; Li, Roman ; Zuo, Jerry > > ; Wu, Hersen ; Lin, Wayne > > ; Thomas Zimmermann ; > > Mahfooz, Hamza ; Hung, Alex > > ; Francis, David ; Mikita > > Lipski ; Liu, Wenjing ; > > open list:DRM DRIVERS ; open list > ker...@vger.kernel.org> > > Subject: [PATCH 1/2] drm/amdgpu/mst: Stop ignoring error codes and > > deadlocking > > > > It appears that amdgpu makes the mistake of completely ignoring the return > > values from the DP MST helpers, and instead just returns a simple > > true/false. > > In this case, it seems to have come back to bite us because as a result of > > simply returning false from compute_mst_dsc_configs_for_state(), amdgpu > > had no way of telling when a deadlock happened from these helpers. This > > could definitely result in some kernel splats. > > > > Signed-off-by: Lyude Paul > > Fixes: 8c20a1ed9b4f ("drm/amd/display: MST DSC compute fair share") > > Cc: Harry Wentland > > Cc: # v5.6+ > > --- > > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 18 +-- > > .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 107 ++-- > > -- > > .../display/amdgpu_dm/amdgpu_dm_mst_types.h | 12 +- > > 3 files changed, 73 insertions(+), 64 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > > index 0db2a88cd4d7b..6f76b2c84cdb5 100644 > > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > > @@ -6462,7 +6462,7 @@ static int > > dm_update_mst_vcpi_slots_for_dsc(struct drm_atomic_state *state, > > struct drm_connector_state *new_con_state; > > struct amdgpu_dm_connector *aconnector; > > struct dm_connector_state *dm_conn_state; > > - int i, j; > > + int i, j, ret; > > int vcpi, pbn_div, pbn, slot_num = 0; > > > > for_each_new_connector_in_state(state, connector, > > new_con_state, i) { @@ -6509,8 +6509,11 @@ static int > > dm_update_mst_vcpi_slots_for_dsc(struct drm_atomic_state *state, > > dm_conn_state->pbn = pbn; > > dm_conn_state->vcpi_slots = slot_num; > > > > - drm_dp_mst_atomic_enable_dsc(state, aconnector- > > > port, dm_conn_state->pbn, > > -false); > > + ret = drm_dp_mst_atomic_enable_dsc(state, > > aconnector->port, > > + dm_conn_state- > > > pbn, false); > > + if (ret != 0) > > + return ret; > > + > > continue; > > } > > > > @@ -9523,10 +9526,9 @@ static int amdgpu_dm_atomic_check(struct > > drm_device *dev, > > > > #if defined(CONFIG_DRM_AMD_DC_DCN) > > if (dc_resource_is_dsc_encoding_supported(dc)) { > > - if (!pre_validate_dsc(state, _state, vars)) { > > - ret = -EINVAL; > > + ret = pre_validate_dsc(state, _state, vars); > > + if (ret != 0) > > goto fail; > > - } > > } > > #endif > > > > @@ -9621,9 +9623,9 @@ static int amdgpu_dm_atomic_check(struct > > drm_device *dev, > > } > > > > #if defined(CONFIG_DRM_AMD_DC_DCN) > > - if (!compute_mst_dsc_configs_for_state(state, dm_state- > > > context, vars)) { > > + ret = compute_mst_dsc_configs_for_state(state, dm_state- > > > context, vars); > > + if (ret) { > > > > DRM_DEBUG_DRIVER("compute_mst_dsc_configs_for_state() > > failed\n"); > > - ret = -EINVAL; > > goto fail; > > } > > > > diff --git > > a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c > > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
RE: [PATCH 1/2] drm/amdgpu/mst: Stop ignoring error codes and deadlocking
[Public] Thanks, Lyude! Comments inline. > -Original Message- > From: Lyude Paul > Sent: Saturday, November 5, 2022 7:59 AM > To: amd-...@lists.freedesktop.org > Cc: Wentland, Harry ; sta...@vger.kernel.org; > Li, Sun peng (Leo) ; Siqueira, Rodrigo > ; Deucher, Alexander > ; Koenig, Christian > ; Pan, Xinhui ; David > Airlie ; Daniel Vetter ; Kazlauskas, > Nicholas ; Pillai, Aurabindo > ; Li, Roman ; Zuo, Jerry > ; Wu, Hersen ; Lin, Wayne > ; Thomas Zimmermann ; > Mahfooz, Hamza ; Hung, Alex > ; Francis, David ; Mikita > Lipski ; Liu, Wenjing ; > open list:DRM DRIVERS ; open list ker...@vger.kernel.org> > Subject: [PATCH 1/2] drm/amdgpu/mst: Stop ignoring error codes and > deadlocking > > It appears that amdgpu makes the mistake of completely ignoring the return > values from the DP MST helpers, and instead just returns a simple true/false. > In this case, it seems to have come back to bite us because as a result of > simply returning false from compute_mst_dsc_configs_for_state(), amdgpu > had no way of telling when a deadlock happened from these helpers. This > could definitely result in some kernel splats. > > Signed-off-by: Lyude Paul > Fixes: 8c20a1ed9b4f ("drm/amd/display: MST DSC compute fair share") > Cc: Harry Wentland > Cc: # v5.6+ > --- > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 18 +-- > .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 107 ++-- > -- > .../display/amdgpu_dm/amdgpu_dm_mst_types.h | 12 +- > 3 files changed, 73 insertions(+), 64 deletions(-) > > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > index 0db2a88cd4d7b..6f76b2c84cdb5 100644 > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > @@ -6462,7 +6462,7 @@ static int > dm_update_mst_vcpi_slots_for_dsc(struct drm_atomic_state *state, > struct drm_connector_state *new_con_state; > struct amdgpu_dm_connector *aconnector; > struct dm_connector_state *dm_conn_state; > - int i, j; > + int i, j, ret; > int vcpi, pbn_div, pbn, slot_num = 0; > > for_each_new_connector_in_state(state, connector, > new_con_state, i) { @@ -6509,8 +6509,11 @@ static int > dm_update_mst_vcpi_slots_for_dsc(struct drm_atomic_state *state, > dm_conn_state->pbn = pbn; > dm_conn_state->vcpi_slots = slot_num; > > - drm_dp_mst_atomic_enable_dsc(state, aconnector- > >port, dm_conn_state->pbn, > - false); > + ret = drm_dp_mst_atomic_enable_dsc(state, > aconnector->port, > +dm_conn_state- > >pbn, false); > + if (ret != 0) > + return ret; > + > continue; > } > > @@ -9523,10 +9526,9 @@ static int amdgpu_dm_atomic_check(struct > drm_device *dev, > > #if defined(CONFIG_DRM_AMD_DC_DCN) > if (dc_resource_is_dsc_encoding_supported(dc)) { > - if (!pre_validate_dsc(state, _state, vars)) { > - ret = -EINVAL; > + ret = pre_validate_dsc(state, _state, vars); > + if (ret != 0) > goto fail; > - } > } > #endif > > @@ -9621,9 +9623,9 @@ static int amdgpu_dm_atomic_check(struct > drm_device *dev, > } > > #if defined(CONFIG_DRM_AMD_DC_DCN) > - if (!compute_mst_dsc_configs_for_state(state, dm_state- > >context, vars)) { > + ret = compute_mst_dsc_configs_for_state(state, dm_state- > >context, vars); > + if (ret) { > > DRM_DEBUG_DRIVER("compute_mst_dsc_configs_for_state() > failed\n"); > - ret = -EINVAL; > goto fail; > } > > diff --git > a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c > index 6ff96b4bdda5c..30bc2e5058b70 100644 > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c > +++ > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c > @@ -864,25 +864,25 @@ static bool try_disable_dsc(struct > drm_atomic_state *state, > return true; > } > > -static bool compute_mst_dsc_configs_for_link(struct drm_atomic_state > *state, > - struct dc_state *dc_state, > - struct dc
[PATCH 1/2] drm/amdgpu/mst: Stop ignoring error codes and deadlocking
It appears that amdgpu makes the mistake of completely ignoring the return values from the DP MST helpers, and instead just returns a simple true/false. In this case, it seems to have come back to bite us because as a result of simply returning false from compute_mst_dsc_configs_for_state(), amdgpu had no way of telling when a deadlock happened from these helpers. This could definitely result in some kernel splats. Signed-off-by: Lyude Paul Fixes: 8c20a1ed9b4f ("drm/amd/display: MST DSC compute fair share") Cc: Harry Wentland Cc: # v5.6+ --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 18 +-- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 107 ++ .../display/amdgpu_dm/amdgpu_dm_mst_types.h | 12 +- 3 files changed, 73 insertions(+), 64 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 0db2a88cd4d7b..6f76b2c84cdb5 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6462,7 +6462,7 @@ static int dm_update_mst_vcpi_slots_for_dsc(struct drm_atomic_state *state, struct drm_connector_state *new_con_state; struct amdgpu_dm_connector *aconnector; struct dm_connector_state *dm_conn_state; - int i, j; + int i, j, ret; int vcpi, pbn_div, pbn, slot_num = 0; for_each_new_connector_in_state(state, connector, new_con_state, i) { @@ -6509,8 +6509,11 @@ static int dm_update_mst_vcpi_slots_for_dsc(struct drm_atomic_state *state, dm_conn_state->pbn = pbn; dm_conn_state->vcpi_slots = slot_num; - drm_dp_mst_atomic_enable_dsc(state, aconnector->port, dm_conn_state->pbn, -false); + ret = drm_dp_mst_atomic_enable_dsc(state, aconnector->port, + dm_conn_state->pbn, false); + if (ret != 0) + return ret; + continue; } @@ -9523,10 +9526,9 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, #if defined(CONFIG_DRM_AMD_DC_DCN) if (dc_resource_is_dsc_encoding_supported(dc)) { - if (!pre_validate_dsc(state, _state, vars)) { - ret = -EINVAL; + ret = pre_validate_dsc(state, _state, vars); + if (ret != 0) goto fail; - } } #endif @@ -9621,9 +9623,9 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, } #if defined(CONFIG_DRM_AMD_DC_DCN) - if (!compute_mst_dsc_configs_for_state(state, dm_state->context, vars)) { + ret = compute_mst_dsc_configs_for_state(state, dm_state->context, vars); + if (ret) { DRM_DEBUG_DRIVER("compute_mst_dsc_configs_for_state() failed\n"); - ret = -EINVAL; goto fail; } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 6ff96b4bdda5c..30bc2e5058b70 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -864,25 +864,25 @@ static bool try_disable_dsc(struct drm_atomic_state *state, return true; } -static bool compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, -struct dc_state *dc_state, -struct dc_link *dc_link, -struct dsc_mst_fairness_vars *vars, -struct drm_dp_mst_topology_mgr *mgr, -int *link_vars_start_index) +static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, + struct dc_state *dc_state, + struct dc_link *dc_link, + struct dsc_mst_fairness_vars *vars, + struct drm_dp_mst_topology_mgr *mgr, + int *link_vars_start_index) { struct dc_stream_state *stream; struct dsc_mst_fairness_params params[MAX_PIPES]; struct amdgpu_dm_connector *aconnector; struct drm_dp_mst_topology_state *mst_state = drm_atomic_get_mst_topology_state(state, mgr); int count = 0; - int i, k; + int i, k, ret; bool debugfs_overwrite = false; memset(params, 0, sizeof(params)); if (IS_ERR(mst_state)) - return false; + return PTR_ERR(mst_state); mst_state->pbn_div =