Re: [PATCH v2] drm/etnaviv: fix ref count leak via pm_runtime_get_sync
Hi Lucas, On Wed, Jun 17, 2020 at 4:53 AM Lucas Stach wrote: > > Hi Navid, > > Am Montag, den 15.06.2020, 01:12 -0500 schrieb Navid Emamdoost: > > in etnaviv_gpu_submit, etnaviv_gpu_recover_hang, etnaviv_gpu_debugfs, > > and etnaviv_gpu_init the call to pm_runtime_get_sync increments the > > counter even in case of failure, leading to incorrect ref count. > > In case of failure, decrement the ref count before returning. > > While that change is correct with the current API, may I ask the > question why the way this API works is considered reasonable? A API > call that fails, but still changes internal state and expects the > caller to clean up the mess it not really what I would consider fool- > proof API design. Is there a specific reason why it is done this way > and not handled internally? I share the same concern with you on the way this API is working now. To the best of my knowledge, there are ongoing discussions on this issue: https://lkml.org/lkml/2020/6/14/76 https://patchwork.ozlabs.org/project/linux-tegra/patch/20200520095148.10995-1-dinghao@zju.edu.cn/ > > Regards, > Lucas > > > Signed-off-by: Navid Emamdoost > > --- > > Changes in v2: > > - replace pm_runtime_put with pm_runtime_put_noidle > > --- > > drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 14 ++ > > 1 file changed, 10 insertions(+), 4 deletions(-) > > > > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c > > b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c > > index a31eeff2b297..7c9f3f9ba123 100644 > > --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c > > +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c > > @@ -722,7 +722,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) > > ret = pm_runtime_get_sync(gpu->dev); > > if (ret < 0) { > > dev_err(gpu->dev, "Failed to enable GPU power domain\n"); > > - return ret; > > + goto pm_put; > > } > > > > etnaviv_hw_identify(gpu); > > @@ -819,6 +819,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) > > > > fail: > > pm_runtime_mark_last_busy(gpu->dev); > > +pm_put: > > pm_runtime_put_autosuspend(gpu->dev); > > > > return ret; > > @@ -859,7 +860,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct > > seq_file *m) > > > > ret = pm_runtime_get_sync(gpu->dev); > > if (ret < 0) > > - return ret; > > + goto pm_put; > > > > dma_lo = gpu_read(gpu, VIVS_FE_DMA_LOW); > > dma_hi = gpu_read(gpu, VIVS_FE_DMA_HIGH); > > @@ -1003,6 +1004,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, > > struct seq_file *m) > > ret = 0; > > > > pm_runtime_mark_last_busy(gpu->dev); > > +pm_put: > > pm_runtime_put_autosuspend(gpu->dev); > > > > return ret; > > @@ -1016,7 +1018,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) > > dev_err(gpu->dev, "recover hung GPU!\n"); > > > > if (pm_runtime_get_sync(gpu->dev) < 0) > > - return; > > + goto pm_put; > > > > mutex_lock(&gpu->lock); > > > > @@ -1035,6 +1037,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) > > > > mutex_unlock(&gpu->lock); > > pm_runtime_mark_last_busy(gpu->dev); > > +pm_put: > > pm_runtime_put_autosuspend(gpu->dev); > > } > > > > @@ -1308,8 +1311,10 @@ struct dma_fence *etnaviv_gpu_submit(struct > > etnaviv_gem_submit *submit) > > > > if (!submit->runtime_resumed) { > > ret = pm_runtime_get_sync(gpu->dev); > > - if (ret < 0) > > + if (ret < 0) { > > + pm_runtime_put_noidle(gpu->dev); > > return NULL; > > + } > > submit->runtime_resumed = true; > > } > > > > @@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct > > etnaviv_gem_submit *submit) > > ret = event_alloc(gpu, nr_events, event); > > if (ret) { > > DRM_ERROR("no free events\n"); > > + pm_runtime_put_noidle(gpu->dev); > > return NULL; > > } > > > -- Navid.
Re: [PATCH v2] drm/etnaviv: fix ref count leak via pm_runtime_get_sync
Hi Navid, Am Montag, den 15.06.2020, 01:12 -0500 schrieb Navid Emamdoost: > in etnaviv_gpu_submit, etnaviv_gpu_recover_hang, etnaviv_gpu_debugfs, > and etnaviv_gpu_init the call to pm_runtime_get_sync increments the > counter even in case of failure, leading to incorrect ref count. > In case of failure, decrement the ref count before returning. While that change is correct with the current API, may I ask the question why the way this API works is considered reasonable? A API call that fails, but still changes internal state and expects the caller to clean up the mess it not really what I would consider fool- proof API design. Is there a specific reason why it is done this way and not handled internally? Regards, Lucas > Signed-off-by: Navid Emamdoost > --- > Changes in v2: > - replace pm_runtime_put with pm_runtime_put_noidle > --- > drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 14 ++ > 1 file changed, 10 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c > b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c > index a31eeff2b297..7c9f3f9ba123 100644 > --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c > +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c > @@ -722,7 +722,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) > ret = pm_runtime_get_sync(gpu->dev); > if (ret < 0) { > dev_err(gpu->dev, "Failed to enable GPU power domain\n"); > - return ret; > + goto pm_put; > } > > etnaviv_hw_identify(gpu); > @@ -819,6 +819,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) > > fail: > pm_runtime_mark_last_busy(gpu->dev); > +pm_put: > pm_runtime_put_autosuspend(gpu->dev); > > return ret; > @@ -859,7 +860,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct > seq_file *m) > > ret = pm_runtime_get_sync(gpu->dev); > if (ret < 0) > - return ret; > + goto pm_put; > > dma_lo = gpu_read(gpu, VIVS_FE_DMA_LOW); > dma_hi = gpu_read(gpu, VIVS_FE_DMA_HIGH); > @@ -1003,6 +1004,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct > seq_file *m) > ret = 0; > > pm_runtime_mark_last_busy(gpu->dev); > +pm_put: > pm_runtime_put_autosuspend(gpu->dev); > > return ret; > @@ -1016,7 +1018,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) > dev_err(gpu->dev, "recover hung GPU!\n"); > > if (pm_runtime_get_sync(gpu->dev) < 0) > - return; > + goto pm_put; > > mutex_lock(&gpu->lock); > > @@ -1035,6 +1037,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) > > mutex_unlock(&gpu->lock); > pm_runtime_mark_last_busy(gpu->dev); > +pm_put: > pm_runtime_put_autosuspend(gpu->dev); > } > > @@ -1308,8 +1311,10 @@ struct dma_fence *etnaviv_gpu_submit(struct > etnaviv_gem_submit *submit) > > if (!submit->runtime_resumed) { > ret = pm_runtime_get_sync(gpu->dev); > - if (ret < 0) > + if (ret < 0) { > + pm_runtime_put_noidle(gpu->dev); > return NULL; > + } > submit->runtime_resumed = true; > } > > @@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct > etnaviv_gem_submit *submit) > ret = event_alloc(gpu, nr_events, event); > if (ret) { > DRM_ERROR("no free events\n"); > + pm_runtime_put_noidle(gpu->dev); > return NULL; > } >
Re: [PATCH v2] drm/etnaviv: fix ref count leak via pm_runtime_get_sync
… > In case of failure, decrement the ref count before returning. Can it be nicer to use the term “reference count” here? Will the tag “Fixes” become helpful for the commit message? … > +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c … > @@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct > etnaviv_gem_submit *submit) > ret = event_alloc(gpu, nr_events, event); > if (ret) { > DRM_ERROR("no free events\n"); > + pm_runtime_put_noidle(gpu->dev); > return NULL; > } I suggest to move a bit of exception handling code to the end of this function implementation so that it can be better reused after the addition of a jump target like “put_runtime”. Regards, Markus
[PATCH v2] drm/etnaviv: fix ref count leak via pm_runtime_get_sync
in etnaviv_gpu_submit, etnaviv_gpu_recover_hang, etnaviv_gpu_debugfs, and etnaviv_gpu_init the call to pm_runtime_get_sync increments the counter even in case of failure, leading to incorrect ref count. In case of failure, decrement the ref count before returning. Signed-off-by: Navid Emamdoost --- Changes in v2: - replace pm_runtime_put with pm_runtime_put_noidle --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index a31eeff2b297..7c9f3f9ba123 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -722,7 +722,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) ret = pm_runtime_get_sync(gpu->dev); if (ret < 0) { dev_err(gpu->dev, "Failed to enable GPU power domain\n"); - return ret; + goto pm_put; } etnaviv_hw_identify(gpu); @@ -819,6 +819,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) fail: pm_runtime_mark_last_busy(gpu->dev); +pm_put: pm_runtime_put_autosuspend(gpu->dev); return ret; @@ -859,7 +860,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m) ret = pm_runtime_get_sync(gpu->dev); if (ret < 0) - return ret; + goto pm_put; dma_lo = gpu_read(gpu, VIVS_FE_DMA_LOW); dma_hi = gpu_read(gpu, VIVS_FE_DMA_HIGH); @@ -1003,6 +1004,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m) ret = 0; pm_runtime_mark_last_busy(gpu->dev); +pm_put: pm_runtime_put_autosuspend(gpu->dev); return ret; @@ -1016,7 +1018,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) dev_err(gpu->dev, "recover hung GPU!\n"); if (pm_runtime_get_sync(gpu->dev) < 0) - return; + goto pm_put; mutex_lock(&gpu->lock); @@ -1035,6 +1037,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) mutex_unlock(&gpu->lock); pm_runtime_mark_last_busy(gpu->dev); +pm_put: pm_runtime_put_autosuspend(gpu->dev); } @@ -1308,8 +1311,10 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) if (!submit->runtime_resumed) { ret = pm_runtime_get_sync(gpu->dev); - if (ret < 0) + if (ret < 0) { + pm_runtime_put_noidle(gpu->dev); return NULL; + } submit->runtime_resumed = true; } @@ -1326,6 +1331,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) ret = event_alloc(gpu, nr_events, event); if (ret) { DRM_ERROR("no free events\n"); + pm_runtime_put_noidle(gpu->dev); return NULL; } -- 2.17.1