Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
syncobjs is relatively hot, but 96% of the calls are for a single
object (~4% are for two points, and never more than three points), while
a more trivial workload like vkmark under Plasma is even more skewed
to single point waits.

Therefore let's add a fast path to bypass the kcalloc/kfree and use a pre-
allocated stack array for those cases.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@igalia.com>
Reviewed-by: Maíra Canal <mca...@igalia.com>
---
v2:
 * Document rationale for stack array in a comment.
---
 drivers/gpu/drm/drm_syncobj.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index bf2fbe07add2..b906d6acb4ef 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1035,6 +1035,12 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
                                                  uint32_t *idx,
                                                  ktime_t *deadline)
 {
+       /*
+        * Empirically vast majority of calls here works with just a single
+        * point (96%) and never more than three points. Therefore a small stack
+        * array can cheaply avoid multiple per frame allocations.
+        */
+       struct syncobj_wait_entry stack_entries[4];
        struct syncobj_wait_entry *entries;
        uint32_t signaled_count, i;
        struct dma_fence *fence;
@@ -1049,9 +1055,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
            !access_ok(user_points, count * sizeof(*user_points)))
                return -EFAULT;
 
-       entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
-       if (!entries)
-               return -ENOMEM;
+       if (count > ARRAY_SIZE(stack_entries)) {
+               entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
+               if (!entries)
+                       return -ENOMEM;
+       } else {
+               memset(stack_entries, 0, sizeof(stack_entries));
+               entries = stack_entries;
+       }
 
        /* Walk the list of sync objects and initialize entries.  We do
         * this up-front so that we can properly return -EINVAL if there is
@@ -1174,7 +1185,9 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
                                                  &entries[i].fence_cb);
                dma_fence_put(entries[i].fence);
        }
-       kfree(entries);
+
+       if (entries != stack_entries)
+               kfree(entries);
 
        return timeout;
 }
-- 
2.48.0

Reply via email to