Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM syncobjs is relatively hot, but 96% of the calls are for a single object. (~4% are for two points, and never more than three. A more trivial workload like vkmark under Plasma is even more skewed towards single point waits.)
Therefore let's add a fast path to bypass the kcalloc/kfree and use a pre-allocated stack array for those cases. Signed-off-by: Tvrtko Ursulin <[email protected]> Reviewed-by: Maíra Canal <[email protected]> # v3 --- v2: * Document rationale for stack array in a comment. v3: * Added DRM_SYNCOBJ_FAST_PATH_ENTRIES to avoid hardcoding fast path array size. v4: * Rebased to be standalone. --- drivers/gpu/drm/drm_syncobj.c | 44 ++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 1333ef0ea03b..99aada85865d 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -237,6 +237,14 @@ static void syncobj_eventfd_entry_func(struct drm_syncobj *syncobj, struct syncobj_eventfd_entry *entry); +/* + * Empirically vast majority of ioctls pass in a single syncobj (96%) and never + * more than three points. Therefore implement a fast path with a small stack + * array to avoid going into the allocator sometimes several times per + * userspace rendered frame. + */ +#define DRM_SYNCOBJ_FAST_PATH_ENTRIES 4 + /** * drm_syncobj_find - lookup and reference a sync object. 
* @file_private: drm file private pointer @@ -1063,10 +1071,12 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, uint32_t *idx, ktime_t *deadline) { + struct syncobj_wait_entry stack_entries[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; + u64 stack_points[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; struct syncobj_wait_entry *entries; struct dma_fence *fence; - uint64_t *points; uint32_t signaled_count, i; + uint64_t *points; if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) { @@ -1074,24 +1084,33 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, lockdep_assert_none_held_once(); } - points = kmalloc_array(count, sizeof(*points), GFP_KERNEL); - if (points == NULL) - return -ENOMEM; + if (count > ARRAY_SIZE(stack_points)) { + points = kmalloc_array(count, sizeof(*points), GFP_KERNEL); + if (!points) + return -ENOMEM; + } else { + points = stack_points; + } if (!user_points) { memset(points, 0, count * sizeof(uint64_t)); - } else if (copy_from_user(points, user_points, sizeof(uint64_t) * count)) { timeout = -EFAULT; goto err_free_points; } - entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); - if (!entries) { - timeout = -ENOMEM; - goto err_free_points; + if (count > ARRAY_SIZE(stack_entries)) { + entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); + if (!entries) { + timeout = -ENOMEM; + goto err_free_points; + } + } else { + memset(stack_entries, 0, sizeof(stack_entries)); + entries = stack_entries; } + /* Walk the list of sync objects and initialize entries. 
We do * this up-front so that we can properly return -EINVAL if there is * a syncobj with a missing fence and then never have the chance of @@ -1208,10 +1227,13 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, &entries[i].fence_cb); dma_fence_put(entries[i].fence); } - kfree(entries); + + if (entries != stack_entries) + kfree(entries); err_free_points: - kfree(points); + if (points != stack_points) + kfree(points); return timeout; } -- 2.48.0
