Each put_task_struct() is an atomic_dec. Do that in batches.

Tested io_uring-bench(iopoll,QD=128) with a custom nullblk, where
added ->iopoll() is not optimised at all:

before: 529504 IOPS
after:  538415 IOPS
diff:   ~1.8%

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6f767781351f..3216cc00061b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1761,8 +1761,18 @@ static void io_free_req(struct io_kiocb *req)
 struct req_batch {
        void *reqs[IO_IOPOLL_BATCH];
        int to_free;
+
+       struct task_struct      *task;
+       int                     task_refs;
 };
 
+static void io_init_req_batch(struct req_batch *rb)
+{
+       rb->to_free = 0;
+       rb->task_refs = 0;
+       rb->task = NULL;
+}
+
 static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
                                      struct req_batch *rb)
 {
@@ -1776,6 +1786,10 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 {
        if (rb->to_free)
                __io_req_free_batch_flush(ctx, rb);
+       if (rb->task) {
+               put_task_struct_many(rb->task, rb->task_refs);
+               rb->task = NULL;
+       }
 }
 
 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
@@ -1787,6 +1801,16 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
        if (req->flags & REQ_F_LINK_HEAD)
                io_queue_next(req);
 
+       if (req->flags & REQ_F_TASK_PINNED) {
+               if (req->task != rb->task && rb->task) {
+                       put_task_struct_many(rb->task, rb->task_refs);
+                       rb->task_refs = 0;
+               }
+               rb->task = req->task;
+               rb->task_refs++;
+               req->flags &= ~REQ_F_TASK_PINNED;
+       }
+
        io_dismantle_req(req);
        rb->reqs[rb->to_free++] = req;
        if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
@@ -1809,7 +1833,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
        spin_unlock_irq(&ctx->completion_lock);
        io_cqring_ev_posted(ctx);
 
-       rb.to_free = 0;
+       io_init_req_batch(&rb);
        for (i = 0; i < nr; ++i) {
                req = cs->reqs[i];
                if (refcount_dec_and_test(&req->refs))
@@ -1973,7 +1997,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
        /* order with ->result store in io_complete_rw_iopoll() */
        smp_rmb();
 
-       rb.to_free = 0;
+       io_init_req_batch(&rb);
        while (!list_empty(done)) {
                int cflags = 0;
 
-- 
2.24.0

Reply via email to