ep_events_available() checks for available events by looking at
ep->rdllist and ep_is_scanning(). However, this is done without a lock
and can report false negative if ep_start_scan() or ep_done_scan() are
executed by another task concurrently. For example:
_________________________________________________________________________
                                   |ep_start_scan()
                                   |  list_splice_init(&ep->rdllist, ...)
ep_events_available()              |
  !list_empty_careful(&ep->rdllist)|
  || ep_is_scanning(ep)            |
                                   |  ep_enter_scan(ep)
___________________________________|_____________________________________

Another example:
_________________________________________________________________________
ep_events_available()              |
                                   |ep_start_scan()
                                   |  list_splice_init(&ep->rdllist, ...)
                                   |  ep_enter_scan(ep)
  !list_empty_careful(&ep->rdllist)|
                                   |ep_done_scan()
                                   |  ep_exit_scan(ep)
                                   |  list_splice(..., &ep->rdllist)
  || ep_is_scanning(ep)            |
___________________________________|_____________________________________

In the above examples, ep_events_available() sees no event despite
events being available. In case epoll_wait() is called with timeout=0,
epoll_wait() will wrongly return "no event" to user.

Introduce a sequence lock to resolve this issue.

Measuring the time consumption of 10 million loop iterations doing
epoll_wait(), the following performance drop is observed:

   timeout  #event  before    after    diff
     0ms      0     3727ms   3974ms   +6.6%
     0ms      1     8099ms   9134ms    +13%
     1ms      1    13525ms  13586ms  +0.45%

Considering the use case of epoll_wait() (wait for events, do something
with the events, repeat), it should only contribute to a small portion of
user's CPU consumption. Therefore this performance drop is not alarming.

Fixes: c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()")
Suggested-by: Mateusz Guzik <[email protected]>
Signed-off-by: Nam Cao <[email protected]>
---
 fs/eventpoll.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index baa97d0edade..df364a8783b5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
 #include <linux/compat.h>
 #include <linux/rculist.h>
 #include <linux/capability.h>
+#include <linux/seqlock.h>
 #include <net/busy_poll.h>
 
 /*
@@ -312,6 +313,9 @@ struct eventpoll {
        /* Lock which protects rdllist and ovflist */
        spinlock_t lock;
 
+       /* Protect switching between rdllist and ovflist */
+       seqcount_spinlock_t seq;
+
        /* RB tree root used to store monitored fd structs */
        struct rb_root_cached rbr;
 
@@ -590,7 +594,10 @@ static inline void epi_clear_ovflist(struct epitem *epi)
 /* True iff @ep has ready events that epoll_wait() might harvest. */
 static inline bool ep_events_available(struct eventpoll *ep)
 {
-       return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep);
+       unsigned int seq = read_seqcount_begin(&ep->seq);
+
+       return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep) ||
+               read_seqcount_retry(&ep->seq, seq);
 }
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
@@ -947,8 +954,12 @@ static void ep_start_scan(struct eventpoll *ep, struct 
list_head *scan_batch)
         */
        lockdep_assert_irqs_enabled();
        spin_lock_irq(&ep->lock);
+       write_seqcount_begin(&ep->seq);
+
        list_splice_init(&ep->rdllist, scan_batch);
        ep_enter_scan(ep);
+
+       write_seqcount_end(&ep->seq);
        spin_unlock_irq(&ep->lock);
 }
 
@@ -979,6 +990,9 @@ static void ep_done_scan(struct eventpoll *ep,
                        ep_pm_stay_awake(epi);
                }
        }
+
+       write_seqcount_begin(&ep->seq);
+
        /* Back out of scan mode; callbacks target ep->rdllist again. */
        ep_exit_scan(ep);
 
@@ -986,6 +1000,9 @@ static void ep_done_scan(struct eventpoll *ep,
         * Quickly re-inject items left on "scan_batch".
         */
        list_splice(scan_batch, &ep->rdllist);
+
+       write_seqcount_end(&ep->seq);
+
        __pm_relax(ep->ws);
 
        if (!list_empty(&ep->rdllist)) {
@@ -1405,6 +1422,7 @@ static int ep_alloc(struct eventpoll **pep)
 
        mutex_init(&ep->mtx);
        spin_lock_init(&ep->lock);
+       seqcount_spinlock_init(&ep->seq, &ep->lock);
        init_waitqueue_head(&ep->wq);
        init_waitqueue_head(&ep->poll_wait);
        INIT_LIST_HEAD(&ep->rdllist);
-- 
2.47.3


Reply via email to