On 12/3/18 6:02 AM, Roman Penyaev wrote:
> Hi all,
> 
> The goal of this patch is to reduce contention on ep_poll_callback(),
> which can be called concurrently from different CPUs in the case of
> high event rates and many fds per epoll.  The problem can easily be
> reproduced by generating events (writes to a pipe or eventfd) from many
> threads while a consumer thread does the polling.  In other words, this
> patch increases the bandwidth of events which can be delivered from
> sources to the poller by adding poll items to the list in a lockless
> way.
> 
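
Just to make sure I understand the reproducer: something along the lines
of the sketch below, I assume?  (This is only my rough userspace
approximation of that kind of workload, not the stress-epoll.c you link
as [1]; the thread count, event count and all names are made up.)

#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define NR_PRODUCERS		8
#define EVENTS_PER_PRODUCER	100000

/* Each producer thread hammers its own eventfd; every write() ends up
 * in ep_poll_callback() for the epoll fd the consumer sleeps on. */
static void *producer(void *arg)
{
	int efd = (int)(long)arg;
	uint64_t one = 1;
	int i;

	for (i = 0; i < EVENTS_PER_PRODUCER; i++)
		if (write(efd, &one, sizeof(one)) < 0)
			break;
	return NULL;
}

int main(void)
{
	struct epoll_event ev, out[NR_PRODUCERS];
	pthread_t thr[NR_PRODUCERS];
	long consumed = 0, i;
	int epfd = epoll_create1(0);

	for (i = 0; i < NR_PRODUCERS; i++) {
		int efd = eventfd(0, EFD_NONBLOCK);

		ev.events = EPOLLIN;
		ev.data.fd = efd;
		epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);
		pthread_create(&thr[i], NULL, producer, (void *)(long)efd);
	}

	/* Single consumer: drain until every produced event was seen.
	 * Reading a (non-semaphore) eventfd returns the summed counter. */
	while (consumed < (long)NR_PRODUCERS * EVENTS_PER_PRODUCER) {
		int n = epoll_wait(epfd, out, NR_PRODUCERS, 1000);

		for (i = 0; i < n; i++) {
			uint64_t val;

			if (read(out[i].data.fd, &val, sizeof(val)) > 0)
				consumed += (long)val;
		}
	}

	for (i = 0; i < NR_PRODUCERS; i++)
		pthread_join(thr[i], NULL);
	printf("consumed %ld events\n", consumed);
	return 0;
}

If that is roughly right, then more producer threads simply means more
concurrent ep_poll_callback() invocations on the same ep, which is where
the read-side locking below should help.
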
> The main change is the replacement of the spinlock with an rwlock,
> which is taken on the read side in ep_poll_callback(), with poll items
> then added to the tail of the list using the xchg atomic instruction.
> The write lock is taken everywhere else in order to stop list
> modifications and guarantee that list updates are fully completed (I
> assume that the write side of an rwlock does not starve; the qrwlock
> implementation seems to provide this guarantee).
> 
> The following are some microbenchmark results based on the test [1],
> which starts threads that generate N events each.  The test ends when
> all events have been successfully fetched by the poller thread:
> 
> spinlock
> ========
> 
> threads       run time    events per ms
> -------       ---------   -------------
>       8         13191ms         6064/ms
>      16         30758ms         5201/ms
>      32         44315ms         7220/ms
> 
> rwlock + xchg
> =============
> 
> threads       run time    events per ms
> -------       ---------   -------------
>       8          8581ms         9323/ms
>      16         13800ms        11594/ms
>      32         24167ms        13240/ms
> 
> According to the results, the bandwidth of delivered events is
> significantly increased and the execution time is thus reduced.
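
(For reference, that works out to roughly a 1.5x shorter run time for 8
threads, 2.2x for 16 and 1.8x for 32, computed directly from the two
tables above.)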
> 
> This is an RFC because I did not run any benchmarks comparing the
> current qrwlock and spinlock implementations (4.19 kernel), although I
> did not notice any epoll performance degradation in other benchmarks.
> 
> Also, I'm not quite sure where to put this very special lockless
> variant of adding an element to the list (list_add_tail_lockless() in
> this patch).  Keeping it local seems safer.
> 
> [1] https://github.com/rouming/test-tools/blob/master/stress-epoll.c
> 
> Signed-off-by: Roman Penyaev <rpeny...@suse.de>
> Cc: Alexander Viro <v...@zeniv.linux.org.uk>
> Cc: "Paul E. McKenney" <paul...@linux.vnet.ibm.com>
> Cc: Linus Torvalds <torva...@linux-foundation.org>
> Cc: linux-fsde...@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  fs/eventpoll.c | 107 +++++++++++++++++++++++++++++++------------------
>  1 file changed, 69 insertions(+), 38 deletions(-)
> 
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 42bbe6824b4b..89debda47aca 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -50,10 +50,10 @@
>   *
>   * 1) epmutex (mutex)
>   * 2) ep->mtx (mutex)
> - * 3) ep->wq.lock (spinlock)
> + * 3) ep->lock (rwlock)
>   *
>   * The acquire order is the one listed above, from 1 to 3.
> - * We need a spinlock (ep->wq.lock) because we manipulate objects
> + * We need a rwlock (ep->lock) because we manipulate objects
>   * from inside the poll callback, that might be triggered from
>   * a wake_up() that in turn might be called from IRQ context.
>   * So we can't sleep inside the poll callback and hence we need
> @@ -85,7 +85,7 @@
>   * of epoll file descriptors, we use the current recursion depth as
>   * the lockdep subkey.
>   * It is possible to drop the "ep->mtx" and to use the global
> - * mutex "epmutex" (together with "ep->wq.lock") to have it working,
> + * mutex "epmutex" (together with "ep->lock") to have it working,
>   * but having "ep->mtx" will make the interface more scalable.
>   * Events that require holding "epmutex" are very rare, while for
>   * normal operations the epoll private "ep->mtx" will guarantee
> @@ -182,8 +182,6 @@ struct epitem {
>   * This structure is stored inside the "private_data" member of the file
>   * structure and represents the main data structure for the eventpoll
>   * interface.
> - *
> - * Access to it is protected by the lock inside wq.
>   */
>  struct eventpoll {
>       /*
> @@ -203,13 +201,16 @@ struct eventpoll {
>       /* List of ready file descriptors */
>       struct list_head rdllist;
>  
> +     /* Lock which protects rdllist and ovflist */
> +     rwlock_t lock;
> +
>       /* RB tree root used to store monitored fd structs */
>       struct rb_root_cached rbr;
>  
>       /*
>        * This is a single linked list that chains all the "struct epitem" that
>        * happened while transferring ready events to userspace w/out
> -      * holding ->wq.lock.
> +      * holding ->lock.
>        */
>       struct epitem *ovflist;
>  
> @@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
>        * because we want the "sproc" callback to be able to do it
>        * in a lockless way.
>        */
> -     spin_lock_irq(&ep->wq.lock);
> +     write_lock_irq(&ep->lock);
>       list_splice_init(&ep->rdllist, &txlist);
>       ep->ovflist = NULL;
> -     spin_unlock_irq(&ep->wq.lock);
> +     write_unlock_irq(&ep->lock);
>  
>       /*
>        * Now call the callback function.
>        */
>       res = (*sproc)(ep, &txlist, priv);
>  
> -     spin_lock_irq(&ep->wq.lock);
> +     write_lock_irq(&ep->lock);
>       /*
>        * During the time we spent inside the "sproc" callback, some
>        * other events might have been queued by the poll callback.
> @@ -722,7 +723,8 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
>                * contain them, and the list_splice() below takes care of them.
>                */
>               if (!ep_is_linked(epi)) {
> -                     list_add_tail(&epi->rdllink, &ep->rdllist);
> +                     /* Reverse ->ovflist, events should be in FIFO */
> +                     list_add(&epi->rdllink, &ep->rdllist);
>                       ep_pm_stay_awake(epi);
>               }
>       }
> @@ -745,11 +747,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
>                * the ->poll() wait list (delayed after we release the lock).
>                */
>               if (waitqueue_active(&ep->wq))
> -                     wake_up_locked(&ep->wq);
> +                     wake_up(&ep->wq);
>               if (waitqueue_active(&ep->poll_wait))
>                       pwake++;
>       }
> -     spin_unlock_irq(&ep->wq.lock);
> +     write_unlock_irq(&ep->lock);
>  
>       if (!ep_locked)
>               mutex_unlock(&ep->mtx);
> @@ -789,10 +791,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
>  
>       rb_erase_cached(&epi->rbn, &ep->rbr);
>  
> -     spin_lock_irq(&ep->wq.lock);
> +     write_lock_irq(&ep->lock);
>       if (ep_is_linked(epi))
>               list_del_init(&epi->rdllink);
> -     spin_unlock_irq(&ep->wq.lock);
> +     write_unlock_irq(&ep->lock);
>  
>       wakeup_source_unregister(ep_wakeup_source(epi));
>       /*
> @@ -842,7 +844,7 @@ static void ep_free(struct eventpoll *ep)
>        * Walks through the whole tree by freeing each "struct epitem". At this
>       * point we are sure no poll callbacks will be lingering around, and also by
>        * holding "epmutex" we can be sure that no file cleanup code will hit
> -      * us during this operation. So we can avoid the lock on "ep->wq.lock".
> +      * us during this operation. So we can avoid the lock on "ep->lock".
>        * We do not need to lock ep->mtx, either, we only do it to prevent
>        * a lockdep warning.
>        */
> @@ -1023,6 +1025,7 @@ static int ep_alloc(struct eventpoll **pep)
>               goto free_uid;
>  
>       mutex_init(&ep->mtx);
> +     rwlock_init(&ep->lock);
>       init_waitqueue_head(&ep->wq);
>       init_waitqueue_head(&ep->poll_wait);
>       INIT_LIST_HEAD(&ep->rdllist);
> @@ -1112,10 +1115,38 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
>  }
>  #endif /* CONFIG_CHECKPOINT_RESTORE */
>  
> +/*
> + * Adds a new entry to the tail of the list in a lockless way, i.e.
> + * multiple CPUs are allowed to call this function concurrently.
> + *
> + * Beware: it is necessary to prevent any other modifications of the
> + *         existing list until all changes are completed, in other words
> + *         concurrent list_add_tail_lockless() calls should be protected
> + *         with a read lock, where the write lock acts as a barrier which
> + *         makes sure all list_add_tail_lockless() calls are fully
> + *         completed.
> + */
> +static inline void list_add_tail_lockless(struct list_head *new,
> +                                       struct list_head *head)
> +{
> +     struct list_head *prev;
> +
> +     new->next = head;
> +     prev = xchg(&head->prev, new);
> +     prev->next = new;
> +     new->prev = prev;
> +}
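
To convince myself this is safe for concurrent adders, I played with the
userspace analogue below -- C11 atomic_exchange() standing in for the
kernel's xchg(), a pthread rwlock standing in for ep->lock.  It is only
a sketch under those assumptions, not kernel code, but it shows the
idea: adders on the read side may interleave freely, and taking the
write side afterwards guarantees every add has fully completed before
the list is walked.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	_Atomic(struct node *) prev;	/* only head.prev is ever xchg'ed */
};

static struct node head;
static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

/* Mirrors list_add_tail_lockless(): claim the old tail atomically,
 * then link it to the new node. */
static void add_tail_lockless(struct node *new)
{
	struct node *prev;

	new->next = &head;				/* new node becomes the last one */
	prev = atomic_exchange(&head.prev, new);	/* atomically claim the old tail */
	prev->next = new;				/* old tail now points at us */
	new->prev = prev;
}

static void *adder(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < 100000; i++) {
		struct node *n = malloc(sizeof(*n));	/* leaked, it's only a demo */

		pthread_rwlock_rdlock(&lock);	/* read side: adders don't exclude each other */
		add_tail_lockless(n);
		pthread_rwlock_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t thr[8];
	long count = 0;
	int i;

	head.next = &head;
	head.prev = &head;

	for (i = 0; i < 8; i++)
		pthread_create(&thr[i], NULL, adder, NULL);
	for (i = 0; i < 8; i++)
		pthread_join(thr[i], NULL);

	pthread_rwlock_wrlock(&lock);	/* write side: barrier, all adds are done */
	for (struct node *n = head.next; n != &head; n = n->next)
		count++;
	pthread_rwlock_unlock(&lock);

	printf("linked %ld nodes, expected %d\n", count, 8 * 100000);
	return 0;
}

The write lock at the end plays the role your comment describes: nothing
walks or modifies the list until every in-flight add has published both
links.
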
> +
>  /*
>   * This is the callback that is passed to the wait queue wakeup
>   * mechanism. It is called by the stored file descriptors when they
>   * have events to report.
> + *
> + * This callback takes a read lock in order not to contend with concurrent
> + * events from other file descriptors, thus all modifications to ->rdllist
> + * or ->ovflist are lockless.  The read lock is paired with the write lock from
> + * ep_scan_ready_list(), which stops all list modifications and guarantees
> + * that lists won't be corrupted.
>   */
>  static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
>  {
> @@ -1126,7 +1157,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
>       __poll_t pollflags = key_to_poll(key);
>       int ewake = 0;
>  
> -     spin_lock_irqsave(&ep->wq.lock, flags);
> +     read_lock_irqsave(&ep->lock, flags);
>  
>       ep_set_busy_poll_napi_id(epi);
>  
> @@ -1156,8 +1187,8 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
>        */
>       if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
>               if (epi->next == EP_UNACTIVE_PTR) {
> -                     epi->next = ep->ovflist;
> -                     ep->ovflist = epi;
> +                     /* Atomically exchange tail */
> +                     epi->next = xchg(&ep->ovflist, epi);

This also relies on the fact that the same epi can't be added to the
list in parallel, because the wait queue doing the wakeup will be
holding the wait_queue_head lock. That was a little confusing for me,
because we only take the read_lock() above.
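
For my own understanding, here is a userspace sketch of the ->ovflist
path (again with C11 atomic_exchange() in place of xchg(); all the names
are mine).  A single exchange pushes each item and chains the previous
head behind it, so the chain comes out newest-first, and the list_add()
in the "Reverse ->ovflist" hunk earlier is what flips it back to FIFO.
It is single-threaded for brevity; concurrent pushers of *different*
items are fine, and per the point above the wait_queue_head lock keeps
the same epi from being pushed twice in parallel.

#include <stdatomic.h>
#include <stdio.h>

struct item {
	struct item *ovf_next;	/* plays the role of epi->next */
	struct item *rd_next;	/* plays the role of epi->rdllink */
	int seq;
};

static _Atomic(struct item *) ovflist;

/* Mirrors "epi->next = xchg(&ep->ovflist, epi)": one atomic op pushes
 * the item and hangs the previous head off it. */
static void ovflist_push(struct item *it)
{
	it->ovf_next = atomic_exchange(&ovflist, it);
}

int main(void)
{
	struct item items[4];
	struct item *ready = NULL, *it;
	int i;

	for (i = 0; i < 4; i++) {
		items[i].seq = i;
		ovflist_push(&items[i]);	/* arrival order: 0, 1, 2, 3 */
	}

	/* Consumer side: the chain reads 3, 2, 1, 0; prepending each entry
	 * to the ready list (the list_add() above) restores FIFO order. */
	for (it = atomic_exchange(&ovflist, NULL); it; it = it->ovf_next) {
		it->rd_next = ready;
		ready = it;
	}

	for (it = ready; it; it = it->rd_next)
		printf("%d ", it->seq);		/* prints: 0 1 2 3 */
	printf("\n");
	return 0;
}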

>                       if (epi->ws) {
>                               /*
>                                * Activate ep->ws since epi->ws may get
> @@ -1172,7 +1203,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
>  
>       /* If this file is already in the ready list we exit soon */
>       if (!ep_is_linked(epi)) {
> -             list_add_tail(&epi->rdllink, &ep->rdllist);
> +             list_add_tail_lockless(&epi->rdllink, &ep->rdllist);
>               ep_pm_stay_awake_rcu(epi);
>       }

The same reasoning applies here.

>  
> @@ -1197,13 +1228,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
>                               break;
>                       }
>               }
> -             wake_up_locked(&ep->wq);
> +             wake_up(&ep->wq);

Why the switch here to the non-locked wake_up() variant? Shouldn't the
'reader' side (which in this case takes the rwlock for write) see all
updates in a coherent state at this point?

>       }
>       if (waitqueue_active(&ep->poll_wait))
>               pwake++;
>  
>  out_unlock:
> -     spin_unlock_irqrestore(&ep->wq.lock, flags);
> +     read_unlock_irqrestore(&ep->lock, flags);
>  
>       /* We have to call this outside the lock */
>       if (pwake)
> @@ -1489,7 +1520,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
>               goto error_remove_epi;
>  
>       /* We have to drop the new item inside our item list to keep track of it */
> -     spin_lock_irq(&ep->wq.lock);
> +     write_lock_irq(&ep->lock);
>  
>       /* record NAPI ID of new item if present */
>       ep_set_busy_poll_napi_id(epi);
> @@ -1501,12 +1532,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
>  
>               /* Notify waiting tasks that events are available */
>               if (waitqueue_active(&ep->wq))
> -                     wake_up_locked(&ep->wq);
> +                     wake_up(&ep->wq);

Is this switch necessary as well? Is it just to make lockdep happy?
Looks like there are a few more conversions like this below too...

Thanks,

-Jason



>               if (waitqueue_active(&ep->poll_wait))
>                       pwake++;
>       }
>  
> -     spin_unlock_irq(&ep->wq.lock);
> +     write_unlock_irq(&ep->lock);
>  
>       atomic_long_inc(&ep->user->epoll_watches);
>  
> @@ -1532,10 +1563,10 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
>        * list, since that is used/cleaned only inside a section bound by "mtx".
>        * And ep_insert() is called with "mtx" held.
>        */
> -     spin_lock_irq(&ep->wq.lock);
> +     write_lock_irq(&ep->lock);
>       if (ep_is_linked(epi))
>               list_del_init(&epi->rdllink);
> -     spin_unlock_irq(&ep->wq.lock);
> +     write_unlock_irq(&ep->lock);
>  
>       wakeup_source_unregister(ep_wakeup_source(epi));
>  
> @@ -1579,9 +1610,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
>        * 1) Flush epi changes above to other CPUs.  This ensures
>        *    we do not miss events from ep_poll_callback if an
>        *    event occurs immediately after we call f_op->poll().
> -      *    We need this because we did not take ep->wq.lock while
> +      *    We need this because we did not take ep->lock while
>        *    changing epi above (but ep_poll_callback does take
> -      *    ep->wq.lock).
> +      *    ep->lock).
>        *
>        * 2) We also need to ensure we do not miss _past_ events
>        *    when calling f_op->poll().  This barrier also
> @@ -1600,18 +1631,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
>        * list, push it inside.
>        */
>       if (ep_item_poll(epi, &pt, 1)) {
> -             spin_lock_irq(&ep->wq.lock);
> +             write_lock_irq(&ep->lock);
>               if (!ep_is_linked(epi)) {
>                       list_add_tail(&epi->rdllink, &ep->rdllist);
>                       ep_pm_stay_awake(epi);
>  
>                       /* Notify waiting tasks that events are available */
>                       if (waitqueue_active(&ep->wq))
> -                             wake_up_locked(&ep->wq);
> +                             wake_up(&ep->wq);
>                       if (waitqueue_active(&ep->poll_wait))
>                               pwake++;
>               }
> -             spin_unlock_irq(&ep->wq.lock);
> +             write_unlock_irq(&ep->lock);
>       }
>  
>       /* We have to call this outside the lock */
> @@ -1764,7 +1795,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>                * caller specified a non blocking operation.
>                */
>               timed_out = 1;
> -             spin_lock_irq(&ep->wq.lock);
> +             write_lock_irq(&ep->lock);
>               goto check_events;
>       }
>  
> @@ -1773,7 +1804,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>       if (!ep_events_available(ep))
>               ep_busy_loop(ep, timed_out);
>  
> -     spin_lock_irq(&ep->wq.lock);
> +     write_lock_irq(&ep->lock);
>  
>       if (!ep_events_available(ep)) {
>               /*
> @@ -1789,7 +1820,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>                * ep_poll_callback() when events will become available.
>                */
>               init_waitqueue_entry(&wait, current);
> -             __add_wait_queue_exclusive(&ep->wq, &wait);
> +             add_wait_queue_exclusive(&ep->wq, &wait);
>  
>               for (;;) {
>                       /*
> @@ -1815,21 +1846,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>                               break;
>                       }
>  
> -                     spin_unlock_irq(&ep->wq.lock);
> +                     write_unlock_irq(&ep->lock);
>                      if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
>                               timed_out = 1;
>  
> -                     spin_lock_irq(&ep->wq.lock);
> +                     write_lock_irq(&ep->lock);
>               }
>  
> -             __remove_wait_queue(&ep->wq, &wait);
> +             remove_wait_queue(&ep->wq, &wait);
>               __set_current_state(TASK_RUNNING);
>       }
>  check_events:
>       /* Is it worth to try to dig for events ? */
>       eavail = ep_events_available(ep);
>  
> -     spin_unlock_irq(&ep->wq.lock);
> +     write_unlock_irq(&ep->lock);
>  
>       /*
>        * Try to transfer events to user space. In case we get 0 events and
> 
