On Thu, 7 Jan 2010, Michael S. Tsirkin wrote:

> Sure, I was trying to be as brief as possible, here's a detailed summary.
> 
> Description of the system (MSI emulation in KVM):
> 
> KVM supports an ioctl to assign/deassign an eventfd file to interrupt message
> in guest OS.  When this eventfd is signalled, interrupt message is sent.
> This assignment is done from qemu system emulator.
> 
> eventfd is signalled from device emulation in another thread in
> userspace or from kernel, which talks with guest OS through another
> eventfd and shared memory (an out-of-process option was discussed
> but has not been implemented yet).
> 
> Note: it's okay to delay messages from correctness point of view, but
> generally this is latency-sensitive path. If multiple identical messages
> are requested, it's okay to send a single last message, but missing a
> message altogether causes deadlocks.  Sending a message when none were
> requested might in theory cause crashes, in practice doing this causes
> performance degradation.
> 
> Another KVM feature is interrupt masking: guest OS requests that we
> stop sending some interrupt message, possibly modifies the mapping,
> and re-enables this message. This needs to be done without
> involving the device that might keep requesting events:
> while masked, message is marked "pending", and guest might test
> the pending status.
> 
> We can implement masking in system emulator in userspace, by using
> assign/deassign ioctls: when message is masked, we simply deassign all
> eventfd, and when it is unmasked, we assign them back.
> 
> Here's some code to illustrate how this all works: assign/deassign code
> in kernel looks like the following:
> 
> 
> this is called to unmask interrupt
> 
> static int
> kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> {
>       struct _irqfd *irqfd, *tmp;
>       struct file *file = NULL;
>       struct eventfd_ctx *eventfd = NULL;
>       int ret;
>       unsigned int events;
> 
>       irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
> 
> ...
> 
>       file = eventfd_fget(fd);
>       if (IS_ERR(file)) {
>               ret = PTR_ERR(file);
>               goto fail;
>       }
> 
>       eventfd = eventfd_ctx_fileget(file);
>       if (IS_ERR(eventfd)) {
>               ret = PTR_ERR(eventfd);
>               goto fail;
>       }
> 
>       irqfd->eventfd = eventfd;
> 
>       /*
>        * Install our own custom wake-up handling so we are notified via
>        * a callback whenever someone signals the underlying eventfd
>        */
>       init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
>       init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
> 
>       spin_lock_irq(&kvm->irqfds.lock);
> 
>       events = file->f_op->poll(file, &irqfd->pt);
> 
>       list_add_tail(&irqfd->list, &kvm->irqfds.items);
>       spin_unlock_irq(&kvm->irqfds.lock);
> 
> A.
>       /*
>        * Check if there was an event already pending on the eventfd
>        * before we registered, and trigger it as if we didn't miss it.
>        */
>       if (events & POLLIN)
>               schedule_work(&irqfd->inject);
> 
>       /*
>        * do not drop the file until the irqfd is fully initialized, otherwise
>        * we might race against the POLLHUP
>        */
>       fput(file);
> 
>       return 0;
> 
> fail:
>       ...
> }
> 
> This is called to mask interrupt
> 
> /*
>  * shutdown any irqfd's that match fd+gsi
>  */
> static int
> kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> {
>       struct _irqfd *irqfd, *tmp;
>       struct eventfd_ctx *eventfd;
> 
>       eventfd = eventfd_ctx_fdget(fd);
>       if (IS_ERR(eventfd))
>               return PTR_ERR(eventfd);
> 
>       spin_lock_irq(&kvm->irqfds.lock);
> 
>       list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
>               if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
>                       irqfd_deactivate(irqfd);
>       }
> 
>       spin_unlock_irq(&kvm->irqfds.lock);
>       eventfd_ctx_put(eventfd);
> 
>       /*
>        * Block until we know all outstanding shutdown jobs have completed
>        * so that we guarantee there will not be any more interrupts on this
>        * gsi once this deassign function returns.
>        */
>       flush_workqueue(irqfd_cleanup_wq);
> 
>       return 0;
> }
> 
> 
> And deactivation deep down does this (from irqfd_cleanup_wq workqueue,
> so this is not under the spinlock):
> 
>         /*
>          * Synchronize with the wait-queue and unhook ourselves to
>          * prevent
>          * further events.
>          */
> B.
>         remove_wait_queue(irqfd->wqh, &irqfd->wait);
> 
>       ....
> 
>         /*
>          * It is now safe to release the object's resources
>          */
>         eventfd_ctx_put(irqfd->eventfd);
>         kfree(irqfd);
> 
> 
> The problems (really the same bug) in KVM that I am trying to fix:
> 1. Because of A above, if event was requested while message was masked,
>    we will not miss a message. However, because we never clear
>    the counter, we currently get a spurious message each time
>    we unmask.
> 
>    We should clear the counter either each time we
>    deliver message, or at least when message is masked.
> 
> 2. We currently always return pending status 0 to guests,
>    but for strict spec compliance we must do it correctly
>    and report pending message if one was requested while
>    it was masked.
> 
>    An obvious way to do this is to do nonblocking read on the eventfd.
>    Since this clears the counter, we will have to write the value
>    back. But again, this requires that all messages that were
>    sent before mask are cleared from the counter, otherwise
>    we will always report pending messages.
> 
> 
> So, how should I clear the counter?
> 
> I started with the idea of clearing it in irqfd_wakeup.  However, as you
> point out this will cause another list scan, so it's messy.  It
> currently uses a workqueue to work around a KVM locking issue, but the
> issue is fixed, so we will stop doing this soon and cannot rely on that.
> 
> Thus I came up with the idea to do this on unmask:
> this is slow path and can always be done from workqueue or ioctl
> context. So I would add eventfd_read_ctx in code at point B:
> 
> 
>               eventfd_read_ctx(irqfd->eventfd, &ucnt);
>       ->
>               remove_wait_queue(irqfd->wqh, &irqfd->wait);
> 
> now, if device signals eventfd at point marked by ->,
> it will be sent but counter not cleared,
> so we will get spurious message when we unmask.
> 
> Or if I do it the other way:
>               remove_wait_queue(irqfd->wqh, &irqfd->wait);
>       ->
>               eventfd_read_ctx(irqfd->eventfd, &ucnt);
> 
> now, if device signals eventfd at point marked by ->,
> it will not be sent but counter will be cleared,
> so we will lose a message.
> 
> There does not appear to be a way to avoid this race unless we both
> remove from wait queue and clear counter under the wait queue lock, just
> like eventfd_read currently does.
> 
> I hope this clarifies.

Yes, it sure does, thanks.
When you unmask, shouldn't the fd be in a "clear" state anyway?
So, can't you just do  eventfd_ctx_read(ctx, 1, &ucnt)  on unmask, and 
ignore the value (and the POLLIN flags)?
Another solution, is to keep track of the counter (using 
eventfd_ctx_read() and eventfd_ctx_peek_value()) inside your code, and 
decide the status based on its changes.




- Davide


---
 fs/eventfd.c            |   74 ++++++++++++++++++++++++++++++++++++++----------
 include/linux/eventfd.h |   13 ++++++++
 2 files changed, 72 insertions(+), 15 deletions(-)

Index: linux-2.6.mod/fs/eventfd.c
===================================================================
--- linux-2.6.mod.orig/fs/eventfd.c     2010-01-06 12:37:28.000000000 -0800
+++ linux-2.6.mod/fs/eventfd.c  2010-01-06 17:43:09.000000000 -0800
@@ -135,26 +135,56 @@ static unsigned int eventfd_poll(struct 
        return events;
 }
 
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
-                           loff_t *ppos)
+/**
+ * eventfd_ctx_peek_value - Reads the eventfd counter.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * NOTE: Must be called with the wait queue lock held.  Locking the
+ * eventfd_ctx interface is not actually exposed outside of the
+ * eventfd implementation, but when poll callback registers with the
+ * eventfd poll, such callbacks will be invoked with the wait queue
+ * lock held.  IOW this means that this function can only be called from
+ * inside a registered poll callback at the moment.
+ * The function returns the current counter value, without consuming it.
+ */
+__u64 eventfd_ctx_peek_value(struct eventfd_ctx *ctx)
+{
+       assert_spin_locked(&ctx->wqh.lock);
+
+       return ctx->count;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_peek_value);
+
+/**
+ * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
+ * @ctx: [in] Pointer to eventfd context.
+ * @no_wait: [in] Different from zero if the operation should not block.
+ * @cnt: [out] Pointer to the 64-bit counter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked but @no_wait was nonzero.
+ * -ERESTARTSYS : A signal interrupted the wait operation.
+ *
+ * If @no_wait is zero, the function might sleep until the eventfd internal
+ * counter becomes greater than zero.
+ */
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
 {
-       struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
-       __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);
 
-       if (count < sizeof(ucnt))
-               return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
+       *cnt = 0;
        res = -EAGAIN;
        if (ctx->count > 0)
-               res = sizeof(ucnt);
-       else if (!(file->f_flags & O_NONBLOCK)) {
+               res = 0;
+       else if (!no_wait) {
                __add_wait_queue(&ctx->wqh, &wait);
-               for (res = 0;;) {
+               for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
-                               res = sizeof(ucnt);
+                               res = 0;
                                break;
                        }
                        if (signal_pending(current)) {
@@ -168,18 +198,32 @@ static ssize_t eventfd_read(struct file 
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
-       if (likely(res > 0)) {
-               ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
-               ctx->count -= ucnt;
+       if (likely(res == 0)) {
+               *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1: ctx->count;
+               ctx->count -= *cnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, POLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);
-       if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
-               return -EFAULT;
 
        return res;
 }
+EXPORT_SYMBOL_GPL(eventfd_ctx_read);
+
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+                           loff_t *ppos)
+{
+       struct eventfd_ctx *ctx = file->private_data;
+       ssize_t res;
+       __u64 cnt;
+
+       if (count < sizeof(cnt))
+               return -EINVAL;
+       if ((res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt)) < 0)
+               return res;
+
+       return put_user(cnt, (__u64 __user *) buf) ? -EFAULT: sizeof(cnt);
+}
 
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t 
count,
                             loff_t *ppos)
Index: linux-2.6.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.mod.orig/include/linux/eventfd.h  2010-01-06 12:37:28.000000000 
-0800
+++ linux-2.6.mod/include/linux/eventfd.h       2010-01-06 14:54:36.000000000 
-0800
@@ -34,6 +34,8 @@ struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 int eventfd_signal(struct eventfd_ctx *ctx, int n);
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
+__u64 eventfd_ctx_peek_value(struct eventfd_ctx *ctx);
 
 #else /* CONFIG_EVENTFD */
 
@@ -61,6 +63,17 @@ static inline void eventfd_ctx_put(struc
 
 }
 
+static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait,
+                                      __u64 *cnt)
+{
+       return -ENOSYS;
+}
+
+static inline __u64 eventfd_ctx_peek_value(struct eventfd_ctx *ctx)
+{
+       return 0;
+}
+
 #endif
 
 #endif /* _LINUX_EVENTFD_H */
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to