Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-11 Thread Yan Zhao
On Thu, Dec 12, 2019 at 11:07:42AM +0800, Alex Williamson wrote:
> On Wed, 11 Dec 2019 21:02:40 -0500
> Yan Zhao  wrote:
> 
> > On Thu, Dec 12, 2019 at 02:56:55AM +0800, Alex Williamson wrote:
> > > On Wed, 11 Dec 2019 01:25:55 -0500
> > > Yan Zhao  wrote:
> > >   
> > > > On Wed, Dec 11, 2019 at 12:38:05AM +0800, Alex Williamson wrote:  
> > > > > On Tue, 10 Dec 2019 02:44:44 -0500
> > > > > Yan Zhao  wrote:
> > > > > 
> > > > > > On Tue, Dec 10, 2019 at 05:16:08AM +0800, Alex Williamson wrote:
> > > > > > > On Mon, 9 Dec 2019 01:22:12 -0500
> > > > > > > Yan Zhao  wrote:
> > > > > > >   
> > > > > > > > On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson 
> > > > > > > > wrote:  
> > > > > > > > > On Fri, 6 Dec 2019 01:04:07 -0500
> > > > > > > > > Yan Zhao  wrote:
> > > > > > > > > 
> > > > > > > > > > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson 
> > > > > > > > > > wrote:
> > > > > > > > > > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > > > > > > > > > Yan Zhao  wrote:
> > > > > > > > > > >   
> > > > > > > > > > > > Dynamic trap bar info region is a channel for QEMU and 
> > > > > > > > > > > > vendor driver to
> > > > > > > > > > > > communicate dynamic trap info. It is of type
> > > > > > > > > > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > > > > > > > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > > > > > > > > > 
> > > > > > > > > > > > This region has two fields: dt_fd and trap.
> > > > > > > > > > > > When QEMU detects a device regions of this type, it 
> > > > > > > > > > > > will create an
> > > > > > > > > > > > eventfd and write its eventfd id to dt_fd field.
> > > > > > > > > > > > When vendor drivre signals this eventfd, QEMU reads 
> > > > > > > > > > > > trap field of this
> > > > > > > > > > > > info region.
> > > > > > > > > > > > - If trap is true, QEMU would search the device's PCI 
> > > > > > > > > > > > BAR
> > > > > > > > > > > > regions and disable all the sparse mmaped subregions 
> > > > > > > > > > > > (if the sparse
> > > > > > > > > > > > mmaped subregion is disablable).
> > > > > > > > > > > > - If trap is false, QEMU would re-enable those 
> > > > > > > > > > > > subregions.
> > > > > > > > > > > > 
> > > > > > > > > > > > A typical usage is
> > > > > > > > > > > > 1. vendor driver first cuts its bar 0 into several 
> > > > > > > > > > > > sections, all in a
> > > > > > > > > > > > sparse mmap array. So initally, all its bar 0 are 
> > > > > > > > > > > > passthroughed.
> > > > > > > > > > > > 2. vendor driver specifys part of bar 0 sections to be 
> > > > > > > > > > > > disablable.
> > > > > > > > > > > > 3. on migration starts, vendor driver signals dt_fd and 
> > > > > > > > > > > > set trap to true
> > > > > > > > > > > > to notify QEMU disabling the bar 0 sections of 
> > > > > > > > > > > > disablable flags on.
> > > > > > > > > > > > 4. QEMU disables those bar 0 section and hence let 
> > > > > > > > > > > > vendor driver be able
> > > > > > > > > > > > to trap access of bar 0 registers and make dirty page 
> > > > > > > > > > > > tracking possible.
> > > > > > > > > > > > 5. on migration failure, vendor driver signals dt_fd to 
> > > > > > > > > > > > QEMU again.
> > > > > > > > > > > > QEMU reads trap field of this info region which is 
> > > > > > > > > > > > false and QEMU
> > > > > > > > > > > > re-passthrough the whole bar 0 region.
> > > > > > > > > > > > 
> > > > > > > > > > > > Vendor driver specifies whether it supports 
> > > > > > > > > > > > dynamic-trap-bar-info region
> > > > > > > > > > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > > > > > > > > > vfio_pci_mediate_ops->open().
> > > > > > > > > > > > 
> > > > > > > > > > > > If vfio-pci detects this cap, it will create a default
> > > > > > > > > > > > dynamic_trap_bar_info region on behalf of vendor driver 
> > > > > > > > > > > > with region len=0
> > > > > > > > > > > > and region->ops=null.
> > > > > > > > > > > > Vvendor driver should override this region's len, 
> > > > > > > > > > > > flags, rw, mmap in its
> > > > > > > > > > > > vfio_pci_mediate_ops.  
> > > > > > > > > > > 
> > > > > > > > > > > TBH, I don't like this interface at all.  Userspace 
> > > > > > > > > > > doesn't pass data
> > > > > > > > > > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl 
> > > > > > > > > > > for
> > > > > > > > > > > configuring user signaling with eventfds.  I think we 
> > > > > > > > > > > only need to
> > > > > > > > > > > define an IRQ type that tells the user to re-evaluate the 
> > > > > > > > > > > sparse mmap
> > > > > > > > > > > information for a region.  The user would enumerate the 
> > > > > > > > > > > device IRQs via
> > > > > > > > > > > GET_IRQ_INFO, find one of this type where the IRQ info 
> > > > > > > > > > > would also
> > > > > > > > > > > indicate which region(s) should be re-evaluated on 
> > > > > > > > > > > signaling.  The user
> > > > > > > > > > > would 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-11 Thread Alex Williamson
On Wed, 11 Dec 2019 21:02:40 -0500
Yan Zhao  wrote:

> On Thu, Dec 12, 2019 at 02:56:55AM +0800, Alex Williamson wrote:
> > On Wed, 11 Dec 2019 01:25:55 -0500
> > Yan Zhao  wrote:
> >   
> > > On Wed, Dec 11, 2019 at 12:38:05AM +0800, Alex Williamson wrote:  
> > > > On Tue, 10 Dec 2019 02:44:44 -0500
> > > > Yan Zhao  wrote:
> > > > 
> > > > > On Tue, Dec 10, 2019 at 05:16:08AM +0800, Alex Williamson wrote:
> > > > > > On Mon, 9 Dec 2019 01:22:12 -0500
> > > > > > Yan Zhao  wrote:
> > > > > >   
> > > > > > > On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson wrote:  
> > > > > > > 
> > > > > > > > On Fri, 6 Dec 2019 01:04:07 -0500
> > > > > > > > Yan Zhao  wrote:
> > > > > > > > 
> > > > > > > > > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson 
> > > > > > > > > wrote:
> > > > > > > > > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > > > > > > > > Yan Zhao  wrote:
> > > > > > > > > >   
> > > > > > > > > > > Dynamic trap bar info region is a channel for QEMU and 
> > > > > > > > > > > vendor driver to
> > > > > > > > > > > communicate dynamic trap info. It is of type
> > > > > > > > > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > > > > > > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > > > > > > > > 
> > > > > > > > > > > This region has two fields: dt_fd and trap.
> > > > > > > > > > > When QEMU detects a device regions of this type, it will 
> > > > > > > > > > > create an
> > > > > > > > > > > eventfd and write its eventfd id to dt_fd field.
> > > > > > > > > > > When vendor drivre signals this eventfd, QEMU reads trap 
> > > > > > > > > > > field of this
> > > > > > > > > > > info region.
> > > > > > > > > > > - If trap is true, QEMU would search the device's PCI BAR
> > > > > > > > > > > regions and disable all the sparse mmaped subregions (if 
> > > > > > > > > > > the sparse
> > > > > > > > > > > mmaped subregion is disablable).
> > > > > > > > > > > - If trap is false, QEMU would re-enable those subregions.
> > > > > > > > > > > 
> > > > > > > > > > > A typical usage is
> > > > > > > > > > > 1. vendor driver first cuts its bar 0 into several 
> > > > > > > > > > > sections, all in a
> > > > > > > > > > > sparse mmap array. So initally, all its bar 0 are 
> > > > > > > > > > > passthroughed.
> > > > > > > > > > > 2. vendor driver specifys part of bar 0 sections to be 
> > > > > > > > > > > disablable.
> > > > > > > > > > > 3. on migration starts, vendor driver signals dt_fd and 
> > > > > > > > > > > set trap to true
> > > > > > > > > > > to notify QEMU disabling the bar 0 sections of disablable 
> > > > > > > > > > > flags on.
> > > > > > > > > > > 4. QEMU disables those bar 0 section and hence let vendor 
> > > > > > > > > > > driver be able
> > > > > > > > > > > to trap access of bar 0 registers and make dirty page 
> > > > > > > > > > > tracking possible.
> > > > > > > > > > > 5. on migration failure, vendor driver signals dt_fd to 
> > > > > > > > > > > QEMU again.
> > > > > > > > > > > QEMU reads trap field of this info region which is false 
> > > > > > > > > > > and QEMU
> > > > > > > > > > > re-passthrough the whole bar 0 region.
> > > > > > > > > > > 
> > > > > > > > > > > Vendor driver specifies whether it supports 
> > > > > > > > > > > dynamic-trap-bar-info region
> > > > > > > > > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > > > > > > > > vfio_pci_mediate_ops->open().
> > > > > > > > > > > 
> > > > > > > > > > > If vfio-pci detects this cap, it will create a default
> > > > > > > > > > > dynamic_trap_bar_info region on behalf of vendor driver 
> > > > > > > > > > > with region len=0
> > > > > > > > > > > and region->ops=null.
> > > > > > > > > > > Vvendor driver should override this region's len, flags, 
> > > > > > > > > > > rw, mmap in its
> > > > > > > > > > > vfio_pci_mediate_ops.  
> > > > > > > > > > 
> > > > > > > > > > TBH, I don't like this interface at all.  Userspace doesn't 
> > > > > > > > > > pass data
> > > > > > > > > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > > > > > > > > > configuring user signaling with eventfds.  I think we only 
> > > > > > > > > > need to
> > > > > > > > > > define an IRQ type that tells the user to re-evaluate the 
> > > > > > > > > > sparse mmap
> > > > > > > > > > information for a region.  The user would enumerate the 
> > > > > > > > > > device IRQs via
> > > > > > > > > > GET_IRQ_INFO, find one of this type where the IRQ info 
> > > > > > > > > > would also
> > > > > > > > > > indicate which region(s) should be re-evaluated on 
> > > > > > > > > > signaling.  The user
> > > > > > > > > > would enable that signaling via SET_IRQS and simply 
> > > > > > > > > > re-evaluate the  
> > > > > > > > > ok. I'll try to switch to this way. Thanks for this 
> > > > > > > > > suggestion.
> > > > > > > > > 
> > > > > > > > > > sparse mmap capability for the associated regions when 
> > > > > > > > > > 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-11 Thread Yan Zhao
On Thu, Dec 12, 2019 at 02:56:55AM +0800, Alex Williamson wrote:
> On Wed, 11 Dec 2019 01:25:55 -0500
> Yan Zhao  wrote:
> 
> > On Wed, Dec 11, 2019 at 12:38:05AM +0800, Alex Williamson wrote:
> > > On Tue, 10 Dec 2019 02:44:44 -0500
> > > Yan Zhao  wrote:
> > >   
> > > > On Tue, Dec 10, 2019 at 05:16:08AM +0800, Alex Williamson wrote:  
> > > > > On Mon, 9 Dec 2019 01:22:12 -0500
> > > > > Yan Zhao  wrote:
> > > > > 
> > > > > > On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson wrote:
> > > > > > > On Fri, 6 Dec 2019 01:04:07 -0500
> > > > > > > Yan Zhao  wrote:
> > > > > > >   
> > > > > > > > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson 
> > > > > > > > wrote:  
> > > > > > > > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > > > > > > > Yan Zhao  wrote:
> > > > > > > > > 
> > > > > > > > > > Dynamic trap bar info region is a channel for QEMU and 
> > > > > > > > > > vendor driver to
> > > > > > > > > > communicate dynamic trap info. It is of type
> > > > > > > > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > > > > > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > > > > > > > 
> > > > > > > > > > This region has two fields: dt_fd and trap.
> > > > > > > > > > When QEMU detects a device regions of this type, it will 
> > > > > > > > > > create an
> > > > > > > > > > eventfd and write its eventfd id to dt_fd field.
> > > > > > > > > > When vendor drivre signals this eventfd, QEMU reads trap 
> > > > > > > > > > field of this
> > > > > > > > > > info region.
> > > > > > > > > > - If trap is true, QEMU would search the device's PCI BAR
> > > > > > > > > > regions and disable all the sparse mmaped subregions (if 
> > > > > > > > > > the sparse
> > > > > > > > > > mmaped subregion is disablable).
> > > > > > > > > > - If trap is false, QEMU would re-enable those subregions.
> > > > > > > > > > 
> > > > > > > > > > A typical usage is
> > > > > > > > > > 1. vendor driver first cuts its bar 0 into several 
> > > > > > > > > > sections, all in a
> > > > > > > > > > sparse mmap array. So initally, all its bar 0 are 
> > > > > > > > > > passthroughed.
> > > > > > > > > > 2. vendor driver specifys part of bar 0 sections to be 
> > > > > > > > > > disablable.
> > > > > > > > > > 3. on migration starts, vendor driver signals dt_fd and set 
> > > > > > > > > > trap to true
> > > > > > > > > > to notify QEMU disabling the bar 0 sections of disablable 
> > > > > > > > > > flags on.
> > > > > > > > > > 4. QEMU disables those bar 0 section and hence let vendor 
> > > > > > > > > > driver be able
> > > > > > > > > > to trap access of bar 0 registers and make dirty page 
> > > > > > > > > > tracking possible.
> > > > > > > > > > 5. on migration failure, vendor driver signals dt_fd to 
> > > > > > > > > > QEMU again.
> > > > > > > > > > QEMU reads trap field of this info region which is false 
> > > > > > > > > > and QEMU
> > > > > > > > > > re-passthrough the whole bar 0 region.
> > > > > > > > > > 
> > > > > > > > > > Vendor driver specifies whether it supports 
> > > > > > > > > > dynamic-trap-bar-info region
> > > > > > > > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > > > > > > > vfio_pci_mediate_ops->open().
> > > > > > > > > > 
> > > > > > > > > > If vfio-pci detects this cap, it will create a default
> > > > > > > > > > dynamic_trap_bar_info region on behalf of vendor driver 
> > > > > > > > > > with region len=0
> > > > > > > > > > and region->ops=null.
> > > > > > > > > > Vvendor driver should override this region's len, flags, 
> > > > > > > > > > rw, mmap in its
> > > > > > > > > > vfio_pci_mediate_ops.
> > > > > > > > > 
> > > > > > > > > TBH, I don't like this interface at all.  Userspace doesn't 
> > > > > > > > > pass data
> > > > > > > > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > > > > > > > > configuring user signaling with eventfds.  I think we only 
> > > > > > > > > need to
> > > > > > > > > define an IRQ type that tells the user to re-evaluate the 
> > > > > > > > > sparse mmap
> > > > > > > > > information for a region.  The user would enumerate the 
> > > > > > > > > device IRQs via
> > > > > > > > > GET_IRQ_INFO, find one of this type where the IRQ info would 
> > > > > > > > > also
> > > > > > > > > indicate which region(s) should be re-evaluated on signaling. 
> > > > > > > > >  The user
> > > > > > > > > would enable that signaling via SET_IRQS and simply 
> > > > > > > > > re-evaluate the
> > > > > > > > ok. I'll try to switch to this way. Thanks for this suggestion.
> > > > > > > >   
> > > > > > > > > sparse mmap capability for the associated regions when 
> > > > > > > > > signaled.
> > > > > > > > 
> > > > > > > > Do you like the "disablable" flag of sparse mmap ?
> > > > > > > > I think it's a lightweight way for user to switch mmap state of 
> > > > > > > > a whole region,
> > > > > > > > otherwise going through a complete flow of GET_REGION_INFO and 
> > > > 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-11 Thread Alex Williamson
On Wed, 11 Dec 2019 01:25:55 -0500
Yan Zhao  wrote:

> On Wed, Dec 11, 2019 at 12:38:05AM +0800, Alex Williamson wrote:
> > On Tue, 10 Dec 2019 02:44:44 -0500
> > Yan Zhao  wrote:
> >   
> > > On Tue, Dec 10, 2019 at 05:16:08AM +0800, Alex Williamson wrote:  
> > > > On Mon, 9 Dec 2019 01:22:12 -0500
> > > > Yan Zhao  wrote:
> > > > 
> > > > > On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson wrote:
> > > > > > On Fri, 6 Dec 2019 01:04:07 -0500
> > > > > > Yan Zhao  wrote:
> > > > > >   
> > > > > > > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson wrote:  
> > > > > > > 
> > > > > > > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > > > > > > Yan Zhao  wrote:
> > > > > > > > 
> > > > > > > > > Dynamic trap bar info region is a channel for QEMU and vendor 
> > > > > > > > > driver to
> > > > > > > > > communicate dynamic trap info. It is of type
> > > > > > > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > > > > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > > > > > > 
> > > > > > > > > This region has two fields: dt_fd and trap.
> > > > > > > > > When QEMU detects a device regions of this type, it will 
> > > > > > > > > create an
> > > > > > > > > eventfd and write its eventfd id to dt_fd field.
> > > > > > > > > When vendor drivre signals this eventfd, QEMU reads trap 
> > > > > > > > > field of this
> > > > > > > > > info region.
> > > > > > > > > - If trap is true, QEMU would search the device's PCI BAR
> > > > > > > > > regions and disable all the sparse mmaped subregions (if the 
> > > > > > > > > sparse
> > > > > > > > > mmaped subregion is disablable).
> > > > > > > > > - If trap is false, QEMU would re-enable those subregions.
> > > > > > > > > 
> > > > > > > > > A typical usage is
> > > > > > > > > 1. vendor driver first cuts its bar 0 into several sections, 
> > > > > > > > > all in a
> > > > > > > > > sparse mmap array. So initally, all its bar 0 are 
> > > > > > > > > passthroughed.
> > > > > > > > > 2. vendor driver specifys part of bar 0 sections to be 
> > > > > > > > > disablable.
> > > > > > > > > 3. on migration starts, vendor driver signals dt_fd and set 
> > > > > > > > > trap to true
> > > > > > > > > to notify QEMU disabling the bar 0 sections of disablable 
> > > > > > > > > flags on.
> > > > > > > > > 4. QEMU disables those bar 0 section and hence let vendor 
> > > > > > > > > driver be able
> > > > > > > > > to trap access of bar 0 registers and make dirty page 
> > > > > > > > > tracking possible.
> > > > > > > > > 5. on migration failure, vendor driver signals dt_fd to QEMU 
> > > > > > > > > again.
> > > > > > > > > QEMU reads trap field of this info region which is false and 
> > > > > > > > > QEMU
> > > > > > > > > re-passthrough the whole bar 0 region.
> > > > > > > > > 
> > > > > > > > > Vendor driver specifies whether it supports 
> > > > > > > > > dynamic-trap-bar-info region
> > > > > > > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > > > > > > vfio_pci_mediate_ops->open().
> > > > > > > > > 
> > > > > > > > > If vfio-pci detects this cap, it will create a default
> > > > > > > > > dynamic_trap_bar_info region on behalf of vendor driver with 
> > > > > > > > > region len=0
> > > > > > > > > and region->ops=null.
> > > > > > > > > Vvendor driver should override this region's len, flags, rw, 
> > > > > > > > > mmap in its
> > > > > > > > > vfio_pci_mediate_ops.
> > > > > > > > 
> > > > > > > > TBH, I don't like this interface at all.  Userspace doesn't 
> > > > > > > > pass data
> > > > > > > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > > > > > > > configuring user signaling with eventfds.  I think we only need 
> > > > > > > > to
> > > > > > > > define an IRQ type that tells the user to re-evaluate the 
> > > > > > > > sparse mmap
> > > > > > > > information for a region.  The user would enumerate the device 
> > > > > > > > IRQs via
> > > > > > > > GET_IRQ_INFO, find one of this type where the IRQ info would 
> > > > > > > > also
> > > > > > > > indicate which region(s) should be re-evaluated on signaling.  
> > > > > > > > The user
> > > > > > > > would enable that signaling via SET_IRQS and simply re-evaluate 
> > > > > > > > the
> > > > > > > ok. I'll try to switch to this way. Thanks for this suggestion.
> > > > > > >   
> > > > > > > > sparse mmap capability for the associated regions when 
> > > > > > > > signaled.
> > > > > > > 
> > > > > > > Do you like the "disablable" flag of sparse mmap ?
> > > > > > > I think it's a lightweight way for user to switch mmap state of a 
> > > > > > > whole region,
> > > > > > > otherwise going through a complete flow of GET_REGION_INFO and 
> > > > > > > re-setup
> > > > > > > region might be too heavy.  
> > > > > > 
> > > > > > No, I don't like the disable-able flag.  At what frequency do we 
> > > > > > expect
> > > > > > regions to change?  It seems like we'd only change when switching 
> > > > 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-10 Thread Yan Zhao
On Wed, Dec 11, 2019 at 12:38:05AM +0800, Alex Williamson wrote:
> On Tue, 10 Dec 2019 02:44:44 -0500
> Yan Zhao  wrote:
> 
> > On Tue, Dec 10, 2019 at 05:16:08AM +0800, Alex Williamson wrote:
> > > On Mon, 9 Dec 2019 01:22:12 -0500
> > > Yan Zhao  wrote:
> > >   
> > > > On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson wrote:  
> > > > > On Fri, 6 Dec 2019 01:04:07 -0500
> > > > > Yan Zhao  wrote:
> > > > > 
> > > > > > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson wrote:
> > > > > > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > > > > > Yan Zhao  wrote:
> > > > > > >   
> > > > > > > > Dynamic trap bar info region is a channel for QEMU and vendor 
> > > > > > > > driver to
> > > > > > > > communicate dynamic trap info. It is of type
> > > > > > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > > > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > > > > > 
> > > > > > > > This region has two fields: dt_fd and trap.
> > > > > > > > When QEMU detects a device regions of this type, it will create 
> > > > > > > > an
> > > > > > > > eventfd and write its eventfd id to dt_fd field.
> > > > > > > > When vendor drivre signals this eventfd, QEMU reads trap field 
> > > > > > > > of this
> > > > > > > > info region.
> > > > > > > > - If trap is true, QEMU would search the device's PCI BAR
> > > > > > > > regions and disable all the sparse mmaped subregions (if the 
> > > > > > > > sparse
> > > > > > > > mmaped subregion is disablable).
> > > > > > > > - If trap is false, QEMU would re-enable those subregions.
> > > > > > > > 
> > > > > > > > A typical usage is
> > > > > > > > 1. vendor driver first cuts its bar 0 into several sections, 
> > > > > > > > all in a
> > > > > > > > sparse mmap array. So initally, all its bar 0 are passthroughed.
> > > > > > > > 2. vendor driver specifys part of bar 0 sections to be 
> > > > > > > > disablable.
> > > > > > > > 3. on migration starts, vendor driver signals dt_fd and set 
> > > > > > > > trap to true
> > > > > > > > to notify QEMU disabling the bar 0 sections of disablable flags 
> > > > > > > > on.
> > > > > > > > 4. QEMU disables those bar 0 section and hence let vendor 
> > > > > > > > driver be able
> > > > > > > > to trap access of bar 0 registers and make dirty page tracking 
> > > > > > > > possible.
> > > > > > > > 5. on migration failure, vendor driver signals dt_fd to QEMU 
> > > > > > > > again.
> > > > > > > > QEMU reads trap field of this info region which is false and 
> > > > > > > > QEMU
> > > > > > > > re-passthrough the whole bar 0 region.
> > > > > > > > 
> > > > > > > > Vendor driver specifies whether it supports 
> > > > > > > > dynamic-trap-bar-info region
> > > > > > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > > > > > vfio_pci_mediate_ops->open().
> > > > > > > > 
> > > > > > > > If vfio-pci detects this cap, it will create a default
> > > > > > > > dynamic_trap_bar_info region on behalf of vendor driver with 
> > > > > > > > region len=0
> > > > > > > > and region->ops=null.
> > > > > > > > Vvendor driver should override this region's len, flags, rw, 
> > > > > > > > mmap in its
> > > > > > > > vfio_pci_mediate_ops.  
> > > > > > > 
> > > > > > > TBH, I don't like this interface at all.  Userspace doesn't pass 
> > > > > > > data
> > > > > > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > > > > > > configuring user signaling with eventfds.  I think we only need to
> > > > > > > define an IRQ type that tells the user to re-evaluate the sparse 
> > > > > > > mmap
> > > > > > > information for a region.  The user would enumerate the device 
> > > > > > > IRQs via
> > > > > > > GET_IRQ_INFO, find one of this type where the IRQ info would also
> > > > > > > indicate which region(s) should be re-evaluated on signaling.  
> > > > > > > The user
> > > > > > > would enable that signaling via SET_IRQS and simply re-evaluate 
> > > > > > > the  
> > > > > > ok. I'll try to switch to this way. Thanks for this suggestion.
> > > > > > 
> > > > > > > sparse mmap capability for the associated regions when signaled.  
> > > > > > > 
> > > > > > 
> > > > > > Do you like the "disablable" flag of sparse mmap ?
> > > > > > I think it's a lightweight way for user to switch mmap state of a 
> > > > > > whole region,
> > > > > > otherwise going through a complete flow of GET_REGION_INFO and 
> > > > > > re-setup
> > > > > > region might be too heavy.
> > > > > 
> > > > > No, I don't like the disable-able flag.  At what frequency do we 
> > > > > expect
> > > > > regions to change?  It seems like we'd only change when switching into
> > > > > and out of the _SAVING state, which is rare.  It seems easy for
> > > > > userspace, at least QEMU, to drop the entire mmap configuration and   
> > > > >  
> > > > ok. I'll try this way.
> > > >   
> > > > > re-read it.  Another concern here is how do we synchronize the event?
> > > > > Are we assuming that this event would 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-10 Thread Alex Williamson
On Tue, 10 Dec 2019 02:44:44 -0500
Yan Zhao  wrote:

> On Tue, Dec 10, 2019 at 05:16:08AM +0800, Alex Williamson wrote:
> > On Mon, 9 Dec 2019 01:22:12 -0500
> > Yan Zhao  wrote:
> >   
> > > On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson wrote:  
> > > > On Fri, 6 Dec 2019 01:04:07 -0500
> > > > Yan Zhao  wrote:
> > > > 
> > > > > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson wrote:
> > > > > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > > > > Yan Zhao  wrote:
> > > > > >   
> > > > > > > Dynamic trap bar info region is a channel for QEMU and vendor 
> > > > > > > driver to
> > > > > > > communicate dynamic trap info. It is of type
> > > > > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > > > > 
> > > > > > > This region has two fields: dt_fd and trap.
> > > > > > > When QEMU detects a device regions of this type, it will create an
> > > > > > > eventfd and write its eventfd id to dt_fd field.
> > > > > > > When vendor drivre signals this eventfd, QEMU reads trap field of 
> > > > > > > this
> > > > > > > info region.
> > > > > > > - If trap is true, QEMU would search the device's PCI BAR
> > > > > > > regions and disable all the sparse mmaped subregions (if the 
> > > > > > > sparse
> > > > > > > mmaped subregion is disablable).
> > > > > > > - If trap is false, QEMU would re-enable those subregions.
> > > > > > > 
> > > > > > > A typical usage is
> > > > > > > 1. vendor driver first cuts its bar 0 into several sections, all 
> > > > > > > in a
> > > > > > > sparse mmap array. So initally, all its bar 0 are passthroughed.
> > > > > > > 2. vendor driver specifys part of bar 0 sections to be disablable.
> > > > > > > 3. on migration starts, vendor driver signals dt_fd and set trap 
> > > > > > > to true
> > > > > > > to notify QEMU disabling the bar 0 sections of disablable flags 
> > > > > > > on.
> > > > > > > 4. QEMU disables those bar 0 section and hence let vendor driver 
> > > > > > > be able
> > > > > > > to trap access of bar 0 registers and make dirty page tracking 
> > > > > > > possible.
> > > > > > > 5. on migration failure, vendor driver signals dt_fd to QEMU 
> > > > > > > again.
> > > > > > > QEMU reads trap field of this info region which is false and QEMU
> > > > > > > re-passthrough the whole bar 0 region.
> > > > > > > 
> > > > > > > Vendor driver specifies whether it supports dynamic-trap-bar-info 
> > > > > > > region
> > > > > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > > > > vfio_pci_mediate_ops->open().
> > > > > > > 
> > > > > > > If vfio-pci detects this cap, it will create a default
> > > > > > > dynamic_trap_bar_info region on behalf of vendor driver with 
> > > > > > > region len=0
> > > > > > > and region->ops=null.
> > > > > > > Vvendor driver should override this region's len, flags, rw, mmap 
> > > > > > > in its
> > > > > > > vfio_pci_mediate_ops.  
> > > > > > 
> > > > > > TBH, I don't like this interface at all.  Userspace doesn't pass 
> > > > > > data
> > > > > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > > > > > configuring user signaling with eventfds.  I think we only need to
> > > > > > define an IRQ type that tells the user to re-evaluate the sparse 
> > > > > > mmap
> > > > > > information for a region.  The user would enumerate the device IRQs 
> > > > > > via
> > > > > > GET_IRQ_INFO, find one of this type where the IRQ info would also
> > > > > > indicate which region(s) should be re-evaluated on signaling.  The 
> > > > > > user
> > > > > > would enable that signaling via SET_IRQS and simply re-evaluate the 
> > > > > >  
> > > > > ok. I'll try to switch to this way. Thanks for this suggestion.
> > > > > 
> > > > > > sparse mmap capability for the associated regions when signaled.
> > > > > >   
> > > > > 
> > > > > Do you like the "disablable" flag of sparse mmap ?
> > > > > I think it's a lightweight way for user to switch mmap state of a 
> > > > > whole region,
> > > > > otherwise going through a complete flow of GET_REGION_INFO and 
> > > > > re-setup
> > > > > region might be too heavy.
> > > > 
> > > > No, I don't like the disable-able flag.  At what frequency do we expect
> > > > regions to change?  It seems like we'd only change when switching into
> > > > and out of the _SAVING state, which is rare.  It seems easy for
> > > > userspace, at least QEMU, to drop the entire mmap configuration and
> > > ok. I'll try this way.
> > >   
> > > > re-read it.  Another concern here is how do we synchronize the event?
> > > > Are we assuming that this event would occur when a user switch to
> > > > _SAVING mode on the device?  That operation is synchronous, the device
> > > > must be in saving mode after the write to device state completes, but
> > > > it seems like this might be trying to add an asynchronous dependency.
> > > > Will the write to device_state only complete once the user 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-09 Thread Yan Zhao
On Tue, Dec 10, 2019 at 05:16:08AM +0800, Alex Williamson wrote:
> On Mon, 9 Dec 2019 01:22:12 -0500
> Yan Zhao  wrote:
> 
> > On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson wrote:
> > > On Fri, 6 Dec 2019 01:04:07 -0500
> > > Yan Zhao  wrote:
> > >   
> > > > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson wrote:  
> > > > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > > > Yan Zhao  wrote:
> > > > > 
> > > > > > Dynamic trap bar info region is a channel for QEMU and vendor 
> > > > > > driver to
> > > > > > communicate dynamic trap info. It is of type
> > > > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > > > 
> > > > > > This region has two fields: dt_fd and trap.
> > > > > > When QEMU detects a device regions of this type, it will create an
> > > > > > eventfd and write its eventfd id to dt_fd field.
> > > > > > When vendor drivre signals this eventfd, QEMU reads trap field of 
> > > > > > this
> > > > > > info region.
> > > > > > - If trap is true, QEMU would search the device's PCI BAR
> > > > > > regions and disable all the sparse mmaped subregions (if the sparse
> > > > > > mmaped subregion is disablable).
> > > > > > - If trap is false, QEMU would re-enable those subregions.
> > > > > > 
> > > > > > A typical usage is
> > > > > > 1. vendor driver first cuts its bar 0 into several sections, all in 
> > > > > > a
> > > > > > sparse mmap array. So initally, all its bar 0 are passthroughed.
> > > > > > 2. vendor driver specifys part of bar 0 sections to be disablable.
> > > > > > 3. on migration starts, vendor driver signals dt_fd and set trap to 
> > > > > > true
> > > > > > to notify QEMU disabling the bar 0 sections of disablable flags on.
> > > > > > 4. QEMU disables those bar 0 section and hence let vendor driver be 
> > > > > > able
> > > > > > to trap access of bar 0 registers and make dirty page tracking 
> > > > > > possible.
> > > > > > 5. on migration failure, vendor driver signals dt_fd to QEMU again.
> > > > > > QEMU reads trap field of this info region which is false and QEMU
> > > > > > re-passthrough the whole bar 0 region.
> > > > > > 
> > > > > > Vendor driver specifies whether it supports dynamic-trap-bar-info 
> > > > > > region
> > > > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > > > vfio_pci_mediate_ops->open().
> > > > > > 
> > > > > > If vfio-pci detects this cap, it will create a default
> > > > > > dynamic_trap_bar_info region on behalf of vendor driver with region 
> > > > > > len=0
> > > > > > and region->ops=null.
> > > > > > Vvendor driver should override this region's len, flags, rw, mmap 
> > > > > > in its
> > > > > > vfio_pci_mediate_ops.
> > > > > 
> > > > > TBH, I don't like this interface at all.  Userspace doesn't pass data
> > > > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > > > > configuring user signaling with eventfds.  I think we only need to
> > > > > define an IRQ type that tells the user to re-evaluate the sparse mmap
> > > > > information for a region.  The user would enumerate the device IRQs 
> > > > > via
> > > > > GET_IRQ_INFO, find one of this type where the IRQ info would also
> > > > > indicate which region(s) should be re-evaluated on signaling.  The 
> > > > > user
> > > > > would enable that signaling via SET_IRQS and simply re-evaluate the   
> > > > >  
> > > > ok. I'll try to switch to this way. Thanks for this suggestion.
> > > >   
> > > > > sparse mmap capability for the associated regions when signaled.
> > > > 
> > > > Do you like the "disablable" flag of sparse mmap ?
> > > > I think it's a lightweight way for user to switch mmap state of a whole 
> > > > region,
> > > > otherwise going through a complete flow of GET_REGION_INFO and re-setup
> > > > region might be too heavy.  
> > > 
> > > No, I don't like the disable-able flag.  At what frequency do we expect
> > > regions to change?  It seems like we'd only change when switching into
> > > and out of the _SAVING state, which is rare.  It seems easy for
> > > userspace, at least QEMU, to drop the entire mmap configuration and  
> > ok. I'll try this way.
> > 
> > > re-read it.  Another concern here is how do we synchronize the event?
> > > Are we assuming that this event would occur when a user switch to
> > > _SAVING mode on the device?  That operation is synchronous, the device
> > > must be in saving mode after the write to device state completes, but
> > > it seems like this might be trying to add an asynchronous dependency.
> > > Will the write to device_state only complete once the user handles the
> > > eventfd?  How would the kernel know when the mmap re-evaluation is
> > > complete.  It seems like there are gaps here that the vendor driver
> > > could miss traps required for migration because the user hasn't
> > > completed the mmap transition yet.  Thanks,
> > > 
> > > Alex  
> > 
> > yes, this asynchronous event notification will cause 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-09 Thread Alex Williamson
On Mon, 9 Dec 2019 01:22:12 -0500
Yan Zhao  wrote:

> On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson wrote:
> > On Fri, 6 Dec 2019 01:04:07 -0500
> > Yan Zhao  wrote:
> >   
> > > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson wrote:  
> > > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > > Yan Zhao  wrote:
> > > > 
> > > > > Dynamic trap bar info region is a channel for QEMU and vendor driver 
> > > > > to
> > > > > communicate dynamic trap info. It is of type
> > > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > > 
> > > > > This region has two fields: dt_fd and trap.
> > > > > When QEMU detects a device regions of this type, it will create an
> > > > > eventfd and write its eventfd id to dt_fd field.
> > > > > When vendor drivre signals this eventfd, QEMU reads trap field of this
> > > > > info region.
> > > > > - If trap is true, QEMU would search the device's PCI BAR
> > > > > regions and disable all the sparse mmaped subregions (if the sparse
> > > > > mmaped subregion is disablable).
> > > > > - If trap is false, QEMU would re-enable those subregions.
> > > > > 
> > > > > A typical usage is
> > > > > 1. vendor driver first cuts its bar 0 into several sections, all in a
> > > > > sparse mmap array. So initally, all its bar 0 are passthroughed.
> > > > > 2. vendor driver specifys part of bar 0 sections to be disablable.
> > > > > 3. on migration starts, vendor driver signals dt_fd and set trap to 
> > > > > true
> > > > > to notify QEMU disabling the bar 0 sections of disablable flags on.
> > > > > 4. QEMU disables those bar 0 section and hence let vendor driver be 
> > > > > able
> > > > > to trap access of bar 0 registers and make dirty page tracking 
> > > > > possible.
> > > > > 5. on migration failure, vendor driver signals dt_fd to QEMU again.
> > > > > QEMU reads trap field of this info region which is false and QEMU
> > > > > re-passthrough the whole bar 0 region.
> > > > > 
> > > > > Vendor driver specifies whether it supports dynamic-trap-bar-info 
> > > > > region
> > > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > > vfio_pci_mediate_ops->open().
> > > > > 
> > > > > If vfio-pci detects this cap, it will create a default
> > > > > dynamic_trap_bar_info region on behalf of vendor driver with region 
> > > > > len=0
> > > > > and region->ops=null.
> > > > > Vvendor driver should override this region's len, flags, rw, mmap in 
> > > > > its
> > > > > vfio_pci_mediate_ops.
> > > > 
> > > > TBH, I don't like this interface at all.  Userspace doesn't pass data
> > > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > > > configuring user signaling with eventfds.  I think we only need to
> > > > define an IRQ type that tells the user to re-evaluate the sparse mmap
> > > > information for a region.  The user would enumerate the device IRQs via
> > > > GET_IRQ_INFO, find one of this type where the IRQ info would also
> > > > indicate which region(s) should be re-evaluated on signaling.  The user
> > > > would enable that signaling via SET_IRQS and simply re-evaluate the
> > > ok. I'll try to switch to this way. Thanks for this suggestion.
> > >   
> > > > sparse mmap capability for the associated regions when signaled.
> > > 
> > > Do you like the "disablable" flag of sparse mmap ?
> > > I think it's a lightweight way for user to switch mmap state of a whole 
> > > region,
> > > otherwise going through a complete flow of GET_REGION_INFO and re-setup
> > > region might be too heavy.  
> > 
> > No, I don't like the disable-able flag.  At what frequency do we expect
> > regions to change?  It seems like we'd only change when switching into
> > and out of the _SAVING state, which is rare.  It seems easy for
> > userspace, at least QEMU, to drop the entire mmap configuration and  
> ok. I'll try this way.
> 
> > re-read it.  Another concern here is how do we synchronize the event?
> > Are we assuming that this event would occur when a user switch to
> > _SAVING mode on the device?  That operation is synchronous, the device
> > must be in saving mode after the write to device state completes, but
> > it seems like this might be trying to add an asynchronous dependency.
> > Will the write to device_state only complete once the user handles the
> > eventfd?  How would the kernel know when the mmap re-evaluation is
> > complete.  It seems like there are gaps here that the vendor driver
> > could miss traps required for migration because the user hasn't
> > completed the mmap transition yet.  Thanks,
> > 
> > Alex  
> 
> yes, this asynchronous event notification will cause vendor driver miss
> traps. But it's supposed to be of very short period time. That's also a
> reason for us to wish the re-evaluation to be lightweight. E.g. if it's
> able to be finished before the first iterate, it's still safe.

Making the re-evaluation lightweight cannot solve the race, it only
masks 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-08 Thread Yan Zhao
On Fri, Dec 06, 2019 at 11:20:38PM +0800, Alex Williamson wrote:
> On Fri, 6 Dec 2019 01:04:07 -0500
> Yan Zhao  wrote:
> 
> > On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson wrote:
> > > On Wed,  4 Dec 2019 22:26:50 -0500
> > > Yan Zhao  wrote:
> > >   
> > > > Dynamic trap bar info region is a channel for QEMU and vendor driver to
> > > > communicate dynamic trap info. It is of type
> > > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > > 
> > > > This region has two fields: dt_fd and trap.
> > > > When QEMU detects a device regions of this type, it will create an
> > > > eventfd and write its eventfd id to dt_fd field.
> > > > When vendor drivre signals this eventfd, QEMU reads trap field of this
> > > > info region.
> > > > - If trap is true, QEMU would search the device's PCI BAR
> > > > regions and disable all the sparse mmaped subregions (if the sparse
> > > > mmaped subregion is disablable).
> > > > - If trap is false, QEMU would re-enable those subregions.
> > > > 
> > > > A typical usage is
> > > > 1. vendor driver first cuts its bar 0 into several sections, all in a
> > > > sparse mmap array. So initally, all its bar 0 are passthroughed.
> > > > 2. vendor driver specifys part of bar 0 sections to be disablable.
> > > > 3. on migration starts, vendor driver signals dt_fd and set trap to true
> > > > to notify QEMU disabling the bar 0 sections of disablable flags on.
> > > > 4. QEMU disables those bar 0 section and hence let vendor driver be able
> > > > to trap access of bar 0 registers and make dirty page tracking possible.
> > > > 5. on migration failure, vendor driver signals dt_fd to QEMU again.
> > > > QEMU reads trap field of this info region which is false and QEMU
> > > > re-passthrough the whole bar 0 region.
> > > > 
> > > > Vendor driver specifies whether it supports dynamic-trap-bar-info region
> > > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > > vfio_pci_mediate_ops->open().
> > > > 
> > > > If vfio-pci detects this cap, it will create a default
> > > > dynamic_trap_bar_info region on behalf of vendor driver with region 
> > > > len=0
> > > > and region->ops=null.
> > > > Vvendor driver should override this region's len, flags, rw, mmap in its
> > > > vfio_pci_mediate_ops.  
> > > 
> > > TBH, I don't like this interface at all.  Userspace doesn't pass data
> > > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > > configuring user signaling with eventfds.  I think we only need to
> > > define an IRQ type that tells the user to re-evaluate the sparse mmap
> > > information for a region.  The user would enumerate the device IRQs via
> > > GET_IRQ_INFO, find one of this type where the IRQ info would also
> > > indicate which region(s) should be re-evaluated on signaling.  The user
> > > would enable that signaling via SET_IRQS and simply re-evaluate the  
> > ok. I'll try to switch to this way. Thanks for this suggestion.
> > 
> > > sparse mmap capability for the associated regions when signaled.  
> > 
> > Do you like the "disablable" flag of sparse mmap ?
> > I think it's a lightweight way for user to switch mmap state of a whole 
> > region,
> > otherwise going through a complete flow of GET_REGION_INFO and re-setup
> > region might be too heavy.
> 
> No, I don't like the disable-able flag.  At what frequency do we expect
> regions to change?  It seems like we'd only change when switching into
> and out of the _SAVING state, which is rare.  It seems easy for
> userspace, at least QEMU, to drop the entire mmap configuration and
ok. I'll try this way.

> re-read it.  Another concern here is how do we synchronize the event?
> Are we assuming that this event would occur when a user switch to
> _SAVING mode on the device?  That operation is synchronous, the device
> must be in saving mode after the write to device state completes, but
> it seems like this might be trying to add an asynchronous dependency.
> Will the write to device_state only complete once the user handles the
> eventfd?  How would the kernel know when the mmap re-evaluation is
> complete.  It seems like there are gaps here that the vendor driver
> could miss traps required for migration because the user hasn't
> completed the mmap transition yet.  Thanks,
> 
> Alex

yes, this asynchronous event notification will cause vendor driver miss
traps. But it's supposed to be of very short period time. That's also a
reason for us to wish the re-evaluation to be lightweight. E.g. if it's
able to be finished before the first iterate, it's still safe.

But I agree, the timing is not guaranteed, and so it's best for kernel
to wait for mmap re-evaluation to complete. 

migration_thread
|->qemu_savevm_state_setup
|   |->ram_save_setup
|   |   |->migration_bitmap_sync
|   |   |->kvm_log_sync
|   |   |->vfio_log_sync
|   |
|   |->vfio_save_setup
|   |->set_device_state(_SAVING)
|

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-06 Thread Alex Williamson
On Fri, 6 Dec 2019 01:04:07 -0500
Yan Zhao  wrote:

> On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson wrote:
> > On Wed,  4 Dec 2019 22:26:50 -0500
> > Yan Zhao  wrote:
> >   
> > > Dynamic trap bar info region is a channel for QEMU and vendor driver to
> > > communicate dynamic trap info. It is of type
> > > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > > 
> > > This region has two fields: dt_fd and trap.
> > > When QEMU detects a device regions of this type, it will create an
> > > eventfd and write its eventfd id to dt_fd field.
> > > When vendor drivre signals this eventfd, QEMU reads trap field of this
> > > info region.
> > > - If trap is true, QEMU would search the device's PCI BAR
> > > regions and disable all the sparse mmaped subregions (if the sparse
> > > mmaped subregion is disablable).
> > > - If trap is false, QEMU would re-enable those subregions.
> > > 
> > > A typical usage is
> > > 1. vendor driver first cuts its bar 0 into several sections, all in a
> > > sparse mmap array. So initally, all its bar 0 are passthroughed.
> > > 2. vendor driver specifys part of bar 0 sections to be disablable.
> > > 3. on migration starts, vendor driver signals dt_fd and set trap to true
> > > to notify QEMU disabling the bar 0 sections of disablable flags on.
> > > 4. QEMU disables those bar 0 section and hence let vendor driver be able
> > > to trap access of bar 0 registers and make dirty page tracking possible.
> > > 5. on migration failure, vendor driver signals dt_fd to QEMU again.
> > > QEMU reads trap field of this info region which is false and QEMU
> > > re-passthrough the whole bar 0 region.
> > > 
> > > Vendor driver specifies whether it supports dynamic-trap-bar-info region
> > > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > > vfio_pci_mediate_ops->open().
> > > 
> > > If vfio-pci detects this cap, it will create a default
> > > dynamic_trap_bar_info region on behalf of vendor driver with region len=0
> > > and region->ops=null.
> > > Vvendor driver should override this region's len, flags, rw, mmap in its
> > > vfio_pci_mediate_ops.  
> > 
> > TBH, I don't like this interface at all.  Userspace doesn't pass data
> > to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> > configuring user signaling with eventfds.  I think we only need to
> > define an IRQ type that tells the user to re-evaluate the sparse mmap
> > information for a region.  The user would enumerate the device IRQs via
> > GET_IRQ_INFO, find one of this type where the IRQ info would also
> > indicate which region(s) should be re-evaluated on signaling.  The user
> > would enable that signaling via SET_IRQS and simply re-evaluate the  
> ok. I'll try to switch to this way. Thanks for this suggestion.
> 
> > sparse mmap capability for the associated regions when signaled.  
> 
> Do you like the "disablable" flag of sparse mmap ?
> I think it's a lightweight way for user to switch mmap state of a whole 
> region,
> otherwise going through a complete flow of GET_REGION_INFO and re-setup
> region might be too heavy.

No, I don't like the disable-able flag.  At what frequency do we expect
regions to change?  It seems like we'd only change when switching into
and out of the _SAVING state, which is rare.  It seems easy for
userspace, at least QEMU, to drop the entire mmap configuration and
re-read it.  Another concern here is how do we synchronize the event?
Are we assuming that this event would occur when a user switch to
_SAVING mode on the device?  That operation is synchronous, the device
must be in saving mode after the write to device state completes, but
it seems like this might be trying to add an asynchronous dependency.
Will the write to device_state only complete once the user handles the
eventfd?  How would the kernel know when the mmap re-evaluation is
complete.  It seems like there are gaps here that the vendor driver
could miss traps required for migration because the user hasn't
completed the mmap transition yet.  Thanks,

Alex

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list



Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-05 Thread Yan Zhao
On Fri, Dec 06, 2019 at 07:55:30AM +0800, Alex Williamson wrote:
> On Wed,  4 Dec 2019 22:26:50 -0500
> Yan Zhao  wrote:
> 
> > Dynamic trap bar info region is a channel for QEMU and vendor driver to
> > communicate dynamic trap info. It is of type
> > VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> > VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> > 
> > This region has two fields: dt_fd and trap.
> > When QEMU detects a device regions of this type, it will create an
> > eventfd and write its eventfd id to dt_fd field.
> > When vendor drivre signals this eventfd, QEMU reads trap field of this
> > info region.
> > - If trap is true, QEMU would search the device's PCI BAR
> > regions and disable all the sparse mmaped subregions (if the sparse
> > mmaped subregion is disablable).
> > - If trap is false, QEMU would re-enable those subregions.
> > 
> > A typical usage is
> > 1. vendor driver first cuts its bar 0 into several sections, all in a
> > sparse mmap array. So initally, all its bar 0 are passthroughed.
> > 2. vendor driver specifys part of bar 0 sections to be disablable.
> > 3. on migration starts, vendor driver signals dt_fd and set trap to true
> > to notify QEMU disabling the bar 0 sections of disablable flags on.
> > 4. QEMU disables those bar 0 section and hence let vendor driver be able
> > to trap access of bar 0 registers and make dirty page tracking possible.
> > 5. on migration failure, vendor driver signals dt_fd to QEMU again.
> > QEMU reads trap field of this info region which is false and QEMU
> > re-passthrough the whole bar 0 region.
> > 
> > Vendor driver specifies whether it supports dynamic-trap-bar-info region
> > through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> > vfio_pci_mediate_ops->open().
> > 
> > If vfio-pci detects this cap, it will create a default
> > dynamic_trap_bar_info region on behalf of vendor driver with region len=0
> > and region->ops=null.
> > Vvendor driver should override this region's len, flags, rw, mmap in its
> > vfio_pci_mediate_ops.
> 
> TBH, I don't like this interface at all.  Userspace doesn't pass data
> to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
> configuring user signaling with eventfds.  I think we only need to
> define an IRQ type that tells the user to re-evaluate the sparse mmap
> information for a region.  The user would enumerate the device IRQs via
> GET_IRQ_INFO, find one of this type where the IRQ info would also
> indicate which region(s) should be re-evaluated on signaling.  The user
> would enable that signaling via SET_IRQS and simply re-evaluate the
ok. I'll try to switch to this way. Thanks for this suggestion.

> sparse mmap capability for the associated regions when signaled.

Do you like the "disablable" flag of sparse mmap ?
I think it's a lightweight way for user to switch mmap state of a whole region,
otherwise going through a complete flow of GET_REGION_INFO and re-setup
region might be too heavy.

Thanks
Yan

> Thanks,
> 
> Alex
>




> > 
> > Cc: Kevin Tian 
> > 
> > Signed-off-by: Yan Zhao 
> > ---
> >  drivers/vfio/pci/vfio_pci.c | 16 
> >  include/linux/vfio.h|  3 ++-
> >  include/uapi/linux/vfio.h   | 11 +++
> >  3 files changed, 29 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > index 059660328be2..62b811ca43e4 100644
> > --- a/drivers/vfio/pci/vfio_pci.c
> > +++ b/drivers/vfio/pci/vfio_pci.c
> > @@ -127,6 +127,19 @@ void init_migration_region(struct vfio_pci_device 
> > *vdev)
> > NULL);
> >  }
> >  
> > +/**
> > + * register a region to hold info for dynamically trap bar regions
> > + */
> > +void init_dynamic_trap_bar_info_region(struct vfio_pci_device *vdev)
> > +{
> > +   vfio_pci_register_dev_region(vdev,
> > +   VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO,
> > +   VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO,
> > +   NULL, 0,
> > +   VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
> > +   NULL);
> > +}
> > +
> >  static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >  {
> > struct resource *res;
> > @@ -538,6 +551,9 @@ static int vfio_pci_open(void *device_data)
> > if (caps & VFIO_PCI_DEVICE_CAP_MIGRATION)
> > init_migration_region(vdev);
> >  
> > +   if (caps & VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR)
> > +   init_dynamic_trap_bar_info_region(vdev);
> > +
> > pr_info("vfio pci found mediate_ops %s, 
> > caps=%llx, handle=%x for %x:%x\n",
> > vdev->mediate_ops->name, caps,
> > handle, vdev->pdev->vendor,
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > index cddea8e9dcb2..cf8ecf687bee 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -197,7 +197,8 @@ 

Re: [libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-05 Thread Alex Williamson
On Wed,  4 Dec 2019 22:26:50 -0500
Yan Zhao  wrote:

> Dynamic trap bar info region is a channel for QEMU and vendor driver to
> communicate dynamic trap info. It is of type
> VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
> VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.
> 
> This region has two fields: dt_fd and trap.
> When QEMU detects a device regions of this type, it will create an
> eventfd and write its eventfd id to dt_fd field.
> When vendor drivre signals this eventfd, QEMU reads trap field of this
> info region.
> - If trap is true, QEMU would search the device's PCI BAR
> regions and disable all the sparse mmaped subregions (if the sparse
> mmaped subregion is disablable).
> - If trap is false, QEMU would re-enable those subregions.
> 
> A typical usage is
> 1. vendor driver first cuts its bar 0 into several sections, all in a
> sparse mmap array. So initally, all its bar 0 are passthroughed.
> 2. vendor driver specifys part of bar 0 sections to be disablable.
> 3. on migration starts, vendor driver signals dt_fd and set trap to true
> to notify QEMU disabling the bar 0 sections of disablable flags on.
> 4. QEMU disables those bar 0 section and hence let vendor driver be able
> to trap access of bar 0 registers and make dirty page tracking possible.
> 5. on migration failure, vendor driver signals dt_fd to QEMU again.
> QEMU reads trap field of this info region which is false and QEMU
> re-passthrough the whole bar 0 region.
> 
> Vendor driver specifies whether it supports dynamic-trap-bar-info region
> through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
> vfio_pci_mediate_ops->open().
> 
> If vfio-pci detects this cap, it will create a default
> dynamic_trap_bar_info region on behalf of vendor driver with region len=0
> and region->ops=null.
> Vvendor driver should override this region's len, flags, rw, mmap in its
> vfio_pci_mediate_ops.

TBH, I don't like this interface at all.  Userspace doesn't pass data
to the kernel via INFO ioctls.  We have a SET_IRQS ioctl for
configuring user signaling with eventfds.  I think we only need to
define an IRQ type that tells the user to re-evaluate the sparse mmap
information for a region.  The user would enumerate the device IRQs via
GET_IRQ_INFO, find one of this type where the IRQ info would also
indicate which region(s) should be re-evaluated on signaling.  The user
would enable that signaling via SET_IRQS and simply re-evaluate the
sparse mmap capability for the associated regions when signaled.
Thanks,

Alex

> 
> Cc: Kevin Tian 
> 
> Signed-off-by: Yan Zhao 
> ---
>  drivers/vfio/pci/vfio_pci.c | 16 
>  include/linux/vfio.h|  3 ++-
>  include/uapi/linux/vfio.h   | 11 +++
>  3 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 059660328be2..62b811ca43e4 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -127,6 +127,19 @@ void init_migration_region(struct vfio_pci_device *vdev)
>   NULL);
>  }
>  
> +/**
> + * register a region to hold info for dynamically trap bar regions
> + */
> +void init_dynamic_trap_bar_info_region(struct vfio_pci_device *vdev)
> +{
> + vfio_pci_register_dev_region(vdev,
> + VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO,
> + VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO,
> + NULL, 0,
> + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
> + NULL);
> +}
> +
>  static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>  {
>   struct resource *res;
> @@ -538,6 +551,9 @@ static int vfio_pci_open(void *device_data)
>   if (caps & VFIO_PCI_DEVICE_CAP_MIGRATION)
>   init_migration_region(vdev);
>  
> + if (caps & VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR)
> + init_dynamic_trap_bar_info_region(vdev);
> +
>   pr_info("vfio pci found mediate_ops %s, 
> caps=%llx, handle=%x for %x:%x\n",
>   vdev->mediate_ops->name, caps,
>   handle, vdev->pdev->vendor,
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index cddea8e9dcb2..cf8ecf687bee 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -197,7 +197,8 @@ extern void vfio_virqfd_disable(struct virqfd **pvirqfd);
>  
>  struct vfio_pci_mediate_ops {
>   char*name;
> -#define VFIO_PCI_DEVICE_CAP_MIGRATION (0x01)
> +#define VFIO_PCI_DEVICE_CAP_MIGRATION(0x01)
> +#define VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR (0x02)
>   int (*open)(struct pci_dev *pdev, u64 *caps, u32 *handle);
>   void(*release)(int handle);
>   void(*get_region_info)(int handle,
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index caf8845a67a6..74a2d0b57741 100644
> --- 

[libvirt] [RFC PATCH 4/9] vfio-pci: register default dynamic-trap-bar-info region

2019-12-05 Thread Yan Zhao
Dynamic trap bar info region is a channel for QEMU and vendor driver to
communicate dynamic trap info. It is of type
VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO and subtype
VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO.

This region has two fields: dt_fd and trap.
When QEMU detects a device regions of this type, it will create an
eventfd and write its eventfd id to dt_fd field.
When vendor drivre signals this eventfd, QEMU reads trap field of this
info region.
- If trap is true, QEMU would search the device's PCI BAR
regions and disable all the sparse mmaped subregions (if the sparse
mmaped subregion is disablable).
- If trap is false, QEMU would re-enable those subregions.

A typical usage is
1. vendor driver first cuts its bar 0 into several sections, all in a
sparse mmap array. So initally, all its bar 0 are passthroughed.
2. vendor driver specifys part of bar 0 sections to be disablable.
3. on migration starts, vendor driver signals dt_fd and set trap to true
to notify QEMU disabling the bar 0 sections of disablable flags on.
4. QEMU disables those bar 0 section and hence let vendor driver be able
to trap access of bar 0 registers and make dirty page tracking possible.
5. on migration failure, vendor driver signals dt_fd to QEMU again.
QEMU reads trap field of this info region which is false and QEMU
re-passthrough the whole bar 0 region.

Vendor driver specifies whether it supports dynamic-trap-bar-info region
through cap VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR in
vfio_pci_mediate_ops->open().

If vfio-pci detects this cap, it will create a default
dynamic_trap_bar_info region on behalf of vendor driver with region len=0
and region->ops=null.
Vvendor driver should override this region's len, flags, rw, mmap in its
vfio_pci_mediate_ops.

Cc: Kevin Tian 

Signed-off-by: Yan Zhao 
---
 drivers/vfio/pci/vfio_pci.c | 16 
 include/linux/vfio.h|  3 ++-
 include/uapi/linux/vfio.h   | 11 +++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 059660328be2..62b811ca43e4 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -127,6 +127,19 @@ void init_migration_region(struct vfio_pci_device *vdev)
NULL);
 }
 
+/**
+ * register a region to hold info for dynamically trap bar regions
+ */
+void init_dynamic_trap_bar_info_region(struct vfio_pci_device *vdev)
+{
+   vfio_pci_register_dev_region(vdev,
+   VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO,
+   VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO,
+   NULL, 0,
+   VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
+   NULL);
+}
+
 static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
 {
struct resource *res;
@@ -538,6 +551,9 @@ static int vfio_pci_open(void *device_data)
if (caps & VFIO_PCI_DEVICE_CAP_MIGRATION)
init_migration_region(vdev);
 
+   if (caps & VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR)
+   init_dynamic_trap_bar_info_region(vdev);
+
pr_info("vfio pci found mediate_ops %s, 
caps=%llx, handle=%x for %x:%x\n",
vdev->mediate_ops->name, caps,
handle, vdev->pdev->vendor,
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index cddea8e9dcb2..cf8ecf687bee 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -197,7 +197,8 @@ extern void vfio_virqfd_disable(struct virqfd **pvirqfd);
 
 struct vfio_pci_mediate_ops {
char*name;
-#define VFIO_PCI_DEVICE_CAP_MIGRATION (0x01)
+#define VFIO_PCI_DEVICE_CAP_MIGRATION  (0x01)
+#define VFIO_PCI_DEVICE_CAP_DYNAMIC_TRAP_BAR   (0x02)
int (*open)(struct pci_dev *pdev, u64 *caps, u32 *handle);
void(*release)(int handle);
void(*get_region_info)(int handle,
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index caf8845a67a6..74a2d0b57741 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -258,6 +258,9 @@ struct vfio_region_info {
 struct vfio_region_sparse_mmap_area {
__u64   offset; /* Offset of mmap'able area within region */
__u64   size;   /* Size of mmap'able area */
+   __u32   disablable; /* whether this mmap'able are able to
+*  be dynamically disabled
+*/
 };
 
 struct vfio_region_info_cap_sparse_mmap {
@@ -454,6 +457,14 @@ struct vfio_device_migration_info {
 #define VFIO_DEVICE_DIRTY_PFNS_ALL (~0ULL)
 } __attribute__((packed));
 
+/* Region type and sub-type to hold info to dynamically trap bars */
+#define VFIO_REGION_TYPE_DYNAMIC_TRAP_BAR_INFO (4)
+#define VFIO_REGION_SUBTYPE_DYNAMIC_TRAP_BAR_INFO  (1)
+
+struct