On 11/15/2010 11:15 AM, Sheng Yang wrote:
This patch enables per-vector masking for assigned devices using MSI-X.

It provides two new APIs: one for the guest to specify the device's MSI-X
table address in MMIO, the other for userspace to get information about the
mask bits.

All mask bit operations are kept in the kernel in order to accelerate them.
Userspace shouldn't access the device MMIO directly for this information;
instead it should use the provided API to do so.

Signed-off-by: Sheng Yang <sh...@linux.intel.com>
---
  arch/x86/kvm/x86.c       |    1 +
  include/linux/kvm.h      |   32 +++++
  include/linux/kvm_host.h |    5 +
  virt/kvm/assigned-dev.c  |  318 +++++++++++++++++++++++++++++++++++++++++++++-
  4 files changed, 355 insertions(+), 1 deletions(-)


Documentation?
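For reference, my reading of the intended userspace flow is roughly the
following (a sketch only; the struct fields and flag names are the ones used
in this patch, but the ioctl name is my guess at whatever kvm.h ends up
defining):

        struct kvm_msix_mmio mmio = {
                .id             = assigned_dev_id,
                .type           = KVM_MSIX_TYPE_ASSIGNED_DEV,
                .flags          = KVM_MSIX_MMIO_FLAG_REGISTER,
                .base_addr      = msix_table_gpa,   /* guest-physical table base */
                .max_entries_nr = msix_table_nr_entries,
        };

        /* ioctl name assumed, for illustration only */
        if (ioctl(vmfd, KVM_ASSIGN_SET_MSIX_MMIO, &mmio) < 0)
                perror("register MSI-X MMIO");

Something along those lines belongs in Documentation/kvm/api.txt.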

+static int msix_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+                         void *val)
+{
+       struct kvm_assigned_dev_kernel *adev =
+                       container_of(this, struct kvm_assigned_dev_kernel,
+                                    msix_mmio_dev);
+       int idx, r = 0;
+       u32 entry[4];
+       struct kvm_kernel_irq_routing_entry e;
+
+       /* TODO: make this work on big-endian machines */
+       mutex_lock(&adev->kvm->lock);
+       if (!msix_mmio_in_range(adev, addr, len)) {
+               r = -EOPNOTSUPP;
+               goto out;
+       }
+       if ((addr & 0x3) || len != 4)
+               goto out;
+
+       idx = msix_get_enabled_idx(adev, addr, len);
+       if (idx < 0) {
+               idx = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE;
+               if ((addr % PCI_MSIX_ENTRY_SIZE) ==
+                               PCI_MSIX_ENTRY_VECTOR_CTRL)
+                       *(unsigned long *)val =
+                               test_bit(idx, adev->msix_mask_bitmap) ?
+                               PCI_MSIX_ENTRY_CTRL_MASKBIT : 0;
+               else
+                       r = -EOPNOTSUPP;
+               goto out;
+       }
+
+       r = kvm_get_irq_routing_entry(adev->kvm,
+                       adev->guest_msix_entries[idx].vector, &e);
+       if (r || e.type != KVM_IRQ_ROUTING_MSI) {
+               r = -EOPNOTSUPP;
+               goto out;
+       }
+       entry[0] = e.msi.address_lo;
+       entry[1] = e.msi.address_hi;
+       entry[2] = e.msi.data;
+       entry[3] = test_bit(adev->guest_msix_entries[idx].entry,
+                       adev->msix_mask_bitmap);
+       memcpy(val, &entry[addr % PCI_MSIX_ENTRY_SIZE / sizeof *entry], len);
+
+out:
+       mutex_unlock(&adev->kvm->lock);
+       return r;
+}
+
+static int msix_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+                          const void *val)
+{
+       struct kvm_assigned_dev_kernel *adev =
+                       container_of(this, struct kvm_assigned_dev_kernel,
+                                    msix_mmio_dev);
+       int idx, r = 0;
+       unsigned long new_val = *(unsigned long *)val;

What if it's a 64-bit write on a 32-bit host?

Are we sure the trailing bytes of val are zero?

+
+       /* TODO: make this work on big-endian machines */

BUILD_BUG_ON(something)
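E.g. something along these lines, so the TODO fails the build instead of
silently misbehaving (a sketch only; whether you key it off __BIG_ENDIAN or
some other condition is your call):

        /* sketch: this path assumes little-endian layout for now */
#ifdef __BIG_ENDIAN
        BUILD_BUG_ON(1);
#endif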

+       mutex_lock(&adev->kvm->lock);
+       if (!msix_mmio_in_range(adev, addr, len)) {
+               r = -EOPNOTSUPP;
+               goto out;
+       }

Why is this needed?  Didn't the iodev check already do this?

+       if ((addr & 0x3) || len != 4)
+               goto out;

What if len == 8?  I think mst said it was legal.

+
+       idx = msix_get_enabled_idx(adev, addr, len);
+       if (idx < 0) {
+               idx = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE;
+               if (((addr % PCI_MSIX_ENTRY_SIZE) ==
+                               PCI_MSIX_ENTRY_VECTOR_CTRL)) {
+                       if (new_val & ~PCI_MSIX_ENTRY_CTRL_MASKBIT)
+                               goto out;
+                       if (new_val & PCI_MSIX_ENTRY_CTRL_MASKBIT)
+                               set_bit(idx, adev->msix_mask_bitmap);
+                       else
+                               clear_bit(idx, adev->msix_mask_bitmap);
+                       /* It's possible that we need to re-enable MSI-X,
+                        * so go back to userspace */
+               }
+               /* Userspace would handle other MMIO writing */
+               r = -EOPNOTSUPP;

That's not very good. We should do the entire thing either in the kernel or in userspace. We could add a new EXIT_REASON to let userspace know that an MSI-X entry changed, and have userspace read the new contents from the kernel.
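Roughly what I have in mind (all names here are made up, just to show the
shape of it):

        /* hypothetical: a new exit reason plus a kvm_run payload telling
         * userspace which entry of which device changed, so it can re-read
         * the entry from the kernel and reprogram the real device if needed */
        #define KVM_EXIT_MSIX_ROUTING_UPDATE   24      /* placeholder number */

        struct kvm_msix_routing_update {
                __u32 dev_id;           /* assigned device id */
                __u32 entry;            /* MSI-X table entry that was written */
        };

with the struct hanging off the kvm_run union like the other exit payloads.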

+               goto out;
+       }
+       if (addr % PCI_MSIX_ENTRY_SIZE != PCI_MSIX_ENTRY_VECTOR_CTRL) {
+               r = -EOPNOTSUPP;
+               goto out;
+       }
+       if (new_val & ~PCI_MSIX_ENTRY_CTRL_MASKBIT)
+               goto out;
+       update_msix_mask(adev, idx, !!(new_val & PCI_MSIX_ENTRY_CTRL_MASKBIT));
+out:
+       mutex_unlock(&adev->kvm->lock);
+
+       return r;
+}
+
+static int kvm_vm_ioctl_update_msix_mmio(struct kvm *kvm,
+                               struct kvm_msix_mmio *msix_mmio)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *adev;
+
+       if (msix_mmio->type != KVM_MSIX_TYPE_ASSIGNED_DEV)
+               return -EINVAL;
+
+       if (!msix_mmio->flags)
+               return -EINVAL;

Need to check flags for undefined bits too.
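I.e. something like this (a sketch, assuming REGISTER and UNREGISTER are the
only flags defined so far):

        if (msix_mmio->flags & ~(KVM_MSIX_MMIO_FLAG_REGISTER |
                                 KVM_MSIX_MMIO_FLAG_UNREGISTER))
                return -EINVAL;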

+
+       mutex_lock(&kvm->lock);
+       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     msix_mmio->id);
+       if (!adev) {
+               r = -EINVAL;
+               goto out;
+       }
+       if (msix_mmio->base_addr == 0) {
+               r = -EINVAL;
+               goto out;
+       }
+       if (msix_mmio->max_entries_nr == 0 ||
+                       msix_mmio->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
+               r = -EINVAL;
+               goto out;
+       }
+
+       if ((msix_mmio->flags & KVM_MSIX_MMIO_FLAG_REGISTER) &&
+                       (msix_mmio->flags & KVM_MSIX_MMIO_FLAG_UNREGISTER)) {
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (msix_mmio->flags & KVM_MSIX_MMIO_FLAG_REGISTER) {
+               if (!(adev->flags & KVM_ASSIGNED_ENABLED_MSIX_MMIO)) {
+                       mutex_lock(&kvm->slots_lock);
+                       kvm_iodevice_init(&adev->msix_mmio_dev,
+                                       &msix_mmio_ops);
+                       r = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+                                       &adev->msix_mmio_dev);
+                       if (!r)
+                               adev->flags |= KVM_ASSIGNED_ENABLED_MSIX_MMIO;
+                       mutex_unlock(&kvm->slots_lock);
+               }
+               if (!r) {
+                       adev->msix_mmio_base = msix_mmio->base_addr;
+                       adev->msix_max_entries_nr = msix_mmio->max_entries_nr;
+               }
+       } else if (msix_mmio->flags & KVM_MSIX_MMIO_FLAG_UNREGISTER)
+               unregister_msix_mmio(kvm, adev);
+out:
+       mutex_unlock(&kvm->lock);
+
+       return r;
+}
  #endif

Question: how do we do this with vfio?  One idea is to have

ioctl(vmfd, KVM_IO_PIPE, { KVM_IO_MMIO /* or KVM_IO_PIO */, range, pipefd1, pipefd2 })

and have kvm convert mmio or pio in that range to commands and responses sent over that pipe. We could have qemu threads, other userspace processes, or kernel components like vfio implement the other side of the channel.
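To make that concrete, the shape I'm thinking of is something like this
(entirely hypothetical, none of these names exist; just illustrating the
proposal):

        struct kvm_io_pipe {
                __u32 type;      /* KVM_IO_MMIO or KVM_IO_PIO */
                __u32 pad;
                __u64 addr;      /* start of the forwarded range */
                __u64 len;       /* length of the range */
                __s32 req_fd;    /* kvm writes one request record per access */
                __s32 resp_fd;   /* kvm reads the completion (data for reads) */
        };

        struct kvm_io_pipe_msg {
                __u64 addr;
                __u32 len;
                __u8  is_write;
                __u8  pad[3];
                __u8  data[8];   /* write payload, or read result on the way back */
        };

The other end only has to speak that record format over the two pipes; it
doesn't need to know anything about kvm internals.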

--
error compiling committee.c: too many arguments to function
