On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> MSI interrupt affinity settings on the guest always ended up on vcpu0,
> no matter what.
> IOW, writes to /proc/irq/<IRQ>/smp_affinity are ignored.
> This patch fixes the MSI IRQ routing and avoids the utter madness of
> tearing down and setting up the interrupt completely when this changes.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Signed-off-by: Richard Weinberger <[email protected]>
> ---
> hw/device-assignment.c | 73
> ++++++++++++++++++++++++++++++++++++++++++++++--
> 1 files changed, 70 insertions(+), 3 deletions(-)
>
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 09726f9..78d57c8 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -913,6 +913,50 @@ void assigned_dev_update_irqs(void)
> }
> }
>
> +static void assigned_dev_update_msi_route(PCIDevice *pci_dev)
> +{
> + AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> + uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
> + PCI_MSI_FLAGS);
> + struct kvm_irq_routing_entry *old, new;
> + KVMMsiMessage msg;
> + int r;
Please follow qemu coding style for braces throughout.
> +
> + if (!(ctrl_byte & PCI_MSI_FLAGS_ENABLE))
> + return;
> +
> + msg.addr_lo = pci_get_long(pci_dev->config + pci_dev->msi_cap +
> + PCI_MSI_ADDRESS_LO);
> + msg.addr_hi = pci_get_long(pci_dev->config + pci_dev->msi_cap +
> + PCI_MSI_ADDRESS_HI);
Odd, since we only expose a 32bit MSI capability to the guest...
> + msg.data = pci_get_long(pci_dev->config + pci_dev->msi_cap +
> + PCI_MSI_DATA_32);
Should be pci_get_word(); the MSI data register is only 16 bits wide.
> +
> + old = adev->entry;
> + new = *old;
> + new.u.msi.address_lo = msg.addr_lo;
> + new.u.msi.address_hi = msg.addr_hi;
> + new.u.msi.data = msg.data;
> +
> + if (memcmp(old, &new, sizeof(new)) == 0)
> + return;
> +
> + r = kvm_update_routing_entry(old, &new);
How does this work? old is now new, so kvm_update_routing_entry() is
never going to match the existing entry if address_lo or data
actually change.
> + if (r < 0) {
> + fprintf(stderr, "%s: kvm_update_msi failed: %s\n", __func__,
> + strerror(-r));
> + exit(1);
> + }
> +
> + *old = new;
huh?
> + r = kvm_irqchip_commit_routes(kvm_state);
> + if (r) {
> + fprintf(stderr, "%s: kvm_irqchip_commit_routes failed: %s\n",
> __func__,
> + strerror(-r));
> + exit(1);
> + }
> +}
> +
> static void assigned_dev_update_msi(PCIDevice *pci_dev)
> {
> struct kvm_assigned_irq assigned_irq_data;
> @@ -1116,6 +1160,14 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice
> *pci_dev,
> uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
> uint32_t real_val, emulate_mask, full_emulation_mask;
>
> + if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> + uint32_t msi_start = pci_dev->msi_cap;
> + uint32_t msi_end = msi_start + PCI_MSI_DATA_64 + 3;
> +
> + if (address >= msi_start && (address + len) < msi_end)
ranges_overlap() is meant for this. We only expose a 32bit MSI cap, so
msi_end is wrong.
> + return virt_val;
> + }
> +
> emulate_mask = 0;
> memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len);
> emulate_mask = le32_to_cpu(emulate_mask);
> @@ -1130,6 +1182,17 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice
> *pci_dev,
> }
> }
>
> +static void handle_cfg_write_msi(PCIDevice *pci_dev, AssignedDevice *adev)
> +{
> + if (!kvm_enabled() || !kvm_irqchip_in_kernel())
> + return;
Unnecessary, device assignment doesn't work otherwise.
> +
> + if (adev->entry && (adev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSI))
Should just be able to test irq_requested_type.
> + assigned_dev_update_msi_route(pci_dev);
> + else
> + assigned_dev_update_msi(pci_dev);
> +}
> +
> static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t
> address,
> uint32_t val, int len)
> {
> @@ -1155,9 +1218,13 @@ static void assigned_dev_pci_write_config(PCIDevice
> *pci_dev, uint32_t address,
> }
> }
> if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> - if (range_covers_byte(address, len,
> - pci_dev->msi_cap + PCI_MSI_FLAGS)) {
> - assigned_dev_update_msi(pci_dev);
> + uint32_t msi_start = pci_dev->msi_cap;
> + uint32_t msi_end = msi_start + PCI_MSI_DATA_64 + 3;
> +
> + if (address >= msi_start && (address + len) < msi_end) {
Use ranges_overlap() please, msi_end is wrong.
> + if (address == msi_start + PCI_MSI_DATA_32)
> + handle_cfg_write_msi(pci_dev, assigned_dev);
Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
+ PCI_MSI_DATA_32) to start with? But how does this handle the enable
bit?
> + return;
> }
> }
> if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
Thanks,
Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html