On Thu, Sep 30, 2021 at 10:52:18AM +0300, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushche...@epam.com>
> 
> Instead of handling a single range set, that contains all the memory
> regions of all the BARs and ROM, have them per BAR.
> 
> This is in preparation of making non-identity mappings in p2m for the
> MMIOs/ROM.
> 
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushche...@epam.com>
> ---
>  xen/drivers/vpci/header.c | 172 ++++++++++++++++++++++++++------------
>  xen/include/xen/vpci.h    |   3 +-
>  2 files changed, 122 insertions(+), 53 deletions(-)
> 
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index ec4d215f36ff..9c603d26d302 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -131,49 +131,75 @@ static void modify_decoding(const struct pci_dev *pdev, 
> uint16_t cmd,
>  
>  bool vpci_process_pending(struct vcpu *v)
>  {
> -    if ( v->vpci.mem )
> +    if ( v->vpci.num_mem_ranges )
>      {
>          struct map_data data = {
>              .d = v->domain,
>              .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>          };
> -        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
> +        struct pci_dev *pdev = v->vpci.pdev;
> +        struct vpci_header *header = &pdev->vpci->header;
> +        unsigned int i;
>  
> -        if ( rc == -ERESTART )
> -            return true;
> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +        {
> +            struct vpci_bar *bar = &header->bars[i];
> +            int rc;
>  
> -        spin_lock(&v->vpci.pdev->vpci->lock);
> -        /* Disable memory decoding unconditionally on failure. */
> -        modify_decoding(v->vpci.pdev,
> -                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
> -                        !rc && v->vpci.rom_only);
> -        spin_unlock(&v->vpci.pdev->vpci->lock);
> +            if ( !bar->mem )
> +                continue;
>  
> -        rangeset_destroy(v->vpci.mem);
> -        v->vpci.mem = NULL;
> -        if ( rc )
> -            /*
> -             * FIXME: in case of failure remove the device from the domain.
> -             * Note that there might still be leftover mappings. While this 
> is
> -             * safe for Dom0, for DomUs the domain will likely need to be
> -             * killed in order to avoid leaking stale p2m mappings on
> -             * failure.
> -             */
> -            vpci_remove_device(v->vpci.pdev);
> +            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
> +
> +            if ( rc == -ERESTART )
> +                return true;
> +
> +            spin_lock(&pdev->vpci->lock);
> +            /* Disable memory decoding unconditionally on failure. */
> +            modify_decoding(pdev,
> +                            rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : 
> v->vpci.cmd,
> +                            !rc && v->vpci.rom_only);
> +            spin_unlock(&pdev->vpci->lock);
> +
> +            rangeset_destroy(bar->mem);

Now that the rangesets are per-BAR we might have to consider
allocating them at initialization time and not destroying them when
empty. We could replace the NULL checks with rangeset_is_empty
instead. Not that you have to do this on this patch, but I think it's
worth mentioning.

> +            bar->mem = NULL;
> +            v->vpci.num_mem_ranges--;
> +            if ( rc )
> +                /*
> +                 * FIXME: in case of failure remove the device from the 
> domain.
> +                 * Note that there might still be leftover mappings. While 
> this is
> +                 * safe for Dom0, for DomUs the domain will likely need to be
> +                 * killed in order to avoid leaking stale p2m mappings on
> +                 * failure.
> +                 */
> +                vpci_remove_device(pdev);
> +        }
>      }
>  
>      return false;
>  }
>  
>  static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> -                            struct rangeset *mem, uint16_t cmd)
> +                            uint16_t cmd)
>  {
>      struct map_data data = { .d = d, .map = true };
> -    int rc;
> +    struct vpci_header *header = &pdev->vpci->header;
> +    int rc = 0;
> +    unsigned int i;
> +
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];
>  
> -    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == 
> -ERESTART )
> -        process_pending_softirqs();
> -    rangeset_destroy(mem);
> +        if ( !bar->mem )
> +            continue;
> +
> +        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
> +                                              &data)) == -ERESTART )
> +            process_pending_softirqs();
> +        rangeset_destroy(bar->mem);
> +        bar->mem = NULL;
> +    }
>      if ( !rc )
>          modify_decoding(pdev, cmd, false);
>  
> @@ -181,7 +207,7 @@ static int __init apply_map(struct domain *d, const 
> struct pci_dev *pdev,
>  }
>  
>  static void defer_map(struct domain *d, struct pci_dev *pdev,
> -                      struct rangeset *mem, uint16_t cmd, bool rom_only)
> +                      uint16_t cmd, bool rom_only, uint8_t num_mem_ranges)

Like mentioned below, I don't think you need to pass the number of
BARs that need mapping changes. Iff that's strictly needed, it should
be an unsigned int.

>  {
>      struct vcpu *curr = current;
>  
> @@ -192,9 +218,9 @@ static void defer_map(struct domain *d, struct pci_dev 
> *pdev,
>       * started for the same device if the domain is not well-behaved.
>       */
>      curr->vpci.pdev = pdev;
> -    curr->vpci.mem = mem;
>      curr->vpci.cmd = cmd;
>      curr->vpci.rom_only = rom_only;
> +    curr->vpci.num_mem_ranges = num_mem_ranges;
>      /*
>       * Raise a scheduler softirq in order to prevent the guest from resuming
>       * execution with pending mapping operations, to trigger the invocation
> @@ -206,42 +232,47 @@ static void defer_map(struct domain *d, struct pci_dev 
> *pdev,
>  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool 
> rom_only)
>  {
>      struct vpci_header *header = &pdev->vpci->header;
> -    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>      struct pci_dev *tmp, *dev = NULL;
>      const struct vpci_msix *msix = pdev->vpci->msix;
> -    unsigned int i;
> +    unsigned int i, j;
>      int rc;
> -
> -    if ( !mem )
> -        return -ENOMEM;
> +    uint8_t num_mem_ranges;
>  
>      /*
> -     * Create a rangeset that represents the current device BARs memory 
> region
> +     * Create a rangeset per BAR that represents the current device memory 
> region
>       * and compare it against all the currently active BAR memory regions. If
>       * an overlap is found, subtract it from the region to be 
> mapped/unmapped.
>       *
> -     * First fill the rangeset with all the BARs of this device or with the 
> ROM
> +     * First fill the rangesets with all the BARs of this device or with the 
> ROM
>       * BAR only, depending on whether the guest is toggling the memory decode
>       * bit of the command register, or the enable bit of the ROM BAR 
> register.
>       */
>      for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>      {
> -        const struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar *bar = &header->bars[i];
>          unsigned long start = PFN_DOWN(bar->addr);
>          unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>  
> +        bar->mem = NULL;

Why do you need to set mem to NULL here? I think we should instead
assert that bar->mem == NULL here.

> +
>          if ( !MAPPABLE_BAR(bar) ||
>               (rom_only ? bar->type != VPCI_BAR_ROM
>                         : (bar->type == VPCI_BAR_ROM && 
> !header->rom_enabled)) )
>              continue;
>  
> -        rc = rangeset_add_range(mem, start, end);
> +        bar->mem = rangeset_new(NULL, NULL, 0);
> +        if ( !bar->mem )
> +        {
> +            rc = -ENOMEM;
> +            goto fail;
> +        }
> +
> +        rc = rangeset_add_range(bar->mem, start, end);
>          if ( rc )
>          {
>              printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>                     start, end, rc);
> -            rangeset_destroy(mem);
> -            return rc;
> +            goto fail;
>          }
>      }
>  
> @@ -252,14 +283,21 @@ static int modify_bars(const struct pci_dev *pdev, 
> uint16_t cmd, bool rom_only)
>          unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>                                       vmsix_table_size(pdev->vpci, i) - 1);
>  
> -        rc = rangeset_remove_range(mem, start, end);
> -        if ( rc )
> +        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>          {
> -            printk(XENLOG_G_WARNING
> -                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
> -                   start, end, rc);
> -            rangeset_destroy(mem);
> -            return rc;
> +            const struct vpci_bar *bar = &header->bars[j];
> +
> +            if ( !bar->mem )
> +                continue;
> +
> +            rc = rangeset_remove_range(bar->mem, start, end);
> +            if ( rc )
> +            {
> +                printk(XENLOG_G_WARNING
> +                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
> +                       start, end, rc);
> +                goto fail;
> +            }
>          }
>      }
>  
> @@ -291,7 +329,8 @@ static int modify_bars(const struct pci_dev *pdev, 
> uint16_t cmd, bool rom_only)
>              unsigned long start = PFN_DOWN(bar->addr);
>              unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>  
> -            if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) 
> ||
> +            if ( !bar->enabled ||
> +                 !rangeset_overlaps_range(bar->mem, start, end) ||
>                   /*
>                    * If only the ROM enable bit is toggled check against other
>                    * BARs in the same device for overlaps, but not against the
> @@ -300,13 +339,12 @@ static int modify_bars(const struct pci_dev *pdev, 
> uint16_t cmd, bool rom_only)
>                   (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
>                  continue;
>  
> -            rc = rangeset_remove_range(mem, start, end);
> +            rc = rangeset_remove_range(bar->mem, start, end);
>              if ( rc )
>              {
>                  printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
>                         start, end, rc);
> -                rangeset_destroy(mem);
> -                return rc;
> +                goto fail;
>              }
>          }
>      }
> @@ -324,12 +362,42 @@ static int modify_bars(const struct pci_dev *pdev, 
> uint16_t cmd, bool rom_only)
>           * will always be to establish mappings and process all the BARs.
>           */
>          ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> -        return apply_map(pdev->domain, pdev, mem, cmd);
> +        return apply_map(pdev->domain, pdev, cmd);
>      }
>  
> -    defer_map(dev->domain, dev, mem, cmd, rom_only);
> +    /* Find out how many memory ranges has left after MSI and overlaps. */
> +    num_mem_ranges = 0;
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];

There's no need to declare this local variable AFAICT, just use
header->bars[i].mem. In any case this is likely to go away if you
follow my recommendation below to just call defer_map unconditionally
like it's currently done.

> +
> +        if ( !rangeset_is_empty(bar->mem) )
> +            num_mem_ranges++;
> +    }
> +
> +    /*
> +     * There are cases when PCI device, root port for example, has neither
> +     * memory space nor IO. In this case PCI command register write is
> +     * missed resulting in the underlying PCI device not functional, so:
> +     *   - if there are no regions write the command register now
> +     *   - if there are regions then defer work and write later on
> +     */
> +    if ( !num_mem_ranges )
> +        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);

I think this is wrong, as not calling defer_map will prevent the
rangesets (bar[i]->mem) from being destroyed, so we are effectively
leaking memory.

You need to take a path similar to the failure one in case there are
no mappings pending, or even better just call defer_map anyway and let
it do it's thing, it should be capable of handling empty rangesets
just fine. That's how it's currently done.

> +    else
> +        defer_map(dev->domain, dev, cmd, rom_only, num_mem_ranges);
>  
>      return 0;
> +
> +fail:

We usually ask labels to be indented with one space.

> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];
> +
> +        rangeset_destroy(bar->mem);
> +        bar->mem = NULL;
> +    }
> +    return rc;
>  }
>  
>  static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index a0320b22cb36..352e02d0106d 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -80,6 +80,7 @@ struct vpci {
>              /* Guest view of the BAR. */
>              uint64_t guest_addr;
>              uint64_t size;
> +            struct rangeset *mem;
>              enum {
>                  VPCI_BAR_EMPTY,
>                  VPCI_BAR_IO,
> @@ -154,9 +155,9 @@ struct vpci {
>  
>  struct vpci_vcpu {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> -    struct rangeset *mem;
>      struct pci_dev *pdev;
>      uint16_t cmd;
> +    uint8_t num_mem_ranges;

AFAICT This could be a simple bool:

bool map_pending : 1;

As there's no strict need to know how many BARs have pending mappings.

Thanks, Roger.

Reply via email to