Re: [PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Łukasz Gieryk
I'm sorry for the delayed response. We (I and the other Lukasz) somehow
had hoped that Knut, the original author of this patch, would have
responded.

Let me address your questions, up to my best knowledge.
  
> > -static pcibus_t pci_bar_address(PCIDevice *d,
> > -int reg, uint8_t type, pcibus_t size)
> > +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
> > +uint8_t type, pcibus_t size)
> > +{
> > +pcibus_t new_addr;
> > +if (!pci_is_vf(d)) {
> > +int bar = pci_bar(d, reg);
> > +if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +new_addr = pci_get_quad(d->config + bar);
> > +} else {
> > +new_addr = pci_get_long(d->config + bar);
> > +}
> > +} else {
> > +PCIDevice *pf = d->exp.sriov_vf.pf;
> > +uint16_t sriov_cap = pf->exp.sriov_cap;
> > +int bar = sriov_cap + PCI_SRIOV_BAR + reg * 4;
> > +uint16_t vf_offset = pci_get_word(pf->config + sriov_cap + 
> > PCI_SRIOV_VF_OFFSET);
> > +uint16_t vf_stride = pci_get_word(pf->config + sriov_cap + 
> > PCI_SRIOV_VF_STRIDE);
> > +uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / vf_stride;
> > +
> > +if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +new_addr = pci_get_quad(pf->config + bar);
> > +} else {
> > +new_addr = pci_get_long(pf->config + bar);
> > +}
> > +new_addr += vf_num * size;
> > +}
> > +if (reg != PCI_ROM_SLOT) {
> > +/* Preserve the rom enable bit */
> > +new_addr &= ~(size - 1);
> 
> This comment puzzles me. How does clearing low bits preserve
> any bits? Looks like this will clear low bits if any.
> 

I think the comment applies to (reg != PCI_ROM_SLOT), i.e., the bits are
cleared for BARs, but not for expansion ROM. I agree the placement of this
comment is slightly misleading. We will move it up and rephrase slightly.
 
> > +pcibus_t pci_bar_address(PCIDevice *d,
> > + int reg, uint8_t type, pcibus_t size)
> >  {
> >  pcibus_t new_addr, last_addr;
> > -int bar = pci_bar(d, reg);
> >  uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
> >  Object *machine = qdev_get_machine();
> >  ObjectClass *oc = object_get_class(machine);
> > @@ -1309,7 +1363,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> >  if (!(cmd & PCI_COMMAND_IO)) {
> >  return PCI_BAR_UNMAPPED;
> >  }
> > -new_addr = pci_get_long(d->config + bar) & ~(size - 1);
> > +new_addr = pci_config_get_bar_addr(d, reg, type, size);
> >  last_addr = new_addr + size - 1;
> >  /* Check if 32 bit BAR wraps around explicitly.
> >   * TODO: make priorities correct and remove this work around.
> > @@ -1324,11 +1378,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> >  if (!(cmd & PCI_COMMAND_MEMORY)) {
> >  return PCI_BAR_UNMAPPED;
> >  }
> > -if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > -new_addr = pci_get_quad(d->config + bar);
> > -} else {
> > -new_addr = pci_get_long(d->config + bar);
> > -}
> > +new_addr = pci_config_get_bar_addr(d, reg, type, size);
> >  /* the ROM slot has a specific enable bit */
> >  if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
> 
> And in fact here we check the low bit and handle it specially.

The code seems correct for me. The bit is preserved for ROM case.

> > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > index d7d73a31e4..182a225054 100644
> > --- a/hw/pci/pcie.c
> > +++ b/hw/pci/pcie.c
> > @@ -446,6 +446,11 @@ void pcie_cap_slot_plug_cb(HotplugHandler 
> > *hotplug_dev, DeviceState *dev,
> >  PCIDevice *pci_dev = PCI_DEVICE(dev);
> >  uint32_t lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> >  
> > +if(pci_is_vf(pci_dev)) {
> > +/* We don't want to change any state in hotplug_dev for SR/IOV 
> > virtual functions */
> > +return;
> > +}
> > +
> 
> Coding style violation here.  And pls document the why not the what.
> E.g. IIRC the reason is that VFs don't have an express capability,
> right?

I think the reason is that virtual functions don’t exist physically, so
they cannot be individually disconnected. Only PF should respond to
hotplug events, implicitly disconnecting (thus: destroying) all child
VFs.

Anyway, we will update this comment to state *why* and add the missing
space.

V4 with the mentioned changes will happen really soon.




Re: [PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Knut Omang
On Wed, 2022-01-26 at 14:23 +0100, Łukasz Gieryk wrote:
> I'm sorry for the delayed response. We (I and the other Lukasz) somehow
> had hoped that Knut, the original author of this patch, would have
> responded.

Yes, sorry - this one flushed past me here for some reason,

> 
> Let me address your questions, up to my best knowledge.
>   
> > > -static pcibus_t pci_bar_address(PCIDevice *d,
> > > -    int reg, uint8_t type, pcibus_t size)
> > > +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
> > > +    uint8_t type, pcibus_t size)
> > > +{
> > > +    pcibus_t new_addr;
> > > +    if (!pci_is_vf(d)) {
> > > +    int bar = pci_bar(d, reg);
> > > +    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > +    new_addr = pci_get_quad(d->config + bar);
> > > +    } else {
> > > +    new_addr = pci_get_long(d->config + bar);
> > > +    }
> > > +    } else {
> > > +    PCIDevice *pf = d->exp.sriov_vf.pf;
> > > +    uint16_t sriov_cap = pf->exp.sriov_cap;
> > > +    int bar = sriov_cap + PCI_SRIOV_BAR + reg * 4;
> > > +    uint16_t vf_offset = pci_get_word(pf->config + sriov_cap +
> > > PCI_SRIOV_VF_OFFSET);
> > > +    uint16_t vf_stride = pci_get_word(pf->config + sriov_cap +
> > > PCI_SRIOV_VF_STRIDE);
> > > +    uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / 
> > > vf_stride;
> > > +
> > > +    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > +    new_addr = pci_get_quad(pf->config + bar);
> > > +    } else {
> > > +    new_addr = pci_get_long(pf->config + bar);
> > > +    }
> > > +    new_addr += vf_num * size;
> > > +    }
> > > +    if (reg != PCI_ROM_SLOT) {
> > > +    /* Preserve the rom enable bit */
> > > +    new_addr &= ~(size - 1);
> > 
> > This comment puzzles me. How does clearing low bits preserve
> > any bits? Looks like this will clear low bits if any.
> > 
> 
> I think the comment applies to (reg != PCI_ROM_SLOT), i.e., the bits are
> cleared for BARs, but not for expansion ROM. I agree the placement of this
> comment is slightly misleading. We will move it up and rephrase slightly.

I agree - it's maybe better to just put the comment above the if(...) 
other than that I believe it is correct.

Knut

>  
> > > +pcibus_t pci_bar_address(PCIDevice *d,
> > > + int reg, uint8_t type, pcibus_t size)
> > >   {
> > >   pcibus_t new_addr, last_addr;
> > > -    int bar = pci_bar(d, reg);
> > >   uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
> > >   Object *machine = qdev_get_machine();
> > >   ObjectClass *oc = object_get_class(machine);
> > > @@ -1309,7 +1363,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> > >   if (!(cmd & PCI_COMMAND_IO)) {
> > >   return PCI_BAR_UNMAPPED;
> > >   }
> > > -    new_addr = pci_get_long(d->config + bar) & ~(size - 1);
> > > +    new_addr = pci_config_get_bar_addr(d, reg, type, size);
> > >   last_addr = new_addr + size - 1;
> > >   /* Check if 32 bit BAR wraps around explicitly.
> > >    * TODO: make priorities correct and remove this work around.
> > > @@ -1324,11 +1378,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> > >   if (!(cmd & PCI_COMMAND_MEMORY)) {
> > >   return PCI_BAR_UNMAPPED;
> > >   }
> > > -    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > -    new_addr = pci_get_quad(d->config + bar);
> > > -    } else {
> > > -    new_addr = pci_get_long(d->config + bar);
> > > -    }
> > > +    new_addr = pci_config_get_bar_addr(d, reg, type, size);
> > >   /* the ROM slot has a specific enable bit */
> > >   if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
> > 
> > And in fact here we check the low bit and handle it specially.
> 
> The code seems correct for me. The bit is preserved for ROM case.
> 
> > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > > index d7d73a31e4..182a225054 100644
> > > --- a/hw/pci/pcie.c
> > > +++ b/hw/pci/pcie.c
> > > @@ -446,6 +446,11 @@ void pcie_cap_slot_plug_cb(HotplugHandler 
> > > *hotplug_dev,
> > > DeviceState *dev,
> > >   PCIDevice *pci_dev = PCI_DEVICE(dev);
> > >   uint32_t lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> > >   
> > > +    if(pci_is_vf(pci_dev)) {
> > > +    /* We don't want to change any state in hotplug_dev for SR/IOV 
> > > virtual
> > > functions */
> > > +    return;
> > > +    }
> > > +
> > 
> > Coding style violation here.  And pls document the why not the what.
> > E.g. IIRC the reason is that VFs don't have an express capability,
> > right?
> 
> I think the reason is that virtual functions don’t exist physically, so
> they cannot be individually disconnected. Only PF should respond to
> hotplug events, implicitly disconnecting (thus: destroying) all child
> VFs.
> 
> Anyway, we will update this comment to state *why* and add the missing
> 

Re: [PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-06 Thread Michael S. Tsirkin
On Tue, Dec 21, 2021 at 03:32:32PM +0100, Lukasz Maniak wrote:
> From: Knut Omang 
> 
> This patch provides the building blocks for creating an SR/IOV
> PCIe Extended Capability header and register/unregister
> SR/IOV Virtual Functions.
> 
> Signed-off-by: Knut Omang 
> ---
>  hw/pci/meson.build  |   1 +
>  hw/pci/pci.c|  97 +---
>  hw/pci/pcie.c   |   5 +
>  hw/pci/pcie_sriov.c | 287 
>  hw/pci/trace-events |   5 +
>  include/hw/pci/pci.h|  12 +-
>  include/hw/pci/pcie.h   |   6 +
>  include/hw/pci/pcie_sriov.h |  67 +
>  include/qemu/typedefs.h |   2 +
>  9 files changed, 456 insertions(+), 26 deletions(-)
>  create mode 100644 hw/pci/pcie_sriov.c
>  create mode 100644 include/hw/pci/pcie_sriov.h
> 
> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
> index 5c4bbac817..bcc9c75919 100644
> --- a/hw/pci/meson.build
> +++ b/hw/pci/meson.build
> @@ -5,6 +5,7 @@ pci_ss.add(files(
>'pci.c',
>'pci_bridge.c',
>'pci_host.c',
> +  'pcie_sriov.c',
>'shpc.c',
>'slotid_cap.c'
>  ))
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index e5993c1ef5..1892a7e74c 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg)
>  {
>  uint8_t type;
>  
> +/* PCIe virtual functions do not have their own BARs */
> +assert(!pci_is_vf(d));
> +
>  if (reg != PCI_ROM_SLOT)
>  return PCI_BASE_ADDRESS_0 + reg * 4;
>  
> @@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev)
>  }
>  }
>  
> -static void pci_do_device_reset(PCIDevice *dev)
> +static void pci_reset_regions(PCIDevice *dev)
>  {
>  int r;
> +if (pci_is_vf(dev)) {
> +return;
> +}
> +
> +for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> +PCIIORegion *region = >io_regions[r];
> +if (!region->size) {
> +continue;
> +}
> +
> +if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
> +region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +pci_set_quad(dev->config + pci_bar(dev, r), region->type);
> +} else {
> +pci_set_long(dev->config + pci_bar(dev, r), region->type);
> +}
> +}
> +}
>  
> +static void pci_do_device_reset(PCIDevice *dev)
> +{
>  pci_device_deassert_intx(dev);
>  assert(dev->irq_state == 0);
>  
> @@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev)
>pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
>pci_get_word(dev->w1cmask + 
> PCI_INTERRUPT_LINE));
>  dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> -for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> -PCIIORegion *region = >io_regions[r];
> -if (!region->size) {
> -continue;
> -}
> -
> -if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
> -region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> -pci_set_quad(dev->config + pci_bar(dev, r), region->type);
> -} else {
> -pci_set_long(dev->config + pci_bar(dev, r), region->type);
> -}
> -}
> +pci_reset_regions(dev);
>  pci_update_mappings(dev);
>  
>  msi_reset(dev);
> @@ -884,6 +895,15 @@ static void pci_init_multifunction(PCIBus *bus, 
> PCIDevice *dev, Error **errp)
>  dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
>  }
>  
> +/* With SR/IOV and ARI, a device at function 0 need not be a 
> multifunction
> + * device, as it may just be a VF that ended up with function 0 in
> + * the legacy PCI interpretation. Avoid failing in such cases:
> + */
> +if (pci_is_vf(dev) &&
> +dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> +return;
> +}
> +
>  /*
>   * multifunction bit is interpreted in two ways as follows.
>   *   - all functions must set the bit to 1.
> @@ -1083,6 +1103,7 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> *pci_dev,
> bus->devices[devfn]->name);
>  return NULL;
>  } else if (dev->hotplugged &&
> +   !pci_is_vf(pci_dev) &&
> pci_get_function_0(pci_dev)) {
>  error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
> " new func %s cannot be exposed to guest.",
> @@ -1191,6 +1212,7 @@ void pci_register_bar(PCIDevice *pci_dev, int 
> region_num,
>  pcibus_t size = memory_region_size(memory);
>  uint8_t hdr_type;
>  
> +assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar 
> */
>  assert(region_num >= 0);
>  assert(region_num < PCI_NUM_REGIONS);
>  assert(is_power_of_2(size));
> @@ -1294,11 +1316,43 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int 
> region_num)
>  return pci_dev->io_regions[region_num].addr;
>  }
>  
> -static pcibus_t pci_bar_address(PCIDevice *d,
> - 

[PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2021-12-21 Thread Lukasz Maniak
From: Knut Omang 

This patch provides the building blocks for creating an SR/IOV
PCIe Extended Capability header and register/unregister
SR/IOV Virtual Functions.

Signed-off-by: Knut Omang 
---
 hw/pci/meson.build  |   1 +
 hw/pci/pci.c|  97 +---
 hw/pci/pcie.c   |   5 +
 hw/pci/pcie_sriov.c | 287 
 hw/pci/trace-events |   5 +
 include/hw/pci/pci.h|  12 +-
 include/hw/pci/pcie.h   |   6 +
 include/hw/pci/pcie_sriov.h |  67 +
 include/qemu/typedefs.h |   2 +
 9 files changed, 456 insertions(+), 26 deletions(-)
 create mode 100644 hw/pci/pcie_sriov.c
 create mode 100644 include/hw/pci/pcie_sriov.h

diff --git a/hw/pci/meson.build b/hw/pci/meson.build
index 5c4bbac817..bcc9c75919 100644
--- a/hw/pci/meson.build
+++ b/hw/pci/meson.build
@@ -5,6 +5,7 @@ pci_ss.add(files(
   'pci.c',
   'pci_bridge.c',
   'pci_host.c',
+  'pcie_sriov.c',
   'shpc.c',
   'slotid_cap.c'
 ))
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e5993c1ef5..1892a7e74c 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg)
 {
 uint8_t type;
 
+/* PCIe virtual functions do not have their own BARs */
+assert(!pci_is_vf(d));
+
 if (reg != PCI_ROM_SLOT)
 return PCI_BASE_ADDRESS_0 + reg * 4;
 
@@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev)
 }
 }
 
-static void pci_do_device_reset(PCIDevice *dev)
+static void pci_reset_regions(PCIDevice *dev)
 {
 int r;
+if (pci_is_vf(dev)) {
+return;
+}
+
+for (r = 0; r < PCI_NUM_REGIONS; ++r) {
+PCIIORegion *region = >io_regions[r];
+if (!region->size) {
+continue;
+}
+
+if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
+region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+pci_set_quad(dev->config + pci_bar(dev, r), region->type);
+} else {
+pci_set_long(dev->config + pci_bar(dev, r), region->type);
+}
+}
+}
 
+static void pci_do_device_reset(PCIDevice *dev)
+{
 pci_device_deassert_intx(dev);
 assert(dev->irq_state == 0);
 
@@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev)
   pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
   pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE));
 dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
-for (r = 0; r < PCI_NUM_REGIONS; ++r) {
-PCIIORegion *region = >io_regions[r];
-if (!region->size) {
-continue;
-}
-
-if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
-region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
-pci_set_quad(dev->config + pci_bar(dev, r), region->type);
-} else {
-pci_set_long(dev->config + pci_bar(dev, r), region->type);
-}
-}
+pci_reset_regions(dev);
 pci_update_mappings(dev);
 
 msi_reset(dev);
@@ -884,6 +895,15 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice 
*dev, Error **errp)
 dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
 }
 
+/* With SR/IOV and ARI, a device at function 0 need not be a multifunction
+ * device, as it may just be a VF that ended up with function 0 in
+ * the legacy PCI interpretation. Avoid failing in such cases:
+ */
+if (pci_is_vf(dev) &&
+dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+return;
+}
+
 /*
  * multifunction bit is interpreted in two ways as follows.
  *   - all functions must set the bit to 1.
@@ -1083,6 +1103,7 @@ static PCIDevice *do_pci_register_device(PCIDevice 
*pci_dev,
bus->devices[devfn]->name);
 return NULL;
 } else if (dev->hotplugged &&
+   !pci_is_vf(pci_dev) &&
pci_get_function_0(pci_dev)) {
 error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
" new func %s cannot be exposed to guest.",
@@ -1191,6 +1212,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
 pcibus_t size = memory_region_size(memory);
 uint8_t hdr_type;
 
+assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */
 assert(region_num >= 0);
 assert(region_num < PCI_NUM_REGIONS);
 assert(is_power_of_2(size));
@@ -1294,11 +1316,43 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int 
region_num)
 return pci_dev->io_regions[region_num].addr;
 }
 
-static pcibus_t pci_bar_address(PCIDevice *d,
-int reg, uint8_t type, pcibus_t size)
+static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
+uint8_t type, pcibus_t size)
+{
+pcibus_t new_addr;
+if (!pci_is_vf(d)) {
+int bar = pci_bar(d, reg);
+if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+new_addr =