Re: [PATCH] drm/xen: Add missing VM_DONTEXPAND flag in mmap callback

2022-06-19 Thread Oleksandr Andrushchenko
Hi, Oleksandr!

On 09.05.22 16:51, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko 
>
> With the Xen PV Display driver in use, the "expected" VM_DONTEXPAND flag
> is not set (neither explicitly nor implicitly), so the driver hits the
> code path in drm_gem_mmap_obj() that triggers the WARNING.
>
> Signed-off-by: Oleksandr Tyshchenko 
Reviewed-by: Oleksandr Andrushchenko 
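
For reference, the WARNING comes from drm_gem_mmap_obj() expecting the
driver's mmap callback to leave VM_DONTEXPAND set. The check is roughly
along these lines (simplified from drivers/gpu/drm/drm_gem.c of this
kernel version, not an exact quote):

    if (obj->funcs->mmap) {
        ret = obj->funcs->mmap(obj, vma);
        if (ret)
            return ret;
        /* Drivers are expected to set VM_DONTEXPAND in their mmap hook. */
        WARN_ON(!(vma->vm_flags & VM_DONTEXPAND));
    }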

> ---
> This patch eliminates a WARNING which occurs when running any user space
> application over drm (weston, modetest, etc) using the PV Display frontend
> in a Xen guest (it is worth mentioning that the frontend still works
> despite the WARNING):
>
> root@salvator-x-h3-4x2g-xt-domu:~# modetest -M xendrm-du -s 31:1920x1080
> (XEN) common/grant_table.c:1882:d2v0 Expanding d2 grant table from 5 to 9 
> frames
> [   31.566759] [ cut here ]
> [   31.566811] WARNING: CPU: 0 PID: 235 at drivers/gpu/drm/drm_gem.c:1055 
> drm_gem_mmap_obj+0x16c/0x180
> [   31.566864] Modules linked in:
> [   31.566886] CPU: 0 PID: 235 Comm: modetest Not tainted 
> 5.18.0-rc4-yocto-standard-9-gabe87d78bbc9 #1
> [   31.566922] Hardware name: XENVM-4.17 (DT)
> [   31.566940] pstate: 6005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   31.566973] pc : drm_gem_mmap_obj+0x16c/0x180
> [   31.567001] lr : drm_gem_mmap_obj+0x78/0x180
> [   31.567026] sp : 89d03bb0
> [   31.567044] x29: 89d03bb0 x28: 0008 x27: 
> 0001c42d43c0
> [   31.567080] x26: 0001c42d4cc0 x25: 07e9 x24: 
> 0001c0136000
> [   31.567116] x23: 0001c031 x22: 0001c4002b80 x21: 
> 
> [   31.567150] x20: 0001c42d43c0 x19: 0001c0137600 x18: 
> 0001
> [   31.567186] x17:  x16:  x15: 
> 00035c81
> [   31.567220] x14:  x13:  x12: 
> 
> [   31.567258] x11: 0010 x10: 95d69000 x9 : 
> 0001c435ac30
> [   31.567294] x8 : 8001f65ce000 x7 : 0001 x6 : 
> 0001c24de000
> [   31.567329] x5 : 89d03a10 x4 : 0090 x3 : 
> 10046400
> [   31.567365] x2 : 07e9 x1 : 9dd8cb7c02b1bd00 x0 : 
> 10fb
> [   31.567401] Call trace:
> [   31.567415]  drm_gem_mmap_obj+0x16c/0x180
> [   31.567439]  drm_gem_mmap+0x128/0x228
> [   31.567460]  mmap_region+0x384/0x5a0
> [   31.567484]  do_mmap+0x354/0x4f0
> [   31.567505]  vm_mmap_pgoff+0xdc/0x108
> [   31.567529]  ksys_mmap_pgoff+0x1b8/0x208
> [   31.567550]  __arm64_sys_mmap+0x30/0x48
> [   31.567576]  invoke_syscall+0x44/0x108
> [   31.567599]  el0_svc_common.constprop.0+0xcc/0xf0
> [   31.567629]  do_el0_svc+0x24/0x88
> [   31.567649]  el0_svc+0x2c/0x88
> [   31.567686]  el0t_64_sync_handler+0xb0/0xb8
> [   31.567708]  el0t_64_sync+0x18c/0x190
> [   31.567731] ---[ end trace  ]---
> setting mode 1920x1080-60.00Hz@XR24 on connectors 31, crtc 34
> ---
>   drivers/gpu/drm/xen/xen_drm_front_gem.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c 
> b/drivers/gpu/drm/xen/xen_drm_front_gem.c
> index 5a5bf4e..e31554d 100644
> --- a/drivers/gpu/drm/xen/xen_drm_front_gem.c
> +++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c
> @@ -71,7 +71,7 @@ static int xen_drm_front_gem_object_mmap(struct 
> drm_gem_object *gem_obj,
>* the whole buffer.
>*/
>   vma->vm_flags &= ~VM_PFNMAP;
> - vma->vm_flags |= VM_MIXEDMAP;
> + vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND;
>   vma->vm_pgoff = 0;
>   
>   /*


[PATCH 3/4] vpci: use pcidevs locking to protect MMIO handlers

2022-02-16 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

vPCI MMIO handlers access pdevs without protecting this access with
pcidevs_{lock|unlock}. This is not a problem as of now, as these handlers
are only used by Dom0. But, as we move towards vPCI being used for guests
as well, we need to properly protect pdev and pdev->vpci from being
removed while still in use.

For that, add new locking helpers pcidevs_read_{un}lock and a means to
check whether the lock is held in read mode.

Note that pcidevs_read_{un}lock doesn't acquire the recursive
_pcidevs_lock because its users are not expected to modify pdev's contents
other than pdev->vpci, which is protected by pdev->vpci->lock (where
appropriate). These new helpers are also suitable for simple pdev
list traversals such as for_each_pdev, pci_get_pdev_by_domain and others.

This patch adds ASSERTs to check that the rwlock is taken and in the
appropriate mode. Some of these checks require changes to the
initialization of local variables which may be accessed before the
ASSERT checks the locking; see init_bars and mask_write for examples.
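
As an illustration only (not part of the diff below), a vPCI MMIO handler
is then expected to follow a pattern along these lines, with
pdev->vpci->lock still protecting the vpci contents themselves:

    pcidevs_read_lock();
    pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
    if ( pdev && pdev->vpci )
    {
        spin_lock(&pdev->vpci->lock);
        /* ... read or update pdev->vpci ... */
        spin_unlock(&pdev->vpci->lock);
    }
    pcidevs_read_unlock();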

Signed-off-by: Oleksandr Andrushchenko 
---
 xen/arch/x86/hvm/vmsi.c   | 24 +--
 xen/drivers/passthrough/pci.c | 20 +
 xen/drivers/vpci/header.c | 24 +--
 xen/drivers/vpci/msi.c| 21 +
 xen/drivers/vpci/msix.c   | 55 ++-
 xen/drivers/vpci/vpci.c   | 22 --
 xen/include/xen/pci.h |  5 
 xen/include/xen/vpci.h|  2 +-
 8 files changed, 151 insertions(+), 22 deletions(-)

diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 13e2a190b439..5ef8f37ab0fc 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -889,10 +889,16 @@ void vpci_msix_arch_init_entry(struct vpci_msix_entry 
*entry)
 entry->arch.pirq = INVALID_PIRQ;
 }
 
-int vpci_msix_arch_print(const struct vpci_msix *msix)
+int vpci_msix_arch_print(const struct domain *d, const struct vpci_msix *msix)
 {
 unsigned int i;
 
+/*
+ * FIXME: this is not immediately correct, as the lock can be grabbed
+ * by a different CPU. But this is better than nothing.
+ */
+ASSERT(pcidevs_read_locked());
+
 for ( i = 0; i < msix->max_entries; i++ )
 {
const struct vpci_msix_entry *entry = &msix->entries[i];
@@ -909,11 +915,23 @@ int vpci_msix_arch_print(const struct vpci_msix *msix)
 if ( i && !(i % 64) )
 {
 struct pci_dev *pdev = msix->pdev;
+pci_sbdf_t sbdf = pdev->sbdf;
 
spin_unlock(&msix->pdev->vpci->lock);
+pcidevs_read_unlock();
+
+/* NB: we still hold rcu_read_lock(&domlist_read_lock); here. */
 process_pending_softirqs();
-/* NB: we assume that pdev cannot go away for an alive domain. */
-if ( !pdev->vpci || !spin_trylock(&pdev->vpci->lock) )
+
+if ( !pcidevs_read_trylock() )
+return -EBUSY;
+pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
+/*
+ * FIXME: we may find a re-allocated pdev's copy here.
+ * Even occupying the same address as before. Do our best.
+ */
+if ( !pdev || (pdev != msix->pdev) || !pdev->vpci ||
+ !spin_trylock(&pdev->vpci->lock) )
 return -EBUSY;
 if ( pdev->vpci->msix != msix )
 {
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 2a0d3d37a69f..74fe1c94cf71 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -70,6 +70,26 @@ bool_t pcidevs_locked(void)
 return !!spin_is_locked(&_pcidevs_lock) || pcidevs_write_locked();
 }
 
+void pcidevs_read_lock(void)
+{
+read_lock(&_pcidevs_rwlock);
+}
+
+int pcidevs_read_trylock(void)
+{
+return read_trylock(&_pcidevs_rwlock);
+}
+
+void pcidevs_read_unlock(void)
+{
+read_unlock(&_pcidevs_rwlock);
+}
+
+bool pcidevs_read_locked(void)
+{
+return !!rw_is_locked(&_pcidevs_rwlock);
+}
+
 void pcidevs_write_lock(void)
 {
 write_lock(&_pcidevs_rwlock);
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 40ff79c33f8f..75e972740106 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -142,16 +142,19 @@ bool vpci_process_pending(struct vcpu *v)
 if ( rc == -ERESTART )
 return true;
 
+pcidevs_read_lock();
spin_lock(&v->vpci.pdev->vpci->lock);
 /* Disable memory decoding unconditionally on failure. */
 modify_decoding(v->vpci.pdev,
 rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
 !rc && v->vpci.rom_only);
spin_unlock(&v->vpci.pdev->vpci->lock);
+pcidevs_read_unlock();
 
 rangeset_destroy(v->vpci.mem);
 v->vpci.mem = NUL

[PATCH 1/4] pci: add rwlock to pcidevs_lock machinery

2022-02-16 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

Currently the pcidevs lock is a global recursive spinlock, which is fine for
the existing use cases. It is used to both protect pdev instances
themselves from being removed while in use and to make sure the update
of the relevant pdev properties is synchronized.

Moving towards vPCI being used for guests, this becomes problematic in terms
of lock contention. For example, during vpci_{read|write} the access to
pdev must be protected to prevent pdev disappearing under our feet.
This needs to be done with the help of pcidevs_{lock|unlock}.
On the other hand it is highly undesirable to lock all other pdev accesses
which only use pdevs in read mode, e.g. those which do not remove or
add pdevs.

For the above reasons introduce a read/write lock which will help prevent
lock contention between pdev readers and writers:
- make pci_{add|remove}_device and setup_hwdom_pci_devices use the
  new write lock
- keep all the rest using the existing API (pcidevs_{lock|unlock}),
  but extend the latter to also acquire the rwlock in read mode.

This is in preparation for vPCI to be used for guests.

Signed-off-by: Oleksandr Andrushchenko 
---
 xen/drivers/passthrough/pci.c | 45 ++-
 xen/include/xen/pci.h |  4 
 2 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index e8b09d77d880..2a0d3d37a69f 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -51,20 +51,38 @@ struct pci_seg {
 };
 
 static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(_pcidevs_rwlock);
 
 void pcidevs_lock(void)
 {
+read_lock(&_pcidevs_rwlock);
 spin_lock_recursive(&_pcidevs_lock);
 }
 
 void pcidevs_unlock(void)
 {
 spin_unlock_recursive(&_pcidevs_lock);
+read_unlock(&_pcidevs_rwlock);
 }
 
 bool_t pcidevs_locked(void)
 {
-return !!spin_is_locked(&_pcidevs_lock);
+return !!spin_is_locked(&_pcidevs_lock) || pcidevs_write_locked();
+}
+
+void pcidevs_write_lock(void)
+{
+write_lock(&_pcidevs_rwlock);
+}
+
+void pcidevs_write_unlock(void)
+{
+write_unlock(&_pcidevs_rwlock);
+}
+
+bool pcidevs_write_locked(void)
+{
+return !!rw_is_write_locked(&_pcidevs_rwlock);
 }
 
 static struct radix_tree_root pci_segments;
@@ -758,7 +776,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 
 ret = -ENOMEM;
 
-pcidevs_lock();
+pcidevs_write_lock();
 pseg = alloc_pseg(seg);
 if ( !pseg )
 goto out;
@@ -854,7 +872,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 pci_enable_acs(pdev);
 
 out:
-pcidevs_unlock();
+pcidevs_write_unlock();
 if ( !ret )
 {
 printk(XENLOG_DEBUG "PCI add %s %pp\n", pdev_type,  >sbdf);
@@ -885,7 +903,7 @@ int pci_remove_device(u16 seg, u8 bus, u8 devfn)
 if ( !pseg )
 return -ENODEV;
 
-pcidevs_lock();
+pcidevs_write_lock();
list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
 if ( pdev->bus == bus && pdev->devfn == devfn )
 {
@@ -899,7 +917,7 @@ int pci_remove_device(u16 seg, u8 bus, u8 devfn)
 break;
 }
 
-pcidevs_unlock();
+pcidevs_write_unlock();
 return ret;
 }
 
@@ -1176,6 +1194,11 @@ static void __hwdom_init setup_one_hwdom_device(const 
struct setup_hwdom *ctxt,
ctxt->d->domain_id, err);
 }
 
+/*
+ * It's safe to drop and re-acquire the write lock in this context without
+ * risking pdev disappearing because devices cannot be removed until the
+ * initial domain has been started.
+ */
 static int __hwdom_init _setup_hwdom_pci_devices(struct pci_seg *pseg, void 
*arg)
 {
 struct setup_hwdom *ctxt = arg;
@@ -1208,17 +1231,17 @@ static int __hwdom_init _setup_hwdom_pci_devices(struct 
pci_seg *pseg, void *arg
 
 if ( iommu_verbose )
 {
-pcidevs_unlock();
+pcidevs_write_unlock();
 process_pending_softirqs();
-pcidevs_lock();
+pcidevs_write_lock();
 }
 }
 
 if ( !iommu_verbose )
 {
-pcidevs_unlock();
+pcidevs_write_unlock();
 process_pending_softirqs();
-pcidevs_lock();
+pcidevs_write_lock();
 }
 }
 
@@ -1230,9 +1253,9 @@ void __hwdom_init setup_hwdom_pci_devices(
 {
 struct setup_hwdom ctxt = { .d = d, .handler = handler };
 
-pcidevs_lock();
+pcidevs_write_lock();
pci_segments_iterate(_setup_hwdom_pci_devices, &ctxt);
-pcidevs_unlock();
+pcidevs_write_unlock();
 }
 
 /* APEI not supported on ARM yet. */
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index b6d7e454f814..e814d9542bfc 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -152,6 +152,10 @@ void pcidevs_lock(void);
 void pcidevs_unlock(void);
 bool_t __must_check pcidevs_lo

[PATCH 4/4] vpci: resolve possible clash while removing BAR overlaps

2022-02-16 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

modify_bars checks whether the BAR memory being mapped has already been
mapped for other devices' BARs or, while unmapping, whether it is still
in use by other devices.

With the existing locking scheme it is possible that other devices are
trying to do the same in parallel with us on other CPUs, as we only hold
a read lock without acquiring the recursive _pcidevs_lock.

To prevent that, upgrade the read lock to a normal pcidevs_lock during
the BAR overlap check.

Signed-off-by: Oleksandr Andrushchenko 
---
 xen/drivers/vpci/header.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 75e972740106..c80a8bb5e3e0 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -281,7 +281,11 @@ static int modify_bars(const struct pci_dev *pdev, 
uint16_t cmd, bool rom_only)
 /*
  * Check for overlaps with other BARs. Note that only BARs that are
  * currently mapped (enabled) are checked for overlaps.
+ * We are holding pcidevs_read_lock here, but we need to access multiple
+ * devices at the same time. So, upgrade the current read lock to a normal
+ * pcidevs_lock.
  */
+pcidevs_lock();
 for_each_pdev ( pdev->domain, tmp )
 {
 if ( tmp == pdev )
@@ -321,10 +325,12 @@ static int modify_bars(const struct pci_dev *pdev, 
uint16_t cmd, bool rom_only)
 printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
start, end, rc);
 rangeset_destroy(mem);
+pcidevs_unlock();
 return rc;
 }
 }
 }
+pcidevs_unlock();
 
 ASSERT(dev);
 
-- 
2.25.1




[PATCH 2/4] vpci: restrict unhandled read/write operations for guests

2022-02-16 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

A guest would be able to read and write those registers which are not
emulated and have no respective vPCI handlers, so it would be possible
for it to access the hardware directly.
In order to prevent a guest from reading and writing unhandled registers,
make sure only the hardware domain can access the hardware directly and
restrict guests from doing so.

Suggested-by: Roger Pau Monné 
Signed-off-by: Oleksandr Andrushchenko 

---
Since v6:
- do not use is_hwdom parameter for vpci_{read|write}_hw and use
  current->domain internally
- update commit message
New in v6
---
 xen/drivers/vpci/vpci.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index fb0947179b79..f564572a51cb 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -213,6 +213,10 @@ static uint32_t vpci_read_hw(pci_sbdf_t sbdf, unsigned int 
reg,
 {
 uint32_t data;
 
+/* Guest domains are not allowed to read real hardware. */
+if ( !is_hardware_domain(current->domain) )
+return ~(uint32_t)0;
+
 switch ( size )
 {
 case 4:
@@ -253,9 +257,13 @@ static uint32_t vpci_read_hw(pci_sbdf_t sbdf, unsigned int 
reg,
 return data;
 }
 
-static void vpci_write_hw(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
-  uint32_t data)
+static void vpci_write_hw(pci_sbdf_t sbdf, unsigned int reg,
+  unsigned int size, uint32_t data)
 {
+/* Guest domains are not allowed to write real hardware. */
+if ( !is_hardware_domain(current->domain) )
+return;
+
 switch ( size )
 {
 case 4:
-- 
2.25.1




[PATCH 0/4] Yet another pci/vpci locking re-work

2022-02-16 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

Hello, all!

This is yet another attempt to re-work the existing pci/vpci locking
scheme in preparation for vPCI being used for guests.
For more details on the previous attempts and their flaws, please see [1], [2].

This work is based on the idea that it is possible to extend the
existing locking scheme by additionally providing a global read/write lock.

This way most of the code continues to use pcidevs_{lock|unlock}, so
only minor changes are required and no functional changes are visible
to the users: these callers become readers with respect to the new rwlock
and they acquire _pcidevs_lock as before.

As to the writers (those which can add/remove pci devices and their
respective pdev) we need to make them use the new rwlock in write mode.
For that we introduce pcidevs_write_{un}lock helpers.

Those users which do not add/remove pdevs and are only interested in
pdev->vpci or simple pdev list traversal will use
pcidevs_read_{lock|unlock}, which only acquires the rwlock in read mode
without acquiring _pcidevs_lock. This is to make sure there is no
unnecessary contention for the latter. For the cases when it is required
to make sure that no other CPU can access critical sections under the
read lock (modify_bars for example), it is possible to upgrade it to a
normal pcidevs_lock because both the read lock and _pcidevs_lock allow
recursion.
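
To illustrate (helper names as introduced by this series, simplified):

    /* Writers: pci_{add|remove}_device, setup_hwdom_pci_devices. */
    pcidevs_write_lock();
    /* ... add or remove a pdev ... */
    pcidevs_write_unlock();

    /* New readers: vPCI handlers and simple pdev list traversal. */
    pcidevs_read_lock();
    /* ... pdev->vpci contents protected by pdev->vpci->lock ... */

    /* "Upgrade": additionally take the recursive _pcidevs_lock to
     * serialize against other pcidevs_lock() users (safe to nest, as
     * both the read lock and _pcidevs_lock allow recursion). */
    pcidevs_lock();
    /* ... e.g. the BAR overlap check in modify_bars ... */
    pcidevs_unlock();

    pcidevs_read_unlock();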

The series was tested on:
 - x86 PVH Dom0 and doesn't break it.
 - x86 HVM with PCI passthrough to DomU and doesn't break it.

Thank you,
Oleksandr

[1] 
https://lore.kernel.org/xen-devel/20180717094830.54806-2-roger@citrix.com/
[2] https://lore.kernel.org/all/20220204063459.680961-4-andr2...@gmail.com/

Oleksandr Andrushchenko (4):
  pci: add rwlock to pcidevs_lock machinery
  vpci: restrict unhandled read/write operations for guests
  vpci: use pcidevs locking to protect MMIO handlers
  vpci: resolve possible clash while removing BAR overlaps

 xen/arch/x86/hvm/vmsi.c   | 24 +++--
 xen/drivers/passthrough/pci.c | 65 +--
 xen/drivers/vpci/header.c | 30 ++--
 xen/drivers/vpci/msi.c| 21 +++
 xen/drivers/vpci/msix.c   | 55 +
 xen/drivers/vpci/vpci.c   | 34 +++---
 xen/include/xen/pci.h |  9 +
 xen/include/xen/vpci.h|  2 +-
 8 files changed, 205 insertions(+), 35 deletions(-)

-- 
2.25.1




[PATCH v2] tools: remove xenstore entries on vchan server closure

2022-02-15 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

The vchan server creates XenStore entries to advertise its event channel and
ring, but those are not removed after the server quits.
Add an additional cleanup step, so those entries are removed and clients do
not try to connect to a non-existing server.

Signed-off-by: Oleksandr Andrushchenko 

---
Since v1:
- add NULL check after strdup
---
 tools/include/libxenvchan.h |  5 +
 tools/libs/vchan/init.c | 25 +
 tools/libs/vchan/io.c   |  4 
 tools/libs/vchan/vchan.h| 31 +++
 4 files changed, 65 insertions(+)
 create mode 100644 tools/libs/vchan/vchan.h

diff --git a/tools/include/libxenvchan.h b/tools/include/libxenvchan.h
index d6010b145df2..30cc73cf97e3 100644
--- a/tools/include/libxenvchan.h
+++ b/tools/include/libxenvchan.h
@@ -86,6 +86,11 @@ struct libxenvchan {
int blocking:1;
/* communication rings */
struct libxenvchan_ring read, write;
+   /**
+* Base xenstore path for storing ring/event data used by the server
+* during cleanup.
+* */
+   char *xs_path;
 };
 
 /**
diff --git a/tools/libs/vchan/init.c b/tools/libs/vchan/init.c
index c8510e6ce98a..ae9a6b579753 100644
--- a/tools/libs/vchan/init.c
+++ b/tools/libs/vchan/init.c
@@ -46,6 +46,8 @@
 #include 
 #include 
 
+#include "vchan.h"
+
 #ifndef PAGE_SHIFT
 #define PAGE_SHIFT 12
 #endif
@@ -251,6 +253,12 @@ static int init_xs_srv(struct libxenvchan *ctrl, int 
domain, const char* xs_base
char ref[16];
char* domid_str = NULL;
xs_transaction_t xs_trans = XBT_NULL;
+
+   // store the base path so we can clean up on server closure
+   ctrl->xs_path = strdup(xs_base);
+   if (!ctrl->xs_path)
+   goto fail;
+
xs = xs_open(0);
if (!xs)
goto fail;
@@ -298,6 +306,23 @@ retry_transaction:
return ret;
 }
 
+void close_xs_srv(struct libxenvchan *ctrl)
+{
+   struct xs_handle *xs;
+
+   if (!ctrl->xs_path)
+   return;
+
+   xs = xs_open(0);
+   if (!xs)
+   goto fail;
+
+   xs_rm(xs, XBT_NULL, ctrl->xs_path);
+
+fail:
+   free(ctrl->xs_path);
+}
+
 static int min_order(size_t siz)
 {
int rv = PAGE_SHIFT;
diff --git a/tools/libs/vchan/io.c b/tools/libs/vchan/io.c
index da303fbc01ca..1f201ad554f2 100644
--- a/tools/libs/vchan/io.c
+++ b/tools/libs/vchan/io.c
@@ -40,6 +40,8 @@
 #include 
 #include 
 
+#include "vchan.h"
+
 #ifndef PAGE_SHIFT
 #define PAGE_SHIFT 12
 #endif
@@ -384,5 +386,7 @@ void libxenvchan_close(struct libxenvchan *ctrl)
if (ctrl->gnttab)
xengnttab_close(ctrl->gnttab);
}
+   if (ctrl->is_server)
+   close_xs_srv(ctrl);
free(ctrl);
 }
diff --git a/tools/libs/vchan/vchan.h b/tools/libs/vchan/vchan.h
new file mode 100644
index ..621016ef42e5
--- /dev/null
+++ b/tools/libs/vchan/vchan.h
@@ -0,0 +1,31 @@
+/**
+ * @file
+ * @section AUTHORS
+ *
+ * Copyright (C) 2021 EPAM Systems Inc.
+ *
+ * @section LICENSE
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; If not, see 
<http://www.gnu.org/licenses/>.
+ *
+ * @section DESCRIPTION
+ *
+ *  This file contains common libxenvchan declarations.
+ */
+#ifndef LIBVCHAN_H
+#define LIBVCHAN_H
+
+void close_xs_srv(struct libxenvchan *ctrl);
+
+#endif /* LIBVCHAN_H */
-- 
2.25.1




Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 18:18, Jan Beulich wrote:
> On 15.02.2022 16:46, Oleksandr Andrushchenko wrote:
>> Question: can anyone please explain why pcidevs is a recursive lock?
> Well, assuming you did look at the change making it so, can you be a
> little more specific with your question? Are you perhaps suggesting
> the original reason has disappeared, and no new one has appeared? I'm
> afraid I have to repeat what I did say before: If you want to remove
> the recursive nature of the lock, then it is all on you to prove that
> there's no code path where the lock is taken recursively. IOW even if
> no-one knew of a reason, you'd still need to provide this proof.
> Unless of course we'd all agree we're okay to take the risk; I don't
> see us doing so, though.
The question was exactly as asked: I don't understand why it is
recursive and for what reason. I am not suggesting we blindly
change it to a normal spinlock.

My impression was that the code is structured in such a way that
functions which already hold the lock can call others which are about
to acquire the same lock. That allowed not introducing XXX and
XXX_unlocked function pairs, which can be done for many reasons.
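
To illustrate (a generic sketch, not actual Xen code): with a recursive
lock, a function which already holds the lock can call a helper which
takes it again,

    void helper(void)
    {
        spin_lock_recursive(&_lock);
        /* ... */
        spin_unlock_recursive(&_lock);
    }

    void caller(void)
    {
        spin_lock_recursive(&_lock);
        helper();    /* safe only because the lock is recursive */
        spin_unlock_recursive(&_lock);
    }

so no helper()/helper_locked() pairs are needed.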

That's it

> Jan
>
>
Thank you,
Oleksandr

Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 14:56, Jan Beulich wrote:
> On 15.02.2022 13:44, Oleksandr Andrushchenko wrote:
>> On 15.02.22 13:54, Oleksandr Andrushchenko wrote:
>>> On 15.02.22 13:50, Jan Beulich wrote:
>>>> On 15.02.2022 12:45, Oleksandr Andrushchenko wrote:
>>>>> I'm on your side, I just want to hear that we all agree pcidevs
>>>>> needs to be converted into rwlock according with the plan you
>>>>> suggested and at least now it seems to be an acceptable solution.
>>>> I'd like to express worries though about the conversion of this
>>>> recursive lock into an r/w one.
>>> Could you please elaborate more on this?
>> What if we just do the following:
>>
>> static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED;
>> static DEFINE_RWLOCK(_pcidevs_rwlock);
>>
>> void pcidevs_lock(void)
>> {
>>       read_lock(&_pcidevs_rwlock);
>>       spin_lock_recursive(&_pcidevs_lock);
>> }
>>
>> void pcidevs_unlock(void)
>> {
>>       spin_unlock_recursive(&_pcidevs_lock);
>>       read_unlock(&_pcidevs_rwlock);
>> }
>>
>> void pcidevs_read_lock(void)
>> {
>>       read_lock(&_pcidevs_rwlock);
>> }
>>
>> void pcidevs_read_unlock(void)
>> {
>>       read_unlock(&_pcidevs_rwlock);
>> }
>>
>> void pcidevs_write_lock(void)
>> {
>>       write_lock(&_pcidevs_rwlock);
>> }
>>
>> void pcidevs_write_unlock(void)
>> {
>>       write_unlock(&_pcidevs_rwlock);
>> }
> Hmm, this is an interesting idea. Except that I'm not sure in how
> far it'll be suitable: read_lock() won't lock out users of just
> lock(), so the solution looks tailored to your vPCI use case. Yet
> obviously (I think) read_lock() would want to become usable for
> e.g. simple list traversal as well, down the road.

1. Assumption: _pcidevs_rwlock is used to protect pdev
structure itself, so after calling pcidevs_lock(), pcidevs_read_lock()
and pcidevs_write_lock() we need to check if pdev != NULL
at all sites

2. _pcidevs_rwlock is not meant to protect the contents of pdev:
- for that _pcidevs_lock is used
- _pcidevs_lock doesn't protect pdev->vpci: for that
   pdev->vpci->lock is used.

3. Current code will continue using pcidevs_lock() as it is now,
with the exception of the writers pci_{add|remove}_device.
These will use pcidevs_write_lock() instead.

4. vPCI code, such as vpci_{read|write}, will use
pcidevs_{read|write}_lock (write mode for modify_bars)
and pdev->vpci->lock to protect and/or modify pdev->vpci.
This should be safe because under the rwlock we are
guaranteed that pdev exists, and no other code but vPCI can
remove pdev->vpci.

For for_each_pdev and pci_get_pdev_by_domain, when used by vPCI,
we use pcidevs_read_lock, expecting that we only need to access
pdev->vpci. If this is not the case and we need to modify the
contents of pdev, we additionally need to acquire
     spin_lock_recursive(&_pcidevs_lock);
via a new helper, see 5).

5. A new helper is needed to acquire spin_lock_recursive(&_pcidevs_lock).
This will be used by at least the vPCI code if it needs to modify
something in pdev other than pdev->vpci. In that case
we "upgrade" pcidevs_read_lock() to pcidevs_lock(), as sketched below.

Question: can anyone please explain why pcidevs is a recursive lock?

>
> Jan
>
Thank you and hope to hear your thought on the above,
Oleksandr

Re: [PATCH] tools: remove xenstore entries on vchan server closure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 16:41, Jason Andryuk wrote:
> On Fri, Dec 10, 2021 at 7:35 AM Oleksandr Andrushchenko
>  wrote:
>> From: Oleksandr Andrushchenko 
>>
>> vchan server creates XenStore entries to advertise its event channel and
>> ring, but those are not removed after the server quits.
>> Add additional cleanup step, so those are removed, so clients do not try
>> to connect to a non-existing server.
>>
>> Signed-off-by: Oleksandr Andrushchenko 
>> ---
>>   tools/include/libxenvchan.h |  5 +
>>   tools/libs/vchan/init.c | 23 +++
>>   tools/libs/vchan/io.c   |  4 
>>   tools/libs/vchan/vchan.h| 31 +++
>>   4 files changed, 63 insertions(+)
>>   create mode 100644 tools/libs/vchan/vchan.h
>>
>> diff --git a/tools/include/libxenvchan.h b/tools/include/libxenvchan.h
>> index d6010b145df2..30cc73cf97e3 100644
>> --- a/tools/include/libxenvchan.h
>> +++ b/tools/include/libxenvchan.h
>> @@ -86,6 +86,11 @@ struct libxenvchan {
>>  int blocking:1;
>>  /* communication rings */
>>  struct libxenvchan_ring read, write;
>> +   /**
>> +* Base xenstore path for storing ring/event data used by the server
>> +* during cleanup.
>> +* */
>> +   char *xs_path;
>>   };
>>
>>   /**
>> diff --git a/tools/libs/vchan/init.c b/tools/libs/vchan/init.c
>> index c8510e6ce98a..c6b8674ef541 100644
>> --- a/tools/libs/vchan/init.c
>> +++ b/tools/libs/vchan/init.c
>> @@ -46,6 +46,8 @@
>>   #include 
>>   #include 
>>
>> +#include "vchan.h"
>> +
>>   #ifndef PAGE_SHIFT
>>   #define PAGE_SHIFT 12
>>   #endif
>> @@ -251,6 +253,10 @@ static int init_xs_srv(struct libxenvchan *ctrl, int 
>> domain, const char* xs_base
>>  char ref[16];
>>  char* domid_str = NULL;
>>  xs_transaction_t xs_trans = XBT_NULL;
>> +
>> +   // store the base path so we can clean up on server closure
>> +   ctrl->xs_path = strdup(xs_base);
> You don't check for NULL here, but you do check for NULL in
> close_xs_srv().  I guess it's okay, since it does the right thing.
> But I think it would be more robust to check for NULL here.  Is there
> a specific reason you wrote it this way?  Otherwise it looks good.
It does need a NULL check, thanks.
That comes from writing code with all those allocations and the garbage
collector in the tools stack, where allocations "don't fail" ;)
But that is indeed not the case here, so it needs a proper check.
I'll wait for other comments and send v2.
>
> Regards,
> Jason
Thank you,
Oleksandr

Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 14:49, Jan Beulich wrote:
> On 15.02.2022 12:54, Oleksandr Andrushchenko wrote:
>> On 15.02.22 13:50, Jan Beulich wrote:
>>> On 15.02.2022 12:45, Oleksandr Andrushchenko wrote:
>>>> I'm on your side, I just want to hear that we all agree pcidevs
>>>> needs to be converted into rwlock according with the plan you
>>>> suggested and at least now it seems to be an acceptable solution.
>>> I'd like to express worries though about the conversion of this
>>> recursive lock into an r/w one.
>> Could you please elaborate more on this?
> Not sure what to say beyond the obvious:
I thought you had something specific in mind that worries
you and that you could tell what it is. Thus the question.
>   At the time of the conversion,
> there certainly was an issue to be solved. You'd need to solve this
> issue differently then. Plus you'd need to make sure that no further
> incarnations of the original issue had been there or have been added in
> the meantime.
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 13:54, Oleksandr Andrushchenko wrote:
>
> On 15.02.22 13:50, Jan Beulich wrote:
>> On 15.02.2022 12:45, Oleksandr Andrushchenko wrote:
>>> I'm on your side, I just want to hear that we all agree pcidevs
>>> needs to be converted into rwlock according with the plan you
>>> suggested and at least now it seems to be an acceptable solution.
>> I'd like to express worries though about the conversion of this
>> recursive lock into an r/w one.
> Could you please elaborate more on this?
What if we just do the following:

static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED;
static DEFINE_RWLOCK(_pcidevs_rwlock);

void pcidevs_lock(void)
{
     read_lock(&_pcidevs_rwlock);
     spin_lock_recursive(&_pcidevs_lock);
}

void pcidevs_unlock(void)
{
     spin_unlock_recursive(&_pcidevs_lock);
     read_unlock(&_pcidevs_rwlock);
}

void pcidevs_read_lock(void)
{
     read_lock(&_pcidevs_rwlock);
}

void pcidevs_read_unlock(void)
{
     read_unlock(&_pcidevs_rwlock);
}

void pcidevs_write_lock(void)
{
     write_lock(&_pcidevs_rwlock);
}

void pcidevs_write_unlock(void)
{
     write_unlock(&_pcidevs_rwlock);
}

1. This way most of the code continues to use pcidevs_{lock|unlock}.
2. We need to change the writers, those which can add/remove pdevs, to use
pcidevs_write_{un}lock
3. Those which do not modify pdevs (vpci_{read|write}) will use
pcidevs_read_lock
4. We do not introduce d->vpci_rwlock and instead use pcidevs_{read|write}_lock,
as vpci doesn't seem to need to acquire _pcidevs_lock; we use pdev->vpci->lock
as it is now

Is this something which may address your worries?

Thank you,
Oleksandr

Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 13:50, Jan Beulich wrote:
> On 15.02.2022 12:45, Oleksandr Andrushchenko wrote:
>> I'm on your side, I just want to hear that we all agree pcidevs
>> needs to be converted into rwlock according with the plan you
>> suggested and at least now it seems to be an acceptable solution.
> I'd like to express worries though about the conversion of this
> recursive lock into an r/w one.
Could you please elaborate more on this?
I would love not to have 4th approach requested to be implemented ;)
> Jan
>
Thank you in advance,
Oleksandr

Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 13:39, Roger Pau Monné wrote:
> On Tue, Feb 15, 2022 at 11:12:23AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 15.02.22 12:48, Roger Pau Monné wrote:
>>> On Tue, Feb 15, 2022 at 10:11:35AM +0200, Oleksandr Andrushchenko wrote:
>>>> From: Oleksandr Andrushchenko 
>>>>
>>>> Introduce a per-domain read/write lock to check whether vpci is present,
>>>> so we are sure there are no accesses to the contents of the vpci struct
>>>> if not. This lock can be used (and in a few cases is used right away)
>>>> so that vpci removal can be performed while holding the lock in write
>>>> mode. Previously such removal could race with vpci_read for example.
>>>>
>>>> 1. Per-domain's vpci_rwlock is used to protect pdev->vpci structure
>>>> from being removed.
>>>>
>>>> 2. Writing the command register and ROM BAR register may trigger
>>>> modify_bars to run, which in turn may access multiple pdevs while
>>>> checking for the existing BAR's overlap. The overlapping check, if done
>>>> under the read lock, requires vpci->lock to be acquired on both devices
>>>> being compared, which may produce a deadlock. It is not possible to
>>>> upgrade read lock to write lock in such a case. So, in order to prevent
>>>> the deadlock, check which registers are going to be written and acquire
>>>> the lock in the appropriate mode from the beginning.
>>>>
>>>> All other code, which doesn't lead to pdev->vpci destruction and does not
>>>> access multiple pdevs at the same time, can still use a combination of the
>>>> read lock and pdev->vpci->lock.
>>>>
>>>> 3. Optimize if ROM BAR write lock required detection by caching offset
>>>> of the ROM BAR register in vpci->header->rom_reg which depends on
>>>> header's type.
>>>>
>>>> 4. Reduce locked region in vpci_remove_device as it is now possible
>>>> to set pdev->vpci to NULL early right after the write lock is acquired.
>>>>
>>>> 5. Reduce locked region in vpci_add_handlers as it is possible to
>>>> initialize many more fields of the struct vpci before assigning it to
>>>> pdev->vpci.
>>>>
>>>> 6. vpci_{add|remove}_register are required to be called with the write lock
>>>> held, but it is not feasible to add an assert there as it requires
>>>> struct domain to be passed for that. So, add a comment about this 
>>>> requirement
>>>> to these and other functions with the equivalent constraints.
>>>>
>>>> 7. Drop const qualifier where the new rwlock is used and this is 
>>>> appropriate.
>>>>
>>>> 8. Do not call process_pending_softirqs with any locks held. For that 
>>>> unlock
>>>> prior the call and re-acquire the locks after. After re-acquiring the
>>>> lock there is no need to check if pdev->vpci exists:
>>>>- in apply_map because of the context it is called (no race condition
>>>>  possible)
>>>>- for MSI/MSI-X debug code because it is called at the end of
>>>>  pdev->vpci access and no further access to pdev->vpci is made
>>>>
>>>> 9. Check for !pdev->vpci in vpci_{read|write} after acquiring the lock
>>>> and if so, allow reading or writing the hardware register directly. This is
>>>> acceptable as we only deal with Dom0 as of now. Once DomU support is
>>>> added the write will need to be ignored and read return all 0's for the
>>>> guests, while Dom0 can still access the registers directly.
>>>>
>>>> 10. Introduce pcidevs_trylock, so there is a possibility to try locking
>>>> the pcidev's lock.
>>>>
>>>> 11. Use pcidev's lock around for_each_pdev and pci_get_pdev_by_domain
>>>> while accessing pdevs in vpci code.
>>> So if you use the pcidevs_lock then it's impossible for the pdev or
>>> pdev->vpci to be removed or recreated, as the pcidevs lock protects
>>> any device operations (add, remove, assign, deassign).
>>>
>>> It's however not OK to use the pcidevs lock in vpci_{read,write}
>>> as-is, as the introduced contention is IMO not acceptable.
>>>
>>> The only viable option I see here is to:
>>>
>>>1. Make the pcidevs lock a rwlock: switch current callers to take the
>>>   lock in write mode, detect and fixup any issues that 

Re: [PATCH] tools: remove xenstore entries on vchan server closure

2022-02-15 Thread Oleksandr Andrushchenko
Anthony, could you please take a look?

Thank you in advance,
Oleksandr

On 10.12.21 14:35, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko 
>
> vchan server creates XenStore entries to advertise its event channel and
> ring, but those are not removed after the server quits.
> Add additional cleanup step, so those are removed, so clients do not try
> to connect to a non-existing server.
>
> Signed-off-by: Oleksandr Andrushchenko 
> ---
>   tools/include/libxenvchan.h |  5 +
>   tools/libs/vchan/init.c | 23 +++
>   tools/libs/vchan/io.c   |  4 
>   tools/libs/vchan/vchan.h| 31 +++
>   4 files changed, 63 insertions(+)
>   create mode 100644 tools/libs/vchan/vchan.h
>
> diff --git a/tools/include/libxenvchan.h b/tools/include/libxenvchan.h
> index d6010b145df2..30cc73cf97e3 100644
> --- a/tools/include/libxenvchan.h
> +++ b/tools/include/libxenvchan.h
> @@ -86,6 +86,11 @@ struct libxenvchan {
>   int blocking:1;
>   /* communication rings */
>   struct libxenvchan_ring read, write;
> + /**
> +  * Base xenstore path for storing ring/event data used by the server
> +  * during cleanup.
> +  * */
> + char *xs_path;
>   };
>   
>   /**
> diff --git a/tools/libs/vchan/init.c b/tools/libs/vchan/init.c
> index c8510e6ce98a..c6b8674ef541 100644
> --- a/tools/libs/vchan/init.c
> +++ b/tools/libs/vchan/init.c
> @@ -46,6 +46,8 @@
>   #include 
>   #include 
>   
> +#include "vchan.h"
> +
>   #ifndef PAGE_SHIFT
>   #define PAGE_SHIFT 12
>   #endif
> @@ -251,6 +253,10 @@ static int init_xs_srv(struct libxenvchan *ctrl, int 
> domain, const char* xs_base
>   char ref[16];
>   char* domid_str = NULL;
>   xs_transaction_t xs_trans = XBT_NULL;
> +
> + // store the base path so we can clean up on server closure
> + ctrl->xs_path = strdup(xs_base);
> +
>   xs = xs_open(0);
>   if (!xs)
>   goto fail;
> @@ -298,6 +304,23 @@ retry_transaction:
>   return ret;
>   }
>   
> +void close_xs_srv(struct libxenvchan *ctrl)
> +{
> + struct xs_handle *xs;
> +
> + if (!ctrl->xs_path)
> + return;
> +
> + xs = xs_open(0);
> + if (!xs)
> + goto fail;
> +
> + xs_rm(xs, XBT_NULL, ctrl->xs_path);
> +
> +fail:
> + free(ctrl->xs_path);
> +}
> +
>   static int min_order(size_t siz)
>   {
>   int rv = PAGE_SHIFT;
> diff --git a/tools/libs/vchan/io.c b/tools/libs/vchan/io.c
> index da303fbc01ca..1f201ad554f2 100644
> --- a/tools/libs/vchan/io.c
> +++ b/tools/libs/vchan/io.c
> @@ -40,6 +40,8 @@
>   #include 
>   #include 
>   
> +#include "vchan.h"
> +
>   #ifndef PAGE_SHIFT
>   #define PAGE_SHIFT 12
>   #endif
> @@ -384,5 +386,7 @@ void libxenvchan_close(struct libxenvchan *ctrl)
>   if (ctrl->gnttab)
>   xengnttab_close(ctrl->gnttab);
>   }
> + if (ctrl->is_server)
> + close_xs_srv(ctrl);
>   free(ctrl);
>   }
> diff --git a/tools/libs/vchan/vchan.h b/tools/libs/vchan/vchan.h
> new file mode 100644
> index ..621016ef42e5
> --- /dev/null
> +++ b/tools/libs/vchan/vchan.h
> @@ -0,0 +1,31 @@
> +/**
> + * @file
> + * @section AUTHORS
> + *
> + * Copyright (C) 2021 EPAM Systems Inc.
> + *
> + * @section LICENSE
> + *
> + *  This library is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU Lesser General Public
> + *  License as published by the Free Software Foundation; either
> + *  version 2.1 of the License, or (at your option) any later version.
> + *
> + *  This library is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + *  Lesser General Public License for more details.
> + *
> + *  You should have received a copy of the GNU Lesser General Public
> + *  License along with this library; If not, see 
> <http://www.gnu.org/licenses/>.
> + *
> + * @section DESCRIPTION
> + *
> + *  This file contains common libxenvchan declarations.
> + */
> +#ifndef LIBVCHAN_H
> +#define LIBVCHAN_H
> +
> +void close_xs_srv(struct libxenvchan *ctrl);
> +
> +#endif /* LIBVCHAN_H */


Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 12:48, Roger Pau Monné wrote:
> On Tue, Feb 15, 2022 at 10:11:35AM +0200, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko 
>>
>> Introduce a per-domain read/write lock to check whether vpci is present,
>> so we are sure there are no accesses to the contents of the vpci struct
>> if not. This lock can be used (and in a few cases is used right away)
>> so that vpci removal can be performed while holding the lock in write
>> mode. Previously such removal could race with vpci_read for example.
>>
>> 1. Per-domain's vpci_rwlock is used to protect pdev->vpci structure
>> from being removed.
>>
>> 2. Writing the command register and ROM BAR register may trigger
>> modify_bars to run, which in turn may access multiple pdevs while
>> checking for the existing BAR's overlap. The overlapping check, if done
>> under the read lock, requires vpci->lock to be acquired on both devices
>> being compared, which may produce a deadlock. It is not possible to
>> upgrade read lock to write lock in such a case. So, in order to prevent
>> the deadlock, check which registers are going to be written and acquire
>> the lock in the appropriate mode from the beginning.
>>
>> All other code, which doesn't lead to pdev->vpci destruction and does not
>> access multiple pdevs at the same time, can still use a combination of the
>> read lock and pdev->vpci->lock.
>>
>> 3. Optimize if ROM BAR write lock required detection by caching offset
>> of the ROM BAR register in vpci->header->rom_reg which depends on
>> header's type.
>>
>> 4. Reduce locked region in vpci_remove_device as it is now possible
>> to set pdev->vpci to NULL early right after the write lock is acquired.
>>
>> 5. Reduce locked region in vpci_add_handlers as it is possible to
>> initialize many more fields of the struct vpci before assigning it to
>> pdev->vpci.
>>
>> 6. vpci_{add|remove}_register are required to be called with the write lock
>> held, but it is not feasible to add an assert there as it requires
>> struct domain to be passed for that. So, add a comment about this requirement
>> to these and other functions with the equivalent constraints.
>>
>> 7. Drop const qualifier where the new rwlock is used and this is appropriate.
>>
>> 8. Do not call process_pending_softirqs with any locks held. For that unlock
>> prior the call and re-acquire the locks after. After re-acquiring the
>> lock there is no need to check if pdev->vpci exists:
>>   - in apply_map because of the context it is called (no race condition
>> possible)
>>   - for MSI/MSI-X debug code because it is called at the end of
>> pdev->vpci access and no further access to pdev->vpci is made
>>
>> 9. Check for !pdev->vpci in vpci_{read|write} after acquiring the lock
>> and if so, allow reading or writing the hardware register directly. This is
>> acceptable as we only deal with Dom0 as of now. Once DomU support is
>> added the write will need to be ignored and read return all 0's for the
>> guests, while Dom0 can still access the registers directly.
>>
>> 10. Introduce pcidevs_trylock, so there is a possibility to try locking
>> the pcidev's lock.
>>
>> 11. Use pcidev's lock around for_each_pdev and pci_get_pdev_by_domain
>> while accessing pdevs in vpci code.
> So if you use the pcidevs_lock then it's impossible for the pdev or
> pdev->vpci to be removed or recreated, as the pcidevs lock protects
> any device operations (add, remove, assign, deassign).
>
> It's however not OK to use the pcidevs lock in vpci_{read,write}
> as-is, as the introduced contention is IMO not acceptable.
>
> The only viable option I see here is to:
>
>   1. Make the pcidevs lock a rwlock: switch current callers to take the
>  lock in write mode, detect and fixup any issues that could arise
>  from the lock not being recursive anymore.
>   2. Take the lock in read mode around vpci_{read,write} sections that
>  rely on pdev (including the handlers).
>
> These items should be at least two separate patches. Let's not mix the
> conversion of pcidevs locks with the addition of vPCI support.
>
> I think with that we could get away without requiring a per-domain
> rwlock? Just doing lock ordering in modify_bars regarding
> tmp->vpci->lock vs pdev->vpci->lock. Neither pdev or vpci can go away
> while holding the pcidevs lock.
>
> Sorting the situation in modify_bars should also be done as a separate
> patch on top of 1. and 2.
So, to make it crystal clear: we do not go with the locking as done in this
patch, and instead we need to convert the pcidevs lock into an rwlock.
Meaning that I need to drop this patch.

Then, 3 patches to follow:
1. pcidevs as rwlock
2. vpci_{read|write} and the rest using new pcidevs rwlock
3. lock ordering in modify_bars

Is it what we want?

Thank you,
Oleksandr

Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 12:48, Roger Pau Monné wrote:
> On Tue, Feb 15, 2022 at 10:11:35AM +0200, Oleksandr Andrushchenko wrote:
> @@ -911,7 +914,11 @@ int vpci_msix_arch_print(const struct vpci_msix *msix)
>>   struct pci_dev *pdev = msix->pdev;
>>   
>>   spin_unlock(&msix->pdev->vpci->lock);
>> +pcidevs_unlock();
>> +read_unlock(&pdev->domain->vpci_rwlock);
>>   process_pending_softirqs();
>> +read_lock(&pdev->domain->vpci_rwlock);
>> +pcidevs_lock();
> This is again an ABBA situation: vpci_add_handlers will get called
> with pci_devs locked, and it will try to acquire the per-domain vpci
> lock (so pcidevs -> vpci_rwlock) while here and in other places in the
> patch to you have inverse locking order (vpci_rwlock -> pcidevs).
Indeed, I need to always lock in this order: pcidevs -> vpci_rwlock
to prevent ABBA, good catch
>
>>   /* NB: we assume that pdev cannot go away for an alive domain. 
>> */
>>   if ( !pdev->vpci || !spin_trylock(&pdev->vpci->lock) )
>>   return -EBUSY;
>> @@ -323,10 +334,18 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, 
>> unsigned int size)
>>   }
>>   
>>   /* Find the PCI dev matching the address. */
>> +pcidevs_lock();
>>   pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
>> +pcidevs_unlock();
>>   if ( !pdev )
>>   return vpci_read_hw(sbdf, reg, size);
> There's a window here (between dropping the pcidevs lock and acquiring
> the vpci_rwlock where either the pdev or pdev->vpci could be removed
> or recreated.
Yes, I know that. But this is the best I came up with...
>
> Thanks, Roger.
Thank you,
Oleksandr

Re: [PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 10:11, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko 
>
> @@ -171,8 +173,24 @@ static int __init apply_map(struct domain *d, const 
> struct pci_dev *pdev,
>   struct map_data data = { .d = d, .map = true };
>   int rc;
>   
> +ASSERT(rw_is_write_locked(&d->vpci_rwlock));
> +
>   while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
> +{
> +/*
> + * FIXME: apply_map is called from dom0 specific init code when
> + * system_state < SYS_STATE_active, so there is no race condition
> + * possible between this code and vpci_process_pending. So, neither
> + * vpci_process_pending may try to acquire the lock in read mode and
> + * also destroy pdev->vpci in its error path nor pdev may be disposed
> + * yet. This means that it is not required to check if the relevant
> + * pdev->vpci still exists after re-acquiring the lock.
> + */

> I'm not sure why you need to mention vpci_process_pending here:
> apply_map and defer_map are mutually exclusive, so given the current
> code it's impossible to get in a situation where apply_map is called
> while there's pending work on the vCPU (ie: v->vpci.mem != NULL).
>
> Also there's no need for a FIXME tag: the current approach doesn't
> require any fixes unless we start using apply_map in a different
> context.
>
> Hence I think the comment should be along the lines of:
>
> /*
>  * It's safe to drop and reacquire the lock in this context without
>  * risking pdev disappearing because devices cannot be removed until the
>  * initial domain has been started.
>  */
This sounds good, will use this
> 
> Thanks, Roger.



Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko


On 15.02.22 10:30, Roger Pau Monné wrote:
> On Mon, Feb 14, 2022 at 02:00:26PM +0000, Oleksandr Andrushchenko wrote:
>> /*
>> * FIXME: apply_map is called from dom0 specific init code when
>> * system_state < SYS_STATE_active, so there is no race condition
>> * possible between this code and vpci_process_pending. So, neither
>> * vpci_process_pending may try to acquire the lock in read mode and
>> * also destroy pdev->vpci in its error path nor pdev may be disposed yet.
>> * This means that it is not required to check if the relevant pdev
>> * still exists after re-acquiring the lock.
> I'm not sure why you need to mention vpci_process_pending here:
> apply_map and defer_map are mutually exclusive, so given the current
> code it's impossible to get in a situation where apply_map is called
> while there's pending work on the vCPU (ie: v->vpci.mem != NULL).
>
> Also there's no need for a FIXME tag: the current approach doesn't
> require any fixes unless we start using apply_map in a different
> context.
>
> Hence I think the comment should be along the lines of:
>
> /*
>   * It's safe to drop and reacquire the lock in this context without
>   * risking pdev disappearing because devices cannot be removed until the
>   * initial domain has been started.
>   */
Urgh, I've just sent v2. I'll move this there and answer
>
> Thanks, Roger.
>
Thank you,
Oleksandr

[PATCH v2] vpci: introduce per-domain lock to protect vpci structure

2022-02-15 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

Introduce a per-domain read/write lock to check whether vpci is present,
so we are sure there are no accesses to the contents of the vpci struct
if not. This lock can be used (and in a few cases is used right away)
so that vpci removal can be performed while holding the lock in write
mode. Previously such removal could race with vpci_read for example.

1. Per-domain's vpci_rwlock is used to protect pdev->vpci structure
from being removed.

2. Writing the command register and ROM BAR register may trigger
modify_bars to run, which in turn may access multiple pdevs while
checking for the existing BAR's overlap. The overlapping check, if done
under the read lock, requires vpci->lock to be acquired on both devices
being compared, which may produce a deadlock. It is not possible to
upgrade read lock to write lock in such a case. So, in order to prevent
the deadlock, check which registers are going to be written and acquire
the lock in the appropriate mode from the beginning.

All other code, which doesn't lead to pdev->vpci destruction and does not
access multiple pdevs at the same time, can still use a combination of the
read lock and pdev->vpci->lock.

3. Optimize if ROM BAR write lock required detection by caching offset
of the ROM BAR register in vpci->header->rom_reg which depends on
header's type.

4. Reduce locked region in vpci_remove_device as it is now possible
to set pdev->vpci to NULL early right after the write lock is acquired.

5. Reduce locked region in vpci_add_handlers as it is possible to
initialize many more fields of the struct vpci before assigning it to
pdev->vpci.

6. vpci_{add|remove}_register are required to be called with the write lock
held, but it is not feasible to add an assert there as it requires
struct domain to be passed for that. So, add a comment about this requirement
to these and other functions with the equivalent constraints.

7. Drop const qualifier where the new rwlock is used and this is appropriate.

8. Do not call process_pending_softirqs with any locks held. For that unlock
prior the call and re-acquire the locks after. After re-acquiring the
lock there is no need to check if pdev->vpci exists:
 - in apply_map because of the context it is called (no race condition
   possible)
 - for MSI/MSI-X debug code because it is called at the end of
   pdev->vpci access and no further access to pdev->vpci is made

9. Check for !pdev->vpci in vpci_{read|write} after acquiring the lock
and if so, allow reading or writing the hardware register directly. This is
acceptable as we only deal with Dom0 as of now. Once DomU support is
added the write will need to be ignored and read return all 0's for the
guests, while Dom0 can still access the registers directly.

10. Introduce pcidevs_trylock, so there is a possibility to try locking
the pcidev's lock.

11. Use pcidev's lock around for_each_pdev and pci_get_pdev_by_domain
while accessing pdevs in vpci code.

12. This is based on the discussion at [1].

[1] https://lore.kernel.org/all/20220204063459.680961-4-andr2...@gmail.com/
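
As an illustration of 2. and 3. above (a simplified sketch, not the exact
hunk; the helper name follows the rename noted in the changelog below),
the write path picks the lock mode up front:

    /* Command register and ROM BAR writes may run modify_bars. */
    write = vpci_header_need_write_lock(pdev, reg, size);
    if ( write )
        write_lock(&pdev->domain->vpci_rwlock);
    else
        read_lock(&pdev->domain->vpci_rwlock);

    spin_lock(&pdev->vpci->lock);
    /* ... dispatch to the registered vPCI handlers ... */
    spin_unlock(&pdev->vpci->lock);

    if ( write )
        write_unlock(&pdev->domain->vpci_rwlock);
    else
        read_unlock(&pdev->domain->vpci_rwlock);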

Suggested-by: Roger Pau Monné 
Suggested-by: Jan Beulich 
Signed-off-by: Oleksandr Andrushchenko 

---
This was checked on x86: with and without PVH Dom0.

Since v1:
- s/ASSERT(!!/ASSERT(
- move vpci_header_write_lock to vpci.c and rename to
  vpci_header_need_write_lock
- use a simple static overlap function instead of vpci_offset_cmp
- signal no ROM BAR with rom_reg == 0
- msix_accept: new line before return
- do not run process_pending_softirqs with locks held
- in-code comments update
- move rom_reg before rom_enabled in struct vpci. Roger, it is not
  possible to move it after 'type' as in this case it becomes per BAR
  and we need it per vpci
- add !pdev->vpci checks to vpci_{read|write}
- move ASSERT(pdev->vpci) in add_handlers under the write lock
- introduce pcidevs_trylock
- protect for_each_pdev with pcidevs lock
---
 xen/arch/x86/hvm/vmsi.c   |   7 +++
 xen/common/domain.c   |   3 +
 xen/drivers/passthrough/pci.c |   5 ++
 xen/drivers/vpci/header.c |  56 +++
 xen/drivers/vpci/msi.c|  25 -
 xen/drivers/vpci/msix.c   |  41 --
 xen/drivers/vpci/vpci.c   | 100 ++
 xen/include/xen/pci.h |   1 +
 xen/include/xen/sched.h   |   3 +
 xen/include/xen/vpci.h|   6 ++
 10 files changed, 215 insertions(+), 32 deletions(-)

diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 13e2a190b439..2a13c6581345 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -893,6 +893,9 @@ int vpci_msix_arch_print(const struct vpci_msix *msix)
 {
 unsigned int i;
 
+ASSERT(rw_is_locked(&msix->pdev->domain->vpci_rwlock));
+ASSERT(pcidevs_locked());
+
 for ( i = 0; i < msix->max_entries; i++ )
 {
const struct vpci_msix_entry *entry = &msix->entries[i];
@@ -911,7 +

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 16:31, Jan Beulich wrote:
> On 14.02.2022 15:26, Oleksandr Andrushchenko wrote:
>>
>> On 14.02.22 16:19, Jan Beulich wrote:
>>> On 09.02.2022 14:36, Oleksandr Andrushchenko wrote:
>>>> @@ -410,14 +428,37 @@ static void vpci_write_helper(const struct pci_dev 
>>>> *pdev,
>>>> r->private);
>>>>}
>>>>
>>>> +static bool vpci_header_write_lock(const struct pci_dev *pdev,
>>>> +   unsigned int start, unsigned int size)
>>>> +{
>>>> +/*
>>>> + * Writing the command register and ROM BAR register may trigger
>>>> + * modify_bars to run which in turn may access multiple pdevs while
>>>> + * checking for the existing BAR's overlap. The overlapping check, if 
>>>> done
>>>> + * under the read lock, requires vpci->lock to be acquired on both 
>>>> devices
>>>> + * being compared, which may produce a deadlock. It is not possible to
>>>> + * upgrade read lock to write lock in such a case. So, in order to 
>>>> prevent
>>>> + * the deadlock, check which registers are going to be written and 
>>>> acquire
>>>> + * the lock in the appropriate mode from the beginning.
>>>> + */
>>>> +if ( !vpci_offset_cmp(start, size, PCI_COMMAND, 2) )
>>>> +return true;
>>>> +
>>>> +if ( !vpci_offset_cmp(start, size, pdev->vpci->header.rom_reg, 4) )
>>>> +return true;
>>>> +
>>>> +return false;
>>>> +}
>>> A function of this name gives (especially at the call site(s)) the
>>> impression of acquiring a lock. Considering that of the prefixes
>>> neither "vpci" nor "header" are really relevant here, may I suggest
>>> to use need_write_lock()?
>>>
>>> May I further suggest that you either split the comment or combine
>>> the two if()-s (perhaps even straight into single return statement)?
>>> Personally I'd prefer the single return statement approach here ...
>> That was already questioned by Roger and now it looks like:
>>
>> static bool overlap(unsigned int r1_offset, unsigned int r1_size,
>>       unsigned int r2_offset, unsigned int r2_size)
>> {
>>       /* Return true if there is an overlap. */
>>       return r1_offset < r2_offset + r2_size && r2_offset < r1_offset + 
>> r1_size;
>> }
>>
>> bool vpci_header_write_lock(const struct pci_dev *pdev,
>>       unsigned int start, unsigned int size)
>> {
>>       /*
>>    * Writing the command register and ROM BAR register may trigger
>>    * modify_bars to run which in turn may access multiple pdevs while
>>    * checking for the existing BAR's overlap. The overlapping check, if 
>> done
>>    * under the read lock, requires vpci->lock to be acquired on both 
>> devices
>>    * being compared, which may produce a deadlock. It is not possible to
>>    * upgrade read lock to write lock in such a case. So, in order to 
>> prevent
>>    * the deadlock, check which registers are going to be written and 
>> acquire
>>    * the lock in the appropriate mode from the beginning.
>>    */
>>       if ( overlap(start, size, PCI_COMMAND, 2) ||
>>    (pdev->vpci->header.rom_reg &&
>>     overlap(start, size, pdev->vpci->header.rom_reg, 4)) )
>>       return true;
>>
>>       return false;
>> }
>>
>> vpci_header_write_lock moved to header.c and is not static anymore.
>> So, sitting in header.c, the name seems to be appropriate now
> The prefix of the name - yes. But as said, a function of this name looks
> as if it would acquire a lock. Imo you want to insert "need" or some
> such.
Agree. Then vpci_header_need_write_lock it is.
I will also update the comment, because it currently gives the impression
that the function acquires the lock.
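For illustration, the helper could then end up looking roughly like this
(a sketch only, folding in the rename, the single return statement and the
clarified comment; the actual patch may differ):

static bool overlap(unsigned int r1_offset, unsigned int r1_size,
                    unsigned int r2_offset, unsigned int r2_size)
{
    /* Return true if there is an overlap. */
    return r1_offset < r2_offset + r2_size && r2_offset < r1_offset + r1_size;
}

bool vpci_header_need_write_lock(const struct pci_dev *pdev,
                                 unsigned int start, unsigned int size)
{
    /*
     * Writes to the command register or the ROM BAR register may trigger
     * modify_bars(), which accesses multiple pdevs while checking for BAR
     * overlaps.  Doing that under the read lock would require taking
     * vpci->lock on both devices being compared, which can deadlock, and a
     * read lock cannot be upgraded to a write lock.  So report whether the
     * write needs the per-domain lock in write mode from the beginning.
     * Note: this helper only checks; it does not acquire any lock itself.
     */
    return overlap(start, size, PCI_COMMAND, 2) ||
           (pdev->vpci->header.rom_reg &&
            overlap(start, size, pdev->vpci->header.rom_reg, 4));
}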
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 16:19, Jan Beulich wrote:
> On 09.02.2022 14:36, Oleksandr Andrushchenko wrote:
>> @@ -410,14 +428,37 @@ static void vpci_write_helper(const struct pci_dev 
>> *pdev,
>>r->private);
>>   }
>>   
>> +static bool vpci_header_write_lock(const struct pci_dev *pdev,
>> +   unsigned int start, unsigned int size)
>> +{
>> +/*
>> + * Writing the command register and ROM BAR register may trigger
>> + * modify_bars to run which in turn may access multiple pdevs while
>> + * checking for the existing BAR's overlap. The overlapping check, if 
>> done
>> + * under the read lock, requires vpci->lock to be acquired on both 
>> devices
>> + * being compared, which may produce a deadlock. It is not possible to
>> + * upgrade read lock to write lock in such a case. So, in order to 
>> prevent
>> + * the deadlock, check which registers are going to be written and 
>> acquire
>> + * the lock in the appropriate mode from the beginning.
>> + */
>> +if ( !vpci_offset_cmp(start, size, PCI_COMMAND, 2) )
>> +return true;
>> +
>> +if ( !vpci_offset_cmp(start, size, pdev->vpci->header.rom_reg, 4) )
>> +return true;
>> +
>> +return false;
>> +}
> A function of this name gives (especially at the call site(s)) the
> impression of acquiring a lock. Considering that of the prefixes
> neither "vpci" nor "header" are really relevant here, may I suggest
> to use need_write_lock()?
>
> May I further suggest that you either split the comment or combine
> the two if()-s (perhaps even straight into single return statement)?
> Personally I'd prefer the single return statement approach here ...
That was already questioned by Roger and now it looks like:

static bool overlap(unsigned int r1_offset, unsigned int r1_size,
     unsigned int r2_offset, unsigned int r2_size)
{
     /* Return true if there is an overlap. */
     return r1_offset < r2_offset + r2_size && r2_offset < r1_offset + r1_size;
}

bool vpci_header_write_lock(const struct pci_dev *pdev,
     unsigned int start, unsigned int size)
{
     /*
  * Writing the command register and ROM BAR register may trigger
  * modify_bars to run which in turn may access multiple pdevs while
  * checking for the existing BAR's overlap. The overlapping check, if done
  * under the read lock, requires vpci->lock to be acquired on both devices
  * being compared, which may produce a deadlock. It is not possible to
  * upgrade read lock to write lock in such a case. So, in order to prevent
  * the deadlock, check which registers are going to be written and acquire
  * the lock in the appropriate mode from the beginning.
  */
     if ( overlap(start, size, PCI_COMMAND, 2) ||
  (pdev->vpci->header.rom_reg &&
   overlap(start, size, pdev->vpci->header.rom_reg, 4)) )
     return true;

     return false;
}

vpci_header_write_lock moved to header.c and is not static anymore.
So, sitting in header.c, the name seems to be appropriate now
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 15:48, Jan Beulich wrote:
> On 14.02.2022 14:27, Oleksandr Andrushchenko wrote:
>>
>> On 14.02.22 15:22, Jan Beulich wrote:
>>> On 14.02.2022 14:13, Oleksandr Andrushchenko wrote:
>>>> On 14.02.22 14:57, Jan Beulich wrote:
>>>>> On 14.02.2022 12:37, Oleksandr Andrushchenko wrote:
>>>>>> On 14.02.22 13:25, Roger Pau Monné wrote:
>>>>>>> On Mon, Feb 14, 2022 at 11:15:27AM +, Oleksandr Andrushchenko wrote:
>>>>>>>> On 14.02.22 13:11, Roger Pau Monné wrote:
>>>>>>>>> On Mon, Feb 14, 2022 at 10:53:43AM +, Oleksandr Andrushchenko 
>>>>>>>>> wrote:
>>>>>>>>>> On 14.02.22 12:34, Roger Pau Monné wrote:
>>>>>>>>>>> On Mon, Feb 14, 2022 at 09:36:39AM +, Oleksandr Andrushchenko 
>>>>>>>>>>> wrote:
>>>>>>>>>>>> On 11.02.22 13:40, Roger Pau Monné wrote:
>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>  for ( i = 0; i < msix->max_entries; i++ )
>>>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>>>  const struct vpci_msix_entry *entry = 
>>>>>>>>>>>>>>>> >entries[i];
>>>>>>>>>>>>>>> Since this function is now called with the per-domain rwlock 
>>>>>>>>>>>>>>> read
>>>>>>>>>>>>>>> locked it's likely not appropriate to call 
>>>>>>>>>>>>>>> process_pending_softirqs
>>>>>>>>>>>>>>> while holding such lock (check below).
>>>>>>>>>>>>>> You are right, as it is possible that:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> process_pending_softirqs -> vpci_process_pending -> read_lock
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Even more, vpci_process_pending may also
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> read_unlock -> vpci_remove_device -> write_lock
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> in its error path. So, any invocation of process_pending_softirqs
>>>>>>>>>>>>>> must not hold d->vpci_rwlock at least.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> And also we need to check that pdev->vpci was not removed
>>>>>>>>>>>>>> in between or *re-created*
>>>>>>>>>>>>>>> We will likely need to re-iterate over the list of pdevs 
>>>>>>>>>>>>>>> assigned to
>>>>>>>>>>>>>>> the domain and assert that the pdev is still assigned to the 
>>>>>>>>>>>>>>> same
>>>>>>>>>>>>>>> domain.
>>>>>>>>>>>>>> So, do you mean a pattern like the below should be used at all
>>>>>>>>>>>>>> places where we need to call process_pending_softirqs?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> read_unlock
>>>>>>>>>>>>>> process_pending_softirqs
>>>>>>>>>>>>>> read_lock
>>>>>>>>>>>>>> pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
>>>>>>>>>>>>>> if ( pdev && pdev->vpci && is_the_same_vpci(pdev->vpci) )
>>>>>>>>>>>>>> 
>>>>>>>>>>>>> Something along those lines. You likely need to continue iterate 
>>>>>>>>>>>>> using
>>>>>>>>>>>>> for_each_pdev.
>>>>>>>>>>>> How do we tell if pdev->vpci is the same? Jan has already brought
>>>>>>>>>>>> this question before [1] and I was about to use some ID for that 
>>>&

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 15:22, Jan Beulich wrote:
> On 14.02.2022 14:13, Oleksandr Andrushchenko wrote:
>>
>> On 14.02.22 14:57, Jan Beulich wrote:
>>> On 14.02.2022 12:37, Oleksandr Andrushchenko wrote:
>>>> On 14.02.22 13:25, Roger Pau Monné wrote:
>>>>> On Mon, Feb 14, 2022 at 11:15:27AM +, Oleksandr Andrushchenko wrote:
>>>>>> On 14.02.22 13:11, Roger Pau Monné wrote:
>>>>>>> On Mon, Feb 14, 2022 at 10:53:43AM +, Oleksandr Andrushchenko wrote:
>>>>>>>> On 14.02.22 12:34, Roger Pau Monné wrote:
>>>>>>>>> On Mon, Feb 14, 2022 at 09:36:39AM +, Oleksandr Andrushchenko 
>>>>>>>>> wrote:
>>>>>>>>>> On 11.02.22 13:40, Roger Pau Monné wrote:
>>>>>>>>>>> +
>>>>>>>>>>>>>> for ( i = 0; i < msix->max_entries; i++ )
>>>>>>>>>>>>>> {
>>>>>>>>>>>>>> const struct vpci_msix_entry *entry = 
>>>>>>>>>>>>>> >entries[i];
>>>>>>>>>>>>> Since this function is now called with the per-domain rwlock read
>>>>>>>>>>>>> locked it's likely not appropriate to call 
>>>>>>>>>>>>> process_pending_softirqs
>>>>>>>>>>>>> while holding such lock (check below).
>>>>>>>>>>>> You are right, as it is possible that:
>>>>>>>>>>>>
>>>>>>>>>>>> process_pending_softirqs -> vpci_process_pending -> read_lock
>>>>>>>>>>>>
>>>>>>>>>>>> Even more, vpci_process_pending may also
>>>>>>>>>>>>
>>>>>>>>>>>> read_unlock -> vpci_remove_device -> write_lock
>>>>>>>>>>>>
>>>>>>>>>>>> in its error path. So, any invocation of process_pending_softirqs
>>>>>>>>>>>> must not hold d->vpci_rwlock at least.
>>>>>>>>>>>>
>>>>>>>>>>>> And also we need to check that pdev->vpci was not removed
>>>>>>>>>>>> in between or *re-created*
>>>>>>>>>>>>> We will likely need to re-iterate over the list of pdevs assigned 
>>>>>>>>>>>>> to
>>>>>>>>>>>>> the domain and assert that the pdev is still assigned to the same
>>>>>>>>>>>>> domain.
>>>>>>>>>>>> So, do you mean a pattern like the below should be used at all
>>>>>>>>>>>> places where we need to call process_pending_softirqs?
>>>>>>>>>>>>
>>>>>>>>>>>> read_unlock
>>>>>>>>>>>> process_pending_softirqs
>>>>>>>>>>>> read_lock
>>>>>>>>>>>> pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
>>>>>>>>>>>> if ( pdev && pdev->vpci && is_the_same_vpci(pdev->vpci) )
>>>>>>>>>>>> 
>>>>>>>>>>> Something along those lines. You likely need to continue iterate 
>>>>>>>>>>> using
>>>>>>>>>>> for_each_pdev.
>>>>>>>>>> How do we tell if pdev->vpci is the same? Jan has already brought
>>>>>>>>>> this question before [1] and I was about to use some ID for that 
>>>>>>>>>> purpose:
>>>>>>>>>> pdev->vpci->id = d->vpci_id++ and then we use pdev->vpci->id  for 
>>>>>>>>>> checks
>>>>>>>>> Given this is a debug message I would be OK with just doing the
>>>>>>>>> minimal checks to prevent Xen from crashing (ie: pdev->vpci exists)
>>>>>>>>> and that the resume MSI entry is not past the current limit. Otherwise
>>>>>>>>> just print a message and move on to the next device.
>>>>>>>> Agree, I see no big issue (probably) if we are not able to print
>>>>>

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 14:57, Jan Beulich wrote:
> On 14.02.2022 12:37, Oleksandr Andrushchenko wrote:
>>
>> On 14.02.22 13:25, Roger Pau Monné wrote:
>>> On Mon, Feb 14, 2022 at 11:15:27AM +, Oleksandr Andrushchenko wrote:
>>>> On 14.02.22 13:11, Roger Pau Monné wrote:
>>>>> On Mon, Feb 14, 2022 at 10:53:43AM +, Oleksandr Andrushchenko wrote:
>>>>>> On 14.02.22 12:34, Roger Pau Monné wrote:
>>>>>>> On Mon, Feb 14, 2022 at 09:36:39AM +, Oleksandr Andrushchenko wrote:
>>>>>>>> On 11.02.22 13:40, Roger Pau Monné wrote:
>>>>>>>>> +
>>>>>>>>>>>>for ( i = 0; i < msix->max_entries; i++ )
>>>>>>>>>>>>{
>>>>>>>>>>>>const struct vpci_msix_entry *entry = 
>>>>>>>>>>>> >entries[i];
>>>>>>>>>>> Since this function is now called with the per-domain rwlock read
>>>>>>>>>>> locked it's likely not appropriate to call process_pending_softirqs
>>>>>>>>>>> while holding such lock (check below).
>>>>>>>>>> You are right, as it is possible that:
>>>>>>>>>>
>>>>>>>>>> process_pending_softirqs -> vpci_process_pending -> read_lock
>>>>>>>>>>
>>>>>>>>>> Even more, vpci_process_pending may also
>>>>>>>>>>
>>>>>>>>>> read_unlock -> vpci_remove_device -> write_lock
>>>>>>>>>>
>>>>>>>>>> in its error path. So, any invocation of process_pending_softirqs
>>>>>>>>>> must not hold d->vpci_rwlock at least.
>>>>>>>>>>
>>>>>>>>>> And also we need to check that pdev->vpci was not removed
>>>>>>>>>> in between or *re-created*
>>>>>>>>>>> We will likely need to re-iterate over the list of pdevs assigned to
>>>>>>>>>>> the domain and assert that the pdev is still assigned to the same
>>>>>>>>>>> domain.
>>>>>>>>>> So, do you mean a pattern like the below should be used at all
>>>>>>>>>> places where we need to call process_pending_softirqs?
>>>>>>>>>>
>>>>>>>>>> read_unlock
>>>>>>>>>> process_pending_softirqs
>>>>>>>>>> read_lock
>>>>>>>>>> pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
>>>>>>>>>> if ( pdev && pdev->vpci && is_the_same_vpci(pdev->vpci) )
>>>>>>>>>> 
>>>>>>>>> Something along those lines. You likely need to continue iterate using
>>>>>>>>> for_each_pdev.
>>>>>>>> How do we tell if pdev->vpci is the same? Jan has already brought
>>>>>>>> this question before [1] and I was about to use some ID for that 
>>>>>>>> purpose:
>>>>>>>> pdev->vpci->id = d->vpci_id++ and then we use pdev->vpci->id  for 
>>>>>>>> checks
>>>>>>> Given this is a debug message I would be OK with just doing the
>>>>>>> minimal checks to prevent Xen from crashing (ie: pdev->vpci exists)
>>>>>>> and that the resume MSI entry is not past the current limit. Otherwise
>>>>>>> just print a message and move on to the next device.
>>>>>> Agree, I see no big issue (probably) if we are not able to print
>>>>>>
>>>>>> How about this one:
>>>>>>
>>>>>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>>>>>> index 809a6b4773e1..50373f04da82 100644
>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>> @@ -171,10 +171,31 @@ static int __init apply_map(struct domain *d, 
>>>>>> const struct pci_dev *pdev,
>>>>>>  struct rangeset *mem, uint16_t cmd)
>>>>>>  {
>>>>>>  struct map_data data = { .d = d, .map = true };
>>>>>> +    pci_s

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 13:25, Roger Pau Monné wrote:
> On Mon, Feb 14, 2022 at 11:15:27AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 14.02.22 13:11, Roger Pau Monné wrote:
>>> On Mon, Feb 14, 2022 at 10:53:43AM +, Oleksandr Andrushchenko wrote:
>>>> On 14.02.22 12:34, Roger Pau Monné wrote:
>>>>> On Mon, Feb 14, 2022 at 09:36:39AM +, Oleksandr Andrushchenko wrote:
>>>>>> On 11.02.22 13:40, Roger Pau Monné wrote:
>>>>>>> +
>>>>>>>>>>   for ( i = 0; i < msix->max_entries; i++ )
>>>>>>>>>>   {
>>>>>>>>>>   const struct vpci_msix_entry *entry = 
>>>>>>>>>> >entries[i];
>>>>>>>>> Since this function is now called with the per-domain rwlock read
>>>>>>>>> locked it's likely not appropriate to call process_pending_softirqs
>>>>>>>>> while holding such lock (check below).
>>>>>>>> You are right, as it is possible that:
>>>>>>>>
>>>>>>>> process_pending_softirqs -> vpci_process_pending -> read_lock
>>>>>>>>
>>>>>>>> Even more, vpci_process_pending may also
>>>>>>>>
>>>>>>>> read_unlock -> vpci_remove_device -> write_lock
>>>>>>>>
>>>>>>>> in its error path. So, any invocation of process_pending_softirqs
>>>>>>>> must not hold d->vpci_rwlock at least.
>>>>>>>>
>>>>>>>> And also we need to check that pdev->vpci was not removed
>>>>>>>> in between or *re-created*
>>>>>>>>> We will likely need to re-iterate over the list of pdevs assigned to
>>>>>>>>> the domain and assert that the pdev is still assigned to the same
>>>>>>>>> domain.
>>>>>>>> So, do you mean a pattern like the below should be used at all
>>>>>>>> places where we need to call process_pending_softirqs?
>>>>>>>>
>>>>>>>> read_unlock
>>>>>>>> process_pending_softirqs
>>>>>>>> read_lock
>>>>>>>> pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
>>>>>>>> if ( pdev && pdev->vpci && is_the_same_vpci(pdev->vpci) )
>>>>>>>> 
>>>>>>> Something along those lines. You likely need to continue iterate using
>>>>>>> for_each_pdev.
>>>>>> How do we tell if pdev->vpci is the same? Jan has already brought
>>>>>> this question before [1] and I was about to use some ID for that purpose:
>>>>>> pdev->vpci->id = d->vpci_id++ and then we use pdev->vpci->id  for checks
>>>>> Given this is a debug message I would be OK with just doing the
>>>>> minimal checks to prevent Xen from crashing (ie: pdev->vpci exists)
>>>>> and that the resume MSI entry is not past the current limit. Otherwise
>>>>> just print a message and move on to the next device.
>>>> Agree, I see no big issue (probably) if we are not able to print
>>>>
>>>> How about this one:
>>>>
>>>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>>>> index 809a6b4773e1..50373f04da82 100644
>>>> --- a/xen/drivers/vpci/header.c
>>>> +++ b/xen/drivers/vpci/header.c
>>>> @@ -171,10 +171,31 @@ static int __init apply_map(struct domain *d, const 
>>>> struct pci_dev *pdev,
>>>> struct rangeset *mem, uint16_t cmd)
>>>> {
>>>> struct map_data data = { .d = d, .map = true };
>>>> +    pci_sbdf_t sbdf = pdev->sbdf;
>>>> int rc;
>>>>
>>>> + ASSERT(rw_is_write_locked(>domain->vpci_rwlock));
>>>> +
>>>> while ( (rc = rangeset_consume_ranges(mem, map_range, )) == 
>>>> -ERESTART )
>>>> +    {
>>>> +
>>>> +    /*
>>>> + * process_pending_softirqs may trigger vpci_process_pending which
>>>> + * may need to acquire pdev->domain->vpci_rwlock in read mode.
>>>> + */
>>>> +    write_unlock(>domain->vpci_rwlock);
>>>> p

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 13:11, Roger Pau Monné wrote:
> On Mon, Feb 14, 2022 at 10:53:43AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 14.02.22 12:34, Roger Pau Monné wrote:
>>> On Mon, Feb 14, 2022 at 09:36:39AM +, Oleksandr Andrushchenko wrote:
>>>> On 11.02.22 13:40, Roger Pau Monné wrote:
>>>>> +
>>>>>>>>  for ( i = 0; i < msix->max_entries; i++ )
>>>>>>>>  {
>>>>>>>>  const struct vpci_msix_entry *entry = >entries[i];
>>>>>>> Since this function is now called with the per-domain rwlock read
>>>>>>> locked it's likely not appropriate to call process_pending_softirqs
>>>>>>> while holding such lock (check below).
>>>>>> You are right, as it is possible that:
>>>>>>
>>>>>> process_pending_softirqs -> vpci_process_pending -> read_lock
>>>>>>
>>>>>> Even more, vpci_process_pending may also
>>>>>>
>>>>>> read_unlock -> vpci_remove_device -> write_lock
>>>>>>
>>>>>> in its error path. So, any invocation of process_pending_softirqs
>>>>>> must not hold d->vpci_rwlock at least.
>>>>>>
>>>>>> And also we need to check that pdev->vpci was not removed
>>>>>> in between or *re-created*
>>>>>>> We will likely need to re-iterate over the list of pdevs assigned to
>>>>>>> the domain and assert that the pdev is still assigned to the same
>>>>>>> domain.
>>>>>> So, do you mean a pattern like the below should be used at all
>>>>>> places where we need to call process_pending_softirqs?
>>>>>>
>>>>>> read_unlock
>>>>>> process_pending_softirqs
>>>>>> read_lock
>>>>>> pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
>>>>>> if ( pdev && pdev->vpci && is_the_same_vpci(pdev->vpci) )
>>>>>> 
>>>>> Something along those lines. You likely need to continue iterate using
>>>>> for_each_pdev.
>>>> How do we tell if pdev->vpci is the same? Jan has already brought
>>>> this question before [1] and I was about to use some ID for that purpose:
>>>> pdev->vpci->id = d->vpci_id++ and then we use pdev->vpci->id  for checks
>>> Given this is a debug message I would be OK with just doing the
>>> minimal checks to prevent Xen from crashing (ie: pdev->vpci exists)
>>> and that the resume MSI entry is not past the current limit. Otherwise
>>> just print a message and move on to the next device.
>> Agree, I see no big issue (probably) if we are not able to print
>>
>> How about this one:
>>
>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>> index 809a6b4773e1..50373f04da82 100644
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -171,10 +171,31 @@ static int __init apply_map(struct domain *d, const 
>> struct pci_dev *pdev,
>>    struct rangeset *mem, uint16_t cmd)
>>    {
>>    struct map_data data = { .d = d, .map = true };
>> +    pci_sbdf_t sbdf = pdev->sbdf;
>>    int rc;
>>
>> + ASSERT(rw_is_write_locked(>domain->vpci_rwlock));
>> +
>>    while ( (rc = rangeset_consume_ranges(mem, map_range, )) == 
>> -ERESTART )
>> +    {
>> +
>> +    /*
>> + * process_pending_softirqs may trigger vpci_process_pending which
>> + * may need to acquire pdev->domain->vpci_rwlock in read mode.
>> + */
>> +    write_unlock(>domain->vpci_rwlock);
>>    process_pending_softirqs();
>> +    write_lock(>domain->vpci_rwlock);
>> +
>> +    /* Check if pdev still exists and vPCI was not removed or 
>> re-created. */
>> +    if (pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn) != 
>> pdev)
>> +    if ( vpci is NOT the same )
>> +    {
>> +    rc = 0;
>> +    break;
>> +    }
>> +    }
>> +
>>    rangeset_destroy(mem);
>>    if ( !rc )
>>    modify_decoding(pdev, cmd, false);
>>
>> This one also wants process_pending_softirqs to run so it *might*
>> want pdev and vpci checks. But at the same time apply_map runs
>> at ( system_state < SYS_STATE_active ), so defer_map won't be
>> running yet, thus no vpci_process_pending is possible yet (in terms
>> it has something to do yet). So, I think we just need:
>>
>>       write_unlock(>domain->vpci_rwlock);
>>       process_pending_softirqs();
>>       write_lock(>domain->vpci_rwlock);
>>
>> and this should be enough
> Given the context apply_map is called from (dom0 specific init code),
> there's no need to check for the pdev to still exits, or whether vpci
> has been recreated, as it's not possible. Just add a comment to
> explicitly note that the context of the function is special, and thus
> there's no possibility of either the device or vpci going away.
Does it really need the write_unlock/write_lock given the context?...
I think it doesn't, as there is no chance defer_map is called, thus
process_pending_softirqs -> vpci_process_pending -> read_lock
is not possible.
I'll just add a comment about that.
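Roughly, the loop in apply_map() could then carry a comment like this
(sketch only):

    /*
     * It's safe to call process_pending_softirqs() while holding the
     * per-domain vpci_rwlock in write mode here: apply_map() only runs
     * while system_state < SYS_STATE_active, so no BAR mappings have been
     * deferred yet and vpci_process_pending() has nothing to do, hence it
     * won't try to take the lock in read mode.
     */
    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
        process_pending_softirqs();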
> Thanks, Roger.
Thank you,
Oleksandr

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 12:34, Roger Pau Monné wrote:
> On Mon, Feb 14, 2022 at 09:36:39AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 11.02.22 13:40, Roger Pau Monné wrote:
>>> +
>>>>>> for ( i = 0; i < msix->max_entries; i++ )
>>>>>> {
>>>>>> const struct vpci_msix_entry *entry = >entries[i];
>>>>> Since this function is now called with the per-domain rwlock read
>>>>> locked it's likely not appropriate to call process_pending_softirqs
>>>>> while holding such lock (check below).
>>>> You are right, as it is possible that:
>>>>
>>>> process_pending_softirqs -> vpci_process_pending -> read_lock
>>>>
>>>> Even more, vpci_process_pending may also
>>>>
>>>> read_unlock -> vpci_remove_device -> write_lock
>>>>
>>>> in its error path. So, any invocation of process_pending_softirqs
>>>> must not hold d->vpci_rwlock at least.
>>>>
>>>> And also we need to check that pdev->vpci was not removed
>>>> in between or *re-created*
>>>>> We will likely need to re-iterate over the list of pdevs assigned to
>>>>> the domain and assert that the pdev is still assigned to the same
>>>>> domain.
>>>> So, do you mean a pattern like the below should be used at all
>>>> places where we need to call process_pending_softirqs?
>>>>
>>>> read_unlock
>>>> process_pending_softirqs
>>>> read_lock
>>>> pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
>>>> if ( pdev && pdev->vpci && is_the_same_vpci(pdev->vpci) )
>>>> 
>>> Something along those lines. You likely need to continue iterate using
>>> for_each_pdev.
>> How do we tell if pdev->vpci is the same? Jan has already brought
>> this question before [1] and I was about to use some ID for that purpose:
>> pdev->vpci->id = d->vpci_id++ and then we use pdev->vpci->id  for checks
> Given this is a debug message I would be OK with just doing the
> minimal checks to prevent Xen from crashing (ie: pdev->vpci exists)
> and that the resume MSI entry is not past the current limit. Otherwise
> just print a message and move on to the next device.
Agree, I see no big issue (probably) if we are not able to print

How about this one:

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 809a6b4773e1..50373f04da82 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -171,10 +171,31 @@ static int __init apply_map(struct domain *d, const 
struct pci_dev *pdev,
  struct rangeset *mem, uint16_t cmd)
  {
  struct map_data data = { .d = d, .map = true };
+    pci_sbdf_t sbdf = pdev->sbdf;
  int rc;

+ ASSERT(rw_is_write_locked(&pdev->domain->vpci_rwlock));
+
  while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == 
-ERESTART )
+    {
+
+    /*
+ * process_pending_softirqs may trigger vpci_process_pending which
+ * may need to acquire pdev->domain->vpci_rwlock in read mode.
+ */
+    write_unlock(&pdev->domain->vpci_rwlock);
  process_pending_softirqs();
+    write_lock(&pdev->domain->vpci_rwlock);
+
+    /* Check if pdev still exists and vPCI was not removed or re-created. 
*/
+    if (pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn) != pdev)
+    if ( vpci is NOT the same )
+    {
+    rc = 0;
+    break;
+    }
+    }
+
  rangeset_destroy(mem);
  if ( !rc )
  modify_decoding(pdev, cmd, false);

This one also wants process_pending_softirqs to run, so it *might*
want the pdev and vpci checks. But at the same time apply_map runs
at ( system_state < SYS_STATE_active ), so defer_map won't be
running yet, and thus vpci_process_pending has nothing to do yet.
So, I think we just need:

     write_unlock(&pdev->domain->vpci_rwlock);
     process_pending_softirqs();
     write_lock(&pdev->domain->vpci_rwlock);

and this should be enough
>
> The recreating of pdev->vpci only occurs as a result of some admin
> operations, and doing it while also trying to print the current MSI
> status is not a reliable approach. So dumping an incomplete or
> incoherent state as a result of ongoing admin operations would be
> fine.
Ok
>
> Thanks, Roger.
>
Thank you,
Oleksandr

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 11.02.22 13:40, Roger Pau Monné wrote:
> +
for ( i = 0; i < msix->max_entries; i++ )
{
const struct vpci_msix_entry *entry = >entries[i];
>>> Since this function is now called with the per-domain rwlock read
>>> locked it's likely not appropriate to call process_pending_softirqs
>>> while holding such lock (check below).
>> You are right, as it is possible that:
>>
>> process_pending_softirqs -> vpci_process_pending -> read_lock
>>
>> Even more, vpci_process_pending may also
>>
>> read_unlock -> vpci_remove_device -> write_lock
>>
>> in its error path. So, any invocation of process_pending_softirqs
>> must not hold d->vpci_rwlock at least.
>>
>> And also we need to check that pdev->vpci was not removed
>> in between or *re-created*
>>> We will likely need to re-iterate over the list of pdevs assigned to
>>> the domain and assert that the pdev is still assigned to the same
>>> domain.
>> So, do you mean a pattern like the below should be used at all
>> places where we need to call process_pending_softirqs?
>>
>> read_unlock
>> process_pending_softirqs
>> read_lock
>> pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
>> if ( pdev && pdev->vpci && is_the_same_vpci(pdev->vpci) )
>> 
> Something along those lines. You likely need to continue iterate using
> for_each_pdev.
How do we tell if pdev->vpci is the same? Jan has already raised
this question before [1], and I was about to use some ID for that purpose:
pdev->vpci->id = d->vpci_id++, and then use pdev->vpci->id for the checks.
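Purely as an illustration of what I mean (the 'id' field and the helper are
hypothetical, nothing like this has been posted yet):

/*
 * Hypothetical: 'id' would be a new field in struct vpci, assigned from a
 * per-domain counter (d->vpci_id++) when the vpci struct is allocated, so
 * a re-created vpci gets a different value than the one sampled before
 * dropping the lock.
 */
static bool vpci_is_same(const struct domain *d, pci_sbdf_t sbdf,
                         uint64_t expected_id)
{
    const struct pci_dev *pdev =
        pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);

    return pdev && pdev->vpci && pdev->vpci->id == expected_id;
}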

Thank you,
Oleksandr

[1] https://www.mail-archive.com/xen-devel@lists.xenproject.org/msg113790.html

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-14 Thread Oleksandr Andrushchenko


On 14.02.22 10:47, Roger Pau Monné wrote:
> On Mon, Feb 14, 2022 at 06:33:07AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 11.02.22 17:44, Roger Pau Monné wrote:
>>> On Fri, Feb 11, 2022 at 12:13:38PM +, Oleksandr Andrushchenko wrote:
>>>> On 11.02.22 13:40, Roger Pau Monné wrote:
>>>>> On Fri, Feb 11, 2022 at 07:27:39AM +, Oleksandr Andrushchenko wrote:
>>>>>> Hi, Roger!
>>>>>>
>>>>>> On 10.02.22 18:16, Roger Pau Monné wrote:
>>>>>>> On Wed, Feb 09, 2022 at 03:36:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>>> From: Oleksandr Andrushchenko 
>>>>>>>>
>>>>>>>> Introduce a per-domain read/write lock to check whether vpci is 
>>>>>>>> present,
>>>>>>>> so we are sure there are no accesses to the contents of the vpci struct
>>>>>>>> if not. This lock can be used (and in a few cases is used right away)
>>>>>>>> so that vpci removal can be performed while holding the lock in write
>>>>>>>> mode. Previously such removal could race with vpci_read for example.
>>>>>>> Sadly there's still a race in the usage of pci_get_pdev_by_domain wrt
>>>>>>> pci_remove_device, and likely when vPCI gets also used in
>>>>>>> {de}assign_device I think.
>>>>>> Yes, this is indeed an issue, but I was not trying to solve it in
>>>>>> context of vPCI locking yet. I think we should discuss how do
>>>>>> we approach pdev locking, so I can create a patch for that.
>>>>>> that being said, I would like not to solve pdev in  this patch yet
>>>>>>
>>>>>> ...I do understand we do want to avoid that, but at the moment
>>>>>> a single reliable way for making sure pdev is alive seems to
>>>>>> be pcidevs_lock
>>>>> I think we will need to make pcidevs_lock a rwlock and take it in read
>>>>> mode for pci_get_pdev_by_domain.
>>>>>
>>>>> We didn't have this scenario before where PCI emulation is done in the
>>>>> hypervisor, and hence the locking around those data structures has not
>>>>> been designed for those use-cases.
>>>> Yes, I do understand that.
>>>> I hope pcidevs lock move to rwlock can be done as a separate
>>>> patch. While this is not done, do you think we can proceed with
>>>> vPCI series and pcidevs locking re-work being done after?
>>> Ideally we would like to sort out the locking once and for all. I
>>> would like to be sure that what we introduce now doesn't turn out to
>>> interact badly when we decide to look at the pcidevs locking issue.
>> Ok, so I'll start converting pcidevs into rwlock then
> Sorry, maybe I didn't express myself correctly, since the current
> series doesn't lead to a functional implementation of vPCI for domUs I
> would be fine with postponing the locking work, as long as the
> currently introduced code doesn't make it any worse or extend the
> locking scheme into new paths, but maybe that's not very helpful.
Indeed, I probably misunderstood you. Great, so we can continue
working on the vPCI series, which, once accepted, will unblock the
MSI/MSI-X work that depends on vPCI. Then, in parallel with the MSI work,
we can start re-working the pcidevs locking.
>
> Thanks, Roger.
Thank you,
Oleksandr

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-13 Thread Oleksandr Andrushchenko


On 11.02.22 17:44, Roger Pau Monné wrote:
> On Fri, Feb 11, 2022 at 12:13:38PM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 11.02.22 13:40, Roger Pau Monné wrote:
>>> On Fri, Feb 11, 2022 at 07:27:39AM +, Oleksandr Andrushchenko wrote:
>>>> Hi, Roger!
>>>>
>>>> On 10.02.22 18:16, Roger Pau Monné wrote:
>>>>> On Wed, Feb 09, 2022 at 03:36:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>> From: Oleksandr Andrushchenko 
>>>>>>
>>>>>> Introduce a per-domain read/write lock to check whether vpci is present,
>>>>>> so we are sure there are no accesses to the contents of the vpci struct
>>>>>> if not. This lock can be used (and in a few cases is used right away)
>>>>>> so that vpci removal can be performed while holding the lock in write
>>>>>> mode. Previously such removal could race with vpci_read for example.
>>>>> Sadly there's still a race in the usage of pci_get_pdev_by_domain wrt
>>>>> pci_remove_device, and likely when vPCI gets also used in
>>>>> {de}assign_device I think.
>>>> Yes, this is indeed an issue, but I was not trying to solve it in
>>>> context of vPCI locking yet. I think we should discuss how do
>>>> we approach pdev locking, so I can create a patch for that.
>>>> that being said, I would like not to solve pdev in  this patch yet
>>>>
>>>> ...I do understand we do want to avoid that, but at the moment
>>>> a single reliable way for making sure pdev is alive seems to
>>>> be pcidevs_lock
>>> I think we will need to make pcidevs_lock a rwlock and take it in read
>>> mode for pci_get_pdev_by_domain.
>>>
>>> We didn't have this scenario before where PCI emulation is done in the
>>> hypervisor, and hence the locking around those data structures has not
>>> been designed for those use-cases.
>> Yes, I do understand that.
>> I hope pcidevs lock move to rwlock can be done as a separate
>> patch. While this is not done, do you think we can proceed with
>> vPCI series and pcidevs locking re-work being done after?
> Ideally we would like to sort out the locking once and for all. I
> would like to be sure that what we introduce now doesn't turn out to
> interact badly when we decide to look at the pcidevs locking issue.
Ok, so I'll start converting pcidevs into rwlock then
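To make the direction concrete, a very rough sketch of what such a
conversion could look like (entirely hypothetical; it also glosses over the
fact that today's pcidevs_lock is taken recursively on some paths, which is
one of the things the real conversion will have to deal with):

/* Today this is a recursive spinlock; the sketch turns it into an rwlock. */
static DEFINE_RWLOCK(_pcidevs_rwlock);

/* Writers: paths that add, remove or reassign devices. */
void pcidevs_lock(void)   { write_lock(&_pcidevs_rwlock); }
void pcidevs_unlock(void) { write_unlock(&_pcidevs_rwlock); }

/* Readers: lookups such as pci_get_pdev_by_domain() from vPCI handlers. */
void pcidevs_read_lock(void)   { read_lock(&_pcidevs_rwlock); }
void pcidevs_read_unlock(void) { read_unlock(&_pcidevs_rwlock); }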
>
> Thanks, Roger.
Thank you,
Oleksandr

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-11 Thread Oleksandr Andrushchenko


On 11.02.22 13:51, Roger Pau Monné wrote:
> On Fri, Feb 11, 2022 at 08:46:59AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 10.02.22 18:16, Roger Pau Monné wrote:
>>> On Wed, Feb 09, 2022 at 03:36:27PM +0200, Oleksandr Andrushchenko wrote:
>>>> From: Oleksandr Andrushchenko 
>>>>
>>>> Introduce a per-domain read/write lock to check whether vpci is present,
>>>> so we are sure there are no accesses to the contents of the vpci struct
>>>> if not. This lock can be used (and in a few cases is used right away)
>>>> so that vpci removal can be performed while holding the lock in write
>>>> mode. Previously such removal could race with vpci_read for example.
>>> Sadly there's still a race in the usage of pci_get_pdev_by_domain wrt
>>> pci_remove_device, and likely when vPCI gets also used in
>>> {de}assign_device I think.
>>>
>> How about the below? It seems to guarantee that we can access pdev
>> without issues and without requiring pcidevs_lock to be used?
> Hm, I'm unsure this is correct.
Yes, we need pcidevs as rwlock in order to solve this reliably...
>   It's in general a bad idea to use a
> per-domain lock approach to protect the consistency of elements moving
> between domains.
>
> In order for this to be safe you will likely need to hold both the
> source and the destination per-domain locks, and then you could also
> get into ABBA lock issues unless you always take the lock in the same
> order.
>
> I think it's safer to use a global lock in this case (pcidevs_lock).
>
>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>> index e8b09d77d880..fd464a58b3b3 100644
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -937,8 +937,14 @@ static int deassign_device(struct domain *d, uint16_t 
>> seg, uint8_t bus,
>>    }
>>
>>    devfn = pdev->devfn;
>> +#ifdef CONFIG_HAS_VPCI
>> +    write_lock(>vpci_rwlock);
>> +#endif
>>    ret = iommu_call(hd->platform_ops, reassign_device, d, target, devfn,
>>     pci_to_dev(pdev));
>> +#ifdef CONFIG_HAS_VPCI
>> +    write_unlock(>vpci_rwlock);
>> +#endif
>>    if ( ret )
>>    goto out;
>>
>> @@ -1474,6 +1480,9 @@ static int assign_device(struct domain *d, u16 seg, u8 
>> bus, u8 devfn, u32 flag)
>>    const struct domain_iommu *hd = dom_iommu(d);
>>    struct pci_dev *pdev;
>>    int rc = 0;
>> +#ifdef CONFIG_HAS_VPCI
>> +    struct domain *old_d;
>> +#endif
>>
>>    if ( !is_iommu_enabled(d) )
>>    return 0;
>> @@ -1487,15 +1496,34 @@ static int assign_device(struct domain *d, u16 seg, 
>> u8 bus, u8 devfn, u32 flag)
>>    ASSERT(pdev && (pdev->domain == hardware_domain ||
>>    pdev->domain == dom_io));
>>
>> +#ifdef CONFIG_HAS_VPCI
>> +    /* pdev->domain is either hwdom or dom_io. We do not want the later. */
>> +    old_d = pdev->domain == hardware_domain ? pdev->domain : NULL;
>> +    if ( old_d )
>> +    write_lock(_d->vpci_rwlock);
>> +#endif
>> +
>>    rc = pdev_msix_assign(d, pdev);
> I don't think you need the vpci lock for this operation.
>
>>    if ( rc )
>> +    {
>> +#ifdef CONFIG_HAS_VPCI
>> +    if ( old_d )
>> +    write_unlock(_d->vpci_rwlock);
>> +#endif
>>    goto done;
>> +    }
>>
>>    pdev->fault.count = 0;
>>
>>    if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn,
>>      pci_to_dev(pdev), flag)) )
>> +    {
>> +#ifdef CONFIG_HAS_VPCI
>> +    if ( old_d )
>> +    write_unlock(_d->vpci_rwlock);
>> +#endif
> Like I've mentioned above, I'm unsure this is correct. You are holding
> the lock of the previous domain, but at some point the device will be
> assigned to a new domain, so that change won't be protected by the
> lock of the new domain.
>
> Thanks, Roger.


Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-11 Thread Oleksandr Andrushchenko


On 11.02.22 13:40, Roger Pau Monné wrote:
> On Fri, Feb 11, 2022 at 07:27:39AM +0000, Oleksandr Andrushchenko wrote:
>> Hi, Roger!
>>
>> On 10.02.22 18:16, Roger Pau Monné wrote:
>>> On Wed, Feb 09, 2022 at 03:36:27PM +0200, Oleksandr Andrushchenko wrote:
>>>> From: Oleksandr Andrushchenko 
>>>>
>>>> Introduce a per-domain read/write lock to check whether vpci is present,
>>>> so we are sure there are no accesses to the contents of the vpci struct
>>>> if not. This lock can be used (and in a few cases is used right away)
>>>> so that vpci removal can be performed while holding the lock in write
>>>> mode. Previously such removal could race with vpci_read for example.
>>> Sadly there's still a race in the usage of pci_get_pdev_by_domain wrt
>>> pci_remove_device, and likely when vPCI gets also used in
>>> {de}assign_device I think.
>> Yes, this is indeed an issue, but I was not trying to solve it in
>> context of vPCI locking yet. I think we should discuss how do
>> we approach pdev locking, so I can create a patch for that.
>> that being said, I would like not to solve pdev in  this patch yet
>>
>> ...I do understand we do want to avoid that, but at the moment
>> a single reliable way for making sure pdev is alive seems to
>> be pcidevs_lock
> I think we will need to make pcidevs_lock a rwlock and take it in read
> mode for pci_get_pdev_by_domain.
>
> We didn't have this scenario before where PCI emulation is done in the
> hypervisor, and hence the locking around those data structures has not
> been designed for those use-cases.
Yes, I do understand that.
I hope the pcidevs lock conversion to an rwlock can be done as a separate
patch. While this is not done, do you think we can proceed with the
vPCI series, with the pcidevs locking re-work being done after?

>
>>>> 1. Per-domain's vpci_rwlock is used to protect pdev->vpci structure
>>>> from being removed.
>>>>
>>>> 2. Writing the command register and ROM BAR register may trigger
>>>> modify_bars to run, which in turn may access multiple pdevs while
>>>> checking for the existing BAR's overlap. The overlapping check, if done
>>>> under the read lock, requires vpci->lock to be acquired on both devices
>>>> being compared, which may produce a deadlock. It is not possible to
>>>> upgrade read lock to write lock in such a case. So, in order to prevent
>>>> the deadlock, check which registers are going to be written and acquire
>>>> the lock in the appropriate mode from the beginning.
>>>>
>>>> All other code, which doesn't lead to pdev->vpci destruction and does not
>>>> access multiple pdevs at the same time, can still use a combination of the
>>>> read lock and pdev->vpci->lock.
>>>>
>>>> 3. Optimize if ROM BAR write lock required detection by caching offset
>>>> of the ROM BAR register in vpci->header->rom_reg which depends on
>>>> header's type.
>>>>
>>>> 4. Reduce locked region in vpci_remove_device as it is now possible
>>>> to set pdev->vpci to NULL early right after the write lock is acquired.
>>>>
>>>> 5. Reduce locked region in vpci_add_handlers as it is possible to
>>>> initialize many more fields of the struct vpci before assigning it to
>>>> pdev->vpci.
>>>>
>>>> 6. vpci_{add|remove}_register are required to be called with the write lock
>>>> held, but it is not feasible to add an assert there as it requires
>>>> struct domain to be passed for that. So, add a comment about this 
>>>> requirement
>>>> to these and other functions with the equivalent constraints.
>>>>
>>>> 7. Drop const qualifier where the new rwlock is used and this is 
>>>> appropriate.
>>>>
>>>> 8. This is based on the discussion at [1].
>>>>
>>>> [1] 
>>>> https://urldefense.com/v3/__https://lore.kernel.org/all/20220204063459.680961-4-andr2...@gmail.com/__;!!GF_29dbcQIUBPA!gObSySzN7s6zSKrcpSEi6vw18fRPls157cuRoqq4KDd7Ic_Nvh_cFlyVXPRpEWBkI38pgsvvfg$
>>>>  [lore[.]kernel[.]org]
>>>>
>>>> Suggested-by: Roger Pau Monné 
>>>> Suggested-by: Jan Beulich 
>>>> Signed-off-by: Oleksandr Andrushchenko 
>>>>
>>>> ---
>>>> This was checked on x86: with and without PVH Dom0.
>>>> ---
>>>>xen/arch/x86/hvm/vmsi.c   |   2 +
>>>>xen/common/domain.c   |   3 +

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-11 Thread Oleksandr Andrushchenko


On 10.02.22 18:16, Roger Pau Monné wrote:
> On Wed, Feb 09, 2022 at 03:36:27PM +0200, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko 
>>
>> Introduce a per-domain read/write lock to check whether vpci is present,
>> so we are sure there are no accesses to the contents of the vpci struct
>> if not. This lock can be used (and in a few cases is used right away)
>> so that vpci removal can be performed while holding the lock in write
>> mode. Previously such removal could race with vpci_read for example.
> Sadly there's still a race in the usage of pci_get_pdev_by_domain wrt
> pci_remove_device, and likely when vPCI gets also used in
> {de}assign_device I think.
>
How about the below? It seems to guarantee that we can access pdev
without issues and without requiring pcidevs_lock to be used?

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index e8b09d77d880..fd464a58b3b3 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -937,8 +937,14 @@ static int deassign_device(struct domain *d, uint16_t seg, 
uint8_t bus,
  }

  devfn = pdev->devfn;
+#ifdef CONFIG_HAS_VPCI
+    write_lock(&d->vpci_rwlock);
+#endif
  ret = iommu_call(hd->platform_ops, reassign_device, d, target, devfn,
   pci_to_dev(pdev));
+#ifdef CONFIG_HAS_VPCI
+    write_unlock(&d->vpci_rwlock);
+#endif
  if ( ret )
  goto out;

@@ -1474,6 +1480,9 @@ static int assign_device(struct domain *d, u16 seg, u8 
bus, u8 devfn, u32 flag)
  const struct domain_iommu *hd = dom_iommu(d);
  struct pci_dev *pdev;
  int rc = 0;
+#ifdef CONFIG_HAS_VPCI
+    struct domain *old_d;
+#endif

  if ( !is_iommu_enabled(d) )
  return 0;
@@ -1487,15 +1496,34 @@ static int assign_device(struct domain *d, u16 seg, u8 
bus, u8 devfn, u32 flag)
  ASSERT(pdev && (pdev->domain == hardware_domain ||
  pdev->domain == dom_io));

+#ifdef CONFIG_HAS_VPCI
+    /* pdev->domain is either hwdom or dom_io. We do not want the later. */
+    old_d = pdev->domain == hardware_domain ? pdev->domain : NULL;
+    if ( old_d )
+    write_lock(&old_d->vpci_rwlock);
+#endif
+
  rc = pdev_msix_assign(d, pdev);
  if ( rc )
+    {
+#ifdef CONFIG_HAS_VPCI
+    if ( old_d )
+    write_unlock(&old_d->vpci_rwlock);
+#endif
  goto done;
+    }

  pdev->fault.count = 0;

  if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn,
    pci_to_dev(pdev), flag)) )
+    {
+#ifdef CONFIG_HAS_VPCI
+    if ( old_d )
+    write_unlock(&old_d->vpci_rwlock);
+#endif
  goto done;
+    }

  for ( ; pdev->phantom_stride; rc = 0 )
  {

I think we don't care about pci_remove_device because:

int pci_remove_device(u16 seg, u8 bus, u8 devfn)
{
[snip]
     pcidevs_lock();
     list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
     if ( pdev->bus == bus && pdev->devfn == devfn )
     {
     vpci_remove_device(pdev);

as vpci_remove_device will make sure there are no readers left and
will safely remove vpci.
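Roughly, with the per-domain lock the removal path could look like this
(sketch only, the details differ in the actual patch):

void vpci_remove_device(struct pci_dev *pdev)
{
    struct vpci *vpci;

    if ( !has_vpci(pdev->domain) )
        return;

    /* Once the write lock is held no reader can still see pdev->vpci. */
    write_lock(&pdev->domain->vpci_rwlock);
    vpci = pdev->vpci;
    pdev->vpci = NULL;
    write_unlock(&pdev->domain->vpci_rwlock);

    if ( !vpci )
        return;

    /* Tear down the handlers and free the memory outside of the lock. */
    while ( !list_empty(&vpci->handlers) )
    {
        struct vpci_register *r = list_first_entry(&vpci->handlers,
                                                   struct vpci_register,
                                                   node);

        list_del(&r->node);
        xfree(r);
    }
    xfree(vpci->msix);
    xfree(vpci->msi);
    xfree(vpci);
}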

Thank you,
Oleksandr

Re: [PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-10 Thread Oleksandr Andrushchenko
Hi, Roger!

On 10.02.22 18:16, Roger Pau Monné wrote:
> On Wed, Feb 09, 2022 at 03:36:27PM +0200, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko 
>>
>> Introduce a per-domain read/write lock to check whether vpci is present,
>> so we are sure there are no accesses to the contents of the vpci struct
>> if not. This lock can be used (and in a few cases is used right away)
>> so that vpci removal can be performed while holding the lock in write
>> mode. Previously such removal could race with vpci_read for example.
> Sadly there's still a race in the usage of pci_get_pdev_by_domain wrt
> pci_remove_device, and likely when vPCI gets also used in
> {de}assign_device I think.
Yes, this is indeed an issue, but I was not trying to solve it in
the context of vPCI locking yet. I think we should discuss how we
approach pdev locking, so I can create a patch for that.
That being said, I would like not to solve the pdev issue in this patch yet.

...I do understand we do want to avoid that, but at the moment the
single reliable way of making sure a pdev is alive seems to
be pcidevs_lock
>
>> 1. Per-domain's vpci_rwlock is used to protect pdev->vpci structure
>> from being removed.
>>
>> 2. Writing the command register and ROM BAR register may trigger
>> modify_bars to run, which in turn may access multiple pdevs while
>> checking for the existing BAR's overlap. The overlapping check, if done
>> under the read lock, requires vpci->lock to be acquired on both devices
>> being compared, which may produce a deadlock. It is not possible to
>> upgrade read lock to write lock in such a case. So, in order to prevent
>> the deadlock, check which registers are going to be written and acquire
>> the lock in the appropriate mode from the beginning.
>>
>> All other code, which doesn't lead to pdev->vpci destruction and does not
>> access multiple pdevs at the same time, can still use a combination of the
>> read lock and pdev->vpci->lock.
>>
>> 3. Optimize if ROM BAR write lock required detection by caching offset
>> of the ROM BAR register in vpci->header->rom_reg which depends on
>> header's type.
>>
>> 4. Reduce locked region in vpci_remove_device as it is now possible
>> to set pdev->vpci to NULL early right after the write lock is acquired.
>>
>> 5. Reduce locked region in vpci_add_handlers as it is possible to
>> initialize many more fields of the struct vpci before assigning it to
>> pdev->vpci.
>>
>> 6. vpci_{add|remove}_register are required to be called with the write lock
>> held, but it is not feasible to add an assert there as it requires
>> struct domain to be passed for that. So, add a comment about this requirement
>> to these and other functions with the equivalent constraints.
>>
>> 7. Drop const qualifier where the new rwlock is used and this is appropriate.
>>
>> 8. This is based on the discussion at [1].
>>
>> [1] 
>> https://urldefense.com/v3/__https://lore.kernel.org/all/20220204063459.680961-4-andr2...@gmail.com/__;!!GF_29dbcQIUBPA!gObSySzN7s6zSKrcpSEi6vw18fRPls157cuRoqq4KDd7Ic_Nvh_cFlyVXPRpEWBkI38pgsvvfg$
>>  [lore[.]kernel[.]org]
>>
>> Suggested-by: Roger Pau Monné 
>> Suggested-by: Jan Beulich 
>> Signed-off-by: Oleksandr Andrushchenko 
>>
>> ---
>> This was checked on x86: with and without PVH Dom0.
>> ---
>>   xen/arch/x86/hvm/vmsi.c   |   2 +
>>   xen/common/domain.c   |   3 +
>>   xen/drivers/vpci/header.c |   8 +++
>>   xen/drivers/vpci/msi.c|   8 ++-
>>   xen/drivers/vpci/msix.c   |  40 +++--
>>   xen/drivers/vpci/vpci.c   | 114 --
>>   xen/include/xen/sched.h   |   3 +
>>   xen/include/xen/vpci.h|   2 +
>>   8 files changed, 146 insertions(+), 34 deletions(-)
>>
>> diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
>> index 13e2a190b439..351cb968a423 100644
>> --- a/xen/arch/x86/hvm/vmsi.c
>> +++ b/xen/arch/x86/hvm/vmsi.c
>> @@ -893,6 +893,8 @@ int vpci_msix_arch_print(const struct vpci_msix *msix)
>>   {
>>   unsigned int i;
>>   
>> +ASSERT(!!rw_is_locked(>pdev->domain->vpci_rwlock));
>^ no need for the double negation.
Ok, will update all asserts which use !!
>
> Also this asserts that the lock is taken, but could be by a different
> pCPU.  I guess it's better than nothing.
Fair enough. Do you still want the asserts or should I remove them?
>
>> +
>>   for ( i = 0; i < msix->max_entries; i++ )
>>   {
>>   const struct vpci_msix_entry *entry = >entries[i];

Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-10 Thread Oleksandr Andrushchenko


On 10.02.22 15:36, Jan Beulich wrote:
> On 10.02.2022 13:54, Oleksandr Andrushchenko wrote:
>> On 07.02.22 16:31, Jan Beulich wrote:
>>> On 07.02.2022 15:17, Oleksandr Andrushchenko wrote:
>>> But: What's still missing here then is the separation of guest and host
>>> views. When we set INTx behind the guest's back, it shouldn't observe the
>>> bit set. Or is this meant to be another (big) TODO?
>> Why not? This seems to be when a guest tries to both enable MSI/MSI-X
>> and INTx which is a wrong combination. Let's pretend to be a really
>> smart PCI device which partially rejects such PCI_COMMAND write,
>> so guest still sees the register consistent wrt INTx bit. Namely it remains
>> set.
> I'm afraid this wouldn't be "smart", but "buggy". I'm not aware of
> the spec leaving room for such behavior. And our emulation should
> give the guest a spec-compliant view of the device.
This means we need to emulate PCI_COMMAND for guests, i.e. we need to
maintain the guest's view of the register just like we do for the BARs
(header->guest_reg). So, we will need header->guest_cmd to hold that state.
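I.e. something along these lines (only a sketch; guest_cmd is not a final
name, and the set of bits we let through still needs to be worked out per
bit):

static uint32_t cmd_read(const struct pci_dev *pdev, unsigned int reg,
                         void *data)
{
    const struct vpci_header *header = data;

    /* Return the guest's view, not whatever is currently in the hardware. */
    return header->guest_cmd;
}

static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
                      uint32_t cmd, void *data)
{
    struct vpci_header *header = data;

    /* Remember what the guest wrote so cmd_read() can reproduce it. */
    header->guest_cmd = cmd;

    /*
     * Only propagate what we emulate/allow; e.g. keep INTx disabled in
     * hardware while MSI or MSI-X is enabled, regardless of the guest's
     * write.
     */
    if ( (pdev->vpci->msi && pdev->vpci->msi->enabled) ||
         (pdev->vpci->msix && pdev->vpci->msix->enabled) )
        cmd |= PCI_COMMAND_INTX_DISABLE;

    pci_conf_write16(pdev->sbdf, reg, cmd);
}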
>
> Jan
>
>
Thank you,
Oleksandr

Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-10 Thread Oleksandr Andrushchenko


On 07.02.22 16:31, Jan Beulich wrote:
> On 07.02.2022 15:17, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 14:54, Jan Beulich wrote:
>>> On 07.02.2022 13:51, Oleksandr Andrushchenko wrote:
>>>> On 07.02.22 14:38, Jan Beulich wrote:
>>>>> On 07.02.2022 12:27, Oleksandr Andrushchenko wrote:
>>>>>> On 07.02.22 09:29, Jan Beulich wrote:
>>>>>>> On 04.02.2022 15:37, Oleksandr Andrushchenko wrote:
>>>>>>>> On 04.02.22 16:30, Jan Beulich wrote:
>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>>>> Reset the command register when assigning a PCI device to a guest:
>>>>>>>>>> according to the PCI spec the PCI_COMMAND register is typically all 
>>>>>>>>>> 0's
>>>>>>>>>> after reset.
>>>>>>>>> It's not entirely clear to me whether setting the hardware register to
>>>>>>>>> zero is okay. What wants to be zero is the value the guest observes
>>>>>>>>> initially.
>>>>>>>> "the PCI spec says the PCI_COMMAND register is typically all 0's after 
>>>>>>>> reset."
>>>>>>>> Why wouldn't it be ok? What is the exact concern here?
>>>>>>> The concern is - as voiced is similar ways before, perhaps in other
>>>>>>> contexts - that you need to consider bit-by-bit whether overwriting
>>>>>>> with 0 what is currently there is okay. Xen and/or Dom0 may have put
>>>>>>> values there which they expect to remain unaltered. I guess
>>>>>>> PCI_COMMAND_SERR is a good example: While the guest's view of this
>>>>>>> will want to be zero initially, the host having set it to 1 may not
>>>>>>> easily be overwritten with 0, or else you'd effectively imply giving
>>>>>>> the guest control of the bit.
>>>>>> We have already discussed in great detail PCI_COMMAND emulation [1].
>>>>>> At the end you wrote [1]:
>>>>>> "Well, in order for the whole thing to be security supported it needs to
>>>>>> be explained for every bit why it is safe to allow the guest to drive it.
>>>>>> Until you mean vPCI to reach that state, leaving TODO notes in the code
>>>>>> for anything not investigated may indeed be good enough.
>>>>>>
>>>>>> Jan"
>>>>>>
>>>>>> So, this is why I left a TODO in the PCI_COMMAND emulation for now and 
>>>>>> only
>>>>>> care about INTx which is honored with the code in this patch.
>>>>> Right. The issue I see is that the description does not have any
>>>>> mention of this, but instead talks about simply writing zero.
>>>> How do you want that mentioned? Extended commit message or
>>>> just a link to the thread [1]?
>>> What I'd like you to describe is what the change does without
>>> fundamentally implying it'll end up being zero which gets written
>>> to the register. Stating as a conclusion that for the time being
>>> this means writing zero is certainly fine (and likely helpful if
>>> made explicit).
>> Xen and/or Dom0 may have put values in PCI_COMMAND which they expect
>> to remain unaltered. PCI_COMMAND_SERR bit is a good example: while the
>> guest's view of this will want to be zero initially, the host having set
>> it to 1 may not easily be overwritten with 0, or else we'd effectively
>> imply giving the guest control of the bit. Thus, PCI_COMMAND register needs
>> proper emulation in order to honor host's settings.
>>
>> There are examples of emulators [1], [2] which already deal with PCI_COMMAND
>> register emulation, and it seems that at most they care about the INTx
>> bit only (besides I/O/memory enable and bus master, which are write-through).
>> It could be because, in order to properly emulate the PCI_COMMAND register,
>> we need to know about the whole PCI topology, e.g. whether any setting in the
>> device's command register is aligned with the upstream port etc.
>> This makes me think that because of this complexity others just ignore that.
>> Nor do I think this can easily be done in Xen's case.
>>
>> According to the "PCI LOCAL BUS SPECIFICATION, REV. 3.0", section "6.2.2
>> Device Control", the reset state of the command register is
>> typically 0, so reset the command register when assigning a PCI device
>> to a guest to all 0's, and for now only make sure the INTx bit is set
>> according to whether MSI/MSI-X is enabled.
> "... is typically 0, so when assigning a PCI device reset the guest view of
>   the command register to all 0's. For now our emulation only makes sure INTx
>   is set according to host requirements, i.e. depending on MSI/MSI-X enabled
>   state."
I'll put this description into the PCI_COMMAND emulation patch.
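In other words, on assign the guest's view would start from the spec's
reset value while the physical register keeps the host's settings; a
hypothetical fragment (header->guest_cmd as in the earlier message is not a
final name):

    /*
     * Guest view of PCI_COMMAND starts out as all zeroes (the typical reset
     * state per the spec); the physical register is left alone so host-set
     * bits such as PCI_COMMAND_SERR are preserved.
     */
    header->guest_cmd = 0;

    /* Keep INTx disabled in hardware whenever MSI or MSI-X is enabled. */
    if ( (pdev->vpci->msi && pdev->vpci->msi->enabled) ||
         (pdev->vpci->msix && pdev->vpci->msix->enabled) )
        pci_conf_write16(pdev->sbdf, PCI_COMMAND,
                         pci_conf_read16(pdev->sbdf, PCI_COMMAND) |
                         PCI_COMMAND_INTX_DISABLE);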
>
> Maybe? (Obviously a fresh device given to a guest will have MSI/MSI-X
> disabled, so I'm not sure that aspect really needs mentioning.)
>
> But: What's still missing here then is the separation of guest and host
> views. When we set INTx behind the guest's back, it shouldn't observe the
> bit set. Or is this meant to be another (big) TODO?
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-10 Thread Oleksandr Andrushchenko


On 07.02.22 16:31, Jan Beulich wrote:
> On 07.02.2022 15:17, Oleksandr Andrushchenko wrote:
> But: What's still missing here then is the separation of guest and host
> views. When we set INTx behind the guest's back, it shouldn't observe the
> bit set. Or is this meant to be another (big) TODO?
Why not? This seems to be the case when a guest tries to enable both
MSI/MSI-X and INTx, which is a wrong combination. Let's pretend to be a
really smart PCI device which partially rejects such a PCI_COMMAND write,
so the guest still sees the register as consistent wrt the INTx bit,
namely that it remains set.
>
> Jan
>
>


Re: [PATCH v6 05/13] vpci: add hooks for PCI device assign/de-assign

2022-02-10 Thread Oleksandr Andrushchenko


On 10.02.22 11:22, Jan Beulich wrote:
> On 10.02.2022 09:21, Oleksandr Andrushchenko wrote:
>>
>> On 08.02.22 13:25, Oleksandr Andrushchenko wrote:
>>> On 08.02.22 13:00, Jan Beulich wrote:
>>>> On 08.02.2022 11:52, Oleksandr Andrushchenko wrote:
>>>>> This smells like we first need to fix the existing code, so
>>>>> pdev->domain is not assigned by specific IOMMU implementations,
>>>>> but instead controlled by the code which relies on that, assign_device.
>>>> Feel free to come up with proposals how to cleanly do so. Moving the
>>>> assignment to pdev->domain may even be possible now, but if you go
>>>> back you may find that the code was quite different earlier on.
>>> I do understand that as the code evolves new use cases bring
>>> new issues.
>>>>> I can have something like:
>>>>>
>>>>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>>>>> index 88836aab6baf..cc7790709a50 100644
>>>>> --- a/xen/drivers/passthrough/pci.c
>>>>> +++ b/xen/drivers/passthrough/pci.c
>>>>> @@ -1475,6 +1475,7 @@ static int device_assigned(u16 seg, u8 bus, u8 
>>>>> devfn)
>>>>>  static int assign_device(struct domain *d, u16 seg, u8 bus, u8 
>>>>> devfn, u32 flag)
>>>>>  {
>>>>>  const struct domain_iommu *hd = dom_iommu(d);
>>>>> +    struct domain *old_owner;
>>>>>  struct pci_dev *pdev;
>>>>>  int rc = 0;
>>>>>
>>>>> @@ -1490,6 +1491,9 @@ static int assign_device(struct domain *d, u16 seg, 
>>>>> u8 bus, u8 devfn, u32 flag)
>>>>>  ASSERT(pdev && (pdev->domain == hardware_domain ||
>>>>>  pdev->domain == dom_io));
>>>>>
>>>>> +    /* We need to restore the old owner in case of an error. */
>>>>> +    old_owner = pdev->domain;
>>>>> +
>>>>>  vpci_deassign_device(pdev->domain, pdev);
>>>>>
>>>>>  rc = pdev_msix_assign(d, pdev);
>>>>> @@ -1515,8 +1519,12 @@ static int assign_device(struct domain *d, u16 
>>>>> seg, u8 bus, u8 devfn, u32 flag)
>>>>>
>>>>>   done:
>>>>>  if ( rc )
>>>>> +    {
>>>>>  printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
>>>>>     d, &PCI_SBDF3(seg, bus, devfn), rc);
>>>>> +    /* We failed to assign, so restore the previous owner. */
>>>>> +    pdev->domain = old_owner;
>>>>> +    }
>>>>>  /* The device is assigned to dom_io so mark it as quarantined */
>>>>>  else if ( d == dom_io )
>>>>>  pdev->quarantine = true;
>>>>>
>>>>> But I do not think this belongs to this patch
>>>> Indeed. Plus I'm sure you understand that it's not that simple. Assigning
>>>> to pdev->domain is only the last step of assignment. Restoring the original
>>>> owner would entail putting in place the original IOMMU table entries as
>>>> well, which in turn can fail. Hence why you'll find a number of uses of
>>>> domain_crash() in places where rolling back is far from easy.
>>> So, why don't we just rely on the toolstack to do the roll back then?
>>> This way we won't add new domain_crash() calls.
>>> I do understand that we may leave Xen in a wrong state though.
>>> So, do you think it is possible to just call deassign_device from
>>> assign_device on the error path? This is just like I do in 
>>> vpci_assign_device:
>>> I call vpci_deassign_device if the former fails.
>> With the following addition:
>>
>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>> index c4ae22aeefcd..d6c00449193c 100644
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -1511,6 +1511,12 @@ static int assign_device(struct domain *d, u16 seg, 
>> u8 bus, u8 devfn, u32 flag)
>>    }
>>
>>    rc = vpci_assign_device(pdev);
>> +    if ( rc )
>> +    /*
>> + * Ignore the return code as we want to preserve the one from the
>> + * failed assign operation.
>> + */
>> +    deassign_device(d, seg, bus, devfn);
This needs devfn to be p

Re: [PATCH v6 05/13] vpci: add hooks for PCI device assign/de-assign

2022-02-10 Thread Oleksandr Andrushchenko


On 08.02.22 13:25, Oleksandr Andrushchenko wrote:
>
> On 08.02.22 13:00, Jan Beulich wrote:
>> On 08.02.2022 11:52, Oleksandr Andrushchenko wrote:
>>> This smells like we first need to fix the existing code, so
>>> pdev->domain is not assigned by specific IOMMU implementations,
>>> but instead controlled by the code which relies on that, assign_device.
>> Feel free to come up with proposals how to cleanly do so. Moving the
>> assignment to pdev->domain may even be possible now, but if you go
>> back you may find that the code was quite different earlier on.
> I do understand that as the code evolves new use cases bring
> new issues.
>>> I can have something like:
>>>
>>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>>> index 88836aab6baf..cc7790709a50 100644
>>> --- a/xen/drivers/passthrough/pci.c
>>> +++ b/xen/drivers/passthrough/pci.c
>>> @@ -1475,6 +1475,7 @@ static int device_assigned(u16 seg, u8 bus, u8 devfn)
>>> static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, 
>>> u32 flag)
>>> {
>>> const struct domain_iommu *hd = dom_iommu(d);
>>> +    struct domain *old_owner;
>>> struct pci_dev *pdev;
>>> int rc = 0;
>>>
>>> @@ -1490,6 +1491,9 @@ static int assign_device(struct domain *d, u16 seg, 
>>> u8 bus, u8 devfn, u32 flag)
>>> ASSERT(pdev && (pdev->domain == hardware_domain ||
>>> pdev->domain == dom_io));
>>>
>>> +    /* We need to restore the old owner in case of an error. */
>>> +    old_owner = pdev->domain;
>>> +
>>> vpci_deassign_device(pdev->domain, pdev);
>>>
>>> rc = pdev_msix_assign(d, pdev);
>>> @@ -1515,8 +1519,12 @@ static int assign_device(struct domain *d, u16 seg, 
>>> u8 bus, u8 devfn, u32 flag)
>>>
>>>  done:
>>> if ( rc )
>>> +    {
>>> printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
>>>    d, &PCI_SBDF3(seg, bus, devfn), rc);
>>> +    /* We failed to assign, so restore the previous owner. */
>>> +    pdev->domain = old_owner;
>>> +    }
>>> /* The device is assigned to dom_io so mark it as quarantined */
>>> else if ( d == dom_io )
>>> pdev->quarantine = true;
>>>
>>> But I do not think this belongs to this patch
>> Indeed. Plus I'm sure you understand that it's not that simple. Assigning
>> to pdev->domain is only the last step of assignment. Restoring the original
>> owner would entail putting in place the original IOMMU table entries as
>> well, which in turn can fail. Hence why you'll find a number of uses of
>> domain_crash() in places where rolling back is far from easy.
> So, why don't we just rely on the toolstack to do the roll back then?
> This way we won't add new domain_crash() calls.
> I do understand that we may leave Xen in a wrong state though.
> So, do you think it is possible to just call deassign_device from
> assign_device on the error path? This is just like I do in vpci_assign_device:
> I call vpci_deassign_device if the former fails.
With the following addition:

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index c4ae22aeefcd..d6c00449193c 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1511,6 +1511,12 @@ static int assign_device(struct domain *d, u16 seg, u8 
bus, u8 devfn, u32 flag)
  }

  rc = vpci_assign_device(pdev);
+    if ( rc )
+    /*
+ * Ignore the return code as we want to preserve the one from the
+ * failed assign operation.
+ */
+    deassign_device(d, seg, bus, devfn);

   done:
  if ( rc )

I see the following logs (PV Dom0):

(XEN) assign_device seg 0 bus 3 devfn 0
(XEN) [VT-D]d[IO]:PCIe: unmap :03:00.0
(XEN) [VT-D]d4:PCIe: map :03:00.0
(XEN) assign_device vpci_assign rc -22 from d[IO] to d4
(XEN) deassign_device current d4 to d[IO]
(XEN) [VT-D]d4:PCIe: unmap :03:00.0
(XEN) [VT-D]d[IO]:PCIe: map :03:00.0
(XEN) deassign_device ret 0
(XEN) d4: assign (:03:00.0) failed (-22)
libxl: error: libxl_pci.c:1498:pci_add_dm_done: Domain 4:xc_assign_device 
failed: Invalid argument
libxl: error: libxl_pci.c:1781:device_pci_add_done: Domain 
4:libxl__device_pci_add failed for PCI device 0:3:0.0 (rc -3)
libxl: error: libxl_create.c:1895:domcreate_attach_devices: Domain 4:unable to 
add pci devices
libxl: error: libxl_domain.c:1183:libxl__destroy_domid: Domain 4:Non-existant 
domain
libxl: error: libxl_domain.c:1137:domain_destroy_callback: Domain 4:Unable to 
destroy guest
libxl: error: libxl_domain.c:1064:domain_destroy_cb: Domain 4:Destruction of 
domain failed

So, it seems to properly solve the issue with pdev->domain left
set to the domain we couldn't create.

@Jan, will this address your concern?

Thank you,
Oleksandr

Re: [RFC v2 5/8] xen/arm: introduce SCMI-SMC mediator driver

2022-02-09 Thread Oleksandr Andrushchenko
Hi, Oleksii!

On 08.02.22 20:00, Oleksii Moisieiev wrote:
> This is the implementation of SCI interface, called SCMI-SMC driver,
> which works as the mediator between XEN Domains and Firmware (SCP, ATF etc).
> This allows devices from the Domains to work with clocks, resets and
> power-domains without access to CPG.
>
> Originally, cpg should be passed to the domain so it can work with
> power-domains/clocks/resets etc. Considering that cpg can't be split between
> the Domains, we get the limitation that the devices, which are using
> power-domains/clocks/resets etc, couldn't be split between the domains.
> The solution is to move the power-domain/clock/resets etc to the
> Firmware (such as SCP firmware or ATF) and provide an interface for the
> Domains. XEN should have an entity, called SCI-Mediator, which is
> responsible for message redirection between Domains and Firmware and
> for permission handling.
>
> The following features are implemented:
> - request SCMI channels from ATF and pass channels to Domains;
> - set device permissions for Domains based on the Domain partial
> device-tree. Devices with permissions are able to work with clocks,
> resets and power-domains via SCMI;
> - redirect scmi messages from Domains to ATF.
>
> Signed-off-by: Oleksii Moisieiev 
> ---
>   xen/arch/arm/Kconfig|   2 +
>   xen/arch/arm/sci/Kconfig|  10 +
>   xen/arch/arm/sci/scmi_smc.c | 959 
>   3 files changed, 971 insertions(+)
>   create mode 100644 xen/arch/arm/sci/Kconfig
>   create mode 100644 xen/arch/arm/sci/scmi_smc.c
>
> diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
> index ab07833582..3b0dfc57b6 100644
> --- a/xen/arch/arm/Kconfig
> +++ b/xen/arch/arm/Kconfig
> @@ -123,6 +123,8 @@ config ARM_SCI
> support. It allows guests to control system resourcess via one of
> ARM_SCI mediators implemented in XEN.
>   
> + source "arch/arm/sci/Kconfig"
> +
>   endmenu
>   
>   menu "ARM errata workaround via the alternative framework"
> diff --git a/xen/arch/arm/sci/Kconfig b/xen/arch/arm/sci/Kconfig
> new file mode 100644
> index 00..10b634d2ed
> --- /dev/null
> +++ b/xen/arch/arm/sci/Kconfig
> @@ -0,0 +1,10 @@
> +config SCMI_SMC
> + bool "Enable SCMI-SMC mediator driver"
> + default n
> + depends on ARM_SCI && HOST_DTB_EXPORT
> + ---help---
> +
> + Enables mediator in XEN to pass SCMI requests from Domains to ATF.
> + This feature allows drivers from Domains to work with System
> + Controllers (such as power,resets,clock etc.). SCP is used as transport
> + for communication.
> diff --git a/xen/arch/arm/sci/scmi_smc.c b/xen/arch/arm/sci/scmi_smc.c
> new file mode 100644
> index 00..103529dfab
> --- /dev/null
> +++ b/xen/arch/arm/sci/scmi_smc.c
> @@ -0,0 +1,959 @@
> +/*
> + * xen/arch/arm/sci/scmi_smc.c
> + *
> + * SCMI mediator driver, using SCP as transport.
> + *
> + * Oleksii Moisieiev 
> + * Copyright (C) 2021, EPAM Systems.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define SCMI_BASE_PROTOCOL  0x10
> +#define SCMI_BASE_PROTOCOL_ATTIBUTES0x1
> +#define SCMI_BASE_SET_DEVICE_PERMISSIONS0x9
> +#define SCMI_BASE_RESET_AGENT_CONFIGURATION 0xB
> +#define SCMI_BASE_DISCOVER_AGENT0x7
Can the above be sorted?
> +
> +/* SCMI return codes. See section 4.1.4 of SCMI spec (DEN0056C) */
> +#define SCMI_SUCCESS  0
> +#define SCMI_NOT_SUPPORTED  (-1)
> +#define SCMI_INVALID_PARAMETERS (-2)
> +#define SCMI_DENIED (-3)
> +#define SCMI_NOT_FOUND  (-4)
> +#define SCMI_OUT_OF_RANGE   (-5)
> +#define SCMI_BUSY   (-6)
> +#define SCMI_COMMS_ERROR(-7)
> +#define SCMI_GENERIC_ERROR  (-8)
> +#define SCMI_HARDWARE_ERROR (-9)
> +#define SCMI_PROTOCOL_ERROR (-10)
> +
> +#define DT_MATCH_SCMI_SMC DT_MATCH_COMPATIBLE("arm,scmi-smc")
> +
> +#define SCMI_SMC_ID"arm,smc-id"
> +#define SCMI_SHARED_MEMORY "arm,scmi-shmem"
> +#define SCMI_SHMEM "shmem"
> +#define SCMI_SHMEM_MAPPED_SIZE PAGE_SIZE
> +
> +#define HYP_CHANNEL  0x0
Alignment
> +
> +#define HDR_ID GENMASK(7,0)
> +#define HDR_TYPE   
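
For readers following the HDR_ID/HDR_TYPE GENMASK definitions that the excerpt
above cuts off, a small self-contained sketch of how an SCMI message header is
typically packed per DEN0056. The bit positions come from the spec; the macro
and function names here are illustrative and not the mediator code under review.

#include <stdint.h>

/* SCMI message header layout (DEN0056): message id [7:0],
 * message type [9:8], protocol id [17:10], token [27:18]. */
#define SCMI_HDR_ID(x)     ((uint32_t)(x) & 0xffU)
#define SCMI_HDR_TYPE(x)   (((uint32_t)(x) & 0x3U) << 8)
#define SCMI_HDR_PROTO(x)  (((uint32_t)(x) & 0xffU) << 10)
#define SCMI_HDR_TOKEN(x)  (((uint32_t)(x) & 0x3ffU) << 18)

static inline uint32_t scmi_pack_hdr(uint8_t msg_id, uint8_t type,
                                     uint8_t proto_id, uint16_t token)
{
    return SCMI_HDR_ID(msg_id) | SCMI_HDR_TYPE(type) |
           SCMI_HDR_PROTO(proto_id) | SCMI_HDR_TOKEN(token);
}

/* Example: BASE_DISCOVER_AGENT (0x7) on the base protocol (0x10). */
static inline uint32_t scmi_discover_agent_hdr(void)
{
    return scmi_pack_hdr(0x7, 0, 0x10, 0);
}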

Re: [RFC v2 2/8] libs: libxenhypfs - handle blob properties

2022-02-09 Thread Oleksandr Andrushchenko


On 09.02.22 16:01, Jan Beulich wrote:
> On 09.02.2022 14:47, Oleksandr Andrushchenko wrote:
>> Hi, Oleksii!
>>
>> On 08.02.22 20:00, Oleksii Moisieiev wrote:
>>> libxenhypfs will return blob properties as is. This output can be used
>>> to retrieve information from the hypfs. Caller is responsible for
>>> parsing property value.
>>>
>>> Signed-off-by: Oleksii Moisieiev 
>>> ---
>>>tools/libs/hypfs/core.c | 2 --
>>>1 file changed, 2 deletions(-)
>>>
>>> diff --git a/tools/libs/hypfs/core.c b/tools/libs/hypfs/core.c
>>> index 52b30db8d7..d09bba7d8c 100644
>>> --- a/tools/libs/hypfs/core.c
>>> +++ b/tools/libs/hypfs/core.c
>>> @@ -307,8 +307,6 @@ char *xenhypfs_read(xenhypfs_handle *fshdl, const char 
>>> *path)
>>>errno = EISDIR;
>>>break;
>>>case xenhypfs_type_blob:
>>> -errno = EDOM;
>>> -break;
>> This will need a /* fallthrough */ I guess
> Why? There's no statement left before the next case label.
You are right, no need
Sorry
>
> Jan
>
>>>case xenhypfs_type_string:
>>>ret_buf = buf;
>>>buf = NULL;
>


Re: [RFC v2 2/8] libs: libxenhypfs - handle blob properties

2022-02-09 Thread Oleksandr Andrushchenko
Hi, Oleksii!

On 08.02.22 20:00, Oleksii Moisieiev wrote:
> libxenhypfs will return blob properties as is. This output can be used
> to retrieve information from the hypfs. Caller is responsible for
> parsing property value.
>
> Signed-off-by: Oleksii Moisieiev 
> ---
>   tools/libs/hypfs/core.c | 2 --
>   1 file changed, 2 deletions(-)
>
> diff --git a/tools/libs/hypfs/core.c b/tools/libs/hypfs/core.c
> index 52b30db8d7..d09bba7d8c 100644
> --- a/tools/libs/hypfs/core.c
> +++ b/tools/libs/hypfs/core.c
> @@ -307,8 +307,6 @@ char *xenhypfs_read(xenhypfs_handle *fshdl, const char 
> *path)
>   errno = EISDIR;
>   break;
>   case xenhypfs_type_blob:
> -errno = EDOM;
> -break;
This will need a /* fallthrough */ I guess
>   case xenhypfs_type_string:
>   ret_buf = buf;
>   buf = NULL;
Thank you,
Oleksandr

[PATCH] vpci: introduce per-domain lock to protect vpci structure

2022-02-09 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

Introduce a per-domain read/write lock to check whether vpci is present,
so we are sure there are no accesses to the contents of the vpci struct
if not. This lock can be used (and in a few cases is used right away)
so that vpci removal can be performed while holding the lock in write
mode. Previously such removal could race with vpci_read for example.

1. Per-domain's vpci_rwlock is used to protect pdev->vpci structure
from being removed.

2. Writing the command register and ROM BAR register may trigger
modify_bars to run, which in turn may access multiple pdevs while
checking for the existing BAR's overlap. The overlapping check, if done
under the read lock, requires vpci->lock to be acquired on both devices
being compared, which may produce a deadlock. It is not possible to
upgrade read lock to write lock in such a case. So, in order to prevent
the deadlock, check which registers are going to be written and acquire
the lock in the appropriate mode from the beginning.

All other code, which doesn't lead to pdev->vpci destruction and does not
access multiple pdevs at the same time, can still use a combination of the
read lock and pdev->vpci->lock.

3. Optimize the detection of whether a ROM BAR write requires the write lock
by caching the offset of the ROM BAR register in vpci->header->rom_reg, which
depends on the header's type.

4. Reduce locked region in vpci_remove_device as it is now possible
to set pdev->vpci to NULL early right after the write lock is acquired.

5. Reduce locked region in vpci_add_handlers as it is possible to
initialize many more fields of the struct vpci before assigning it to
pdev->vpci.

6. vpci_{add|remove}_register are required to be called with the write lock
held, but it is not feasible to add an assert there as it requires
struct domain to be passed for that. So, add a comment about this requirement
to these and other functions with the equivalent constraints.

7. Drop const qualifier where the new rwlock is used and this is appropriate.

8. This is based on the discussion at [1].

[1] https://lore.kernel.org/all/20220204063459.680961-4-andr2...@gmail.com/

Suggested-by: Roger Pau Monné 
Suggested-by: Jan Beulich 
Signed-off-by: Oleksandr Andrushchenko 

---
This was checked on x86: with and without PVH Dom0.
---
 xen/arch/x86/hvm/vmsi.c   |   2 +
 xen/common/domain.c   |   3 +
 xen/drivers/vpci/header.c |   8 +++
 xen/drivers/vpci/msi.c|   8 ++-
 xen/drivers/vpci/msix.c   |  40 +++--
 xen/drivers/vpci/vpci.c   | 114 --
 xen/include/xen/sched.h   |   3 +
 xen/include/xen/vpci.h|   2 +
 8 files changed, 146 insertions(+), 34 deletions(-)

diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 13e2a190b439..351cb968a423 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -893,6 +893,8 @@ int vpci_msix_arch_print(const struct vpci_msix *msix)
 {
 unsigned int i;
 
+ASSERT(!!rw_is_locked(&msix->pdev->domain->vpci_rwlock));
+
 for ( i = 0; i < msix->max_entries; i++ )
 {
 const struct vpci_msix_entry *entry = &msix->entries[i];
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 2048ebad86ff..10558c22285d 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -616,6 +616,9 @@ struct domain *domain_create(domid_t domid,
 
 #ifdef CONFIG_HAS_PCI
 INIT_LIST_HEAD(&d->pdev_list);
+#ifdef CONFIG_HAS_VPCI
+rwlock_init(&d->vpci_rwlock);
+#endif
 #endif
 
 /* All error paths can depend on the above setup. */
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 40ff79c33f8f..9e2aeb2055c9 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -142,12 +142,14 @@ bool vpci_process_pending(struct vcpu *v)
 if ( rc == -ERESTART )
 return true;
 
+read_lock(&v->domain->vpci_rwlock);
 spin_lock(&v->vpci.pdev->vpci->lock);
 /* Disable memory decoding unconditionally on failure. */
 modify_decoding(v->vpci.pdev,
 rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
 !rc && v->vpci.rom_only);
 spin_unlock(&v->vpci.pdev->vpci->lock);
+read_unlock(&v->domain->vpci_rwlock);
 
 rangeset_destroy(v->vpci.mem);
 v->vpci.mem = NULL;
@@ -203,6 +205,7 @@ static void defer_map(struct domain *d, struct pci_dev 
*pdev,
 raise_softirq(SCHEDULE_SOFTIRQ);
 }
 
+/* This must hold domain's vpci_rwlock in write mode. */
 static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
 {
 struct vpci_header *header = &pdev->vpci->header;
@@ -454,6 +457,8 @@ static int init_bars(struct pci_dev *pdev)
 struct vpci_bar *bars = header->bars;
 int rc;
 
+ASSERT(!!rw_is_write_locked(&pdev->domain->vpci_rwlock));
+
 switch ( pci_conf_read8(pdev->sbdf, PCI_HEADER_TYPE) & 0x7f )
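
As a reader's aid, a condensed sketch of the locking pattern that points 1-3 of
the description above establish. The helper and field names (needs_write_lock,
header.rom_reg) are illustrative; the actual patch derives the decision inside
vpci_write() itself, and the handler bodies are elided here.

static bool needs_write_lock(const struct pci_dev *pdev, unsigned int reg)
{
    /* PCI_COMMAND and the (header-type dependent, cached) ROM BAR may
     * trigger modify_bars(), which walks other pdevs of the domain. */
    return reg == PCI_COMMAND || reg == pdev->vpci->header.rom_reg;
}

static void vpci_write_sketch(struct domain *d, struct pci_dev *pdev,
                              unsigned int reg, uint32_t data)
{
    if ( needs_write_lock(pdev, reg) )
    {
        write_lock(&d->vpci_rwlock);
        /* Exclusive access: safe to traverse all of d's pdevs. */
        write_unlock(&d->vpci_rwlock);
    }
    else
    {
        read_lock(&d->vpci_rwlock);
        spin_lock(&pdev->vpci->lock);
        /* Per-device register handler runs here. */
        spin_unlock(&pdev->vpci->lock);
        read_unlock(&d->vpci_rwlock);
    }
}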
 {

Re: [PATCH v6 09/13] vpci/header: emulate PCI_COMMAND register for guests

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 16:09, Roger Pau Monné wrote:
> On Tue, Feb 08, 2022 at 11:29:07AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 08.02.22 13:11, Roger Pau Monné wrote:
>>> On Tue, Feb 08, 2022 at 09:58:40AM +, Oleksandr Andrushchenko wrote:
>>>> On 08.02.22 11:52, Jan Beulich wrote:
>>>>> On 08.02.2022 10:38, Oleksandr Andrushchenko wrote:
>>>>>> On 08.02.22 11:33, Jan Beulich wrote:
>>>>>>> On 08.02.2022 09:13, Oleksandr Andrushchenko wrote:
>>>>>>>> On 04.02.22 16:25, Jan Beulich wrote:
>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>>>>>> @@ -454,6 +454,22 @@ static void cmd_write(const struct pci_dev 
>>>>>>>>>> *pdev, unsigned int reg,
>>>>>>>>>>   pci_conf_write16(pdev->sbdf, reg, cmd);
>>>>>>>>>>   }
>>>>>>>>>>   
>>>>>>>>>> +static void guest_cmd_write(const struct pci_dev *pdev, unsigned 
>>>>>>>>>> int reg,
>>>>>>>>>> +uint32_t cmd, void *data)
>>>>>>>>>> +{
>>>>>>>>>> +/* TODO: Add proper emulation for all bits of the command 
>>>>>>>>>> register. */
>>>>>>>>>> +
>>>>>>>>>> +#ifdef CONFIG_HAS_PCI_MSI
>>>>>>>>>> +if ( pdev->vpci->msi->enabled || pdev->vpci->msix->enabled )
>>>>>>>>>> +{
>>>>>>>>>> +/* Guest wants to enable INTx. It can't be enabled if 
>>>>>>>>>> MSI/MSI-X enabled. */
>>>>>>>>>> +cmd |= PCI_COMMAND_INTX_DISABLE;
>>>>>>>>>> +}
>>>>>>>>>> +#endif
>>>>>>>>>> +
>>>>>>>>>> +cmd_write(pdev, reg, cmd, data);
>>>>>>>>>> +}
>>>>>>>>> It's not really clear to me whether the TODO warrants this being a
>>>>>>>>> separate function. Personally I'd find it preferable if the logic
>>>>>>>>> was folded into cmd_write().
>>>>>>>> Not sure cmd_write needs to have guest's logic. And what's the
>>>>>>>> profit? Later on, when we decide how PCI_COMMAND can be emulated
>>>>>>>> this code will live in guest_cmd_write anyways
>>>>>>> Why "will"? There's nothing conceptually wrong with putting all the
>>>>>>> emulation logic into cmd_write(), inside an if(!hwdom) conditional.
>>>>>>> If and when we gain CET-IBT support on the x86 side (and I'm told
>>>>>>> there's an Arm equivalent of this), then to make this as useful as
>>>>>>> possible it is going to be desirable to limit the number of functions
>>>>>>> called through function pointers. You may have seen Andrew's huge
>>>>>>> "x86: Support for CET Indirect Branch Tracking" series. We want to
>>>>>>> keep down the number of such annotations; the vast part of the series
>>>>>>> is about adding of such.
>>>>>> Well, while I see nothing bad with that, from the code organization
>>>>>> it would look a bit strange: we don't differentiate hwdom in vpci
>>>>>> handlers, but instead provide one for hwdom and one for guests.
>>>>>> While I understand your concern I still think that at the moment
>>>>>> it will be more in line with the existing code if we provide a dedicated
>>>>>> handler.
>>>>> The existing code only deals with Dom0, and hence doesn't have any
>>>>> pairs of handlers.
>>>> This is fair
>>>>> FTAOD what I said above applies equally to other
>>>>> separate guest read/write handlers you may be introducing. The
>>>>> exception being when e.g. a hardware access handler is put in place
>>>>> for Dom0 (for obvious reasons, I think).
>>>> @Roger, what's your preference here?
>>> The newly introduced handler ends up calling the existing one,
>> But before doing so it implements guest specific logic which will be
>> extended as we add more bits of emulation
>>>so in
>>> this case it might make sense to expand cmd_write to also cater for
>>> the domU case?
>> So, from the above I thought is was ok to have a dedicated handler
> Given the current proposal where you are only dealing with INTx I don't
> think it makes much sense to have a separate handler because you end
> up calling cmd_write anyway, so what's added there could very well be
> added at the top of cmd_write.
Good
>
>>> I think we need to be sensible here in that we don't want to end up
>>> with handlers like:
>>>
>>> register_read(...)
>>> {
>>>  if ( is_hardware_domain() )
>>>  
>>>  else
>>>  ...
>>> }
>>>
>>> If there's shared code it's IMO better to not create as guest specific
>>> handler.
>>>
>>> It's also more risky to use the same handlers for dom0 and domU, as a
>>> change intended to dom0 only might end up leaking in the domU path and
>>> that could easily become a security issue.
>> So, just for your justification: BARs. Is this something we also want
>> to be kept separate or we want if (is_hwdom)?
>> I guess the former.
> I think BAR access handling is sufficiently different between dom0 and
> domU that we want separate handlers.
Makes sense
> Thanks, Roger.
Thank you,
Oleksandr
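
A minimal sketch of the folded-in variant agreed on above, i.e. the
guest-specific INTx forcing living at the top of cmd_write() behind an
is_hardware_domain() check rather than in a separate handler. Illustrative
only; the existing Dom0 logic of cmd_write() is elided.

static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
                      uint32_t cmd, void *data)
{
#ifdef CONFIG_HAS_PCI_MSI
    if ( !is_hardware_domain(pdev->domain) &&
         (pdev->vpci->msi->enabled || pdev->vpci->msix->enabled) )
        /* A guest may not enable INTx while MSI/MSI-X is enabled. */
        cmd |= PCI_COMMAND_INTX_DISABLE;
#endif

    /* ... existing Dom0 logic (memory decoding handling) elided ... */
    pci_conf_write16(pdev->sbdf, reg, cmd);
}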

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 15:38, Roger Pau Monné wrote:
> On Tue, Feb 08, 2022 at 11:13:41AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 08.02.22 12:50, Roger Pau Monné wrote:
>>> On Tue, Feb 08, 2022 at 07:35:34AM +, Oleksandr Andrushchenko wrote:
>>>> 5. You name it
>>>> ==
>>>>
>>>>From all the above I would recommend we go with option 2 which seems to 
>>>> reliably
>>>> solve ABBA and does not bring cons of the other approaches.
>>> 6. per-domain rwlock + per-device vpci lock
>>>
>>> Introduce vpci_header_write_lock(start, {end, size}) helper: return
>>> whether a range requires the per-domain lock in write mode. This will
>>> only return true if the range overlaps with the BAR ROM or the command
>>> register.
>>>
>>> In vpci_{read,write}:
>>>
>>> if ( vpci_header_write_lock(...) )
>>>   /* Gain exclusive access to all of the domain pdevs vpci. */
>>>   write_lock(d->vpci);
>>> else
>>> {
>>>   read_lock(d->vpci);
>>>   spin_lock(vpci->lock);
>>> }
>>> ...
>>>
>>> The vpci assign/deassign functions would need to be modified to write
>>> lock the per-domain rwlock. The MSI-X table MMIO handler will also
>>> need to read lock the per domain vpci lock.
>> Ok, so it seems you are in favor of this implementation and I have
>> no objection as well. The only limitation we should be aware of is
>> that once a path has acquired the read lock it is not possible to do
>> any write path operations in there.
>> vpci_process_pending will acquire write lock though as it can
>> lead to vpci_remove_device on its error path.
>>
>> So, I am going to implement pdev->vpci->lock + d->vpci_lock
> I think it's the less uncertain option.
>
> As said, if you want to investigate whether you can successfully move
> the checking into vpci_process_pending that would also be fine with
> me, but I cannot assert it's going to be successful. OTOH I think the
> per-domain rwlock + per-device spinlock seems quite likely to solve
> our issues.
Ok, then I'll go with per-domain rwlock + per-device spinlock
and write lock in vpci_write for cmd + ROM. Of course other
places such as vpci_remove_device and vpci_process_pending
will use write lock
>
> Thanks, Roger.
>
Thank you,
Oleksandr
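
A rough sketch of the vpci_header_write_lock()-style helper suggested earlier
in this thread: report whether an access overlaps the command register or the
ROM BAR and therefore needs the per-domain lock in write mode. The names and
the rom_reg parameter are illustrative; sub-word accesses are covered by an
overlap check rather than by exact equality.

static bool vpci_header_write_lock(unsigned int start, unsigned int size,
                                   unsigned int rom_reg)
{
    unsigned int end = start + size;

    /* PCI_COMMAND is 2 bytes wide, a ROM BAR is 4 bytes wide. */
    if ( start < PCI_COMMAND + 2 && end > PCI_COMMAND )
        return true;

    if ( start < rom_reg + 4 && end > rom_reg )
        return true;

    return false;
}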

Re: [PATCH v6 09/13] vpci/header: emulate PCI_COMMAND register for guests

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 13:11, Roger Pau Monné wrote:
> On Tue, Feb 08, 2022 at 09:58:40AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 08.02.22 11:52, Jan Beulich wrote:
>>> On 08.02.2022 10:38, Oleksandr Andrushchenko wrote:
>>>> On 08.02.22 11:33, Jan Beulich wrote:
>>>>> On 08.02.2022 09:13, Oleksandr Andrushchenko wrote:
>>>>>> On 04.02.22 16:25, Jan Beulich wrote:
>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>>>> @@ -454,6 +454,22 @@ static void cmd_write(const struct pci_dev *pdev, 
>>>>>>>> unsigned int reg,
>>>>>>>>  pci_conf_write16(pdev->sbdf, reg, cmd);
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> +static void guest_cmd_write(const struct pci_dev *pdev, unsigned int 
>>>>>>>> reg,
>>>>>>>> +uint32_t cmd, void *data)
>>>>>>>> +{
>>>>>>>> +/* TODO: Add proper emulation for all bits of the command 
>>>>>>>> register. */
>>>>>>>> +
>>>>>>>> +#ifdef CONFIG_HAS_PCI_MSI
>>>>>>>> +if ( pdev->vpci->msi->enabled || pdev->vpci->msix->enabled )
>>>>>>>> +{
>>>>>>>> +/* Guest wants to enable INTx. It can't be enabled if 
>>>>>>>> MSI/MSI-X enabled. */
>>>>>>>> +cmd |= PCI_COMMAND_INTX_DISABLE;
>>>>>>>> +}
>>>>>>>> +#endif
>>>>>>>> +
>>>>>>>> +cmd_write(pdev, reg, cmd, data);
>>>>>>>> +}
>>>>>>> It's not really clear to me whether the TODO warrants this being a
>>>>>>> separate function. Personally I'd find it preferable if the logic
>>>>>>> was folded into cmd_write().
>>>>>> Not sure cmd_write needs to have guest's logic. And what's the
>>>>>> profit? Later on, when we decide how PCI_COMMAND can be emulated
>>>>>> this code will live in guest_cmd_write anyways
>>>>> Why "will"? There's nothing conceptually wrong with putting all the
>>>>> emulation logic into cmd_write(), inside an if(!hwdom) conditional.
>>>>> If and when we gain CET-IBT support on the x86 side (and I'm told
>>>>> there's an Arm equivalent of this), then to make this as useful as
>>>>> possible it is going to be desirable to limit the number of functions
>>>>> called through function pointers. You may have seen Andrew's huge
>>>>> "x86: Support for CET Indirect Branch Tracking" series. We want to
>>>>> keep down the number of such annotations; the vast part of the series
>>>>> is about adding of such.
>>>> Well, while I see nothing bad with that, from the code organization
>>>> it would look a bit strange: we don't differentiate hwdom in vpci
>>>> handlers, but instead provide one for hwdom and one for guests.
>>>> While I understand your concern I still think that at the moment
>>>> it will be more in line with the existing code if we provide a dedicated
>>>> handler.
>>> The existing code only deals with Dom0, and hence doesn't have any
>>> pairs of handlers.
>> This is fair
>>>FTAOD what I said above applies equally to other
>>> separate guest read/write handlers you may be introducing. The
>>> exception being when e.g. a hardware access handler is put in place
>>> for Dom0 (for obvious reasons, I think).
>> @Roger, what's your preference here?
> The newly introduced handler ends up calling the existing one,
But before doing so it implements guest specific logic which will be
extended as we add more bits of emulation
>   so in
> this case it might make sense to expand cmd_write to also cater for
> the domU case?
So, from the above I thought is was ok to have a dedicated handler
>
> I think we need to be sensible here in that we don't want to end up
> with handlers like:
>
> register_read(...)
> {
> if ( is_hardware_domain() )
> 
> else
> ...
> }
>
> If there's shared code it's IMO better to not create as guest specific
> handler.
>
> It's also more risky to use the same handlers for dom0 and domU, as a
> change intended to dom0 only might end up leaking in the domU path and
> that could easily become a security issue.
So, just for your justification: BARs. Is this something we also want
to be kept separate or we want if (is_hwdom)?
I guess the former.
>
> Thanks, Roger.
Thank you,
Oleksandr

Re: [PATCH v6 05/13] vpci: add hooks for PCI device assign/de-assign

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 13:00, Jan Beulich wrote:
> On 08.02.2022 11:52, Oleksandr Andrushchenko wrote:
>> This smells like we first need to fix the existing code, so
>> pdev->domain is not assigned by specific IOMMU implementations,
>> but instead controlled by the code which relies on that, assign_device.
> Feel free to come up with proposals how to cleanly do so. Moving the
> assignment to pdev->domain may even be possible now, but if you go
> back you may find that the code was quite different earlier on.
I do understand that as the code evolves new use cases bring
new issues.
>
>> I can have something like:
>>
>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>> index 88836aab6baf..cc7790709a50 100644
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -1475,6 +1475,7 @@ static int device_assigned(u16 seg, u8 bus, u8 devfn)
>>    static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 
>> flag)
>>    {
>>    const struct domain_iommu *hd = dom_iommu(d);
>> +    struct domain *old_owner;
>>    struct pci_dev *pdev;
>>    int rc = 0;
>>
>> @@ -1490,6 +1491,9 @@ static int assign_device(struct domain *d, u16 seg, u8 
>> bus, u8 devfn, u32 flag)
>>    ASSERT(pdev && (pdev->domain == hardware_domain ||
>>    pdev->domain == dom_io));
>>
>> +    /* We need to restore the old owner in case of an error. */
>> +    old_owner = pdev->domain;
>> +
>>    vpci_deassign_device(pdev->domain, pdev);
>>
>>    rc = pdev_msix_assign(d, pdev);
>> @@ -1515,8 +1519,12 @@ static int assign_device(struct domain *d, u16 seg, 
>> u8 bus, u8 devfn, u32 flag)
>>
>>     done:
>>    if ( rc )
>> +    {
>>    printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
>>       d, &PCI_SBDF3(seg, bus, devfn), rc);
>> +    /* We failed to assign, so restore the previous owner. */
>> +    pdev->domain = old_owner;
>> +    }
>>    /* The device is assigned to dom_io so mark it as quarantined */
>>    else if ( d == dom_io )
>>    pdev->quarantine = true;
>>
>> But I do not think this belongs to this patch
> Indeed. Plus I'm sure you understand that it's not that simple. Assigning
> to pdev->domain is only the last step of assignment. Restoring the original
> owner would entail putting in place the original IOMMU table entries as
> well, which in turn can fail. Hence why you'll find a number of uses of
> domain_crash() in places where rolling back is far from easy.
So, why don't we just rely on the toolstack to do the roll back then?
This way we won't add new domain_crash() calls.
I do understand that we may leave Xen in a wrong state though.
So, do you think it is possible to just call deassign_device from
assign_device on the error path? This is just like I do in vpci_assign_device:
I call vpci_deassign_device if the former fails.
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 12:50, Roger Pau Monné wrote:
> On Tue, Feb 08, 2022 at 07:35:34AM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 18:44, Oleksandr Andrushchenko wrote:
>>> On 07.02.22 18:37, Jan Beulich wrote:
>>>> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote:
>>>>> On 07.02.22 18:15, Jan Beulich wrote:
>>>>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote:
>>>>>>> On 07.02.22 17:26, Jan Beulich wrote:
>>>>>>>> 1b. Make vpci_write use write lock for writes to command register and 
>>>>>>>> BARs
>>>>>>>> only; keep using the read lock for all other writes.
>>>>>>> I am not quite sure how to do that. Do you mean something like:
>>>>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>>>>>>      uint32_t data)
>>>>>>> [snip]
>>>>>>>      list_for_each_entry ( r, &pdev->vpci->handlers, node )
>>>>>>> {
>>>>>>> [snip]
>>>>>>>      if ( r->needs_write_lock)
>>>>>>>          write_lock(d->vpci_lock)
>>>>>>>      else
>>>>>>>          read_lock(d->vpci_lock)
>>>>>>> 
>>>>>>>
>>>>>>> And provide rw as an argument to:
>>>>>>>
>>>>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
>>>>>>>    vpci_write_t *write_handler, unsigned int 
>>>>>>> offset,
>>>>>>>    unsigned int size, void *data, --->>> bool 
>>>>>>> write_path <<<-)
>>>>>>>
>>>>>>> Is this what you mean?
>>>>>> This sounds overly complicated. You can derive locally in vpci_write(),
>>>>>> from just its "reg" and "size" parameters, whether the lock needs taking
>>>>>> in write mode.
>>>>> Yes, I started writing a reply with that. So, the summary (ROM
>>>>> position depends on header type):
>>>>> if ( (reg == PCI_COMMAND) || (reg == ROM) )
>>>>> {
>>>>>     read PCI_COMMAND and see if memory or IO decoding are enabled.
>>>>>     if ( enabled )
>>>>>         write_lock(d->vpci_lock)
>>>>>     else
>>>>>         read_lock(d->vpci_lock)
>>>>> }
>>>> Hmm, yes, you can actually get away without using "size", since both
>>>> command register and ROM BAR are 32-bit aligned registers, and 64-bit
>>>> accesses get split in vpci_ecam_write().
>>> But an OS may want to read a single byte of the ROM BAR, so I think
>>> I'll need to check whether reg+size falls into the PCI_COMMAND and ROM BAR
>>> ranges
>>>> For the command register the memory- / IO-decoding-enabled check may
>>>> end up a little more complicated, as the value to be written also
>>>> matters. Maybe read the command register only for the ROM BAR write,
>>>> using the write lock uniformly for all command register writes?
>>> Sounds good for the start.
>>> Another concern is that if we go with a read_lock and then in the
>>> underlying code we disable memory decoding and try doing
>>> something and calling cmd_write handler for any reason then
>>>
>>> I mean that the check in vpci_write is something we can tolerate,
>>> but then it must be considered that no code in the read path
>>> is allowed to perform write-path functions. Which brings a pretty
>>> valid use-case: say in read mode we detect an unrecoverable error
>>> and need to remove the device:
>>> vpci_process_pending -> ERROR -> vpci_remove_device or similar.
>>>
>>> What do we do then? It is all going to be fragile...
>> I have tried to summarize the options we have wrt locking
>> and would love to hear from @Roger and @Jan.
>>
>> In every variant there is a task of dealing with the overlap
>> detection in modify_bars, so this is the only place as of now
>> which needs special treatment.
>>
>> Existing limitations: there is no way to upgrade a read lock to a write
>> lock, so paths which may require write lock protection need to use
>> write lock from the very beginning. Workarounds can be applied.
>>

Re: [PATCH v6 05/13] vpci: add hooks for PCI device assign/de-assign

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 12:29, Jan Beulich wrote:
> On 08.02.2022 11:22, Oleksandr Andrushchenko wrote:
>>
>> On 08.02.22 12:09, Jan Beulich wrote:
>>> On 08.02.2022 10:55, Oleksandr Andrushchenko wrote:
>>>> On 08.02.22 11:44, Jan Beulich wrote:
>>>>> On 08.02.2022 10:27, Oleksandr Andrushchenko wrote:
>>>>>> On 08.02.22 11:13, Jan Beulich wrote:
>>>>>>> On 08.02.2022 09:32, Oleksandr Andrushchenko wrote:
>>>>>>>> On 07.02.22 18:28, Jan Beulich wrote:
>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>>>> @@ -1507,6 +1511,8 @@ static int assign_device(struct domain *d, u16 
>>>>>>>>>> seg, u8 bus, u8 devfn, u32 flag)
>>>>>>>>>>   pci_to_dev(pdev), flag);
>>>>>>>>>>   }
>>>>>>>>>>   
>>>>>>>>>> +rc = vpci_assign_device(d, pdev);
>>>>>>>>>> +
>>>>>>>>>>done:
>>>>>>>>>>   if ( rc )
>>>>>>>>>>   printk(XENLOG_G_WARNING "%pd: assign (%pp) failed 
>>>>>>>>>> (%d)\n",
>>>>>>>>> There's no attempt to undo anything in the case of getting back an
>>>>>>>>> error. ISTR this being deemed okay on the basis that the tool stack
>>>>>>>>> would then take whatever action, but whatever it is that is supposed
>>>>>>>>> to deal with errors here wants spelling out in the description.
>>>>>>>> Why? I don't change the previously expected decision and implementation
>>>>>>>> of the assign_device function: I use error paths as they were used 
>>>>>>>> before
>>>>>>>> for the existing code. So, I see no clear reason to stress that the 
>>>>>>>> existing
>>>>>>>> and new code relies on the toolstack
>>>>>>> Saying half a sentence on this is helping review.
>>>>>> Ok
>>>>>>>>> What's important is that no caller up the call tree may be left with
>>>>>>>>> the impression that the device is still owned by the original
>>>>>>>>> domain. With how you have it, the device is going to be owned by the
>>>>>>>>> new domain, but not really usable.
>>>>>>>> This is not true: vpci_assign_device will call vpci_deassign_device
>>>>>>>> internally if it fails. So, the device won't be assigned in this case
>>>>>>> No. The device is assigned to whatever pdev->domain holds. Calling
>>>>>>> vpci_deassign_device() there merely makes sure that the device will
>>>>>>> have _no_ vPCI data and hooks in place, rather than something
>>>>>>> partial.
>>>>>> So, this patch is only dealing with vpci assign/de-assign
>>>>>> And it rolls back what it did in case of a failure
>>>>>> It also returns rc in assign_device to signal it has failed
>>>>>> What else is expected from this patch??
>>>>> Until now if assign_device() returns an error, this tells the caller
>>>>> that the device did not change ownership;
>>>> Not sure this is the case:
>>>>    if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn,
>>>>  pci_to_dev(pdev), flag)) )
>>>> iommu_call can leave the new ownership even now without
>>>> vpci_assign_device.
>>> Did you check the actual hook functions for when exactly the ownership
>>> change happens. For both VT-d and AMD it is the last thing they do,
>>> when no error can occur anymore.
>> This functionality does not exist for Arm yet, so this is up to the
>> future series to add that.
>>
>> WRT to the existing code:
>>
>> static int amd_iommu_assign_device(struct domain *d, u8 devfn,
>>      struct pci_dev *pdev,
>>      u32 flag)
>> {
>>       if ( !rc )
>>       rc = reassign_device(pdev->domain, d, devfn, pdev); <<<<< this 
>> will set pdev->domain
>>
>>       if ( rc && !is_hardware_domain(d) )
>>       {
>>       int ret = amd_iommu_reserve_domain

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 12:11, Roger Pau Monné wrote:
> On Mon, Feb 07, 2022 at 05:37:49PM +0100, Jan Beulich wrote:
>> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote:
>>>
>>> On 07.02.22 18:15, Jan Beulich wrote:
>>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote:
>>>>> On 07.02.22 17:26, Jan Beulich wrote:
>>>>>> 1b. Make vpci_write use write lock for writes to command register and 
>>>>>> BARs
>>>>>> only; keep using the read lock for all other writes.
>>>>> I am not quite sure how to do that. Do you mean something like:
>>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>>>>    uint32_t data)
>>>>> [snip]
>>>>>    list_for_each_entry ( r, &pdev->vpci->handlers, node )
>>>>> {
>>>>> [snip]
>>>>>    if ( r->needs_write_lock)
>>>>>        write_lock(d->vpci_lock)
>>>>>    else
>>>>>        read_lock(d->vpci_lock)
>>>>> 
>>>>>
>>>>> And provide rw as an argument to:
>>>>>
>>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
>>>>>  vpci_write_t *write_handler, unsigned int offset,
>>>>>  unsigned int size, void *data, --->>> bool 
>>>>> write_path <<<-)
>>>>>
>>>>> Is this what you mean?
>>>> This sounds overly complicated. You can derive locally in vpci_write(),
>>>> from just its "reg" and "size" parameters, whether the lock needs taking
>>>> in write mode.
>>> Yes, I started writing a reply with that. So, the summary (ROM
>>> position depends on header type):
>>> if ( (reg == PCI_COMMAND) || (reg == ROM) )
>>> {
>>>       read PCI_COMMAND and see if memory or IO decoding are enabled.
>>>       if ( enabled )
>>>           write_lock(d->vpci_lock)
>>>       else
>>>           read_lock(d->vpci_lock)
>>> }
>> Hmm, yes, you can actually get away without using "size", since both
>> command register and ROM BAR are 32-bit aligned registers, and 64-bit
>> accesses get split in vpci_ecam_write().
>>
>> For the command register the memory- / IO-decoding-enabled check may
>> end up a little more complicated, as the value to be written also
>> matters. Maybe read the command register only for the ROM BAR write,
>> using the write lock uniformly for all command register writes?
>>
>>> Do you also think we can drop pdev->vpci (or currently pdev->vpci->lock)
>>> at all then?
>> I haven't looked at this in any detail, sorry. It sounds possible,
>> yes.
> AFAICT you should avoid taking the per-device vpci lock when you take
> the per-domain lock in write mode. Otherwise you still need the
> per-device vpci lock in order to keep consistency between concurrent
> accesses to the device registers.
I have sent an e-mail this morning describing possible locking schemes.
Could we please move there and continue if you don't mind?
>
> Thanks, Roger.
Thank you in advance,
Oleksandr

Re: [PATCH v6 06/13] vpci/header: implement guest BAR register handlers

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 12:15, Jan Beulich wrote:
> On 08.02.2022 10:57, Oleksandr Andrushchenko wrote:
>> On 08.02.22 11:48, Jan Beulich wrote:
>>> On 08.02.2022 10:31, Oleksandr Andrushchenko wrote:
>>>> On 08.02.22 11:25, Roger Pau Monné wrote:
>>>>> On Fri, Feb 04, 2022 at 08:34:52AM +0200, Oleksandr Andrushchenko wrote:
>>>>>> @@ -516,6 +594,11 @@ static int init_bars(struct pci_dev *pdev)
>>>>>> if ( (val & PCI_BASE_ADDRESS_SPACE) == 
>>>>>> PCI_BASE_ADDRESS_SPACE_IO )
>>>>>> {
>>>>>> bars[i].type = VPCI_BAR_IO;
>>>>>> +
>>>>>> +rc = bar_ignore_access(pdev, reg, &bars[i]);
>>>>> This is wrong: you only want to ignore access to IO BARs for Arm, for
>>>>> x86 we should keep the previous behavior. Even more if you go with
>>>>> Jan's suggestions to make bar_ignore_access also applicable to dom0.
>>>> How do we want this?
>>>> #ifdef CONFIG_ARM?
>>> Afaic better via a new, dedicated CONFIG_HAVE_* setting, which x86 selects
>>> but Arm doesn't. Unless we have one already, of course ...
>> Could you please be more specific on the name you see appropriate?
> I'm pretty sure Linux has something similar, so I'd like to ask that
> you go look there.
Not sure, but I can have a look
>   I'm sorry to say this a little bluntly, but I'm
> really in need of doing something beyond answering your mails
Well, if answers were a bit more specific and not so general
sometimes, this could definitely be helpful and save a lot of time trying
to guess what the other party has in mind.
>   (and
> in part re-stating the same thing again and again).
I have no comments on this.
>
>> And do you realize that this is going to be a single user of such a
>> setting?
> Yes, but I'm not sure this is going to remain just a single use.
> Furthermore every CONFIG_ is problematic as soon as a new port
> is being worked on. If we wanted to go with a CONFIG_ here, imo
> it ought to be CONFIG_X86, not CONFIG_ARM, as I/O ports are really an
> x86-specific thing (which has propagated into other architectures in
> more or less strange ways, but never as truly I/O ports).
I am fine using CONFIG_X86
@Roger, are you ok with that?
>
> Jan
>
Thank you,
Oleksandr
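
A sketch of how the Kconfig-guarded IO BAR handling discussed above could be
expressed, assuming bar_ignore_access() from the series. The init_io_bar()
wrapper and the exact guard (CONFIG_X86 vs. a new CONFIG_HAVE_* option) are
illustrative only.

static int init_io_bar(const struct pci_dev *pdev, unsigned int reg,
                       struct vpci_bar *bar)
{
    bar->type = VPCI_BAR_IO;

#ifdef CONFIG_X86
    /* x86: keep the previous behaviour for IO BARs. */
    return 0;
#else
    /* No I/O port space on this architecture: hide the BAR from guests. */
    return bar_ignore_access(pdev, reg, bar);
#endif
}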

Re: [PATCH v6 05/13] vpci: add hooks for PCI device assign/de-assign

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 12:09, Jan Beulich wrote:
> On 08.02.2022 10:55, Oleksandr Andrushchenko wrote:
>>
>> On 08.02.22 11:44, Jan Beulich wrote:
>>> On 08.02.2022 10:27, Oleksandr Andrushchenko wrote:
>>>> On 08.02.22 11:13, Jan Beulich wrote:
>>>>> On 08.02.2022 09:32, Oleksandr Andrushchenko wrote:
>>>>>> On 07.02.22 18:28, Jan Beulich wrote:
>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>> @@ -1507,6 +1511,8 @@ static int assign_device(struct domain *d, u16 
>>>>>>>> seg, u8 bus, u8 devfn, u32 flag)
>>>>>>>>  pci_to_dev(pdev), flag);
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> +rc = vpci_assign_device(d, pdev);
>>>>>>>> +
>>>>>>>>   done:
>>>>>>>>  if ( rc )
>>>>>>>>  printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
>>>>>>> There's no attempt to undo anything in the case of getting back an
>>>>>>> error. ISTR this being deemed okay on the basis that the tool stack
>>>>>>> would then take whatever action, but whatever it is that is supposed
>>>>>>> to deal with errors here wants spelling out in the description.
>>>>>> Why? I don't change the previously expected decision and implementation
>>>>>> of the assign_device function: I use error paths as they were used before
>>>>>> for the existing code. So, I see no clear reason to stress that the 
>>>>>> existing
>>>>>> and new code relies on the toolstack
>>>>> Saying half a sentence on this is helping review.
>>>> Ok
>>>>>>> What's important is that no caller up the call tree may be left with
>>>>>>> the impression that the device is still owned by the original
>>>>>>> domain. With how you have it, the device is going to be owned by the
>>>>>>> new domain, but not really usable.
>>>>>> This is not true: vpci_assign_device will call vpci_deassign_device
>>>>>> internally if it fails. So, the device won't be assigned in this case
>>>>> No. The device is assigned to whatever pdev->domain holds. Calling
>>>>> vpci_deassign_device() there merely makes sure that the device will
>>>>> have _no_ vPCI data and hooks in place, rather than something
>>>>> partial.
>>>> So, this patch is only dealing with vpci assign/de-assign
>>>> And it rolls back what it did in case of a failure
>>>> It also returns rc in assign_device to signal it has failed
>>>> What else is expected from this patch??
>>> Until now if assign_device() returns an error, this tells the caller
>>> that the device did not change ownership;
>> Not sure this is the case:
>>       if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn,
>>     pci_to_dev(pdev), flag)) )
>> iommu_call can leave the new ownership even now without
>> vpci_assign_device.
> Did you check the actual hook functions for when exactly the ownership
> change happens. For both VT-d and AMD it is the last thing they do,
> when no error can occur anymore.
This functionality does not exist for Arm yet, so this is up to the
future series to add that.

WRT to the existing code:

static int amd_iommu_assign_device(struct domain *d, u8 devfn,
    struct pci_dev *pdev,
    u32 flag)
{
     if ( !rc )
     rc = reassign_device(pdev->domain, d, devfn, pdev); <<<<< this will 
set pdev->domain

     if ( rc && !is_hardware_domain(d) )
     {
     int ret = amd_iommu_reserve_domain_unity_unmap(
   d, ivrs_mappings[req_id].unity_map);

     if ( ret )
     {
     printk(XENLOG_ERR "AMD-Vi: "
    "unity-unmap for %pd/%04x:%02x:%02x.%u failed (%d)\n",
    d, pdev->seg, pdev->bus,
    PCI_SLOT(devfn), PCI_FUNC(devfn), ret);
     domain_crash(d);
     }
So

It is IMO wrong in the first place to let the IOMMU code assign pdev->domain.
This is something that needs to be done by the PCI code itself rather
than by relying on each IOMMU callback implementation
>
>   My understanding is that the roll-back is
>> expected to be performed by the toolstack and vpci_assign_device
>> doesn't prevent that by returning rc. Even more, before we discussed
>> that it would be good for vpci_assign_device to try recovering from
>> a possible error early which is done by calling vpci_deassign_device
>> internally.
> Yes, but that's only part of it. It at least needs considering what
> effects have resulted from operations prior to vpci_assign_device().
Taking into account the code snippet above: what is your expectation
from this patch with this respect?

>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 09/13] vpci/header: emulate PCI_COMMAND register for guests

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 11:52, Jan Beulich wrote:
> On 08.02.2022 10:38, Oleksandr Andrushchenko wrote:
>>
>> On 08.02.22 11:33, Jan Beulich wrote:
>>> On 08.02.2022 09:13, Oleksandr Andrushchenko wrote:
>>>> On 04.02.22 16:25, Jan Beulich wrote:
>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>> @@ -454,6 +454,22 @@ static void cmd_write(const struct pci_dev *pdev, 
>>>>>> unsigned int reg,
>>>>>> pci_conf_write16(pdev->sbdf, reg, cmd);
>>>>>> }
>>>>>> 
>>>>>> +static void guest_cmd_write(const struct pci_dev *pdev, unsigned int 
>>>>>> reg,
>>>>>> +uint32_t cmd, void *data)
>>>>>> +{
>>>>>> +/* TODO: Add proper emulation for all bits of the command register. 
>>>>>> */
>>>>>> +
>>>>>> +#ifdef CONFIG_HAS_PCI_MSI
>>>>>> +if ( pdev->vpci->msi->enabled || pdev->vpci->msix->enabled )
>>>>>> +{
>>>>>> +/* Guest wants to enable INTx. It can't be enabled if MSI/MSI-X 
>>>>>> enabled. */
>>>>>> +cmd |= PCI_COMMAND_INTX_DISABLE;
>>>>>> +}
>>>>>> +#endif
>>>>>> +
>>>>>> +cmd_write(pdev, reg, cmd, data);
>>>>>> +}
>>>>> It's not really clear to me whether the TODO warrants this being a
>>>>> separate function. Personally I'd find it preferable if the logic
>>>>> was folded into cmd_write().
>>>> Not sure cmd_write needs to have guest's logic. And what's the
>>>> profit? Later on, when we decide how PCI_COMMAND can be emulated
>>>> this code will live in guest_cmd_write anyways
>>> Why "will"? There's nothing conceptually wrong with putting all the
>>> emulation logic into cmd_write(), inside an if(!hwdom) conditional.
>>> If and when we gain CET-IBT support on the x86 side (and I'm told
>>> there's an Arm equivalent of this), then to make this as useful as
>>> possible it is going to be desirable to limit the number of functions
>>> called through function pointers. You may have seen Andrew's huge
>>> "x86: Support for CET Indirect Branch Tracking" series. We want to
>>> keep down the number of such annotations; the vast part of the series
>>> is about adding of such.
>> Well, while I see nothing bad with that, from the code organization
>> it would look a bit strange: we don't differentiate hwdom in vpci
>> handlers, but instead provide one for hwdom and one for guests.
>> While I understand your concern I still think that at the moment
>> it will be more in line with the existing code if we provide a dedicated
>> handler.
> The existing code only deals with Dom0, and hence doesn't have any
> pairs of handlers.
This is fair
>   FTAOD what I said above applies equally to other
> separate guest read/write handlers you may be introducing. The
> exception being when e.g. a hardware access handler is put in place
> for Dom0 (for obvious reasons, I think).
@Roger, what's your preference here?
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 06/13] vpci/header: implement guest BAR register handlers

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 11:48, Jan Beulich wrote:
> On 08.02.2022 10:31, Oleksandr Andrushchenko wrote:
>> On 08.02.22 11:25, Roger Pau Monné wrote:
>>> On Fri, Feb 04, 2022 at 08:34:52AM +0200, Oleksandr Andrushchenko wrote:
>>>> @@ -516,6 +594,11 @@ static int init_bars(struct pci_dev *pdev)
>>>>if ( (val & PCI_BASE_ADDRESS_SPACE) == 
>>>> PCI_BASE_ADDRESS_SPACE_IO )
>>>>{
>>>>bars[i].type = VPCI_BAR_IO;
>>>> +
>>>> +rc = bar_ignore_access(pdev, reg, &bars[i]);
>>> This is wrong: you only want to ignore access to IO BARs for Arm, for
>>> x86 we should keep the previous behavior. Even more if you go with
>>> Jan's suggestions to make bar_ignore_access also applicable to dom0.
>> How do we want this?
>> #ifdef CONFIG_ARM?
> Afaic better via a new, dedicated CONFIG_HAVE_* setting, which x86 selects
> but Arm doesn't. Unless we have one already, of course ...
Could you please be more specific on the name you see appropriate?
And do you realize that this is going to be a single user of such a
setting?
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 05/13] vpci: add hooks for PCI device assign/de-assign

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 11:44, Jan Beulich wrote:
> On 08.02.2022 10:27, Oleksandr Andrushchenko wrote:
>> On 08.02.22 11:13, Jan Beulich wrote:
>>> On 08.02.2022 09:32, Oleksandr Andrushchenko wrote:
>>>> On 07.02.22 18:28, Jan Beulich wrote:
>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>> @@ -1507,6 +1511,8 @@ static int assign_device(struct domain *d, u16 
>>>>>> seg, u8 bus, u8 devfn, u32 flag)
>>>>>> pci_to_dev(pdev), flag);
>>>>>> }
>>>>>> 
>>>>>> +rc = vpci_assign_device(d, pdev);
>>>>>> +
>>>>>>  done:
>>>>>> if ( rc )
>>>>>> printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
>>>>> There's no attempt to undo anything in the case of getting back an
>>>>> error. ISTR this being deemed okay on the basis that the tool stack
>>>>> would then take whatever action, but whatever it is that is supposed
>>>>> to deal with errors here wants spelling out in the description.
>>>> Why? I don't change the previously expected decision and implementation
>>>> of the assign_device function: I use error paths as they were used before
>>>> for the existing code. So, I see no clear reason to stress that the 
>>>> existing
>>>> and new code relies on the toolstack
>>> Saying half a sentence on this is helping review.
>> Ok
>>>>> What's important is that no caller up the call tree may be left with
>>>>> the impression that the device is still owned by the original
>>>>> domain. With how you have it, the device is going to be owned by the
>>>>> new domain, but not really usable.
>>>> This is not true: vpci_assign_device will call vpci_deassign_device
>>>> internally if it fails. So, the device won't be assigned in this case
>>> No. The device is assigned to whatever pdev->domain holds. Calling
>>> vpci_deassign_device() there merely makes sure that the device will
>>> have _no_ vPCI data and hooks in place, rather than something
>>> partial.
>> So, this patch is only dealing with vpci assign/de-assign
>> And it rolls back what it did in case of a failure
>> It also returns rc in assign_device to signal it has failed
>> What else is expected from this patch??
> Until now if assign_device() returns an error, this tells the caller
> that the device did not change ownership;
Not sure this is the case:
     if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn,
   pci_to_dev(pdev), flag)) )
iommu_call can already leave the device with the new ownership even without
vpci_assign_device. My understanding is that the roll-back is expected to be
performed by the toolstack, and vpci_assign_device doesn't prevent that by
returning rc. Moreover, we discussed before that it would be good for
vpci_assign_device to try recovering from a possible error early, which is
done by calling vpci_deassign_device internally.

So, if you want things to be handled cleanly without relying on the
toolstack, then this is not an issue introduced by vpci_assign_device, but an
existing one, which needs (if there is a good reason) to be fixed
separately.
I think the new code doesn't make things worse. At least

>   in the worst case it either
> only moved to the quarantine domain, or the new owner may have been
> crashed. In no case is the device owned by an alive DomU. You're
> changing this property, and hence you need to make clear/sure that
> this isn't colliding with assumptions made elsewhere.
>
> Jan
>
>


Re: [PATCH v6 09/13] vpci/header: emulate PCI_COMMAND register for guests

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 11:33, Jan Beulich wrote:
> On 08.02.2022 09:13, Oleksandr Andrushchenko wrote:
>> On 04.02.22 16:25, Jan Beulich wrote:
>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>> --- a/xen/drivers/vpci/header.c
>>>> +++ b/xen/drivers/vpci/header.c
>>>> @@ -454,6 +454,22 @@ static void cmd_write(const struct pci_dev *pdev, 
>>>> unsigned int reg,
>>>>pci_conf_write16(pdev->sbdf, reg, cmd);
>>>>}
>>>>
>>>> +static void guest_cmd_write(const struct pci_dev *pdev, unsigned int reg,
>>>> +uint32_t cmd, void *data)
>>>> +{
>>>> +/* TODO: Add proper emulation for all bits of the command register. */
>>>> +
>>>> +#ifdef CONFIG_HAS_PCI_MSI
>>>> +if ( pdev->vpci->msi->enabled || pdev->vpci->msix->enabled )
>>>> +{
>>>> +/* Guest wants to enable INTx. It can't be enabled if MSI/MSI-X 
>>>> enabled. */
>>>> +cmd |= PCI_COMMAND_INTX_DISABLE;
>>>> +}
>>>> +#endif
>>>> +
>>>> +cmd_write(pdev, reg, cmd, data);
>>>> +}
>>> It's not really clear to me whether the TODO warrants this being a
>>> separate function. Personally I'd find it preferable if the logic
>>> was folded into cmd_write().
>> Not sure cmd_write needs to have guest's logic. And what's the
>> profit? Later on, when we decide how PCI_COMMAND can be emulated
>> this code will live in guest_cmd_write anyways
> Why "will"? There's nothing conceptually wrong with putting all the
> emulation logic into cmd_write(), inside an if(!hwdom) conditional.
> If and when we gain CET-IBT support on the x86 side (and I'm told
> there's an Arm equivalent of this), then to make this as useful as
> possible it is going to be desirable to limit the number of functions
> called through function pointers. You may have seen Andrew's huge
> "x86: Support for CET Indirect Branch Tracking" series. We want to
> keep down the number of such annotations; the vast part of the series
> is about adding of such.
Well, while I see nothing wrong with that, from a code organization point of
view it would look a bit strange: we don't differentiate hwdom inside the vpci
handlers, but instead provide one handler for hwdom and one for guests.
While I understand your concern, I still think that at the moment it is more
in line with the existing code to provide a dedicated handler.

Once we are all set with the handlers we may want to do a refactoring that
limits the number of register handlers.
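
For illustration, a minimal sketch of the folded variant being discussed,
assuming the same msi/msix fields as in the patch above (the
CONFIG_HAS_PCI_MSI guard and the rest of cmd_write()'s body are omitted for
brevity; this is not meant as the final implementation):

static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
                      uint32_t cmd, void *data)
{
    /* Guest-only adjustments folded in behind a hardware domain check. */
    if ( !is_hardware_domain(pdev->domain) &&
         (pdev->vpci->msi->enabled || pdev->vpci->msix->enabled) )
        /* INTx can't be enabled while MSI/MSI-X is enabled. */
        cmd |= PCI_COMMAND_INTX_DISABLE;

    /* ... existing cmd_write() handling (memory decoding etc) ... */
    pci_conf_write16(pdev->sbdf, reg, cmd);
}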

@Roger, what's your view on this?
> Jan
>
>
Thank you,
Oleksandr

Re: [PATCH v6 06/13] vpci/header: implement guest BAR register handlers

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 11:25, Roger Pau Monné wrote:
> On Fri, Feb 04, 2022 at 08:34:52AM +0200, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko 
>>
>> Add relevant vpci register handlers when assigning PCI device to a domain
>> and remove those when de-assigning. This allows having different
>> handlers for different domains, e.g. hwdom and other guests.
>>
>> Emulate guest BAR register values: this allows creating a guest view
>> of the registers and emulates size and properties probe as it is done
>> during PCI device enumeration by the guest.
>>
>> All empty, IO and ROM BARs for guests are emulated by returning 0 on
>> reads and ignoring writes: this BARs are special with this respect as
>> their lower bits have special meaning, so returning default ~0 on read
>> may confuse guest OS.
>>
>> Memory decoding is initially disabled when used by guests in order to
>> prevent the BAR being placed on top of a RAM region.
>>
>> Signed-off-by: Oleksandr Andrushchenko 
>> ---
>> Since v5:
>> - make sure that the guest set address has the same page offset
>>as the physical address on the host
>> - remove guest_rom_{read|write} as those just implement the default
>>behaviour of the registers not being handled
>> - adjusted comment for struct vpci.addr field
>> - add guest handlers for BARs which are not handled and will otherwise
>>return ~0 on read and ignore writes. The BARs are special with this
>>respect as their lower bits have special meaning, so returning ~0
>>doesn't seem to be right
>> Since v4:
>> - updated commit message
>> - s/guest_addr/guest_reg
>> Since v3:
>> - squashed two patches: dynamic add/remove handlers and guest BAR
>>handler implementation
>> - fix guest BAR read of the high part of a 64bit BAR (Roger)
>> - add error handling to vpci_assign_device
>> - s/dom%pd/%pd
>> - blank line before return
>> Since v2:
>> - remove unneeded ifdefs for CONFIG_HAS_VPCI_GUEST_SUPPORT as more code
>>has been eliminated from being built on x86
>> Since v1:
>>   - constify struct pci_dev where possible
>>   - do not open code is_system_domain()
>>   - simplify some code
>>   - use gdprintk + error code instead of gprintk
>>   - gate vpci_bar_{add|remove}_handlers with CONFIG_HAS_VPCI_GUEST_SUPPORT,
>> so these do not get compiled for x86
>>   - removed unneeded is_system_domain check
>>   - re-work guest read/write to be much simpler and do more work on write
>> than read which is expected to be called more frequently
>>   - removed one too obvious comment
>> ---
>>   xen/drivers/vpci/header.c | 131 +-
>>   xen/include/xen/vpci.h|   3 +
>>   2 files changed, 118 insertions(+), 16 deletions(-)
>>
>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>> index bd23c0274d48..2620a95ff35b 100644
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -406,6 +406,81 @@ static void bar_write(const struct pci_dev *pdev, 
>> unsigned int reg,
>>   pci_conf_write32(pdev->sbdf, reg, val);
>>   }
>>   
>> +static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>> +uint32_t val, void *data)
>> +{
>> +struct vpci_bar *bar = data;
>> +bool hi = false;
>> +uint64_t guest_reg = bar->guest_reg;
>> +
>> +if ( bar->type == VPCI_BAR_MEM64_HI )
>> +{
>> +ASSERT(reg > PCI_BASE_ADDRESS_0);
>> +bar--;
>> +hi = true;
>> +}
>> +else
>> +{
>> +val &= PCI_BASE_ADDRESS_MEM_MASK;
>> +val |= bar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>> +   : PCI_BASE_ADDRESS_MEM_TYPE_64;
>> +val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>> +}
>> +
>> +guest_reg &= ~(0xffffffffull << (hi ? 32 : 0));
>> +guest_reg |= (uint64_t)val << (hi ? 32 : 0);
>> +
>> +guest_reg &= ~(bar->size - 1) | ~PCI_BASE_ADDRESS_MEM_MASK;
>> +
>> +/*
>> + * Make sure that the guest set address has the same page offset
>> + * as the physical address on the host or otherwise things won't work as
>> + * expected.
>> + */
>> +if ( (guest_reg & (~PAGE_MASK & PCI_BASE_ADDRESS_MEM_MASK)) !=
>> + (bar->addr & ~PAGE_MASK) )
> This is only r

Re: [PATCH v6 05/13] vpci: add hooks for PCI device assign/de-assign

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 11:13, Jan Beulich wrote:
> On 08.02.2022 09:32, Oleksandr Andrushchenko wrote:
>> On 07.02.22 18:28, Jan Beulich wrote:
>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>> @@ -1507,6 +1511,8 @@ static int assign_device(struct domain *d, u16 seg, 
>>>> u8 bus, u8 devfn, u32 flag)
>>>>pci_to_dev(pdev), flag);
>>>>}
>>>>
>>>> +rc = vpci_assign_device(d, pdev);
>>>> +
>>>> done:
>>>>if ( rc )
>>>>printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
>>> There's no attempt to undo anything in the case of getting back an
>>> error. ISTR this being deemed okay on the basis that the tool stack
>>> would then take whatever action, but whatever it is that is supposed
>>> to deal with errors here wants spelling out in the description.
>> Why? I don't change the previously expected decision and implementation
>> of the assign_device function: I use error paths as they were used before
>> for the existing code. So, I see no clear reason to stress that the existing
>> and new code relies on the toolstack
> Saying half a sentence on this is helping review.
Ok
>
>>> What's important is that no caller up the call tree may be left with
>>> the impression that the device is still owned by the original
>>> domain. With how you have it, the device is going to be owned by the
>>> new domain, but not really usable.
>> This is not true: vpci_assign_device will call vpci_deassign_device
>> internally if it fails. So, the device won't be assigned in this case
> No. The device is assigned to whatever pdev->domain holds. Calling
> vpci_deassign_device() there merely makes sure that the device will
> have _no_ vPCI data and hooks in place, rather than something
> partial.
So, this patch is only dealing with vpci assign/de-assign
And it rolls back what it did in case of a failure
It also returns rc in assign_device to signal it has failed
What else is expected from this patch??
>
>>>> --- a/xen/drivers/vpci/vpci.c
>>>> +++ b/xen/drivers/vpci/vpci.c
>>>> @@ -99,6 +99,33 @@ int vpci_add_handlers(struct pci_dev *pdev)
>>>>
>>>>return rc;
>>>>}
>>>> +
>>>> +#ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
>>>> +/* Notify vPCI that device is assigned to guest. */
>>>> +int vpci_assign_device(struct domain *d, struct pci_dev *pdev)
>>>> +{
>>>> +int rc;
>>>> +
>>>> +if ( !has_vpci(d) )
>>>> +return 0;
>>>> +
>>>> +rc = vpci_add_handlers(pdev);
>>>> +if ( rc )
>>>> +vpci_deassign_device(d, pdev);
>>>> +
>>>> +return rc;
>>>> +}
>>>> +
>>>> +/* Notify vPCI that device is de-assigned from guest. */
>>>> +void vpci_deassign_device(struct domain *d, struct pci_dev *pdev)
>>>> +{
>>>> +if ( !has_vpci(d) )
>>>> +return;
>>>> +
>>>> +vpci_remove_device(pdev);
>>>> +}
>>>> +#endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>>> While for the latter function you look to need two parameters, do you
>>> really need them also in the former one?
>> Do you mean instead of passing d we could just use pdev->domain?
>> int vpci_assign_device(struct pci_dev *pdev)
>> +{
>> +    int rc;
>> +
>> +    if ( !has_vpci(pdev->domain) )
>> +    return 0;
> Yes.
>
>> Yes, we probably can, but the rest of functions called from assign_device
>> are accepting both d and pdev, so not sure why would we want these
>> two be any different. Any good reason not to change others as well then?
> Yes: Prior to the call of the ->assign_device() hook, d != pdev->domain.
> It is the _purpose_ of this function to change ownership of the device.
This can be done and makes sense.
@Roger which way do you want this?
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 04/13] vpci: restrict unhandled read/write operations for guests

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 11:05, Roger Pau Monné wrote:
> On Tue, Feb 08, 2022 at 08:00:28AM +0000, Oleksandr Andrushchenko wrote:
>> On 04.02.22 16:24, Oleksandr Andrushchenko wrote:
>>> On 04.02.22 16:11, Jan Beulich wrote:
>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>> A guest can read and write those registers which are not emulated and
>>>>> have no respective vPCI handlers, so it can access the HW directly.
>>>> I don't think this describes the present situation. Or did I miss where
>>>> devices can actually be exposed to guests already, despite much of the
>>>> support logic still missing?
>>> No, they are not exposed yet and you know that.
>>> I will update the commit message
>> BTW, all this work is about adding vpci for guests and of course this
>> is not going to be enabled right away.
>> I would like to hear the common acceptable way of documenting such
>> things: either we just say something like "A guest can read and write"
>> elsewhere or we need to invent something neutral not directly mentioning
>> what the change does. With the latter it all seems a bit confusing IMO
>> as we do know what we are doing and for what reason: enable vpci for guests
>>>>> In order to prevent a guest from reads and writes from/to the unhandled
>>>>> registers make sure only hardware domain can access HW directly and 
>>>>> restrict
>>>>> guests from doing so.
>>>> Tangential question: Going over the titles of the remaining patches I
>>>> notice patch 6 is going to deal with BAR accesses. But (going just
>>>> from the titles) I can't spot anywhere that vendor and device IDs
>>>> would be exposed to guests. Yet that's the first thing guests will need
>>>> in order to actually recognize devices. As said before, allowing guests
>>>> access to such r/o fields is quite likely going to be fine.
>>> Agree, I was thinking about adding such a patch to allow IDs,
>>> but finally decided not to add more to this series.
>>> Again, the whole thing is not working yet and for the development
>>> this patch can/needs to be reverted. So, either we implement IDs
>>> or not this doesn't change anything with this respect
>> Roger, do you want an additional patch with IDs in v7?
> I would expect a lot more work to be required, you need IDs and the
> Header type as a minimum I would say. And then in order to have
> something functional you will also need to handle the capabilities
> pointer.
>
> I'm fine for this to be added in a followup series. I think it's clear
> the status after this series is not going to be functional.
Ok, so let's first have something in place and then we can extend guest
support. This can go in parallel with other work on Arm which is still
waiting for this series to be accepted.
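
For reference, exposing just the read-only IDs could look roughly like the
sketch below, reusing the existing vpci_add_register()/vpci_hw_read16()
helpers (the function name here is made up, and the Header type and
capabilities handling mentioned above would still be missing):

static int init_guest_ids(struct pci_dev *pdev)
{
    int rc;

    /* Vendor/device IDs are read-only for the guest: no write handler. */
    rc = vpci_add_register(pdev->vpci, vpci_hw_read16, NULL,
                           PCI_VENDOR_ID, 2, NULL);
    if ( rc )
        return rc;

    return vpci_add_register(pdev->vpci, vpci_hw_read16, NULL,
                             PCI_DEVICE_ID, 2, NULL);
}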
>
>>>>> --- a/xen/drivers/vpci/vpci.c
>>>>> +++ b/xen/drivers/vpci/vpci.c
>>>>> @@ -215,11 +215,15 @@ int vpci_remove_register(struct vpci *vpci, 
>>>>> unsigned int offset,
>>>>> }
>>>>> 
>>>>> /* Wrappers for performing reads/writes to the underlying hardware. */
>>>>> -static uint32_t vpci_read_hw(pci_sbdf_t sbdf, unsigned int reg,
>>>>> +static uint32_t vpci_read_hw(bool is_hwdom, pci_sbdf_t sbdf, unsigned 
>>>>> int reg,
>>>>>  unsigned int size)
>>>> Was the passing around of a boolean the consensus which was reached?
>>> Was this patch committed yet?
>>>> Personally I'd fine it more natural if the two functions checked
>>>> current->domain themselves.
>>> This is also possible, but I would like to hear Roger's view on this as well
>>> I am fine either way
>> Roger, what's your maintainer's preference here? An additional argument
>> to vpci_read_hw or making it use current->domain internally?
> My recommendation would be to use current->domain. Handlers will
> always be executed in guest context, so there's no need to pass a
> parameter around.
ok, I'll use current->domain
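
Just to make the intended behaviour concrete, a sketch of vpci_read_hw()
using current->domain, with guests getting all-ones for registers that have
no handler (per the restricted behaviour discussed above); a sketch only:

static uint32_t vpci_read_hw(pci_sbdf_t sbdf, unsigned int reg,
                             unsigned int size)
{
    /* Only the hardware domain may access the underlying hardware. */
    if ( !is_hardware_domain(current->domain) )
        return ~(uint32_t)0;

    switch ( size )
    {
    case 4:
        return pci_conf_read32(sbdf, reg);
    case 2:
        return pci_conf_read16(sbdf, reg);
    case 1:
        return pci_conf_read8(sbdf, reg);
    default:
        /* Odd-size accesses are omitted in this sketch. */
        return ~(uint32_t)0;
    }
}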
>
> Thanks, Roger.
>
Thank you,
Oleksandr

Re: [PATCH v6 04/13] vpci: restrict unhandled read/write operations for guests

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 11:04, Jan Beulich wrote:
> On 08.02.2022 09:00, Oleksandr Andrushchenko wrote:
>> On 04.02.22 16:24, Oleksandr Andrushchenko wrote:
>>> On 04.02.22 16:11, Jan Beulich wrote:
>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>> A guest can read and write those registers which are not emulated and
>>>>> have no respective vPCI handlers, so it can access the HW directly.
>>>> I don't think this describes the present situation. Or did I miss where
>>>> devices can actually be exposed to guests already, despite much of the
>>>> support logic still missing?
>>> No, they are not exposed yet and you know that.
>>> I will update the commit message
>> BTW, all this work is about adding vpci for guests and of course this
>> is not going to be enabled right away.
>> I would like to hear the common acceptable way of documenting such
>> things: either we just say something like "A guest can read and write"
>> elsewhere or we need to invent something neutral not directly mentioning
>> what the change does. With the latter it all seems a bit confusing IMO
>> as we do know what we are doing and for what reason: enable vpci for guests
> What's the problem with describing things as they are? Code is hwdom-
> only right now, and you're trying to enable DomU support. Hence it's
> all about "would be able to", not "can".
Sounds good, will use that wording then
>
> Jan
>
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 10:57, Jan Beulich wrote:
> On 08.02.2022 08:35, Oleksandr Andrushchenko wrote:
>> 1.1. Semi read lock upgrade in modify bars
>> --
>> In this case both vpci_read and vpci_write take a read lock and when it comes
>> to modify_bars:
>>
>> 1. read_unlock(d->vpci_lock)
>> 2. write_lock(d->vpci_lock)
>> 3. Check that pdev->vpci is still available and is the same object:
>> if (pdev->vpci && (pdev->vpci == old_vpci) )
>> {
>>       /* vpci structure is valid and can be used. */
>> }
>> else
>> {
>>       /* vpci has gone, return an error. */
>> }
>>
>> Pros:
>> - no per-device vpci lock is needed?
>> - solves overlap code ABBA in modify_bars
>> - readers and writers are NOT serialized
>> - NO need to carefully select read paths, so they are guaranteed not to lead
>>     to lock upgrade use-cases
>>
>> Cons:
>> - ???
> The "pdev->vpci == old_vpci" is fragile: The struct may have got re-
> allocated, and it just so happened that the two pointers are identical.
>
> Same then for the subsequent variant 2.
Yes, it is possible. We can add an ID number to pdev->vpci, so each newly
allocated vpci structure has a unique ID which can be used to compare vpci
structures. It could be something like pdev->vpci->id = d->vpci_id++;
with id being uint32_t, for example.
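
A sketch of how such a check could look once the write lock has been
(re-)acquired; the id field and the helper below are hypothetical and only
illustrate the idea:

/* At allocation time, under the write lock: */
pdev->vpci->id = d->vpci_id++;

/* After dropping the read lock and taking d->vpci_lock in write mode: */
static struct vpci *revalidate_vpci(const struct pci_dev *pdev,
                                    uint32_t old_id)
{
    if ( pdev->vpci && pdev->vpci->id == old_id )
        return pdev->vpci;   /* Same object: safe to continue using it. */

    return NULL;             /* Freed and/or re-allocated: bail out. */
}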
>
> Jan
>
>


Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-08 Thread Oleksandr Andrushchenko


On 08.02.22 10:53, Jan Beulich wrote:
> On 07.02.2022 17:44, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 18:37, Jan Beulich wrote:
>>> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote:
>>>> On 07.02.22 18:15, Jan Beulich wrote:
>>>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote:
>>>>>> On 07.02.22 17:26, Jan Beulich wrote:
>>>>>>> 1b. Make vpci_write use write lock for writes to command register and 
>>>>>>> BARs
>>>>>>> only; keep using the read lock for all other writes.
>>>>>> I am not quite sure how to do that. Do you mean something like:
>>>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>>>>>     uint32_t data)
>>>>>> [snip]
>>>>>>     list_for_each_entry ( r, &pdev->vpci->handlers, node )
>>>>>> {
>>>>>> [snip]
>>>>>>     if ( r->needs_write_lock)
>>>>>>         write_lock(d->vpci_lock)
>>>>>>     else
>>>>>>         read_lock(d->vpci_lock)
>>>>>> 
>>>>>>
>>>>>> And provide rw as an argument to:
>>>>>>
>>>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
>>>>>>   vpci_write_t *write_handler, unsigned int 
>>>>>> offset,
>>>>>>   unsigned int size, void *data, --->>> bool 
>>>>>> write_path <<<-)
>>>>>>
>>>>>> Is this what you mean?
>>>>> This sounds overly complicated. You can derive locally in vpci_write(),
>>>>> from just its "reg" and "size" parameters, whether the lock needs taking
>>>>> in write mode.
>>>> Yes, I started writing a reply with that. So, the summary (ROM
>>>> position depends on header type):
>>>> if ( (reg == PCI_COMMAND) || (reg == ROM) )
>>>> {
>>>>    read PCI_COMMAND and see if memory or IO decoding are enabled.
>>>>    if ( enabled )
>>>>        write_lock(d->vpci_lock)
>>>>    else
>>>>        read_lock(d->vpci_lock)
>>>> }
>>> Hmm, yes, you can actually get away without using "size", since both
>>> command register and ROM BAR are 32-bit aligned registers, and 64-bit
>>> accesses get split in vpci_ecam_write().
>> But, the OS may want to read a single byte of the ROM BAR, so I think
>> I'll need to check whether reg+size falls into the PCI_COMMAND and ROM BAR
>> ranges
>>> For the command register the memory- / IO-decoding-enabled check may
>>> end up a little more complicated, as the value to be written also
>>> matters. Maybe read the command register only for the ROM BAR write,
>>> using the write lock uniformly for all command register writes?
>> Sounds good for the start.
>> Another concern is that if we go with a read_lock and then in the
>> underlying code we disable memory decoding and try doing
>> something and calling cmd_write handler for any reason then
>>
>> I mean that the check in vpci_write is something we can tolerate,
>> but then it must be considered that no code in the read path
>> is allowed to perform write path functions. Which brings a pretty
>> valid use-case: say in read mode we detect an unrecoverable error
>> and need to remove the device:
>> vpci_process_pending -> ERROR -> vpci_remove_device or similar.
>>
>> What do we do then? It is all going to be fragile...
> Real hardware won't cause a device to disappear upon a problem with
> a read access. There shouldn't be any need to remove a passed-through
> device either; such problems (if any) need handling differently imo.
Yes, at the moment there is a single place in the code which
removes the device (besides normal use-cases such as
pci_add_device on fail path and PHYSDEVOP_manage_pci_remove):

bool vpci_process_pending(struct vcpu *v)
{
[snip]
     if ( rc )
         /*
          * FIXME: in case of failure remove the device from the domain.
          * Note that there might still be leftover mappings. While this is
          * safe for Dom0, for DomUs the domain will likely need to be
          * killed in order to avoid leaking stale p2m mappings on
          * failure.
          */
         vpci_remove_device(v->vpci.pdev);

>
> Jan
>
>


Re: [PATCH v6 05/13] vpci: add hooks for PCI device assign/de-assign

2022-02-08 Thread Oleksandr Andrushchenko


On 07.02.22 18:28, Jan Beulich wrote:
> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>> @@ -1507,6 +1511,8 @@ static int assign_device(struct domain *d, u16 seg, u8 
>> bus, u8 devfn, u32 flag)
>>   pci_to_dev(pdev), flag);
>>   }
>>   
>> +rc = vpci_assign_device(d, pdev);
>> +
>>done:
>>   if ( rc )
>>   printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
> There's no attempt to undo anything in the case of getting back an
> error. ISTR this being deemed okay on the basis that the tool stack
> would then take whatever action, but whatever it is that is supposed
> to deal with errors here wants spelling out in the description.
Why? I don't change the previously expected decision and implementation
of the assign_device function: I use error paths as they were used before
for the existing code. So, I see no clear reason to stress that the existing
and new code relies on the toolstack
> What's important is that no caller up the call tree may be left with
> the impression that the device is still owned by the original
> domain. With how you have it, the device is going to be owned by the
> new domain, but not really usable.
This is not true: vpci_assign_device will call vpci_deassign_device
internally if it fails. So, the device won't be assigned in this case
>
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -99,6 +99,33 @@ int vpci_add_handlers(struct pci_dev *pdev)
>>   
>>   return rc;
>>   }
>> +
>> +#ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
>> +/* Notify vPCI that device is assigned to guest. */
>> +int vpci_assign_device(struct domain *d, struct pci_dev *pdev)
>> +{
>> +int rc;
>> +
>> +if ( !has_vpci(d) )
>> +return 0;
>> +
>> +rc = vpci_add_handlers(pdev);
>> +if ( rc )
>> +vpci_deassign_device(d, pdev);
>> +
>> +return rc;
>> +}
>> +
>> +/* Notify vPCI that device is de-assigned from guest. */
>> +void vpci_deassign_device(struct domain *d, struct pci_dev *pdev)
>> +{
>> +if ( !has_vpci(d) )
>> +return;
>> +
>> +vpci_remove_device(pdev);
>> +}
>> +#endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
> While for the latter function you look to need two parameters, do you
> really need them also in the former one?
Do you mean instead of passing d we could just use pdev->domain?
int vpci_assign_device(struct pci_dev *pdev)
+{
+    int rc;
+
+    if ( !has_vpci(pdev->domain) )
+    return 0;
Yes, we probably can, but the rest of the functions called from assign_device
accept both d and pdev, so I am not sure why we would want these two to be
any different. Any good reason not to change the others as well then?
> Symmetry considerations make me wonder though whether the de-assign
> hook shouldn't be called earlier, when pdev->domain still has the
> original owner. At which point the 2nd parameter could disappear there
> as well.
static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
{
[snip]
     vpci_deassign_device(pdev->domain, pdev);
[snip]
     rc = vpci_assign_device(d, pdev);

It looks ok to me
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 09/13] vpci/header: emulate PCI_COMMAND register for guests

2022-02-08 Thread Oleksandr Andrushchenko


On 04.02.22 16:25, Jan Beulich wrote:
> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -454,6 +454,22 @@ static void cmd_write(const struct pci_dev *pdev, 
>> unsigned int reg,
>>   pci_conf_write16(pdev->sbdf, reg, cmd);
>>   }
>>   
>> +static void guest_cmd_write(const struct pci_dev *pdev, unsigned int reg,
>> +uint32_t cmd, void *data)
>> +{
>> +/* TODO: Add proper emulation for all bits of the command register. */
>> +
>> +#ifdef CONFIG_HAS_PCI_MSI
>> +if ( pdev->vpci->msi->enabled || pdev->vpci->msix->enabled )
>> +{
>> +/* Guest wants to enable INTx. It can't be enabled if MSI/MSI-X 
>> enabled. */
>> +cmd |= PCI_COMMAND_INTX_DISABLE;
>> +}
>> +#endif
>> +
>> +cmd_write(pdev, reg, cmd, data);
>> +}
> It's not really clear to me whether the TODO warrants this being a
> separate function. Personally I'd find it preferable if the logic
> was folded into cmd_write().
Not sure cmd_write needs to carry the guest's logic. And what would be the
benefit? Later on, when we decide how PCI_COMMAND should be emulated, this
code will live in guest_cmd_write anyway.
>
> With this and ...
>
>> --- a/xen/drivers/vpci/msi.c
>> +++ b/xen/drivers/vpci/msi.c
>> @@ -70,6 +70,10 @@ static void control_write(const struct pci_dev *pdev, 
>> unsigned int reg,
>>   
>>   if ( vpci_msi_arch_enable(msi, pdev, vectors) )
>>   return;
>> +
>> +/* Make sure guest doesn't enable INTx while enabling MSI. */
>> +if ( !is_hardware_domain(pdev->domain) )
>> +pci_intx(pdev, false);
>>   }
>>   else
>>   vpci_msi_arch_disable(msi, pdev);
>> --- a/xen/drivers/vpci/msix.c
>> +++ b/xen/drivers/vpci/msix.c
>> @@ -92,6 +92,10 @@ static void control_write(const struct pci_dev *pdev, 
>> unsigned int reg,
>>   for ( i = 0; i < msix->max_entries; i++ )
>>   if ( !msix->entries[i].masked && msix->entries[i].updated )
>>   update_entry(&msix->entries[i], pdev, i);
>> +
>> +/* Make sure guest doesn't enable INTx while enabling MSI-X. */
>> +if ( !is_hardware_domain(pdev->domain) )
>> +pci_intx(pdev, false);
>>   }
>>   else if ( !new_enabled && msix->enabled )
>>   {
> ... this done (as requested) behind the back of the guest, what's the
> idea wrt the guest reading the command register? That continues to be
> wired to vpci_hw_read16() (and hence accesses the underlying hardware
> value irrespective of what patch 4 did).
Yes, good point. We need to add a guest_cmd_read counterpart, so we can also
implement the same logic as in guest_cmd_write with respect to the INTx bit.
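
One possible shape for such a counterpart, mirroring guest_cmd_write() above
(a sketch only; whether the guest should instead see its own last written
value is the host/guest view question discussed in the other sub-thread):

static uint32_t guest_cmd_read(const struct pci_dev *pdev, unsigned int reg,
                               void *data)
{
    uint32_t cmd = pci_conf_read16(pdev->sbdf, reg);

#ifdef CONFIG_HAS_PCI_MSI
    /* The guest can't have INTx enabled while MSI/MSI-X is enabled. */
    if ( pdev->vpci->msi->enabled || pdev->vpci->msix->enabled )
        cmd |= PCI_COMMAND_INTX_DISABLE;
#endif

    return cmd;
}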
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 06/13] vpci/header: implement guest BAR register handlers

2022-02-08 Thread Oleksandr Andrushchenko


On 07.02.22 19:06, Jan Beulich wrote:
> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>> +static uint32_t guest_bar_ignore_read(const struct pci_dev *pdev,
>> +  unsigned int reg, void *data)
>> +{
>> +return 0;
>> +}
>> +
>> +static int bar_ignore_access(const struct pci_dev *pdev, unsigned int reg,
>> + struct vpci_bar *bar)
>> +{
>> +if ( is_hardware_domain(pdev->domain) )
>> +return 0;
>> +
>> +return vpci_add_register(pdev->vpci, guest_bar_ignore_read, NULL,
>> + reg, 4, bar);
>> +}
> For these two functions: I'm not sure "ignore" is an appropriate
> term here. unused_bar_read() and unused_bar() maybe? Or,
> considering we already have VPCI_BAR_EMPTY, s/unused/empty/ ? I'm
> also not sure we really need the is_hardware_domain() check here:
> Returning 0 for Dom0 is going to be fine as well; there's no need
> to fetch the value from actual hardware. The one exception might
> be for devices with buggy BAR behavior ...
Well, I think this should be ok, so then
- s/guest_bar_ignore_read/empty_bar_read
- s/bar_ignore_access/empty_bar
- no is_hardware_domain check
>
>> @@ -516,6 +594,11 @@ static int init_bars(struct pci_dev *pdev)
>>   if ( (val & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO )
>>   {
>>   bars[i].type = VPCI_BAR_IO;
>> +
>> +rc = bar_ignore_access(pdev, reg, &bars[i]);
>> +if ( rc )
>> +return rc;
> Elsewhere the command register is restored on error paths.
Ok, I will restore
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 04/13] vpci: restrict unhandled read/write operations for guests

2022-02-08 Thread Oleksandr Andrushchenko

On 04.02.22 16:24, Oleksandr Andrushchenko wrote:
>
> On 04.02.22 16:11, Jan Beulich wrote:
>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>> A guest can read and write those registers which are not emulated and
>>> have no respective vPCI handlers, so it can access the HW directly.
>> I don't think this describes the present situation. Or did I miss where
>> devices can actually be exposed to guests already, despite much of the
>> support logic still missing?
> No, they are not exposed yet and you know that.
> I will update the commit message
BTW, all this work is about adding vpci for guests and of course this
is not going to be enabled right away.
I would like to hear the common acceptable way of documenting such
things: either we just say something like "A guest can read and write"
elsewhere or we need to invent something neutral not directly mentioning
what the change does. With the latter it all seems a bit confusing IMO
as we do know what we are doing and for what reason: enable vpci for guests
>>> In order to prevent a guest from reads and writes from/to the unhandled
>>> registers make sure only hardware domain can access HW directly and restrict
>>> guests from doing so.
>> Tangential question: Going over the titles of the remaining patches I
>> notice patch 6 is going to deal with BAR accesses. But (going just
>> from the titles) I can't spot anywhere that vendor and device IDs
>> would be exposed to guests. Yet that's the first thing guests will need
>> in order to actually recognize devices. As said before, allowing guests
>> access to such r/o fields is quite likely going to be fine.
> Agree, I was thinking about adding such a patch to allow IDs,
> but finally decided not to add more to this series.
> Again, the whole thing is not working yet and for the development
> this patch can/needs to be reverted. So, either we implement IDs
> or not this doesn't change anything with this respect
Roger, do you want an additional patch with IDs in v7?
>>> --- a/xen/drivers/vpci/vpci.c
>>> +++ b/xen/drivers/vpci/vpci.c
>>> @@ -215,11 +215,15 @@ int vpci_remove_register(struct vpci *vpci, unsigned 
>>> int offset,
>>>}
>>>
>>>/* Wrappers for performing reads/writes to the underlying hardware. */
>>> -static uint32_t vpci_read_hw(pci_sbdf_t sbdf, unsigned int reg,
>>> +static uint32_t vpci_read_hw(bool is_hwdom, pci_sbdf_t sbdf, unsigned int 
>>> reg,
>>> unsigned int size)
>> Was the passing around of a boolean the consensus which was reached?
> Was this patch committed yet?
>> Personally I'd fine it more natural if the two functions checked
>> current->domain themselves.
> This is also possible, but I would like to hear Roger's view on this as well
> I am fine either way
Roger, what's your maintainer's preference here? An additional argument
to vpci_read_hw or making it use current->domain internally?

Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 18:44, Oleksandr Andrushchenko wrote:
>
> On 07.02.22 18:37, Jan Beulich wrote:
>> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote:
>>> On 07.02.22 18:15, Jan Beulich wrote:
>>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote:
>>>>> On 07.02.22 17:26, Jan Beulich wrote:
>>>>>> 1b. Make vpci_write use write lock for writes to command register and 
>>>>>> BARs
>>>>>> only; keep using the read lock for all other writes.
>>>>> I am not quite sure how to do that. Do you mean something like:
>>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>>>>     uint32_t data)
>>>>> [snip]
>>>>>     list_for_each_entry ( r, &pdev->vpci->handlers, node )
>>>>> {
>>>>> [snip]
>>>>>     if ( r->needs_write_lock)
>>>>>         write_lock(d->vpci_lock)
>>>>>     else
>>>>>         read_lock(d->vpci_lock)
>>>>> 
>>>>>
>>>>> And provide rw as an argument to:
>>>>>
>>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
>>>>>   vpci_write_t *write_handler, unsigned int 
>>>>> offset,
>>>>>   unsigned int size, void *data, --->>> bool 
>>>>> write_path <<<-)
>>>>>
>>>>> Is this what you mean?
>>>> This sounds overly complicated. You can derive locally in vpci_write(),
>>>> from just its "reg" and "size" parameters, whether the lock needs taking
>>>> in write mode.
>>> Yes, I started writing a reply with that. So, the summary (ROM
>>> position depends on header type):
>>> if ( (reg == PCI_COMMAND) || (reg == ROM) )
>>> {
>>>    read PCI_COMMAND and see if memory or IO decoding are enabled.
>>>    if ( enabled )
>>>        write_lock(d->vpci_lock)
>>>    else
>>>        read_lock(d->vpci_lock)
>>> }
>> Hmm, yes, you can actually get away without using "size", since both
>> command register and ROM BAR are 32-bit aligned registers, and 64-bit
>> accesses get split in vpci_ecam_write().
> But, the OS may want to read a single byte of the ROM BAR, so I think
> I'll need to check whether reg+size falls into the PCI_COMMAND and ROM BAR
> ranges
>> For the command register the memory- / IO-decoding-enabled check may
>> end up a little more complicated, as the value to be written also
>> matters. Maybe read the command register only for the ROM BAR write,
>> using the write lock uniformly for all command register writes?
> Sounds good for the start.
> Another concern is that if we go with a read_lock and then in the
> underlying code we disable memory decoding and try doing
> something and calling cmd_write handler for any reason then
>
> I mean that the check in vpci_write is something we can tolerate,
> but then it must be considered that no code in the read path
> is allowed to perform write path functions. Which brings a pretty
> valid use-case: say in read mode we detect an unrecoverable error
> and need to remove the device:
> vpci_process_pending -> ERROR -> vpci_remove_device or similar.
>
> What do we do then? It is all going to be fragile...
I have tried to summarize the options we have wrt locking
and would love to hear from @Roger and @Jan.

In every variant there is a task of dealing with the overlap
detection in modify_bars, so this is the only place as of now
which needs special treatment.

Existing limitations: there is no way to upgrade a read lock to a write
lock, so paths which may require write lock protection need to use
write lock from the very beginning. Workarounds can be applied.

1. Per-domain rw lock, aka d->vpci_lock
==
Note: with per-domain rw lock it is possible to do without introducing
per-device locks, so pdev->vpci->lock can be removed and no pdev->vpci_lock
should be required.

This is only going to work if vpci_write always takes the write lock,
vpci_read takes a read lock, and no path in vpci_read is allowed to
perform write path operations.
vpci_process_pending uses the write lock, as it has vpci_remove_device in its
error path.

Pros:
- no per-device vpci lock is needed?
- solves overlap code ABBA in modify_bars

Cons:
- all writes are serialized
- need to carefully select read paths, so they are guaranteed not to lead
   to lock upgrade use-cases
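
For illustration, option 1 above boils down to something like the following
(a sketch only; the lookup, error handling and the actual handler dispatch
are omitted):

void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
                uint32_t data)
{
    struct domain *d = current->domain;

    write_lock(&d->vpci_lock);     /* all writes for the domain serialized */
    /* ... look up pdev and run the write handlers ... */
    write_unlock(&d->vpci_lock);
}

uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
{
    struct domain *d = current->domain;
    uint32_t data = ~(uint32_t)0;  /* default for unhandled registers */

    read_lock(&d->vpci_lock);      /* reads may run in parallel */
    /* ... look up pdev and run the read handlers, filling data ... */
    read_unlock(&d->vpci_lock);

    return data;
}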

1.1. Semi

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 18:37, Jan Beulich wrote:
> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 18:15, Jan Beulich wrote:
>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote:
>>>> On 07.02.22 17:26, Jan Beulich wrote:
>>>>> 1b. Make vpci_write use write lock for writes to command register and BARs
>>>>> only; keep using the read lock for all other writes.
>>>> I am not quite sure how to do that. Do you mean something like:
>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>>>    uint32_t data)
>>>> [snip]
>>>>    list_for_each_entry ( r, &pdev->vpci->handlers, node )
>>>> {
>>>> [snip]
>>>>    if ( r->needs_write_lock)
>>>>        write_lock(d->vpci_lock)
>>>>    else
>>>>        read_lock(d->vpci_lock)
>>>> 
>>>>
>>>> And provide rw as an argument to:
>>>>
>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
>>>>  vpci_write_t *write_handler, unsigned int offset,
>>>>  unsigned int size, void *data, --->>> bool 
>>>> write_path <<<-)
>>>>
>>>> Is this what you mean?
>>> This sounds overly complicated. You can derive locally in vpci_write(),
>>> from just its "reg" and "size" parameters, whether the lock needs taking
>>> in write mode.
>> Yes, I started writing a reply with that. So, the summary (ROM
>> position depends on header type):
>> if ( (reg == PCI_COMMAND) || (reg == ROM) )
>> {
>>       read PCI_COMMAND and see if memory or IO decoding are enabled.
>>       if ( enabled )
>>           write_lock(d->vpci_lock)
>>       else
>>           read_lock(d->vpci_lock)
>> }
> Hmm, yes, you can actually get away without using "size", since both
> command register and ROM BAR are 32-bit aligned registers, and 64-bit
> accesses get split in vpci_ecam_write().
But, the OS may want to read a single byte of the ROM BAR, so I think
I'll need to check whether reg+size falls into the PCI_COMMAND and ROM BAR
ranges
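
A sketch of what that reg+size check could look like inside vpci_write()
(rom_reg() is a hypothetical helper returning the ROM BAR offset for the
device's header type; only PCI_COMMAND and the ROM BAR are considered, per
the discussion above):

static bool overlaps(unsigned int r1_offset, unsigned int r1_size,
                     unsigned int r2_offset, unsigned int r2_size)
{
    return r1_offset < r2_offset + r2_size &&
           r2_offset < r1_offset + r1_size;
}

    /* Inside vpci_write(), before dispatching to the handlers: */
    if ( overlaps(reg, size, PCI_COMMAND, 2) ||
         overlaps(reg, size, rom_reg(pdev), 4) )
        write_lock(&d->vpci_lock);
    else
        read_lock(&d->vpci_lock);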
>
> For the command register the memory- / IO-decoding-enabled check may
> end up a little more complicated, as the value to be written also
> matters. Maybe read the command register only for the ROM BAR write,
> using the write lock uniformly for all command register writes?
Sounds good for the start.
Another concern is that if we go with a read_lock and then in the
underlying code we disable memory decoding and try doing
something and calling cmd_write handler for any reason then

I mean that the check in vpci_write is something we can tolerate,
but then it must be considered that no code in the read path
is allowed to perform write path functions. Which brings a pretty
valid use-case: say in read mode we detect an unrecoverable error
and need to remove the device:
vpci_process_pending -> ERROR -> vpci_remove_device or similar.

What do we do then? It is all going to be fragile...
>
>> Do you also think we can drop pdev->vpci (or currently pdev->vpci->lock)
>> at all then?
> I haven't looked at this in any detail, sorry. It sounds possible,
> yes.
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 18:15, Jan Beulich wrote:
> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote:
>> On 07.02.22 17:26, Jan Beulich wrote:
>>> 1b. Make vpci_write use write lock for writes to command register and BARs
>>> only; keep using the read lock for all other writes.
>> I am not quite sure how to do that. Do you mean something like:
>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>       uint32_t data)
>> [snip]
>>       list_for_each_entry ( r, &pdev->vpci->handlers, node )
>> {
>> [snip]
>>       if ( r->needs_write_lock)
>>           write_lock(d->vpci_lock)
>>       else
>>           read_lock(d->vpci_lock)
>> 
>>
>> And provide rw as an argument to:
>>
>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
>>     vpci_write_t *write_handler, unsigned int offset,
>>     unsigned int size, void *data, --->>> bool 
>> write_path <<<-)
>>
>> Is this what you mean?
> This sounds overly complicated. You can derive locally in vpci_write(),
> from just its "reg" and "size" parameters, whether the lock needs taking
> in write mode.
Yes, I started writing a reply with that. So, the summary (ROM
position depends on header type):
if ( (reg == PCI_COMMAND) || (reg == ROM) )
{
     read PCI_COMMAND and see if memory or IO decoding are enabled.
     if ( enabled )
         write_lock(d->vpci_lock)
     else
         read_lock(d->vpci_lock)
}

Do you also think we can drop pdev->vpci (or currently pdev->vpci->lock)
at all then?
> Jan
>
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 17:26, Jan Beulich wrote:
> On 07.02.2022 16:11, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 16:35, Oleksandr Andrushchenko wrote:
>>> On 07.02.22 16:27, Roger Pau Monné wrote:
>>>> On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote:
>>>>> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote:
>>>>>> On 07.02.22 14:46, Roger Pau Monné wrote:
>>>>>>> I think the per-domain rwlock seems like a good option. I would do
>>>>>>> that as a pre-patch.
>>>>>> It is. But it seems it won't solve the thing we started this adventure 
>>>>>> for:
>>>>>>
>>>>>> With per-domain read lock and still ABBA in modify_bars (hope the below
>>>>>> is correctly seen with a monospace font):
>>>>>>
>>>>>> cpu0: vpci_write-> d->RLock -> pdev1->lock ->
>>>>>>   rom_write -> modify_bars: tmp (pdev2) ->lock
>>>>>> cpu1:    vpci_write-> d->RLock pdev2->lock -> cmd_write -> 
>>>>>> modify_bars: tmp (pdev1) ->lock
>>>>>>
>>>>>> There is no API to upgrade read lock to write lock in modify_bars which 
>>>>>> could help,
>>>>>> so in both cases vpci_write should take write lock.
>>>>> Hmm, yes, I think you're right: It's not modify_bars() itself which needs
>>>>> to acquire the write lock, but its (perhaps indirect) caller. Effectively
>>>>> vpci_write() would need to take the write lock if the range written
>>>>> overlaps the BARs or the command register.
>>>> I'm confused. If we use a per-domain rwlock approach there would be no
>>>> need to lock tmp again in modify_bars, because we should hold the
>>>> rwlock in write mode, so there's no ABBA?
>>> this is only possible with what you wrote below:
>>>> We will have however to drop the per domain read and vpci locks and
>>>> pick the per-domain lock in write mode.
>>> I think this is going to be unreliable. We need a reliable way to
>>> upgrade read lock to write lock.
>>> Then, we can drop pdev->vpci_lock at all, because we are always
>>> protected with d->rwlock and those who want to free pdev->vpci
>>> will use write lock.
>>>
>>> So, per-domain rwlock with write upgrade implemented minus pdev->vpci
>>> should do the trick
>> Linux doesn't implement write upgrade and it seems for a reason [1]:
>> "Also, you cannot “upgrade” a read-lock to a write-lock, so if you at _any_ 
>> time
>> need to do any changes (even if you don’t do it every time), you have to get
>> the write-lock at the very beginning."
>>
>> So, I am not sure we can have the same for Xen...
>>
>> At the moment I see at least two possible ways to solve the issue:
>> 1. Make vpci_write use write lock, thus make all write accesses synchronized
>> for the given domain, read are fully parallel
> 1b. Make vpci_write use write lock for writes to command register and BARs
> only; keep using the read lock for all other writes.
I am not quite sure how to do that. Do you mean something like:
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
     uint32_t data)
[snip]
     list_for_each_entry ( r, &pdev->vpci->handlers, node )
{
[snip]
     if ( r->needs_write_lock)
         write_lock(d->vpci_lock)
     else
         read_lock(d->vpci_lock)


And provide rw as an argument to:

int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
   vpci_write_t *write_handler, unsigned int offset,
   unsigned int size, void *data, --->>> bool write_path 
<<<-)

Is this what you mean?

With the above, if we have d->vpci_lock, I think we can drop
pdev->vpci_lock at all

Thank you,
Oleksandr

P.S. I don't think you mean we just drop the read lock and acquire the write
lock, as that leads to the unreliability mentioned before.


Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 17:28, Jan Beulich wrote:
> On 07.02.2022 16:14, Oleksandr Andrushchenko wrote:
>> On 07.02.22 17:05, Jan Beulich wrote:
>>> On 07.02.2022 15:46, Oleksandr Andrushchenko wrote:
>>>> On 07.02.22 16:31, Jan Beulich wrote:
>>>>> But: What's still missing here then is the separation of guest and host
>>>>> views. When we set INTx behind the guest's back, it shouldn't observe the
>>>>> bit set. Or is this meant to be another (big) TODO?
>>>> But, patch [PATCH v6 09/13] vpci/header: emulate PCI_COMMAND register for 
>>>> guests
>>>> already takes care of it, I mean that it will set/reset INTx for the guest
>>>> according to MSI/MSI-X. So, if we squash these two patches the whole
>>>> picture will be seen at once.
>>> Does it? I did get the impression that the guest would be able to observe
>>> the bit set even after writing zero to it (while a reason exists that Xen
>>> wants the bit set).
>> Yes, you are correct: guest might not see what it wanted to set.
>> I meant that Xen won't allow resetting INTx if it is not possible
>> due to MSI/MSI-X
>>
>> Anyways, I think squashing will be a good idea to have the relevant
>> functionality in a single change set. Will this work for you?
> It might work, but I'd prefer things which can sensibly be separate to
> remain separate.
Ok, two patches
> Jan
>


Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 17:05, Jan Beulich wrote:
> On 07.02.2022 15:46, Oleksandr Andrushchenko wrote:
>> On 07.02.22 16:31, Jan Beulich wrote:
>>> But: What's still missing here then is the separation of guest and host
>>> views. When we set INTx behind the guest's back, it shouldn't observe the
>>> bit set. Or is this meant to be another (big) TODO?
>> But, patch [PATCH v6 09/13] vpci/header: emulate PCI_COMMAND register for 
>> guests
>> already takes care of it, I mean that it will set/reset INTx for the guest
>> according to MSI/MSI-X. So, if we squash these two patches the whole
>> picture will be seen at once.
> Does it? I did get the impression that the guest would be able to observe
> the bit set even after writing zero to it (while a reason exists that Xen
> wants the bit set).
Yes, you are correct: guest might not see what it wanted to set.
I meant that Xen won't allow resetting INTx if it is not possible
due to MSI/MSI-X

Anyways, I think squashing will be a good idea to have the relevant
functionality in a single change set. Will this work for you?
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 16:35, Oleksandr Andrushchenko wrote:
>
> On 07.02.22 16:27, Roger Pau Monné wrote:
>> On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote:
>>> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote:
>>>> On 07.02.22 14:46, Roger Pau Monné wrote:
>>>>> I think the per-domain rwlock seems like a good option. I would do
>>>>> that as a pre-patch.
>>>> It is. But it seems it won't solve the thing we started this adventure for:
>>>>
>>>> With per-domain read lock and still ABBA in modify_bars (hope the below
>>>> is correctly seen with a monospace font):
>>>>
>>>> cpu0: vpci_write-> d->RLock -> pdev1->lock ->  
>>>>     rom_write -> modify_bars: tmp (pdev2) ->lock
>>>> cpu1:    vpci_write-> d->RLock pdev2->lock -> cmd_write -> 
>>>> modify_bars: tmp (pdev1) ->lock
>>>>
>>>> There is no API to upgrade read lock to write lock in modify_bars which 
>>>> could help,
>>>> so in both cases vpci_write should take write lock.
>>> Hmm, yes, I think you're right: It's not modify_bars() itself which needs
>>> to acquire the write lock, but its (perhaps indirect) caller. Effectively
>>> vpci_write() would need to take the write lock if the range written
>>> overlaps the BARs or the command register.
>> I'm confused. If we use a per-domain rwlock approach there would be no
>> need to lock tmp again in modify_bars, because we should hold the
>> rwlock in write mode, so there's no ABBA?
> this is only possible with what you wrote below:
>> We will have however to drop the per domain read and vpci locks and
>> pick the per-domain lock in write mode.
> I think this is going to be unreliable. We need a reliable way to
> upgrade read lock to write lock.
> Then, we can drop pdev->vpci_lock at all, because we are always
> protected with d->rwlock and those who want to free pdev->vpci
> will use write lock.
>
> So, per-domain rwlock with write upgrade implemented minus pdev->vpci
> should do the trick
Linux doesn't implement write upgrade and it seems for a reason [1]:
"Also, you cannot “upgrade” a read-lock to a write-lock, so if you at _any_ time
need to do any changes (even if you don’t do it every time), you have to get
the write-lock at the very beginning."

So, I am not sure we can have the same for Xen...

At the moment I see at least three possible ways to solve the issue:
1. Make vpci_write use write lock, thus make all write accesses synchronized
for the given domain, read are fully parallel

2. Re-implement pdev/tmp overlapping detection with something which won't
require pdev->vpci_lock/tmp->vpci_lock

3. Drop read and acquire write lock in modify_bars... but this is not reliable
and will hide a free(pdev->vpci) bug

@Roger, @Jan: Any other suggestions?

Thank you,
Oleksandr

[1] 
https://www.kernel.org/doc/html/latest/locking/spinlocks.html#lesson-2-reader-writer-spinlocks

Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 16:31, Jan Beulich wrote:
> On 07.02.2022 15:17, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 14:54, Jan Beulich wrote:
>>> On 07.02.2022 13:51, Oleksandr Andrushchenko wrote:
>>>> On 07.02.22 14:38, Jan Beulich wrote:
>>>>> On 07.02.2022 12:27, Oleksandr Andrushchenko wrote:
>>>>>> On 07.02.22 09:29, Jan Beulich wrote:
>>>>>>> On 04.02.2022 15:37, Oleksandr Andrushchenko wrote:
>>>>>>>> On 04.02.22 16:30, Jan Beulich wrote:
>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>>>> Reset the command register when assigning a PCI device to a guest:
>>>>>>>>>> according to the PCI spec the PCI_COMMAND register is typically all 
>>>>>>>>>> 0's
>>>>>>>>>> after reset.
>>>>>>>>> It's not entirely clear to me whether setting the hardware register to
>>>>>>>>> zero is okay. What wants to be zero is the value the guest observes
>>>>>>>>> initially.
>>>>>>>> "the PCI spec says the PCI_COMMAND register is typically all 0's after 
>>>>>>>> reset."
>>>>>>>> Why wouldn't it be ok? What is the exact concern here?
>>>>>>> The concern is - as voiced is similar ways before, perhaps in other
>>>>>>> contexts - that you need to consider bit-by-bit whether overwriting
>>>>>>> with 0 what is currently there is okay. Xen and/or Dom0 may have put
>>>>>>> values there which they expect to remain unaltered. I guess
>>>>>>> PCI_COMMAND_SERR is a good example: While the guest's view of this
>>>>>>> will want to be zero initially, the host having set it to 1 may not
>>>>>>> easily be overwritten with 0, or else you'd effectively imply giving
>>>>>>> the guest control of the bit.
>>>>>> We have already discussed in great detail PCI_COMMAND emulation [1].
>>>>>> At the end you wrote [1]:
>>>>>> "Well, in order for the whole thing to be security supported it needs to
>>>>>> be explained for every bit why it is safe to allow the guest to drive it.
>>>>>> Until you mean vPCI to reach that state, leaving TODO notes in the code
>>>>>> for anything not investigated may indeed be good enough.
>>>>>>
>>>>>> Jan"
>>>>>>
>>>>>> So, this is why I left a TODO in the PCI_COMMAND emulation for now and 
>>>>>> only
>>>>>> care about INTx which is honored with the code in this patch.
>>>>> Right. The issue I see is that the description does not have any
>>>>> mention of this, but instead talks about simply writing zero.
>>>> How do you want that mentioned? Extended commit message or
>>>> just a link to the thread [1]?
>>> What I'd like you to describe is what the change does without
>>> fundamentally implying it'll end up being zero which gets written
>>> to the register. Stating as a conclusion that for the time being
>>> this means writing zero is certainly fine (and likely helpful if
>>> made explicit).
Xen and/or Dom0 may have put values in PCI_COMMAND which they expect
to remain unaltered. The PCI_COMMAND_SERR bit is a good example: while the
guest's view of this will want to be zero initially, the host having set
it to 1 may not easily be overwritten with 0, or else we'd effectively
imply giving the guest control of the bit. Thus, the PCI_COMMAND register
needs proper emulation in order to honor the host's settings.

There are examples of emulators [1], [2] which already deal with PCI_COMMAND
register emulation, and it seems that at most they care about the INTx bit
only (besides IO/memory enable and bus master, which are write-through).
This could be because, in order to properly emulate the PCI_COMMAND register,
we need to know about the whole PCI topology, e.g. whether a setting in the
device's command register is aligned with the upstream port etc.
This makes me think that because of this complexity others just ignore it.
Nor do I think this can easily be done in Xen's case.
>>
>> According to "PCI LOCAL BUS SPECIFICATION, REV. 3.0", section "6.2.2
>> Device Control" says that the reset state of the command register is
>> typically 0, so reset the command

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 16:27, Roger Pau Monné wrote:
> On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote:
>> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote:
>>> On 07.02.22 14:46, Roger Pau Monné wrote:
>>>> I think the per-domain rwlock seems like a good option. I would do
>>>> that as a pre-patch.
>>> It is. But it seems it won't solve the thing we started this adventure for:
>>>
>>> With per-domain read lock and still ABBA in modify_bars (hope the below
>>> is correctly seen with a monospace font):
>>>
>>> cpu0: vpci_write-> d->RLock -> pdev1->lock ->   
>>>    rom_write -> modify_bars: tmp (pdev2) ->lock
>>> cpu1:    vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: 
>>> tmp (pdev1) ->lock
>>>
>>> There is no API to upgrade read lock to write lock in modify_bars which 
>>> could help,
>>> so in both cases vpci_write should take write lock.
>> Hmm, yes, I think you're right: It's not modify_bars() itself which needs
>> to acquire the write lock, but its (perhaps indirect) caller. Effectively
>> vpci_write() would need to take the write lock if the range written
>> overlaps the BARs or the command register.
> I'm confused. If we use a per-domain rwlock approach there would be no
> need to lock tmp again in modify_bars, because we should hold the
> rwlock in write mode, so there's no ABBA?
this is only possible with what you wrote below:
>
> We will have however to drop the per domain read and vpci locks and
> pick the per-domain lock in write mode.
I think this is going to be unreliable. We need a reliable way to
upgrade the read lock to a write lock.
Then we can drop pdev->vpci_lock altogether, because we are always
protected by d->rwlock, and those who want to free pdev->vpci
will take the write lock.

So, a per-domain rwlock with read-to-write upgrade, minus the per-device
pdev->vpci_lock, should do the trick.
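To make it a bit more concrete, this is roughly the shape I have in mind for
vpci_write() then (a sketch only, not tested; d->vpci_rwlock,
vpci_reg_needs_write_lock() and vpci_write_locked() are made-up names):

/* Rough sketch only: the rwlock field and both helpers are placeholders. */
void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
                uint32_t data)
{
    struct domain *d = current->domain;
    const struct pci_dev *pdev;
    /*
     * Writes which may end up in modify_bars() (command register, BARs,
     * ROM BAR) need exclusive access, as modify_bars() iterates over all
     * pdevs of the domain; everything else can run concurrently.
     */
    bool excl = vpci_reg_needs_write_lock(reg, size);

    if ( excl )
        write_lock(&d->vpci_rwlock);
    else
        read_lock(&d->vpci_rwlock);

    pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn);
    if ( pdev && pdev->vpci )
        vpci_write_locked(pdev, reg, size, data);

    if ( excl )
        write_unlock(&d->vpci_rwlock);
    else
        read_unlock(&d->vpci_rwlock);
}

This way modify_bars() wouldn't need to take any per-device lock at all when
iterating over the other pdevs of the domain.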
> Thanks, Roger.
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 16:11, Jan Beulich wrote:
> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote:
>> On 07.02.22 14:46, Roger Pau Monné wrote:
>>> I think the per-domain rwlock seems like a good option. I would do
>>> that as a pre-patch.
>> It is. But it seems it won't solve the thing we started this adventure for:
>>
>> With per-domain read lock and still ABBA in modify_bars (hope the below
>> is correctly seen with a monospace font):
>>
>> cpu0: vpci_write-> d->RLock -> pdev1->lock ->
>>   rom_write -> modify_bars: tmp (pdev2) ->lock
>> cpu1:    vpci_write-> d->RLock -> pdev2->lock -> cmd_write -> modify_bars: 
>> tmp (pdev1) ->lock
>>
>> There is no API to upgrade read lock to write lock in modify_bars which 
>> could help,
>> so in both cases vpci_write should take write lock.
> Hmm, yes, I think you're right: It's not modify_bars() itself which needs
> to acquire the write lock, but its (perhaps indirect) caller. Effectively
> vpci_write() would need to take the write lock if the range written
> overlaps the BARs or the command register.
Exactly, vpci_write needs a write lock, but it is not desirable.
And again, there is a single offending piece of code which wants that...
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 16:19, Roger Pau Monné wrote:
> On Mon, Feb 07, 2022 at 01:53:34PM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 14:46, Roger Pau Monné wrote:
>>> On Mon, Feb 07, 2022 at 11:08:39AM +, Oleksandr Andrushchenko wrote:
>>>> ==
>>>>
>>>> Bottom line:
>>>> ==
>>>>
>>>> 1. vpci_{read|write} are not protected with pcidevs_lock and can run in
>>>> parallel with pci_remove_device which can remove pdev after 
>>>> vpci_{read|write}
>>>> acquired the pdev pointer. This may lead to a fail due to pdev dereference.
>>>>
>>>> So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock.
>>> We would like to take the pcidevs_lock only while fetching the device
>>> (ie: pci_get_pdev_by_domain), afterwards it should be fine to lock the
>>> device using a vpci specific lock so calls to vpci_{read,write} can be
>>> partially concurrent across multiple domains.
>> This means this can't be done a pre-req patch, but as a part of the
>> patch which changes locking.
>>> In fact I think Jan had already pointed out that the pci lock would
>>> need taking while searching for the device in vpci_{read,write}.
>> I was referring to the time after we found pdev and it is currently
>> possible to free pdev while using it after the search
>>> It seems to me that if you implement option 3 below taking the
>>> per-domain rwlock in read mode in vpci_{read|write} will already
>>> protect you from the device being removed if the same per-domain lock
>>> is taken in write mode in vpci_remove_device.
>> Yes, it should. Again this can't be done as a pre-req patch because
>> this relies on pdev->vpci_lock
> Hm, no, I don't think so. You could introduce this per-domain rwlock
> in a prepatch, and then move the vpci lock outside of the vpci struct.
> I see no problem with that.
>
>>>> 2. The only offending place which is in the way of pci_dev->vpci_lock is
>>>> modify_bars. If it can be re-worked to track already mapped and unmapped
>>>> regions then we can avoid having a possible deadlock and can use
>>>> pci_dev->vpci_lock (rangesets won't help here as we also need refcounting 
>>>> be
>>>> implemented).
>>> I think a refcounting based solution will be very complex to
>>> implement. I'm however happy to be proven wrong.
>> I can't estimate, but I have a feeling that all this playing around with
>> locking is just because of this single piece of code. No other place suffers
>> from pdev->vpci_lock, and no d->lock would be needed.
>>>> If pcidevs_lock is used for vpci_{read|write} then no deadlock is possible,
>>>> but modify_bars code must be re-worked not to lock itself (pdev->vpci_lock 
>>>> and
>>>> tmp->vpci_lock when pdev == tmp, this is minor).
>>> Taking the pcidevs lock (a global lock) is out of the picture IMO, as
>>> it's going to serialize all calls of vpci_{read|write}, and would
>>> create too much contention on the pcidevs lock.
>> I understand that. But if we would like to fix the existing code I see
>> no other alternative.
>>>> 3. We may think about a per-domain rwlock and pdev->vpci_lock, so this 
>>>> solves
>>>> modify_bars's two pdevs access. But this doesn't solve possible pdev
>>>> de-reference in vpci_{read|write} vs pci_remove_device.
>>> pci_remove device will call vpci_remove_device, so as long as
>>> vpci_remove_device taken the per-domain lock in write (exclusive) mode
>>> it should be fine.
>> I think I need to see if there are any other places which similarly
>> require the write lock
>>>> @Roger, @Jan, I would like to hear what do you think about the above 
>>>> analysis
>>>> and how can we proceed with locking re-work?
>>> I think the per-domain rwlock seems like a good option. I would do
>>> that as a pre-patch.
>> It is. But it seems it won't solve the thing we started this adventure for:
>>
>> With per-domain read lock and still ABBA in modify_bars (hope the below
>> is correctly seen with a monospace font):
>>
>> cpu0: vpci_write-> d->RLock -> pdev1->lock ->
>>   rom_write -> modify_bars: tmp (pdev2) ->lock
>> cpu1:    vpci_write-> d->RLock -> pdev2->lock -> cmd_write -> modify_bars: 
>> tmp (pdev1) ->lock
>>
>> There is no API to

Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 14:54, Jan Beulich wrote:
> On 07.02.2022 13:51, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 14:38, Jan Beulich wrote:
>>> On 07.02.2022 12:27, Oleksandr Andrushchenko wrote:
>>>> On 07.02.22 09:29, Jan Beulich wrote:
>>>>> On 04.02.2022 15:37, Oleksandr Andrushchenko wrote:
>>>>>> On 04.02.22 16:30, Jan Beulich wrote:
>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>> Reset the command register when assigning a PCI device to a guest:
>>>>>>>> according to the PCI spec the PCI_COMMAND register is typically all 0's
>>>>>>>> after reset.
>>>>>>> It's not entirely clear to me whether setting the hardware register to
>>>>>>> zero is okay. What wants to be zero is the value the guest observes
>>>>>>> initially.
>>>>>> "the PCI spec says the PCI_COMMAND register is typically all 0's after 
>>>>>> reset."
>>>>>> Why wouldn't it be ok? What is the exact concern here?
>>>>> The concern is - as voiced is similar ways before, perhaps in other
>>>>> contexts - that you need to consider bit-by-bit whether overwriting
>>>>> with 0 what is currently there is okay. Xen and/or Dom0 may have put
>>>>> values there which they expect to remain unaltered. I guess
>>>>> PCI_COMMAND_SERR is a good example: While the guest's view of this
>>>>> will want to be zero initially, the host having set it to 1 may not
>>>>> easily be overwritten with 0, or else you'd effectively imply giving
>>>>> the guest control of the bit.
>>>> We have already discussed in great detail PCI_COMMAND emulation [1].
>>>> At the end you wrote [1]:
>>>> "Well, in order for the whole thing to be security supported it needs to
>>>> be explained for every bit why it is safe to allow the guest to drive it.
>>>> Until you mean vPCI to reach that state, leaving TODO notes in the code
>>>> for anything not investigated may indeed be good enough.
>>>>
>>>> Jan"
>>>>
>>>> So, this is why I left a TODO in the PCI_COMMAND emulation for now and only
>>>> care about INTx which is honored with the code in this patch.
>>> Right. The issue I see is that the description does not have any
>>> mention of this, but instead talks about simply writing zero.
>> How do you want that mentioned? Extended commit message or
>> just a link to the thread [1]?
> What I'd like you to describe is what the change does without
> fundamentally implying it'll end up being zero which gets written
> to the register. Stating as a conclusion that for the time being
> this means writing zero is certainly fine (and likely helpful if
> made explicit).
Xen and/or Dom0 may have put values in PCI_COMMAND which they expect
to remain unaltered. PCI_COMMAND_SERR bit is a good example: while the
guest's view of this will want to be zero initially, the host having set
it to 1 may not easily be overwritten with 0, or else we'd effectively
imply giving the guest control of the bit. Thus, PCI_COMMAND register needs
proper emulation in order to honor host's settings.

There are examples of emulators [1], [2] which already deal with PCI_COMMAND
register emulation, and it seems that at most they care only about the INTx
bit (besides IO/memory enable and bus master, which are write-through).
This could be because, in order to properly emulate the PCI_COMMAND register,
we need to know about the whole PCI topology, e.g. whether a setting in the
device's command register is aligned with the upstream port etc.
This makes me think that, because of this complexity, others just ignore it.
Nor do I think this can easily be done in the Xen case.

According to "PCI LOCAL BUS SPECIFICATION, REV. 3.0", section "6.2.2
Device Control" says that the reset state of the command register is
typically 0, so reset the command register when assigning a PCI device
to a guest t all 0's and for now only make sure INTx bit is set according
to if MSI/MSI-X enabled.

[1] https://github.com/qemu/qemu/blob/master/hw/xen/xen_pt_config_init.c#L310
[2] 
https://github.com/projectacrn/acrn-hypervisor/blob/master/hypervisor/hw/pci.c#L336

Will the above description be enough?
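For illustration only, the guest view described above would then be handled
roughly like below (this is not the actual patch; guest_msi_enabled() is a
made-up helper and the whole thing is untested):

static void guest_cmd_write(const struct pci_dev *pdev, unsigned int reg,
                            uint32_t cmd, void *data)
{
    /* Start from the host's value so host-owned bits (e.g. SERR) stay put. */
    uint16_t val = pci_conf_read16(pdev->sbdf, reg);
    /* Bits the guest may drive: the write-through ones plus INTx disable. */
    uint16_t rw = PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
                  PCI_COMMAND_INTX_DISABLE;

    val = (val & ~rw) | (cmd & rw);

    /* INTx must stay disabled while MSI/MSI-X is enabled. */
    if ( guest_msi_enabled(pdev) )
        val |= PCI_COMMAND_INTX_DISABLE;

    pci_conf_write16(pdev->sbdf, reg, val);
}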

It also seems to be a good move to squash the following patches:
[PATCH v6 09/13] vpci/header: emulate PCI_COMMAND register for guests
[PATCH v6 10/13] vpci/header: reset the command register when adding devices

as they implement a single piece of functionality now.
>> With the above done, do you think that writing 0's is an acceptable
>> approach as of now?
> Well, yes, provided we have a sufficiently similar understanding
> of what "acceptable" here means.
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 14:46, Roger Pau Monné wrote:
> On Mon, Feb 07, 2022 at 11:08:39AM +0000, Oleksandr Andrushchenko wrote:
>> Hello,
>>
>> On 04.02.22 16:57, Roger Pau Monné wrote:
>>> On Fri, Feb 04, 2022 at 02:43:07PM +, Oleksandr Andrushchenko wrote:
>>>> On 04.02.22 15:06, Roger Pau Monné wrote:
>>>>> On Fri, Feb 04, 2022 at 12:53:20PM +, Oleksandr Andrushchenko wrote:
>>>>>> On 04.02.22 14:47, Jan Beulich wrote:
>>>>>>> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote:
>>>>>>>> On 04.02.22 13:37, Jan Beulich wrote:
>>>>>>>>> On 04.02.2022 12:13, Roger Pau Monné wrote:
>>>>>>>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote:
>>>>>>>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote:
>>>>>>>>>>>> On 04.02.22 11:15, Jan Beulich wrote:
>>>>>>>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote:
>>>>>>>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote:
>>>>>>>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct 
>>>>>>>>>>>>>>>> pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>>>>>>>>>>>> continue;
>>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>>> 
>>>>>>>>>>>>>>>> +spin_lock(>vpci_lock);
>>>>>>>>>>>>>>>> +if ( !tmp->vpci )
>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>>>>>> +continue;
>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>> for ( i = 0; i < 
>>>>>>>>>>>>>>>> ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>> const struct vpci_bar *bar = 
>>>>>>>>>>>>>>>> >vpci->header.bars[i];
>>>>>>>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct 
>>>>>>>>>>>>>>>> pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>>>>>>>>>>>> rc = rangeset_remove_range(mem, start, 
>>>>>>>>>>>>>>>> end);
>>>>>>>>>>>>>>>> if ( rc )
>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>>>>>> printk(XENLOG_G_WARNING "Failed to 
>>>>>>>>>>>>>>>> remove [%lx, %lx]: %d\n",
>>>>>>>>>>>>>>>>start, end, rc);
>>>>>>>>>>>>>>>> rangeset_destroy(mem);
>>>>>>>>>>>>>>>> return rc;
>>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>> At the first glance this simply looks like another unjustified 
>>>>>>>>>>>>>>> (in the
>>>>>>>>>>>>>>> description) change, as you're not converting anything here but 
>>>>>>>>>>>>>>> you
>>>>>>>>>>>>>>> actually add locking (and I realize this was there before, s

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 14:34, Jan Beulich wrote:
> On 07.02.2022 12:08, Oleksandr Andrushchenko wrote:
>> 1. vpci_{read|write} are not protected with pcidevs_lock and can run in
>> parallel with pci_remove_device which can remove pdev after vpci_{read|write}
>> acquired the pdev pointer. This may lead to a fail due to pdev dereference.
>>
>> So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock.
> I think this is not the only place where there is a theoretical race
> against pci_remove_device().
Not at all, that was just to demonstrate one of the possible sources of races.
>   I would recommend to separate the
> overall situation with pcidevs_lock from the issue here.
Do you agree that there is already an issue with that? In the currently 
existing code?
>   I don't view
> it as an option to acquire pcidevs_lock in vpci_{read,write}().
Yes, that would hurt too much, I agree. But this needs to be solved
>   If
> anything, we need proper refcounting of PCI devices (at which point
> likely a number of lock uses can go away).
It seems so. Then not only pdevs need refcounting, but pdev->vpci as well.

What's your view on how we can achieve both goals, i.e. proper
locking/refcounting for both pdev and pdev->vpci?
This is really crucial for all the PCI passthrough code on Arm, because
without this ground work done we can't accept the patches which rely
on it: vPCI changes, MSI/MSI-X, etc.
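Just so we are talking about the same thing, by refcounting I would imagine
something along these lines (purely illustrative, neither the refcnt field
nor these helpers exist today):

/* Purely illustrative: pdev->refcnt and release_pdev() are made up. */
static inline struct pci_dev *pcidev_get(struct pci_dev *pdev)
{
    if ( pdev )
        atomic_inc(&pdev->refcnt);
    return pdev;
}

static inline void pcidev_put(struct pci_dev *pdev)
{
    /* The device (and its pdev->vpci) is only freed by the last user. */
    if ( pdev && atomic_dec_and_test(&pdev->refcnt) )
        release_pdev(pdev);
}

Then pci_remove_device() would merely drop its reference, and vpci_{read|write}
could hold their own reference across the access without any global lock.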
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 14:38, Jan Beulich wrote:
> On 07.02.2022 12:27, Oleksandr Andrushchenko wrote:
>>
>> On 07.02.22 09:29, Jan Beulich wrote:
>>> On 04.02.2022 15:37, Oleksandr Andrushchenko wrote:
>>>> On 04.02.22 16:30, Jan Beulich wrote:
>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>> Reset the command register when assigning a PCI device to a guest:
>>>>>> according to the PCI spec the PCI_COMMAND register is typically all 0's
>>>>>> after reset.
>>>>> It's not entirely clear to me whether setting the hardware register to
>>>>> zero is okay. What wants to be zero is the value the guest observes
>>>>> initially.
>>>> "the PCI spec says the PCI_COMMAND register is typically all 0's after 
>>>> reset."
>>>> Why wouldn't it be ok? What is the exact concern here?
>>> The concern is - as voiced is similar ways before, perhaps in other
>>> contexts - that you need to consider bit-by-bit whether overwriting
>>> with 0 what is currently there is okay. Xen and/or Dom0 may have put
>>> values there which they expect to remain unaltered. I guess
>>> PCI_COMMAND_SERR is a good example: While the guest's view of this
>>> will want to be zero initially, the host having set it to 1 may not
>>> easily be overwritten with 0, or else you'd effectively imply giving
>>> the guest control of the bit.
>> We have already discussed in great detail PCI_COMMAND emulation [1].
>> At the end you wrote [1]:
>> "Well, in order for the whole thing to be security supported it needs to
>> be explained for every bit why it is safe to allow the guest to drive it.
>> Until you mean vPCI to reach that state, leaving TODO notes in the code
>> for anything not investigated may indeed be good enough.
>>
>> Jan"
>>
>> So, this is why I left a TODO in the PCI_COMMAND emulation for now and only
>> care about INTx which is honored with the code in this patch.
> Right. The issue I see is that the description does not have any
> mention of this, but instead talks about simply writing zero.
How do you want that mentioned? Extended commit message or
just a link to the thread [1]?
With the above done, do you think that writing 0's is an acceptable
approach as of now?
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-07 Thread Oleksandr Andrushchenko


On 07.02.22 09:29, Jan Beulich wrote:
> On 04.02.2022 15:37, Oleksandr Andrushchenko wrote:
>> On 04.02.22 16:30, Jan Beulich wrote:
>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>> Reset the command register when assigning a PCI device to a guest:
>>>> according to the PCI spec the PCI_COMMAND register is typically all 0's
>>>> after reset.
>>> It's not entirely clear to me whether setting the hardware register to
>>> zero is okay. What wants to be zero is the value the guest observes
>>> initially.
>> "the PCI spec says the PCI_COMMAND register is typically all 0's after 
>> reset."
>> Why wouldn't it be ok? What is the exact concern here?
> The concern is - as voiced is similar ways before, perhaps in other
> contexts - that you need to consider bit-by-bit whether overwriting
> with 0 what is currently there is okay. Xen and/or Dom0 may have put
> values there which they expect to remain unaltered. I guess
> PCI_COMMAND_SERR is a good example: While the guest's view of this
> will want to be zero initially, the host having set it to 1 may not
> easily be overwritten with 0, or else you'd effectively imply giving
> the guest control of the bit.
We have already discussed in great detail PCI_COMMAND emulation [1].
At the end you wrote [1]:
"Well, in order for the whole thing to be security supported it needs to
be explained for every bit why it is safe to allow the guest to drive it.
Until you mean vPCI to reach that state, leaving TODO notes in the code
for anything not investigated may indeed be good enough.

Jan"

So, this is why I left a TODO in the PCI_COMMAND emulation for now and only
care about INTx which is honored with the code in this patch.
>
> Jan
>

Thank you,
Oleksandr

[1] 
https://patchwork.kernel.org/project/xen-devel/patch/20210903100831.177748-9-andr2...@gmail.com/
[2] https://lists.xenproject.org/archives/html/xen-devel/2021-09/msg00737.html

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-07 Thread Oleksandr Andrushchenko
Hello,

On 04.02.22 16:57, Roger Pau Monné wrote:
> On Fri, Feb 04, 2022 at 02:43:07PM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 04.02.22 15:06, Roger Pau Monné wrote:
>>> On Fri, Feb 04, 2022 at 12:53:20PM +, Oleksandr Andrushchenko wrote:
>>>> On 04.02.22 14:47, Jan Beulich wrote:
>>>>> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote:
>>>>>> On 04.02.22 13:37, Jan Beulich wrote:
>>>>>>> On 04.02.2022 12:13, Roger Pau Monné wrote:
>>>>>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote:
>>>>>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote:
>>>>>>>>>> On 04.02.22 11:15, Jan Beulich wrote:
>>>>>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote:
>>>>>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote:
>>>>>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev 
>>>>>>>>>>>>>> *pdev, uint16_t cmd, bool rom_only)
>>>>>>>>>>>>>>continue;
>>>>>>>>>>>>>>}
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +spin_lock(>vpci_lock);
>>>>>>>>>>>>>> +if ( !tmp->vpci )
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>>>> +continue;
>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>for ( i = 0; i < 
>>>>>>>>>>>>>> ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>>>>>>>>>>>>>>{
>>>>>>>>>>>>>>const struct vpci_bar *bar = 
>>>>>>>>>>>>>> >vpci->header.bars[i];
>>>>>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct 
>>>>>>>>>>>>>> pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>>>>>>>>>>rc = rangeset_remove_range(mem, start, end);
>>>>>>>>>>>>>>if ( rc )
>>>>>>>>>>>>>>{
>>>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>>>>printk(XENLOG_G_WARNING "Failed to remove 
>>>>>>>>>>>>>> [%lx, %lx]: %d\n",
>>>>>>>>>>>>>>   start, end, rc);
>>>>>>>>>>>>>>rangeset_destroy(mem);
>>>>>>>>>>>>>>return rc;
>>>>>>>>>>>>>>}
>>>>>>>>>>>>>>}
>>>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>>>>}
>>>>>>>>>>>>> At the first glance this simply looks like another unjustified 
>>>>>>>>>>>>> (in the
>>>>>>>>>>>>> description) change, as you're not converting anything here but 
>>>>>>>>>>>>> you
>>>>>>>>>>>>> actually add locking (and I realize this was there before, so I'm 
>>>>>>>>>>>>> sorry
>>>>>>>>>>>>> for not pointing this out earlier).
>>>>>>>>>>>> Well, I thought that the description already has "...the lock can 
>>>>>>>>>>>> be
>>>>>>>>>>>> used (and in a few cases is used right away) to check whether vpci
>>>>>>>>>>>> is present" and this is enough for such uses as here.
>>>>>>>>>>>>>

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 15:06, Roger Pau Monné wrote:
> On Fri, Feb 04, 2022 at 12:53:20PM +0000, Oleksandr Andrushchenko wrote:
>>
>> On 04.02.22 14:47, Jan Beulich wrote:
>>> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote:
>>>> On 04.02.22 13:37, Jan Beulich wrote:
>>>>> On 04.02.2022 12:13, Roger Pau Monné wrote:
>>>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote:
>>>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote:
>>>>>>>> On 04.02.22 11:15, Jan Beulich wrote:
>>>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote:
>>>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote:
>>>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev 
>>>>>>>>>>>> *pdev, uint16_t cmd, bool rom_only)
>>>>>>>>>>>>   continue;
>>>>>>>>>>>>   }
>>>>>>>>>>>>   
>>>>>>>>>>>> +spin_lock(>vpci_lock);
>>>>>>>>>>>> +if ( !tmp->vpci )
>>>>>>>>>>>> +{
>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>> +continue;
>>>>>>>>>>>> +}
>>>>>>>>>>>>   for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); 
>>>>>>>>>>>> i++ )
>>>>>>>>>>>>   {
>>>>>>>>>>>>   const struct vpci_bar *bar = 
>>>>>>>>>>>> >vpci->header.bars[i];
>>>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev 
>>>>>>>>>>>> *pdev, uint16_t cmd, bool rom_only)
>>>>>>>>>>>>   rc = rangeset_remove_range(mem, start, end);
>>>>>>>>>>>>   if ( rc )
>>>>>>>>>>>>   {
>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>>   printk(XENLOG_G_WARNING "Failed to remove 
>>>>>>>>>>>> [%lx, %lx]: %d\n",
>>>>>>>>>>>>  start, end, rc);
>>>>>>>>>>>>   rangeset_destroy(mem);
>>>>>>>>>>>>   return rc;
>>>>>>>>>>>>   }
>>>>>>>>>>>>   }
>>>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>>>   }
>>>>>>>>>>> At the first glance this simply looks like another unjustified (in 
>>>>>>>>>>> the
>>>>>>>>>>> description) change, as you're not converting anything here but you
>>>>>>>>>>> actually add locking (and I realize this was there before, so I'm 
>>>>>>>>>>> sorry
>>>>>>>>>>> for not pointing this out earlier).
>>>>>>>>>> Well, I thought that the description already has "...the lock can be
>>>>>>>>>> used (and in a few cases is used right away) to check whether vpci
>>>>>>>>>> is present" and this is enough for such uses as here.
>>>>>>>>>>>   But then I wonder whether you
>>>>>>>>>>> actually tested this, since I can't help getting the impression that
>>>>>>>>>>> you're introducing a live-lock: The function is called from 
>>>>>>>>>>> cmd_write()
>>>>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet 
>>>>>>>>>>> that
>>>>>>>>>>> function already holds the lock, and the lock is not (currently)
>>>>>>>>>>> recursive. (For the 3rd caller of the fun

Re: [PATCH v6 10/13] vpci/header: reset the command register when adding devices

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 16:30, Jan Beulich wrote:
> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>> Reset the command register when assigning a PCI device to a guest:
>> according to the PCI spec the PCI_COMMAND register is typically all 0's
>> after reset.
> It's not entirely clear to me whether setting the hardware register to
> zero is okay. What wants to be zero is the value the guest observes
> initially.
"the PCI spec says the PCI_COMMAND register is typically all 0's after reset."
Why wouldn't it be ok? What is the exact concern here?
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -454,8 +454,7 @@ static void cmd_write(const struct pci_dev *pdev, 
>> unsigned int reg,
>>   pci_conf_write16(pdev->sbdf, reg, cmd);
>>   }
>>   
>> -static void guest_cmd_write(const struct pci_dev *pdev, unsigned int reg,
>> -uint32_t cmd, void *data)
>> +static uint32_t emulate_cmd_reg(const struct pci_dev *pdev, uint32_t cmd)
> The command register is a 16-bit one, so parameter and return type should
> either be plain unsigned int (preferred, see ./CODING_STYLE) or uint16_t
> imo.
Good catch, thank you
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 04/13] vpci: restrict unhandled read/write operations for guests

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 16:11, Jan Beulich wrote:
> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>> A guest can read and write those registers which are not emulated and
>> have no respective vPCI handlers, so it can access the HW directly.
> I don't think this describes the present situation. Or did I miss where
> devices can actually be exposed to guests already, despite much of the
> support logic still missing?
No, they are not exposed yet and you know that.
I will update the commit message
>
>> In order to prevent a guest from reads and writes from/to the unhandled
>> registers make sure only hardware domain can access HW directly and restrict
>> guests from doing so.
> Tangential question: Going over the titles of the remaining patches I
> notice patch 6 is going to deal with BAR accesses. But (going just
> from the titles) I can't spot anywhere that vendor and device IDs
> would be exposed to guests. Yet that's the first thing guests will need
> in order to actually recognize devices. As said before, allowing guests
> access to such r/o fields is quite likely going to be fine.
Agree, I was thinking about adding such a patch to allow the IDs,
but finally decided not to add more to this series.
Again, the whole thing is not working yet, and for development
this patch can/needs to be reverted anyway. So, whether we implement
the IDs now or not doesn't change anything in this respect.
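If we do add the IDs later, I would expect it to boil down to something as
simple as this (sketch, not tested, and assuming a NULL write handler makes
guest writes be silently dropped):

/* Sketch: expose the read-only vendor/device IDs to the guest. */
static int expose_guest_ids(const struct pci_dev *pdev)
{
    int rc = vpci_add_register(pdev->vpci, vpci_hw_read16, NULL,
                               PCI_VENDOR_ID, 2, NULL);

    if ( rc )
        return rc;

    return vpci_add_register(pdev->vpci, vpci_hw_read16, NULL,
                             PCI_DEVICE_ID, 2, NULL);
}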
>
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -215,11 +215,15 @@ int vpci_remove_register(struct vpci *vpci, unsigned 
>> int offset,
>>   }
>>   
>>   /* Wrappers for performing reads/writes to the underlying hardware. */
>> -static uint32_t vpci_read_hw(pci_sbdf_t sbdf, unsigned int reg,
>> +static uint32_t vpci_read_hw(bool is_hwdom, pci_sbdf_t sbdf, unsigned int 
>> reg,
>>unsigned int size)
> Was the passing around of a boolean the consensus which was reached?
Was this patch committed yet?
> Personally I'd fine it more natural if the two functions checked
> current->domain themselves.
This is also possible, but I would like to hear Roger's view on this as well
I am fine either way
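For reference, the alternative you mention would be along these lines
(sketch, simplified to the 1/2/4 byte cases only):

static uint32_t vpci_read_hw(pci_sbdf_t sbdf, unsigned int reg,
                             unsigned int size)
{
    /* Only the hardware domain is allowed to access the device directly. */
    if ( !is_hardware_domain(current->domain) )
        return ~(uint32_t)0;

    switch ( size )
    {
    case 4: return pci_conf_read32(sbdf, reg);
    case 2: return pci_conf_read16(sbdf, reg);
    case 1: return pci_conf_read8(sbdf, reg);
    }

    ASSERT_UNREACHABLE();
    return ~(uint32_t)0;
}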
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 14:47, Jan Beulich wrote:
> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote:
>>
>> On 04.02.22 13:37, Jan Beulich wrote:
>>> On 04.02.2022 12:13, Roger Pau Monné wrote:
>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote:
>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote:
>>>>>> On 04.02.22 11:15, Jan Beulich wrote:
>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote:
>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote:
>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev 
>>>>>>>>>> *pdev, uint16_t cmd, bool rom_only)
>>>>>>>>>>  continue;
>>>>>>>>>>  }
>>>>>>>>>>  
>>>>>>>>>> +spin_lock(>vpci_lock);
>>>>>>>>>> +if ( !tmp->vpci )
>>>>>>>>>> +{
>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>> +continue;
>>>>>>>>>> +}
>>>>>>>>>>  for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); 
>>>>>>>>>> i++ )
>>>>>>>>>>  {
>>>>>>>>>>  const struct vpci_bar *bar = 
>>>>>>>>>> >vpci->header.bars[i];
>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev 
>>>>>>>>>> *pdev, uint16_t cmd, bool rom_only)
>>>>>>>>>>  rc = rangeset_remove_range(mem, start, end);
>>>>>>>>>>  if ( rc )
>>>>>>>>>>  {
>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>  printk(XENLOG_G_WARNING "Failed to remove [%lx, 
>>>>>>>>>> %lx]: %d\n",
>>>>>>>>>> start, end, rc);
>>>>>>>>>>  rangeset_destroy(mem);
>>>>>>>>>>  return rc;
>>>>>>>>>>  }
>>>>>>>>>>  }
>>>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>>>>  }
>>>>>>>>> At the first glance this simply looks like another unjustified (in the
>>>>>>>>> description) change, as you're not converting anything here but you
>>>>>>>>> actually add locking (and I realize this was there before, so I'm 
>>>>>>>>> sorry
>>>>>>>>> for not pointing this out earlier).
>>>>>>>> Well, I thought that the description already has "...the lock can be
>>>>>>>> used (and in a few cases is used right away) to check whether vpci
>>>>>>>> is present" and this is enough for such uses as here.
>>>>>>>>>  But then I wonder whether you
>>>>>>>>> actually tested this, since I can't help getting the impression that
>>>>>>>>> you're introducing a live-lock: The function is called from 
>>>>>>>>> cmd_write()
>>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet 
>>>>>>>>> that
>>>>>>>>> function already holds the lock, and the lock is not (currently)
>>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh
>>>>>>>>> the locking looks to be entirely unnecessary.)
>>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire
>>>>>>>> the lock. But if tmp == pdev and rom_only == true
>>>>>>>> then we'll deadlock.
>>>>>>>>
>>>>>>>> It seems we need to have the locking conditional, e.g. only lock
>>>>>>>> if tmp != pdev
>>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential
>>>>

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 13:37, Jan Beulich wrote:
> On 04.02.2022 12:13, Roger Pau Monné wrote:
>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote:
>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote:
>>>> On 04.02.22 11:15, Jan Beulich wrote:
>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote:
>>>>>> On 04.02.22 09:52, Jan Beulich wrote:
>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev 
>>>>>>>> *pdev, uint16_t cmd, bool rom_only)
>>>>>>>> continue;
>>>>>>>> }
>>>>>>>> 
>>>>>>>> +spin_lock(>vpci_lock);
>>>>>>>> +if ( !tmp->vpci )
>>>>>>>> +{
>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>> +continue;
>>>>>>>> +}
>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>>>>>>>> {
>>>>>>>> const struct vpci_bar *bar = 
>>>>>>>> >vpci->header.bars[i];
>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev 
>>>>>>>> *pdev, uint16_t cmd, bool rom_only)
>>>>>>>> rc = rangeset_remove_range(mem, start, end);
>>>>>>>> if ( rc )
>>>>>>>> {
>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, 
>>>>>>>> %lx]: %d\n",
>>>>>>>>start, end, rc);
>>>>>>>> rangeset_destroy(mem);
>>>>>>>> return rc;
>>>>>>>> }
>>>>>>>> }
>>>>>>>> +spin_unlock(>vpci_lock);
>>>>>>>> }
>>>>>>> At the first glance this simply looks like another unjustified (in the
>>>>>>> description) change, as you're not converting anything here but you
>>>>>>> actually add locking (and I realize this was there before, so I'm sorry
>>>>>>> for not pointing this out earlier).
>>>>>> Well, I thought that the description already has "...the lock can be
>>>>>> used (and in a few cases is used right away) to check whether vpci
>>>>>> is present" and this is enough for such uses as here.
>>>>>>> But then I wonder whether you
>>>>>>> actually tested this, since I can't help getting the impression that
>>>>>>> you're introducing a live-lock: The function is called from cmd_write()
>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that
>>>>>>> function already holds the lock, and the lock is not (currently)
>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh
>>>>>>> the locking looks to be entirely unnecessary.)
>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire
>>>>>> the lock. But if tmp == pdev and rom_only == true
>>>>>> then we'll deadlock.
>>>>>>
>>>>>> It seems we need to have the locking conditional, e.g. only lock
>>>>>> if tmp != pdev
>>>>> Which will address the live-lock, but introduce ABBA deadlock potential
>>>>> between the two locks.
>>>> I am not sure I can suggest a better solution here
>>>> @Roger, @Jan, could you please help here?
>>> Well, first of all I'd like to mention that while it may have been okay to
>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing
>>> with DomU-s' lists of PCI devices. The requirement really applies to the
>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that
>>> there it probably wants to be a try-lock.
>>>
>>> Next I'd like to point out that here we have the still pending issue of
>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch
>&

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 13:13, Roger Pau Monné wrote:
> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote:
>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote:
>>> On 04.02.22 11:15, Jan Beulich wrote:
>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote:
>>>>> On 04.02.22 09:52, Jan Beulich wrote:
>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, 
>>>>>>> uint16_t cmd, bool rom_only)
>>>>>>> continue;
>>>>>>> }
>>>>>>> 
>>>>>>> +spin_lock(&tmp->vpci_lock);
>>>>>>> +if ( !tmp->vpci )
>>>>>>> +{
>>>>>>> +spin_unlock(&tmp->vpci_lock);
>>>>>>> +continue;
>>>>>>> +}
>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>>>>>>> {
>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev 
>>>>>>> *pdev, uint16_t cmd, bool rom_only)
>>>>>>> rc = rangeset_remove_range(mem, start, end);
>>>>>>> if ( rc )
>>>>>>> {
>>>>>>> +spin_unlock(&tmp->vpci_lock);
>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, 
>>>>>>> %lx]: %d\n",
>>>>>>>start, end, rc);
>>>>>>> rangeset_destroy(mem);
>>>>>>> return rc;
>>>>>>> }
>>>>>>> }
>>>>>>> +spin_unlock(&tmp->vpci_lock);
>>>>>>> }
>>>>>> At the first glance this simply looks like another unjustified (in the
>>>>>> description) change, as you're not converting anything here but you
>>>>>> actually add locking (and I realize this was there before, so I'm sorry
>>>>>> for not pointing this out earlier).
>>>>> Well, I thought that the description already has "...the lock can be
>>>>> used (and in a few cases is used right away) to check whether vpci
>>>>> is present" and this is enough for such uses as here.
>>>>>> But then I wonder whether you
>>>>>> actually tested this, since I can't help getting the impression that
>>>>>> you're introducing a live-lock: The function is called from cmd_write()
>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that
>>>>>> function already holds the lock, and the lock is not (currently)
>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh
>>>>>> the locking looks to be entirely unnecessary.)
>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire
>>>>> the lock. But if tmp == pdev and rom_only == true
>>>>> then we'll deadlock.
>>>>>
>>>>> It seems we need to have the locking conditional, e.g. only lock
>>>>> if tmp != pdev
>>>> Which will address the live-lock, but introduce ABBA deadlock potential
>>>> between the two locks.
>>> I am not sure I can suggest a better solution here
>>> @Roger, @Jan, could you please help here?
>> Well, first of all I'd like to mention that while it may have been okay to
>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing
>> with DomU-s' lists of PCI devices. The requirement really applies to the
>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that
>> there it probably wants to be a try-lock.
>>
>> Next I'd like to point out that here we have the still pending issue of
>> how to deal with hidden devices, which Dom0 can access. See my RFC patch
>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution
>> here, I think it wants to at least account for the extra need there.
> Yes, sorry, I should take care of that.
>
>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding
>> the deadlock, as it's imo not an opt

Re: [PATCH v6 01/13] xen/pci: arm: add stub for is_memory_hole

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 13:00, Julien Grall wrote:
>
>
> On 04/02/2022 10:35, Oleksandr Andrushchenko wrote:
>>
>>
>> On 04.02.22 11:57, Julien Grall wrote:
>>> Hi,
>>>
>>> On 04/02/2022 09:47, Oleksandr Andrushchenko wrote:
>>>>>> Could you please help me with the exact message you would like to see?
>>>>>
>>>>> Here a summary of the discussion (+ some my follow-up thoughts):
>>>>>
>>>>> is_memory_hole() was recently introduced on x86 (see commit 75cc460a1b8c 
>>>>> "xen/pci: detect when BARs are not suitably positioned") to check whether 
>>>>> the BAR are positioned outside of a valid memory range. This was 
>>>>> introduced to work-around quirky firmware.
>>>>>
>>>>> In theory, this could also happen on Arm. In practice, this may not 
>>>>> happen but it sounds better to sanity check that the BAR contains "valid" 
>>>>> I/O range.
>>>>>
>>>>> On x86, this is implemented by checking the region is not described is in 
>>>>> the e820. IIUC, on Arm, the BARs have to be positioned in pre-defined 
>>>>> ranges. So I think it would be possible to implement is_memory_hole() by 
>>>>> going through the list of hostbridges and check the ranges.
>>>>>
>>>>> But first, I'd like to confirm my understanding with Rahul, and others.
>>>>>
>>>>> If we were going to go this route, I would also rename the function to be 
>>>>> better match what it is doing (i.e. it checks the BAR is correctly 
>>>>> placed). As a potentially optimization/hardening for Arm, we could pass 
>>>>> the hostbridge so we don't have to walk all of them.
>>>> It seems this needs to live in the commit message then? So, it is easy to 
>>>> find
>>>> as everything after "---" is going to be dropped on commit
>>> I expect the function to be fully implemented before this is will be merged.
>>>
>>> So if it is fully implemented, then a fair chunk of what I wrote would not 
>>> be necessary to carry in the commit message.
>> Well, we started from wanting *something* with a TODO, and now
>> you request it to be fully implemented before it is merged.
>
> I don't think I ever suggested this patch would be merged as-is. Sorry if 
> this may have crossed like this.
Np
>
> Instead, my intent by asking you to send a TODO patch is to start a 
> discussion how this function could be implemented for Arm.
>
> You sent a TODO but you didn't provide any summary on what is the issue, what 
> we want to achieve... Hence my request to add a bit more details so the other 
> reviewers can provide their opinion more easily.
Ok, so we can discuss it here, but I won't have this patch in v7
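To have something concrete to discuss, the direction you describe could look
roughly like below. None of these types or helpers exist, they are only
placeholders for "walk the host bridge apertures and check that the BAR falls
inside one of them":

struct pci_mmio_range {
    paddr_t start, end;
};

/* Placeholder: would be populated from the host bridge "ranges" property. */
const struct pci_mmio_range *pci_bridge_mmio_ranges(unsigned int *nr);

/* Possible replacement for is_memory_hole(): check the BAR placement. */
bool pci_check_bar_position(mfn_t start, mfn_t end)
{
    unsigned int i, nr;
    const struct pci_mmio_range *r = pci_bridge_mmio_ranges(&nr);

    for ( i = 0; i < nr; i++ )
        if ( mfn_to_maddr(start) >= r[i].start &&
             mfn_to_maddr(end) <= r[i].end )
            return true;

    return false;
}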
>
> Cheers,
>
Thank you,
Oleksandr

Re: [PATCH v6 01/13] xen/pci: arm: add stub for is_memory_hole

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 11:57, Julien Grall wrote:
> Hi,
>
> On 04/02/2022 09:47, Oleksandr Andrushchenko wrote:
>>>> Could you please help me with the exact message you would like to see?
>>>
>>> Here a summary of the discussion (+ some my follow-up thoughts):
>>>
>>> is_memory_hole() was recently introduced on x86 (see commit 75cc460a1b8c 
>>> "xen/pci: detect when BARs are not suitably positioned") to check whether 
>>> the BAR are positioned outside of a valid memory range. This was introduced 
>>> to work-around quirky firmware.
>>>
>>> In theory, this could also happen on Arm. In practice, this may not happen 
>>> but it sounds better to sanity check that the BAR contains "valid" I/O 
>>> range.
>>>
>>> On x86, this is implemented by checking the region is not described is in 
>>> the e820. IIUC, on Arm, the BARs have to be positioned in pre-defined 
>>> ranges. So I think it would be possible to implement is_memory_hole() by 
>>> going through the list of hostbridges and check the ranges.
>>>
>>> But first, I'd like to confirm my understanding with Rahul, and others.
>>>
>>> If we were going to go this route, I would also rename the function to be 
>>> better match what it is doing (i.e. it checks the BAR is correctly placed). 
>>> As a potentially optimization/hardening for Arm, we could pass the 
>>> hostbridge so we don't have to walk all of them.
>> It seems this needs to live in the commit message then? So, it is easy to 
>> find
>> as everything after "---" is going to be dropped on commit
> I expect the function to be fully implemented before this is will be merged.
>
> So if it is fully implemented, then a fair chunk of what I wrote would not be 
> necessary to carry in the commit message.
Well, we started from wanting *something* with a TODO, and now
you request it to be fully implemented before it is merged.
What am I missing here?
>
> Cheers,
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-04 Thread Oleksandr Andrushchenko
Hi, Jan!

On 04.02.22 11:15, Jan Beulich wrote:
> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote:
>> On 04.02.22 09:52, Jan Beulich wrote:
>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, 
>>>> uint16_t cmd, bool rom_only)
>>>>continue;
>>>>}
>>>>
>>>> +spin_lock(&tmp->vpci_lock);
>>>> +if ( !tmp->vpci )
>>>> +{
>>>> +spin_unlock(&tmp->vpci_lock);
>>>> +continue;
>>>> +}
>>>>for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>>>>{
>>>>const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, 
>>>> uint16_t cmd, bool rom_only)
>>>>rc = rangeset_remove_range(mem, start, end);
>>>>if ( rc )
>>>>{
>>>> +spin_unlock(&tmp->vpci_lock);
>>>>printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: 
>>>> %d\n",
>>>>   start, end, rc);
>>>>rangeset_destroy(mem);
>>>>return rc;
>>>>}
>>>>}
>>>> +spin_unlock(&tmp->vpci_lock);
>>>>}
>>> At the first glance this simply looks like another unjustified (in the
>>> description) change, as you're not converting anything here but you
>>> actually add locking (and I realize this was there before, so I'm sorry
>>> for not pointing this out earlier).
>> Well, I thought that the description already has "...the lock can be
>> used (and in a few cases is used right away) to check whether vpci
>> is present" and this is enough for such uses as here.
>>>But then I wonder whether you
>>> actually tested this, since I can't help getting the impression that
>>> you're introducing a live-lock: The function is called from cmd_write()
>>> and rom_write(), which in turn are called out of vpci_write(). Yet that
>>> function already holds the lock, and the lock is not (currently)
>>> recursive. (For the 3rd caller of the function - init_bars() - otoh
>>> the locking looks to be entirely unnecessary.)
>> Well, you are correct: if tmp != pdev then it is correct to acquire
>> the lock. But if tmp == pdev and rom_only == true
>> then we'll deadlock.
>>
>> It seems we need to have the locking conditional, e.g. only lock
>> if tmp != pdev
> Which will address the live-lock, but introduce ABBA deadlock potential
> between the two locks.
I am not sure I can suggest a better solution here
@Roger, @Jan, could you please help here?
>
>>>> @@ -222,10 +239,10 @@ static int msix_read(struct vcpu *v, unsigned long 
>>>> addr, unsigned int len,
>>>>break;
>>>>}
>>>>
>>>> +msix_put(msix);
>>>>return X86EMUL_OKAY;
>>>>}
>>>>
>>>> -spin_lock(&msix->pdev->vpci->lock);
>>>>entry = get_entry(msix, addr);
>>>>offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
>>> You're increasing the locked region quite a bit here. If this is really
>>> needed, it wants explaining. And if this is deemed acceptable as a
>>> "side effect", it wants justifying or at least stating imo. Same for
>>> msix_write() then, obviously.
>> Yes, I do increase the locking region here, but the msix variable needs
>> to be protected all the time, so it seems to be obvious that it remains
>> under the lock
> What does the msix variable have to do with the vPCI lock? If you see
> a need to grow the locked region here, then surely this is independent
> of your conversion of the lock, and hence wants to be a prereq fix
> (which may in fact want/need backporting).
First of all, the implementation of msix_get is wrong and needs to be:

/*
  * Note: if vpci_msix found, then this function returns with
  * pdev->vpci_lock held. Use msix_put to unlock.
  */
static struct vpci_msix *msix_get(const struct domain *d, unsigned long addr)
{
     struct vpci_msix *msix;

     list_for_each_entry ( msix, &d->arch.hvm.msix_tables, next )
     {
     const struct vpci_bar *bars;
     unsigned int i;

     spin_lock(&msix->pd

Re: [PATCH v6 01/13] xen/pci: arm: add stub for is_memory_hole

2022-02-04 Thread Oleksandr Andrushchenko


On 04.02.22 11:41, Julien Grall wrote:
> On 04/02/2022 09:01, Oleksandr Andrushchenko wrote:
>> On 04.02.22 10:51, Julien Grall wrote:
>>> Hi,
>>>
>>> On 04/02/2022 06:34, Oleksandr Andrushchenko wrote:
>>>> From: Oleksandr Andrushchenko 
>>>>
>>>> Add a stub for is_memory_hole which is required for PCI passthrough
>>>> on Arm.
>>>>
>>>> Signed-off-by: Oleksandr Andrushchenko 
>>>>
>>>> ---
>>>> Cc: Julien Grall 
>>>> Cc: Stefano Stabellini 
>>>> ---
>>>> New in v6
>>>> ---
>>>>    xen/arch/arm/mm.c | 6 ++
>>>>    1 file changed, 6 insertions(+)
>>>>
>>>> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
>>>> index b1eae767c27c..c32e34a182a2 100644
>>>> --- a/xen/arch/arm/mm.c
>>>> +++ b/xen/arch/arm/mm.c
>>>> @@ -1640,6 +1640,12 @@ unsigned long get_upper_mfn_bound(void)
>>>>    return max_page - 1;
>>>>    }
>>>>    +bool is_memory_hole(mfn_t start, mfn_t end)
>>>> +{
>>>> +    /* TODO: this needs to be properly implemented. */
>>>
>>> I was hoping to see a summary of the discussion from IRC somewhere in the 
>>> patch (maybe after ---). This would help to bring up to speed the others 
>>> that were not on IRC.
>> I am not quite sure what needs to be put here as the summary
>
> At least some details on why this is a TODO. Is it because you are unsure of 
> the implementation? Is it because you wanted to send early?...
>
> IOW, what are you expecting from the reviewers?
Well, I just need to allow PCI passthrough to be built on Arm at the moment.
Clearly, without this stub I can't do so. This is the only intention now.
Of course, while PCI passthrough on Arm is still not really enabled, those
who want to try it would otherwise need to revert the offending patch.
I am fine both ways.
>
>> Could you please help me with the exact message you would like to see?
>
> Here a summary of the discussion (+ some my follow-up thoughts):
>
> is_memory_hole() was recently introduced on x86 (see commit 75cc460a1b8c 
> "xen/pci: detect when BARs are not suitably positioned") to check whether the 
> BAR are positioned outside of a valid memory range. This was introduced to 
> work-around quirky firmware.
>
> In theory, this could also happen on Arm. In practice, this may not happen 
> but it sounds better to sanity check that the BAR contains "valid" I/O range.
>
> On x86, this is implemented by checking the region is not described is in the 
> e820. IIUC, on Arm, the BARs have to be positioned in pre-defined ranges. So 
> I think it would be possible to implement is_memory_hole() by going through 
> the list of hostbridges and check the ranges.
>
> But first, I'd like to confirm my understanding with Rahul, and others.
>
> If we were going to go this route, I would also rename the function to be 
> better match what it is doing (i.e. it checks the BAR is correctly placed). 
> As a potentially optimization/hardening for Arm, we could pass the hostbridge 
> so we don't have to walk all of them.
It seems this needs to live in the commit message then? So, it is easy to find
as everything after "---" is going to be dropped on commit
>
> Cheers,
>
Thank you,
Oleksandr

Re: [PATCH v6 01/13] xen/pci: arm: add stub for is_memory_hole

2022-02-04 Thread Oleksandr Andrushchenko
Hi, Julien!

On 04.02.22 10:51, Julien Grall wrote:
> Hi,
>
> On 04/02/2022 06:34, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko 
>>
>> Add a stub for is_memory_hole which is required for PCI passthrough
>> on Arm.
>>
>> Signed-off-by: Oleksandr Andrushchenko 
>>
>> ---
>> Cc: Julien Grall 
>> Cc: Stefano Stabellini 
>> ---
>> New in v6
>> ---
>>   xen/arch/arm/mm.c | 6 ++
>>   1 file changed, 6 insertions(+)
>>
>> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
>> index b1eae767c27c..c32e34a182a2 100644
>> --- a/xen/arch/arm/mm.c
>> +++ b/xen/arch/arm/mm.c
>> @@ -1640,6 +1640,12 @@ unsigned long get_upper_mfn_bound(void)
>>   return max_page - 1;
>>   }
>>   +bool is_memory_hole(mfn_t start, mfn_t end)
>> +{
>> +    /* TODO: this needs to be properly implemented. */
>
> I was hoping to see a summary of the discussion from IRC somewhere in the 
> patch (maybe after ---). This would help to bring up to speed the others that 
> were not on IRC.
I am not quite sure what needs to be put here as the summary
Could you please help me with the exact message you would like to see?
>
>> +    return true;
>> +}
>> +
>>   /*
>>    * Local variables:
>>    * mode: C
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-04 Thread Oleksandr Andrushchenko
Hi, Jan!

On 04.02.22 09:52, Jan Beulich wrote:
> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, 
>> uint16_t cmd, bool rom_only)
>>   continue;
>>   }
>>   
>> +spin_lock(&tmp->vpci_lock);
>> +if ( !tmp->vpci )
>> +{
>> +spin_unlock(&tmp->vpci_lock);
>> +continue;
>> +}
>>   for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>>   {
>>   const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, 
>> uint16_t cmd, bool rom_only)
>>   rc = rangeset_remove_range(mem, start, end);
>>   if ( rc )
>>   {
>> +spin_unlock(&tmp->vpci_lock);
>>   printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: 
>> %d\n",
>>  start, end, rc);
>>   rangeset_destroy(mem);
>>   return rc;
>>   }
>>   }
>> +spin_unlock(&tmp->vpci_lock);
>>   }
> At the first glance this simply looks like another unjustified (in the
> description) change, as you're not converting anything here but you
> actually add locking (and I realize this was there before, so I'm sorry
> for not pointing this out earlier).
Well, I thought that the description already has "...the lock can be
used (and in a few cases is used right away) to check whether vpci
is present" and this is enough for such uses as here.
>   But then I wonder whether you
> actually tested this, since I can't help getting the impression that
> you're introducing a live-lock: The function is called from cmd_write()
> and rom_write(), which in turn are called out of vpci_write(). Yet that
> function already holds the lock, and the lock is not (currently)
> recursive. (For the 3rd caller of the function - init_bars() - otoh
> the locking looks to be entirely unnecessary.)
Well, you are correct: if tmp != pdev then it is correct to acquire
the lock. But if tmp == pdev and rom_only == true
then we'll deadlock.

It seems we need to have the locking conditional, e.g. only lock
if tmp != pdev
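i.e. something along these lines (sketch only, helper names made up and not
even compile-tested):

/*
 * Sketch: skip locking when tmp == pdev, as vpci_write() already holds
 * that device's lock when we get here.
 */
static bool vpci_lock_other(const struct pci_dev *pdev, struct pci_dev *tmp)
{
    if ( tmp == pdev )
        return true; /* lock already held by our caller */

    spin_lock(&tmp->vpci_lock);
    if ( !tmp->vpci )
    {
        spin_unlock(&tmp->vpci_lock);
        return false; /* nothing to do for this device */
    }

    return true;
}

static void vpci_unlock_other(const struct pci_dev *pdev, struct pci_dev *tmp)
{
    if ( tmp != pdev )
        spin_unlock(&tmp->vpci_lock);
}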
>
> Then again this was present already even in Roger's original patch, so
> I guess I must be missing something ...
>
>> --- a/xen/drivers/vpci/msix.c
>> +++ b/xen/drivers/vpci/msix.c
>> @@ -138,7 +138,7 @@ static void control_write(const struct pci_dev *pdev, 
>> unsigned int reg,
>>   pci_conf_write16(pdev->sbdf, reg, val);
>>   }
>>   
>> -static struct vpci_msix *msix_find(const struct domain *d, unsigned long 
>> addr)
>> +static struct vpci_msix *msix_get(const struct domain *d, unsigned long 
>> addr)
>>   {
>>   struct vpci_msix *msix;
>>   
>> @@ -150,15 +150,29 @@ static struct vpci_msix *msix_find(const struct domain 
>> *d, unsigned long addr)
>>   for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
>>   if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
>>VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
>> +{
>> +spin_lock(&msix->pdev->vpci_lock);
>>   return msix;
>> +}
> I think deliberately returning with a lock held requires a respective
> comment ahead of the function.
Ok, will add a comment
>
>>   }
>>   
>>   return NULL;
>>   }
>>   
>> +static void msix_put(struct vpci_msix *msix)
>> +{
>> +if ( !msix )
>> +return;
>> +
>> +spin_unlock(&msix->pdev->vpci_lock);
>> +}
> Maybe shorter
>
>  if ( msix )
>  spin_unlock(&msix->pdev->vpci_lock);
Looks good
>
> ? Yet there's only one case where you may pass NULL in here, so
> maybe it's better anyway to move the conditional ...
>
>>   static int msix_accept(struct vcpu *v, unsigned long addr)
>>   {
>> -return !!msix_find(v->domain, addr);
>> +struct vpci_msix *msix = msix_get(v->domain, addr);
>> +
>> +msix_put(msix);
>> +return !!msix;
>>   }
> ... here?
Yes, I can have that check here, but what if there is yet
another caller of the same? I am not sure whether it is better
to have the check in msix_get or at the caller site.
At the moment (with a single place with NULL possible) I can
move the check. @Roger?
>
>> @@ -186,7 +200,7 @@ static int msix_read(struct vcpu *v, unsigned long addr,

Re: [PATCH v6 12/13] xen/arm: translate virtual PCI bus topology for guests

2022-02-04 Thread Oleksandr Andrushchenko
Hi, Jan!

On 04.02.22 09:56, Jan Beulich wrote:
> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -168,6 +168,35 @@ static void vpci_remove_virtual_device(struct domain *d,
>>   pdev->vpci->guest_sbdf.sbdf = ~0;
>>   }
>>   
>> +/*
>> + * Find the physical device which is mapped to the virtual device
>> + * and translate virtual SBDF to the physical one.
>> + */
>> +bool vpci_translate_virtual_device(const struct domain *d, pci_sbdf_t *sbdf)
>> +{
>> +struct pci_dev *pdev;
>> +
>> +ASSERT(!is_hardware_domain(d));
> In addition to this, don't you also need to assert that pcidevs_lock is
> held (or if it isn't, you'd need to acquire it) for ...
>
>> +for_each_pdev( d, pdev )
> ... this to be race-free?
Yes, you are right and this needs pcidevs_lock();
Will add
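i.e. roughly like this (sketch of what I plan to fold in, not tested):

bool vpci_translate_virtual_device(const struct domain *d, pci_sbdf_t *sbdf)
{
    struct pci_dev *pdev;
    bool found = false;

    ASSERT(!is_hardware_domain(d));

    pcidevs_lock();
    for_each_pdev ( d, pdev )
    {
        if ( pdev->vpci && pdev->vpci->guest_sbdf.sbdf == sbdf->sbdf )
        {
            /* Replace the guest SBDF with the physical one. */
            *sbdf = pdev->sbdf;
            found = true;
            break;
        }
    }
    pcidevs_unlock();

    return found;
}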
>
> Jan
>
Thank you,
Oleksandr

Re: [PATCH v6 03/13] vpci: move lock outside of struct vpci

2022-02-04 Thread Oleksandr Andrushchenko
Hi, Jan!

On 04.02.22 09:52, Jan Beulich wrote:
> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote:
>
> At the first glance this simply looks like another unjustified (in the
> description) change, as you're not converting anything here but you
> actually add locking (and I realize this was there before, so I'm sorry
> for not pointing this out earlier). But then I wonder whether you
> actually tested this
It is already stated in the cover letter that I have tested two x86
configurations and also tested this on Arm...
Would you like to see the relevant logs?

Thank you,
Oleksandr

[PATCH v6 13/13] xen/arm: account IO handlers for emulated PCI MSI-X

2022-02-03 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

At the moment, we always allocate an extra 16 slots for IO handlers
(see MAX_IO_HANDLER). So while adding IO trap handlers for the emulated
MSI-X registers we need to explicitly tell that we have additional IO
handlers, so those are accounted.

Signed-off-by: Oleksandr Andrushchenko 

---
Cc: Julien Grall 
Cc: Stefano Stabellini 
---
This patch actually moved here from part 2 of the prep work for PCI
passthrough on Arm, as this seems to be the proper place for it.

Since v5:
- optimize with IS_ENABLED(CONFIG_HAS_PCI_MSI) since VPCI_MAX_VIRT_DEV is
  defined unconditionally
New in v5
---
 xen/arch/arm/vpci.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/xen/arch/arm/vpci.c b/xen/arch/arm/vpci.c
index 84b2b068a0fe..c5902cb9d34d 100644
--- a/xen/arch/arm/vpci.c
+++ b/xen/arch/arm/vpci.c
@@ -131,6 +131,8 @@ static int vpci_get_num_handlers_cb(struct domain *d,
 
 unsigned int domain_vpci_get_num_mmio_handlers(struct domain *d)
 {
+unsigned int count;
+
 if ( !has_vpci(d) )
 return 0;
 
@@ -151,7 +153,17 @@ unsigned int domain_vpci_get_num_mmio_handlers(struct domain *d)
  * For guests each host bridge requires one region to cover the
  * configuration space. At the moment, we only expose a single host bridge.
  */
-return 1;
+count = 1;
+
+/*
+ * There's a single MSI-X MMIO handler that deals with both PBA
+ * and MSI-X tables for each PCI device being passed through.
+ * Maximum number of emulated virtual devices is VPCI_MAX_VIRT_DEV.
+ */
+if ( IS_ENABLED(CONFIG_HAS_PCI_MSI) )
+count += VPCI_MAX_VIRT_DEV;
+
+return count;
 }
 
 /*
-- 
2.25.1




[PATCH v6 12/13] xen/arm: translate virtual PCI bus topology for guests

2022-02-03 Thread Oleksandr Andrushchenko
From: Oleksandr Andrushchenko 

There are three originators of PCI configuration space accesses:
1. The domain that owns the physical host bridge: MMIO handlers are
there so we can update vPCI register handlers with the values
written by the hardware domain, i.e. the physical view of the registers
vs. the guest's view of the configuration space.
2. Guest accesses to the passed through PCI devices: we need to properly
map the virtual bus topology to the physical one, i.e. pass the
configuration space accesses to the corresponding physical devices.
3. Emulated host PCI bridge accesses. The emulated bridge does not exist
in the physical topology, i.e. it cannot be mapped to any physical host
bridge. So, all accesses to the host bridge itself need to be trapped
and emulated.

Signed-off-by: Oleksandr Andrushchenko 
---
Since v5:
- add vpci_translate_virtual_device for #ifndef CONFIG_HAS_VPCI_GUEST_SUPPORT
  case to simplify ifdefery
- add ASSERT(!is_hardware_domain(d)); to vpci_translate_virtual_device
- reset output register on failed virtual SBDF translation
Since v4:
- indentation fixes
- constify struct domain
- updated commit message
- updates to the new locking scheme (pdev->vpci_lock)
Since v3:
- revisit locking
- move code to vpci.c
Since v2:
 - pass struct domain instead of struct vcpu
 - constify arguments where possible
 - gate relevant code with CONFIG_HAS_VPCI_GUEST_SUPPORT
New in v2
---
 xen/arch/arm/vpci.c | 17 +
 xen/drivers/vpci/vpci.c | 29 +
 xen/include/xen/vpci.h  |  7 +++
 3 files changed, 53 insertions(+)

diff --git a/xen/arch/arm/vpci.c b/xen/arch/arm/vpci.c
index a9fc5817f94e..84b2b068a0fe 100644
--- a/xen/arch/arm/vpci.c
+++ b/xen/arch/arm/vpci.c
@@ -41,6 +41,16 @@ static int vpci_mmio_read(struct vcpu *v, mmio_info_t *info,
 /* data is needed to prevent a pointer cast on 32bit */
 unsigned long data;
 
+/*
+ * For the passed through devices we need to map their virtual SBDF
+ * to the physical PCI device being passed through.
+ */
+if ( !bridge && !vpci_translate_virtual_device(v->domain, &sbdf) )
+{
+*r = ~0ul;
+return 1;
+}
+
 if ( vpci_ecam_read(sbdf, ECAM_REG_OFFSET(info->gpa),
1U << info->dabt.size, &data) )
 {
@@ -59,6 +69,13 @@ static int vpci_mmio_write(struct vcpu *v, mmio_info_t *info,
 struct pci_host_bridge *bridge = p;
 pci_sbdf_t sbdf = vpci_sbdf_from_gpa(bridge, info->gpa);
 
+/*
+ * For the passed through devices we need to map their virtual SBDF
+ * to the physical PCI device being passed through.
+ */
+if ( !bridge && !vpci_translate_virtual_device(v->domain, &sbdf) )
+return 1;
+
 return vpci_ecam_write(sbdf, ECAM_REG_OFFSET(info->gpa),
1U << info->dabt.size, r);
 }
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 7d422d11f83d..070db7391391 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -168,6 +168,35 @@ static void vpci_remove_virtual_device(struct domain *d,
 pdev->vpci->guest_sbdf.sbdf = ~0;
 }
 
+/*
+ * Find the physical device which is mapped to the virtual device
+ * and translate virtual SBDF to the physical one.
+ */
+bool vpci_translate_virtual_device(const struct domain *d, pci_sbdf_t *sbdf)
+{
+struct pci_dev *pdev;
+
+ASSERT(!is_hardware_domain(d));
+
+for_each_pdev( d, pdev )
+{
+bool found;
+
+spin_lock(&pdev->vpci_lock);
+found = pdev->vpci && (pdev->vpci->guest_sbdf.sbdf == sbdf->sbdf);
+spin_unlock(&pdev->vpci_lock);
+
+if ( found )
+{
+/* Replace guest SBDF with the physical one. */
+*sbdf = pdev->sbdf;
+return true;
+}
+}
+
+return false;
+}
+
 /* Notify vPCI that device is assigned to guest. */
 int vpci_assign_device(struct domain *d, struct pci_dev *pdev)
 {
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 1f04d34a2369..f6eb9f2051af 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -271,6 +271,7 @@ static inline bool __must_check vpci_process_pending(struct vcpu *v)
 /* Notify vPCI that device is assigned/de-assigned to/from guest. */
 int vpci_assign_device(struct domain *d, struct pci_dev *pdev);
 void vpci_deassign_device(struct domain *d, struct pci_dev *pdev);
+bool vpci_translate_virtual_device(const struct domain *d, pci_sbdf_t *sbdf);
 #else
 static inline int vpci_assign_device(struct domain *d, struct pci_dev *pdev)
 {
@@ -280,6 +281,12 @@ static inline int vpci_assign_device(struct domain *d, struct pci_dev *pdev)
 static inline void vpci_deassign_device(struct domain *d, struct pci_dev *pdev)
 {
 };
+
+static inline bool vpci_translate_virtual_device(const struct domain *d,
+ pci_sbdf_t *sbdf)
+{
+return false;
+}
 #endif
 
 #endif
-- 
2.25.1



