Re: [PATCH 2/2 v2] powerpc/powernv: Enable and setup PCI P2P

2020-08-11 Thread Frederic Barrat




Le 03/08/2020 à 09:35, Oliver O'Halloran a écrit :

On Thu, Apr 30, 2020 at 11:15 PM Max Gurtovoy  wrote:

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 57d3a6a..9ecc576 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3706,18 +3706,208 @@ static void pnv_pci_ioda_dma_bus_setup(struct pci_bus 
*bus)
 }
  }

+#ifdef CONFIG_PCI_P2PDMA
+static DEFINE_MUTEX(p2p_mutex);
+
+static bool pnv_pci_controller_owns_addr(struct pci_controller *hose,
+phys_addr_t addr, size_t size)
+{
+   int i;
+
+   /*
+* It seems safe to assume the full range is under the same PHB, so we
+* can ignore the size.
+*/
+   for (i = 0; i < ARRAY_SIZE(hose->mem_resources); i++) {
+   struct resource *res = >mem_resources[i];
+
+   if (res->flags && addr >= res->start && addr < res->end)
+   return true;
+   }
+   return false;
+}
+
+/*
+ * find the phb owning a mmio address if not owned locally
+ */
+static struct pnv_phb *pnv_pci_find_owning_phb(struct pci_dev *pdev,
+  phys_addr_t addr, size_t size)
+{
+   struct pci_controller *hose;
+
+   /* fast path */
+   if (pnv_pci_controller_owns_addr(pdev->bus->sysdata, addr, size))
+   return NULL;


Do we actually need this fast path? It's going to be slow either way.
Also if a device is doing p2p to another device under the same PHB
then it should not be happening via the root complex. Is this a case
you've tested?



The "fast path" comment is misleading and we should rephrase. The point 
is to catch if we're mapping a resource under the same PHB, in which 
case we don't modify the PHB configuration. So we need to catch it 
early, but it's not a fast path.
If the 2 devices are under the same PHB, the code above shouldn't do 
anything. So I guess behavior depends on the underlying bridge? We'll 
need another platform than witherspoon to test it. Probably worth checking.




+   list_for_each_entry(hose, _list, list_node) {
+   struct pnv_phb *phb = hose->private_data;
+
+   if (phb->type != PNV_PHB_NPU_NVLINK &&
+   phb->type != PNV_PHB_NPU_OCAPI) {
+   if (pnv_pci_controller_owns_addr(hose, addr, size))
+   return phb;
+   }
+   }
+   return NULL;
+}
+
+static u64 pnv_pci_dma_dir_to_opal_p2p(enum dma_data_direction dir)
+{
+   if (dir == DMA_TO_DEVICE)
+   return OPAL_PCI_P2P_STORE;
+   else if (dir == DMA_FROM_DEVICE)
+   return OPAL_PCI_P2P_LOAD;
+   else if (dir == DMA_BIDIRECTIONAL)
+   return OPAL_PCI_P2P_LOAD | OPAL_PCI_P2P_STORE;
+   else
+   return 0;
+}
+
+static int pnv_pci_ioda_enable_p2p(struct pci_dev *initiator,
+  struct pnv_phb *phb_target,
+  enum dma_data_direction dir)
+{
+   struct pci_controller *hose;
+   struct pnv_phb *phb_init;
+   struct pnv_ioda_pe *pe_init;
+   u64 desc;
+   int rc;
+
+   if (!opal_check_token(OPAL_PCI_SET_P2P))
+   return -ENXIO;
+



+   hose = pci_bus_to_host(initiator->bus);
+   phb_init = hose->private_data;


You can use the pci_bus_to_pnvhb() helper


+
+   pe_init = pnv_ioda_get_pe(initiator);
+   if (!pe_init)
+   return -ENODEV;
+
+   if (!pe_init->tce_bypass_enabled)
+   return -EINVAL;
+
+   /*
+* Configuring the initiator's PHB requires to adjust its TVE#1
+* setting. Since the same device can be an initiator several times for
+* different target devices, we need to keep a reference count to know
+* when we can restore the default bypass setting on its TVE#1 when
+* disabling. Opal is not tracking PE states, so we add a reference
+* count on the PE in linux.
+*
+* For the target, the configuration is per PHB, so we keep a
+* target reference count on the PHB.
+*/


This irks me a bit because configuring the DMA address limits for the
TVE is the kernel's job. What we really should be doing is using
opal_pci_map_pe_dma_window_real() to set the bypass-mode address limit
for the TVE to something large enough to hit the MMIO ranges rather
than having set_p2p do it as a side effect. Unfortunately, for some
reason skiboot doesn't implement support for enabling 56bit addressing
using opal_pci_map_pe_dma_window_real() and we do need to support
older kernel's which used this stuff so I guess we're stuck with it
for now. It'd be nice if we could fix this in the longer term
though...



OK. We'd need more than a 56-bit opal_pci_map_pe_dma_window_real() 
though, there's also a queue setting change on the target PHB.


  Fred



+  

RE: [PATCH 2/2 v2] powerpc/powernv: Enable and setup PCI P2P

2020-08-10 Thread Aneela Devarasetty
+ David from IBM.

-Original Message-
From: Oliver O'Halloran  
Sent: Monday, August 3, 2020 2:35 AM
To: Max Gurtovoy 
Cc: Christoph Hellwig ; linux-pci ; 
linuxppc-dev ; Israel Rukshin 
; Idan Werpoler ; Vladimir Koushnir 
; Shlomi Nimrodi ; Frederic 
Barrat ; Carol Soto ; Aneela 
Devarasetty 
Subject: Re: [PATCH 2/2 v2] powerpc/powernv: Enable and setup PCI P2P

On Thu, Apr 30, 2020 at 11:15 PM Max Gurtovoy  wrote:
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 57d3a6a..9ecc576 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -3706,18 +3706,208 @@ static void pnv_pci_ioda_dma_bus_setup(struct 
> pci_bus *bus)
> }
>  }
>
> +#ifdef CONFIG_PCI_P2PDMA
> +static DEFINE_MUTEX(p2p_mutex);
> +
> +static bool pnv_pci_controller_owns_addr(struct pci_controller *hose,
> +phys_addr_t addr, size_t 
> +size) {
> +   int i;
> +
> +   /*
> +* It seems safe to assume the full range is under the same PHB, so we
> +* can ignore the size.
> +*/
> +   for (i = 0; i < ARRAY_SIZE(hose->mem_resources); i++) {
> +   struct resource *res = >mem_resources[i];
> +
> +   if (res->flags && addr >= res->start && addr < res->end)
> +   return true;
> +   }
> +   return false;
> +}
> +
> +/*
> + * find the phb owning a mmio address if not owned locally  */ static 
> +struct pnv_phb *pnv_pci_find_owning_phb(struct pci_dev *pdev,
> +  phys_addr_t addr, 
> +size_t size) {
> +   struct pci_controller *hose;
> +
> +   /* fast path */
> +   if (pnv_pci_controller_owns_addr(pdev->bus->sysdata, addr, size))
> +   return NULL;

Do we actually need this fast path? It's going to be slow either way.
Also if a device is doing p2p to another device under the same PHB then it 
should not be happening via the root complex. Is this a case you've tested?

> +   list_for_each_entry(hose, _list, list_node) {
> +   struct pnv_phb *phb = hose->private_data;
> +
> +   if (phb->type != PNV_PHB_NPU_NVLINK &&
> +   phb->type != PNV_PHB_NPU_OCAPI) {
> +   if (pnv_pci_controller_owns_addr(hose, addr, size))
> +   return phb;
> +   }
> +   }
> +   return NULL;
> +}
> +
> +static u64 pnv_pci_dma_dir_to_opal_p2p(enum dma_data_direction dir) {
> +   if (dir == DMA_TO_DEVICE)
> +   return OPAL_PCI_P2P_STORE;
> +   else if (dir == DMA_FROM_DEVICE)
> +   return OPAL_PCI_P2P_LOAD;
> +   else if (dir == DMA_BIDIRECTIONAL)
> +   return OPAL_PCI_P2P_LOAD | OPAL_PCI_P2P_STORE;
> +   else
> +   return 0;
> +}
> +
> +static int pnv_pci_ioda_enable_p2p(struct pci_dev *initiator,
> +  struct pnv_phb *phb_target,
> +  enum dma_data_direction dir) {
> +   struct pci_controller *hose;
> +   struct pnv_phb *phb_init;
> +   struct pnv_ioda_pe *pe_init;
> +   u64 desc;
> +   int rc;
> +
> +   if (!opal_check_token(OPAL_PCI_SET_P2P))
> +   return -ENXIO;
> +

> +   hose = pci_bus_to_host(initiator->bus);
> +   phb_init = hose->private_data;

You can use the pci_bus_to_pnvhb() helper

> +
> +   pe_init = pnv_ioda_get_pe(initiator);
> +   if (!pe_init)
> +   return -ENODEV;
> +
> +   if (!pe_init->tce_bypass_enabled)
> +   return -EINVAL;
> +
> +   /*
> +* Configuring the initiator's PHB requires to adjust its TVE#1
> +* setting. Since the same device can be an initiator several times 
> for
> +* different target devices, we need to keep a reference count to know
> +* when we can restore the default bypass setting on its TVE#1 when
> +* disabling. Opal is not tracking PE states, so we add a reference
> +* count on the PE in linux.
> +*
> +* For the target, the configuration is per PHB, so we keep a
> +* target reference count on the PHB.
> +*/

This irks me a bit because configuring the DMA address limits for the TVE is 
the kernel's job. What we really should be doing is using
opal_pci_map_pe_dma_window_real() to set the bypass-mode address limit for the 
TVE to something large enough to hit the MMIO ranges rather than having set_p2p 
do it as a side effect. Unfortunately, for some reason skiboot d

Re: [PATCH 2/2 v2] powerpc/powernv: Enable and setup PCI P2P

2020-08-03 Thread Oliver O'Halloran
On Thu, Apr 30, 2020 at 11:15 PM Max Gurtovoy  wrote:
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 57d3a6a..9ecc576 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -3706,18 +3706,208 @@ static void pnv_pci_ioda_dma_bus_setup(struct 
> pci_bus *bus)
> }
>  }
>
> +#ifdef CONFIG_PCI_P2PDMA
> +static DEFINE_MUTEX(p2p_mutex);
> +
> +static bool pnv_pci_controller_owns_addr(struct pci_controller *hose,
> +phys_addr_t addr, size_t size)
> +{
> +   int i;
> +
> +   /*
> +* It seems safe to assume the full range is under the same PHB, so we
> +* can ignore the size.
> +*/
> +   for (i = 0; i < ARRAY_SIZE(hose->mem_resources); i++) {
> +   struct resource *res = >mem_resources[i];
> +
> +   if (res->flags && addr >= res->start && addr < res->end)
> +   return true;
> +   }
> +   return false;
> +}
> +
> +/*
> + * find the phb owning a mmio address if not owned locally
> + */
> +static struct pnv_phb *pnv_pci_find_owning_phb(struct pci_dev *pdev,
> +  phys_addr_t addr, size_t size)
> +{
> +   struct pci_controller *hose;
> +
> +   /* fast path */
> +   if (pnv_pci_controller_owns_addr(pdev->bus->sysdata, addr, size))
> +   return NULL;

Do we actually need this fast path? It's going to be slow either way.
Also if a device is doing p2p to another device under the same PHB
then it should not be happening via the root complex. Is this a case
you've tested?

> +   list_for_each_entry(hose, _list, list_node) {
> +   struct pnv_phb *phb = hose->private_data;
> +
> +   if (phb->type != PNV_PHB_NPU_NVLINK &&
> +   phb->type != PNV_PHB_NPU_OCAPI) {
> +   if (pnv_pci_controller_owns_addr(hose, addr, size))
> +   return phb;
> +   }
> +   }
> +   return NULL;
> +}
> +
> +static u64 pnv_pci_dma_dir_to_opal_p2p(enum dma_data_direction dir)
> +{
> +   if (dir == DMA_TO_DEVICE)
> +   return OPAL_PCI_P2P_STORE;
> +   else if (dir == DMA_FROM_DEVICE)
> +   return OPAL_PCI_P2P_LOAD;
> +   else if (dir == DMA_BIDIRECTIONAL)
> +   return OPAL_PCI_P2P_LOAD | OPAL_PCI_P2P_STORE;
> +   else
> +   return 0;
> +}
> +
> +static int pnv_pci_ioda_enable_p2p(struct pci_dev *initiator,
> +  struct pnv_phb *phb_target,
> +  enum dma_data_direction dir)
> +{
> +   struct pci_controller *hose;
> +   struct pnv_phb *phb_init;
> +   struct pnv_ioda_pe *pe_init;
> +   u64 desc;
> +   int rc;
> +
> +   if (!opal_check_token(OPAL_PCI_SET_P2P))
> +   return -ENXIO;
> +

> +   hose = pci_bus_to_host(initiator->bus);
> +   phb_init = hose->private_data;

You can use the pci_bus_to_pnvhb() helper

> +
> +   pe_init = pnv_ioda_get_pe(initiator);
> +   if (!pe_init)
> +   return -ENODEV;
> +
> +   if (!pe_init->tce_bypass_enabled)
> +   return -EINVAL;
> +
> +   /*
> +* Configuring the initiator's PHB requires to adjust its TVE#1
> +* setting. Since the same device can be an initiator several times 
> for
> +* different target devices, we need to keep a reference count to know
> +* when we can restore the default bypass setting on its TVE#1 when
> +* disabling. Opal is not tracking PE states, so we add a reference
> +* count on the PE in linux.
> +*
> +* For the target, the configuration is per PHB, so we keep a
> +* target reference count on the PHB.
> +*/

This irks me a bit because configuring the DMA address limits for the
TVE is the kernel's job. What we really should be doing is using
opal_pci_map_pe_dma_window_real() to set the bypass-mode address limit
for the TVE to something large enough to hit the MMIO ranges rather
than having set_p2p do it as a side effect. Unfortunately, for some
reason skiboot doesn't implement support for enabling 56bit addressing
using opal_pci_map_pe_dma_window_real() and we do need to support
older kernel's which used this stuff so I guess we're stuck with it
for now. It'd be nice if we could fix this in the longer term
though...

> +   mutex_lock(_mutex);
> +
> +   desc = OPAL_PCI_P2P_ENABLE | pnv_pci_dma_dir_to_opal_p2p(dir);
> +   /* always go to opal to validate the configuration */
> +   rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id, desc,
> + pe_init->pe_number);
> +   if (rc != OPAL_SUCCESS) {
> +   rc = -EIO;
> +   goto out;
> +   }
> +
> +   pe_init->p2p_initiator_count++;
> +