Re: [PATCH 05/11] mmc: s3cmci: handle highmem pages

2019-01-29 Thread Ulf Hansson
On Mon, 14 Jan 2019 at 10:58, Christoph Hellwig  wrote:
>
> Instead of setting up a kernel pointer to track the current PIO address,
> track the offset in the current page, and do an atomic kmap for the page
> while doing the actual PIO operations.
>
> Signed-off-by: Christoph Hellwig 

Nitpick: This one has some trailing whitespace errors, which git
complains about when I apply it.

No need to re-spin due to this, I can fix it up.

Kind regards
Uffe

> ---
>  drivers/mmc/host/s3cmci.c | 107 +++---
>  drivers/mmc/host/s3cmci.h |   3 +-
>  2 files changed, 55 insertions(+), 55 deletions(-)
>
> diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c
> index 10f5219b3b40..1be84426c817 100644
> --- a/drivers/mmc/host/s3cmci.c
> +++ b/drivers/mmc/host/s3cmci.c
> @@ -15,6 +15,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -317,26 +318,17 @@ static void s3cmci_check_sdio_irq(struct s3cmci_host *host)
> }
>  }
>
> -static inline int get_data_buffer(struct s3cmci_host *host,
> - u32 *bytes, u32 **pointer)
> +static inline int get_data_buffer(struct s3cmci_host *host)
>  {
> -   struct scatterlist *sg;
> -
> -   if (host->pio_active == XFER_NONE)
> -   return -EINVAL;
> -
> -   if ((!host->mrq) || (!host->mrq->data))
> -   return -EINVAL;
> -
> if (host->pio_sgptr >= host->mrq->data->sg_len) {
> dbg(host, dbg_debug, "no more buffers (%i/%i)\n",
>   host->pio_sgptr, host->mrq->data->sg_len);
> return -EBUSY;
> }
> -   sg = &host->mrq->data->sg[host->pio_sgptr];
> +   host->cur_sg = &host->mrq->data->sg[host->pio_sgptr];
>
> -   *bytes = sg->length;
> -   *pointer = sg_virt(sg);
> +   host->pio_bytes = host->cur_sg->length;
> +   host->pio_offset = host->cur_sg->offset;
>
> host->pio_sgptr++;
>
> @@ -422,11 +414,16 @@ static void s3cmci_disable_irq(struct s3cmci_host *host, bool transfer)
>
>  static void do_pio_read(struct s3cmci_host *host)
>  {
> -   int res;
> u32 fifo;
> u32 *ptr;
> u32 fifo_words;
> void __iomem *from_ptr;
> +   void *buf;
> +
> +   if (host->pio_active == XFER_NONE)
> +   goto done;
> +   if (!host->mrq || !host->mrq->data)
> +   goto done;
>
> /* write real prescaler to host, it might be set slow to fix */
> writel(host->prescaler, host->base + S3C2410_SDIPRE);
> @@ -435,20 +432,12 @@ static void do_pio_read(struct s3cmci_host *host)
>
> while ((fifo = fifo_count(host))) {
> if (!host->pio_bytes) {
> -   res = get_data_buffer(host, &host->pio_bytes,
> - &host->pio_ptr);
> -   if (res) {
> -   host->pio_active = XFER_NONE;
> -   host->complete_what = COMPLETION_FINALIZE;
> -
> -   dbg(host, dbg_pio, "pio_read(): "
> -   "complete (no more data).\n");
> -   return;
> -   }
> +   if (get_data_buffer(host) < 0)
> +   goto done;
>
> dbg(host, dbg_pio,
> -   "pio_read(): new target: [%i]@[%p]\n",
> -   host->pio_bytes, host->pio_ptr);
> +   "pio_read(): new target: [%i]@[%zu]\n",
> +   host->pio_bytes, host->pio_offset);
> }
>
> dbg(host, dbg_pio,
> @@ -470,63 +459,65 @@ static void do_pio_read(struct s3cmci_host *host)
> host->pio_count += fifo;
>
> fifo_words = fifo >> 2;
> -   ptr = host->pio_ptr;
> -   while (fifo_words--)
> +
> +   buf = (kmap_atomic(sg_page(host->cur_sg)) + host->pio_offset);
> +   ptr = buf;
> +   while (fifo_words--) {
> *ptr++ = readl(from_ptr);
> -   host->pio_ptr = ptr;
> +   host->pio_offset += 4;
> +   }
>
> if (fifo & 3) {
> u32 n = fifo & 3;
> u32 data = readl(from_ptr);
> -   u8 *p = (u8 *)host->pio_ptr;
> +   u8 *p = (u8 *)ptr;
>
> while (n--) {
> *p++ = data;
> data >>= 8;
> +   host->pio_offset++;
> }
> }
> +   kunmap_atomic(buf);
> }
>
> if (!host->pio_bytes) {
> -   res = get_data_buffer(host, &host->pio_bytes, &host->pio_ptr);
> -   if (res) {
> -   dbg(host, dbg_pio,
> -  
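
For readers skimming the diff, the core of the conversion is the classic
highmem PIO pattern: a highmem page has no permanent kernel mapping, so
instead of caching a kernel virtual address across FIFO bursts, the driver
remembers a (scatterlist, offset) position and maps the page only around
each burst. A minimal standalone sketch, assuming a word-aligned transfer
(names are illustrative, not the driver's):

#include <linux/highmem.h>
#include <linux/scatterlist.h>
#include <linux/io.h>

/* Drain 'words' 32-bit words from a FIFO into the current sg page. */
static void pio_read_words_sketch(struct scatterlist *sg, unsigned int offset,
                                  void __iomem *fifo, unsigned int words)
{
        /* Short-lived per-CPU mapping, only valid until kunmap_atomic(). */
        void *vaddr = kmap_atomic(sg_page(sg));
        u32 *buf = vaddr + offset;

        while (words--)
                *buf++ = readl(fifo);

        kunmap_atomic(vaddr);   /* unmap the address we mapped, not 'buf' */
}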

Re: [PATCH 07/11] mmc: mvsdio: handle highmem pages

2019-01-29 Thread Ulf Hansson
On Mon, 14 Jan 2019 at 10:58, Christoph Hellwig  wrote:
>
> Instead of setting up a kernel pointer to track the current PIO address,
> track the offset in the current page, and do an atomic kmap for the page
> while doing the actual PIO operations.
>
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/mmc/host/mvsdio.c | 48 +++
>  1 file changed, 29 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c
> index e22bbff89c8d..545e370d6dae 100644
> --- a/drivers/mmc/host/mvsdio.c
> +++ b/drivers/mmc/host/mvsdio.c
> @@ -12,6 +12,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -42,7 +43,8 @@ struct mvsd_host {
> unsigned int intr_en;
> unsigned int ctrl;
> unsigned int pio_size;
> -   void *pio_ptr;
> +   struct scatterlist *pio_sg;
> +   unsigned int pio_offset; /* offset in words into the segment */
> unsigned int sg_frags;
> unsigned int ns_per_clk;
> unsigned int clock;
> @@ -96,9 +98,9 @@ static int mvsd_setup_data(struct mvsd_host *host, struct mmc_data *data)
> if (tmout_index > MVSD_HOST_CTRL_TMOUT_MAX)
> tmout_index = MVSD_HOST_CTRL_TMOUT_MAX;
>
> -   dev_dbg(host->dev, "data %s at 0x%08x: blocks=%d blksz=%d tmout=%u (%d)\n",
> +   dev_dbg(host->dev, "data %s at 0x%08llx: blocks=%d blksz=%d tmout=%u (%d)\n",
> (data->flags & MMC_DATA_READ) ? "read" : "write",
> -   (u32)sg_virt(data->sg), data->blocks, data->blksz,
> +   (u64)sg_phys(data->sg), data->blocks, data->blksz,
> tmout, tmout_index);
>
> host->ctrl &= ~MVSD_HOST_CTRL_TMOUT_MASK;
> @@ -118,10 +120,11 @@ static int mvsd_setup_data(struct mvsd_host *host, struct mmc_data *data)
>  * boundary.
>  */
> host->pio_size = data->blocks * data->blksz;
> -   host->pio_ptr = sg_virt(data->sg);
> +   host->pio_sg = data->sg;
> +   host->pio_offset = data->sg->offset / 2;
> if (!nodma)
> -   dev_dbg(host->dev, "fallback to PIO for data at 0x%p size %d\n",
> -   host->pio_ptr, host->pio_size);
> +   dev_dbg(host->dev, "fallback to PIO for data at 0x%x size %d\n",
> +   host->pio_offset, host->pio_size);
> return 1;
> } else {
> dma_addr_t phys_addr;
> @@ -291,8 +294,9 @@ static u32 mvsd_finish_data(struct mvsd_host *host, struct mmc_data *data,
>  {
> void __iomem *iobase = host->base;
>
> -   if (host->pio_ptr) {
> -   host->pio_ptr = NULL;
> +   if (host->pio_sg) {
> +   host->pio_sg = NULL;
> +   host->pio_offset = 0;
> host->pio_size = 0;
> } else {
> dma_unmap_sg(mmc_dev(host->mmc), data->sg, host->sg_frags,
> @@ -376,11 +380,12 @@ static irqreturn_t mvsd_irq(int irq, void *dev)
> if (host->pio_size &&
> (intr_status & host->intr_en &
>  (MVSD_NOR_RX_READY | MVSD_NOR_RX_FIFO_8W))) {
> -   u16 *p = host->pio_ptr;
> +   u16 *p = kmap_atomic(sg_page(host->pio_sg));

Seems like a corresponding kunmap_atomic() is missing somewhere.

> +   unsigned int o = host->pio_offset;
> int s = host->pio_size;
> while (s >= 32 && (intr_status & MVSD_NOR_RX_FIFO_8W)) {
> -   readsw(iobase + MVSD_FIFO, p, 16);
> -   p += 16;
> +   readsw(iobase + MVSD_FIFO, p + o, 16);
> +   o += 16;
> s -= 32;
> intr_status = mvsd_read(MVSD_NOR_INTR_STATUS);
> }
> @@ -391,8 +396,10 @@ static irqreturn_t mvsd_irq(int irq, void *dev)
>  */
> if (s <= 32) {
> while (s >= 4 && (intr_status & MVSD_NOR_RX_READY)) {
> -   put_unaligned(mvsd_read(MVSD_FIFO), p++);
> -   put_unaligned(mvsd_read(MVSD_FIFO), p++);
> +   put_unaligned(mvsd_read(MVSD_FIFO), p + o);
> +   o++;
> +   put_unaligned(mvsd_read(MVSD_FIFO), p + o);
> +   o++;
> s -= 4;
> intr_status = mvsd_read(MVSD_NOR_INTR_STATUS);
> }
> @@ -400,7 +407,7 @@ static irqreturn_t mvsd_irq(int irq, void *dev)
> u16 val[2] = {0, 0};
> val[0] = mvsd_read(MVSD_FIFO);
> val[1] = mvsd_read(MVSD_FIFO);
> -   memcpy(p, ((void *)&val) + 4 - s, s);
> +  
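
For context on the missing kunmap_atomic(): kmap_atomic() creates a
temporary per-CPU mapping and disables preemption, so every call must be
paired with kunmap_atomic() on the same code path before the interrupt
handler returns. A sketch of the required shape, with stand-in names
(pio_sg, fill_fifo_from) rather than the driver's real helpers:

#include <linux/highmem.h>
#include <linux/scatterlist.h>

static void pio_burst_sketch(struct scatterlist *pio_sg, unsigned int offset,
                             void (*fill_fifo_from)(u16 *p))
{
        u16 *p = kmap_atomic(sg_page(pio_sg));

        fill_fifo_from(p + offset);
        kunmap_atomic(p);       /* must run on every exit path */
}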

Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 01:43:02PM -0700, Logan Gunthorpe wrote:
> It's hard to reason about an interface when you can't see what all the
> layers want to do with it. Most maintainers (I'd hope) would certainly
> never merge code that has no callers, and for much the same reason, I'd
> rather not review patches that don't have real use case examples.

Yes, we should never review, never mind merge, code without users.
We had one example recently where this was not followed, which was HMM,
and that turned out to be a disaster.


Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 09:36:08PM -0500, Michael S. Tsirkin wrote:
> This has been discussed ad nauseum. virtio is all about compatibility.
> Losing a couple of lines of code isn't worth breaking working setups.
> People that want "just use DMA API no tricks" now have the option.
> Setting a flag in a feature bit map is literally a single line
> of code in the hypervisor. So stop pushing for breaking working
> legacy setups and just fix it in the right place.

I agree with the legacy aspect.  What I am missing is an extremely
strong wording that says you SHOULD always set this flag for new
hosts, including an explanation why.


[PATCH 1/2] iommu/vt-d: Remove change_pte notifier

2019-01-29 Thread Peter Xu
The change_pte() interface is tailored for PFN updates, while the
other notifier invalidate_range() should be enough for Intel IOMMU
cache flushing.  Actually we've done a similar thing for AMD IOMMU
already in 8301da53fbc1 ("iommu/amd: Remove change_pte mmu_notifier
call-back", 2014-07-30) but the Intel IOMMU driver still has it.

Signed-off-by: Peter Xu 
---
 drivers/iommu/intel-svm.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index a2a2aa4439aa..e9fd3ca057ac 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -180,14 +180,6 @@ static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
rcu_read_unlock();
 }
 
-static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
-unsigned long address, pte_t pte)
-{
-   struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
-
-   intel_flush_svm_range(svm, address, 1, 1, 0);
-}
-
 /* Pages have been freed at this point */
 static void intel_invalidate_range(struct mmu_notifier *mn,
   struct mm_struct *mm,
@@ -227,7 +219,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 
 static const struct mmu_notifier_ops intel_mmuops = {
.release = intel_mm_release,
-   .change_pte = intel_change_pte,
.invalidate_range = intel_invalidate_range,
 };
 
-- 
2.17.1

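
This patch and the AMD one that follows converge on the same shape: a
consumer that flushes its device TLB from invalidate_range() needs no
per-PTE hooks, because the rationale of both commits is that
invalidate_range() already fires for the updates that change_pte() and
clear_flush_young() were catching. A minimal sketch of such a notifier,
assuming the mmu_notifier API of this era (names are illustrative):

#include <linux/mmu_notifier.h>

static void example_invalidate_range(struct mmu_notifier *mn,
                                     struct mm_struct *mm,
                                     unsigned long start, unsigned long end)
{
        /* Flush device TLB / paging-structure caches for [start, end). */
}

static void example_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        /* The address space is going away: detach the device context. */
}

/* No .change_pte or .clear_flush_young: invalidate_range covers them. */
static const struct mmu_notifier_ops example_mmuops = {
        .release          = example_release,
        .invalidate_range = example_invalidate_range,
};

A driver would point its notifier's ops at a table like this and call
mmu_notifier_register(), exactly as intel-svm.c and amd_iommu_v2.c
already do.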


[PATCH 2/2] iommu/amd: Remove clear_flush_young notifier

2019-01-29 Thread Peter Xu
The AMD IOMMU driver is using clear_flush_young() to do cache flushing,
but that's actually already covered by invalidate_range().  Remove the
extra notifier and the related chunks.

Signed-off-by: Peter Xu 
---
 drivers/iommu/amd_iommu_v2.c | 24 
 1 file changed, 24 deletions(-)

diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 23dae9348ace..5d7ef750e4a0 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -370,29 +370,6 @@ static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
return container_of(mn, struct pasid_state, mn);
 }
 
-static void __mn_flush_page(struct mmu_notifier *mn,
-   unsigned long address)
-{
-   struct pasid_state *pasid_state;
-   struct device_state *dev_state;
-
-   pasid_state = mn_to_state(mn);
-   dev_state   = pasid_state->device_state;
-
-   amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, address);
-}
-
-static int mn_clear_flush_young(struct mmu_notifier *mn,
-   struct mm_struct *mm,
-   unsigned long start,
-   unsigned long end)
-{
-   for (; start < end; start += PAGE_SIZE)
-   __mn_flush_page(mn, start);
-
-   return 0;
-}
-
 static void mn_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -430,7 +407,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
 
 static const struct mmu_notifier_ops iommu_mn = {
.release= mn_release,
-   .clear_flush_young  = mn_clear_flush_young,
.invalidate_range   = mn_invalidate_range,
 };
 
-- 
2.17.1



[PATCH 0/2] Some MMU notifier cleanups for Intel/AMD IOMMU

2019-01-29 Thread Peter Xu
Recently while reading the MMU notifiers I noticed that both the
Intel and AMD IOMMU drivers seem to have redundancies in their use of
the MMU notifiers.  It can also be seen as a follow-up of commit
8301da53fbc1 ("iommu/amd: Remove change_pte mmu_notifier call-back",
2014-07-30).

I don't have the hardware to test them, but they compile fine.

Please have a look, thanks.

Peter Xu (2):
  iommu/vt-d: Remove change_pte notifier
  iommu/amd: Remove clear_flush_young notifier

 drivers/iommu/amd_iommu_v2.c | 24 
 drivers/iommu/intel-svm.c|  9 -
 2 files changed, 33 deletions(-)

-- 
2.17.1



Re: [PATCH 0/3] iommu/arm-smmu: Add support to use Last level cache

2019-01-29 Thread Vivek Gautam
On Tue, Jan 29, 2019 at 8:34 PM Ard Biesheuvel  wrote:
>
> (+ Bjorn)
>
> On Mon, 28 Jan 2019 at 12:27, Vivek Gautam  wrote:
> >
> > Hi Ard,
> >
> > On Thu, Jan 24, 2019 at 1:25 PM Ard Biesheuvel  wrote:
> > >
> > > On Thu, 24 Jan 2019 at 07:58, Vivek Gautam  wrote:
> > > >
> > > > On Mon, Jan 21, 2019 at 7:55 PM Ard Biesheuvel  wrote:
> > > > >
> > > > > On Mon, 21 Jan 2019 at 14:56, Robin Murphy  wrote:
> > > > > >
> > > > > > On 21/01/2019 13:36, Ard Biesheuvel wrote:
> > > > > > > On Mon, 21 Jan 2019 at 14:25, Robin Murphy  wrote:
> > > > > > >>
> > > > > > >> On 21/01/2019 10:50, Ard Biesheuvel wrote:
> > > > > > >>> On Mon, 21 Jan 2019 at 11:17, Vivek Gautam  wrote:
> > > > > > 
> > > > > >  Hi,
> > > > > > 
> > > > > > 
> > > > > >  On Mon, Jan 21, 2019 at 12:56 PM Ard Biesheuvel  wrote:
> > > > > > >
> > > > > > > On Mon, 21 Jan 2019 at 06:54, Vivek Gautam  wrote:
> > > > > > >>
> > > > > > >> Qualcomm SoCs have an additional level of cache called as
> > > > > > >> System cache, aka. Last level cache (LLC). This cache sits right
> > > > > > >> before the DDR, and is tightly coupled with the memory controller.
> > > > > > >> The clients using this cache request their slices from this
> > > > > > >> system cache, make it active, and can then start using it.
> > > > > > >> For these clients with smmu, to start using the system cache for
> > > > > > >> buffers and, related page tables [1], memory attributes need to be
> > > > > > >> set accordingly. This series add the required support.
> > > > > > >>
> > > > > > >
> > > > > > > Does this actually improve performance on reads from a device? The
> > > > > > > non-cache coherent DMA routines perform an unconditional D-cache
> > > > > > > invalidate by VA to the PoC before reading from the buffers filled by
> > > > > > > the device, and I would expect the PoC to be defined as lying beyond
> > > > > > > the LLC to still guarantee the architected behavior.
> > > > > > 
> > > > > >  We have seen performance improvements when running Manhattan
> > > > > >  GFXBench benchmarks.
> > > > > > 
> > > > > > >>>
> > > > > > >>> Ah ok, that makes sense, since in that case, the data flow is mostly
> > > > > > >>> to the device, not from the device.
> > > > > > >>>
> > > > > >  As for the PoC, from my knowledge on sdm845 the system cache, aka
> > > > > >  Last level cache (LLC) lies beyond the point of coherency.
> > > > > >  Non-cache coherent buffers will not be cached to system cache also, and
> > > > > >  no additional software cache maintenance ops are required for system cache.
> > > > > >  Pratik can add more if I am missing something.
> > > > > > 
> > > > > >  To take care of the memory attributes from DMA APIs side, we can add a
> > > > > >  DMA_ATTR definition to take care of any dma non-coherent APIs calls.
> > > > > > 
> > > > > > >>>
> > > > > > >>> So does the device use the correct inner non-cacheable, outer
> > > > > > >>> writeback cacheable attributes if the SMMU is in pass-through?
> > > > > > >>>
> > > > > > >>> We have been looking into another use case where the fact that the
> > > > > > >>> SMMU overrides memory attributes is causing issues (WC mappings used
> > > > > > >>> by the radeon and amdgpu driver). So if the SMMU would honour the
> > > > > > >>> existing attributes, would you still need the SMMU changes?
> > > > > > >>
> > > > > > >> Even if we could force a stage 2 mapping with the weakest pagetable
> > > > > > >> attributes (such that combining would work), there would still need to
> > > > > > >> be a way to set the TCR attributes appropriately if this behaviour is
> > > > > > >> wanted for the SMMU's own table walks as well.
> > > > > > >>
> > > > > > >
> > > > > > > Isn't that just a matter of implementing support for SMMUs that lack
> > > > > > > the 'dma-coherent' attribute?
> > > > > >
> > > > > > Not quite - in general they need INC-ONC attributes in case there
> > > > > > actually is something in the architectural outer-cacheable domain.
> > > > >
> > > > > But is it a problem to use INC-ONC attributes for the SMMU PTW on this
> > > > > chip? AIUI, the reason for the SMMU changes is to avoid the
> > > > > performance hit of snooping, which is more expensive than cache
> > > > > maintenance of SMMU page tables. So are you saying the by-VA cache
> > > > > maintenance is not relayed to this system cache, resulting in 

Re: use generic DMA mapping code in powerpc V4

2019-01-29 Thread Christian Zigotzky
Hi Christoph,

Thanks a lot for the updates. I will test the full branch tomorrow.

Cheers,
Christian

Sent from my iPhone

> On 29. Jan 2019, at 17:34, Christoph Hellwig  wrote:
> 
>> On Tue, Jan 29, 2019 at 05:14:11PM +0100, Christoph Hellwig wrote:
>>> On Tue, Jan 29, 2019 at 04:03:32PM +0100, Christian Zigotzky wrote:
>>> Hi Christoph,
>>> 
>>> I compiled kernels for the X5000 and X1000 from your new branch 
>>> 'powerpc-dma.6-debug.2' today. The kernels boot and the P.A. Semi Ethernet 
>>> works!
>> 
>> Thanks for testing!  I'll prepare a new series that adds the other
>> patches on top of this one.
> 
> And that was easier than I thought - we just had a few patches left
> in powerpc-dma.6, so I've rebased that branch on top of
> powerpc-dma.6-debug.2:
> 
>git://git.infradead.org/users/hch/misc.git powerpc-dma.6
> 
> Gitweb:
> 
>
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/powerpc-dma.6
> 
> I hope the other patches are simple enough, so just testing the full
> branch checkout should be fine for now.


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jason Gunthorpe
On Tue, Jan 29, 2019 at 07:08:06PM -0500, Jerome Glisse wrote:
> On Tue, Jan 29, 2019 at 11:02:25PM +, Jason Gunthorpe wrote:
> > On Tue, Jan 29, 2019 at 03:44:00PM -0500, Jerome Glisse wrote:
> > 
> > > > But this API doesn't seem to offer any control - I thought that
> > > > control was all coming from the mm/hmm notifiers triggering p2p_unmaps?
> > > 
> > > The control is within the driver implementation of those callbacks. 
> > 
> > Seems like what you mean by control is 'the exporter gets to choose
> > the physical address at the instant of map' - which seems reasonable
> > for GPU.
> > 
> > 
> > > will only allow p2p map to succeed for objects that have been tagged by 
> > > the
> > > userspace in some way ie the userspace application is in control of what
> > > can be map to peer device.
> > 
> > I would have thought this means the VMA for the object is created
> > without the map/unmap ops? Or are GPU objects and VMAs unrelated?
> 
> GPU objects and VMAs are unrelated in all the open source GPU drivers
> I am somewhat familiar with (AMD, Intel, NVidia). You can create a GPU
> object and never map it (and thus never have it associated with a
> vma) and in fact this is very common. For graphics you usually only
> have a handful of the hundreds of GPU objects your application has
> mapped.

I mean the other way: does every VMA with a p2p_map/unmap point to
exactly one GPU object?

ie I'm surprised you say that p2p_map needs to have policy, I would
have thought the policy is applied when the VMA is created (ie objects
that are not for p2p do not have p2p_map set), and even for GPU
p2p_map should really only have to do with window allocation and pure
'can I even do p2p' type functionality.

> Idea is that we can only ask exporter to be predictable and still allow
> them to fail if things are really going bad.

I think hot unplug / PCI error recovery is one of the 'really going
bad' cases..

> I think i put it in the comment above the ops but in any cases i should
> write something in documentation with example and thorough guideline.
> Note that there won't be any mmu notifier to mmap of a device file
> unless the device driver calls for it or there is a syscall like munmap
> or mremap or mprotect well any syscall that work on vma.

This is something we might need to explore, does calling
zap_vma_ptes() invoke enough notifiers that an MMU notifier or HMM
mirror consumer will release any p2p maps on that VMA?

> If we ever want to support full pin then we might have to add a
> flag so that the GPU driver can refuse an importer that wants things
> pinned forever.

This would become interesting for VFIO and RDMA at least - I don't
think VFIO has anything like SVA so it would want to import a p2p_map
and indicate that it will not respond to MMU notifiers.

GPU can refuse, but maybe RDMA would allow it...

Jason


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jason Gunthorpe
On Tue, Jan 29, 2019 at 06:17:43PM -0700, Logan Gunthorpe wrote:

> This isn't answering my question at all... I specifically asked what is
> backing the VMA when we are *not* using HMM.

At least for RDMA what backs the VMA today is non-struct-page BAR
memory filled in with io_remap_pfn.

And we want to expose this for P2P DMA. None of the HMM stuff applies
here and the p2p_map/unmap are a nice simple approach that covers all
the needs RDMA has, at least.

Every attempt to give BAR memory to struct page has run into major
trouble, IMHO, so I like that this approach avoids that.

And if you don't have struct page then the only kernel object left to
hang meta data off is the VMA itself.

It seems very similar to the existing P2P work between in-kernel
consumers, just that VMA is now mediating a general user space driven
discovery process instead of being hard wired into a driver.

Jason
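
For context, "non-struct-page BAR memory filled in with io_remap_pfn"
means an mmap handler along these lines; a generic sketch under stated
assumptions, not actual RDMA code:

#include <linux/mm.h>
#include <linux/pci.h>

/*
 * Map a PCI BAR into userspace with no struct page backing: the vma
 * ends up holding raw pfns, which is why the vma itself is the only
 * kernel object left to carry p2p metadata.
 */
static int bar_mmap_sketch(struct pci_dev *pdev, int bar,
                           struct vm_area_struct *vma)
{
        unsigned long pfn = pci_resource_start(pdev, bar) >> PAGE_SHIFT;
        unsigned long size = vma->vm_end - vma->vm_start;

        if (size > pci_resource_len(pdev, bar))
                return -EINVAL;

        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        return io_remap_pfn_range(vma, vma->vm_start, pfn, size,
                                  vma->vm_page_prot);
}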


Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-01-29 Thread Michael S. Tsirkin
On Wed, Jan 30, 2019 at 11:05:42AM +0800, Jason Wang wrote:
> 
> On 2019/1/30 10:36 AM, Michael S. Tsirkin wrote:
> > On Wed, Jan 30, 2019 at 10:24:01AM +0800, Jason Wang wrote:
> > On 2019/1/30 3:02 AM, Michael S. Tsirkin wrote:
> > > > On Tue, Jan 29, 2019 at 03:42:44PM -0200, Thiago Jung Bauermann wrote:
> > > > > Fixing address of powerpc mailing list.
> > > > > 
> > > > > Thiago Jung Bauermann  writes:
> > > > > 
> > > > > > Hello,
> > > > > > 
> > > > > > With Christoph's rework of the DMA API that recently landed, the 
> > > > > > patch
> > > > > > below is the only change needed in virtio to make it work in a POWER
> > > > > > secure guest under the ultravisor.
> > > > > > 
> > > > > > The other change we need (making sure the device's dma_map_ops is 
> > > > > > NULL
> > > > > > so that the dma-direct/swiotlb code is used) can be made in
> > > > > > powerpc-specific code.
> > > > > > 
> > > > > > Of course, I also have patches (soon to be posted as RFC) which 
> > > > > > hook up
> > > > > >  to the powerpc secure guest support code.
> > > > > > 
> > > > > > What do you think?
> > > > > > 
> > > > > >   From d0629a36a75c678b4a72b853f8f7f8c17eedd6b3 Mon Sep 17 00:00:00 
> > > > > > 2001
> > > > > > From: Thiago Jung Bauermann 
> > > > > > Date: Thu, 24 Jan 2019 22:08:02 -0200
> > > > > > Subject: [RFC PATCH] virtio_ring: Use DMA API if guest memory is 
> > > > > > encrypted
> > > > > > 
> > > > > > The host can't access the guest memory when it's encrypted, so using
> > > > > > regular memory pages for the ring isn't an option. Go through the 
> > > > > > DMA API.
> > > > > > 
> > > > > > Signed-off-by: Thiago Jung Bauermann 
> > > > Well I think this will come back to bite us (witness xen which is now
> > > > reworking precisely this path - but at least they aren't to blame, xen
> > > > came before ACCESS_PLATFORM).
> > > > 
> > > > I also still think the right thing would have been to set
> > > > ACCESS_PLATFORM for all systems where device can't access all memory.
> > > > 
> > > > But I also think I don't have the energy to argue about power secure
> > > > guest anymore.  So be it for power secure guest since the involved
> > > > engineers disagree with me.  Hey I've been wrong in the past ;).
> > > > 
> > > > But the name "sev_active" makes me scared because at least AMD guys who
> > > > were doing the sensible thing and setting ACCESS_PLATFORM (unless I'm
> > > > wrong? I remember distinctly that's so) will likely be affected too.
> > > > We don't want that.
> > > > 
> > > > So let's find a way to make sure it's just power secure guest for now
> > > > pls.
> > > > 
> > > > I also think we should add a dma_api near features under virtio_device
> > > > such that these hacks can move off data path.
> > > 
> > > Anyway the current Xen code is conflict with spec which said:
> > > 
> > > "If this feature bit is set to 0, then the device has same access to 
> > > memory
> > > addresses supplied to it as the driver has. In particular, the device will
> > > always use physical addresses matching addresses used by the driver
> > > (typically meaning physical addresses used by the CPU) and not translated
> > > further, and can access any address supplied to it by the driver. When
> > > clear, this overrides any platform-specific description of whether device
> > > access is limited or translated in any way, e.g. whether an IOMMU may be
> > > present. "
> > > 
> > > I wonder how much value the above description can give us. It's kind of
> > > odd that the behavior of "when the feature is not negotiated" is described
> > > in the spec.
> > Hmm what's odd about it? We need to describe the behaviour is all cases.
> 
> 
> Well, trying to limit the behavior of a 'legacy' driver is tricky or even
> impossible. Xen is an exact example of this.

So don't. Xen got grand-fathered in because when that came
along we thought it's a one off thing. Was easier to just
add that as a special case. But really >99% of people
have a hypervisor device with direct guest memory access.
All else is esoterica.

> 
> > 
> > > Personally I think we can remove the above and then we can
> > > switch to use DMA API unconditionally in guest driver. It may have single
> > > digit regression probably, we can try to overcome it.
> > > 
> > > Thanks
> > This has been discussed ad nauseum. virtio is all about compatibility.
> > Losing a couple of lines of code isn't worth breaking working setups.
> > People that want "just use DMA API no tricks" now have the option.
> > Setting a flag in a feature bit map is literally a single line
> > of code in the hypervisor. So stop pushing for breaking working
> > legacy setups and just fix it in the right place.
> 
> 
> I may be missing something: which kind of legacy setup is broken? Do you mean
> using virtio without IOMMU_PLATFORM on a platform with an IOMMU? We actually
> unbreak this setup.
> 
> Thanks


Legacy setups by definition are working setups.  The rules are pretty
simple. By default virtio 

Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-01-29 Thread Jason Wang


On 2019/1/30 10:36 AM, Michael S. Tsirkin wrote:

On Wed, Jan 30, 2019 at 10:24:01AM +0800, Jason Wang wrote:

On 2019/1/30 3:02 AM, Michael S. Tsirkin wrote:

On Tue, Jan 29, 2019 at 03:42:44PM -0200, Thiago Jung Bauermann wrote:

Fixing address of powerpc mailing list.

Thiago Jung Bauermann  writes:


Hello,

With Christoph's rework of the DMA API that recently landed, the patch
below is the only change needed in virtio to make it work in a POWER
secure guest under the ultravisor.

The other change we need (making sure the device's dma_map_ops is NULL
so that the dma-direct/swiotlb code is used) can be made in
powerpc-specific code.

Of course, I also have patches (soon to be posted as RFC) which hook up
 to the powerpc secure guest support code.

What do you think?

  From d0629a36a75c678b4a72b853f8f7f8c17eedd6b3 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann 
Date: Thu, 24 Jan 2019 22:08:02 -0200
Subject: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

The host can't access the guest memory when it's encrypted, so using
regular memory pages for the ring isn't an option. Go through the DMA API.

Signed-off-by: Thiago Jung Bauermann 

Well I think this will come back to bite us (witness xen which is now
reworking precisely this path - but at least they aren't to blame, xen
came before ACCESS_PLATFORM).

I also still think the right thing would have been to set
ACCESS_PLATFORM for all systems where device can't access all memory.

But I also think I don't have the energy to argue about power secure
guest anymore.  So be it for power secure guest since the involved
engineers disagree with me.  Hey I've been wrong in the past ;).

But the name "sev_active" makes me scared because at least AMD guys who
were doing the sensible thing and setting ACCESS_PLATFORM (unless I'm
wrong? I remember distinctly that's so) will likely be affected too.
We don't want that.

So let's find a way to make sure it's just power secure guest for now
pls.

I also think we should add a dma_api near features under virtio_device
such that these hacks can move off data path.


Anyway the current Xen code is conflict with spec which said:

"If this feature bit is set to 0, then the device has same access to memory
addresses supplied to it as the driver has. In particular, the device will
always use physical addresses matching addresses used by the driver
(typically meaning physical addresses used by the CPU) and not translated
further, and can access any address supplied to it by the driver. When
clear, this overrides any platform-specific description of whether device
access is limited or translated in any way, e.g. whether an IOMMU may be
present. "

I wonder how much value the above description can give us. It's kind of
odd that the behavior of "when the feature is not negotiated" is described
in the spec.

Hmm what's odd about it? We need to describe the behaviour is all cases.



Well, trying to limit the behavior of a 'legacy' driver is tricky or even
impossible. Xen is an exact example of this.






Personally I think we can remove the above and then we can
switch to use DMA API unconditionally in guest driver. It may have single
digit regression probably, we can try to overcome it.

Thanks

This has been discussed ad nauseum. virtio is all about compatibility.
Losing a couple of lines of code isn't worth breaking working setups.
People that want "just use DMA API no tricks" now have the option.
Setting a flag in a feature bit map is literally a single line
of code in the hypervisor. So stop pushing for breaking working
legacy setups and just fix it in the right place.



I may be missing something: which kind of legacy setup is broken? Do you mean
using virtio without IOMMU_PLATFORM on a platform with an IOMMU? We actually
unbreak this setup.


Thanks





By the way could you please respond about virtio-iommu and
why there's no support for ACCESS_PLATFORM on power?

I have Cc'd you on these discussions.


Thanks!



---
   drivers/virtio/virtio_ring.c | 5 -
   1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index cd7e755484e3..321a27075380 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -259,8 +259,11 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
 * not work without an even larger kludge.  Instead, enable
 * the DMA API if we're a Xen guest, which at least allows
 * all of the sensible Xen configurations to work correctly.
+*
+* Also, if guest memory is encrypted the host can't access
+* it directly. In this case, we'll need to use the DMA API.
 */
-   if (xen_domain())
+   if (xen_domain() || sev_active())
return true;

return false;

--
Thiago Jung Bauermann
IBM Linux Technology Center

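
To make the point of contention concrete: the whole argument in this
thread is about when the predicate below may return true. A simplified
sketch of the decision; the real function lives in
drivers/virtio/virtio_ring.c, and the feature-bit check here is a
paraphrase of its logic, not the exact upstream code:

#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/mem_encrypt.h>
#include <xen/xen.h>

static bool use_dma_api_sketch(const struct virtio_device *vdev)
{
        /* Device declared that addresses it sees may be translated. */
        if (virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM))
                return true;

        /* Grandfathered-in platform quirk predating ACCESS_PLATFORM. */
        if (xen_domain())
                return true;

        /* The RFC's addition: encrypted guest memory is opaque to the
         * host, so bounce through the DMA API (swiotlb). */
        if (sev_active())
                return true;

        /* Legacy default: hand the device raw guest physical addresses. */
        return false;
}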

Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 06:17:43PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 4:47 p.m., Jerome Glisse wrote:
> > The whole point is to allow use of device memory for ranges of virtual
> > address of a process when it does make sense to use device memory for
> > that range. So there are multiple cases where it does make sense:
> > [1] - Only the device is accessing the range and there is no CPU access
> >   For instance the program is executing/running a big function on
> >   the GPU and there is no concurrent CPU access, this is very
> >   common in all the existing GPGPU code. In fact AFAICT it is the
> >   most common pattern. So here you can use HMM private or public
> >   memory.
> > [2] - Both device and CPU access a common range of virtual address
> >   concurrently. In that case if you are on a platform with cache
> >   coherent inter-connect like OpenCAPI or CCIX then you can use
> >   HMM public device memory and have both access the same memory.
> >   You can not use HMM private memory.
> > 
> > So far on x86 we only have PCIE and thus so far on x86 we only have
> > private HMM device memory that is not accessible by the CPU in any
> > way.
> 
> I feel like you're just moving the rug out from under us... Before you
> said ignore HMM and I was asking about the use case that wasn't using
> HMM and how it works without HMM. In response, you just give me *way*
> too much information describing HMM. And still, as best as I can see,
> managing DMA mappings (which is different from the userspace mappings)
> for GPU P2P should be handled by HMM and the userspace mappings should
> *just* link VMAs to HMM pages using the standard infrastructure we
> already have.

For HMM P2P mapping we need to call into the driver to know if the driver
wants to fall back to main memory (running out of BAR addresses) or if
it can allow a peer device to directly access its memory. We also need
the call to the exporting device driver as only the exporting device driver
can map the HMM page pfn to some physical BAR address (which would be
allocated by the driver for the GPU).

I wanted to make sure the HMM case was understood too, sorry if it
caused confusion with the non HMM case which I describe below.


> >> And what struct pages are actually going to be backing these VMAs if
> >> it's not using HMM?
> > 
> > When you have some range of virtual address migrated to HMM private
> > memory then the CPU ptes are special swap entries and they behave just
> > as if the memory was swapped to disk. So CPU access to those will
> > fault and trigger a migration back to main memory.
> 
> This isn't answering my question at all... I specifically asked what is
> backing the VMA when we are *not* using HMM.

So when you are not using HMM, ie an existing GPU object without HMM,
then like I said you do not have any valid pte most of the time inside
the CPU page table: the GPU driver only populates the pte with a valid
entry when there is a CPU page fault, and it clears those as soon as the
corresponding object is used by the GPU. In fact some drivers also unmap
it aggressively from the BAR, making the memory totally inaccessible to
anything but the GPU.

GPU drivers do not like CPU mappings, they are quite aggressive about
clearing them. Then everything I said about having userspace decide
which objects can be shared, and with whom, does apply here. So for GPUs you
do want to give control to the GPU driver and you do not want to require
valid CPU ptes for the vma, so that the exporting driver can return a valid
address to the importing peer device only.

Also the exporting device driver might decide to fall back to main memory
(running out of BAR addresses for instance). So again here we want to
go through the exporting device driver so that it can take the right
action.

So the expected pattern (for GPU drivers) is:
- no valid pte for the special vma (mmap of a device file)
- importing device calls p2p_map() for the vma; if it succeeds the
  first time then we expect it will succeed for the same vma and
  range next time we call it.
- exporting driver can either return physical addresses of pages
  in its BAR space that point to the correct device memory, or
  fall back to main memory

Then at any point in time:
- if the GPU driver wants to move the object around (for whatever
  reason) it calls zap_vma_ptes(); the fact that there is no
  valid CPU pte does not matter, it will call the mmu notifiers and thus
  any importing device driver will invalidate its mapping
- an importing device driver that lost the mapping due to mmu
  notification can re-map by re-calling p2p_map() (it should
  check that the vma is still valid ...) and the guideline is for
  the exporting device driver to succeed and return a valid
  address for the new memory used for the object

This allows device drivers like GPU to keep control. The expected
pattern is still for the p2p mapping to stay undisrupted for its
whole lifetime. Invalidation 

Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-01-29 Thread Michael S. Tsirkin
On Wed, Jan 30, 2019 at 10:24:01AM +0800, Jason Wang wrote:
> 
> On 2019/1/30 3:02 AM, Michael S. Tsirkin wrote:
> > On Tue, Jan 29, 2019 at 03:42:44PM -0200, Thiago Jung Bauermann wrote:
> > > Fixing address of powerpc mailing list.
> > > 
> > > Thiago Jung Bauermann  writes:
> > > 
> > > > Hello,
> > > > 
> > > > With Christoph's rework of the DMA API that recently landed, the patch
> > > > below is the only change needed in virtio to make it work in a POWER
> > > > secure guest under the ultravisor.
> > > > 
> > > > The other change we need (making sure the device's dma_map_ops is NULL
> > > > so that the dma-direct/swiotlb code is used) can be made in
> > > > powerpc-specific code.
> > > > 
> > > > Of course, I also have patches (soon to be posted as RFC) which hook up
> > > >  to the powerpc secure guest support code.
> > > > 
> > > > What do you think?
> > > > 
> > > >  From d0629a36a75c678b4a72b853f8f7f8c17eedd6b3 Mon Sep 17 00:00:00 2001
> > > > From: Thiago Jung Bauermann 
> > > > Date: Thu, 24 Jan 2019 22:08:02 -0200
> > > > Subject: [RFC PATCH] virtio_ring: Use DMA API if guest memory is 
> > > > encrypted
> > > > 
> > > > The host can't access the guest memory when it's encrypted, so using
> > > > regular memory pages for the ring isn't an option. Go through the DMA 
> > > > API.
> > > > 
> > > > Signed-off-by: Thiago Jung Bauermann 
> > Well I think this will come back to bite us (witness xen which is now
> > reworking precisely this path - but at least they aren't to blame, xen
> > came before ACCESS_PLATFORM).
> > 
> > I also still think the right thing would have been to set
> > ACCESS_PLATFORM for all systems where device can't access all memory.
> > 
> > But I also think I don't have the energy to argue about power secure
> > guest anymore.  So be it for power secure guest since the involved
> > engineers disagree with me.  Hey I've been wrong in the past ;).
> > 
> > But the name "sev_active" makes me scared because at least AMD guys who
> > were doing the sensible thing and setting ACCESS_PLATFORM (unless I'm
> > wrong? I remember distinctly that's so) will likely be affected too.
> > We don't want that.
> > 
> > So let's find a way to make sure it's just power secure guest for now
> > pls.
> > 
> > I also think we should add a dma_api near features under virtio_device
> > such that these hacks can move off data path.
> 
> 
> Anyway the current Xen code is conflict with spec which said:
> 
> "If this feature bit is set to 0, then the device has same access to memory
> addresses supplied to it as the driver has. In particular, the device will
> always use physical addresses matching addresses used by the driver
> (typically meaning physical addresses used by the CPU) and not translated
> further, and can access any address supplied to it by the driver. When
> clear, this overrides any platform-specific description of whether device
> access is limited or translated in any way, e.g. whether an IOMMU may be
> present. "
> 
> I wonder how much value the above description can give us. It's kind of
> odd that the behavior of "when the feature is not negotiated" is described
> in the spec.

Hmm what's odd about it? We need to describe the behaviour is all cases.

> Personally I think we can remove the above and then we can
> switch to use DMA API unconditionally in guest driver. It may have single
> digit regression probably, we can try to overcome it.
> 
> Thanks

This has been discussed ad nauseum. virtio is all about compatibility.
Losing a couple of lines of code isn't worth breaking working setups.
People that want "just use DMA API no tricks" now have the option.
Setting a flag in a feature bit map is literally a single line
of code in the hypervisor. So stop pushing for breaking working
legacy setups and just fix it in the right place.

> 
> > 
> > By the way could you please respond about virtio-iommu and
> > why there's no support for ACCESS_PLATFORM on power?
> > 
> > I have Cc'd you on these discussions.
> > 
> > 
> > Thanks!
> > 
> > 
> > > > ---
> > > >   drivers/virtio/virtio_ring.c | 5 -
> > > >   1 file changed, 4 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index cd7e755484e3..321a27075380 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -259,8 +259,11 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
> > > >  * not work without an even larger kludge.  Instead, enable
> > > >  * the DMA API if we're a Xen guest, which at least allows
> > > >  * all of the sensible Xen configurations to work correctly.
> > > > +*
> > > > +* Also, if guest memory is encrypted the host can't access
> > > > +* it directly. In this case, we'll need to use the DMA API.
> > > >  */
> > > > -   if (xen_domain())
> > > > +   if (xen_domain() || sev_active())
> > > >   

Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-01-29 Thread Jason Wang


On 2019/1/30 3:02 AM, Michael S. Tsirkin wrote:

On Tue, Jan 29, 2019 at 03:42:44PM -0200, Thiago Jung Bauermann wrote:

Fixing address of powerpc mailing list.

Thiago Jung Bauermann  writes:


Hello,

With Christoph's rework of the DMA API that recently landed, the patch
below is the only change needed in virtio to make it work in a POWER
secure guest under the ultravisor.

The other change we need (making sure the device's dma_map_ops is NULL
so that the dma-direct/swiotlb code is used) can be made in
powerpc-specific code.

Of course, I also have patches (soon to be posted as RFC) which hook up
 to the powerpc secure guest support code.

What do you think?

 From d0629a36a75c678b4a72b853f8f7f8c17eedd6b3 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann 
Date: Thu, 24 Jan 2019 22:08:02 -0200
Subject: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

The host can't access the guest memory when it's encrypted, so using
regular memory pages for the ring isn't an option. Go through the DMA API.

Signed-off-by: Thiago Jung Bauermann 

Well I think this will come back to bite us (witness xen which is now
reworking precisely this path - but at least they aren't to blame, xen
came before ACCESS_PLATFORM).

I also still think the right thing would have been to set
ACCESS_PLATFORM for all systems where device can't access all memory.

But I also think I don't have the energy to argue about power secure
guest anymore.  So be it for power secure guest since the involved
engineers disagree with me.  Hey I've been wrong in the past ;).

But the name "sev_active" makes me scared because at least AMD guys who
were doing the sensible thing and setting ACCESS_PLATFORM (unless I'm
wrong? I remember distinctly that's so) will likely be affected too.
We don't want that.

So let's find a way to make sure it's just power secure guest for now
pls.

I also think we should add a dma_api near features under virtio_device
such that these hacks can move off data path.



Anyway the current Xen code is conflict with spec which said:

"If this feature bit is set to 0, then the device has same access to 
memory addresses supplied to it as the driver has. In particular, the 
device will always use physical addresses matching addresses used by the 
driver (typically meaning physical addresses used by the CPU) and not 
translated further, and can access any address supplied to it by the 
driver. When clear, this overrides any platform-specific description of 
whether device access is limited or translated in any way, e.g. whether 
an IOMMU may be present. "


I wonder how much value the above description can give us. It's 
kind of odd that the behavior of "when the feature is not negotiated" is 
described in the spec. Personally I think we can remove the above and 
then we can switch to use DMA API unconditionally in guest driver. It 
may have single digit regression probably, we can try to overcome it.


Thanks




By the way could you please respond about virtio-iommu and
why there's no support for ACCESS_PLATFORM on power?

I have Cc'd you on these discussions.


Thanks!



---
  drivers/virtio/virtio_ring.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index cd7e755484e3..321a27075380 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -259,8 +259,11 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
 * not work without an even larger kludge.  Instead, enable
 * the DMA API if we're a Xen guest, which at least allows
 * all of the sensible Xen configurations to work correctly.
+*
+* Also, if guest memory is encrypted the host can't access
+* it directly. In this case, we'll need to use the DMA API.
 */
-   if (xen_domain())
+   if (xen_domain() || sev_active())
return true;

return false;


--
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 4:47 p.m., Jerome Glisse wrote:
> The whole point is to allow use of device memory for ranges of virtual
> address of a process when it does make sense to use device memory for
> that range. So there are multiple cases where it does make sense:
> [1] - Only the device is accessing the range and there is no CPU access
>   For instance the program is executing/running a big function on
>   the GPU and there is no concurrent CPU access, this is very
>   common in all the existing GPGPU code. In fact AFAICT it is the
>   most common pattern. So here you can use HMM private or public
>   memory.
> [2] - Both device and CPU access a common range of virtual address
>   concurrently. In that case if you are on a platform with cache
>   coherent inter-connect like OpenCAPI or CCIX then you can use
>   HMM public device memory and have both access the same memory.
>   You can not use HMM private memory.
> 
> So far on x86 we only have PCIE and thus so far on x86 we only have
> private HMM device memory that is not accessible by the CPU in any
> way.

I feel like you're just moving the rug out from under us... Before you
said ignore HMM and I was asking about the use case that wasn't using
HMM and how it works without HMM. In response, you just give me *way*
too much information describing HMM. And still, as best as I can see,
managing DMA mappings (which is different from the userspace mappings)
for GPU P2P should be handled by HMM and the userspace mappings should
*just* link VMAs to HMM pages using the standard infrastructure we
already have.

>> And what struct pages are actually going to be backing these VMAs if
>> it's not using HMM?
> 
> When you have some range of virtual address migrated to HMM private
> memory then the CPU ptes are special swap entries and they behave just
> as if the memory was swapped to disk. So CPU access to those will
> fault and trigger a migration back to main memory.

This isn't answering my question at all... I specifically asked what is
backing the VMA when we are *not* using HMM.

Logan


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 11:02:25PM +, Jason Gunthorpe wrote:
> On Tue, Jan 29, 2019 at 03:44:00PM -0500, Jerome Glisse wrote:
> 
> > > But this API doesn't seem to offer any control - I thought that
> > > control was all coming from the mm/hmm notifiers triggering p2p_unmaps?
> > 
> > The control is within the driver implementation of those callbacks. 
> 
> Seems like what you mean by control is 'the exporter gets to choose
> the physical address at the instant of map' - which seems reasonable
> for GPU.
> 
> 
> > will only allow p2p map to succeed for objects that have been tagged by the
> > userspace in some way ie the userspace application is in control of what
> > can be map to peer device.
> 
> I would have thought this means the VMA for the object is created
> without the map/unmap ops? Or are GPU objects and VMAs unrelated?

GPU objects and VMAs are unrelated in all the open source GPU drivers
I am somewhat familiar with (AMD, Intel, NVidia). You can create a GPU
object and never map it (and thus never have it associated with a
vma) and in fact this is very common. For graphics you usually only
have a handful of the hundreds of GPU objects your application has
mapped.

The control for peer to peer can also be a mutable property of the
object, ie userspace does an ioctl on the GPU driver which creates an
object; some time after the object is created the userspace does other
ioctls to allow exporting the object to another specific device. Again
this results in an ioctl to the device driver; those ioctls set flags and
update the GPU object kernel structure with all the info.

In the meantime you have no control over when another driver might call
the vma p2p callbacks. So you must have registered the vma with
vm_operations that include the p2p_map and p2p_unmap. Those driver
functions will check the object kernel structure each time they get
called and act accordingly.



> > For moving things around after a successful p2p_map yes the exporting
> > device have to call for instance zap_vma_ptes() or something
> > similar.
> 
> Okay, great, RDMA needs this flow for hotplug - we zap the VMA's when
> unplugging the PCI device and we can delay the PCI unplug completion
> until all the p2p_unmaps are called...
> 
> But in this case a future p2p_map will have to fail as the BAR no
> longer exists. How to handle this?

So the comment above the callback (I should write more thorough guidelines
and documentation) states that the exporter should/(must?) be predictable:
if an importer device calls p2p_map() once on a vma and it does succeed,
then if the same device calls p2p_map() again on the same vma, and the
vma is still valid (ie no unmap and it does not correspond to a different
object ...), then the p2p_map() should/(must?) succeed.

The idea is that the importer would do a first call to p2p_map() when it
sets up its own object, and report failure to userspace if that fails. If it
does succeed then we should never have an issue next time we call p2p_map()
(after the mapping was invalidated by an mmu notifier for instance). So it will
succeed just like the first call (again assuming the vma is still valid).

The idea is that we can only ask the exporter to be predictable and still allow
it to fail if things are really going bad.


> > > I would think that the importing driver can assume the BAR page is
> > > kept alive until it calls unmap (presumably triggered by notifiers)?
> > > 
> > > ie the exporting driver sees the BAR page as pinned until unmap.
> > 
> > The intention with this patchset is that it is not a pin, ie the importer
> > device _must_ abide by all mmu notifier invalidations and they can
> > happen at any time. The importing device can however re-p2p_map the
> > same range after an invalidation.
> >
> > I would like to restrict this to importers that can invalidate for
> > now because I believe all the first devices to use it can support the
> > invalidation.
> 
> This seems reasonable (and sort of says importers not getting this
> from HMM need careful checking), was this in the comment above the
> ops?

I think I put it in the comment above the ops but in any case I should
write something in documentation with examples and thorough guidelines.
Note that there won't be any mmu notifier for an mmap of a device file
unless the device driver calls for it or there is a syscall like munmap
or mremap or mprotect, well, any syscall that works on the vma.

So assuming neither the application nor the driver is doing something
stupid, the result of p2p_map can stay valid until the importer
is done and calls p2p_unmap of its own free will. This is what I
expect here. But for GPUs I would still like to allow the GPU driver
to evict (and thus invalidate importer mappings) to main memory or
defragment its BAR address space if the GPU driver feels a pressing
need to do so.

If we ever want to support full pin then we might have to add a
flag so that the GPU driver can refuse an importer that wants things
pinned forever.

Cheers,
Jérôme
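
Restating the contract described above as code may help. This is
illustrative only: the RFC hangs these callbacks off vm_operations_struct,
and the prototypes below are assumptions made for the example, not the
actual patch:

#include <linux/mm.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>

struct p2p_vma_ops_sketch {
        /*
         * Importer asks the exporter to produce DMA addresses for
         * [start, end) of the vma. Must be "predictable": if it
         * succeeds once on a still-valid vma, a later call on the same
         * range should succeed too (possibly backed by different BAR
         * pages, or by a fallback to main memory).
         */
        int (*p2p_map)(struct vm_area_struct *vma, struct device *importer,
                       unsigned long start, unsigned long end,
                       dma_addr_t *dma_addrs);

        /*
         * Called when the importer is done, or after the exporter has
         * invalidated the range, e.g. via zap_vma_ptes(), which fires
         * the mmu notifiers the importer must obey.
         */
        void (*p2p_unmap)(struct vm_area_struct *vma, struct device *importer,
                          unsigned long start, unsigned long end);
};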

Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 03:58:45PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 2:50 p.m., Jerome Glisse wrote:
> > No, this is the non-HMM case I am talking about here. Fully ignore HMM
> > in this frame. A GPU driver that does not support or use HMM in any way
> > has all the properties and requirements I listed above. So all the points
> > I was making are without HMM in the picture whatsoever. I should have
> > posted this as separate patches to avoid this confusion.
> > 
> > Regarding your HMM question: you can not map HMM pages; any code path
> > that tried would trigger a migration back to regular memory
> > and would use the regular memory for CPU access.
> > 
> 
> I thought this was the whole point of HMM... And eventually it would
> support being able to map the pages through the BAR in cooperation with
> the driver. If not, what's that whole layer for? Why not just have HMM
> handle this situation?

The whole point is to allow to use device memory for range of virtual
address of a process when it does make sense to use device memory for
that range. So they are multiple cases where it does make sense:
[1] - Only the device is accessing the range and there is no CPU access.
  For instance the program is executing/running a big function on
  the GPU and there is no concurrent CPU access; this is very
  common in all the existing GPGPU code. In fact AFAICT it is the
  most common pattern. So here you can use HMM private or public
  memory.
[2] - Both device and CPU access a common range of virtual addresses
  concurrently. In that case, if you are on a platform with a cache
  coherent inter-connect like OpenCAPI or CCIX, then you can use
  HMM public device memory and have both access the same memory.
  You can not use HMM private memory.

So far on x86 we only have PCIE, and thus so far on x86 we only have
private HMM device memory, which is not accessible by the CPU in any
way.

That does not make the memory useless, far from it. Having only the
device work on the dataset while the CPU is either waiting or accessing
something else is very common.


Then HMM is a toolbox, so here are some of the tools:
HMM mirror - helper to mirror a process address space onto a device,
ie this is SVM (Shared Virtual Memory)/SVA (Shared Virtual Address)
in software.

HMM private memory - allows registering device memory with the Linux
kernel. The memory is not CPU accessible. The memory is fully managed
by the device driver. What and when to migrate is under the control
of the device driver (see the registration sketch after this list).

HMM public memory - allows registering device memory with the Linux
kernel. The memory must be CPU accessible, cache coherent, and it must
abide by the platform memory model. The memory is fully managed by
the device driver, because otherwise it would disrupt the device
driver operation (for instance a GPU can also be used for graphics).

Migration - helper to perform migration to and from device memory.
It does not make any decision itself; it just performs all the
steps in the right order and calls back into the driver to get the
migration going.
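
As a minimal registration sketch for the private memory case, assuming
the HMM API as it stands today (~v5.0); the gpu_* callbacks and
GPU_VRAM_SIZE are illustrative only:

static const struct hmm_devmem_ops gpu_devmem_ops = {
        /* called when a device private page is released */
        .free  = gpu_devmem_free,
        /* CPU touched a device private page: the driver migrates it
         * back to system memory so the CPU access can proceed */
        .fault = gpu_devmem_fault,
};

devmem = hmm_devmem_add(&gpu_devmem_ops, &pdev->dev, GPU_VRAM_SIZE);
if (IS_ERR(devmem))
        return PTR_ERR(devmem);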

It is up to the device driver to implement heuristics and provide a
userspace API to control memory migration to and from device memory.
For device private memory, on a CPU page fault the kernel will force a
migration back to system memory so that the CPU can access the memory.
What matters here is that the memory model of the platform stays intact,
and thus you can safely use CPU atomic operations or rely on your
platform memory model for your program. Note that long term I would like
to define a common API to expose to userspace to manage memory binding
to specific device memory, so that we can mix and match multiple device
memories into a single process and define policy too.

Also, CPU atomic instructions to a PCIE BAR give _undefined_ results; in
fact on some AMD/Intel platforms it leads to weirdness/crashes/freezes.
So obviously we can not map a PCIE BAR to the CPU without breaking the
memory model. Moreover, on PCIE you might not be able to resize the BAR
to expose all the device memory. GPUs can have several gigabytes of
memory, and not all of them support PCIE BAR resize; sometimes PCIE BAR
resize does not work either, because of BIOS/firmware issues or simply
because you are running out of IO space.

So on x86 we are stuck with HMM private memory; I am hoping that some
day in the future we will have CCIX or something similar. But for now
we have to work with what we have.

> And what struct pages are actually going to be backing these VMAs if
> it's not using HMM?

When you have a range of virtual addresses migrated to HMM private
memory, the CPU ptes are special swap entries and they behave just
as if the memory was swapped to disk. So CPU accesses to those will
fault and trigger a migration back to main memory.
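
Roughly, the core fault path does something like the following (a sketch
of the current swapops API, not code from this patchset):

        swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);

        if (is_device_private_entry(entry))
                /* core mm can not map this page for the CPU; call back
                 * into the owning driver, which migrates the data to
                 * system memory and lets the fault retry on a regular
                 * page */
                return device_private_entry_fault(vmf->vma, vmf->address,
                                                  entry, vmf->flags,
                                                  vmf->pmd);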

We still want to allow peer to peer to exist when using HMM memory
for a range of virtual addresses (of a vma that is not 

Re: [RFC v3 02/21] iommu: Introduce cache_invalidate API

2019-01-29 Thread Alex Williamson
On Fri, 25 Jan 2019 17:49:20 +0100
Auger Eric  wrote:

> Hi Alex,
> 
> On 1/11/19 10:30 PM, Alex Williamson wrote:
> > On Tue,  8 Jan 2019 11:26:14 +0100
> > Eric Auger  wrote:
> >   
> >> From: "Liu, Yi L" 
> >>
> >> In any virtualization use case, when the first translation stage
> >> is "owned" by the guest OS, the host IOMMU driver has no knowledge
> >> of caching structure updates unless the guest invalidation activities
> >> are trapped by the virtualizer and passed down to the host.
> >>
> >> Since the invalidation data are obtained from user space and will be
> >> written into the physical IOMMU, we must allow security checks at
> >> various layers. Therefore, a generic invalidation data format is
> >> proposed here; model specific IOMMU drivers need to convert it into
> >> their own format.
> >>
> >> Signed-off-by: Liu, Yi L 
> >> Signed-off-by: Jean-Philippe Brucker 
> >> Signed-off-by: Jacob Pan 
> >> Signed-off-by: Ashok Raj 
> >> Signed-off-by: Eric Auger 
> >>
> >> ---
> >> v1 -> v2:
> >> - add arch_id field
> >> - renamed tlb_invalidate into cache_invalidate as this API allows
> >>   to invalidate context caches on top of IOTLBs
> >>
> >> v1:
> >> renamed sva_invalidate into tlb_invalidate and add iommu_ prefix in
> >> header. Commit message reworded.
> >> ---
> >>  drivers/iommu/iommu.c  | 14 ++
> >>  include/linux/iommu.h  | 14 ++
> >>  include/uapi/linux/iommu.h | 95 ++
> >>  3 files changed, 123 insertions(+)
> >>
> >> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> >> index 0f2b7f1fc7c8..b2e248770508 100644
> >> --- a/drivers/iommu/iommu.c
> >> +++ b/drivers/iommu/iommu.c
> >> @@ -1403,6 +1403,20 @@ int iommu_set_pasid_table(struct iommu_domain *domain,
> >>  }
> >>  EXPORT_SYMBOL_GPL(iommu_set_pasid_table);
> >>  
> >> +int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
> >> + struct iommu_cache_invalidate_info *inv_info)
> >> +{
> >> +  int ret = 0;
> >> +
> >> +  if (unlikely(!domain->ops->cache_invalidate))
> >> +  return -ENODEV;
> >> +
> >> +  ret = domain->ops->cache_invalidate(domain, dev, inv_info);
> >> +
> >> +  return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(iommu_cache_invalidate);
> >> +
> >>  static void __iommu_detach_device(struct iommu_domain *domain,
> >>  struct device *dev)
> >>  {
> >> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> >> index 1da2a2357ea4..96d59886f230 100644
> >> --- a/include/linux/iommu.h
> >> +++ b/include/linux/iommu.h
> >> @@ -186,6 +186,7 @@ struct iommu_resv_region {
> >>   * @of_xlate: add OF master IDs to iommu grouping
> >>   * @pgsize_bitmap: bitmap of all possible supported page sizes
> >>   * @set_pasid_table: set pasid table
> >> + * @cache_invalidate: invalidate translation caches
> >>   */
> >>  struct iommu_ops {
> >>bool (*capable)(enum iommu_cap);
> >> @@ -231,6 +232,9 @@ struct iommu_ops {
> >>int (*set_pasid_table)(struct iommu_domain *domain,
> >>   struct iommu_pasid_table_config *cfg);
> >>  
> >> +  int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev,
> >> +  struct iommu_cache_invalidate_info *inv_info);
> >> +
> >>unsigned long pgsize_bitmap;
> >>  };
> >>  
> >> @@ -294,6 +298,9 @@ extern void iommu_detach_device(struct iommu_domain *domain,
> >>struct device *dev);
> >>  extern int iommu_set_pasid_table(struct iommu_domain *domain,
> >> struct iommu_pasid_table_config *cfg);
> >> +extern int iommu_cache_invalidate(struct iommu_domain *domain,
> >> +  struct device *dev,
> >> +  struct iommu_cache_invalidate_info *inv_info);
> >>  extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
> >>  extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
> >>  extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
> >> @@ -709,6 +716,13 @@ int iommu_set_pasid_table(struct iommu_domain *domain,
> >>  {
> >>return -ENODEV;
> >>  }
> >> +static inline int
> >> +iommu_cache_invalidate(struct iommu_domain *domain,
> >> + struct device *dev,
> >> + struct iommu_cache_invalidate_info *inv_info)
> >> +{
> >> +  return -ENODEV;
> >> +}
> >>  
> >>  #endif /* CONFIG_IOMMU_API */
> >>  
> >> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> >> index 7a7cf7a3de7c..4605f5cfac84 100644
> >> --- a/include/uapi/linux/iommu.h
> >> +++ b/include/uapi/linux/iommu.h
> >> @@ -47,4 +47,99 @@ struct iommu_pasid_table_config {
> >>};
> >>  };
> >>  
> >> +/**
> >> + * enum iommu_inv_granularity - Generic invalidation granularity
> >> + * @IOMMU_INV_GRANU_DOMAIN_ALL_PASID: TLB entries or PASID caches of all
> >> + *PASIDs associated with a domain ID

Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 2:50 p.m., Jerome Glisse wrote:
> No, this is the non-HMM case I am talking about here. Fully ignore HMM
> in this frame. A GPU driver that does not support or use HMM in any way
> has all the properties and requirements I list above. So all the points
> I was making are without HMM in the picture whatsoever. I should have
> posted this as separate patches to avoid the confusion.
> 
> Regarding your HMM question: you can not map HMM pages; any code path
> that tried would trigger a migration back to regular memory, and the
> regular memory would be used for CPU access.
> 

I thought this was the whole point of HMM... And eventually it would
support being able to map the pages through the BAR in cooperation with
the driver. If not, what's that whole layer for? Why not just have HMM
handle this situation?

And what struct pages are actually going to be backing these VMAs if
it's not using HMM?


> Again, HMM has nothing to do here; ignore HMM, it does not play any role
> and it is not involved in any way here. GPUs want to control what objects
> they allow other devices to access and what objects they do not. GPU
> drivers _constantly_ invalidate the CPU page table, and in fact the CPU
> page table does not have any valid pte for a vma that is an mmap of a GPU
> device file for most of the vma's lifetime. Changing that would highly
> disrupt and break GPU drivers. They need to control that; they need to
> control what to do if another device tries to peer map some of their
> memory. Hence they need to implement the callback and decide whether or
> not they allow the peer mapping or use device memory for it (they can
> decide to fall back to main memory).

But mapping is an operation on the memory/struct pages behind the VMA,
not on the VMA itself, and I think that's evident from the code: the
only way the VMA layer is involved is the fact that you're abusing
vm_ops by adding new ops there and having them called by other layers.

Logan


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jason Gunthorpe
On Tue, Jan 29, 2019 at 03:44:00PM -0500, Jerome Glisse wrote:

> > But this API doesn't seem to offer any control - I thought that
> > control was all coming from the mm/hmm notifiers triggering p2p_unmaps?
> 
> The control is within the driver implementation of those callbacks. 

Seems like what you mean by control is 'the exporter gets to choose
the physical address at the instant of map' - which seems reasonable
for GPU.


> will only allow a p2p map to succeed for objects that have been tagged by
> the userspace in some way, ie the userspace application is in control of
> what can be mapped to a peer device.

I would have thought this means the VMA for the object is created
without the map/unmap ops? Or are GPU objects and VMAs unrelated?

> For moving things around after a successful p2p_map, yes, the exporting
> device has to call for instance zap_vma_ptes() or something
> similar.

Okay, great, RDMA needs this flow for hotplug - we zap the VMAs when
unplugging the PCI device and we can delay the PCI unplug completion
until all the p2p_unmaps are called...

But in this case a future p2p_map will have to fail as the BAR no
longer exists. How to handle this?

> > I would think that the importing driver can assume the BAR page is
> > kept alive until it calls unmap (presumably triggered by notifiers)?
> > 
> > ie the exporting driver sees the BAR page as pinned until unmap.
> 
> The intention with this patchset is that it is not pinned, ie the importer
> device _must_ abide by all mmu notifier invalidations, and they can
> happen at any time. The importing device can however re-p2p_map the
> same range after an invalidation.
>
> I would like to restrict this to importers that can invalidate for
> now, because I believe all the first devices to use this can support
> invalidation.

This seems reasonable (and sort of says importers not getting this
from HMM need careful checking), was this in the comment above the
ops?

Jason


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jason Gunthorpe
On Tue, Jan 29, 2019 at 02:50:55PM -0500, Jerome Glisse wrote:

> GPU drivers do want more control :) GPU drivers are moving things around
> all the time and they have more memory than BAR space (on newer platforms
> AMD GPUs do resize the BAR but it is not the rule for all GPUs). So
> GPU drivers do actually manage their BAR address space and they map and
> unmap things there. They can not allow someone to just pin stuff there
> randomly or this would disrupt their regular work flow. Hence they need
> control, and they might implement a threshold: for instance, if they have
> more than N pages of BAR space mapped for peer to peer, they can decide
> to fall back to main memory for any new peer mapping.

But this API doesn't seem to offer any control - I thought that
control was all coming from the mm/hmm notifiers triggering p2p_unmaps?

I would think that the importing driver can assume the BAR page is
kept alive until it calls unmap (presumably triggered by notifiers)?

ie the exporting driver sees the BAR page as pinned until unmap.

Jason


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jason Gunthorpe
On Tue, Jan 29, 2019 at 01:39:49PM -0700, Logan Gunthorpe wrote:

> implement the mapping. And I don't think we should have 'special' vma's
> for this (though we may need something to ensure we don't get mapping
> requests mixed with different types of pages...).

I think Jerome explained the point here is to have a 'special vma'
rather than a 'special struct page' as, really, we don't need a
struct page at all to make this work.

If I recall your earlier attempts at adding struct page for BAR
memory, it ran aground on issues related to O_DIRECT/sgls, etc, etc.

This does seem to avoid that pitfall entirely as we can never
accidentally get into the SGL system with this kind of memory or VMA?

Jason


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jason Gunthorpe
On Tue, Jan 29, 2019 at 02:11:23PM -0500, Jerome Glisse wrote:
> On Tue, Jan 29, 2019 at 11:36:29AM -0700, Logan Gunthorpe wrote:
> > 
> > 
> > On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> > 
> > > + /*
> > > +  * Optional for device driver that want to allow peer to peer (p2p)
> > > +  * mapping of their vma (which can be back by some device memory) to
> > > +  * another device.
> > > +  *
> > > +  * Note that the exporting device driver might not have map anything
> > > +  * inside the vma for the CPU but might still want to allow a peer
> > > +  * device to access the range of memory corresponding to a range in
> > > +  * that vma.
> > > +  *
> > > +  * FOR PREDICTABILITY IF DRIVER SUCCESSFULY MAP A RANGE ONCE FOR A
> > > +  * DEVICE THEN FURTHER MAPPING OF THE SAME IF THE VMA IS STILL VALID
> > > +  * SHOULD ALSO BE SUCCESSFUL. Following this rule allow the importing
> > > +  * device to map once during setup and report any failure at that time
> > > +  * to the userspace. Further mapping of the same range might happen
> > > +  * after mmu notifier invalidation over the range. The exporting device
> > > +  * can use this to move things around (defrag BAR space for instance)
> > > +  * or do other similar task.
> > > +  *
> > > +  * IMPORTER MUST OBEY mmu_notifier NOTIFICATION AND CALL p2p_unmap()
> > > +  * WHEN A NOTIFIER IS CALL FOR THE RANGE ! THIS CAN HAPPEN AT ANY
> > > +  * POINT IN TIME WITH NO LOCK HELD.
> > > +  *
> > > +  * In below function, the device argument is the importing device,
> > > +  * the exporting device is the device to which the vma belongs.
> > > +  */
> > > + long (*p2p_map)(struct vm_area_struct *vma,
> > > + struct device *device,
> > > + unsigned long start,
> > > + unsigned long end,
> > > + dma_addr_t *pa,
> > > + bool write);
> > > + long (*p2p_unmap)(struct vm_area_struct *vma,
> > > +   struct device *device,
> > > +   unsigned long start,
> > > +   unsigned long end,
> > > +   dma_addr_t *pa);
> > 
> > I don't understand why we need new p2p_[un]map function pointers for
> > this. In subsequent patches, they never appear to be set anywhere and
> > are only called by the HMM code. I'd have expected it to be called by
> > some core VMA code and set by HMM as that's what vm_operations_struct is
> > for.
> > 
> > But the code is all very confusing, hard to follow and seems to be
> > missing significant chunks. So I'm not really sure what is going on.
> 
> It is set by the device driver when userspace does mmap(fd), where fd comes
> from open("/dev/somedevicefile"). So it is set by the device driver. HMM
> has nothing to do with this. It must be set by the device driver's mmap
> callback (the mmap callback of struct file_operations). For this patch
> you can completely ignore all the HMM patches. Maybe posting this as
> 2 separate patchsets would make it clearer.
> 
> For instance see [1] for how a non-HMM driver can export its memory
> by just setting those callbacks. Note that a proper implementation of
> this should also include some kind of driver policy on what to allow
> to map and what not to allow ... All this is driver specific anyway.

I'm imagining that the RDMA drivers would use this interface on their
per-process 'doorbell' BAR pages - we also wish to have P2P DMA to
this memory. Also the entire VFIO PCI BAR mmap would be good to cover
with this too.

Jerome, I think it would be nice to have a helper scheme - I think the
simple case would be simple remapping of PCI BAR memory, so if we
could have, say something like:

static const struct vm_operations_struct my_ops {
  .p2p_map = p2p_ioremap_map_op,
  .p2p_unmap = p2p_ioremap_unmap_op,
}

struct ioremap_data {
  [..]
}

fops_mmap() {
   vma->private_data = _priv->ioremap_data;
   return p2p_ioremap_device_memory(vma, exporting_device, [..]);
}

Which closely matches at least what the RDMA drivers do. Where
p2p_ioremap_device_memory populates p2p_map and p2p_unmap pointers
with sensible functions, etc.
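
To make that concrete, here is a rough sketch of what the map side of
such a helper pair could look like for a static BAR
(p2p_ioremap_device_memory() does not exist yet; everything below is
illustrative, ignores any IOMMU translation for the importer, and only
the p2p_map/p2p_unmap signatures come from the patchset):

static long p2p_ioremap_map_op(struct vm_area_struct *vma,
                               struct device *importer,
                               unsigned long start, unsigned long end,
                               dma_addr_t *pa, bool write)
{
        /* assumes struct ioremap_data carries the BAR bus address */
        struct ioremap_data *d = vma->vm_private_data;
        unsigned long off = start - vma->vm_start;
        unsigned long i, npages = (end - start) >> PAGE_SHIFT;

        /* Static BAR: the peer address is a simple offset into the BAR. */
        for (i = 0; i < npages; i++)
                pa[i] = d->bar_base + off + (i << PAGE_SHIFT);
        return npages;
}

static long p2p_ioremap_unmap_op(struct vm_area_struct *vma,
                                 struct device *importer,
                                 unsigned long start, unsigned long end,
                                 dma_addr_t *pa)
{
        return 0;       /* nothing to tear down for a static BAR */
}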

It looks like vfio would be able to use this as well (though I am
unsure why vfio uses remap_pfn_range instead of io_remap_pfn range for
BAR memory..)

Do any drivers need more control than this?

Jason


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 02:30:49PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 1:57 p.m., Jerome Glisse wrote:
> > GPU drivers must be in control and must be called to. Here there are 2
> > cases in this patchset, and I should have instead posted 2 separate
> > patchsets as it seems that it is confusing things.
> > 
> > For the HMM page, the physical address of the page, ie the pfn, does not
> > correspond to anything, ie there is nothing behind it. So the importing
> > device has no idea how to get a valid physical address from an HMM page;
> > only the device driver exporting its memory with HMM device memory knows
> > that.
> > 
> > 
> > For the special vma, ie the mmap of a device file: GPU drivers do manage
> > their BAR, ie the GPU has a page table that maps BAR pages to GPU memory,
> > and the driver _constantly_ updates this page table; this is reflected by
> > invalidating the CPU mapping. In fact most of the time the CPU mappings
> > of GPU objects are invalid; they are valid only for a small fraction of
> > their lifetime. So you _must_ have some call to inform the exporting
> > device driver that another device would like to map one of its vmas. The
> > exporting device can then try to avoid as much churn as possible for the
> > importing device. But this has consequences, and the exporting device
> > driver must be allowed to apply policy and make decisions on whether or
> > not it authorizes the other device to peer map its memory. For GPUs, the
> > userspace application has to call specific APIs that translate into
> > specific ioctls which themselves set flags on the object (in the kernel
> > struct tracking the userspace object). The only way to allow program
> > predictability is if the application can ask and know whether it can
> > peer export an object (ie is there enough BAR space left).
> 
> This all seems like it's an HMM problem and not related to mapping
> BARs/"potential BARs" to userspace. If some code wants to DMA map HMM
> pages, it calls an HMM function to map them. If HMM needs to consult
> with the driver on aspects of how that's mapped, then that's between HMM
> and the driver and not something I really care about. But making the
> entire mapping stuff tied to userspace VMAs does not make sense to me.
> What if somebody wants to map some HMM pages in the same way but from
> kernel space and they therefore don't have a VMA?

No, this is the non-HMM case I am talking about here. Fully ignore HMM
in this frame. A GPU driver that does not support or use HMM in any way
has all the properties and requirements I list above. So all the points
I was making are without HMM in the picture whatsoever. I should have
posted this as separate patches to avoid the confusion.

Regarding your HMM question: you can not map HMM pages; any code path
that tried would trigger a migration back to regular memory, and the
regular memory would be used for CPU access.


> >> I also figured there'd be a fault version of p2p_ioremap_device_memory()
> >> for when you are mapping P2P memory and you want to assign the pages
> >> lazily. Though, this can come later when someone wants to implement that.
> > 
> > For GPUs, the BAR address space is managed page by page, and thus you do
> > not want to map a range of BAR addresses; you want to allow mapping of
> > multiple pages of BAR addresses that are not adjacent to each other nor
> > ordered in any way. But providing helpers for simpler devices does make
> > sense.
> 
> Well, this has little to do with the backing device but with how the memory is
> mapped into userspace. With p2p_ioremap_device_memory() the entire range
> is mapped into the userspace VMA immediately during the call to mmap().
> With p2p_fault_device_memory(), mmap() would not actually map anything
> and a page in the VMA would be mapped only when userspace accesses it
> (using fault()). It seems to me like GPUs would prefer the latter but if
> HMM takes care of the mapping from userspace potential pages to actual
> GPU pages through the BAR then that may not be true.

Again, HMM has nothing to do here; ignore HMM, it does not play any role
and it is not involved in any way here. GPUs want to control what objects
they allow other devices to access and what objects they do not. GPU
drivers _constantly_ invalidate the CPU page table, and in fact the CPU
page table does not have any valid pte for a vma that is an mmap of a GPU
device file for most of the vma's lifetime. Changing that would highly
disrupt and break GPU drivers. They need to control that; they need to
control what to do if another device tries to peer map some of their
memory. Hence they need to implement the callback and decide whether or
not they allow the peer mapping or use device memory for it (they can
decide to fall back to main memory).

If the exporter can not control this, then it is useless to GPU drivers.
I would rather not exclude GPU drivers from this.
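
As an illustrative sketch of that kind of exporter policy (all the gpu_*
helpers and the threshold are made up; only the p2p_map signature comes
from this patchset):

static long gpu_p2p_map(struct vm_area_struct *vma, struct device *importer,
                        unsigned long start, unsigned long end,
                        dma_addr_t *pa, bool write)
{
        struct gpu_object *obj = gpu_object_from_vma(vma, start, end);

        /* Policy: userspace must have tagged the object for p2p. */
        if (!obj || !obj->p2p_allowed)
                return -EPERM;

        /* Policy: past a BAR usage threshold, serve from main memory. */
        if (gpu_bar_p2p_pages(obj->gpu) > GPU_P2P_BAR_THRESHOLD)
                return gpu_map_from_sysmem(obj, importer, start, end, pa);

        return gpu_map_from_bar(obj, importer, start, end, pa, write);
}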

Cheers,
Jérôme

Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 1:57 p.m., Jerome Glisse wrote:
> GPU driver must be in control and must be call to. Here there is 2 cases
> in this patchset and i should have instead posted 2 separate patchset as
> it seems that it is confusing things.
> 
> For the HMM page, the physical address of the page ie the pfn does not
> correspond to anything ie there is nothing behind it. So the importing
> device has no idea how to get a valid physical address from an HMM page
> only the device driver exporting its memory with HMM device memory knows
> that.
> 
> 
> For the special vma ie mmap of a device file. GPU driver do manage their
> BAR ie the GPU have a page table that map BAR page to GPU memory and the
> driver _constantly_ update this page table, it is reflected by invalidating
> the CPU mapping. In fact most of the time the CPU mapping of GPU object are
> invalid they are valid only a small fraction of their lifetime. So you
> _must_ have some call to inform the exporting device driver that another
> device would like to map one of its vma. The exporting device can then
> try to avoid as much churn as possible for the importing device. But this
> has consequence and the exporting device driver must be allow to apply
> policy and make decission on wether or not it authorize the other device
> to peer map its memory. For GPU the userspace application have to call
> specific API that translate into specific ioctl which themself set flags
> on object (in the kernel struct tracking the user space object). The only
> way to allow program predictability is if the application can ask and know
> if it can peer export an object (ie is there enough BAR space left).

This all seems like it's an HMM problem and not related to mapping
BARs/"potential BARs" to userspace. If some code wants to DMA map HMM
pages, it calls an HMM function to map them. If HMM needs to consult
with the driver on aspects of how that's mapped, then that's between HMM
and the driver and not something I really care about. But making the
entire mapping stuff tied to userspace VMAs does not make sense to me.
What if somebody wants to map some HMM pages in the same way but from
kernel space and they therefore don't have a VMA?


>> I also figured there'd be a fault version of p2p_ioremap_device_memory()
>> for when you are mapping P2P memory and you want to assign the pages
>> lazily. Though, this can come later when someone wants to implement that.
> 
> For GPUs, the BAR address space is managed page by page, and thus you do
> not want to map a range of BAR addresses; you want to allow mapping of
> multiple pages of BAR addresses that are not adjacent to each other nor
> ordered in any way. But providing helpers for simpler devices does make
> sense.

Well, this has little to do with the backing device but with how the memory is
mapped into userspace. With p2p_ioremap_device_memory() the entire range
is mapped into the userspace VMA immediately during the call to mmap().
With p2p_fault_device_memory(), mmap() would not actually map anything
and a page in the VMA would be mapped only when userspace accesses it
(using fault()). It seems to me like GPUs would prefer the latter but if
HMM takes care of the mapping from userspace potential pages to actual
GPU pages through the BAR then that may not be true.

Logan


Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Alex Deucher
On Tue, Jan 29, 2019 at 3:25 PM Logan Gunthorpe  wrote:
>
>
>
> On 2019-01-29 12:56 p.m., Alex Deucher wrote:
> > On Tue, Jan 29, 2019 at 12:47 PM  wrote:
> >>
> >> From: Jérôme Glisse 
> >>
> >> device_test_p2p() returns true if two devices can peer to peer to
> >> each other. We add a generic function, as different inter-connects
> >> can support peer to peer and we want to generically test this no
> >> matter what the inter-connect might be. However, this version only
> >> supports PCIE for now.
> >>
> >
> > What about something like these patches:
> > https://cgit.freedesktop.org/~deathsimple/linux/commit/?h=p2p=4fab9ff69cb968183f717551441b475fabce6c1c
> > https://cgit.freedesktop.org/~deathsimple/linux/commit/?h=p2p=f90b12d41c277335d08c9dab62433f27c0fadbe5
> > They are a bit more thorough.
>
> Those new functions seem to have a lot of overlap with the code that is
> already upstream in p2pdma. Perhaps you should be improving the
> p2pdma functions if they aren't suitable for what you want already,
> instead of creating new ones.

Could be.  Those patches are pretty old.  They probably need to be
rebased on the latest upstream p2p stuff.

Alex

Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 01:44:09PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 12:44 p.m., Greg Kroah-Hartman wrote:
> > On Tue, Jan 29, 2019 at 11:24:09AM -0700, Logan Gunthorpe wrote:
> >>
> >>
> >> On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> >>> +bool pci_test_p2p(struct device *devA, struct device *devB)
> >>> +{
> >>> + struct pci_dev *pciA, *pciB;
> >>> + bool ret;
> >>> + int tmp;
> >>> +
> >>> + /*
> >>> +  * For now we only support PCIE peer to peer but other inter-connect
> >>> +  * can be added.
> >>> +  */
> >>> + pciA = find_parent_pci_dev(devA);
> >>> + pciB = find_parent_pci_dev(devB);
> >>> + if (pciA == NULL || pciB == NULL) {
> >>> + ret = false;
> >>> + goto out;
> >>> + }
> >>> +
> >>> + tmp = upstream_bridge_distance(pciA, pciB, NULL);
> >>> + ret = tmp < 0 ? false : true;
> >>> +
> >>> +out:
> >>> + pci_dev_put(pciB);
> >>> + pci_dev_put(pciA);
> >>> + return false;
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(pci_test_p2p);
> >>
> >> This function only ever returns false
> > 
> > I guess it was never actually tested :(
> > 
> > I feel really worried about passing random 'struct device' pointers into
> > the PCI layer.  Are we _sure_ it can handle this properly?
> 
> Yes, there are a couple of pci_p2pdma functions that take struct devices
> directly simply because it's way more convenient for the caller. That's
> what find_parent_pci_dev() takes care of (it returns false if the device
> is not a PCI device). Whether that's appropriate here is hard to say
> seeing we haven't seen any caller code.

Caller code as a reference (I already gave that link in another part of
the thread, but just so that people don't have to follow all branches):

https://cgit.freedesktop.org/~glisse/linux/commit/?h=hmm-p2p=401a567696eafb1d4faf7054ab0d7c3a16a5ef06

Cheers,
Jérôme


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 01:39:49PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 12:32 p.m., Jason Gunthorpe wrote:
> > Jerome, I think it would be nice to have a helper scheme - I think the
> > simple case would be simple remapping of PCI BAR memory, so if we
> > could have, say something like:
> > 
> > static const struct vm_operations_struct my_ops {
> >   .p2p_map = p2p_ioremap_map_op,
> >   .p2p_unmap = p2p_ioremap_unmap_op,
> > }
> > 
> > struct ioremap_data {
> >   [..]
> > }
> > 
> > fops_mmap() {
> >vma->private_data = _priv->ioremap_data;
> >return p2p_ioremap_device_memory(vma, exporting_device, [..]);
> > }
> 
> This is roughly what I was expecting, except I don't see exactly what
> the p2p_map and p2p_unmap callbacks are for. The importing driver should
> see p2pdma/hmm struct pages and use the appropriate function to map
> them. It shouldn't be the responsibility of the exporting driver to
> implement the mapping. And I don't think we should have 'special' vma's
> for this (though we may need something to ensure we don't get mapping
> requests mixed with different types of pages...).

GPU drivers must be in control and must be called to. Here there are 2
cases in this patchset, and I should have instead posted 2 separate
patchsets as it seems that it is confusing things.

For the HMM page, the physical address of the page, ie the pfn, does not
correspond to anything, ie there is nothing behind it. So the importing
device has no idea how to get a valid physical address from an HMM page;
only the device driver exporting its memory with HMM device memory knows
that.


For the special vma, ie the mmap of a device file: GPU drivers do manage
their BAR, ie the GPU has a page table that maps BAR pages to GPU memory,
and the driver _constantly_ updates this page table; this is reflected by
invalidating the CPU mapping. In fact most of the time the CPU mappings
of GPU objects are invalid; they are valid only for a small fraction of
their lifetime. So you _must_ have some call to inform the exporting
device driver that another device would like to map one of its vmas. The
exporting device can then try to avoid as much churn as possible for the
importing device. But this has consequences, and the exporting device
driver must be allowed to apply policy and make decisions on whether or
not it authorizes the other device to peer map its memory. For GPUs, the
userspace application has to call specific APIs that translate into
specific ioctls which themselves set flags on the object (in the kernel
struct tracking the userspace object). The only way to allow program
predictability is if the application can ask and know whether it can
peer export an object (ie is there enough BAR space left).

Moreover, I would like to be able to use this API between GPUs that are
inter-connected with each other; for those, the CPU page tables are
just invalid and the physical addresses to use are only meaningful to
the exporting and importing devices. So again, here the core kernel has
no idea what the physical address would be.


So in both cases, at the very least for GPUs, we do want total control
to be given to the exporter.

> 
> I also figured there'd be a fault version of p2p_ioremap_device_memory()
> for when you are mapping P2P memory and you want to assign the pages
> lazily. Though, this can come later when someone wants to implement that.

For GPUs, the BAR address space is managed page by page, and thus you do
not want to map a range of BAR addresses; you want to allow mapping of
multiple pages of BAR addresses that are not adjacent to each other nor
ordered in any way. But providing helpers for simpler devices does make
sense.

Cheers,
Jérôme


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 12:44 p.m., Jerome Glisse wrote:
>> I'd suggest [1] should be a part of the patchset so we can actually see
>> a user of the stuff you're adding.
> 
> I did not want to clutter the patchset with device driver specific usage
> of this, as the API can be reasoned about in an abstract way.

It's hard to reason about an interface when you can't see what all the
layers want to do with it. Most maintainers (I'd hope) would certainly
never merge code that has no callers, and for much the same reason, I'd
rather not review patches that don't have real use case examples.

Logan


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 08:24:36PM +, Jason Gunthorpe wrote:
> On Tue, Jan 29, 2019 at 02:50:55PM -0500, Jerome Glisse wrote:
> 
> > GPU drivers do want more control :) GPU drivers are moving things around
> > all the time and they have more memory than BAR space (on newer platforms
> > AMD GPUs do resize the BAR but it is not the rule for all GPUs). So
> > GPU drivers do actually manage their BAR address space and they map and
> > unmap things there. They can not allow someone to just pin stuff there
> > randomly or this would disrupt their regular work flow. Hence they need
> > control, and they might implement a threshold: for instance, if they have
> > more than N pages of BAR space mapped for peer to peer, they can decide
> > to fall back to main memory for any new peer mapping.
> 
> But this API doesn't seem to offer any control - I thought that
> control was all coming from the mm/hmm notifiers triggering p2p_unmaps?

The control is within the driver implementation of those callbacks. So a
driver implementation can refuse to map by returning an error on p2p_map,
or it can decide to use main memory by migrating its object to main
memory and populating the dma address array with dma_map_page() of the
main memory pages. Drivers like GPU drivers can have policy on top of
that: for instance they will only allow a p2p map to succeed for objects
that have been tagged by the userspace in some way, ie the userspace
application is in control of what can be mapped to a peer device. This is
needed for GPU drivers, as we do want userspace involvement in which
objects are allowed to have p2p access, and also so that we can report to
userspace when we are running out of BAR addresses for this to work as
intended (ie not falling back to main memory), so that the application
can take appropriate actions (like deciding what to prioritize).

For moving things around after a successful p2p_map, yes, the exporting
device has to call for instance zap_vma_ptes() or something similar.
This will trigger notifier calls, and the importing device will invalidate
its mapping. Once it is invalidated, the exporting device can point a new
call of p2p_map (for the same range) to new memory (obviously the
exporting device has to synchronize any concurrent calls to p2p_map
with the invalidation).
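
A minimal sketch of that eviction flow (the gpu_* names are made up;
zap_vma_ptes() is the real helper mentioned above):

static void gpu_evict_object(struct gpu_object *obj)
{
        /* Tear down the CPU ptes; this fires the mmu notifiers, and
         * every importer must react by stopping DMA and calling
         * p2p_unmap() on its range. */
        zap_vma_ptes(obj->vma, obj->vma->vm_start,
                     obj->vma->vm_end - obj->vma->vm_start);

        /* BAR pages can now be reused; a later p2p_map() of the same
         * range will be pointed at the object's new location. */
        gpu_object_move_to_sysmem(obj);
}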

> 
> I would think that the importing driver can assume the BAR page is
> kept alive until it calls unmap (presumably triggered by notifiers)?
> 
> ie the exporting driver sees the BAR page as pinned until unmap.

The intention with this patchset is that it is not pinned, ie the importer
device _must_ abide by all mmu notifier invalidations, and they can
happen at any time. The importing device can however re-p2p_map the
same range after an invalidation.

I would like to restrict this to importers that can invalidate for
now, because I believe all the first devices to use this can support
invalidation.

Also, when using HMM private device memory, we _can not_ pin virtual
addresses to device memory, as otherwise CPU accesses would have to
SIGBUS or SEGFAULT, and we do not want that. So this was also a
motivation to keep things consistent for the importer in both cases.
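
On the importer side, the rule above boils down to something like this
(a sketch only, assuming the current mmu notifier API; struct my_importer
and its fields are illustrative):

static int imp_invalidate_range_start(struct mmu_notifier *mn,
                                const struct mmu_notifier_range *range)
{
        struct my_importer *imp = container_of(mn, struct my_importer, mn);

        /* Can happen at any time with no lock held: quiesce DMA first,
         * then drop the mapping; a later p2p_map() can re-establish it. */
        my_importer_stop_dma(imp);
        imp->vma->vm_ops->p2p_unmap(imp->vma, imp->dev,
                                    range->start, range->end,
                                    imp->dma_addrs);
        return 0;
}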

Cheers,
Jérôme


Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 12:44 p.m., Greg Kroah-Hartman wrote:
> On Tue, Jan 29, 2019 at 11:24:09AM -0700, Logan Gunthorpe wrote:
>>
>>
>> On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
>>> +bool pci_test_p2p(struct device *devA, struct device *devB)
>>> +{
>>> +   struct pci_dev *pciA, *pciB;
>>> +   bool ret;
>>> +   int tmp;
>>> +
>>> +   /*
>>> +* For now we only support PCIE peer to peer but other inter-connect
>>> +* can be added.
>>> +*/
>>> +   pciA = find_parent_pci_dev(devA);
>>> +   pciB = find_parent_pci_dev(devB);
>>> +   if (pciA == NULL || pciB == NULL) {
>>> +   ret = false;
>>> +   goto out;
>>> +   }
>>> +
>>> +   tmp = upstream_bridge_distance(pciA, pciB, NULL);
>>> +   ret = tmp < 0 ? false : true;
>>> +
>>> +out:
>>> +   pci_dev_put(pciB);
>>> +   pci_dev_put(pciA);
>>> +   return false;
>>> +}
>>> +EXPORT_SYMBOL_GPL(pci_test_p2p);
>>
>> This function only ever returns false
> 
> I guess it was never actually tested :(
> 
> I feel really worried about passing random 'struct device' pointers into
> the PCI layer.  Are we _sure_ it can handle this properly?

Yes, there are a couple of pci_p2pdma functions that take struct devices
directly simply because it's way more convenient for the caller. That's
what find_parent_pci_dev() takes care of (it returns false if the device
is not a PCI device). Whether that's appropriate here is hard to say
seeing we haven't seen any caller code.

Logan
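
(For reference, the fixed tail of pci_test_p2p() as discussed above; the
only real change is returning ret instead of false, and the ternary can
be simplified away:)

        tmp = upstream_bridge_distance(pciA, pciB, NULL);
        ret = tmp >= 0;

out:
        pci_dev_put(pciB);
        pci_dev_put(pciA);
        return ret;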




Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 12:32 p.m., Jason Gunthorpe wrote:
> Jerome, I think it would be nice to have a helper scheme - I think the
> simple case would be simple remapping of PCI BAR memory, so if we
> could have, say something like:
> 
> static const struct vm_operations_struct my_ops {
>   .p2p_map = p2p_ioremap_map_op,
>   .p2p_unmap = p2p_ioremap_unmap_op,
> }
> 
> struct ioremap_data {
>   [..]
> }
> 
> fops_mmap() {
>vma->private_data = _priv->ioremap_data;
>return p2p_ioremap_device_memory(vma, exporting_device, [..]);
> }

This is roughly what I was expecting, except I don't see exactly what
the p2p_map and p2p_unmap callbacks are for. The importing driver should
see p2pdma/hmm struct pages and use the appropriate function to map
them. It shouldn't be the responsibility of the exporting driver to
implement the mapping. And I don't think we should have 'special' vma's
for this (though we may need something to ensure we don't get mapping
requests mixed with different types of pages...).

I also figured there'd be a fault version of p2p_ioremap_device_memory()
for when you are mapping P2P memory and you want to assign the pages
lazily. Though, this can come later when someone wants to implement that.

Logan


Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Logan Gunthorpe


On 2019-01-29 12:56 p.m., Alex Deucher wrote:
> On Tue, Jan 29, 2019 at 12:47 PM  wrote:
>>
>> From: Jérôme Glisse 
>>
>> device_test_p2p() returns true if two devices can peer to peer to
>> each other. We add a generic function, as different inter-connects
>> can support peer to peer and we want to generically test this no
>> matter what the inter-connect might be. However, this version only
>> supports PCIE for now.
>>
> 
> What about something like these patches:
> https://cgit.freedesktop.org/~deathsimple/linux/commit/?h=p2p=4fab9ff69cb968183f717551441b475fabce6c1c
> https://cgit.freedesktop.org/~deathsimple/linux/commit/?h=p2p=f90b12d41c277335d08c9dab62433f27c0fadbe5
> They are a bit more thorough.

Those new functions seem to have a lot of overlap with the code that is
already upstream in p2pdma. Perhaps you should be improving the
p2pdma functions if they aren't suitable for what you want already,
instead of creating new ones.

Logan

Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 02:56:38PM -0500, Alex Deucher wrote:
> On Tue, Jan 29, 2019 at 12:47 PM  wrote:
> >
> > From: Jérôme Glisse 
> >
> > device_test_p2p() returns true if two devices can peer to peer to
> > each other. We add a generic function, as different inter-connects
> > can support peer to peer and we want to generically test this no
> > matter what the inter-connect might be. However, this version only
> > supports PCIE for now.
> >
> 
> What about something like these patches:
> https://cgit.freedesktop.org/~deathsimple/linux/commit/?h=p2p=4fab9ff69cb968183f717551441b475fabce6c1c
> https://cgit.freedesktop.org/~deathsimple/linux/commit/?h=p2p=f90b12d41c277335d08c9dab62433f27c0fadbe5
> They are a bit more thorough.

Yes, that would be better; I forgot about those. I can include them
next time I post. Thank you for reminding me about them :)

Cheers,
Jérôme


Re: [RFC PATCH 2/5] drivers/base: add a function to test peer to peer capability

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 08:46:05PM +0100, Greg Kroah-Hartman wrote:
> On Tue, Jan 29, 2019 at 12:47:25PM -0500, jgli...@redhat.com wrote:
> > From: Jérôme Glisse 
> > 
> > device_test_p2p() returns true if two devices can peer to peer to
> > each other. We add a generic function, as different inter-connects
> > can support peer to peer and we want to generically test this no
> > matter what the inter-connect might be. However, this version only
> > supports PCIE for now.
> 
> There is no definition of "peer to peer" in the driver/device model, so
> why should this be in the driver core at all?
> 
> Especially as you only do this for PCI, why not just keep it in the PCI
> layer, that way you _know_ you are dealing with the right pointer types
> and there is no need to mess around with the driver core at all.

Ok, I will drop the core device change. I wanted to allow other non-PCI
devices to join later on (ie allow a PCI device to export to a non-PCI
device), but if that ever happens then we can update the PCI exporter at
the same time we introduce a non-PCI importer.

Cheers,
Jérôme


Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Alex Deucher
On Tue, Jan 29, 2019 at 12:47 PM  wrote:
>
> From: Jérôme Glisse 
>
> device_test_p2p() returns true if two devices can peer to peer to
> each other. We add a generic function, as different inter-connects
> can support peer to peer and we want to generically test this no
> matter what the inter-connect might be. However, this version only
> supports PCIE for now.
>

What about something like these patches:
https://cgit.freedesktop.org/~deathsimple/linux/commit/?h=p2p=4fab9ff69cb968183f717551441b475fabce6c1c
https://cgit.freedesktop.org/~deathsimple/linux/commit/?h=p2p=f90b12d41c277335d08c9dab62433f27c0fadbe5
They are a bit more thorough.

Alex

> Signed-off-by: Jérôme Glisse 
> Cc: Logan Gunthorpe 
> Cc: Greg Kroah-Hartman 
> Cc: Rafael J. Wysocki 
> Cc: Bjorn Helgaas 
> Cc: Christian Koenig 
> Cc: Felix Kuehling 
> Cc: Jason Gunthorpe 
> Cc: linux-ker...@vger.kernel.org
> Cc: linux-...@vger.kernel.org
> Cc: dri-de...@lists.freedesktop.org
> Cc: Christoph Hellwig 
> Cc: Marek Szyprowski 
> Cc: Robin Murphy 
> Cc: Joerg Roedel 
> Cc: iommu@lists.linux-foundation.org
> ---
>  drivers/pci/p2pdma.c   | 27 +++
>  include/linux/pci-p2pdma.h |  6 ++
>  2 files changed, 33 insertions(+)
>
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index c52298d76e64..620ac60babb5 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -797,3 +797,30 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
> return sprintf(page, "%s\n", pci_name(p2p_dev));
>  }
>  EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
> +
> +bool pci_test_p2p(struct device *devA, struct device *devB)
> +{
> +   struct pci_dev *pciA, *pciB;
> +   bool ret;
> +   int tmp;
> +
> +   /*
> +* For now we only support PCIE peer to peer but other inter-connect
> +* can be added.
> +*/
> +   pciA = find_parent_pci_dev(devA);
> +   pciB = find_parent_pci_dev(devB);
> +   if (pciA == NULL || pciB == NULL) {
> +   ret = false;
> +   goto out;
> +   }
> +
> +   tmp = upstream_bridge_distance(pciA, pciB, NULL);
> +   ret = tmp < 0 ? false : true;
> +
> +out:
> +   pci_dev_put(pciB);
> +   pci_dev_put(pciA);
> +   return false;
> +}
> +EXPORT_SYMBOL_GPL(pci_test_p2p);
> diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
> index bca9bc3e5be7..7671cc499a08 100644
> --- a/include/linux/pci-p2pdma.h
> +++ b/include/linux/pci-p2pdma.h
> @@ -36,6 +36,7 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
> bool *use_p2pdma);
>  ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
>bool use_p2pdma);
> +bool pci_test_p2p(struct device *devA, struct device *devB);
>  #else /* CONFIG_PCI_P2PDMA */
>  static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
> size_t size, u64 offset)
> @@ -97,6 +98,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
>  {
> return sprintf(page, "none\n");
>  }
> +
> +static inline bool pci_test_p2p(struct device *devA, struct device *devB)
> +{
> +   return false;
> +}
>  #endif /* CONFIG_PCI_P2PDMA */
>
>
> --
> 2.17.2
>

Re: [RFC PATCH 2/5] drivers/base: add a function to test peer to peer capability

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 11:26:01AM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> > From: Jérôme Glisse 
> > 
> > device_test_p2p() returns true if two devices can peer to peer to
> > each other. We add a generic function, as different inter-connects
> > can support peer to peer and we want to generically test this no
> > matter what the inter-connect might be. However, this version only
> > supports PCIE for now.
> 
> This doesn't appear to be used in any of the further patches; so it's
> very confusing.
> 
> I'm not sure a struct device wrapper is really necessary...

I wanted to allow other non-PCI devices to join in the fun, but
yes, right now I have only been doing this on PCI devices.

Jérôme


Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 08:44:26PM +0100, Greg Kroah-Hartman wrote:
> On Tue, Jan 29, 2019 at 11:24:09AM -0700, Logan Gunthorpe wrote:
> > 
> > 
> > On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> > > +bool pci_test_p2p(struct device *devA, struct device *devB)
> > > +{
> > > + struct pci_dev *pciA, *pciB;
> > > + bool ret;
> > > + int tmp;
> > > +
> > > + /*
> > > +  * For now we only support PCIE peer to peer but other inter-connect
> > > +  * can be added.
> > > +  */
> > > + pciA = find_parent_pci_dev(devA);
> > > + pciB = find_parent_pci_dev(devB);
> > > + if (pciA == NULL || pciB == NULL) {
> > > + ret = false;
> > > + goto out;
> > > + }
> > > +
> > > + tmp = upstream_bridge_distance(pciA, pciB, NULL);
> > > + ret = tmp < 0 ? false : true;
> > > +
> > > +out:
> > > + pci_dev_put(pciB);
> > > + pci_dev_put(pciA);
> > > + return false;
> > > +}
> > > +EXPORT_SYMBOL_GPL(pci_test_p2p);
> > 
> > This function only ever returns false
> 
> I guess it was never actually tested :(
> 
> I feel really worried about passing random 'struct device' pointers into
> the PCI layer.  Are we _sure_ it can handle this properly?
> 

Oh yes, I fixed it on the test rig and forgot to patch
my local git tree. My bad.

Cheers,
Jérôme


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 07:32:57PM +, Jason Gunthorpe wrote:
> On Tue, Jan 29, 2019 at 02:11:23PM -0500, Jerome Glisse wrote:
> > On Tue, Jan 29, 2019 at 11:36:29AM -0700, Logan Gunthorpe wrote:
> > > 
> > > 
> > > On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> > > 
> > > > +   /*
> > > > +* Optional for device driver that want to allow peer to peer (p2p)
> > > > +* mapping of their vma (which can be back by some device memory) to
> > > > +* another device.
> > > > +*
> > > > +* Note that the exporting device driver might not have map anything
> > > > +* inside the vma for the CPU but might still want to allow a peer
> > > > +* device to access the range of memory corresponding to a range in
> > > > +* that vma.
> > > > +*
> > > > +* FOR PREDICTABILITY IF DRIVER SUCCESSFULY MAP A RANGE ONCE FOR A
> > > > +* DEVICE THEN FURTHER MAPPING OF THE SAME IF THE VMA IS STILL VALID
> > > > +* SHOULD ALSO BE SUCCESSFUL. Following this rule allow the importing
> > > > +* device to map once during setup and report any failure at that time
> > > > +* to the userspace. Further mapping of the same range might happen
> > > > +* after mmu notifier invalidation over the range. The exporting device
> > > > +* can use this to move things around (defrag BAR space for instance)
> > > > +* or do other similar task.
> > > > +*
> > > > +* IMPORTER MUST OBEY mmu_notifier NOTIFICATION AND CALL p2p_unmap()
> > > > +* WHEN A NOTIFIER IS CALL FOR THE RANGE ! THIS CAN HAPPEN AT ANY
> > > > +* POINT IN TIME WITH NO LOCK HELD.
> > > > +*
> > > > +* In below function, the device argument is the importing device,
> > > > +* the exporting device is the device to which the vma belongs.
> > > > +*/
> > > > +   long (*p2p_map)(struct vm_area_struct *vma,
> > > > +   struct device *device,
> > > > +   unsigned long start,
> > > > +   unsigned long end,
> > > > +   dma_addr_t *pa,
> > > > +   bool write);
> > > > +   long (*p2p_unmap)(struct vm_area_struct *vma,
> > > > + struct device *device,
> > > > + unsigned long start,
> > > > + unsigned long end,
> > > > + dma_addr_t *pa);
> > > 
> > > I don't understand why we need new p2p_[un]map function pointers for
> > > this. In subsequent patches, they never appear to be set anywhere and
> > > are only called by the HMM code. I'd have expected it to be called by
> > > some core VMA code and set by HMM as that's what vm_operations_struct is
> > > for.
> > > 
> > > But the code is all very confusing, hard to follow and seems to be
> > > missing significant chunks. So I'm not really sure what is going on.
> > 
> > It is set by the device driver when userspace does mmap(fd), where fd comes
> > from open("/dev/somedevicefile"). So it is set by the device driver. HMM
> > has nothing to do with this. It must be set by the device driver's mmap
> > callback (the mmap callback of struct file_operations). For this patch
> > you can completely ignore all the HMM patches. Maybe posting this as
> > 2 separate patchsets would make it clearer.
> > 
> > For instance see [1] for how a non-HMM driver can export its memory
> > by just setting those callbacks. Note that a proper implementation of
> > this should also include some kind of driver policy on what to allow
> > to map and what not to allow ... All this is driver specific anyway.
> 
> I'm imagining that the RDMA drivers would use this interface on their
> per-process 'doorbell' BAR pages - we also wish to have P2P DMA to
> this memory. Also the entire VFIO PCI BAR mmap would be good to cover
> with this too.

Correct, you would set those callbacks on the mmap of your doorbell.
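
A sketch of what that could look like for an RDMA-style doorbell mmap
(names are illustrative; rdma_doorbell_vm_ops would carry the
p2p_map/p2p_unmap callbacks from this patchset, and doorbell_pfn() is a
made-up helper):

static int rdma_mmap(struct file *filp, struct vm_area_struct *vma)
{
        vma->vm_ops = &rdma_doorbell_vm_ops;
        vma->vm_private_data = filp->private_data;

        /* Also map the per-process doorbell BAR page for the CPU. */
        return io_remap_pfn_range(vma, vma->vm_start, doorbell_pfn(filp),
                                  vma->vm_end - vma->vm_start,
                                  pgprot_noncached(vma->vm_page_prot));
}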

> 
> Jerome, I think it would be nice to have a helper scheme - I think the
> simple case would be simple remapping of PCI BAR memory, so if we
> could have, say something like:
> 
> static const struct vm_operations_struct my_ops {
>   .p2p_map = p2p_ioremap_map_op,
>   .p2p_unmap = p2p_ioremap_unmap_op,
> }
> 
> struct ioremap_data {
>   [..]
> }
> 
> fops_mmap() {
>vma->private_data = _priv->ioremap_data;
>return p2p_ioremap_device_memory(vma, exporting_device, [..]);
> }
> 
> Which closely matches at least what the RDMA drivers do. Where
> p2p_ioremap_device_memory populates p2p_map and p2p_unmap pointers
> with sensible functions, etc.
> 
> It looks like vfio would be able to use this as well (though I am
> unsure why vfio uses remap_pfn_range instead of io_remap_pfn range for
> BAR memory..)

Yes, a simple helper that implements a sane default 

Re: [RFC PATCH 2/5] drivers/base: add a function to test peer to peer capability

2019-01-29 Thread Greg Kroah-Hartman
On Tue, Jan 29, 2019 at 12:47:25PM -0500, jgli...@redhat.com wrote:
> From: Jérôme Glisse 
> 
> device_test_p2p() return true if two devices can peer to peer to
> each other. We add a generic function as different inter-connect
> can support peer to peer and we want to genericaly test this no
> matter what the inter-connect might be. However this version only
> support PCIE for now.

There is no definition of "peer to peer" in the driver/device model, so
why should this be in the driver core at all?

Especially as you only do this for PCI, why not just keep it in the PCI
layer, that way you _know_ you are dealing with the right pointer types
and there is no need to mess around with the driver core at all.

thanks,

greg k-h


Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Greg Kroah-Hartman
On Tue, Jan 29, 2019 at 11:24:09AM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> > +bool pci_test_p2p(struct device *devA, struct device *devB)
> > +{
> > +   struct pci_dev *pciA, *pciB;
> > +   bool ret;
> > +   int tmp;
> > +
> > +   /*
> > +* For now we only support PCIE peer to peer but other inter-connect
> > +* can be added.
> > +*/
> > +   pciA = find_parent_pci_dev(devA);
> > +   pciB = find_parent_pci_dev(devB);
> > +   if (pciA == NULL || pciB == NULL) {
> > +   ret = false;
> > +   goto out;
> > +   }
> > +
> > +   tmp = upstream_bridge_distance(pciA, pciB, NULL);
> > +   ret = tmp < 0 ? false : true;
> > +
> > +out:
> > +   pci_dev_put(pciB);
> > +   pci_dev_put(pciA);
> > +   return false;
> > +}
> > +EXPORT_SYMBOL_GPL(pci_test_p2p);
> 
> This function only ever returns false

I guess it was never actually tested :(

I feel really worried about passing random 'struct device' pointers into
the PCI layer.  Are we _sure_ it can handle this properly?

thanks,

greg k-h


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 12:24:04PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 12:11 p.m., Jerome Glisse wrote:
> > On Tue, Jan 29, 2019 at 11:36:29AM -0700, Logan Gunthorpe wrote:
> >>
> >>
> >> On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> >>
> >>> + /*
> >>> +  * Optional for device drivers that want to allow peer to peer (p2p)
> >>> +  * mapping of their vma (which can be backed by some device memory) to
> >>> +  * another device.
> >>> +  *
> >>> +  * Note that the exporting device driver might not have mapped anything
> >>> +  * inside the vma for the CPU but might still want to allow a peer
> >>> +  * device to access the range of memory corresponding to a range in
> >>> +  * that vma.
> >>> +  *
> >>> +  * FOR PREDICTABILITY: IF A DRIVER SUCCESSFULLY MAPS A RANGE ONCE FOR A
> >>> +  * DEVICE, THEN FURTHER MAPPINGS OF THE SAME RANGE (IF THE VMA IS STILL
> >>> +  * VALID) SHOULD ALSO SUCCEED. Following this rule allows the importing
> >>> +  * device to map once during setup and report any failure at that time
> >>> +  * to userspace. Further mappings of the same range might happen
> >>> +  * after mmu notifier invalidation over the range. The exporting device
> >>> +  * can use this to move things around (defrag BAR space for instance)
> >>> +  * or do other similar tasks.
> >>> +  *
> >>> +  * THE IMPORTER MUST OBEY mmu_notifier NOTIFICATIONS AND CALL p2p_unmap()
> >>> +  * WHEN A NOTIFIER IS CALLED FOR THE RANGE! THIS CAN HAPPEN AT ANY
> >>> +  * POINT IN TIME WITH NO LOCK HELD.
> >>> +  *
> >>> +  * In the functions below, the device argument is the importing device;
> >>> +  * the exporting device is the device to which the vma belongs.
> >>> +  */
> >>> + long (*p2p_map)(struct vm_area_struct *vma,
> >>> + struct device *device,
> >>> + unsigned long start,
> >>> + unsigned long end,
> >>> + dma_addr_t *pa,
> >>> + bool write);
> >>> + long (*p2p_unmap)(struct vm_area_struct *vma,
> >>> +   struct device *device,
> >>> +   unsigned long start,
> >>> +   unsigned long end,
> >>> +   dma_addr_t *pa);
> >>
> >> I don't understand why we need new p2p_[un]map function pointers for
> >> this. In subsequent patches, they never appear to be set anywhere and
> >> are only called by the HMM code. I'd have expected it to be called by
> >> some core VMA code and set by HMM as that's what vm_operations_struct is
> >> for.
> >>
> >> But the code is all very confusing, hard to follow, and seems to be
> >> missing significant chunks. So I'm not really sure what is going on.
> > 
> > It is set by the device driver when userspace does mmap(fd) where fd comes
> > from open("/dev/somedevicefile"). So it is set by the device driver. HMM
> > has nothing to do with this. It must be set by the device driver's mmap
> > callback (the mmap callback of struct file_operations). For this patch
> > you can completely ignore all the HMM patches. Maybe posting this as
> > two separate patchsets would make it clearer.
> > 
> > For instance see [1] for how a non-HMM driver can export its memory
> > by just setting those callbacks. Note that a proper implementation of
> > this should also include some kind of driver policy on what to allow
> > to map and what not to allow ... All this is driver specific in any
> > case.
> 
> I'd suggest [1] should be a part of the patchset so we can actually see
> a user of the stuff you're adding.

I did not want to clutter the patchset with device driver specific usage
of this, as the API can be reasoned about in an abstract way.

> 
> But it still doesn't explain everything as without the HMM code nothing
> calls the new vm_ops. And there are still no callers for the p2p_test
> functions you added. And I still don't understand why we need the new
> vm_ops or who calls them and when. Why can't drivers use the existing
> 'fault' vm_op and call a new helper function to map p2p when appropriate
> or a different helper function to map a large range in its mmap
> operation? Just like regular mmap code...

HMM code is just one user; if you have a driver that uses HMM mirror
then your driver gets support for this for free. If you do not want to
use HMM then you can directly call this in your driver.

The flow is: a device driver wants to set up some mapping for a range of
virtual addresses [va_start, va_end]:
1 - Look up the vma for the range
2 - If the vma is a regular vma (not an mmap of a file) then use GUP;
if the vma is an mmap of a file and it has a p2p_map callback,
then call p2p_map()
3 - Use either the result of GUP or of p2p_map() to program the
device (a rough sketch follows below)
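
A minimal sketch of that flow from the importing driver's side (hypothetical
code assuming the p2p_map callback described above; error handling trimmed):

static long importer_map_range(struct mm_struct *mm, struct device *device,
                               unsigned long va_start, unsigned long va_end,
                               dma_addr_t *pas, bool write)
{
    struct vm_area_struct *vma;
    long ret;

    down_read(&mm->mmap_sem);
    vma = find_vma(mm, va_start);
    if (!vma || vma->vm_start > va_start || va_end > vma->vm_end) {
        up_read(&mm->mmap_sem);
        return -EFAULT;
    }

    if (vma->vm_ops && vma->vm_ops->p2p_map) {
        /* Special device vma: ask the exporting driver to map it. */
        ret = vma->vm_ops->p2p_map(vma, device, va_start, va_end,
                                   pas, write);
    } else {
        /* Regular vma: use get_user_pages() plus the DMA API instead. */
        ret = -EOPNOTSUPP;      /* GUP path would go here */
    }
    up_read(&mm->mmap_sem);
    return ret;
}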

The p2p test function is used by device drivers implementing the
callback; for instance see:

https://cgit.freedesktop.org/~glisse/linux/commit/?h=hmm-p2p=401a567696eafb1d4faf7054ab0d7c3a16a5ef06

The vm_fault callback is not suited because here we are mapping to
another device, so this will need special handling; someone must have
both 

Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 12:11 p.m., Jerome Glisse wrote:
> On Tue, Jan 29, 2019 at 11:36:29AM -0700, Logan Gunthorpe wrote:
>>
>>
>> On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
>>
>>> +   /*
>>> +* Optional for device drivers that want to allow peer to peer (p2p)
>>> +* mapping of their vma (which can be backed by some device memory) to
>>> +* another device.
>>> +*
>>> +* Note that the exporting device driver might not have mapped anything
>>> +* inside the vma for the CPU but might still want to allow a peer
>>> +* device to access the range of memory corresponding to a range in
>>> +* that vma.
>>> +*
>>> +* FOR PREDICTABILITY: IF A DRIVER SUCCESSFULLY MAPS A RANGE ONCE FOR A
>>> +* DEVICE, THEN FURTHER MAPPINGS OF THE SAME RANGE (IF THE VMA IS STILL
>>> +* VALID) SHOULD ALSO SUCCEED. Following this rule allows the importing
>>> +* device to map once during setup and report any failure at that time
>>> +* to userspace. Further mappings of the same range might happen
>>> +* after mmu notifier invalidation over the range. The exporting device
>>> +* can use this to move things around (defrag BAR space for instance)
>>> +* or do other similar tasks.
>>> +*
>>> +* THE IMPORTER MUST OBEY mmu_notifier NOTIFICATIONS AND CALL p2p_unmap()
>>> +* WHEN A NOTIFIER IS CALLED FOR THE RANGE! THIS CAN HAPPEN AT ANY
>>> +* POINT IN TIME WITH NO LOCK HELD.
>>> +*
>>> +* In the functions below, the device argument is the importing device;
>>> +* the exporting device is the device to which the vma belongs.
>>> +*/
>>> +   long (*p2p_map)(struct vm_area_struct *vma,
>>> +   struct device *device,
>>> +   unsigned long start,
>>> +   unsigned long end,
>>> +   dma_addr_t *pa,
>>> +   bool write);
>>> +   long (*p2p_unmap)(struct vm_area_struct *vma,
>>> + struct device *device,
>>> + unsigned long start,
>>> + unsigned long end,
>>> + dma_addr_t *pa);
>>
>> I don't understand why we need new p2p_[un]map function pointers for
>> this. In subsequent patches, they never appear to be set anywhere and
>> are only called by the HMM code. I'd have expected it to be called by
>> some core VMA code and set by HMM as that's what vm_operations_struct is
>> for.
>>
>> But the code is all very confusing, hard to follow, and seems to be
>> missing significant chunks. So I'm not really sure what is going on.
> 
> It is set by the device driver when userspace does mmap(fd) where fd comes
> from open("/dev/somedevicefile"). So it is set by the device driver. HMM
> has nothing to do with this. It must be set by the device driver's mmap
> callback (the mmap callback of struct file_operations). For this patch
> you can completely ignore all the HMM patches. Maybe posting this as
> two separate patchsets would make it clearer.
> 
> For instance see [1] for how a non-HMM driver can export its memory
> by just setting those callbacks. Note that a proper implementation of
> this should also include some kind of driver policy on what to allow
> to map and what not to allow ... All this is driver specific in any
> case.

I'd suggest [1] should be a part of the patchset so we can actually see
a user of the stuff you're adding.

But it still doesn't explain everything as without the HMM code nothing
calls the new vm_ops. And there are still no callers for the p2p_test
functions you added. And I still don't understand why we need the new
vm_ops or who calls them and when. Why can't drivers use the existing
'fault' vm_op and call a new helper function to map p2p when appropriate
or a different helper function to map a large range in its mmap
operation? Just like regular mmap code...

Logan


Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Jerome Glisse
On Tue, Jan 29, 2019 at 11:36:29AM -0700, Logan Gunthorpe wrote:
> 
> 
> On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> 
> > +   /*
> > +* Optional for device drivers that want to allow peer to peer (p2p)
> > +* mapping of their vma (which can be backed by some device memory) to
> > +* another device.
> > +*
> > +* Note that the exporting device driver might not have mapped anything
> > +* inside the vma for the CPU but might still want to allow a peer
> > +* device to access the range of memory corresponding to a range in
> > +* that vma.
> > +*
> > +* FOR PREDICTABILITY: IF A DRIVER SUCCESSFULLY MAPS A RANGE ONCE FOR A
> > +* DEVICE, THEN FURTHER MAPPINGS OF THE SAME RANGE (IF THE VMA IS STILL
> > +* VALID) SHOULD ALSO SUCCEED. Following this rule allows the importing
> > +* device to map once during setup and report any failure at that time
> > +* to userspace. Further mappings of the same range might happen
> > +* after mmu notifier invalidation over the range. The exporting device
> > +* can use this to move things around (defrag BAR space for instance)
> > +* or do other similar tasks.
> > +*
> > +* THE IMPORTER MUST OBEY mmu_notifier NOTIFICATIONS AND CALL p2p_unmap()
> > +* WHEN A NOTIFIER IS CALLED FOR THE RANGE! THIS CAN HAPPEN AT ANY
> > +* POINT IN TIME WITH NO LOCK HELD.
> > +*
> > +* In the functions below, the device argument is the importing device;
> > +* the exporting device is the device to which the vma belongs.
> > +*/
> > +   long (*p2p_map)(struct vm_area_struct *vma,
> > +   struct device *device,
> > +   unsigned long start,
> > +   unsigned long end,
> > +   dma_addr_t *pa,
> > +   bool write);
> > +   long (*p2p_unmap)(struct vm_area_struct *vma,
> > + struct device *device,
> > + unsigned long start,
> > + unsigned long end,
> > + dma_addr_t *pa);
> 
> I don't understand why we need new p2p_[un]map function pointers for
> this. In subsequent patches, they never appear to be set anywhere and
> are only called by the HMM code. I'd have expected it to be called by
> some core VMA code and set by HMM as that's what vm_operations_struct is
> for.
> 
> But the code is all very confusing, hard to follow, and seems to be
> missing significant chunks. So I'm not really sure what is going on.

It is set by the device driver when userspace does mmap(fd) where fd comes
from open("/dev/somedevicefile"). So it is set by the device driver. HMM
has nothing to do with this. It must be set by the device driver's mmap
callback (the mmap callback of struct file_operations). For this patch
you can completely ignore all the HMM patches. Maybe posting this as
two separate patchsets would make it clearer.

For instance see [1] for how a non-HMM driver can export its memory
by just setting those callbacks. Note that a proper implementation of
this should also include some kind of driver policy on what to allow
to map and what not to allow ... All this is driver specific in any
case.
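
Concretely, an exporting driver would wire the callbacks up in its mmap
file operation, along these lines (hypothetical driver code; the mydev_*
names are placeholders):

static const struct vm_operations_struct mydev_p2p_vm_ops = {
    .p2p_map   = mydev_p2p_map,     /* driver policy is enforced here */
    .p2p_unmap = mydev_p2p_unmap,
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
    struct mydev *mydev = file->private_data;

    vma->vm_private_data = mydev;
    vma->vm_ops = &mydev_p2p_vm_ops;
    /* Optionally also establish a CPU mapping of the same memory. */
    return 0;
}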

Cheers,
Jérôme

[1] 
https://cgit.freedesktop.org/~glisse/linux/commit/?h=wip-p2p-showcase=964214dcd4df96f296e2214042e8cfce135ae3d4


Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-01-29 Thread Michael S. Tsirkin
On Tue, Jan 29, 2019 at 03:42:44PM -0200, Thiago Jung Bauermann wrote:
> 
> Fixing address of powerpc mailing list.
> 
> Thiago Jung Bauermann  writes:
> 
> > Hello,
> >
> > With Christoph's rework of the DMA API that recently landed, the patch
> > below is the only change needed in virtio to make it work in a POWER
> > secure guest under the ultravisor.
> >
> > The other change we need (making sure the device's dma_map_ops is NULL
> > so that the dma-direct/swiotlb code is used) can be made in
> > powerpc-specific code.
> >
> > Of course, I also have patches (soon to be posted as RFC) which hook up
> >  to the powerpc secure guest support code.
> >
> > What do you think?
> >
> > From d0629a36a75c678b4a72b853f8f7f8c17eedd6b3 Mon Sep 17 00:00:00 2001
> > From: Thiago Jung Bauermann 
> > Date: Thu, 24 Jan 2019 22:08:02 -0200
> > Subject: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted
> >
> > The host can't access the guest memory when it's encrypted, so using
> > regular memory pages for the ring isn't an option. Go through the DMA API.
> >
> > Signed-off-by: Thiago Jung Bauermann 

Well I think this will come back to bite us (witness xen which is now
reworking precisely this path - but at least they aren't to blame, xen
came before ACCESS_PLATFORM).

I also still think the right thing would have been to set
ACCESS_PLATFORM for all systems where the device can't access all memory.

But I also think I don't have the energy to argue about power secure
guest anymore.  So be it for power secure guest since the involved
engineers disagree with me.  Hey I've been wrong in the past ;).

But the name "sev_active" makes me scared, because at least the AMD guys,
who were doing the sensible thing and setting ACCESS_PLATFORM (unless I'm
wrong? I remember distinctly that's so), will likely be affected too.
We don't want that.

So let's find a way to make sure it's just power secure guest for now
pls.

I also think we should add a dma_api field near features under virtio_device
such that these hacks can move off the data path.
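
A sketch of that idea (hypothetical field, not an existing one):

struct virtio_device {
    /* ... existing fields ... */
    bool use_dma_api;       /* decided once at probe time */
};

static bool vring_use_dma_api(struct virtio_device *vdev)
{
    /* No per-buffer platform checks on the data path. */
    return vdev->use_dma_api;
}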

By the way could you please respond about virtio-iommu and
why there's no support for ACCESS_PLATFORM on power?

I have Cc'd you on these discussions.


Thanks!


> > ---
> >  drivers/virtio/virtio_ring.c | 5 -
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index cd7e755484e3..321a27075380 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -259,8 +259,11 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
> >  * not work without an even larger kludge.  Instead, enable
> >  * the DMA API if we're a Xen guest, which at least allows
> >  * all of the sensible Xen configurations to work correctly.
> > +*
> > +* Also, if guest memory is encrypted the host can't access
> > +* it directly. In this case, we'll need to use the DMA API.
> >  */
> > -   if (xen_domain())
> > +   if (xen_domain() || sev_active())
> > return true;
> >
> > return false;
> 
> 
> -- 
> Thiago Jung Bauermann
> IBM Linux Technology Center


Re: [PATCH v7 0/7] Add virtio-iommu driver

2019-01-29 Thread Michael S. Tsirkin
On Mon, Jan 21, 2019 at 11:29:05AM +0000, Jean-Philippe Brucker wrote:
> Hi,
> 
> On 18/01/2019 15:51, Michael S. Tsirkin wrote:
> > 
> > On Tue, Jan 15, 2019 at 12:19:52PM +, Jean-Philippe Brucker wrote:
> >> Implement the virtio-iommu driver, following specification v0.9 [1].
> >>
> >> This is a simple rebase onto Linux v5.0-rc2. We now use the
> >> dev_iommu_fwspec_get() helper introduced in v5.0 instead of accessing
> >> dev->iommu_fwspec, but there aren't any functional change from v6 [2].
> >>
> >> Our current goal for virtio-iommu is to get a paravirtual IOMMU working
> >> on Arm, and enable device assignment to guest userspace. In this
> >> use-case the mappings are static, and don't require optimal performance,
> >> so this series tries to keep things simple. However there is plenty more
> >> to do for features and optimizations, and having this base in v5.1 would
> >> be good. Given that most of the changes are to drivers/iommu, I believe
> >> the driver and future changes should go via the IOMMU tree.
> >>
> >> You can find Linux driver and kvmtool device on v0.9.2 branches [3],
> >> module and x86 support on virtio-iommu/devel. Also tested with Eric's
> >> QEMU device [4]. Please note that the series depends on Robin's
> >> probe-deferral fix [5], which will hopefully land in v5.0.
> >>
> >> [1] Virtio-iommu specification v0.9, sources and pdf
> >> git://linux-arm.org/virtio-iommu.git virtio-iommu/v0.9
> >> http://jpbrucker.net/virtio-iommu/spec/v0.9/virtio-iommu-v0.9.pdf
> >>
> >> [2] [PATCH v6 0/7] Add virtio-iommu driver
> >> 
> >> https://lists.linuxfoundation.org/pipermail/iommu/2018-December/032127.html
> >>
> >> [3] git://linux-arm.org/linux-jpb.git virtio-iommu/v0.9.2
> >> git://linux-arm.org/kvmtool-jpb.git virtio-iommu/v0.9.2
> >>
> >> [4] [RFC v9 00/17] VIRTIO-IOMMU device
> >> https://www.mail-archive.com/qemu-devel@nongnu.org/msg575578.html
> >>
> >> [5] [PATCH] iommu/of: Fix probe-deferral
> >> https://www.spinics.net/lists/arm-kernel/msg698371.html
> > 
> > Thanks for the work!
> > So really my only issue with this is that there's no
> > way for the IOMMU to describe the devices that it
> > covers.
> > 
> > As a result that is then done in a platform-specific way.
> > 
> > And this means that for example it does not solve the problem that e.g.
> > some power people have in that their platform simply does not have a way
> > to specify which devices are covered by the IOMMU.
> 
> Isn't power using device tree? I haven't looked much at power because I
> was told a while ago that they already paravirtualize their IOMMU and
> don't need virtio-iommu, except perhaps for some legacy platforms. Or
> something along those lines. But I would certainly be interested in
> enabling the IOMMU for more architectures.

I have CC'd the relevant ppc developers, let's see what do they think.


> As for the enumeration problem, I still don't think we can get much
> better than DT and ACPI as solutions (and IMO they are necessary to make
> this device portable). But I believe that getting DT and ACPI support is
> just a one-off inconvenience. That is, once the required bindings are
> accepted, any future extension can then be done at the virtio level with
> feature bits and probe requests, without having to update ACPI or DT.
> 
> Thanks,
> Jean
> 
> > Solving that problem would make me much more excited about
> > this device.
> > 
> > On the other hand I can see that while there have been some
> > developments most of the code has been stable for quite a while now.
> > 
> > So what I am trying to do right about now, is making a small module that
> > loads early and pokes at the IOMMU sufficiently to get the data about
> > which devices use the IOMMU out of it using standard virtio config
> > space.  IIUC it's claimed to be impossible without messy changes to the
> > boot sequence.
> > 
> > If I succeed at least on some platforms I'll ask that this design is
> > worked into this device, minimizing info that goes through DT/ACPI.  If
> > I see I can't make it in time to meet the next merge window, I plan
> > merging the existing patches using DT (barring surprises).
> > 
> > As I only have a very small amount of time to spend on this attempt, If
> > someone else wants to try doing that in parallel, that would be great!
> > 
> > 
> >> Jean-Philippe Brucker (7):
> >>   dt-bindings: virtio-mmio: Add IOMMU description
> >>   dt-bindings: virtio: Add virtio-pci-iommu node
> >>   of: Allow the iommu-map property to omit untranslated devices
> >>   PCI: OF: Initialize dev->fwnode appropriately
> >>   iommu: Add virtio-iommu driver
> >>   iommu/virtio: Add probe request
> >>   iommu/virtio: Add event queue
> >>
> >>  .../devicetree/bindings/virtio/iommu.txt  |   66 +
> >>  .../devicetree/bindings/virtio/mmio.txt   |   30 +
> >>  MAINTAINERS   |7 +
> >>  drivers/iommu/Kconfig |   11 +
> >>  drivers/iommu/Makefile   

Re: [RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:

> + /*
> +  * Optional for device drivers that want to allow peer to peer (p2p)
> +  * mapping of their vma (which can be backed by some device memory) to
> +  * another device.
> +  *
> +  * Note that the exporting device driver might not have mapped anything
> +  * inside the vma for the CPU but might still want to allow a peer
> +  * device to access the range of memory corresponding to a range in
> +  * that vma.
> +  *
> +  * FOR PREDICTABILITY: IF A DRIVER SUCCESSFULLY MAPS A RANGE ONCE FOR A
> +  * DEVICE, THEN FURTHER MAPPINGS OF THE SAME RANGE (IF THE VMA IS STILL
> +  * VALID) SHOULD ALSO SUCCEED. Following this rule allows the importing
> +  * device to map once during setup and report any failure at that time
> +  * to userspace. Further mappings of the same range might happen
> +  * after mmu notifier invalidation over the range. The exporting device
> +  * can use this to move things around (defrag BAR space for instance)
> +  * or do other similar tasks.
> +  *
> +  * THE IMPORTER MUST OBEY mmu_notifier NOTIFICATIONS AND CALL p2p_unmap()
> +  * WHEN A NOTIFIER IS CALLED FOR THE RANGE! THIS CAN HAPPEN AT ANY
> +  * POINT IN TIME WITH NO LOCK HELD.
> +  *
> +  * In the functions below, the device argument is the importing device;
> +  * the exporting device is the device to which the vma belongs.
> +  */
> + long (*p2p_map)(struct vm_area_struct *vma,
> + struct device *device,
> + unsigned long start,
> + unsigned long end,
> + dma_addr_t *pa,
> + bool write);
> + long (*p2p_unmap)(struct vm_area_struct *vma,
> +   struct device *device,
> +   unsigned long start,
> +   unsigned long end,
> +   dma_addr_t *pa);

I don't understand why we need new p2p_[un]map function pointers for
this. In subsequent patches, they never appear to be set anywhere and
are only called by the HMM code. I'd have expected it to be called by
some core VMA code and set by HMM as that's what vm_operations_struct is
for.

But the code is all very confusing, hard to follow, and seems to be
missing significant chunks. So I'm not really sure what is going on.

Logan


Re: [RFC PATCH 2/5] drivers/base: add a function to test peer to peer capability

2019-01-29 Thread Logan Gunthorpe


On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> From: Jérôme Glisse 
> 
> device_test_p2p() returns true if two devices can peer to peer to
> each other. We add a generic function as different interconnects
> can support peer to peer and we want to generically test this no
> matter what the interconnect might be. However, this version only
> supports PCIe for now.

This doesn't appear to be used in any of the further patches; so it's
very confusing.

I'm not sure a struct device wrapper is really necessary...

Logan

Re: [RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread Logan Gunthorpe



On 2019-01-29 10:47 a.m., jgli...@redhat.com wrote:
> +bool pci_test_p2p(struct device *devA, struct device *devB)
> +{
> + struct pci_dev *pciA, *pciB;
> + bool ret;
> + int tmp;
> +
> + /*
> +  * For now we only support PCIE peer to peer but other inter-connect
> +  * can be added.
> +  */
> + pciA = find_parent_pci_dev(devA);
> + pciB = find_parent_pci_dev(devB);
> + if (pciA == NULL || pciB == NULL) {
> + ret = false;
> + goto out;
> + }
> +
> + tmp = upstream_bridge_distance(pciA, pciB, NULL);
> + ret = tmp < 0 ? false : true;
> +
> +out:
> + pci_dev_put(pciB);
> + pci_dev_put(pciA);
> + return false;
> +}
> +EXPORT_SYMBOL_GPL(pci_test_p2p);

This function only ever returns false
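
Presumably the intended tail of the function was (a one-line fix):

out:
    pci_dev_put(pciB);
    pci_dev_put(pciA);
    return ret;     /* rather than the unconditional false */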

Logan


[RFC PATCH 4/5] mm/hmm: add support for peer to peer to HMM device memory

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 include/linux/hmm.h | 47 +
 mm/hmm.c| 63 +
 2 files changed, 105 insertions(+), 5 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4a1454e3efba..7a3ac182cc48 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -710,6 +710,53 @@ struct hmm_devmem_ops {
 const struct page *page,
 unsigned int flags,
 pmd_t *pmdp);
+
+   /*
+* p2p_map() - map pages for peer to peer between devices
+* @devmem: device memory structure (see struct hmm_devmem)
+* @range: range of virtual addresses that is being mapped
+* @device: device the range is being mapped to
+* @addr: first virtual address in the range to consider
+* @pa: device address (where the actual mapping is stored)
+* Returns: number of pages successfully mapped, 0 otherwise
+*
+* Map pages belonging to devmem to another device for peer to peer
+* access. The device can decide not to map, in which case the memory
+* will be migrated to main memory.
+*
+* Also there is no guarantee that all the pages in the range belong
+* to the devmem, so it is up to the function to check that every
+* single page does belong to devmem.
+*
+* Note that for now we do not care about the exact error, so on
+* failure the function should just return 0.
+*/
+   long (*p2p_map)(struct hmm_devmem *devmem,
+   struct hmm_range *range,
+   struct device *device,
+   unsigned long addr,
+   dma_addr_t *pas);
+
+   /*
+* p2p_unmap() - unmap pages from peer to peer between devices
+* @devmem: device memory structure (see struct hmm_devmem)
+* @range: range of virtual addresses that is being mapped
+* @device: device the range is being mapped to
+* @addr: first virtual address in the range to consider
+* @pa: device address (where the actual mapping is stored)
+* Returns: number of pages successfully unmapped, 0 otherwise
+*
+* Unmap pages belonging to devmem previously mapped with p2p_map().
+*
+* Note there is no guarantee that all the pages in the range belong
+* to the devmem, so it is up to the function to check that every
+* single page does belong to devmem.
+*/
+   unsigned long (*p2p_unmap)(struct hmm_devmem *devmem,
+  struct hmm_range *range,
+  struct device *device,
+  unsigned long addr,
+  dma_addr_t *pas);
 };
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index 1a444885404e..fd49b1e116d0 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1193,16 +1193,19 @@ long hmm_range_dma_map(struct hmm_range *range,
   dma_addr_t *daddrs,
   bool block)
 {
-   unsigned long i, npages, mapped, page_size;
+   unsigned long i, npages, mapped, page_size, addr;
long ret;
 
+again:
ret = hmm_range_fault(range, block);
if (ret <= 0)
return ret ? ret : -EBUSY;
 
+   mapped = 0;
+   addr = range->start;
page_size = hmm_range_page_size(range);
npages = (range->end - range->start) >> range->page_shift;
-   for (i = 0, mapped = 0; i < npages; ++i) {
+   for (i = 0; i < npages; ++i, addr += page_size) {
enum dma_data_direction dir = DMA_FROM_DEVICE;
struct page *page;
 
@@ -1226,6 +1229,29 @@ long hmm_range_dma_map(struct hmm_range *range,
goto unmap;
}
 
+   if (is_device_private_page(page)) {
+   struct hmm_devmem *devmem = page->pgmap->data;
+
+   if (!devmem->ops->p2p_map || !devmem->ops->p2p_unmap) {
+   /* Fall-back to main memory. */
+   range->default_flags |=
+   range->flags[HMM_PFN_DEVICE_PRIVATE];
+   goto again;
+   }
+
+   ret = devmem->ops->p2p_map(devmem, range, device,
+  addr, daddrs);
+   if (ret <= 0) {
+   /* Fall-back to main memory. */
+   range->default_flags |=
+

[RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Allow mmap of a device file to export device memory to peer to peer
devices. This will allow, for instance, a network device to access
GPU memory or to access a storage device queue directly.

The common case will be a vma created by a userspace device driver
that is then shared with another userspace device driver, which calls
into its kernel device driver to map that vma.

The vma does not need to have any valid CPU mapping, so that only
peer to peer devices might access its content. Or it could have a
valid CPU mapping too; in that case it should point to the same
memory for consistency.

Note that peer to peer mapping is highly platform and device
dependent and it might not work in all cases. However we do
expect support for this to grow on more hardware platforms.

This patch only adds the new callbacks to vm_operations_struct; the
bulk of the code lives within common bus drivers (like pci) and
device drivers (both the exporting and importing device).

The current design mandates that the importer must obey mmu_notifier
and invalidate any peer to peer mapping anytime an invalidation
notification happens for a range that has been peer to peer mapped.
This allows the exporting device to easily invalidate mappings for
any importing device.

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-ker...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 include/linux/mm.h | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..1bd60a90e575 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -429,6 +429,44 @@ struct vm_operations_struct {
pgoff_t start_pgoff, pgoff_t end_pgoff);
unsigned long (*pagesize)(struct vm_area_struct * area);
 
+   /*
+* Optional for device drivers that want to allow peer to peer (p2p)
+* mapping of their vma (which can be backed by some device memory) to
+* another device.
+*
+* Note that the exporting device driver might not have mapped anything
+* inside the vma for the CPU but might still want to allow a peer
+* device to access the range of memory corresponding to a range in
+* that vma.
+*
+* FOR PREDICTABILITY: IF A DRIVER SUCCESSFULLY MAPS A RANGE ONCE FOR A
+* DEVICE, THEN FURTHER MAPPINGS OF THE SAME RANGE (IF THE VMA IS STILL
+* VALID) SHOULD ALSO SUCCEED. Following this rule allows the importing
+* device to map once during setup and report any failure at that time
+* to userspace. Further mappings of the same range might happen
+* after mmu notifier invalidation over the range. The exporting device
+* can use this to move things around (defrag BAR space for instance)
+* or do other similar tasks.
+*
+* THE IMPORTER MUST OBEY mmu_notifier NOTIFICATIONS AND CALL p2p_unmap()
+* WHEN A NOTIFIER IS CALLED FOR THE RANGE! THIS CAN HAPPEN AT ANY
+* POINT IN TIME WITH NO LOCK HELD.
+*
+* In the functions below, the device argument is the importing device;
+* the exporting device is the device to which the vma belongs.
+*/
+   long (*p2p_map)(struct vm_area_struct *vma,
+   struct device *device,
+   unsigned long start,
+   unsigned long end,
+   dma_addr_t *pa,
+   bool write);
+   long (*p2p_unmap)(struct vm_area_struct *vma,
+ struct device *device,
+ unsigned long start,
+ unsigned long end,
+ dma_addr_t *pa);
+
/* notification that a previously read-only page is about to become
 * writable, if an error is returned it will cause a SIGBUS */
vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
-- 
2.17.2
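
The importer-side obligation spelled out in the comment above (drop the
peer mapping whenever an mmu notifier fires for the range) would look
roughly like this (hypothetical importer code, signatures simplified;
the imp->vma/imp->dev/imp->pas fields are assumptions):

static void importer_invalidate_range(struct importer *imp,
                                      unsigned long start,
                                      unsigned long end)
{
    /* Quiesce device access first, then drop the peer mapping. */
    importer_stop_dma(imp, start, end);
    if (imp->vma->vm_ops->p2p_unmap)
        imp->vma->vm_ops->p2p_unmap(imp->vma, imp->dev,
                                    start, end, imp->pas);
}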


[RFC PATCH 5/5] mm/hmm: add support for peer to peer to special device vma

2019-01-29 Thread jglisse
From: Jérôme Glisse 

A special device vma (an mmap of a device file) can correspond to a
device driver object that some device driver might want to share with
another device (giving it access). This adds support for HMM to map
those special device vmas if the owning device (the exporter) allows it.

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 include/linux/hmm.h |   6 ++
 mm/hmm.c| 156 ++--
 2 files changed, 128 insertions(+), 34 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 7a3ac182cc48..98ebe9f52432 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -137,6 +137,7 @@ enum hmm_pfn_flag_e {
 *  result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
 *  be mirrored by a device, because the entry will never have HMM_PFN_VALID
 *  set and the pfn value is undefined.
+ * HMM_PFN_P2P: this entry has been mapped as P2P, ie the dma address is valid
  *
  * Driver provide entry value for none entry, error entry and special entry,
  * driver can alias (ie use same value for error and special for instance). It
@@ -151,6 +152,7 @@ enum hmm_pfn_value_e {
HMM_PFN_ERROR,
HMM_PFN_NONE,
HMM_PFN_SPECIAL,
+   HMM_PFN_P2P,
HMM_PFN_VALUE_MAX
 };
 
@@ -250,6 +252,8 @@ static inline bool hmm_range_valid(struct hmm_range *range)
 static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
   uint64_t pfn)
 {
+   if (pfn == range->values[HMM_PFN_P2P])
+   return NULL;
if (pfn == range->values[HMM_PFN_NONE])
return NULL;
if (pfn == range->values[HMM_PFN_ERROR])
@@ -270,6 +274,8 @@ static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
 static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
   uint64_t pfn)
 {
+   if (pfn == range->values[HMM_PFN_P2P])
+   return -1UL;
if (pfn == range->values[HMM_PFN_NONE])
return -1UL;
if (pfn == range->values[HMM_PFN_ERROR])
diff --git a/mm/hmm.c b/mm/hmm.c
index fd49b1e116d0..621a4f831483 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1058,37 +1058,36 @@ long hmm_range_snapshot(struct hmm_range *range)
 }
 EXPORT_SYMBOL(hmm_range_snapshot);
 
-/*
- * hmm_range_fault() - try to fault some address in a virtual address range
- * @range: range being faulted
- * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
- * Returns: 0 on success, otherwise:
- *  -EINVAL:
- *  Invalid argument
- *  -ENOMEM:
- *  Out of memory.
- *  -EPERM:
- *  Invalid permission (for instance asking for write and range
- *  is read only).
- *  -EAGAIN:
- *  If you need to retry and mmap_sem was dropped. This can only
- *  happen if the block argument is false.
- *  -EBUSY:
- *  If the the range is being invalidated and you should wait for
- *  invalidation to finish.
- *  -EFAULT:
- *  Invalid (ie either no valid vma or it is illegal to access that
- *  range), number of valid pages in range->pfns[] (from range start
- *  address).
- *
- * This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs
- * and caller does not ask for migration.
- *
- * On error, for one virtual address in the range, the function will mark the
- * corresponding HMM pfn entry with an error flag.
- */
-long hmm_range_fault(struct hmm_range *range, bool block)
+static int hmm_vma_p2p_map(struct hmm_range *range, struct vm_area_struct *vma,
+  unsigned long start, unsigned long end,
+  struct device *device, dma_addr_t *pas)
+{
+   struct hmm_vma_walk hmm_vma_walk;
+   unsigned long npages, i;
+   bool fault, write;
+   uint64_t *pfns;
+   int ret;
+
+   i = (start - range->start) >> PAGE_SHIFT;
+   npages = (end - start) >> PAGE_SHIFT;
+   pfns = &range->pfns[i];
+   pas = &pas[i];
+
+   hmm_vma_walk.range = range;
+   hmm_vma_walk.fault = true;
+   hmm_range_need_fault(&hmm_vma_walk, pfns, npages,
+   0, &fault, &write);
+
+   ret = vma->vm_ops->p2p_map(vma, device, start, end, pas, write);
+   for (i = 0; i < npages; ++i) {
+   pfns[i] = ret ? range->values[HMM_PFN_ERROR] :
+ range->values[HMM_PFN_P2P];
+   }
+   return ret;
+}
+
+static long 

Re: [RFC v3 02/21] iommu: Introduce cache_invalidate API

2019-01-29 Thread Auger Eric
Hi Jean-Philippe,

On 1/28/19 6:32 PM, Jean-Philippe Brucker wrote:
> Hi Eric,
> 
> On 25/01/2019 16:49, Auger Eric wrote:
> [...]
 diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
 index 7a7cf7a3de7c..4605f5cfac84 100644
 --- a/include/uapi/linux/iommu.h
 +++ b/include/uapi/linux/iommu.h
 @@ -47,4 +47,99 @@ struct iommu_pasid_table_config {
};
  };
  
 +/**
 + * enum iommu_inv_granularity - Generic invalidation granularity
+ * @IOMMU_INV_GRANU_DOMAIN_ALL_PASID: TLB entries or PASID caches of all
+ *PASIDs associated with a domain ID
+ * @IOMMU_INV_GRANU_PASID_SEL:TLB entries or PASID cache associated
+ *with a PASID and a domain
+ * @IOMMU_INV_GRANU_PAGE_PASID:   TLB entries of selected page range
+ *within a PASID
+ *
+ * When an invalidation request is passed down to IOMMU to flush translation
+ * caches, it may carry different granularity levels, which can be specific
+ * to certain types of translation caches.
+ * This enum is a collection of granularities for all types of translation
+ * caches. The idea is to make it easy for an IOMMU model specific driver to
+ * convert from generic to model specific values. Each IOMMU driver
+ * can enforce checks based on its own conversion table. The conversion is
 + * based on 2D look-up with inputs as follows:
 + * - translation cache types
 + * - granularity
 + *
 + * type |   DTLB|TLB|   PASID   |
 + *  granule |   |   |   cache   |
 + * -+---+---+---+
 + *  DN_ALL_PASID|   Y   |   Y   |   Y   |
 + *  PASID_SEL   |   Y   |   Y   |   Y   |
 + *  PAGE_PASID  |   Y   |   Y   |   N/A |
 + *
 + */
 +enum iommu_inv_granularity {
 +  IOMMU_INV_GRANU_DOMAIN_ALL_PASID,
 +  IOMMU_INV_GRANU_PASID_SEL,
 +  IOMMU_INV_GRANU_PAGE_PASID,
 +  IOMMU_INV_NR_GRANU,
 +};
 +
 +/**
 + * enum iommu_inv_type - Generic translation cache types for invalidation
 + *
 + * @IOMMU_INV_TYPE_DTLB:  device IOTLB
 + * @IOMMU_INV_TYPE_TLB:   IOMMU paging structure cache
 + * @IOMMU_INV_TYPE_PASID: PASID cache
 + * Invalidation requests sent to IOMMU for a given device need to indicate
+ * which type of translation cache is to be operated on. Combined with enum
 + * iommu_inv_granularity, model specific driver can do a simple lookup to
 + * convert from generic to model specific value.
 + */
 +enum iommu_inv_type {
 +  IOMMU_INV_TYPE_DTLB,
 +  IOMMU_INV_TYPE_TLB,
 +  IOMMU_INV_TYPE_PASID,
 +  IOMMU_INV_NR_TYPE
 +};
 +
 +/**
+ * Translation cache invalidation header that contains mandatory meta data.
+ * @version:  info format version, expecting future extensions
 + * @type: type of translation cache to be invalidated
 + */
 +struct iommu_cache_invalidate_hdr {
 +  __u32 version;
 +#define TLB_INV_HDR_VERSION_1 1
 +  enum iommu_inv_type type;
 +};
 +
 +/**
 + * Translation cache invalidation information, contains generic IOMMU
 + * data which can be parsed based on model ID by model specific drivers.
 + * Since the invalidation of second level page tables are included in the
 + * unmap operation, this info is only applicable to the first level
 + * translation caches, i.e. DMA request with PASID.
 + *
 + * @granularity:  requested invalidation granularity, type dependent
 + * @size: 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
>>>
>>> Why is this a 4K page centric interface?
>> This matches the vt-d Address Mask (AM) field of the IOTLB Invalidate
>> Descriptor. We can pass a log2size instead.
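
For reference, the span described by the current encoding works out as:

    /* @size is log2 of the number of 4K pages covered */
    unsigned long span = 4096UL << size;    /* 0 -> 4K, 9 -> 2MB */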
>>>
 + * @nr_pages: number of pages to invalidate
 + * @pasid:processor address space ID value per PCI spec.
+ * @arch_id:  architecture dependent id characterizing a context
+ *and tagging the caches, ie. domain identifier on VT-d,
+ *asid on ARM SMMU
 + * @addr: page address to be invalidated
 + * @flags IOMMU_INVALIDATE_ADDR_LEAF: leaf paging entries
 + *IOMMU_INVALIDATE_GLOBAL_PAGE: global pages
>>>
>>> Shouldn't some of these be tied to the granularity of the
>>> invalidation?  It seems like this should be more similar to
>>> iommu_pasid_table_config where the granularity of the invalidation
>>> defines which entry within a union at the end of the structure is valid
>>> and populated.  Otherwise we have fields that don't make sense for
>>> 

[RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread jglisse
From: Jérôme Glisse 

device_test_p2p() returns true if two devices can peer to peer to
each other. We add a generic function as different interconnects
can support peer to peer and we want to generically test this no
matter what the interconnect might be. However, this version only
supports PCIe for now.

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-ker...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/pci/p2pdma.c   | 27 +++
 include/linux/pci-p2pdma.h |  6 ++
 2 files changed, 33 insertions(+)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index c52298d76e64..620ac60babb5 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -797,3 +797,30 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
return sprintf(page, "%s\n", pci_name(p2p_dev));
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
+
+bool pci_test_p2p(struct device *devA, struct device *devB)
+{
+   struct pci_dev *pciA, *pciB;
+   bool ret;
+   int tmp;
+
+   /*
+* For now we only support PCIE peer to peer but other inter-connect
+* can be added.
+*/
+   pciA = find_parent_pci_dev(devA);
+   pciB = find_parent_pci_dev(devB);
+   if (pciA == NULL || pciB == NULL) {
+   ret = false;
+   goto out;
+   }
+
+   tmp = upstream_bridge_distance(pciA, pciB, NULL);
+   ret = tmp < 0 ? false : true;
+
+out:
+   pci_dev_put(pciB);
+   pci_dev_put(pciA);
+   return false;
+}
+EXPORT_SYMBOL_GPL(pci_test_p2p);
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index bca9bc3e5be7..7671cc499a08 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -36,6 +36,7 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
   bool use_p2pdma);
+bool pci_test_p2p(struct device *devA, struct device *devB);
 #else /* CONFIG_PCI_P2PDMA */
 static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
size_t size, u64 offset)
@@ -97,6 +98,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
 {
return sprintf(page, "none\n");
 }
+
+static inline bool pci_test_p2p(struct device *devA, struct device *devB)
+{
+   return false;
+}
 #endif /* CONFIG_PCI_P2PDMA */
 
 
-- 
2.17.2


[RFC PATCH 0/5] Device peer to peer (p2p) through vma

2019-01-29 Thread jglisse
From: Jérôme Glisse 

This patchset adds support for peer to peer between devices in two manners.
First, for device memory used through HMM in a process's regular address
space (ie inside a regular vma that is not an mmap of a device file or
special file). Second, for special vmas, ie mmaps of a device file; in this
case some device drivers might want to allow other devices to directly
access memory used for those special vmas (note that the memory might not
even be mapped to the CPU in this case).

There are many use cases for this; they mainly fall into 2 categories:
[A]-Allow a device to directly map and control another device's command
queue.

[B]-Allow a device to access another device's memory without disrupting
the other device's computation.

Corresponding workloads:

[1]-A network device directly accesses and controls a block device
command queue so that it can do storage access without involving the
CPU. This falls into [A].
[2]-An accelerator device is doing heavy computation and a network device
is monitoring progress. Direct access to the accelerator's memory by
the network device avoids the need to use much slower system memory.
This falls into [B].
[3]-An accelerator device is doing heavy computation and a network device
is streaming out the result. This avoids the need to first bounce the
result through system memory (it saves both system memory and
bandwidth). This falls into [B].
[4]-Chaining device computation. For instance a camera device takes a
picture, streams it to a color correction device, which streams it
to final memory. This falls into [A and B].

People have more ideas on how to use this than I can list here. The
intention of this patchset is to provide the means to achieve those
and much more.

I have done testing using nouveau and Mellanox mlx5 where the mlx5
device can directly access GPU memory [1]. I intend to use this inside
nouveau and to help port AMD ROCm RDMA to use this [2]. I believe
other people have expressed interest in working on using this with
network devices and block devices.

From an implementation point of view this just adds 2 new callbacks to
vm_operations_struct (for special device vma support) and 2 new callbacks
to the HMM device memory structure, for HMM device memory support.

For now it needs the IOMMU off with ACS disabled, and both devices must
be on the same PCIe sub-tree (cannot cross the root complex). However the
intention here is different from some other peer to peer work in that
we do want to support the IOMMU and are fine with going through the root
complex in that case. In other words, the bandwidth advantage of
avoiding the root complex is of less importance than the programming
model for the feature. We actually expect that this will be used
mostly with the IOMMU enabled and thus with having to go through the
root bridge.

Another difference from other p2p solutions is that we do require that
the importing device abide by mmu notifier invalidation so that the
exporting device can always invalidate a mapping at any point in time.
For this reason we do not need a struct page for the device memory.

Also, in all cases the policy and final decision on whether to map
or not is solely under the control of the exporting device.

Finally, the device memory might not even be mapped to the CPU, and thus
we have to go through the exporting device driver to get the physical
address at which the memory is accessible.

The core changes are minimal (adding new callbacks to some structs).
IOMMU support will need little change too. Most of the code is in
drivers to implement export policy and BAR space management. A very
rough playground with IOMMU support is in [3] (top 3 patches).

Cheers,
Jérôme

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-p2p
[2] https://github.com/RadeonOpenCompute/ROCnRDMA
[3] https://cgit.freedesktop.org/~glisse/linux/log/?h=wip-hmm-p2p

Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org

Jérôme Glisse (5):
  pci/p2p: add a function to test peer to peer capability
  drivers/base: add a function to test peer to peer capability
  mm/vma: add support for peer to peer to device vma
  mm/hmm: add support for peer to peer to HMM device memory
  mm/hmm: add support for peer to peer to special device vma

 drivers/base/core.c|  20 
 drivers/pci/p2pdma.c   |  27 +
 include/linux/device.h |   1 +
 include/linux/hmm.h|  53 +
 include/linux/mm.h |  38 +++
 include/linux/pci-p2pdma.h |   6 +
 mm/hmm.c   | 219 ++---
 7 files changed, 325 insertions(+), 39 deletions(-)

-- 
2.17.2


[RFC PATCH 2/5] drivers/base: add a function to test peer to peer capability

2019-01-29 Thread jglisse
From: Jérôme Glisse 

device_test_p2p() returns true if two devices can peer to peer to
each other. We add a generic function as different interconnects
can support peer to peer and we want to generically test this no
matter what the interconnect might be. However, this version only
supports PCIe for now.

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-ker...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/base/core.c| 20 
 include/linux/device.h |  1 +
 2 files changed, 21 insertions(+)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 0073b09bb99f..56023b00e108 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "base.h"
 #include "power/power.h"
@@ -3167,3 +3168,22 @@ void device_set_of_node_from_dev(struct device *dev, const struct device *dev2)
dev->of_node_reused = true;
 }
 EXPORT_SYMBOL_GPL(device_set_of_node_from_dev);
+
+/**
+ * device_test_p2p - test if two devices can peer to peer to each other
+ * @devA: device A
+ * @devB: device B
+ * Returns: true if the devices can peer to peer to each other, false otherwise
+ */
+bool device_test_p2p(struct device *devA, struct device *devB)
+{
+   /*
+* For now we only support PCIE peer to peer but other inter-connect
+* can be added.
+*/
+   if (pci_test_p2p(devA, devB))
+   return true;
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(device_test_p2p);
diff --git a/include/linux/device.h b/include/linux/device.h
index 6cb4640b6160..0d532d7f0779 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1250,6 +1250,7 @@ extern int device_online(struct device *dev);
 extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
 extern void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
 void device_set_of_node_from_dev(struct device *dev, const struct device *dev2);
+bool device_test_p2p(struct device *devA, struct device *devB);
 
 static inline int dev_num_vf(struct device *dev)
 {
-- 
2.17.2
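
A sketch of a caller (hypothetical code): an importing driver checking the
capability once at setup time, before attempting any peer mapping:

    /* exporter_dev and importer_dev are the two struct device pointers */
    if (!device_test_p2p(exporter_dev, importer_dev)) {
        dev_info(importer_dev, "no p2p path, fall back to system memory\n");
        return -ENODEV;
    }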


Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-01-29 Thread Thiago Jung Bauermann


Fixing address of powerpc mailing list.

Thiago Jung Bauermann  writes:

> Hello,
>
> With Christoph's rework of the DMA API that recently landed, the patch
> below is the only change needed in virtio to make it work in a POWER
> secure guest under the ultravisor.
>
> The other change we need (making sure the device's dma_map_ops is NULL
> so that the dma-direct/swiotlb code is used) can be made in
> powerpc-specific code.
>
> Of course, I also have patches (soon to be posted as RFC) which hook up
>  to the powerpc secure guest support code.
>
> What do you think?
>
> From d0629a36a75c678b4a72b853f8f7f8c17eedd6b3 Mon Sep 17 00:00:00 2001
> From: Thiago Jung Bauermann 
> Date: Thu, 24 Jan 2019 22:08:02 -0200
> Subject: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted
>
> The host can't access the guest memory when it's encrypted, so using
> regular memory pages for the ring isn't an option. Go through the DMA API.
>
> Signed-off-by: Thiago Jung Bauermann 
> ---
>  drivers/virtio/virtio_ring.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index cd7e755484e3..321a27075380 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -259,8 +259,11 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
>* not work without an even larger kludge.  Instead, enable
>* the DMA API if we're a Xen guest, which at least allows
>* all of the sensible Xen configurations to work correctly.
> +  *
> +  * Also, if guest memory is encrypted the host can't access
> +  * it directly. In this case, we'll need to use the DMA API.
>*/
> - if (xen_domain())
> + if (xen_domain() || sev_active())
>   return true;
>
>   return false;


-- 
Thiago Jung Bauermann
IBM Linux Technology Center



Re: [PATCH 4/5] virtio: Introduce virtio_max_dma_size()

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 09:43:41AM +0100, Joerg Roedel wrote:
> From: Joerg Roedel 
> 
> This function returns the maximum segment size for a single
> dma transaction of a virtio device. The possible limit comes
> from the SWIOTLB implementation in the Linux kernel, that
> has an upper limit of (currently) 256kb of contiguous
> memory it can map. Other DMA-API implementations might also
> have limits.
> 
> Use the new dma_max_mapping_size() function to determine the
> maximum mapping size when DMA-API is in use for virtio.
> 
> Reviewed-by: Konrad Rzeszutek Wilk 
> Signed-off-by: Joerg Roedel 

Looks good,

Reviewed-by: Christoph Hellwig 
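
For reference, the helper presumably reduces to something like this sketch
(assuming the dma_max_mapping_size() API from patch 3/5; exactly which
device pointer is passed is an assumption):

size_t virtio_max_dma_size(struct virtio_device *vdev)
{
    size_t max_segment_size = SIZE_MAX;

    /* Only the DMA path (e.g. SWIOTLB) imposes a mapping limit. */
    if (vring_use_dma_api(vdev))
        max_segment_size = dma_max_mapping_size(vdev->dev.parent);

    return max_segment_size;
}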


Re: [PATCH 3/5] dma: Introduce dma_max_mapping_size()

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 09:43:40AM +0100, Joerg Roedel wrote:
> From: Joerg Roedel 
> 
> The function returns the maximum size that can be mapped
> using DMA-API functions. The patch also adds the
> implementation for direct DMA and a new dma_map_ops pointer
> so that other implementations can expose their limit.
> 
> Reviewed-by: Konrad Rzeszutek Wilk 
> Signed-off-by: Joerg Roedel 

Looks good:

Reviewed-by: Christoph Hellwig 
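
A sketch of the consumer side (dev being whatever struct device the driver
maps against; how to split an oversized buffer is up to the caller):

    size_t max = dma_max_mapping_size(dev);
    dma_addr_t addr;

    if (len > max)
        len = max;      /* or split the transfer */

    addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
    if (dma_mapping_error(dev, addr))
        return -ENOMEM;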


Re: [PATCH 5/5] virtio-blk: Consider virtio_max_dma_size() for maximum segment size

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 09:43:42AM +0100, Joerg Roedel wrote:
> From: Joerg Roedel 
> 
> Segments can't be larger than the maximum DMA mapping size
> supported on the platform. Take that into account when
> setting the maximum segment size for a block device.
> 
> Reviewed-by: Konrad Rzeszutek Wilk 
> Signed-off-by: Joerg Roedel 

Looks good:

Reviewed-by: Christoph Hellwig 
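
In virtio-blk terms this presumably amounts to a cap at queue setup time,
roughly (a sketch; the real patch may differ in details):

    size_t max_size = virtio_max_dma_size(vdev);

    /* Never build a segment the DMA layer cannot map in one go. */
    blk_queue_max_segment_size(q, min_t(size_t, max_size, U32_MAX));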


Re: [PATCH 2/5] swiotlb: Add is_swiotlb_active() function

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 09:43:39AM +0100, Joerg Roedel wrote:
> From: Joerg Roedel 
> 
> This function will be used from dma_direct code to determine
> the maximum segment size of a dma mapping.
> 
> Reviewed-by: Konrad Rzeszutek Wilk 
> Signed-off-by: Joerg Roedel 

Looks good,

Reviewed-by: Christoph Hellwig 


Re: [PATCH 1/5] swiotlb: Introduce swiotlb_max_mapping_size()

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 09:43:38AM +0100, Joerg Roedel wrote:
> From: Joerg Roedel 
> 
> The function returns the maximum size that can be remapped
> by the SWIOTLB implementation. This function will be later
> exposed to users through the DMA-API.
> 
> Reviewed-by: Konrad Rzeszutek Wilk 
> Signed-off-by: Joerg Roedel 

Looks good:

Reviewed-by: Christoph Hellwig 


[RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-01-29 Thread Thiago Jung Bauermann


Hello,

With Christoph's rework of the DMA API that recently landed, the patch
below is the only change needed in virtio to make it work in a POWER
secure guest under the ultravisor.

The other change we need (making sure the device's dma_map_ops is NULL
so that the dma-direct/swiotlb code is used) can be made in
powerpc-specific code.
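
As a rough sketch (not part of this series), that powerpc-specific
hook could look something like the following, where is_secure_guest()
stands in for whatever predicate the ultravisor support ends up
providing:

#include <linux/device.h>
#include <linux/dma-mapping.h>

/*
 * Illustrative only: is_secure_guest() is an assumed helper and the
 * function name is made up.  Leaving dma_ops NULL makes the core fall
 * back to the generic dma-direct/swiotlb path.
 */
static void secure_guest_setup_dma(struct device *dev)
{
	if (is_secure_guest())
		set_dma_ops(dev, NULL);
}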

Of course, I also have patches (soon to be posted as RFC) which hook up
to the powerpc secure guest support code.

What do you think?

From d0629a36a75c678b4a72b853f8f7f8c17eedd6b3 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann 
Date: Thu, 24 Jan 2019 22:08:02 -0200
Subject: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

The host can't access the guest memory when it's encrypted, so using
regular memory pages for the ring isn't an option. Go through the DMA API.

Signed-off-by: Thiago Jung Bauermann 
---
 drivers/virtio/virtio_ring.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index cd7e755484e3..321a27075380 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -259,8 +259,11 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
 	 * not work without an even larger kludge.  Instead, enable
 	 * the DMA API if we're a Xen guest, which at least allows
 	 * all of the sensible Xen configurations to work correctly.
+	 *
+	 * Also, if guest memory is encrypted the host can't access
+	 * it directly. In this case, we'll need to use the DMA API.
 	 */
-	if (xen_domain())
+	if (xen_domain() || sev_active())
 		return true;
 
 	return false;



Re: use generic DMA mapping code in powerpc V4

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 05:14:11PM +0100, Christoph Hellwig wrote:
> On Tue, Jan 29, 2019 at 04:03:32PM +0100, Christian Zigotzky wrote:
> > Hi Christoph,
> >
> > I compiled kernels for the X5000 and X1000 from your new branch 
> > 'powerpc-dma.6-debug.2' today. The kernels boot and the P.A. Semi Ethernet 
> > works!
> 
> Thanks for testing!  I'll prepare a new series that adds the other
> patches on top of this one.

And that was easier than I thought - we just had a few patches left
in powerpc-dma.6, so I've rebased that branch on top of
powerpc-dma.6-debug.2:

git://git.infradead.org/users/hch/misc.git powerpc-dma.6

Gitweb:


http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/powerpc-dma.6

I hope the other patches are simple enough, so just testing the full
branch checkout should be fine for now.


Re: use generic DMA mapping code in powerpc V4

2019-01-29 Thread Christoph Hellwig
On Tue, Jan 29, 2019 at 04:03:32PM +0100, Christian Zigotzky wrote:
> Hi Christoph,
>
> I compiled kernels for the X5000 and X1000 from your new branch 
> 'powerpc-dma.6-debug.2' today. The kernels boot and the P.A. Semi Ethernet 
> works!

Thanks for testing!  I'll prepare a new series that adds the other
patches on top of this one.


Re: use generic DMA mapping code in powerpc V4

2019-01-29 Thread Christian Zigotzky

Hi Christoph,

I compiled kernels for the X5000 and X1000 from your new branch 
'powerpc-dma.6-debug.2' today. The kernels boot and the P.A. Semi 
Ethernet works!


Cheers,
Christian


On 28 January 2019 at 5:52PM, Christian Zigotzky wrote:
> Thanks a lot! I will test it tomorrow.
>
> — Christian
>
> Sent from my iPhone
>
> On 28. Jan 2019, at 17:22, Christoph Hellwig  wrote:
> >
> > On Mon, Jan 28, 2019 at 08:04:22AM +0100, Christoph Hellwig wrote:
> > >
> > > On Sun, Jan 27, 2019 at 02:13:09PM +0100, Christian Zigotzky wrote:
> > > > Christoph,
> > > >
> > > > What shall I do next?
> > >
> > > I'll need to figure out what went wrong with the new zone selection
> > > on powerpc and give you another branch to test.
> >
> > Can you try the new powerpc-dma.6-debug.2 branch:
> >
> >     git://git.infradead.org/users/hch/misc.git powerpc-dma.6-debug.2
> >
> > Gitweb:
> >
> >     http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/powerpc-dma.6-debug.2


Re: [PATCH 0/3] iommu/arm-smmu: Add support to use Last level cache

2019-01-29 Thread Ard Biesheuvel
(+ Bjorn)

On Mon, 28 Jan 2019 at 12:27, Vivek Gautam  wrote:
>
> Hi Ard,
>
> On Thu, Jan 24, 2019 at 1:25 PM Ard Biesheuvel
>  wrote:
> >
> > On Thu, 24 Jan 2019 at 07:58, Vivek Gautam  
> > wrote:
> > >
> > > On Mon, Jan 21, 2019 at 7:55 PM Ard Biesheuvel
> > >  wrote:
> > > >
> > > > On Mon, 21 Jan 2019 at 14:56, Robin Murphy  wrote:
> > > > >
> > > > > On 21/01/2019 13:36, Ard Biesheuvel wrote:
> > > > > > On Mon, 21 Jan 2019 at 14:25, Robin Murphy  
> > > > > > wrote:
> > > > > >>
> > > > > >> On 21/01/2019 10:50, Ard Biesheuvel wrote:
> > > > > >>> On Mon, 21 Jan 2019 at 11:17, Vivek Gautam 
> > > > > >>>  wrote:
> > > > > 
> > > > >  Hi,
> > > > > 
> > > > > 
> > > > >  On Mon, Jan 21, 2019 at 12:56 PM Ard Biesheuvel
> > > > >   wrote:
> > > > > >
> > > > > > On Mon, 21 Jan 2019 at 06:54, Vivek Gautam 
> > > > > >  wrote:
> > > > > >>
> > > > > >> Qualcomm SoCs have an additional level of cache called as
> > > > > >> System cache, aka. Last level cache (LLC). This cache sits 
> > > > > >> right
> > > > > >> before the DDR, and is tightly coupled with the memory 
> > > > > >> controller.
> > > > > >> The clients using this cache request their slices from this
> > > > > >> system cache, make it active, and can then start using it.
> > > > > >> For these clients with smmu, to start using the system cache 
> > > > > >> for
> > > > > >> buffers and, related page tables [1], memory attributes need 
> > > > > >> to be
> > > > > >> set accordingly. This series add the required support.
> > > > > >>
> > > > > >
> > > > > > Does this actually improve performance on reads from a device? 
> > > > > > The
> > > > > > non-cache coherent DMA routines perform an unconditional D-cache
> > > > > > invalidate by VA to the PoC before reading from the buffers 
> > > > > > filled by
> > > > > > the device, and I would expect the PoC to be defined as lying 
> > > > > > beyond
> > > > > > the LLC to still guarantee the architected behavior.
> > > > > 
> > > > >  We have seen performance improvements when running Manhattan
> > > > >  GFXBench benchmarks.
> > > > > 
> > > > > >>>
> > > > > >>> Ah ok, that makes sense, since in that case, the data flow is 
> > > > > >>> mostly
> > > > > >>> to the device, not from the device.
> > > > > >>>
> > > > >  As for the PoC, from my knowledge on sdm845 the system cache, aka
> > > > >  Last level cache (LLC) lies beyond the point of coherency.
> > > > >  Non-cache coherent buffers will not be cached to system cache 
> > > > >  also, and
> > > > >  no additional software cache maintenance ops are required for 
> > > > >  system cache.
> > > > >  Pratik can add more if I am missing something.
> > > > > 
> > > > >  To take care of the memory attributes from DMA APIs side, we can 
> > > > >  add a
> > > > >  DMA_ATTR definition to take care of any dma non-coherent APIs 
> > > > >  calls.
> > > > > 
> > > > > >>>
> > > > > >>> So does the device use the correct inner non-cacheable, outer
> > > > > >>> writeback cacheable attributes if the SMMU is in pass-through?
> > > > > >>>
> > > > > >>> We have been looking into another use case where the fact that the
> > > > > >>> SMMU overrides memory attributes is causing issues (WC mappings 
> > > > > >>> used
> > > > > >>> by the radeon and amdgpu driver). So if the SMMU would honour the
> > > > > >>> existing attributes, would you still need the SMMU changes?
> > > > > >>
> > > > > >> Even if we could force a stage 2 mapping with the weakest pagetable
> > > > > >> attributes (such that combining would work), there would still 
> > > > > >> need to
> > > > > >> be a way to set the TCR attributes appropriately if this behaviour 
> > > > > >> is
> > > > > >> wanted for the SMMU's own table walks as well.
> > > > > >>
> > > > > >
> > > > > > Isn't that just a matter of implementing support for SMMUs that lack
> > > > > > the 'dma-coherent' attribute?
> > > > >
> > > > > Not quite - in general they need INC-ONC attributes in case there
> > > > > actually is something in the architectural outer-cacheable domain.
> > > >
> > > > But is it a problem to use INC-ONC attributes for the SMMU PTW on this
> > > > chip? AIUI, the reason for the SMMU changes is to avoid the
> > > > performance hit of snooping, which is more expensive than cache
> > > > maintenance of SMMU page tables. So are you saying the by-VA cache
> > > > maintenance is not relayed to this system cache, resulting in page
> > > > table updates to be invisible to masters using INC-ONC attributes?
> > >
> > > The reason for these SMMU changes is that the non-coherent devices
> > > can't access the inner caches at all. But they have a way to allocate
> > > into and look up in the system cache.
> > >
> > > CPU will by default make use of system cache when the inner-cacheable
> > > and outer-cacheable memory 

Re: [PATCH 2/2] iommu/arm-smmu: Add support for non-coherent page table mappings

2019-01-29 Thread Vivek Gautam
Hi Will,

On Tue, Jan 22, 2019 at 11:14 AM Will Deacon  wrote:
>
> On Mon, Jan 21, 2019 at 11:35:30AM +0530, Vivek Gautam wrote:
> > On Sun, Jan 20, 2019 at 5:31 AM Will Deacon  wrote:
> > > On Thu, Jan 17, 2019 at 02:57:18PM +0530, Vivek Gautam wrote:
> > > > Adding a device tree option for arm smmu to enable non-cacheable
> > > > memory for page tables.
> > > > We already enable a smmu feature for coherent walk based on
> > > > whether the smmu device is dma-coherent or not. Have an option
> > > > to enable non-cacheable page table memory to force set it for
> > > > particular smmu devices.
> > >
> > > Hmm, I must be missing something here. What is the difference between this
> > > new property, and simply omitting dma-coherent on the SMMU?
> >
> > So, this is what I understood from the email thread for Last level
> > cache support -
> > Robin pointed to the fact that we may need to add support for setting
> > non-cacheable
> > mappings in the TCR.
> > Currently, we don't do that for SMMUs that omit dma-coherent.
> > We rely on the interconnect to handle the configuration set in TCR,
> > and let interconnect
> > ignore the cacheability if it can't support.
>
> I think that's a bug. With that fixed, can you get what you want by omitting
> "dma-coherent"?

Based on the discussion on the first patch in this series [1], I can
update the series.
The first change would be: if QUIRK_NO_DMA is set (i.e. the IOMMU
_is_ coherent), then we use a cacheable TCR. We may need an
additional check for this when setting the TCR.

For the second case - IOMMUs that are *not* coherent, i.e. ones that
omit the 'dma-coherent' property - the walker anyway has to fetch the
page tables directly from memory. We take care of the CPU side of
this by allocating non-coherent memory and making sure that we sync
the PTEs from the map call. Shouldn't we mark the TCR for these
IOMMUs as non-cacheable in the inner and outer cacheability
attributes?


[1] https://lore.kernel.org/patchwork/patch/1032939/
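
To make the second case concrete, here is a minimal sketch of that
TCR selection, reusing the cacheability constants from
drivers/iommu/io-pgtable-arm.c; this is illustrative only, not the
posted patch:

#include <linux/types.h>

/* Values as defined in drivers/iommu/io-pgtable-arm.c: */
#define ARM_LPAE_TCR_IRGN0_SHIFT	8
#define ARM_LPAE_TCR_ORGN0_SHIFT	10
#define ARM_LPAE_TCR_RGN_NC		0
#define ARM_LPAE_TCR_RGN_WBWA		1

/* Pick the walk cacheability attributes from the walk coherency. */
static u64 example_tcr_cacheability(bool coherent_walk)
{
	if (coherent_walk)
		return (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
		       (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);

	/* Non-coherent walker: inner and outer non-cacheable. */
	return (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT) |
	       (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_ORGN0_SHIFT);
}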

Regards
Vivek

>
> Will



-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


[PATCH 5/5] virtio-blk: Consider virtio_max_dma_size() for maximum segment size

2019-01-29 Thread Joerg Roedel
From: Joerg Roedel 

Segments can't be larger than the maximum DMA mapping size
supported on the platform. Take that into account when
setting the maximum segment size for a block device.

Reviewed-by: Konrad Rzeszutek Wilk 
Signed-off-by: Joerg Roedel 
---
 drivers/block/virtio_blk.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index b16a887bbd02..4bc083b7c9b5 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -723,7 +723,7 @@ static int virtblk_probe(struct virtio_device *vdev)
struct request_queue *q;
int err, index;
 
-   u32 v, blk_size, sg_elems, opt_io_size;
+   u32 v, blk_size, max_size, sg_elems, opt_io_size;
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
 
@@ -826,14 +826,16 @@ static int virtblk_probe(struct virtio_device *vdev)
/* No real sector limit. */
blk_queue_max_hw_sectors(q, -1U);
 
+   max_size = virtio_max_dma_size(vdev);
+
/* Host can optionally specify maximum segment size and number of
 * segments. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
				   struct virtio_blk_config, size_max, &v);
if (!err)
-   blk_queue_max_segment_size(q, v);
-   else
-   blk_queue_max_segment_size(q, -1U);
+   max_size = min(max_size, v);
+
+   blk_queue_max_segment_size(q, max_size);
 
/* Host can optionally specify the block size of the device */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
-- 
2.17.1
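
To illustrate the effect of the min() clamp with assumed numbers:

/*
 * Worked example (assumed numbers) for the probe code above:
 *   virtio_max_dma_size(vdev)        -> 256 KiB  (SWIOTLB guest)
 *   size_max advertised by the host  -> 4 MiB    (assumed)
 *   min(max_size, v)                 -> 256 KiB
 * so blk_queue_max_segment_size() now gets 256 KiB instead of 4 MiB,
 * and every segment fits a SWIOTLB bounce buffer.
 */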



[PATCH 1/5] swiotlb: Introduce swiotlb_max_mapping_size()

2019-01-29 Thread Joerg Roedel
From: Joerg Roedel 

The function returns the maximum size that can be remapped
by the SWIOTLB implementation. This function will be later
exposed to users through the DMA-API.

Reviewed-by: Konrad Rzeszutek Wilk 
Signed-off-by: Joerg Roedel 
---
 include/linux/swiotlb.h | 5 +++++
 kernel/dma/swiotlb.c    | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7c007ed7505f..1c22d96e1742 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -62,6 +62,7 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev,
 
 extern int
 swiotlb_dma_supported(struct device *hwdev, u64 mask);
+size_t swiotlb_max_mapping_size(struct device *dev);
 
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
@@ -95,6 +96,10 @@ static inline unsigned int swiotlb_max_segment(void)
 {
return 0;
 }
+static inline size_t swiotlb_max_mapping_size(struct device *dev)
+{
+   return SIZE_MAX;
+}
 #endif /* CONFIG_SWIOTLB */
 
 extern void swiotlb_print_info(void);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1fb6fd68b9c7..9cb21259cb0b 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -662,3 +662,8 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
+
+size_t swiotlb_max_mapping_size(struct device *dev)
+{
+   return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
+}
-- 
2.17.1



[PATCH 0/5 v4] Fix virtio-blk issue with SWIOTLB

2019-01-29 Thread Joerg Roedel


Hi,

here is the fourth version of this patch-set. Previous
versions can be found here:

V1: https://lore.kernel.org/lkml/20190110134433.15672-1-j...@8bytes.org/

V2: https://lore.kernel.org/lkml/20190115132257.6426-1-j...@8bytes.org/

V3: https://lore.kernel.org/lkml/20190123163049.24863-1-j...@8bytes.org/

The problem solved here is a limitation of the SWIOTLB implementation,
which does not support allocations larger than 256kb.  When the
virtio-blk driver tries to read/write a block larger than that, the
allocation of the dma-handle fails and an IO error is reported.
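
For reference, the arithmetic behind that limit, using the constants
from include/linux/swiotlb.h:

/*
 * One SWIOTLB slot is 1 << IO_TLB_SHIFT = 1 << 11 = 2048 bytes, and a
 * single mapping can span at most IO_TLB_SEGSIZE = 128 contiguous
 * slots, so the largest possible bounce buffer is:
 *   2048 * 128 = 262144 bytes = 256kb
 * which is exactly what swiotlb_max_mapping_size() in patch 1 returns.
 */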

A diff of all changes relative to v3 is included at the end of this
email.

Please review.

Thanks,

Joerg

Joerg Roedel (5):
  swiotlb: Introduce swiotlb_max_mapping_size()
  swiotlb: Add is_swiotlb_active() function
  dma: Introduce dma_max_mapping_size()
  virtio: Introduce virtio_max_dma_size()
  virtio-blk: Consider virtio_max_dma_size() for maximum segment size

 Documentation/DMA-API.txt    |  8 ++++++++
 drivers/block/virtio_blk.c   | 10 ++++++----
 drivers/virtio/virtio_ring.c | 10 ++++++++++
 include/linux/dma-mapping.h  | 16 ++++++++++++++++
 include/linux/swiotlb.h      | 11 +++++++++++
 include/linux/virtio.h       |  2 ++
 kernel/dma/direct.c          | 11 +++++++++++
 kernel/dma/swiotlb.c         | 14 ++++++++++++++
 8 files changed, 78 insertions(+), 4 deletions(-)
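
For orientation, the call chain this series establishes:

/*
 *   virtblk_probe()                                     [patch 5]
 *     -> virtio_max_dma_size(vdev)                      [patch 4]
 *          -> vring_use_dma_api(vdev) ?
 *               -> dma_max_mapping_size(&vdev->dev)     [patch 3]
 *                    -> dma_direct_max_mapping_size()
 *                         -> is_swiotlb_active()        [patch 2]
 *                         -> swiotlb_max_mapping_size() [patch 1]
 *     -> blk_queue_max_segment_size(q, min(max_size, size_max))
 */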

-- 
2.17.1

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index e133ccd60228..acfe3d0f78d1 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -195,6 +195,14 @@ Requesting the required mask does not alter the current mask.  If you
 wish to take advantage of it, you should issue a dma_set_mask()
 call to set the mask to the value returned.
 
+::
+
+	size_t
+	dma_max_mapping_size(struct device *dev);
+
+Returns the maximum size of a mapping for the device. The size parameter
+of the mapping functions like dma_map_single(), dma_map_page() and
+others should not be larger than the returned value.
 
 Part Id - Streaming DMA mappings
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 5c087d330b4b..e9e786b4b598 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -62,7 +62,7 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev,
 
 extern int
 swiotlb_dma_supported(struct device *hwdev, u64 mask);
-extern size_t swiotlb_max_mapping_size(struct device *dev);
+size_t swiotlb_max_mapping_size(struct device *dev);
 bool is_swiotlb_active(void);
 
 #ifdef CONFIG_SWIOTLB
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9fbd075081d9..c873f9cc2146 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -670,5 +670,9 @@ size_t swiotlb_max_mapping_size(struct device *dev)
 
 bool is_swiotlb_active(void)
 {
-	return !no_iotlb_memory;
+	/*
+	 * When SWIOTLB is initialized, even if io_tlb_start points to physical
+	 * address zero, io_tlb_end surely doesn't.
+	 */
+	return io_tlb_end != 0;
 }


[PATCH 4/5] virtio: Introduce virtio_max_dma_size()

2019-01-29 Thread Joerg Roedel
From: Joerg Roedel 

This function returns the maximum segment size for a single
dma transaction of a virtio device. The possible limit comes
from the SWIOTLB implementation in the Linux kernel, that
has an upper limit of (currently) 256kb of contiguous
memory it can map. Other DMA-API implementations might also
have limits.

Use the new dma_max_mapping_size() function to determine the
maximum mapping size when DMA-API is in use for virtio.

Reviewed-by: Konrad Rzeszutek Wilk 
Signed-off-by: Joerg Roedel 
---
 drivers/virtio/virtio_ring.c | 10 ++++++++++
 include/linux/virtio.h       |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index cd7e755484e3..9ca3fe6af9fa 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -266,6 +266,16 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
return false;
 }
 
+size_t virtio_max_dma_size(struct virtio_device *vdev)
+{
+   size_t max_segment_size = SIZE_MAX;
+
+   if (vring_use_dma_api(vdev))
+   max_segment_size = dma_max_mapping_size(&vdev->dev);
+
+   return max_segment_size;
+}
+
 static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
  dma_addr_t *dma_handle, gfp_t flag)
 {
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index fa1b5da2804e..673fe3ef3607 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -157,6 +157,8 @@ int virtio_device_freeze(struct virtio_device *dev);
 int virtio_device_restore(struct virtio_device *dev);
 #endif
 
+size_t virtio_max_dma_size(struct virtio_device *vdev);
+
 #define virtio_device_for_each_vq(vdev, vq) \
	list_for_each_entry(vq, &vdev->vqs, list)
 
-- 
2.17.1



[PATCH 3/5] dma: Introduce dma_max_mapping_size()

2019-01-29 Thread Joerg Roedel
From: Joerg Roedel 

The function returns the maximum size that can be mapped
using DMA-API functions. The patch also adds the
implementation for direct DMA and a new dma_map_ops pointer
so that other implementations can expose their limit.

Reviewed-by: Konrad Rzeszutek Wilk 
Signed-off-by: Joerg Roedel 
---
 Documentation/DMA-API.txt   |  8 ++++++++
 include/linux/dma-mapping.h | 16 ++++++++++++++++
 kernel/dma/direct.c         | 11 +++++++++++
 3 files changed, 35 insertions(+)

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index e133ccd60228..acfe3d0f78d1 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -195,6 +195,14 @@ Requesting the required mask does not alter the current mask.  If you
 wish to take advantage of it, you should issue a dma_set_mask()
 call to set the mask to the value returned.
 
+::
+
+	size_t
+	dma_max_mapping_size(struct device *dev);
+
+Returns the maximum size of a mapping for the device. The size parameter
+of the mapping functions like dma_map_single(), dma_map_page() and
+others should not be larger than the returned value.
 
 Part Id - Streaming DMA mappings
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f6ded992c183..a3ca8a71a704 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -130,6 +130,7 @@ struct dma_map_ops {
enum dma_data_direction direction);
int (*dma_supported)(struct device *dev, u64 mask);
u64 (*get_required_mask)(struct device *dev);
+   size_t (*max_mapping_size)(struct device *dev);
 };
 
 #define DMA_MAPPING_ERROR  (~(dma_addr_t)0)
@@ -257,6 +258,8 @@ static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
 }
 #endif
 
+size_t dma_direct_max_mapping_size(struct device *dev);
+
 #ifdef CONFIG_HAS_DMA
 #include 
 
@@ -440,6 +443,19 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
return 0;
 }
 
+static inline size_t dma_max_mapping_size(struct device *dev)
+{
+   const struct dma_map_ops *ops = get_dma_ops(dev);
+   size_t size = SIZE_MAX;
+
+   if (dma_is_direct(ops))
+   size = dma_direct_max_mapping_size(dev);
+   else if (ops && ops->max_mapping_size)
+   size = ops->max_mapping_size(dev);
+
+   return size;
+}
+
 void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t flag, unsigned long attrs);
 void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 355d16acee6d..6310ad01f915 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -380,3 +380,14 @@ int dma_direct_supported(struct device *dev, u64 mask)
 */
return mask >= __phys_to_dma(dev, min_mask);
 }
+
+size_t dma_direct_max_mapping_size(struct device *dev)
+{
+   size_t size = SIZE_MAX;
+
+   /* If SWIOTLB is active, use its maximum mapping size */
+   if (is_swiotlb_active())
+   size = swiotlb_max_mapping_size(dev);
+
+   return size;
+}
-- 
2.17.1



[PATCH 2/5] swiotlb: Add is_swiotlb_active() function

2019-01-29 Thread Joerg Roedel
From: Joerg Roedel 

This function will be used from dma_direct code to determine
the maximum segment size of a dma mapping.

Reviewed-by: Konrad Rzeszutek Wilk 
Signed-off-by: Joerg Roedel 
---
 include/linux/swiotlb.h | 6 ++++++
 kernel/dma/swiotlb.c    | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 1c22d96e1742..e9e786b4b598 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -63,6 +63,7 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev,
 extern int
 swiotlb_dma_supported(struct device *hwdev, u64 mask);
 size_t swiotlb_max_mapping_size(struct device *dev);
+bool is_swiotlb_active(void);
 
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
@@ -100,6 +101,11 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev)
 {
return SIZE_MAX;
 }
+
+static inline bool is_swiotlb_active(void)
+{
+   return false;
+}
 #endif /* CONFIG_SWIOTLB */
 
 extern void swiotlb_print_info(void);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9cb21259cb0b..c873f9cc2146 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -667,3 +667,12 @@ size_t swiotlb_max_mapping_size(struct device *dev)
 {
return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
 }
+
+bool is_swiotlb_active(void)
+{
+	/*
+	 * When SWIOTLB is initialized, even if io_tlb_start points to physical
+	 * address zero, io_tlb_end surely doesn't.
+	 */
+	return io_tlb_end != 0;
+}
-- 
2.17.1
