Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-02 Thread David Gibson
On Mon, Jul 02, 2018 at 04:32:27PM +1000, Alexey Kardashevskiy wrote:
> On Mon, 2 Jul 2018 14:52:43 +1000
> David Gibson  wrote:
> 
> > On Mon, Jul 02, 2018 at 02:33:30PM +1000, Alexey Kardashevskiy wrote:
> > > On Mon, 2 Jul 2018 14:08:52 +1000
> > > David Gibson  wrote:
> > >   
> > > > On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:  
> > > > > On Fri, 29 Jun 2018 15:18:20 +1000
> > > > > Alexey Kardashevskiy  wrote:
> > > > > 
> > > > > > On Fri, 29 Jun 2018 14:57:02 +1000
> > > > > > David Gibson  wrote:
> > > > > > 
> > > > > > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy 
> > > > > > > wrote:  
> > > > > > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > > > > > David Gibson  wrote:
> > > > > > > > 
> > > > > > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey 
> > > > > > > > > Kardashevskiy wrote:
> > > > > > > > > > We already have a check in 
> > > > > > > > > > drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > > > > > > > an IOMMU page is contained in the physical page so the PCI 
> > > > > > > > > > hardware won't
> > > > > > > > > > get access to unassigned host memory.
> > > > > > > > > > 
> > > > > > > > > > However we do not have this check in KVM fastpath 
> > > > > > > > > > (H_PUT_TCE accelerated
> > > > > > > > > > code) so the user space can pin memory backed with 64k 
> > > > > > > > > > pages and create
> > > > > > > > > > a hardware TCE table with a bigger page size. We were lucky 
> > > > > > > > > > so far and
> > > > > > > > > > did not hit this yet as the very first time the mapping 
> > > > > > > > > > happens
> > > > > > > > > > we do not have tbl::it_userspace allocated yet and fall 
> > > > > > > > > > back to
> > > > > > > > > > the userspace which in turn calls VFIO IOMMU driver and 
> > > > > > > > > > that fails
> > > > > > > > > > because of the check in vfio_iommu_spapr_tce.c which is 
> > > > > > > > > > really
> > > > > > > > > > sustainable solution.
> > > > > > > > > > 
> > > > > > > > > > This stores the smallest preregistered page size in the 
> > > > > > > > > > preregistered
> > > > > > > > > > region descriptor and changes the mm_iommu_xxx API to check 
> > > > > > > > > > this against
> > > > > > > > > > the IOMMU page size.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > > > > > ---
> > > > > > > > > > Changes:
> > > > > > > > > > v2:
> > > > > > > > > > * explicitly check for compound pages before calling 
> > > > > > > > > > compound_order()
> > > > > > > > > > 
> > > > > > > > > > ---
> > > > > > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and 
> > > > > > > > > > tell it to
> > > > > > > > > > advertise 16MB pages to the guest; a typical pseries guest 
> > > > > > > > > > will use 16MB
> > > > > > > > > > for IOMMU pages without checking the mmu pagesize and this 
> > > > > > > > > > will fail
> > > > > > > > > > at 
> > > > > > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > > > > > 
> > > > > > > > > > With the change, mapping will fail in KVM and the guest 
> > > > > > > > > > will print:
> > > > > > > > > > 
> > > > > > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 
> > > > > > > > > > 800 2000 18 1f returned 0 (liobn = 0x8001 
> > > > > > > > > > starting addr = 800 0)
> > > > > > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 
> > > > > > > > > > for /pci@8002000/ethernet@0
> > > > > > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > > > > > /pci@8002000/ethernet@0: -1  
> > > > > > > > > 
> > > > > > > > > [snip]
> > > > > > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > > > struct mm_iommu_table_group_mem_t **pmem)
> > > > > > > > > >  {
> > > > > > > > > > struct mm_iommu_table_group_mem_t *mem;
> > > > > > > > > > -   long i, j, ret = 0, locked_entries = 0;
> > > > > > > > > > +   long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > > > > > struct page *page = NULL;
> > > > > > > > > >  
> > > > > > > > > > mutex_lock(&mem_list_mutex);
> > > > > > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > > > goto unlock_exit;
> > > > > > > > > > }
> > > > > > > > > >  
> > > > > > > > > > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> > > > > > > > > 
> > > > > > > > > What about 16G pages on an HPT system?
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Below in the loop mem->pageshift will reduce to the biggest 
> > > > > > > > actual size
> > > > > > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually pinned, no loss there.

Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-02 Thread Alexey Kardashevskiy
On Mon, 2 Jul 2018 14:52:43 +1000
David Gibson  wrote:

> On Mon, Jul 02, 2018 at 02:33:30PM +1000, Alexey Kardashevskiy wrote:
> > On Mon, 2 Jul 2018 14:08:52 +1000
> > David Gibson  wrote:
> >   
> > > On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:  
> > > > On Fri, 29 Jun 2018 15:18:20 +1000
> > > > Alexey Kardashevskiy  wrote:
> > > > 
> > > > > On Fri, 29 Jun 2018 14:57:02 +1000
> > > > > David Gibson  wrote:
> > > > > 
> > > > > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy 
> > > > > > wrote:  
> > > > > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > > > > David Gibson  wrote:
> > > > > > > 
> > > > > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy 
> > > > > > > > wrote:
> > > > > > > > > We already have a check in 
> > > > > > > > > drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > > > > > > an IOMMU page is contained in the physical page so the PCI 
> > > > > > > > > hardware won't
> > > > > > > > > get access to unassigned host memory.
> > > > > > > > > 
> > > > > > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > > > > > accelerated
> > > > > > > > > code) so the user space can pin memory backed with 64k pages 
> > > > > > > > > and create
> > > > > > > > > a hardware TCE table with a bigger page size. We were lucky 
> > > > > > > > > so far and
> > > > > > > > > did not hit this yet as the very first time the mapping 
> > > > > > > > > happens
> > > > > > > > > we do not have tbl::it_userspace allocated yet and fall back 
> > > > > > > > > to
> > > > > > > > > the userspace which in turn calls VFIO IOMMU driver and that 
> > > > > > > > > fails
> > > > > > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > > > > > sustainable solution.
> > > > > > > > > 
> > > > > > > > > This stores the smallest preregistered page size in the 
> > > > > > > > > preregistered
> > > > > > > > > region descriptor and changes the mm_iommu_xxx API to check 
> > > > > > > > > this against
> > > > > > > > > the IOMMU page size.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > > > > ---
> > > > > > > > > Changes:
> > > > > > > > > v2:
> > > > > > > > > * explicitly check for compound pages before calling 
> > > > > > > > > compound_order()
> > > > > > > > > 
> > > > > > > > > ---
> > > > > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and 
> > > > > > > > > tell it to
> > > > > > > > > advertise 16MB pages to the guest; a typical pseries guest 
> > > > > > > > > will use 16MB
> > > > > > > > > for IOMMU pages without checking the mmu pagesize and this 
> > > > > > > > > will fail
> > > > > > > > > at 
> > > > > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > > > > 
> > > > > > > > > With the change, mapping will fail in KVM and the guest will 
> > > > > > > > > print:
> > > > > > > > > 
> > > > > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 
> > > > > > > > > 800 2000 18 1f returned 0 (liobn = 0x8001 
> > > > > > > > > starting addr = 800 0)
> > > > > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 
> > > > > > > > > for /pci@8002000/ethernet@0
> > > > > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > > > > /pci@8002000/ethernet@0: -1  
> > > > > > > > 
> > > > > > > > [snip]
> > > > > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > >   struct mm_iommu_table_group_mem_t **pmem)
> > > > > > > > >  {
> > > > > > > > >   struct mm_iommu_table_group_mem_t *mem;
> > > > > > > > > - long i, j, ret = 0, locked_entries = 0;
> > > > > > > > > + long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > > > >   struct page *page = NULL;
> > > > > > > > >  
> > > > > > > > >   mutex_lock(&mem_list_mutex);
> > > > > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > >   goto unlock_exit;
> > > > > > > > >   }
> > > > > > > > >  
> > > > > > > > > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> > > > > > > > 
> > > > > > > > What about 16G pages on an HPT system?
> > > > > > > 
> > > > > > > 
> > > > > > > Below in the loop mem->pageshift will reduce to the biggest 
> > > > > > > actual size
> > > > > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > > > > > pinned, no loss there.
> > > > > > 
> > > > > > Are you saying that 16G IOMMU pages aren't supported?  Or that 
> > > > > > there's
> > > > > > some reason a guest can never use them?  
> > > > > 
> > > > > 
> > > > > ah, 16_G_, not _M_. My bad.

Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-01 Thread David Gibson
On Mon, Jul 02, 2018 at 02:33:30PM +1000, Alexey Kardashevskiy wrote:
> On Mon, 2 Jul 2018 14:08:52 +1000
> David Gibson  wrote:
> 
> > On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:
> > > On Fri, 29 Jun 2018 15:18:20 +1000
> > > Alexey Kardashevskiy  wrote:
> > >   
> > > > On Fri, 29 Jun 2018 14:57:02 +1000
> > > > David Gibson  wrote:
> > > >   
> > > > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote: 
> > > > >
> > > > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > > > David Gibson  wrote:
> > > > > >   
> > > > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy 
> > > > > > > wrote:  
> > > > > > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c 
> > > > > > > > that
> > > > > > > > an IOMMU page is contained in the physical page so the PCI 
> > > > > > > > hardware won't
> > > > > > > > get access to unassigned host memory.
> > > > > > > > 
> > > > > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > > > > accelerated
> > > > > > > > code) so the user space can pin memory backed with 64k pages 
> > > > > > > > and create
> > > > > > > > a hardware TCE table with a bigger page size. We were lucky so 
> > > > > > > > far and
> > > > > > > > did not hit this yet as the very first time the mapping happens
> > > > > > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > > > > > the userspace which in turn calls VFIO IOMMU driver and that 
> > > > > > > > fails
> > > > > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > > > > sustainable solution.
> > > > > > > > 
> > > > > > > > This stores the smallest preregistered page size in the 
> > > > > > > > preregistered
> > > > > > > > region descriptor and changes the mm_iommu_xxx API to check 
> > > > > > > > this against
> > > > > > > > the IOMMU page size.
> > > > > > > > 
> > > > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > > > ---
> > > > > > > > Changes:
> > > > > > > > v2:
> > > > > > > > * explicitly check for compound pages before calling 
> > > > > > > > compound_order()
> > > > > > > > 
> > > > > > > > ---
> > > > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell 
> > > > > > > > it to
> > > > > > > > advertise 16MB pages to the guest; a typical pseries guest will 
> > > > > > > > use 16MB
> > > > > > > > for IOMMU pages without checking the mmu pagesize and this will 
> > > > > > > > fail
> > > > > > > > at 
> > > > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > > > 
> > > > > > > > With the change, mapping will fail in KVM and the guest will 
> > > > > > > > print:
> > > > > > > > 
> > > > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 
> > > > > > > > 800 2000 18 1f returned 0 (liobn = 0x8001 starting 
> > > > > > > > addr = 800 0)
> > > > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > > > > > > /pci@8002000/ethernet@0
> > > > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > > > /pci@8002000/ethernet@0: -1
> > > > > > > 
> > > > > > > [snip]  
> > > > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > struct mm_iommu_table_group_mem_t **pmem)
> > > > > > > >  {
> > > > > > > > struct mm_iommu_table_group_mem_t *mem;
> > > > > > > > -   long i, j, ret = 0, locked_entries = 0;
> > > > > > > > +   long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > > > struct page *page = NULL;
> > > > > > > >  
> > > > > > > > mutex_lock(&mem_list_mutex);
> > > > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > goto unlock_exit;
> > > > > > > > }
> > > > > > > >  
> > > > > > > > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> > > > > > > 
> > > > > > > What about 16G pages on an HPT system?  
> > > > > > 
> > > > > > 
> > > > > > Below in the loop mem->pageshift will reduce to the biggest actual 
> > > > > > size
> > > > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > > > > pinned, no loss there.  
> > > > > 
> > > > > Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> > > > > some reason a guest can never use them?
> > > > 
> > > > 
> > > > ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
> > > > lift the limit up to 64 then, easier this way.  
> > > 
> > > 
> > > Ah, no, rather this as the upper limit:
> > > 
> > > mem->pageshift = ilog2(entries) + PAGE_SHIFT;  
> > 
> > I can't make sense of this comment in context.  I see how you're
> > computing the minimum page size in the reserved region.

Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-01 Thread Alexey Kardashevskiy
On Mon, 2 Jul 2018 14:08:52 +1000
David Gibson  wrote:

> On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:
> > On Fri, 29 Jun 2018 15:18:20 +1000
> > Alexey Kardashevskiy  wrote:
> >   
> > > On Fri, 29 Jun 2018 14:57:02 +1000
> > > David Gibson  wrote:
> > >   
> > > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:   
> > > >  
> > > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > > David Gibson  wrote:
> > > > >   
> > > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy 
> > > > > > wrote:  
> > > > > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c 
> > > > > > > that
> > > > > > > an IOMMU page is contained in the physical page so the PCI 
> > > > > > > hardware won't
> > > > > > > get access to unassigned host memory.
> > > > > > > 
> > > > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > > > accelerated
> > > > > > > code) so the user space can pin memory backed with 64k pages and 
> > > > > > > create
> > > > > > > a hardware TCE table with a bigger page size. We were lucky so 
> > > > > > > far and
> > > > > > > did not hit this yet as the very first time the mapping happens
> > > > > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > > > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > > > sustainable solution.
> > > > > > > 
> > > > > > > This stores the smallest preregistered page size in the 
> > > > > > > preregistered
> > > > > > > region descriptor and changes the mm_iommu_xxx API to check this 
> > > > > > > against
> > > > > > > the IOMMU page size.
> > > > > > > 
> > > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > > ---
> > > > > > > Changes:
> > > > > > > v2:
> > > > > > > * explicitly check for compound pages before calling 
> > > > > > > compound_order()
> > > > > > > 
> > > > > > > ---
> > > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell 
> > > > > > > it to
> > > > > > > advertise 16MB pages to the guest; a typical pseries guest will 
> > > > > > > use 16MB
> > > > > > > for IOMMU pages without checking the mmu pagesize and this will 
> > > > > > > fail
> > > > > > > at 
> > > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > > 
> > > > > > > With the change, mapping will fail in KVM and the guest will 
> > > > > > > print:
> > > > > > > 
> > > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 
> > > > > > > 2000 18 1f returned 0 (liobn = 0x8001 starting addr = 
> > > > > > > 800 0)
> > > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > > > > > /pci@8002000/ethernet@0
> > > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > > /pci@8002000/ethernet@0: -1
> > > > > > 
> > > > > > [snip]  
> > > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > unsigned long ua, unsigned long entries,
> > > > > > >   struct mm_iommu_table_group_mem_t **pmem)
> > > > > > >  {
> > > > > > >   struct mm_iommu_table_group_mem_t *mem;
> > > > > > > - long i, j, ret = 0, locked_entries = 0;
> > > > > > > + long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > >   struct page *page = NULL;
> > > > > > >  
> > > > > > >   mutex_lock(&mem_list_mutex);
> > > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > unsigned long ua, unsigned long entries,
> > > > > > >   goto unlock_exit;
> > > > > > >   }
> > > > > > >  
> > > > > > > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> > > > > > 
> > > > > > What about 16G pages on an HPT system?  
> > > > > 
> > > > > 
> > > > > Below in the loop mem->pageshift will reduce to the biggest actual 
> > > > > size
> > > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > > > pinned, no loss there.  
> > > > 
> > > > Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> > > > some reason a guest can never use them?
> > > 
> > > 
> > > ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
> > > lift the limit up to 64 then, easier this way.  
> > 
> > 
> > Ah, no, rather this as the upper limit:
> > 
> > mem->pageshift = ilog2(entries) + PAGE_SHIFT;  
> 
> I can't make sense of this comment in context.  I see how you're
> computing the minimum page size in the reserved region.
> 
> My question is about what the "maximum minimum" is - the starting
> value from which you calculate.  Currently it's 1G, but I can't
> immediately see a reason that 16G is impossible here.


16GB is impossible if the chunk we are preregistering here is smaller
than that, for example, the entire guest ram 

Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-01 Thread David Gibson
On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:
> On Fri, 29 Jun 2018 15:18:20 +1000
> Alexey Kardashevskiy  wrote:
> 
> > On Fri, 29 Jun 2018 14:57:02 +1000
> > David Gibson  wrote:
> > 
> > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:  
> > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > David Gibson  wrote:
> > > > 
> > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote: 
> > > > >
> > > > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > > > an IOMMU page is contained in the physical page so the PCI hardware 
> > > > > > won't
> > > > > > get access to unassigned host memory.
> > > > > > 
> > > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > > accelerated
> > > > > > code) so the user space can pin memory backed with 64k pages and 
> > > > > > create
> > > > > > a hardware TCE table with a bigger page size. We were lucky so far 
> > > > > > and
> > > > > > did not hit this yet as the very first time the mapping happens
> > > > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > > sustainable solution.
> > > > > > 
> > > > > > This stores the smallest preregistered page size in the 
> > > > > > preregistered
> > > > > > region descriptor and changes the mm_iommu_xxx API to check this 
> > > > > > against
> > > > > > the IOMMU page size.
> > > > > > 
> > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > ---
> > > > > > Changes:
> > > > > > v2:
> > > > > > * explicitly check for compound pages before calling 
> > > > > > compound_order()
> > > > > > 
> > > > > > ---
> > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it 
> > > > > > to
> > > > > > advertise 16MB pages to the guest; a typical pseries guest will use 
> > > > > > 16MB
> > > > > > for IOMMU pages without checking the mmu pagesize and this will fail
> > > > > > at 
> > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > 
> > > > > > With the change, mapping will fail in KVM and the guest will print:
> > > > > > 
> > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 
> > > > > > 2000 18 1f returned 0 (liobn = 0x8001 starting addr = 
> > > > > > 800 0)
> > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > > > > /pci@8002000/ethernet@0
> > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > /pci@8002000/ethernet@0: -1  
> > > > > 
> > > > > [snip]
> > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > unsigned long ua, unsigned long entries,
> > > > > > struct mm_iommu_table_group_mem_t **pmem)
> > > > > >  {
> > > > > > struct mm_iommu_table_group_mem_t *mem;
> > > > > > -   long i, j, ret = 0, locked_entries = 0;
> > > > > > +   long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > struct page *page = NULL;
> > > > > >  
> > > > > > mutex_lock(&mem_list_mutex);
> > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > unsigned long ua, unsigned long entries,
> > > > > > goto unlock_exit;
> > > > > > }
> > > > > >  
> > > > > > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> > > > > 
> > > > > What about 16G pages on an HPT system?
> > > > 
> > > > 
> > > > Below in the loop mem->pageshift will reduce to the biggest actual size
> > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > > pinned, no loss there.
> > > 
> > > Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> > > some reason a guest can never use them?  
> > 
> > 
> > ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
> > lift the limit up to 64 then, easier this way.
> 
> 
> Ah, no, rather this as the upper limit:
> 
> mem->pageshift = ilog2(entries) + PAGE_SHIFT;

I can't make sense of this comment in context.  I see how you're
computing the minimum page size in the reserved region.

My question is about what the "maximum minimum" is - the starting
value from which you calculate.  Currently it's 1G, but I can't
immediately see a reason that 16G is impossible here.

> @entries here is a number of system pages being pinned in that
> function.
> 
> 
> 
> > 
> > >   
> > > > > > for (i = 0; i < entries; ++i) {
> > > > > > if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > > > > > 1/* pages */, 1/* iswrite */, 
> > > > > > &page)) {
> > > > > > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > unsigned long ua, unsigned long entries,
> > > > > >  

Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-06-29 Thread Alexey Kardashevskiy
On Fri, 29 Jun 2018 15:18:20 +1000
Alexey Kardashevskiy  wrote:

> On Fri, 29 Jun 2018 14:57:02 +1000
> David Gibson  wrote:
> 
> > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:  
> > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > David Gibson  wrote:
> > > 
> > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:   
> > > >  
> > > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > > an IOMMU page is contained in the physical page so the PCI hardware 
> > > > > won't
> > > > > get access to unassigned host memory.
> > > > > 
> > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > accelerated
> > > > > code) so the user space can pin memory backed with 64k pages and 
> > > > > create
> > > > > a hardware TCE table with a bigger page size. We were lucky so far and
> > > > > did not hit this yet as the very first time the mapping happens
> > > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > sustainable solution.
> > > > > 
> > > > > This stores the smallest preregistered page size in the preregistered
> > > > > region descriptor and changes the mm_iommu_xxx API to check this 
> > > > > against
> > > > > the IOMMU page size.
> > > > > 
> > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > ---
> > > > > Changes:
> > > > > v2:
> > > > > * explicitly check for compound pages before calling compound_order()
> > > > > 
> > > > > ---
> > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> > > > > advertise 16MB pages to the guest; a typical pseries guest will use 
> > > > > 16MB
> > > > > for IOMMU pages without checking the mmu pagesize and this will fail
> > > > > at 
> > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > 
> > > > > With the change, mapping will fail in KVM and the guest will print:
> > > > > 
> > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 
> > > > > 2000 18 1f returned 0 (liobn = 0x8001 starting addr = 800 
> > > > > 0)
> > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > > > /pci@8002000/ethernet@0
> > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > /pci@8002000/ethernet@0: -1  
> > > > 
> > > > [snip]
> > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned 
> > > > > long ua, unsigned long entries,
> > > > >   struct mm_iommu_table_group_mem_t **pmem)
> > > > >  {
> > > > >   struct mm_iommu_table_group_mem_t *mem;
> > > > > - long i, j, ret = 0, locked_entries = 0;
> > > > > + long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > >   struct page *page = NULL;
> > > > >  
> > > > >   mutex_lock(&mem_list_mutex);
> > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned 
> > > > > long ua, unsigned long entries,
> > > > >   goto unlock_exit;
> > > > >   }
> > > > >  
> > > > > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> > > > 
> > > > What about 16G pages on an HPT system?
> > > 
> > > 
> > > Below in the loop mem->pageshift will reduce to the biggest actual size
> > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > pinned, no loss there.
> > 
> > Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> > some reason a guest can never use them?  
> 
> 
> ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
> lift the limit up to 64 then, easier this way.


Ah, no, rather this as the upper limit:

mem->pageshift = ilog2(entries) + PAGE_SHIFT;


@entries here is a number of system pages being pinned in that
function.
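
For reference, a minimal sketch of how that cap could read in mm_iommu_get(), using only names already quoted in this thread (an illustration of the suggestion above, not the respun patch):

	/*
	 * The preregistered chunk spans (entries << PAGE_SHIFT) bytes, so it
	 * cannot back an IOMMU page larger than its own size.  E.g. with 64K
	 * base pages (PAGE_SHIFT = 16) a 256MB chunk has entries = 4096 and
	 * the cap is ilog2(4096) + 16 = 28, i.e. 256MB - far below 16G, which
	 * is why 16G can never be reached for a chunk smaller than that.
	 */
	mem->pageshift = ilog2(entries) + PAGE_SHIFT;

The per-page reduction in the pinning loop then only ever lowers this value.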



> 
> >   
> > > > >   for (i = 0; i < entries; ++i) {
> > > > >   if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > > > >   1/* pages */, 1/* iswrite */, 
> > > > > &page)) {
> > > > > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned 
> > > > > long ua, unsigned long entries,
> > > > >   }
> > > > >   }
> > > > >  populate:
> > > > > + pageshift = PAGE_SHIFT;
> > > > > + if (PageCompound(page))
> > > > > + pageshift += 
> > > > > compound_order(compound_head(page));
> > > > > + mem->pageshift = min_t(unsigned int, mem->pageshift, 
> > > > > pageshift);  
> > > > 
> > > > Why not make mem->pageshift and pageshift local the same type to avoid
> > > > the min_t() ?
> > > 
> > > I was under impression min() is deprecated (misinterpret checkpatch.pl
> > > may be) and therefore did not pay attention to it. I can fix this and repost if there is no other question.

Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-06-28 Thread Alexey Kardashevskiy
On Fri, 29 Jun 2018 14:57:02 +1000
David Gibson  wrote:

> On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:
> > On Fri, 29 Jun 2018 14:12:41 +1000
> > David Gibson  wrote:
> >   
> > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:  
> > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > an IOMMU page is contained in the physical page so the PCI hardware 
> > > > won't
> > > > get access to unassigned host memory.
> > > > 
> > > > However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> > > > code) so the user space can pin memory backed with 64k pages and create
> > > > a hardware TCE table with a bigger page size. We were lucky so far and
> > > > did not hit this yet as the very first time the mapping happens
> > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > sustainable solution.
> > > > 
> > > > This stores the smallest preregistered page size in the preregistered
> > > > region descriptor and changes the mm_iommu_xxx API to check this against
> > > > the IOMMU page size.
> > > > 
> > > > Signed-off-by: Alexey Kardashevskiy 
> > > > ---
> > > > Changes:
> > > > v2:
> > > > * explicitly check for compound pages before calling compound_order()
> > > > 
> > > > ---
> > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> > > > advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> > > > for IOMMU pages without checking the mmu pagesize and this will fail
> > > > at 
> > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > 
> > > > With the change, mapping will fail in KVM and the guest will print:
> > > > 
> > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 
> > > > 2000 18 1f returned 0 (liobn = 0x8001 starting addr = 800 0)
> > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > > /pci@8002000/ethernet@0
> > > > mlx5_core :00:00.0: failed to map direct window for
> > > > /pci@8002000/ethernet@0: -1
> > > 
> > > [snip]  
> > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned 
> > > > long ua, unsigned long entries,
> > > > struct mm_iommu_table_group_mem_t **pmem)
> > > >  {
> > > > struct mm_iommu_table_group_mem_t *mem;
> > > > -   long i, j, ret = 0, locked_entries = 0;
> > > > +   long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > struct page *page = NULL;
> > > >  
> > > > mutex_lock(&mem_list_mutex);
> > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned 
> > > > long ua, unsigned long entries,
> > > > goto unlock_exit;
> > > > }
> > > >  
> > > > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> > > 
> > > What about 16G pages on an HPT system?  
> > 
> > 
> > Below in the loop mem->pageshift will reduce to the biggest actual size
> > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > pinned, no loss there.  
> 
> Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> some reason a guest can never use them?


ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
lift the limit up to 64 then, easier this way.

> 
> > > > for (i = 0; i < entries; ++i) {
> > > > if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > > > 1/* pages */, 1/* iswrite */, 
> > > > &page)) {
> > > > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned 
> > > > long ua, unsigned long entries,
> > > > }
> > > > }
> > > >  populate:
> > > > +   pageshift = PAGE_SHIFT;
> > > > +   if (PageCompound(page))
> > > > +   pageshift += 
> > > > compound_order(compound_head(page));
> > > > +   mem->pageshift = min_t(unsigned int, mem->pageshift, 
> > > > pageshift);
> > > 
> > > Why not make mem->pageshift and pageshift local the same type to avoid
> > > the min_t() ?  
> > 
> > I was under impression min() is deprecated (misinterpret checkpatch.pl
> > may be) and therefore did not pay attention to it. I can fix this and
> > repost if there is no other question.  
> 
> Hm, it's possible.


Nah, tried min(), compiles fine.
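
For reference, if the local pageshift and mem->pageshift end up declared with the same unsigned type (an assumption about the respin, not something in the posted v2), the update in the pinning loop becomes simply:

	mem->pageshift = min(mem->pageshift, pageshift);

instead of the min_t() form quoted earlier.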



--
Alexey




Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-06-28 Thread David Gibson
On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:
> On Fri, 29 Jun 2018 14:12:41 +1000
> David Gibson  wrote:
> 
> > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:
> > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > > an IOMMU page is contained in the physical page so the PCI hardware won't
> > > get access to unassigned host memory.
> > > 
> > > However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> > > code) so the user space can pin memory backed with 64k pages and create
> > > a hardware TCE table with a bigger page size. We were lucky so far and
> > > did not hit this yet as the very first time the mapping happens
> > > we do not have tbl::it_userspace allocated yet and fall back to
> > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > sustainable solution.
> > > 
> > > This stores the smallest preregistered page size in the preregistered
> > > region descriptor and changes the mm_iommu_xxx API to check this against
> > > the IOMMU page size.
> > > 
> > > Signed-off-by: Alexey Kardashevskiy 
> > > ---
> > > Changes:
> > > v2:
> > > * explicitly check for compound pages before calling compound_order()
> > > 
> > > ---
> > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> > > advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> > > for IOMMU pages without checking the mmu pagesize and this will fail
> > > at 
> > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > 
> > > With the change, mapping will fail in KVM and the guest will print:
> > > 
> > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 2000 
> > > 18 1f returned 0 (liobn = 0x8001 starting addr = 800 0)
> > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > /pci@8002000/ethernet@0
> > > mlx5_core :00:00.0: failed to map direct window for
> > > /pci@8002000/ethernet@0: -1  
> > 
> > [snip]
> > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long 
> > > ua, unsigned long entries,
> > >   struct mm_iommu_table_group_mem_t **pmem)
> > >  {
> > >   struct mm_iommu_table_group_mem_t *mem;
> > > - long i, j, ret = 0, locked_entries = 0;
> > > + long i, j, ret = 0, locked_entries = 0, pageshift;
> > >   struct page *page = NULL;
> > >  
> > >   mutex_lock(&mem_list_mutex);
> > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long 
> > > ua, unsigned long entries,
> > >   goto unlock_exit;
> > >   }
> > >  
> > > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> > 
> > What about 16G pages on an HPT system?
> 
> 
> Below in the loop mem->pageshift will reduce to the biggest actual size
> which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> pinned, no loss there.

Are you saying that 16G IOMMU pages aren't supported?  Or that there's
some reason a guest can never use them?

> > >   for (i = 0; i < entries; ++i) {
> > >   if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > >   1/* pages */, 1/* iswrite */, &page)) {
> > > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned 
> > > long ua, unsigned long entries,
> > >   }
> > >   }
> > >  populate:
> > > + pageshift = PAGE_SHIFT;
> > > + if (PageCompound(page))
> > > + pageshift += compound_order(compound_head(page));
> > > + mem->pageshift = min_t(unsigned int, mem->pageshift, 
> > > pageshift);  
> > 
> > Why not make mem->pageshift and pageshift local the same type to avoid
> > the min_t() ?
> 
> I was under impression min() is deprecated (misinterpret checkpatch.pl
> may be) and therefore did not pay attention to it. I can fix this and
> repost if there is no other question.

Hm, it's possible.


-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-06-28 Thread Alexey Kardashevskiy
On Fri, 29 Jun 2018 14:12:41 +1000
David Gibson  wrote:

> On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:
> > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > an IOMMU page is contained in the physical page so the PCI hardware won't
> > get access to unassigned host memory.
> > 
> > However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> > code) so the user space can pin memory backed with 64k pages and create
> > a hardware TCE table with a bigger page size. We were lucky so far and
> > did not hit this yet as the very first time the mapping happens
> > we do not have tbl::it_userspace allocated yet and fall back to
> > the userspace which in turn calls VFIO IOMMU driver and that fails
> > because of the check in vfio_iommu_spapr_tce.c which is really
> > sustainable solution.
> > 
> > This stores the smallest preregistered page size in the preregistered
> > region descriptor and changes the mm_iommu_xxx API to check this against
> > the IOMMU page size.
> > 
> > Signed-off-by: Alexey Kardashevskiy 
> > ---
> > Changes:
> > v2:
> > * explicitly check for compound pages before calling compound_order()
> > 
> > ---
> > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> > advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> > for IOMMU pages without checking the mmu pagesize and this will fail
> > at 
> > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > 
> > With the change, mapping will fail in KVM and the guest will print:
> > 
> > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 2000 
> > 18 1f returned 0 (liobn = 0x8001 starting addr = 800 0)
> > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > /pci@8002000/ethernet@0
> > mlx5_core :00:00.0: failed to map direct window for
> > /pci@8002000/ethernet@0: -1  
> 
> [snip]
> > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long 
> > ua, unsigned long entries,
> > struct mm_iommu_table_group_mem_t **pmem)
> >  {
> > struct mm_iommu_table_group_mem_t *mem;
> > -   long i, j, ret = 0, locked_entries = 0;
> > +   long i, j, ret = 0, locked_entries = 0, pageshift;
> > struct page *page = NULL;
> >  
> > mutex_lock(&mem_list_mutex);
> > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long 
> > ua, unsigned long entries,
> > goto unlock_exit;
> > }
> >  
> > + mem->pageshift = 30; /* start from 1G pages - the biggest we have */
> 
> What about 16G pages on an HPT system?


Below in the loop mem->pageshift will reduce to the biggest actual size
which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
pinned, no loss there.
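
As a worked example of that reduction (illustrative numbers, not taken from the patch): with a 64K base page size PAGE_SHIFT is 16, and a 16MB hugepage is a compound page of order 8, so the loop computes pageshift = 16 + 8 = 24; an ordinary 64K page leaves pageshift at 16. Since mem->pageshift keeps the minimum over all pinned pages, a single 64K-backed page caps the whole region at 64K IOMMU pages.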


> 
> > for (i = 0; i < entries; ++i) {
> > if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > 1/* pages */, 1/* iswrite */, &page)) {
> > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long 
> > ua, unsigned long entries,
> > }
> > }
> >  populate:
> > +   pageshift = PAGE_SHIFT;
> > +   if (PageCompound(page))
> > +   pageshift += compound_order(compound_head(page));
> > +   mem->pageshift = min_t(unsigned int, mem->pageshift, 
> > pageshift);  
> 
> Why not make mem->pageshift and pageshift local the same type to avoid
> the min_t() ?

I was under impression min() is deprecated (misinterpret checkpatch.pl
may be) and therefore did not pay attention to it. I can fix this and
repost if there is no other question.


> 
> > +
> > mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
> > }
> >  
> > @@ -349,7 +357,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct 
> > mm_struct *mm,
> >  EXPORT_SYMBOL_GPL(mm_iommu_find);
> >  
> >  long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> > -   unsigned long ua, unsigned long *hpa)
> > +   unsigned long ua, unsigned int pageshift, unsigned long *hpa)
> >  {
> > const long entry = (ua - mem->ua) >> PAGE_SHIFT;
> > u64 *va = &mem->hpas[entry];
> > @@ -357,6 +365,9 @@ long mm_iommu_ua_to_hpa(struct 
> > mm_iommu_table_group_mem_t *mem,
> > if (entry >= mem->entries)
> > return -EFAULT;
> >  
> > +   if (pageshift > mem->pageshift)
> > +   return -EFAULT;
> > +
> > *hpa = *va | (ua & ~PAGE_MASK);
> >  
> > return 0;
> > @@ -364,7 +375,7 @@ long mm_iommu_ua_to_hpa(struct 
> > mm_iommu_table_group_mem_t *mem,
> >  EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
> >  
> >  long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
> > -   unsigned long ua, unsigned long *hpa)
> > +   unsigned long ua, unsigned int pageshift, unsigned long *hpa)
> >  {
> > const long entry = (ua - mem->ua) >> PAGE_SHIFT;
> > void *va = &mem->hpas[entry];

Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-06-28 Thread David Gibson
On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:
> We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> an IOMMU page is contained in the physical page so the PCI hardware won't
> get access to unassigned host memory.
> 
> However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
> code) so the user space can pin memory backed with 64k pages and create
> a hardware TCE table with a bigger page size. We were lucky so far and
> did not hit this yet as the very first time the mapping happens
> we do not have tbl::it_userspace allocated yet and fall back to
> the userspace which in turn calls VFIO IOMMU driver and that fails
> because of the check in vfio_iommu_spapr_tce.c which is really
> sustainable solution.
> 
> This stores the smallest preregistered page size in the preregistered
> region descriptor and changes the mm_iommu_xxx API to check this against
> the IOMMU page size.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v2:
> * explicitly check for compound pages before calling compound_order()
> 
> ---
> The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> for IOMMU pages without checking the mmu pagesize and this will fail
> at 
> https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> 
> With the change, mapping will fail in KVM and the guest will print:
> 
> mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 2000 18 
> 1f returned 0 (liobn = 0x8001 starting addr = 800 0)
> mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> /pci@8002000/ethernet@0
> mlx5_core :00:00.0: failed to map direct window for
> /pci@8002000/ethernet@0: -1

[snip]
> @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
> unsigned long entries,
>   struct mm_iommu_table_group_mem_t **pmem)
>  {
>   struct mm_iommu_table_group_mem_t *mem;
> - long i, j, ret = 0, locked_entries = 0;
> + long i, j, ret = 0, locked_entries = 0, pageshift;
>   struct page *page = NULL;
>  
>   mutex_lock(&mem_list_mutex);
> @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
> unsigned long entries,
>   goto unlock_exit;
>   }
>  
> + mem->pageshift = 30; /* start from 1G pages - the biggest we have */

What about 16G pages on an HPT system?
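
(For scale: a 16G hugepage corresponds to a shift of 34, since 16G = 2^34, so a hardcoded start of 30 would cap mem->pageshift below what such a backing page could support.)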

>   for (i = 0; i < entries; ++i) {
>   if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
>   1/* pages */, 1/* iswrite */, &page)) {
> @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long 
> ua, unsigned long entries,
>   }
>   }
>  populate:
> + pageshift = PAGE_SHIFT;
> + if (PageCompound(page))
> + pageshift += compound_order(compound_head(page));
> + mem->pageshift = min_t(unsigned int, mem->pageshift, pageshift);

Why not make mem->pageshift and pageshift local the same type to avoid
the min_t() ?

> +
>   mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
>   }
>  
> @@ -349,7 +357,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct 
> mm_struct *mm,
>  EXPORT_SYMBOL_GPL(mm_iommu_find);
>  
>  long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> - unsigned long ua, unsigned long *hpa)
> + unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>   const long entry = (ua - mem->ua) >> PAGE_SHIFT;
>   u64 *va = &mem->hpas[entry];
> @@ -357,6 +365,9 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t 
> *mem,
>   if (entry >= mem->entries)
>   return -EFAULT;
>  
> + if (pageshift > mem->pageshift)
> + return -EFAULT;
> +
>   *hpa = *va | (ua & ~PAGE_MASK);
>  
>   return 0;
> @@ -364,7 +375,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t 
> *mem,
>  EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
>  
>  long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
> - unsigned long ua, unsigned long *hpa)
> + unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>   const long entry = (ua - mem->ua) >> PAGE_SHIFT;
>   void *va = &mem->hpas[entry];
> @@ -373,6 +384,9 @@ long mm_iommu_ua_to_hpa_rm(struct 
> mm_iommu_table_group_mem_t *mem,
>   if (entry >= mem->entries)
>   return -EFAULT;
>  
> + if (pageshift > mem->pageshift)
> + return -EFAULT;
> +
>   pa = (void *) vmalloc_to_phys(va);
>   if (!pa)
>   return -EFAULT;
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 2da5f05..7cd63b0 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ 

[PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-06-26 Thread Alexey Kardashevskiy
We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
an IOMMU page is contained in the physical page so the PCI hardware won't
get access to unassigned host memory.

However we do not have this check in KVM fastpath (H_PUT_TCE accelerated
code) so the user space can pin memory backed with 64k pages and create
a hardware TCE table with a bigger page size. We were lucky so far and
did not hit this yet as the very first time the mapping happens
we do not have tbl::it_userspace allocated yet and fall back to
the userspace which in turn calls VFIO IOMMU driver and that fails
because of the check in vfio_iommu_spapr_tce.c which is really
sustainable solution.

This stores the smallest preregistered page size in the preregistered
region descriptor and changes the mm_iommu_xxx API to check this against
the IOMMU page size.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v2:
* explicitly check for compound pages before calling compound_order()

---
The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
advertise 16MB pages to the guest; a typical pseries guest will use 16MB
for IOMMU pages without checking the mmu pagesize and this will fail
at 
https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256

With the change, mapping will fail in KVM and the guest will print:

mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 2000 18 1f 
returned 0 (liobn = 0x8001 starting addr = 800 0)
mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
/pci@8002000/ethernet@0
mlx5_core :00:00.0: failed to map direct window for 
/pci@8002000/ethernet@0: -1
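
(In pageshift terms, as an illustration of the new check rather than part of the log above: the guest asks for a 16MB direct window, i.e. an IOMMU page shift of 24, while the region is backed by 64K pages, so the pinning loop records mem->pageshift = 16; mm_iommu_ua_to_hpa{,_rm}() then sees pageshift 24 > mem->pageshift 16 and returns -EFAULT, the accelerated H_PUT_TCE path fails, and the guest driver gives up on the direct window as reported above.)
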
---
 arch/powerpc/include/asm/mmu_context.h |  4 ++--
 arch/powerpc/kvm/book3s_64_vio.c   |  2 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c|  6 --
 arch/powerpc/mm/mmu_context_iommu.c| 20 +---
 drivers/vfio/vfio_iommu_spapr_tce.c|  2 +-
 5 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 896efa5..79d570c 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -35,9 +35,9 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
-   unsigned long ua, unsigned long *hpa);
+   unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
-   unsigned long ua, unsigned long *hpa);
+   unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
 #endif
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index d066e37..8c456fa 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -449,7 +449,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct 
iommu_table *tbl,
/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
return H_TOO_HARD;
 
-   if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa)))
+   if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
return H_HARDWARE;
 
if (mm_iommu_mapped_inc(mem))
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 925fc31..5b298f5 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -279,7 +279,8 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, 
struct iommu_table *tbl,
if (!mem)
return H_TOO_HARD;
 
-   if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa)))
+   if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
+   &hpa)))
return H_HARDWARE;
 
pua = (void *) vmalloc_to_phys(pua);
@@ -469,7 +470,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 
mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
if (mem)
-   prereg = mm_iommu_ua_to_hpa_rm(mem, ua, &hpa) == 0;
+   prereg = mm_iommu_ua_to_hpa_rm(mem, ua,
+   IOMMU_PAGE_SHIFT_4K, &hpa) == 0;
}
 
if (!prereg) {
diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
b/arch/powerpc/mm/mmu_context_iommu.c
index abb4364..dc0e8cd 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -27,6 +27,7 @@ struct mm_iommu_table_group_mem_t {
struct rcu_head rcu;
unsigned long used;